diff --git a/src/runtime/trace2.go b/src/runtime/trace2.go
new file mode 100644
index 0000000..673205d
--- /dev/null
+++ b/src/runtime/trace2.go
@@ -0,0 +1,1001 @@
+// Copyright 2023 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build goexperiment.exectracer2
+
+// Go execution tracer.
+// The tracer captures a wide range of execution events like goroutine
+// creation/blocking/unblocking, syscall enter/exit/block, GC-related events,
+// changes of heap size, processor start/stop, etc., and writes them to a buffer
+// in a compact form. A nanosecond-precision timestamp and a stack trace are
+// captured for most events.
+//
+// Tracer invariants (to keep the synchronization sound):
+// - An m that has a trace buffer must be on either the allm or sched.freem lists.
+// - Any trace buffer mutation must either be happening in traceAdvance or between
+// a traceAcquire and a subsequent traceRelease.
+// - traceAdvance cannot return until the previous generation's buffers are all flushed.
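+//
+// As an illustrative sketch of the second invariant (the same pattern
+// appears in comments later in this file), event writers look like:
+//
+//	tl := traceAcquire()
+//	if tl.ok() {
+//		tl.GoUnpark(gp, 2) // emit events, mutating this M's buffer
+//		traceRelease(tl)
+//	}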
+//
+// See https://go.dev/issue/60773 for a link to the full design.
+
+package runtime
+
+import (
+ "runtime/internal/atomic"
+ "unsafe"
+)
+
+// Trace state.
+
+// trace is global tracing context.
+var trace struct {
+ // trace.lock must only be acquired on the system stack where
+ // stack splits cannot happen while it is held.
+ lock mutex
+
+ // Trace buffer management.
+ //
+ // First we check the empty list for any free buffers. If there are none,
+ // buffers are allocated directly from the OS. Once they're filled up and/or
+ // flushed, they end up on the full queue for trace.gen%2.
+ //
+ // The trace reader takes buffers off the full list one-by-one and
+ // places them into reading until they're finished being read from.
+ // Then they're placed onto the empty list.
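+ //
+ // In short, each buffer cycles through (illustrative):
+ //
+ //	empty -> (filled by an M) -> full[gen%2] -> reading -> empty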
+ //
+ // Protected by trace.lock.
+ reading *traceBuf // buffer currently handed off to user
+ empty *traceBuf // stack of empty buffers
+ full [2]traceBufQueue
+ workAvailable atomic.Bool
+
+ // State for the trace reader goroutine.
+ //
+ // Protected by trace.lock.
+ readerGen atomic.Uintptr // the generation the reader is currently reading for
+ flushedGen atomic.Uintptr // the last completed generation
+ headerWritten bool // whether ReadTrace has emitted the trace header
+
+ // doneSema is used to synchronize the reader and traceAdvance. Specifically,
+ // it notifies traceAdvance that the reader is done with a generation.
+ // Both semaphores are 0 by default (so, acquires block). traceAdvance
+ // attempts to acquire for gen%2 after flushing the last buffers for gen.
+ // Meanwhile the reader releases the sema for gen%2 when it has finished
+ // processing gen.
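+ //
+ // Illustrative handshake (mirroring traceAdvance and the reader below):
+ //
+ //	semacquire(&trace.doneSema[gen%2]) // traceAdvance: wait for the reader
+ //	semrelease(&trace.doneSema[gen%2]) // reader: done processing gen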
+ doneSema [2]uint32
+
+ // Trace data tables for deduplicating data going into the trace.
+ // There are 2 of each: one for gen%2, one for 1-gen%2.
+ stackTab [2]traceStackTable // maps stack traces to unique ids
+ stringTab [2]traceStringTable // maps strings to unique ids
+
+ // cpuLogRead accepts CPU profile samples from the signal handler where
+ // they're generated. There are two profBufs here: one for gen%2, one for
+ // 1-gen%2. These profBufs use a three-word header to hold the IDs of the P, G,
+ // and M (respectively) that were active at the time of the sample. Because
+ // profBuf uses a record with all zeros in its header to indicate overflow,
+ // we ensure that the P field is always non-zero: the ID of a real P will
+ // start at bit 1, and bit 0 will be set. Samples that arrive while no P is
+ // running (such as near syscalls) will set the first header field to 0b10.
+ // This careful handling of the first header field allows us to store the ID of
+ // the active G directly in the second field, even though that will be 0
+ // when sampling g0.
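+ //
+ // For example (illustrative values): a sample taken on the P with ID 5
+ // stores 5<<1|1 = 11 in the first header word, a sample taken with no P
+ // stores 0b10 = 2, and a sample on g0 stores 0 in the second word.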
+ //
+ // Initialization and teardown of these fields is protected by traceAdvanceSema.
+ cpuLogRead [2]*profBuf
+ signalLock atomic.Uint32 // protects use of the following member, only usable in signal handlers
+ cpuLogWrite [2]atomic.Pointer[profBuf] // copy of cpuLogRead for use in signal handlers, set without signalLock
+ cpuSleep *wakeableSleep
+ cpuLogDone <-chan struct{}
+ cpuBuf [2]*traceBuf
+
+ reader atomic.Pointer[g] // goroutine that called ReadTrace, or nil
+
+ // Fast mappings from enumerations to string IDs that are prepopulated
+ // in the trace.
+ markWorkerLabels [2][len(gcMarkWorkerModeStrings)]traceArg
+ goStopReasons [2][len(traceGoStopReasonStrings)]traceArg
+ goBlockReasons [2][len(traceBlockReasonStrings)]traceArg
+
+ // Trace generation counter.
+ gen atomic.Uintptr
+ lastNonZeroGen uintptr // last non-zero value of gen
+
+ // shutdown is set when we are waiting for the trace reader to finish after setting gen to 0.
+ //
+ // Writes protected by trace.lock.
+ shutdown atomic.Bool
+
+ // Number of goroutines in the syscall-exiting slow path.
+ exitingSyscall atomic.Int32
+
+ // seqGC is the sequence counter for GC begin/end.
+ //
+ // Mutated only during stop-the-world.
+ seqGC uint64
+}
+
+// Trace public API.
+
+var (
+ traceAdvanceSema uint32 = 1
+ traceShutdownSema uint32 = 1
+)
+
+// StartTrace enables tracing for the current process.
+// While tracing, the data will be buffered and available via [ReadTrace].
+// StartTrace returns an error if tracing is already enabled.
+// Most clients should use the [runtime/trace] package or the [testing] package's
+// -test.trace flag instead of calling StartTrace directly.
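+//
+// A minimal end-to-end sketch of direct use (w is a hypothetical io.Writer;
+// ReadTrace's documentation spells out the reader's contract):
+//
+//	if err := runtime.StartTrace(); err != nil {
+//		panic(err)
+//	}
+//	go func() {
+//		for data := runtime.ReadTrace(); data != nil; data = runtime.ReadTrace() {
+//			w.Write(data)
+//		}
+//	}()
+//	// ... run the workload of interest ...
+//	runtime.StopTrace() // returns only after the reader has consumed everything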
+func StartTrace() error {
+ if traceEnabled() || traceShuttingDown() {
+ return errorString("tracing is already enabled")
+ }
+ // Block until cleanup of the last trace is done.
+ semacquire(&traceShutdownSema)
+ semrelease(&traceShutdownSema)
+
+ // Hold traceAdvanceSema across trace start, since we'll want it on
+ // the other side of tracing being enabled globally.
+ semacquire(&traceAdvanceSema)
+
+ // Initialize CPU profile -> trace ingestion.
+ traceInitReadCPU()
+
+ // Compute the first generation for this StartTrace.
+ //
+ // Note: we start from the last non-zero generation rather than 1 so we
+ // can avoid resetting all the arrays indexed by gen%2 or gen%3. There's
+ // more than one of each per m, p, and goroutine.
+ firstGen := traceNextGen(trace.lastNonZeroGen)
+
+ // Reset GC sequencer.
+ trace.seqGC = 1
+
+ // Reset trace reader state.
+ trace.headerWritten = false
+ trace.readerGen.Store(firstGen)
+ trace.flushedGen.Store(0)
+
+ // Register some basic strings in the string tables.
+ traceRegisterLabelsAndReasons(firstGen)
+
+ // Stop the world.
+ //
+ // The purpose of stopping the world is to make sure that no goroutine is in a
+ // context where it could emit an event by bringing all goroutines to a safe point
+ // with no opportunity to transition.
+ //
+// The exceptions to this rule are goroutines that are concurrently exiting a syscall.
+ // Those will all be forced into the syscalling slow path, and we'll just make sure
+ // that we don't observe any goroutines in that critical section before starting
+ // the world again.
+ //
+ // A good follow-up question to this is why stopping the world is necessary at all
+// given that we have traceAcquire and traceRelease. Unfortunately, those only help
+// us when tracing is already active (for performance, the tracing seqlock is left
+// untouched when tracing is off). The main issue here is subtle: we're going to
+ // want to obtain a correct starting status for each goroutine, but there are windows
+ // of time in which we could read and emit an incorrect status. Specifically:
+ //
+ // trace := traceAcquire()
+ // // <----> problem window
+ // casgstatus(gp, _Gwaiting, _Grunnable)
+ // if trace.ok() {
+ // trace.GoUnpark(gp, 2)
+ // traceRelease(trace)
+ // }
+ //
+ // More precisely, if we readgstatus for a gp while another goroutine is in the problem
+ // window and that goroutine didn't observe that tracing had begun, then we might write
+ // a GoStatus(GoWaiting) event for that goroutine, but it won't trace an event marking
+ // the transition from GoWaiting to GoRunnable. The trace will then be broken, because
+ // future events will be emitted assuming the tracer sees GoRunnable.
+ //
+ // In short, what we really need here is to make sure that the next time *any goroutine*
+ // hits a traceAcquire, it sees that the trace is enabled.
+ //
+ // Note also that stopping the world is necessary to make sure sweep-related events are
+ // coherent. Since the world is stopped and sweeps are non-preemptible, we can never start
+ // the world and see an unpaired sweep 'end' event. Other parts of the tracer rely on this.
+ stw := stopTheWorld(stwStartTrace)
+
+ // Prevent sysmon from running any code that could generate events.
+ lock(&sched.sysmonlock)
+
+ // Reset mSyscallID on all Ps while we have them stationary and the trace is disabled.
+ for _, pp := range allp {
+ pp.trace.mSyscallID = -1
+ }
+
+ // Start tracing.
+ //
+ // After this executes, other Ms may start creating trace buffers and emitting
+ // data into them.
+ trace.gen.Store(firstGen)
+
+ // Wait for exitingSyscall to drain.
+ //
+ // It may not monotonically decrease to zero, but in the limit it will always become
+ // zero because the world is stopped and there are no available Ps for syscall-exited
+ // goroutines to run on.
+ //
+ // Because we set gen before checking this, and because exitingSyscall is always incremented
+ // *after* traceAcquire (which checks gen), we can be certain that when exitingSyscall is zero
+ // that any goroutine that goes to exit a syscall from then on *must* observe the new gen.
+ //
+ // The critical section on each goroutine here is going to be quite short, so the likelihood
+ // that we observe a zero value is high.
+ for trace.exitingSyscall.Load() != 0 {
+ osyield()
+ }
+
+ // Record some initial pieces of information.
+ //
+ // N.B. This will also emit a status event for this goroutine.
+ tl := traceAcquire()
+ tl.Gomaxprocs(gomaxprocs) // Get this as early in the trace as possible. See comment in traceAdvance.
+ tl.STWStart(stwStartTrace) // We didn't trace this above, so trace it now.
+
+ // Record the fact that a GC is active, if applicable.
+ if gcphase == _GCmark || gcphase == _GCmarktermination {
+ tl.GCActive()
+ }
+
+ // Record the heap goal so we have it at the very beginning of the trace.
+ tl.HeapGoal()
+
+ // Make sure a ProcStatus is emitted for every P, while we're here.
+ for _, pp := range allp {
+ tl.writer().writeProcStatusForP(pp, pp == tl.mp.p.ptr()).end()
+ }
+ traceRelease(tl)
+
+ unlock(&sched.sysmonlock)
+ startTheWorld(stw)
+
+ traceStartReadCPU()
+ traceAdvancer.start()
+
+ semrelease(&traceAdvanceSema)
+ return nil
+}
+
+// StopTrace stops tracing, if it was previously enabled.
+// StopTrace only returns after all the reads for the trace have completed.
+func StopTrace() {
+ traceAdvance(true)
+}
+
+// traceAdvance moves tracing to the next generation, and cleans up the current generation,
+// ensuring that it's flushed out before returning. If stopTrace is true, it disables tracing
+// altogether instead of advancing to the next generation.
+//
+// traceAdvanceSema must not be held.
+func traceAdvance(stopTrace bool) {
+ semacquire(&traceAdvanceSema)
+
+ // Get the gen that we're advancing from. In this function we don't really care much
+ // about the generation we're advancing _into_ since we'll do all the cleanup in this
+ // generation for the next advancement.
+ gen := trace.gen.Load()
+ if gen == 0 {
+ // We may end up here if traceAdvance is called concurrently with StopTrace.
+ semrelease(&traceAdvanceSema)
+ return
+ }
+
+ // Write an EvFrequency event for this generation.
+ //
+ // N.B. This may block for quite a while to get a good frequency estimate, so make sure we do
+ // this here and not e.g. on the trace reader.
+ traceFrequency(gen)
+
+ // Collect all the untraced Gs.
+ type untracedG struct {
+ gp *g
+ goid uint64
+ mid int64
+ status uint32
+ waitreason waitReason
+ inMarkAssist bool
+ }
+ var untracedGs []untracedG
+ forEachGRace(func(gp *g) {
+ // Make absolutely sure all Gs are ready for the next
+ // generation. We need to do this even for dead Gs because
+ // they may come alive with a new identity, and their
+ // status-traced bookkeeping might end up being stale.
+ // We may miss totally new goroutines, but they'll always
+ // have clean bookkeeping.
+ gp.trace.readyNextGen(gen)
+ // If the status was traced, nothing else to do.
+ if gp.trace.statusWasTraced(gen) {
+ return
+ }
+ // Scribble down information about this goroutine.
+ ug := untracedG{gp: gp, mid: -1}
+ systemstack(func() {
+ me := getg().m.curg
+ // We don't have to handle this G status transition because we
+ // already eliminated ourselves from consideration above.
+ casGToWaiting(me, _Grunning, waitReasonTraceGoroutineStatus)
+ // We need to suspend and take ownership of the G to safely read its
+ // goid. Note that we can't actually emit the event at this point
+ // because we might stop the G in a window where it's unsafe to write
+ // events based on the G's status. We need the global trace buffer flush
+ // coming up to make sure we're not racing with the G.
+ //
+ // It should be very unlikely that we try to preempt a running G here.
+ // The only situation in which we might is if we're racing with a G
+ // that's running for the first time in this generation. Therefore,
+ // this should be relatively fast.
+ s := suspendG(gp)
+ if !s.dead {
+ ug.goid = s.g.goid
+ if s.g.m != nil {
+ ug.mid = int64(s.g.m.procid)
+ }
+ ug.status = readgstatus(s.g) &^ _Gscan
+ ug.waitreason = s.g.waitreason
+ ug.inMarkAssist = s.g.inMarkAssist
+ }
+ resumeG(s)
+ casgstatus(me, _Gwaiting, _Grunning)
+ })
+ if ug.goid != 0 {
+ untracedGs = append(untracedGs, ug)
+ }
+ })
+
+ if !stopTrace {
+ // Re-register runtime goroutine labels and stop/block reasons.
+ traceRegisterLabelsAndReasons(traceNextGen(gen))
+ }
+
+ // Now that we've done some of the heavy stuff, prevent the world from stopping.
+ // This is necessary to ensure the consistency of the STW events. If we're feeling
+ // adventurous we could lift this restriction and add a STWActive event, but the
+ // cost of maintaining this consistency is low. We're not going to hold this semaphore
+ // for very long and most STW periods are very short.
+ // Once we hold worldsema, prevent preemption as well so we're not interrupted partway
+ // through this. We want to get this done as soon as possible.
+ semacquire(&worldsema)
+ mp := acquirem()
+
+ // Advance the generation or stop the trace.
+ trace.lastNonZeroGen = gen
+ if stopTrace {
+ systemstack(func() {
+ // Ordering is important here. Set shutdown first, then disable tracing,
+ // so that conditions like (traceEnabled() || traceShuttingDown()) have
+ // no opportunity to be false. Hold the trace lock so this update appears
+ // atomic to the trace reader.
+ lock(&trace.lock)
+ trace.shutdown.Store(true)
+ trace.gen.Store(0)
+ unlock(&trace.lock)
+ })
+ } else {
+ trace.gen.Store(traceNextGen(gen))
+ }
+
+ // Emit a ProcsChange event so we have one on record for each generation.
+ // Let's emit it as soon as possible so that downstream tools can rely on the value
+ // being there fairly soon in a generation.
+ //
+ // It's important that we do this before allowing stop-the-worlds again,
+ // because the procs count could change.
+ if !stopTrace {
+ tl := traceAcquire()
+ tl.Gomaxprocs(gomaxprocs)
+ traceRelease(tl)
+ }
+
+ // Emit a GCActive event in the new generation if necessary.
+ //
+ // It's important that we do this before allowing stop-the-worlds again,
+ // because that could emit global GC-related events.
+ if !stopTrace && (gcphase == _GCmark || gcphase == _GCmarktermination) {
+ tl := traceAcquire()
+ tl.GCActive()
+ traceRelease(tl)
+ }
+
+ // Preemption is OK again after this. If the world stops again, that's fine.
+ // We're just cleaning up the last generation after this point.
+ //
+ // We also don't care if the GC starts again after this for the same reasons.
+ releasem(mp)
+ semrelease(&worldsema)
+
+ // Snapshot allm and freem.
+ //
+ // Snapshotting after the generation counter update is sufficient.
+ // Because an m must be on either allm or sched.freem if it has an active trace
+ // buffer, new threads added to allm after this point must necessarily observe
+ // the new generation number (sched.lock acts as a barrier).
+ //
+ // Threads that exit before this point and are on neither list explicitly
+ // flush their own buffers in traceThreadDestroy.
+ //
+ // Snapshotting freem is necessary because Ms can continue to emit events
+ // while they're still on that list. Removal from sched.freem is serialized with
+ // this snapshot, so either we'll capture an m on sched.freem and race with
+ // the removal to flush its buffers (resolved by traceThreadDestroy acquiring
+ // the thread's seqlock, which one of us must win, so at least its old gen buffer
+ // will be flushed in time for the new generation) or it will have flushed its
+ // buffers before we snapshotted it to begin with.
+ lock(&sched.lock)
+ mToFlush := allm
+ for mp := mToFlush; mp != nil; mp = mp.alllink {
+ mp.trace.link = mp.alllink
+ }
+ for mp := sched.freem; mp != nil; mp = mp.freelink {
+ mp.trace.link = mToFlush
+ mToFlush = mp
+ }
+ unlock(&sched.lock)
+
+ // Iterate over our snapshot, flushing every buffer until we're done.
+ //
+ // Because trace writers read the generation while the seqlock is
+ // held, we can be certain that when there are no writers there are
+ // also no stale generation values left. Therefore, it's safe to flush
+ // any buffers that remain in that generation's slot.
+ const debugDeadlock = false
+ systemstack(func() {
+ // Track iterations for some rudimentary deadlock detection.
+ i := 0
+ detectedDeadlock := false
+
+ for mToFlush != nil {
+ prev := &mToFlush
+ for mp := *prev; mp != nil; {
+ if mp.trace.seqlock.Load()%2 != 0 {
+ // The M is writing. Come back to it later.
+ prev = &mp.trace.link
+ mp = mp.trace.link
+ continue
+ }
+ // Flush the trace buffer.
+ //
+ // trace.lock needed for traceBufFlush, but also to synchronize
+ // with traceThreadDestroy, which flushes both buffers unconditionally.
+ lock(&trace.lock)
+ bufp := &mp.trace.buf[gen%2]
+ if *bufp != nil {
+ traceBufFlush(*bufp, gen)
+ *bufp = nil
+ }
+ unlock(&trace.lock)
+
+ // Remove the m from the flush list.
+ *prev = mp.trace.link
+ mp.trace.link = nil
+ mp = *prev
+ }
+ // Yield only if we're going to be going around the loop again.
+ if mToFlush != nil {
+ osyield()
+ }
+
+ if debugDeadlock {
+ // Try to detect a deadlock. We probably shouldn't loop here
+ // this many times.
+ if i > 100000 && !detectedDeadlock {
+ detectedDeadlock = true
+ println("runtime: failing to flush")
+ for mp := mToFlush; mp != nil; mp = mp.trace.link {
+ print("runtime: m=", mp.id, "\n")
+ }
+ }
+ i++
+ }
+ }
+ })
+
+ // At this point, the old generation is fully flushed minus stack and string
+ // tables, CPU samples, and goroutines that haven't run at all during the last
+ // generation.
+
+ // Check to see if any Gs still haven't had events written out for them.
+ statusWriter := unsafeTraceWriter(gen, nil)
+ for _, ug := range untracedGs {
+ if ug.gp.trace.statusWasTraced(gen) {
+ // It was traced, we don't need to do anything.
+ continue
+ }
+ // It still wasn't traced. Because we ensured all Ms stopped writing trace
+ // events to the last generation, that must mean the G never had its status
+ // traced in gen between when we recorded it and now. If that's true, the goid
+ // and status we recorded then are exactly what we want right now.
+ status := goStatusToTraceGoStatus(ug.status, ug.waitreason)
+ statusWriter = statusWriter.writeGoStatus(ug.goid, ug.mid, status, ug.inMarkAssist)
+ }
+ statusWriter.flush().end()
+
+ // Read everything out of the last gen's CPU profile buffer.
+ traceReadCPU(gen)
+
+ systemstack(func() {
+ // Flush CPU samples, stacks, and strings for the last generation. This is safe,
+ // because we're now certain no M is writing to the last generation.
+ //
+ // Ordering is important here. traceCPUFlush may generate new stacks and dumping
+ // stacks may generate new strings.
+ traceCPUFlush(gen)
+ trace.stackTab[gen%2].dump(gen)
+ trace.stringTab[gen%2].reset(gen)
+
+ // That's it. This generation is done producing buffers.
+ lock(&trace.lock)
+ trace.flushedGen.Store(gen)
+ unlock(&trace.lock)
+ })
+
+ if stopTrace {
+ semacquire(&traceShutdownSema)
+
+ // Finish off CPU profile reading.
+ traceStopReadCPU()
+ } else {
+ // Go over each P and emit a status event for it if necessary.
+ //
+ // We do this at the beginning of the new generation instead of the
+ // end like we do for goroutines because forEachP doesn't give us a
+ // hook to skip Ps that have already been traced. Since we have to
+ // preempt all Ps anyway, we might as well stay consistent with StartTrace,
+ // which does this during the STW.
+ semacquire(&worldsema)
+ forEachP(waitReasonTraceProcStatus, func(pp *p) {
+ tl := traceAcquire()
+ if !pp.trace.statusWasTraced(tl.gen) {
+ tl.writer().writeProcStatusForP(pp, false).end()
+ }
+ traceRelease(tl)
+ })
+ // Perform status reset on dead Ps because they just appear as idle.
+ //
+ // Holding worldsema prevents allp from changing.
+ //
+ // TODO(mknyszek): Consider explicitly emitting ProcCreate and ProcDestroy
+ // events to indicate whether a P exists, rather than just making its
+ // existence implicit.
+ for _, pp := range allp[len(allp):cap(allp)] {
+ pp.trace.readyNextGen(traceNextGen(gen))
+ }
+ semrelease(&worldsema)
+ }
+
+ // Block until the trace reader has finished processing the last generation.
+ semacquire(&trace.doneSema[gen%2])
+ if raceenabled {
+ raceacquire(unsafe.Pointer(&trace.doneSema[gen%2]))
+ }
+
+ // Double-check that things look as we expect after advancing and perform some
+ // final cleanup if the trace has fully stopped.
+ systemstack(func() {
+ lock(&trace.lock)
+ if !trace.full[gen%2].empty() {
+ throw("trace: non-empty full trace buffer for done generation")
+ }
+ if stopTrace {
+ if !trace.full[1-(gen%2)].empty() {
+ throw("trace: non-empty full trace buffer for next generation")
+ }
+ if trace.reading != nil || trace.reader.Load() != nil {
+ throw("trace: reading after shutdown")
+ }
+ // Free all the empty buffers.
+ for trace.empty != nil {
+ buf := trace.empty
+ trace.empty = buf.link
+ sysFree(unsafe.Pointer(buf), unsafe.Sizeof(*buf), &memstats.other_sys)
+ }
+ // Clear trace.shutdown and other flags.
+ trace.headerWritten = false
+ trace.shutdown.Store(false)
+ }
+ unlock(&trace.lock)
+ })
+
+ if stopTrace {
+ // Clear the sweep state on every P for the next time tracing is enabled.
+ //
+ // It may be stale in the next trace because we may have ended tracing in
+ // the middle of a sweep on a P.
+ //
+ // It's fine not to call forEachP here because tracing is disabled and we
+ // know at this point that nothing is calling into the tracer, but we do
+ // need to look at dead Ps because GOMAXPROCS could have been called at any
+ // point since we stopped tracing, and we have to ensure there's no bad state
+ // on them either. Prevent a STW and a concurrent GOMAXPROCS that
+ // might mutate allp by making ourselves briefly non-preemptible.
+ mp := acquirem()
+ for _, pp := range allp[:cap(allp)] {
+ pp.trace.inSweep = false
+ pp.trace.maySweep = false
+ pp.trace.swept = 0
+ pp.trace.reclaimed = 0
+ }
+ releasem(mp)
+ }
+
+ // Release the advance semaphore. If stopTrace is true we're still holding onto
+ // traceShutdownSema.
+ //
+ // Do a direct handoff. Don't let one caller of traceAdvance starve
+ // other calls to traceAdvance.
+ semrelease1(&traceAdvanceSema, true, 0)
+
+ if stopTrace {
+ // Stop the traceAdvancer. We can't be holding traceAdvanceSema here because
+ // we'll deadlock (we're blocked on the advancer goroutine exiting, but it
+ // may be currently trying to acquire traceAdvanceSema).
+ traceAdvancer.stop()
+ semrelease(&traceShutdownSema)
+ }
+}
+
+func traceNextGen(gen uintptr) uintptr {
+ if gen == ^uintptr(0) {
+ // gen is used both mod 2 and mod 3 and we want both patterns to continue when we loop around.
+ // ^uint32(0) and ^uint64(0) are both odd and multiples of 3. Therefore the next generation
+ // we want is even and one more than a multiple of 3. The smallest such number is 4.
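+ // (Check: ^uint32(0) = 4294967295 = 3*1431655765, which is odd, and
+ // 4%2 == 0 while 4%3 == 1.)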
+ return 4
+ }
+ return gen + 1
+}
+
+// traceRegisterLabelsAndReasons re-registers mark worker labels and
+// goroutine stop/block reasons in the string table for the provided
+// generation. Note: the provided generation must not have started yet.
+func traceRegisterLabelsAndReasons(gen uintptr) {
+ for i, label := range gcMarkWorkerModeStrings[:] {
+ trace.markWorkerLabels[gen%2][i] = traceArg(trace.stringTab[gen%2].put(gen, label))
+ }
+ for i, str := range traceBlockReasonStrings[:] {
+ trace.goBlockReasons[gen%2][i] = traceArg(trace.stringTab[gen%2].put(gen, str))
+ }
+ for i, str := range traceGoStopReasonStrings[:] {
+ trace.goStopReasons[gen%2][i] = traceArg(trace.stringTab[gen%2].put(gen, str))
+ }
+}
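+
+// Event writers elsewhere can then attach, say, a block reason as an event
+// argument with a constant-time lookup along these lines (illustrative; the
+// consuming code lives outside this file):
+//
+//	arg := trace.goBlockReasons[gen%2][reason]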
+
+// ReadTrace returns the next chunk of binary tracing data, blocking until data
+// is available. If tracing is turned off and all the data accumulated while it
+// was on has been returned, ReadTrace returns nil. The caller must copy the
+// returned data before calling ReadTrace again.
+// ReadTrace must be called from one goroutine at a time.
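+//
+// For example (per the implementation below), the first call returns the
+// 16-byte header "go 1.22 trace\x00\x00\x00"; a caller accumulating the whole
+// trace must copy each chunk before the next call (append does the copy):
+//
+//	var all []byte
+//	for data := runtime.ReadTrace(); data != nil; data = runtime.ReadTrace() {
+//		all = append(all, data...)
+//	}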
+func ReadTrace() []byte {
+top:
+ var buf []byte
+ var park bool
+ systemstack(func() {
+ buf, park = readTrace0()
+ })
+ if park {
+ gopark(func(gp *g, _ unsafe.Pointer) bool {
+ if !trace.reader.CompareAndSwapNoWB(nil, gp) {
+ // We're racing with another reader.
+ // Wake up and handle this case.
+ return false
+ }
+
+ if g2 := traceReader(); gp == g2 {
+ // New data arrived between unlocking
+ // and the CAS and we won the wake-up
+ // race, so wake up directly.
+ return false
+ } else if g2 != nil {
+ printlock()
+ println("runtime: got trace reader", g2, g2.goid)
+ throw("unexpected trace reader")
+ }
+
+ return true
+ }, nil, waitReasonTraceReaderBlocked, traceBlockSystemGoroutine, 2)
+ goto top
+ }
+
+ return buf
+}
+
+// readTrace0 is ReadTrace's continuation on g0. This must run on the
+// system stack because it acquires trace.lock.
+//
+//go:systemstack
+func readTrace0() (buf []byte, park bool) {
+ if raceenabled {
+ // g0 doesn't have a race context. Borrow the user G's.
+ if getg().racectx != 0 {
+ throw("expected racectx == 0")
+ }
+ getg().racectx = getg().m.curg.racectx
+ // (This defer should get open-coded, which is safe on
+ // the system stack.)
+ defer func() { getg().racectx = 0 }()
+ }
+
+ // This function must not allocate while holding trace.lock:
+ // allocation can call heap allocate, which will try to emit a trace
+ // event while holding heap lock.
+ lock(&trace.lock)
+
+ if trace.reader.Load() != nil {
+ // More than one goroutine is reading the trace. This is bad.
+ // But we would rather not crash the program because of tracing,
+ // since tracing can be enabled at runtime on production servers.
+ unlock(&trace.lock)
+ println("runtime: ReadTrace called from multiple goroutines simultaneously")
+ return nil, false
+ }
+ // Recycle the old buffer.
+ if buf := trace.reading; buf != nil {
+ buf.link = trace.empty
+ trace.empty = buf
+ trace.reading = nil
+ }
+ // Write trace header.
+ if !trace.headerWritten {
+ trace.headerWritten = true
+ unlock(&trace.lock)
+ return []byte("go 1.22 trace\x00\x00\x00"), false
+ }
+
+ // Read the next buffer.
+
+ if trace.readerGen.Load() == 0 {
+ trace.readerGen.Store(1)
+ }
+ var gen uintptr
+ for {
+ assertLockHeld(&trace.lock)
+ gen = trace.readerGen.Load()
+
+ // Check to see if we need to block for more data in this generation
+ // or if we need to move our generation forward.
+ if !trace.full[gen%2].empty() {
+ break
+ }
+ // Most of the time readerGen is one generation ahead of flushedGen, as the
+ // current generation is being read from. Then, once the last buffer is flushed
+ // into readerGen, flushedGen will rise to meet it. At this point, the tracer
+ // is waiting on the reader to finish flushing the last generation so that it
+ // can continue to advance.
+ if trace.flushedGen.Load() == gen {
+ if trace.shutdown.Load() {
+ unlock(&trace.lock)
+
+ // Wake up anyone waiting for us to be done with this generation.
+ //
+ // Do this after reading trace.shutdown, because the thread we're
+ // waking up is going to clear trace.shutdown.
+ if raceenabled {
+ // Model synchronization on trace.doneSema, which the race
+ // detector does not see. This is required to avoid false
+ // race reports on the writer passed to trace.Start.
+ racerelease(unsafe.Pointer(&trace.doneSema[gen%2]))
+ }
+ semrelease(&trace.doneSema[gen%2])
+
+ // We're shutting down, and the last generation is fully
+ // read. We're done.
+ return nil, false
+ }
+ // The previous gen has had all of its buffers flushed, and
+ // there's nothing else for us to read. Advance the generation
+ // we're reading from and try again.
+ trace.readerGen.Store(trace.gen.Load())
+ unlock(&trace.lock)
+
+ // Wake up anyone waiting for us to be done with this generation.
+ //
+ // Do this after reading gen to make sure we can't have the trace
+ // advance until we've read it.
+ if raceenabled {
+ // See comment above in the shutdown case.
+ racerelease(unsafe.Pointer(&trace.doneSema[gen%2]))
+ }
+ semrelease(&trace.doneSema[gen%2])
+
+ // Reacquire the lock and go back to the top of the loop.
+ lock(&trace.lock)
+ continue
+ }
+ // Wait for new data.
+ //
+ // We don't simply use a note because the scheduler
+ // executes this goroutine directly when it wakes up
+ // (also a note would consume an M).
+ //
+ // Before we drop the lock, clear the workAvailable flag. Work can
+ // only be queued with trace.lock held, so this is at least true until
+ // we drop the lock.
+ trace.workAvailable.Store(false)
+ unlock(&trace.lock)
+ return nil, true
+ }
+ // Pull a buffer.
+ tbuf := trace.full[gen%2].pop()
+ trace.reading = tbuf
+ unlock(&trace.lock)
+ return tbuf.arr[:tbuf.pos], false
+}
+
+// traceReader returns the trace reader that should be woken up, if any.
+// Callers should first check (traceEnabled() || traceShuttingDown()).
+//
+// This must run on the system stack because it acquires trace.lock.
+//
+//go:systemstack
+func traceReader() *g {
+ gp := traceReaderAvailable()
+ if gp == nil || !trace.reader.CompareAndSwapNoWB(gp, nil) {
+ return nil
+ }
+ return gp
+}
+
+// traceReaderAvailable returns the trace reader if it is not currently
+// scheduled and should be. Callers should first check that
+// (traceEnabled() || traceShuttingDown()) is true.
+func traceReaderAvailable() *g {
+ // There are three conditions under which we definitely want to schedule
+ // the reader:
+ // - The reader is lagging behind in finishing off the last generation.
+ // In this case, trace buffers could even be empty, but the trace
+ // advancer will be waiting on the reader, so we have to make sure
+ // to schedule the reader ASAP.
+ // - The reader has pending work to process for its reader generation
+ // (assuming readerGen is not lagging behind). Note that we also want
+ // to be careful *not* to schedule the reader if there's no work to do.
+ // - The trace is shutting down. The trace stopper blocks on the reader
+ // to finish, much like trace advancement.
+ //
+ // We also want to be careful not to schedule the reader if there's no
+ // reason to.
+ if trace.flushedGen.Load() == trace.readerGen.Load() || trace.workAvailable.Load() || trace.shutdown.Load() {
+ return trace.reader.Load()
+ }
+ return nil
+}
+
+// Trace advancer goroutine.
+var traceAdvancer traceAdvancerState
+
+type traceAdvancerState struct {
+ timer *wakeableSleep
+ done chan struct{}
+}
+
+// start starts a new traceAdvancer.
+func (s *traceAdvancerState) start() {
+ // Start a goroutine to periodically advance the trace generation.
+ s.done = make(chan struct{})
+ s.timer = newWakeableSleep()
+ go func() {
+ for traceEnabled() {
+ // Set a timer to wake us up.
+ s.timer.sleep(int64(debug.traceadvanceperiod))
+
+ // Try to advance the trace.
+ traceAdvance(false)
+ }
+ s.done <- struct{}{}
+ }()
+}
+
+// stop stops a traceAdvancer and blocks until it exits.
+func (s *traceAdvancerState) stop() {
+ s.timer.wake()
+ <-s.done
+ close(s.done)
+ s.timer.close()
+}
+
+// defaultTraceAdvancePeriod is the approximate period between
+// new generations.
+const defaultTraceAdvancePeriod = 1e9 // 1 second.
+
+// wakeableSleep manages a wakeable goroutine sleep.
+//
+// Users of this type must create it with newWakeableSleep and
+// call close to free up resources. Once close is called, a new
+// wakeableSleep must be created before another use.
+type wakeableSleep struct {
+ timer *timer
+
+ // lock protects access to wakeup, but not send/recv on it.
+ lock mutex
+ wakeup chan struct{}
+}
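+
+// A minimal usage sketch (running and doWork are hypothetical; this mirrors
+// how traceAdvancerState uses the type above):
+//
+//	s := newWakeableSleep()
+//	for running() {
+//		s.sleep(1e9) // wake after ~1s, or earlier if s.wake() is called
+//		doWork()
+//	}
+//	s.close() // only once nothing sleeps here and nothing will call wake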
+
+// newWakeableSleep initializes a new wakeableSleep and returns it.
+func newWakeableSleep() *wakeableSleep {
+ s := new(wakeableSleep)
+ lockInit(&s.lock, lockRankWakeableSleep)
+ s.wakeup = make(chan struct{}, 1)
+ s.timer = new(timer)
+ s.timer.arg = s
+ s.timer.f = func(s any, _ uintptr) {
+ s.(*wakeableSleep).wake()
+ }
+ return s
+}
+
+// sleep sleeps for the provided duration in nanoseconds or until
+// another goroutine calls wake.
+//
+// Must not be called by more than one goroutine at a time and
+// must not be called concurrently with close.
+func (s *wakeableSleep) sleep(ns int64) {
+ resetTimer(s.timer, nanotime()+ns)
+ lock(&s.lock)
+ if raceenabled {
+ raceacquire(unsafe.Pointer(&s.lock))
+ }
+ wakeup := s.wakeup
+ if raceenabled {
+ racerelease(unsafe.Pointer(&s.lock))
+ }
+ unlock(&s.lock)
+ <-wakeup
+ stopTimer(s.timer)
+}
+
+// wake awakens any goroutine sleeping on the timer.
+//
+// Safe for concurrent use with all other methods.
+func (s *wakeableSleep) wake() {
+ // Grab the wakeup channel, which may be nil if we're
+ // racing with close.
+ lock(&s.lock)
+ if raceenabled {
+ raceacquire(unsafe.Pointer(&s.lock))
+ }
+ if s.wakeup != nil {
+ // Non-blocking send.
+ //
+ // Others may also write to this channel and we don't
+ // want to block on the receiver waking up. This also
+ // effectively batches together wakeup notifications.
+ select {
+ case s.wakeup <- struct{}{}:
+ default:
+ }
+ }
+ if raceenabled {
+ racerelease(unsafe.Pointer(&s.lock))
+ }
+ unlock(&s.lock)
+}
+
+// close wakes any goroutine sleeping on the timer and prevents
+// further sleeping on it.
+//
+// Once close is called, the wakeableSleep must no longer be used.
+//
+// It must only be called once no goroutine is sleeping on the
+// timer *and* nothing else will call wake concurrently.
+func (s *wakeableSleep) close() {
+ // Set wakeup to nil so that a late timer ends up being a no-op.
+ lock(&s.lock)
+ if raceenabled {
+ raceacquire(unsafe.Pointer(&s.lock))
+ }
+ wakeup := s.wakeup
+ s.wakeup = nil
+
+ // Close the channel.
+ close(wakeup)
+
+ if raceenabled {
+ racerelease(unsafe.Pointer(&s.lock))
+ }
+ unlock(&s.lock)
+}