Diffstat (limited to '')
-rw-r--r--  src/runtime/trace.go                    1579
-rw-r--r--  src/runtime/trace/annotation.go          198
-rw-r--r--  src/runtime/trace/annotation_test.go     156
-rw-r--r--  src/runtime/trace/example_test.go         39
-rw-r--r--  src/runtime/trace/trace.go               154
-rw-r--r--  src/runtime/trace/trace_stack_test.go    333
-rw-r--r--  src/runtime/trace/trace_test.go          792
-rw-r--r--  src/runtime/traceback.go                1377
-rw-r--r--  src/runtime/traceback_test.go            422
9 files changed, 5050 insertions, 0 deletions
diff --git a/src/runtime/trace.go b/src/runtime/trace.go
new file mode 100644
index 0000000..e7dfab1
--- /dev/null
+++ b/src/runtime/trace.go
@@ -0,0 +1,1579 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Go execution tracer.
+// The tracer captures a wide range of execution events like goroutine
+// creation/blocking/unblocking, syscall enter/exit/block, GC-related events,
+// changes of heap size, processor start/stop, etc., and writes them to a buffer
+// in a compact form. A precise nanosecond-precision timestamp and a stack
+// trace are captured for most events.
+// See https://golang.org/s/go15trace for more info.
+
+package runtime
+
+import (
+ "internal/goarch"
+ "runtime/internal/atomic"
+ "runtime/internal/sys"
+ "unsafe"
+)
+
+// Event types in the trace, args are given in square brackets.
+const (
+ traceEvNone = 0 // unused
+ traceEvBatch = 1 // start of per-P batch of events [pid, timestamp]
+ traceEvFrequency = 2 // contains tracer timer frequency [frequency (ticks per second)]
+ traceEvStack = 3 // stack [stack id, number of PCs, array of {PC, func string ID, file string ID, line}]
+ traceEvGomaxprocs = 4 // current value of GOMAXPROCS [timestamp, GOMAXPROCS, stack id]
+ traceEvProcStart = 5 // start of P [timestamp, thread id]
+ traceEvProcStop = 6 // stop of P [timestamp]
+ traceEvGCStart = 7 // GC start [timestamp, seq, stack id]
+ traceEvGCDone = 8 // GC done [timestamp]
+ traceEvGCSTWStart = 9 // GC STW start [timestamp, kind]
+ traceEvGCSTWDone = 10 // GC STW done [timestamp]
+ traceEvGCSweepStart = 11 // GC sweep start [timestamp, stack id]
+ traceEvGCSweepDone = 12 // GC sweep done [timestamp, swept, reclaimed]
+ traceEvGoCreate = 13 // goroutine creation [timestamp, new goroutine id, new stack id, stack id]
+ traceEvGoStart = 14 // goroutine starts running [timestamp, goroutine id, seq]
+ traceEvGoEnd = 15 // goroutine ends [timestamp]
+ traceEvGoStop = 16 // goroutine stops (like in select{}) [timestamp, stack]
+ traceEvGoSched = 17 // goroutine calls Gosched [timestamp, stack]
+ traceEvGoPreempt = 18 // goroutine is preempted [timestamp, stack]
+ traceEvGoSleep = 19 // goroutine calls Sleep [timestamp, stack]
+ traceEvGoBlock = 20 // goroutine blocks [timestamp, stack]
+ traceEvGoUnblock = 21 // goroutine is unblocked [timestamp, goroutine id, seq, stack]
+ traceEvGoBlockSend = 22 // goroutine blocks on chan send [timestamp, stack]
+ traceEvGoBlockRecv = 23 // goroutine blocks on chan recv [timestamp, stack]
+ traceEvGoBlockSelect = 24 // goroutine blocks on select [timestamp, stack]
+ traceEvGoBlockSync = 25 // goroutine blocks on Mutex/RWMutex [timestamp, stack]
+ traceEvGoBlockCond = 26 // goroutine blocks on Cond [timestamp, stack]
+ traceEvGoBlockNet = 27 // goroutine blocks on network [timestamp, stack]
+ traceEvGoSysCall = 28 // syscall enter [timestamp, stack]
+ traceEvGoSysExit = 29 // syscall exit [timestamp, goroutine id, seq, real timestamp]
+ traceEvGoSysBlock = 30 // syscall blocks [timestamp]
+ traceEvGoWaiting = 31 // denotes that goroutine is blocked when tracing starts [timestamp, goroutine id]
+ traceEvGoInSyscall = 32 // denotes that goroutine is in syscall when tracing starts [timestamp, goroutine id]
+ traceEvHeapAlloc = 33 // gcController.heapLive change [timestamp, heap_alloc]
+ traceEvHeapGoal = 34 // gcController.heapGoal() (formerly next_gc) change [timestamp, heap goal in bytes]
+ traceEvTimerGoroutine = 35 // not currently used; previously denoted timer goroutine [timer goroutine id]
+ traceEvFutileWakeup = 36 // denotes that the previous wakeup of this goroutine was futile [timestamp]
+ traceEvString = 37 // string dictionary entry [ID, length, string]
+ traceEvGoStartLocal = 38 // goroutine starts running on the same P as the last event [timestamp, goroutine id]
+ traceEvGoUnblockLocal = 39 // goroutine is unblocked on the same P as the last event [timestamp, goroutine id, stack]
+ traceEvGoSysExitLocal = 40 // syscall exit on the same P as the last event [timestamp, goroutine id, real timestamp]
+ traceEvGoStartLabel = 41 // goroutine starts running with label [timestamp, goroutine id, seq, label string id]
+ traceEvGoBlockGC = 42 // goroutine blocks on GC assist [timestamp, stack]
+ traceEvGCMarkAssistStart = 43 // GC mark assist start [timestamp, stack]
+ traceEvGCMarkAssistDone = 44 // GC mark assist done [timestamp]
+ traceEvUserTaskCreate = 45 // trace.NewContext [timestamp, internal task id, internal parent task id, stack, name string]
+ traceEvUserTaskEnd = 46 // end of a task [timestamp, internal task id, stack]
+ traceEvUserRegion = 47 // trace.WithRegion [timestamp, internal task id, mode(0:start, 1:end), stack, name string]
+ traceEvUserLog = 48 // trace.Log [timestamp, internal task id, key string id, stack, value string]
+ traceEvCPUSample = 49 // CPU profiling sample [timestamp, stack, real timestamp, real P id (-1 when absent), goroutine id]
+ traceEvCount = 50
+ // An event type is stored in a byte, but only 6 bits are available for
+ // the type itself. The remaining 2 bits specify the number of arguments,
+ // so the maximum event type value is 63.
+)
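A minimal sketch (not part of this commit) of how a trace consumer could unpack the first byte of an event, assuming only the 6-bit type / 2-bit argument-count layout described above (traceArgCountShift below is the shift used when the byte is written):

package main

import "fmt"

// unpackEventHeader splits a trace event's first byte into its 6-bit
// event type and 2-bit inline argument count. A count of 3 means an
// explicit length byte follows, per the encoding described above.
func unpackEventHeader(b byte) (ev, narg byte) {
	return b & 0x3f, b >> 6
}

func main() {
	// traceEvGoUnblock (21) written with three or more arguments.
	ev, narg := unpackEventHeader(21 | 3<<6)
	fmt.Println(ev, narg) // 21 3
}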
+
+const (
+ // Timestamps in trace are cputicks/traceTickDiv.
+ // This makes absolute values of timestamp diffs smaller,
+ // and so they are encoded in fewer bytes.
+ // 64 on x86 is somewhat arbitrary (one tick is ~20ns on a 3GHz machine).
+ // The suggested increment frequency for PowerPC's time base register is
+ // 512 MHz according to Power ISA v2.07 section 6.2, so we use 16 on ppc64
+ // and ppc64le.
+ // Tracing won't work reliably for architectures where cputicks is emulated
+ // by nanotime, so the value doesn't matter for those architectures.
+ traceTickDiv = 16 + 48*(goarch.Is386|goarch.IsAmd64)
+ // Maximum number of PCs in a single stack trace.
+ // Since events contain only stack id rather than whole stack trace,
+ // we can allow quite large values here.
+ traceStackSize = 128
+ // Identifier of a fake P that is used when we trace without a real P.
+ traceGlobProc = -1
+ // Maximum number of bytes to encode uint64 in base-128.
+ traceBytesPerNumber = 10
+ // Shift of the number of arguments in the first event byte.
+ traceArgCountShift = 6
+ // Flag passed to traceGoPark to denote that the previous wakeup of this
+ // goroutine was futile. For example, a goroutine was unblocked on a mutex,
+ // but another goroutine got ahead and acquired the mutex before the first
+ // goroutine is scheduled, so the first goroutine has to block again.
+ // Such wakeups happen on buffered channels and sync.Mutex,
+ // but are generally not interesting for the end user.
+ traceFutileWakeup byte = 128
+)
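A quick check (sketch, not part of this commit) of why traceBytesPerNumber is 10: base-128 encoding carries 7 payload bits per byte, so a full 64-bit value needs ceil(64/7) = 10 bytes.

package main

import "fmt"

// varintLen reports how many bytes the little-endian base-128 encoding
// used by traceBuf.varint (defined later in this file) needs for v.
func varintLen(v uint64) int {
	n := 1
	for v >= 0x80 {
		v >>= 7
		n++
	}
	return n
}

func main() {
	fmt.Println(varintLen(^uint64(0))) // 10, hence traceBytesPerNumber = 10
}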
+
+// trace is global tracing context.
+var trace struct {
+ // trace.lock must only be acquired on the system stack where
+ // stack splits cannot happen while it is held.
+ lock mutex // protects the following members
+ lockOwner *g // to avoid deadlocks during recursive lock acquisition
+ enabled bool // when set runtime traces events
+ shutdown bool // set when we are waiting for trace reader to finish after setting enabled to false
+ headerWritten bool // whether ReadTrace has emitted trace header
+ footerWritten bool // whether ReadTrace has emitted trace footer
+ shutdownSema uint32 // used to wait for ReadTrace completion
+ seqStart uint64 // sequence number when tracing was started
+ ticksStart int64 // cputicks when tracing was started
+ ticksEnd int64 // cputicks when tracing was stopped
+ timeStart int64 // nanotime when tracing was started
+ timeEnd int64 // nanotime when tracing was stopped
+ seqGC uint64 // GC start/done sequencer
+ reading traceBufPtr // buffer currently handed off to user
+ empty traceBufPtr // stack of empty buffers
+ fullHead traceBufPtr // queue of full buffers
+ fullTail traceBufPtr
+ stackTab traceStackTable // maps stack traces to unique ids
+ // cpuLogRead accepts CPU profile samples from the signal handler where
+ // they're generated. It uses a two-word header to hold the IDs of the P and
+ // G (respectively) that were active at the time of the sample. Because
+ // profBuf uses a record with all zeros in its header to indicate overflow,
+ // we make sure to make the P field always non-zero: The ID of a real P will
+ // start at bit 1, and bit 0 will be set. Samples that arrive while no P is
+ // running (such as near syscalls) will set the first header field to 0b10.
+ // This careful handling of the first header field allows us to store the ID
+ // of the active G directly in the second field, even though that will be 0
+ // when sampling g0.
+ cpuLogRead *profBuf
+ // cpuLogBuf is a trace buffer to hold events corresponding to CPU profile
+ // samples, which arrive out of band and not directly connected to a
+ // specific P.
+ cpuLogBuf traceBufPtr
+
+ reader atomic.Pointer[g] // goroutine that called ReadTrace, or nil
+
+ signalLock atomic.Uint32 // protects use of the following member, only usable in signal handlers
+ cpuLogWrite *profBuf // copy of cpuLogRead for use in signal handlers, set without signalLock
+
+ // Dictionary for traceEvString.
+ //
+ // TODO: central lock to access the map is not ideal.
+ // option: pre-assign ids to all user annotation region names and tags
+ // option: per-P cache
+ // option: sync.Map like data structure
+ stringsLock mutex
+ strings map[string]uint64
+ stringSeq uint64
+
+ // markWorkerLabels maps gcMarkWorkerMode to string ID.
+ markWorkerLabels [len(gcMarkWorkerModeStrings)]uint64
+
+ bufLock mutex // protects buf
+ buf traceBufPtr // global trace buffer, used when running without a p
+}
+
+// traceBufHeader is per-P tracing buffer.
+type traceBufHeader struct {
+ link traceBufPtr // in trace.empty/full
+ lastTicks uint64 // when we wrote the last event
+ pos int // next write offset in arr
+ stk [traceStackSize]uintptr // scratch buffer for traceback
+}
+
+// traceBuf is per-P tracing buffer.
+type traceBuf struct {
+ _ sys.NotInHeap
+ traceBufHeader
+ arr [64<<10 - unsafe.Sizeof(traceBufHeader{})]byte // underlying buffer for traceBufHeader.buf
+}
+
+// traceBufPtr is a *traceBuf that is not traced by the garbage
+// collector and doesn't have write barriers. traceBufs are not
+// allocated from the GC'd heap, so this is safe, and are often
+// manipulated in contexts where write barriers are not allowed, so
+// this is necessary.
+//
+// TODO: Since traceBuf now embeds runtime/internal/sys.NotInHeap, this isn't necessary.
+type traceBufPtr uintptr
+
+func (tp traceBufPtr) ptr() *traceBuf { return (*traceBuf)(unsafe.Pointer(tp)) }
+func (tp *traceBufPtr) set(b *traceBuf) { *tp = traceBufPtr(unsafe.Pointer(b)) }
+func traceBufPtrOf(b *traceBuf) traceBufPtr {
+ return traceBufPtr(unsafe.Pointer(b))
+}
+
+// StartTrace enables tracing for the current process.
+// While tracing, the data will be buffered and available via ReadTrace.
+// StartTrace returns an error if tracing is already enabled.
+// Most clients should use the runtime/trace package or the testing package's
+// -test.trace flag instead of calling StartTrace directly.
+func StartTrace() error {
+ // Stop the world so that we can take a consistent snapshot
+ // of all goroutines at the beginning of the trace.
+ // Do not stop the world during GC so we ensure we always see
+ // a consistent view of GC-related events (e.g. a start is always
+ // paired with an end).
+ stopTheWorldGC("start tracing")
+
+ // Prevent sysmon from running any code that could generate events.
+ lock(&sched.sysmonlock)
+
+ // We are in stop-the-world, but syscalls can finish and write to trace concurrently.
+ // Exitsyscall could check trace.enabled long before and then suddenly wake up
+ // and decide to write to trace at a random point in time.
+ // However, such a syscall will use the global trace.buf buffer, because we've
+ // acquired all p's by doing stop-the-world. So this protects us from such races.
+ lock(&trace.bufLock)
+
+ if trace.enabled || trace.shutdown {
+ unlock(&trace.bufLock)
+ unlock(&sched.sysmonlock)
+ startTheWorldGC()
+ return errorString("tracing is already enabled")
+ }
+
+ // Can't set trace.enabled yet. While the world is stopped, exitsyscall could
+ // already emit a delayed event (see exitTicks in exitsyscall) if we set trace.enabled here.
+ // That would lead to an inconsistent trace:
+ // - either GoSysExit appears before EvGoInSyscall,
+ // - or GoSysExit appears for a goroutine for which we don't emit EvGoInSyscall below.
+ // To instruct traceEvent that it must not ignore events below, we set startingtrace.
+ // trace.enabled is set afterwards once we have emitted all preliminary events.
+ mp := getg().m
+ mp.startingtrace = true
+
+ // Obtain current stack ID to use in all traceEvGoCreate events below.
+ stkBuf := make([]uintptr, traceStackSize)
+ stackID := traceStackID(mp, stkBuf, 2)
+
+ profBuf := newProfBuf(2, profBufWordCount, profBufTagCount) // after the timestamp, header is [pp.id, gp.goid]
+ trace.cpuLogRead = profBuf
+
+ // We must not acquire trace.signalLock outside of a signal handler: a
+ // profiling signal may arrive at any time and try to acquire it, leading to
+ // deadlock. Because we can't use that lock to protect updates to
+ // trace.cpuLogWrite (only use of the structure it references), reads and
+ // writes of the pointer must be atomic. (And although this field is never
+ // the sole pointer to the profBuf value, it's best to allow a write barrier
+ // here.)
+ atomicstorep(unsafe.Pointer(&trace.cpuLogWrite), unsafe.Pointer(profBuf))
+
+ // World is stopped, no need to lock.
+ forEachGRace(func(gp *g) {
+ status := readgstatus(gp)
+ if status != _Gdead {
+ gp.traceseq = 0
+ gp.tracelastp = getg().m.p
+ // +PCQuantum because traceFrameForPC expects return PCs and subtracts PCQuantum.
+ id := trace.stackTab.put([]uintptr{startPCforTrace(gp.startpc) + sys.PCQuantum})
+ traceEvent(traceEvGoCreate, -1, gp.goid, uint64(id), stackID)
+ }
+ if status == _Gwaiting {
+ // traceEvGoWaiting is implied to have seq=1.
+ gp.traceseq++
+ traceEvent(traceEvGoWaiting, -1, gp.goid)
+ }
+ if status == _Gsyscall {
+ gp.traceseq++
+ traceEvent(traceEvGoInSyscall, -1, gp.goid)
+ } else if status == _Gdead && gp.m != nil && gp.m.isextra {
+ // Trigger two trace events for the dead g in the extra m,
+ // since the next event of the g will be traceEvGoSysExit in exitsyscall,
+ // when calling from a C thread into Go.
+ gp.traceseq = 0
+ gp.tracelastp = getg().m.p
+ // +PCQuantum because traceFrameForPC expects return PCs and subtracts PCQuantum.
+ id := trace.stackTab.put([]uintptr{startPCforTrace(0) + sys.PCQuantum}) // no start pc
+ traceEvent(traceEvGoCreate, -1, gp.goid, uint64(id), stackID)
+ gp.traceseq++
+ traceEvent(traceEvGoInSyscall, -1, gp.goid)
+ } else {
+ gp.sysblocktraced = false
+ }
+ })
+ traceProcStart()
+ traceGoStart()
+ // Note: ticksStart needs to be set after we emit traceEvGoInSyscall events.
+ // If we do it the other way around, it is possible that exitsyscall will
+ // query sysexitticks after ticksStart but before traceEvGoInSyscall timestamp.
+ // It will lead to a false conclusion that cputicks is broken.
+ trace.ticksStart = cputicks()
+ trace.timeStart = nanotime()
+ trace.headerWritten = false
+ trace.footerWritten = false
+
+ // string to id mapping
+ // 0 : reserved for an empty string
+ // remaining: other strings registered by traceString
+ trace.stringSeq = 0
+ trace.strings = make(map[string]uint64)
+
+ trace.seqGC = 0
+ mp.startingtrace = false
+ trace.enabled = true
+
+ // Register runtime goroutine labels.
+ _, pid, bufp := traceAcquireBuffer()
+ for i, label := range gcMarkWorkerModeStrings[:] {
+ trace.markWorkerLabels[i], bufp = traceString(bufp, pid, label)
+ }
+ traceReleaseBuffer(pid)
+
+ unlock(&trace.bufLock)
+
+ unlock(&sched.sysmonlock)
+
+ startTheWorldGC()
+ return nil
+}
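As the doc comment above says, most clients should use the runtime/trace package rather than calling StartTrace directly. A minimal usage sketch (not part of this commit), relying only on the public runtime/trace API:

package main

import (
	"log"
	"os"
	"runtime/trace"
)

func main() {
	f, err := os.Create("trace.out")
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()

	// trace.Start enables tracing and copies the data the runtime
	// produces into f from a background goroutine; trace.Stop stops
	// tracing and waits for that copy to finish.
	if err := trace.Start(f); err != nil {
		log.Fatal(err)
	}
	defer trace.Stop()

	// ... run the workload to be traced ...
}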
+
+// StopTrace stops tracing, if it was previously enabled.
+// StopTrace only returns after all the reads for the trace have completed.
+func StopTrace() {
+ // Stop the world so that we can collect the trace buffers from all p's below,
+ // and also to avoid races with traceEvent.
+ stopTheWorldGC("stop tracing")
+
+ // See the comment in StartTrace.
+ lock(&sched.sysmonlock)
+
+ // See the comment in StartTrace.
+ lock(&trace.bufLock)
+
+ if !trace.enabled {
+ unlock(&trace.bufLock)
+ unlock(&sched.sysmonlock)
+ startTheWorldGC()
+ return
+ }
+
+ traceGoSched()
+
+ atomicstorep(unsafe.Pointer(&trace.cpuLogWrite), nil)
+ trace.cpuLogRead.close()
+ traceReadCPU()
+
+ // Loop over all allocated Ps because dead Ps may still have
+ // trace buffers.
+ for _, p := range allp[:cap(allp)] {
+ buf := p.tracebuf
+ if buf != 0 {
+ traceFullQueue(buf)
+ p.tracebuf = 0
+ }
+ }
+ if trace.buf != 0 {
+ buf := trace.buf
+ trace.buf = 0
+ if buf.ptr().pos != 0 {
+ traceFullQueue(buf)
+ }
+ }
+ if trace.cpuLogBuf != 0 {
+ buf := trace.cpuLogBuf
+ trace.cpuLogBuf = 0
+ if buf.ptr().pos != 0 {
+ traceFullQueue(buf)
+ }
+ }
+
+ for {
+ trace.ticksEnd = cputicks()
+ trace.timeEnd = nanotime()
+ // Windows time can tick only every 15ms, wait for at least one tick.
+ if trace.timeEnd != trace.timeStart {
+ break
+ }
+ osyield()
+ }
+
+ trace.enabled = false
+ trace.shutdown = true
+ unlock(&trace.bufLock)
+
+ unlock(&sched.sysmonlock)
+
+ startTheWorldGC()
+
+ // The world is started but we've set trace.shutdown, so new tracing can't start.
+ // Wait for the trace reader to flush pending buffers and stop.
+ semacquire(&trace.shutdownSema)
+ if raceenabled {
+ raceacquire(unsafe.Pointer(&trace.shutdownSema))
+ }
+
+ systemstack(func() {
+ // The lock protects us from races with StartTrace/StopTrace because they do stop-the-world.
+ lock(&trace.lock)
+ for _, p := range allp[:cap(allp)] {
+ if p.tracebuf != 0 {
+ throw("trace: non-empty trace buffer in proc")
+ }
+ }
+ if trace.buf != 0 {
+ throw("trace: non-empty global trace buffer")
+ }
+ if trace.fullHead != 0 || trace.fullTail != 0 {
+ throw("trace: non-empty full trace buffer")
+ }
+ if trace.reading != 0 || trace.reader.Load() != nil {
+ throw("trace: reading after shutdown")
+ }
+ for trace.empty != 0 {
+ buf := trace.empty
+ trace.empty = buf.ptr().link
+ sysFree(unsafe.Pointer(buf), unsafe.Sizeof(*buf.ptr()), &memstats.other_sys)
+ }
+ trace.strings = nil
+ trace.shutdown = false
+ trace.cpuLogRead = nil
+ unlock(&trace.lock)
+ })
+}
+
+// ReadTrace returns the next chunk of binary tracing data, blocking until data
+// is available. If tracing is turned off and all the data accumulated while it
+// was on has been returned, ReadTrace returns nil. The caller must copy the
+// returned data before calling ReadTrace again.
+// ReadTrace must be called from one goroutine at a time.
+func ReadTrace() []byte {
+top:
+ var buf []byte
+ var park bool
+ systemstack(func() {
+ buf, park = readTrace0()
+ })
+ if park {
+ gopark(func(gp *g, _ unsafe.Pointer) bool {
+ if !trace.reader.CompareAndSwapNoWB(nil, gp) {
+ // We're racing with another reader.
+ // Wake up and handle this case.
+ return false
+ }
+
+ if g2 := traceReader(); gp == g2 {
+ // New data arrived between unlocking
+ // and the CAS and we won the wake-up
+ // race, so wake up directly.
+ return false
+ } else if g2 != nil {
+ printlock()
+ println("runtime: got trace reader", g2, g2.goid)
+ throw("unexpected trace reader")
+ }
+
+ return true
+ }, nil, waitReasonTraceReaderBlocked, traceEvGoBlock, 2)
+ goto top
+ }
+
+ return buf
+}
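A consumer of ReadTrace loops until it returns nil, writing or copying each chunk before the next call; this is roughly what the runtime/trace package's reader goroutine does. A sketch (not part of this commit; the tracedump package name is made up):

package tracedump

import (
	"io"
	"runtime"
)

// Drain copies trace chunks to w until runtime.ReadTrace reports
// end-of-trace by returning nil. Each chunk is consumed before the
// next call, as the ReadTrace doc comment requires.
func Drain(w io.Writer) error {
	for {
		data := runtime.ReadTrace()
		if data == nil {
			return nil
		}
		if _, err := w.Write(data); err != nil {
			return err
		}
	}
}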
+
+// readTrace0 is ReadTrace's continuation on g0. This must run on the
+// system stack because it acquires trace.lock.
+//
+//go:systemstack
+func readTrace0() (buf []byte, park bool) {
+ if raceenabled {
+ // g0 doesn't have a race context. Borrow the user G's.
+ if getg().racectx != 0 {
+ throw("expected racectx == 0")
+ }
+ getg().racectx = getg().m.curg.racectx
+ // (This defer should get open-coded, which is safe on
+ // the system stack.)
+ defer func() { getg().racectx = 0 }()
+ }
+
+ // This function may need to lock trace.lock recursively
+ // (goparkunlock -> traceGoPark -> traceEvent -> traceFlush).
+ // To allow this we use trace.lockOwner.
+ // Also this function must not allocate while holding trace.lock:
+ // allocation can call heap allocate, which will try to emit a trace
+ // event while holding heap lock.
+ lock(&trace.lock)
+ trace.lockOwner = getg().m.curg
+
+ if trace.reader.Load() != nil {
+ // More than one goroutine reads trace. This is bad.
+ // But we would rather not crash the program because of tracing,
+ // since tracing can be enabled at runtime on production servers.
+ trace.lockOwner = nil
+ unlock(&trace.lock)
+ println("runtime: ReadTrace called from multiple goroutines simultaneously")
+ return nil, false
+ }
+ // Recycle the old buffer.
+ if buf := trace.reading; buf != 0 {
+ buf.ptr().link = trace.empty
+ trace.empty = buf
+ trace.reading = 0
+ }
+ // Write trace header.
+ if !trace.headerWritten {
+ trace.headerWritten = true
+ trace.lockOwner = nil
+ unlock(&trace.lock)
+ return []byte("go 1.19 trace\x00\x00\x00"), false
+ }
+ // Optimistically look for CPU profile samples. This may write new stack
+ // records, and may write new tracing buffers.
+ if !trace.footerWritten && !trace.shutdown {
+ traceReadCPU()
+ }
+ // Wait for new data.
+ if trace.fullHead == 0 && !trace.shutdown {
+ // We don't simply use a note because the scheduler
+ // executes this goroutine directly when it wakes up
+ // (also a note would consume an M).
+ trace.lockOwner = nil
+ unlock(&trace.lock)
+ return nil, true
+ }
+newFull:
+ assertLockHeld(&trace.lock)
+ // Write a buffer.
+ if trace.fullHead != 0 {
+ buf := traceFullDequeue()
+ trace.reading = buf
+ trace.lockOwner = nil
+ unlock(&trace.lock)
+ return buf.ptr().arr[:buf.ptr().pos], false
+ }
+
+ // Write footer with timer frequency.
+ if !trace.footerWritten {
+ trace.footerWritten = true
+ // Use float64 because (trace.ticksEnd - trace.ticksStart) * 1e9 can overflow int64.
+ freq := float64(trace.ticksEnd-trace.ticksStart) * 1e9 / float64(trace.timeEnd-trace.timeStart) / traceTickDiv
+ if freq <= 0 {
+ throw("trace: ReadTrace got invalid frequency")
+ }
+ trace.lockOwner = nil
+ unlock(&trace.lock)
+
+ // Write frequency event.
+ bufp := traceFlush(0, 0)
+ buf := bufp.ptr()
+ buf.byte(traceEvFrequency | 0<<traceArgCountShift)
+ buf.varint(uint64(freq))
+
+ // Dump stack table.
+ // This will emit a bunch of full buffers, we will pick them up
+ // on the next iteration.
+ bufp = trace.stackTab.dump(bufp)
+
+ // Flush final buffer.
+ lock(&trace.lock)
+ traceFullQueue(bufp)
+ goto newFull // trace.lock should be held at newFull
+ }
+ // Done.
+ if trace.shutdown {
+ trace.lockOwner = nil
+ unlock(&trace.lock)
+ if raceenabled {
+ // Model synchronization on trace.shutdownSema, which race
+ // detector does not see. This is required to avoid false
+ // race reports on writer passed to trace.Start.
+ racerelease(unsafe.Pointer(&trace.shutdownSema))
+ }
+ // trace.enabled is already reset, so can call traceable functions.
+ semrelease(&trace.shutdownSema)
+ return nil, false
+ }
+ // Also bad, but see the comment above.
+ trace.lockOwner = nil
+ unlock(&trace.lock)
+ println("runtime: spurious wakeup of trace reader")
+ return nil, false
+}
+
+// traceReader returns the trace reader that should be woken up, if any.
+// Callers should first check that trace.enabled or trace.shutdown is set.
+//
+// This must run on the system stack because it acquires trace.lock.
+//
+//go:systemstack
+func traceReader() *g {
+ // Optimistic check first
+ if traceReaderAvailable() == nil {
+ return nil
+ }
+ lock(&trace.lock)
+ gp := traceReaderAvailable()
+ if gp == nil || !trace.reader.CompareAndSwapNoWB(gp, nil) {
+ unlock(&trace.lock)
+ return nil
+ }
+ unlock(&trace.lock)
+ return gp
+}
+
+// traceReaderAvailable returns the trace reader if it is not currently
+// scheduled and should be. Callers should first check that trace.enabled
+// or trace.shutdown is set.
+func traceReaderAvailable() *g {
+ if trace.fullHead != 0 || trace.shutdown {
+ return trace.reader.Load()
+ }
+ return nil
+}
+
+// traceProcFree frees trace buffer associated with pp.
+//
+// This must run on the system stack because it acquires trace.lock.
+//
+//go:systemstack
+func traceProcFree(pp *p) {
+ buf := pp.tracebuf
+ pp.tracebuf = 0
+ if buf == 0 {
+ return
+ }
+ lock(&trace.lock)
+ traceFullQueue(buf)
+ unlock(&trace.lock)
+}
+
+// traceFullQueue queues buf into queue of full buffers.
+func traceFullQueue(buf traceBufPtr) {
+ buf.ptr().link = 0
+ if trace.fullHead == 0 {
+ trace.fullHead = buf
+ } else {
+ trace.fullTail.ptr().link = buf
+ }
+ trace.fullTail = buf
+}
+
+// traceFullDequeue dequeues from queue of full buffers.
+func traceFullDequeue() traceBufPtr {
+ buf := trace.fullHead
+ if buf == 0 {
+ return 0
+ }
+ trace.fullHead = buf.ptr().link
+ if trace.fullHead == 0 {
+ trace.fullTail = 0
+ }
+ buf.ptr().link = 0
+ return buf
+}
+
+// traceEvent writes a single event to trace buffer, flushing the buffer if necessary.
+// ev is event type.
+// If skip > 0, write current stack id as the last argument (skipping skip top frames).
+// If skip = 0, this event type should contain a stack, but we don't want
+// to collect and remember it for this particular call.
+func traceEvent(ev byte, skip int, args ...uint64) {
+ mp, pid, bufp := traceAcquireBuffer()
+ // Double-check trace.enabled now that we've done m.locks++ and acquired bufLock.
+ // This protects from races between traceEvent and StartTrace/StopTrace.
+
+ // The caller checked that trace.enabled == true, but trace.enabled might have been
+ // turned off between the check and now. Check again. traceLockBuffer did mp.locks++,
+ // StopTrace does stopTheWorld, and stopTheWorld waits for mp.locks to go back to zero,
+ // so if we see trace.enabled == true now, we know it's true for the rest of the function.
+ // Exitsyscall can run even during stopTheWorld. The race with StartTrace/StopTrace
+ // during tracing in exitsyscall is resolved by locking trace.bufLock in traceLockBuffer.
+ //
+ // Note trace_userTaskCreate runs the same check.
+ if !trace.enabled && !mp.startingtrace {
+ traceReleaseBuffer(pid)
+ return
+ }
+
+ if skip > 0 {
+ if getg() == mp.curg {
+ skip++ // +1 because stack is captured in traceEventLocked.
+ }
+ }
+ traceEventLocked(0, mp, pid, bufp, ev, 0, skip, args...)
+ traceReleaseBuffer(pid)
+}
+
+// traceEventLocked writes a single event of type ev to the trace buffer bufp,
+// flushing the buffer if necessary. pid is the id of the current P, or
+// traceGlobProc if we're tracing without a real P.
+//
+// Preemption is disabled, and if running without a real P the global tracing
+// buffer is locked.
+//
+// Event types that do not include a stack set skip to -1. Event types that
+// include a stack may explicitly reference a stackID from the trace.stackTab
+// (obtained by an earlier call to traceStackID). Without an explicit stackID,
+// this function will automatically capture the stack of the goroutine currently
+// running on mp, skipping skip top frames or, if skip is 0, writing out an
+// empty stack record.
+//
+// It records the event's args to the traceBuf, and also makes an effort to
+// reserve extraBytes bytes of additional space immediately following the event,
+// in the same traceBuf.
+func traceEventLocked(extraBytes int, mp *m, pid int32, bufp *traceBufPtr, ev byte, stackID uint32, skip int, args ...uint64) {
+ buf := bufp.ptr()
+ // TODO: test on non-zero extraBytes param.
+ maxSize := 2 + 5*traceBytesPerNumber + extraBytes // event type, length, sequence, timestamp, stack id and two additional params
+ if buf == nil || len(buf.arr)-buf.pos < maxSize {
+ systemstack(func() {
+ buf = traceFlush(traceBufPtrOf(buf), pid).ptr()
+ })
+ bufp.set(buf)
+ }
+
+ // NOTE: ticks might be the same after tick division, even though the real
+ // cputicks values keep increasing.
+ ticks := uint64(cputicks()) / traceTickDiv
+ tickDiff := ticks - buf.lastTicks
+ if tickDiff == 0 {
+ ticks = buf.lastTicks + 1
+ tickDiff = 1
+ }
+
+ buf.lastTicks = ticks
+ narg := byte(len(args))
+ if stackID != 0 || skip >= 0 {
+ narg++
+ }
+ // We have only 2 bits for number of arguments.
+ // If number is >= 3, then the event type is followed by event length in bytes.
+ if narg > 3 {
+ narg = 3
+ }
+ startPos := buf.pos
+ buf.byte(ev | narg<<traceArgCountShift)
+ var lenp *byte
+ if narg == 3 {
+ // Reserve the byte for length assuming that length < 128.
+ buf.varint(0)
+ lenp = &buf.arr[buf.pos-1]
+ }
+ buf.varint(tickDiff)
+ for _, a := range args {
+ buf.varint(a)
+ }
+ if stackID != 0 {
+ buf.varint(uint64(stackID))
+ } else if skip == 0 {
+ buf.varint(0)
+ } else if skip > 0 {
+ buf.varint(traceStackID(mp, buf.stk[:], skip))
+ }
+ evSize := buf.pos - startPos
+ if evSize > maxSize {
+ throw("invalid length of trace event")
+ }
+ if lenp != nil {
+ // Fill in actual length.
+ *lenp = byte(evSize - 2)
+ }
+}
+
+// traceCPUSample writes a CPU profile sample stack to the execution tracer's
+// profiling buffer. It is called from a signal handler, so is limited in what
+// it can do.
+func traceCPUSample(gp *g, pp *p, stk []uintptr) {
+ if !trace.enabled {
+ // Tracing is usually turned off; don't spend time acquiring the signal
+ // lock unless it's active.
+ return
+ }
+
+ // Match the clock used in traceEventLocked
+ now := cputicks()
+ // The "header" here is the ID of the P that was running the profiled code,
+ // followed by the ID of the goroutine. (For normal CPU profiling, it's
+ // usually the number of samples with the given stack.) Near syscalls, pp
+ // may be nil. Reporting goid of 0 is fine for either g0 or a nil gp.
+ var hdr [2]uint64
+ if pp != nil {
+ // Overflow records in profBuf have all header values set to zero. Make
+ // sure that real headers have at least one bit set.
+ hdr[0] = uint64(pp.id)<<1 | 0b1
+ } else {
+ hdr[0] = 0b10
+ }
+ if gp != nil {
+ hdr[1] = gp.goid
+ }
+
+ // Allow only one writer at a time
+ for !trace.signalLock.CompareAndSwap(0, 1) {
+ // TODO: Is it safe to osyield here? https://go.dev/issue/52672
+ osyield()
+ }
+
+ if log := (*profBuf)(atomic.Loadp(unsafe.Pointer(&trace.cpuLogWrite))); log != nil {
+ // Note: we don't pass a tag pointer here (how should profiling tags
+ // interact with the execution tracer?), but if we did we'd need to be
+ // careful about write barriers. See the long comment in profBuf.write.
+ log.write(nil, now, hdr[:], stk)
+ }
+
+ trace.signalLock.Store(0)
+}
+
+func traceReadCPU() {
+ bufp := &trace.cpuLogBuf
+
+ for {
+ data, tags, _ := trace.cpuLogRead.read(profBufNonBlocking)
+ if len(data) == 0 {
+ break
+ }
+ for len(data) > 0 {
+ if len(data) < 4 || data[0] > uint64(len(data)) {
+ break // truncated profile
+ }
+ if data[0] < 4 || tags != nil && len(tags) < 1 {
+ break // malformed profile
+ }
+ if len(tags) < 1 {
+ break // mismatched profile records and tags
+ }
+ timestamp := data[1]
+ ppid := data[2] >> 1
+ if hasP := (data[2] & 0b1) != 0; !hasP {
+ ppid = ^uint64(0)
+ }
+ goid := data[3]
+ stk := data[4:data[0]]
+ empty := len(stk) == 1 && data[2] == 0 && data[3] == 0
+ data = data[data[0]:]
+ // No support here for reporting goroutine tags at the moment; if
+ // that information is to be part of the execution trace, we'd
+ // probably want to see when the tags are applied and when they
+ // change, instead of only seeing them when we get a CPU sample.
+ tags = tags[1:]
+
+ if empty {
+ // Looks like an overflow record from the profBuf. Not much to
+ // do here, we only want to report full records.
+ //
+ // TODO: should we start a goroutine to drain the profBuf,
+ // rather than relying on a high-enough volume of tracing events
+ // to keep ReadTrace busy? https://go.dev/issue/52674
+ continue
+ }
+
+ buf := bufp.ptr()
+ if buf == nil {
+ systemstack(func() {
+ *bufp = traceFlush(*bufp, 0)
+ })
+ buf = bufp.ptr()
+ }
+ for i := range stk {
+ if i >= len(buf.stk) {
+ break
+ }
+ buf.stk[i] = uintptr(stk[i])
+ }
+ stackID := trace.stackTab.put(buf.stk[:len(stk)])
+
+ traceEventLocked(0, nil, 0, bufp, traceEvCPUSample, stackID, 1, timestamp/traceTickDiv, ppid, goid)
+ }
+ }
+}
+
+func traceStackID(mp *m, buf []uintptr, skip int) uint64 {
+ gp := getg()
+ curgp := mp.curg
+ var nstk int
+ if curgp == gp {
+ nstk = callers(skip+1, buf)
+ } else if curgp != nil {
+ nstk = gcallers(curgp, skip, buf)
+ }
+ if nstk > 0 {
+ nstk-- // skip runtime.goexit
+ }
+ if nstk > 0 && curgp.goid == 1 {
+ nstk-- // skip runtime.main
+ }
+ id := trace.stackTab.put(buf[:nstk])
+ return uint64(id)
+}
+
+// traceAcquireBuffer returns trace buffer to use and, if necessary, locks it.
+func traceAcquireBuffer() (mp *m, pid int32, bufp *traceBufPtr) {
+ // Any time we acquire a buffer, we may end up flushing it,
+ // but flushes are rare. Record the lock edge even if it
+ // doesn't happen this time.
+ lockRankMayTraceFlush()
+
+ mp = acquirem()
+ if p := mp.p.ptr(); p != nil {
+ return mp, p.id, &p.tracebuf
+ }
+ lock(&trace.bufLock)
+ return mp, traceGlobProc, &trace.buf
+}
+
+// traceReleaseBuffer releases a buffer previously acquired with traceAcquireBuffer.
+func traceReleaseBuffer(pid int32) {
+ if pid == traceGlobProc {
+ unlock(&trace.bufLock)
+ }
+ releasem(getg().m)
+}
+
+// lockRankMayTraceFlush records the lock ranking effects of a
+// potential call to traceFlush.
+func lockRankMayTraceFlush() {
+ owner := trace.lockOwner
+ dolock := owner == nil || owner != getg().m.curg
+ if dolock {
+ lockWithRankMayAcquire(&trace.lock, getLockRank(&trace.lock))
+ }
+}
+
+// traceFlush puts buf onto stack of full buffers and returns an empty buffer.
+//
+// This must run on the system stack because it acquires trace.lock.
+//
+//go:systemstack
+func traceFlush(buf traceBufPtr, pid int32) traceBufPtr {
+ owner := trace.lockOwner
+ dolock := owner == nil || owner != getg().m.curg
+ if dolock {
+ lock(&trace.lock)
+ }
+ if buf != 0 {
+ traceFullQueue(buf)
+ }
+ if trace.empty != 0 {
+ buf = trace.empty
+ trace.empty = buf.ptr().link
+ } else {
+ buf = traceBufPtr(sysAlloc(unsafe.Sizeof(traceBuf{}), &memstats.other_sys))
+ if buf == 0 {
+ throw("trace: out of memory")
+ }
+ }
+ bufp := buf.ptr()
+ bufp.link.set(nil)
+ bufp.pos = 0
+
+ // initialize the buffer for a new batch
+ ticks := uint64(cputicks()) / traceTickDiv
+ if ticks == bufp.lastTicks {
+ ticks = bufp.lastTicks + 1
+ }
+ bufp.lastTicks = ticks
+ bufp.byte(traceEvBatch | 1<<traceArgCountShift)
+ bufp.varint(uint64(pid))
+ bufp.varint(ticks)
+
+ if dolock {
+ unlock(&trace.lock)
+ }
+ return buf
+}
+
+// traceString adds a string to the trace.strings and returns the id.
+func traceString(bufp *traceBufPtr, pid int32, s string) (uint64, *traceBufPtr) {
+ if s == "" {
+ return 0, bufp
+ }
+
+ lock(&trace.stringsLock)
+ if raceenabled {
+ // raceacquire is necessary because the map access
+ // below is race annotated.
+ raceacquire(unsafe.Pointer(&trace.stringsLock))
+ }
+
+ if id, ok := trace.strings[s]; ok {
+ if raceenabled {
+ racerelease(unsafe.Pointer(&trace.stringsLock))
+ }
+ unlock(&trace.stringsLock)
+
+ return id, bufp
+ }
+
+ trace.stringSeq++
+ id := trace.stringSeq
+ trace.strings[s] = id
+
+ if raceenabled {
+ racerelease(unsafe.Pointer(&trace.stringsLock))
+ }
+ unlock(&trace.stringsLock)
+
+ // The memory allocation above may trigger tracing and
+ // cause *bufp to change. The following code works with *bufp,
+ // so there must be no memory allocation or other activity
+ // that causes tracing after this point.
+
+ buf := bufp.ptr()
+ size := 1 + 2*traceBytesPerNumber + len(s)
+ if buf == nil || len(buf.arr)-buf.pos < size {
+ systemstack(func() {
+ buf = traceFlush(traceBufPtrOf(buf), pid).ptr()
+ bufp.set(buf)
+ })
+ }
+ buf.byte(traceEvString)
+ buf.varint(id)
+
+ // Double-check that the string and its length can fit.
+ // Otherwise, truncate the string.
+ slen := len(s)
+ if room := len(buf.arr) - buf.pos; room < slen+traceBytesPerNumber {
+ slen = room
+ }
+
+ buf.varint(uint64(slen))
+ buf.pos += copy(buf.arr[buf.pos:], s[:slen])
+
+ bufp.set(buf)
+ return id, bufp
+}
+
+// varint appends v to buf in little-endian-base-128 encoding.
+func (buf *traceBuf) varint(v uint64) {
+ pos := buf.pos
+ for ; v >= 0x80; v >>= 7 {
+ buf.arr[pos] = 0x80 | byte(v)
+ pos++
+ }
+ buf.arr[pos] = byte(v)
+ pos++
+ buf.pos = pos
+}
+
+// varintAt writes varint v at byte position pos in buf. This always
+// consumes traceBytesPerNumber bytes. This is intended for when the
+// caller needs to reserve space for a varint but can't populate it
+// until later.
+func (buf *traceBuf) varintAt(pos int, v uint64) {
+ for i := 0; i < traceBytesPerNumber; i++ {
+ if i < traceBytesPerNumber-1 {
+ buf.arr[pos] = 0x80 | byte(v)
+ } else {
+ buf.arr[pos] = byte(v)
+ }
+ v >>= 7
+ pos++
+ }
+}
+
+// byte appends v to buf.
+func (buf *traceBuf) byte(v byte) {
+ buf.arr[buf.pos] = v
+ buf.pos++
+}
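varint and varintAt above emit little-endian base-128 numbers; a matching decoder that a trace consumer might use, sketched here outside the commit (the tracewire package name is made up):

package tracewire

// readVarint decodes one little-endian base-128 value from p, mirroring
// traceBuf.varint. It returns the value and the number of bytes
// consumed, or (0, 0) if p ends in the middle of a number.
func readVarint(p []byte) (v uint64, n int) {
	for shift := uint(0); n < len(p); shift += 7 {
		b := p[n]
		n++
		v |= uint64(b&0x7f) << shift
		if b&0x80 == 0 {
			return v, n
		}
	}
	return 0, 0
}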
+
+// traceStackTable maps stack traces (arrays of PC's) to unique uint32 ids.
+// It is lock-free for reading.
+type traceStackTable struct {
+ lock mutex // Must be acquired on the system stack
+ seq uint32
+ mem traceAlloc
+ tab [1 << 13]traceStackPtr
+}
+
+// traceStack is a single stack in traceStackTable.
+type traceStack struct {
+ link traceStackPtr
+ hash uintptr
+ id uint32
+ n int
+ stk [0]uintptr // real type [n]uintptr
+}
+
+type traceStackPtr uintptr
+
+func (tp traceStackPtr) ptr() *traceStack { return (*traceStack)(unsafe.Pointer(tp)) }
+
+// stack returns slice of PCs.
+func (ts *traceStack) stack() []uintptr {
+ return (*[traceStackSize]uintptr)(unsafe.Pointer(&ts.stk))[:ts.n]
+}
+
+// put returns a unique id for the stack trace pcs and caches it in the table,
+// if it sees the trace for the first time.
+func (tab *traceStackTable) put(pcs []uintptr) uint32 {
+ if len(pcs) == 0 {
+ return 0
+ }
+ hash := memhash(unsafe.Pointer(&pcs[0]), 0, uintptr(len(pcs))*unsafe.Sizeof(pcs[0]))
+ // First, search the hashtable w/o the mutex.
+ if id := tab.find(pcs, hash); id != 0 {
+ return id
+ }
+ // Now, double check under the mutex.
+ // Switch to the system stack so we can acquire tab.lock
+ var id uint32
+ systemstack(func() {
+ lock(&tab.lock)
+ if id = tab.find(pcs, hash); id != 0 {
+ unlock(&tab.lock)
+ return
+ }
+ // Create new record.
+ tab.seq++
+ stk := tab.newStack(len(pcs))
+ stk.hash = hash
+ stk.id = tab.seq
+ id = stk.id
+ stk.n = len(pcs)
+ stkpc := stk.stack()
+ for i, pc := range pcs {
+ stkpc[i] = pc
+ }
+ part := int(hash % uintptr(len(tab.tab)))
+ stk.link = tab.tab[part]
+ atomicstorep(unsafe.Pointer(&tab.tab[part]), unsafe.Pointer(stk))
+ unlock(&tab.lock)
+ })
+ return id
+}
+
+// find checks if the stack trace pcs is already present in the table.
+func (tab *traceStackTable) find(pcs []uintptr, hash uintptr) uint32 {
+ part := int(hash % uintptr(len(tab.tab)))
+Search:
+ for stk := tab.tab[part].ptr(); stk != nil; stk = stk.link.ptr() {
+ if stk.hash == hash && stk.n == len(pcs) {
+ for i, stkpc := range stk.stack() {
+ if stkpc != pcs[i] {
+ continue Search
+ }
+ }
+ return stk.id
+ }
+ }
+ return 0
+}
+
+// newStack allocates a new stack of size n.
+func (tab *traceStackTable) newStack(n int) *traceStack {
+ return (*traceStack)(tab.mem.alloc(unsafe.Sizeof(traceStack{}) + uintptr(n)*goarch.PtrSize))
+}
+
+// traceFrames returns the frames corresponding to pcs. It may
+// allocate and may emit trace events.
+func traceFrames(bufp traceBufPtr, pcs []uintptr) ([]traceFrame, traceBufPtr) {
+ frames := make([]traceFrame, 0, len(pcs))
+ ci := CallersFrames(pcs)
+ for {
+ var frame traceFrame
+ f, more := ci.Next()
+ frame, bufp = traceFrameForPC(bufp, 0, f)
+ frames = append(frames, frame)
+ if !more {
+ return frames, bufp
+ }
+ }
+}
+
+// dump writes all previously cached stacks to trace buffers,
+// releases all memory and resets state.
+//
+// This must run on the system stack because it calls traceFlush.
+//
+//go:systemstack
+func (tab *traceStackTable) dump(bufp traceBufPtr) traceBufPtr {
+ for i := range tab.tab {
+ stk := tab.tab[i].ptr()
+ for ; stk != nil; stk = stk.link.ptr() {
+ var frames []traceFrame
+ frames, bufp = traceFrames(bufp, stk.stack())
+
+ // Estimate the size of this record. This
+ // bound is pretty loose, but avoids counting
+ // lots of varint sizes.
+ maxSize := 1 + traceBytesPerNumber + (2+4*len(frames))*traceBytesPerNumber
+ // Make sure we have enough buffer space.
+ if buf := bufp.ptr(); len(buf.arr)-buf.pos < maxSize {
+ bufp = traceFlush(bufp, 0)
+ }
+
+ // Emit header, with space reserved for length.
+ buf := bufp.ptr()
+ buf.byte(traceEvStack | 3<<traceArgCountShift)
+ lenPos := buf.pos
+ buf.pos += traceBytesPerNumber
+
+ // Emit body.
+ recPos := buf.pos
+ buf.varint(uint64(stk.id))
+ buf.varint(uint64(len(frames)))
+ for _, frame := range frames {
+ buf.varint(uint64(frame.PC))
+ buf.varint(frame.funcID)
+ buf.varint(frame.fileID)
+ buf.varint(frame.line)
+ }
+
+ // Fill in size header.
+ buf.varintAt(lenPos, uint64(buf.pos-recPos))
+ }
+ }
+
+ tab.mem.drop()
+ *tab = traceStackTable{}
+ lockInit(&((*tab).lock), lockRankTraceStackTab)
+
+ return bufp
+}
+
+type traceFrame struct {
+ PC uintptr
+ funcID uint64
+ fileID uint64
+ line uint64
+}
+
+// traceFrameForPC records the frame information.
+// It may allocate memory.
+func traceFrameForPC(buf traceBufPtr, pid int32, f Frame) (traceFrame, traceBufPtr) {
+ bufp := &buf
+ var frame traceFrame
+ frame.PC = f.PC
+
+ fn := f.Function
+ const maxLen = 1 << 10
+ if len(fn) > maxLen {
+ fn = fn[len(fn)-maxLen:]
+ }
+ frame.funcID, bufp = traceString(bufp, pid, fn)
+ frame.line = uint64(f.Line)
+ file := f.File
+ if len(file) > maxLen {
+ file = file[len(file)-maxLen:]
+ }
+ frame.fileID, bufp = traceString(bufp, pid, file)
+ return frame, (*bufp)
+}
+
+// traceAlloc is a non-thread-safe region allocator.
+// It holds a linked list of traceAllocBlock.
+type traceAlloc struct {
+ head traceAllocBlockPtr
+ off uintptr
+}
+
+// traceAllocBlock is a block in traceAlloc.
+//
+// traceAllocBlock is allocated from non-GC'd memory, so it must not
+// contain heap pointers. Writes to pointers to traceAllocBlocks do
+// not need write barriers.
+type traceAllocBlock struct {
+ _ sys.NotInHeap
+ next traceAllocBlockPtr
+ data [64<<10 - goarch.PtrSize]byte
+}
+
+// TODO: Since traceAllocBlock now embeds runtime/internal/sys.NotInHeap, this isn't necessary.
+type traceAllocBlockPtr uintptr
+
+func (p traceAllocBlockPtr) ptr() *traceAllocBlock { return (*traceAllocBlock)(unsafe.Pointer(p)) }
+func (p *traceAllocBlockPtr) set(x *traceAllocBlock) { *p = traceAllocBlockPtr(unsafe.Pointer(x)) }
+
+// alloc allocates n-byte block.
+func (a *traceAlloc) alloc(n uintptr) unsafe.Pointer {
+ n = alignUp(n, goarch.PtrSize)
+ if a.head == 0 || a.off+n > uintptr(len(a.head.ptr().data)) {
+ if n > uintptr(len(a.head.ptr().data)) {
+ throw("trace: alloc too large")
+ }
+ block := (*traceAllocBlock)(sysAlloc(unsafe.Sizeof(traceAllocBlock{}), &memstats.other_sys))
+ if block == nil {
+ throw("trace: out of memory")
+ }
+ block.next.set(a.head.ptr())
+ a.head.set(block)
+ a.off = 0
+ }
+ p := &a.head.ptr().data[a.off]
+ a.off += n
+ return unsafe.Pointer(p)
+}
+
+// drop frees all previously allocated memory and resets the allocator.
+func (a *traceAlloc) drop() {
+ for a.head != 0 {
+ block := a.head.ptr()
+ a.head.set(block.next.ptr())
+ sysFree(unsafe.Pointer(block), unsafe.Sizeof(traceAllocBlock{}), &memstats.other_sys)
+ }
+}
+
+// The following functions write specific events to trace.
+
+func traceGomaxprocs(procs int32) {
+ traceEvent(traceEvGomaxprocs, 1, uint64(procs))
+}
+
+func traceProcStart() {
+ traceEvent(traceEvProcStart, -1, uint64(getg().m.id))
+}
+
+func traceProcStop(pp *p) {
+ // Sysmon and stopTheWorld can stop Ps blocked in syscalls;
+ // to handle this we temporarily employ the P.
+ mp := acquirem()
+ oldp := mp.p
+ mp.p.set(pp)
+ traceEvent(traceEvProcStop, -1)
+ mp.p = oldp
+ releasem(mp)
+}
+
+func traceGCStart() {
+ traceEvent(traceEvGCStart, 3, trace.seqGC)
+ trace.seqGC++
+}
+
+func traceGCDone() {
+ traceEvent(traceEvGCDone, -1)
+}
+
+func traceGCSTWStart(kind int) {
+ traceEvent(traceEvGCSTWStart, -1, uint64(kind))
+}
+
+func traceGCSTWDone() {
+ traceEvent(traceEvGCSTWDone, -1)
+}
+
+// traceGCSweepStart prepares to trace a sweep loop. This does not
+// emit any events until traceGCSweepSpan is called.
+//
+// traceGCSweepStart must be paired with traceGCSweepDone and there
+// must be no preemption points between these two calls.
+func traceGCSweepStart() {
+ // Delay the actual GCSweepStart event until the first span
+ // sweep. If we don't sweep anything, don't emit any events.
+ pp := getg().m.p.ptr()
+ if pp.traceSweep {
+ throw("double traceGCSweepStart")
+ }
+ pp.traceSweep, pp.traceSwept, pp.traceReclaimed = true, 0, 0
+}
+
+// traceGCSweepSpan traces the sweep of a single page.
+//
+// This may be called outside a traceGCSweepStart/traceGCSweepDone
+// pair; however, it will not emit any trace events in this case.
+func traceGCSweepSpan(bytesSwept uintptr) {
+ pp := getg().m.p.ptr()
+ if pp.traceSweep {
+ if pp.traceSwept == 0 {
+ traceEvent(traceEvGCSweepStart, 1)
+ }
+ pp.traceSwept += bytesSwept
+ }
+}
+
+func traceGCSweepDone() {
+ pp := getg().m.p.ptr()
+ if !pp.traceSweep {
+ throw("missing traceGCSweepStart")
+ }
+ if pp.traceSwept != 0 {
+ traceEvent(traceEvGCSweepDone, -1, uint64(pp.traceSwept), uint64(pp.traceReclaimed))
+ }
+ pp.traceSweep = false
+}
+
+func traceGCMarkAssistStart() {
+ traceEvent(traceEvGCMarkAssistStart, 1)
+}
+
+func traceGCMarkAssistDone() {
+ traceEvent(traceEvGCMarkAssistDone, -1)
+}
+
+func traceGoCreate(newg *g, pc uintptr) {
+ newg.traceseq = 0
+ newg.tracelastp = getg().m.p
+ // +PCQuantum because traceFrameForPC expects return PCs and subtracts PCQuantum.
+ id := trace.stackTab.put([]uintptr{startPCforTrace(pc) + sys.PCQuantum})
+ traceEvent(traceEvGoCreate, 2, newg.goid, uint64(id))
+}
+
+func traceGoStart() {
+ gp := getg().m.curg
+ pp := gp.m.p
+ gp.traceseq++
+ if pp.ptr().gcMarkWorkerMode != gcMarkWorkerNotWorker {
+ traceEvent(traceEvGoStartLabel, -1, gp.goid, gp.traceseq, trace.markWorkerLabels[pp.ptr().gcMarkWorkerMode])
+ } else if gp.tracelastp == pp {
+ traceEvent(traceEvGoStartLocal, -1, gp.goid)
+ } else {
+ gp.tracelastp = pp
+ traceEvent(traceEvGoStart, -1, gp.goid, gp.traceseq)
+ }
+}
+
+func traceGoEnd() {
+ traceEvent(traceEvGoEnd, -1)
+}
+
+func traceGoSched() {
+ gp := getg()
+ gp.tracelastp = gp.m.p
+ traceEvent(traceEvGoSched, 1)
+}
+
+func traceGoPreempt() {
+ gp := getg()
+ gp.tracelastp = gp.m.p
+ traceEvent(traceEvGoPreempt, 1)
+}
+
+func traceGoPark(traceEv byte, skip int) {
+ if traceEv&traceFutileWakeup != 0 {
+ traceEvent(traceEvFutileWakeup, -1)
+ }
+ traceEvent(traceEv & ^traceFutileWakeup, skip)
+}
+
+func traceGoUnpark(gp *g, skip int) {
+ pp := getg().m.p
+ gp.traceseq++
+ if gp.tracelastp == pp {
+ traceEvent(traceEvGoUnblockLocal, skip, gp.goid)
+ } else {
+ gp.tracelastp = pp
+ traceEvent(traceEvGoUnblock, skip, gp.goid, gp.traceseq)
+ }
+}
+
+func traceGoSysCall() {
+ traceEvent(traceEvGoSysCall, 1)
+}
+
+func traceGoSysExit(ts int64) {
+ if ts != 0 && ts < trace.ticksStart {
+ // There is a race between the code that initializes sysexitticks
+ // (in exitsyscall, which runs without a P, and therefore is not
+ // stopped with the rest of the world) and the code that initializes
+ // a new trace. The recorded sysexitticks must therefore be treated
+ // as "best effort". If they are valid for this trace, then great,
+ // use them for greater accuracy. But if they're not valid for this
+ // trace, assume that the trace was started after the actual syscall
+ // exit (but before we actually managed to start the goroutine,
+ // aka right now), and assign a fresh time stamp to keep the log consistent.
+ ts = 0
+ }
+ gp := getg().m.curg
+ gp.traceseq++
+ gp.tracelastp = gp.m.p
+ traceEvent(traceEvGoSysExit, -1, gp.goid, gp.traceseq, uint64(ts)/traceTickDiv)
+}
+
+func traceGoSysBlock(pp *p) {
+ // Sysmon and stopTheWorld can declare syscalls running on remote Ps as blocked;
+ // to handle this we temporarily employ the P.
+ mp := acquirem()
+ oldp := mp.p
+ mp.p.set(pp)
+ traceEvent(traceEvGoSysBlock, -1)
+ mp.p = oldp
+ releasem(mp)
+}
+
+func traceHeapAlloc(live uint64) {
+ traceEvent(traceEvHeapAlloc, -1, live)
+}
+
+func traceHeapGoal() {
+ heapGoal := gcController.heapGoal()
+ if heapGoal == ^uint64(0) {
+ // Heap-based triggering is disabled.
+ traceEvent(traceEvHeapGoal, -1, 0)
+ } else {
+ traceEvent(traceEvHeapGoal, -1, heapGoal)
+ }
+}
+
+// To access runtime functions from runtime/trace.
+// See runtime/trace/annotation.go
+
+//go:linkname trace_userTaskCreate runtime/trace.userTaskCreate
+func trace_userTaskCreate(id, parentID uint64, taskType string) {
+ if !trace.enabled {
+ return
+ }
+
+ // Same as in traceEvent.
+ mp, pid, bufp := traceAcquireBuffer()
+ if !trace.enabled && !mp.startingtrace {
+ traceReleaseBuffer(pid)
+ return
+ }
+
+ typeStringID, bufp := traceString(bufp, pid, taskType)
+ traceEventLocked(0, mp, pid, bufp, traceEvUserTaskCreate, 0, 3, id, parentID, typeStringID)
+ traceReleaseBuffer(pid)
+}
+
+//go:linkname trace_userTaskEnd runtime/trace.userTaskEnd
+func trace_userTaskEnd(id uint64) {
+ traceEvent(traceEvUserTaskEnd, 2, id)
+}
+
+//go:linkname trace_userRegion runtime/trace.userRegion
+func trace_userRegion(id, mode uint64, name string) {
+ if !trace.enabled {
+ return
+ }
+
+ mp, pid, bufp := traceAcquireBuffer()
+ if !trace.enabled && !mp.startingtrace {
+ traceReleaseBuffer(pid)
+ return
+ }
+
+ nameStringID, bufp := traceString(bufp, pid, name)
+ traceEventLocked(0, mp, pid, bufp, traceEvUserRegion, 0, 3, id, mode, nameStringID)
+ traceReleaseBuffer(pid)
+}
+
+//go:linkname trace_userLog runtime/trace.userLog
+func trace_userLog(id uint64, category, message string) {
+ if !trace.enabled {
+ return
+ }
+
+ mp, pid, bufp := traceAcquireBuffer()
+ if !trace.enabled && !mp.startingtrace {
+ traceReleaseBuffer(pid)
+ return
+ }
+
+ categoryID, bufp := traceString(bufp, pid, category)
+
+ extraSpace := traceBytesPerNumber + len(message) // extraSpace for the value string
+ traceEventLocked(extraSpace, mp, pid, bufp, traceEvUserLog, 0, 3, id, categoryID)
+ // traceEventLocked reserved extra space for val and len(val)
+ // in buf, so buf now has room for the following.
+ buf := bufp.ptr()
+
+ // Double-check that the message and its length can fit.
+ // Otherwise, truncate the message.
+ slen := len(message)
+ if room := len(buf.arr) - buf.pos; room < slen+traceBytesPerNumber {
+ slen = room
+ }
+ buf.varint(uint64(slen))
+ buf.pos += copy(buf.arr[buf.pos:], message[:slen])
+
+ traceReleaseBuffer(pid)
+}
+
+// startPCforTrace returns the start PC of a goroutine for tracing purposes.
+// If pc is a wrapper, it returns the PC of the wrapped function. Otherwise it returns pc.
+func startPCforTrace(pc uintptr) uintptr {
+ f := findfunc(pc)
+ if !f.valid() {
+ return pc // may happen for locked g in extra M since its pc is 0.
+ }
+ w := funcdata(f, _FUNCDATA_WrapInfo)
+ if w == nil {
+ return pc // not a wrapper
+ }
+ return f.datap.textAddr(*(*uint32)(w))
+}
diff --git a/src/runtime/trace/annotation.go b/src/runtime/trace/annotation.go
new file mode 100644
index 0000000..d47cb85
--- /dev/null
+++ b/src/runtime/trace/annotation.go
@@ -0,0 +1,198 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package trace
+
+import (
+ "context"
+ "fmt"
+ "sync/atomic"
+ _ "unsafe"
+)
+
+type traceContextKey struct{}
+
+// NewTask creates a task instance with the type taskType and returns
+// it along with a Context that carries the task.
+// If the input context contains a task, the new task is its subtask.
+//
+// The taskType is used to classify task instances. Analysis tools
+// like the Go execution tracer may assume there are only a bounded
+// number of unique task types in the system.
+//
+// The returned Task's End method is used to mark the task's end.
+// The trace tool measures task latency as the time between task creation
+// and when the End method is called, and provides the latency
+// distribution per task type.
+// If the End method is called multiple times, only the first
+// call is used in the latency measurement.
+//
+// ctx, task := trace.NewTask(ctx, "awesomeTask")
+// trace.WithRegion(ctx, "preparation", prepWork)
+// // preparation of the task
+// go func() { // continue processing the task in a separate goroutine.
+// defer task.End()
+// trace.WithRegion(ctx, "remainingWork", remainingWork)
+// }()
+func NewTask(pctx context.Context, taskType string) (ctx context.Context, task *Task) {
+ pid := fromContext(pctx).id
+ id := newID()
+ userTaskCreate(id, pid, taskType)
+ s := &Task{id: id}
+ return context.WithValue(pctx, traceContextKey{}, s), s
+
+ // We allocate a new task even when
+ // tracing is disabled because the context and the task's End
+ // method can be used across trace enable/disable boundaries,
+ // which complicates the problem.
+ //
+ // For example, consider the following scenario:
+ // - trace is enabled.
+ // - trace.WithRegion is called, so a new context ctx
+ // with a new region is created.
+ // - trace is disabled.
+ // - trace is enabled again.
+ // - trace APIs are called with the ctx. Is the ID in the task
+ // a valid one to use?
+ //
+ // TODO(hyangah): reduce the overhead at least when
+ // tracing is disabled. Maybe the id can embed a tracing
+ // round number and ignore ids generated from previous
+ // tracing round.
+}
+
+func fromContext(ctx context.Context) *Task {
+ if s, ok := ctx.Value(traceContextKey{}).(*Task); ok {
+ return s
+ }
+ return &bgTask
+}
+
+// Task is a data type for tracing a user-defined, logical operation.
+type Task struct {
+ id uint64
+ // TODO(hyangah): record parent id?
+}
+
+// End marks the end of the operation represented by the Task.
+func (t *Task) End() {
+ userTaskEnd(t.id)
+}
+
+var lastTaskID uint64 = 0 // task id issued last time
+
+func newID() uint64 {
+ // TODO(hyangah): use per-P cache
+ return atomic.AddUint64(&lastTaskID, 1)
+}
+
+var bgTask = Task{id: uint64(0)}
+
+// Log emits a one-off event with the given category and message.
+// Category can be empty and the API assumes there are only a handful of
+// unique categories in the system.
+func Log(ctx context.Context, category, message string) {
+ id := fromContext(ctx).id
+ userLog(id, category, message)
+}
+
+// Logf is like Log, but the value is formatted using the specified format spec.
+func Logf(ctx context.Context, category, format string, args ...any) {
+ if IsEnabled() {
+ // Ideally this should be just Log, but that will
+ // add one more frame in the stack trace.
+ id := fromContext(ctx).id
+ userLog(id, category, fmt.Sprintf(format, args...))
+ }
+}
+
+const (
+ regionStartCode = uint64(0)
+ regionEndCode = uint64(1)
+)
+
+// WithRegion starts a region associated with its calling goroutine, runs fn,
+// and then ends the region. If the context carries a task, the region is
+// associated with the task. Otherwise, the region is attached to the background
+// task.
+//
+// The regionType is used to classify regions, so there should be only a
+// handful of unique region types.
+func WithRegion(ctx context.Context, regionType string, fn func()) {
+ // NOTE:
+ // WithRegion helps avoid misuse of the API, but in practice
+ // it is quite restrictive:
+ // - Use of WithRegion means the stack traces captured at
+ //   region start and end are identical.
+ // - Refactoring existing code to use WithRegion is sometimes
+ //   hard and makes the code less readable,
+ //     e.g. a code block nested deep in a loop with various
+ //          exit points that return values.
+ // - Refactoring code to use this API with a closure can
+ //   cause different GC behavior, such as retaining some parameters
+ //   longer.
+ // This causes more churn in code than hoped, and sometimes
+ // makes the code less readable.
+
+ id := fromContext(ctx).id
+ userRegion(id, regionStartCode, regionType)
+ defer userRegion(id, regionEndCode, regionType)
+ fn()
+}
+
+// StartRegion starts a region and returns a function for marking the
+// end of the region. The returned Region's End function must be called
+// from the same goroutine where the region was started.
+// Within each goroutine, regions must nest. That is, regions started
+// after this region must be ended before this region can be ended.
+// Recommended usage is
+//
+// defer trace.StartRegion(ctx, "myTracedRegion").End()
+func StartRegion(ctx context.Context, regionType string) *Region {
+ if !IsEnabled() {
+ return noopRegion
+ }
+ id := fromContext(ctx).id
+ userRegion(id, regionStartCode, regionType)
+ return &Region{id, regionType}
+}
+
+// Region is a region of code whose execution time interval is traced.
+type Region struct {
+ id uint64
+ regionType string
+}
+
+var noopRegion = &Region{}
+
+// End marks the end of the traced code region.
+func (r *Region) End() {
+ if r == noopRegion {
+ return
+ }
+ userRegion(r.id, regionEndCode, r.regionType)
+}
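Given the restrictions listed in the NOTE inside WithRegion, StartRegion with a deferred End is the usual alternative for functions with several exit points. A minimal sketch; the function and region name are illustrative.

package main

import (
	"context"
	"runtime/trace"
)

// firstEven has multiple return points, which is awkward to wrap in a
// WithRegion closure; the deferred End covers every exit.
func firstEven(ctx context.Context, xs []int) (int, bool) {
	defer trace.StartRegion(ctx, "firstEven").End()
	for _, x := range xs {
		if x%2 == 0 {
			return x, true
		}
	}
	return 0, false
}

func main() {
	firstEven(context.Background(), []int{1, 3, 4})
}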
+
+// IsEnabled reports whether tracing is enabled.
+// The information is advisory only. The tracing status
+// may have changed by the time this function returns.
+func IsEnabled() bool {
+ return tracing.enabled.Load()
+}
+
+//
+// Function bodies are defined in runtime/trace.go
+//
+
+// emits UserTaskCreate event.
+func userTaskCreate(id, parentID uint64, taskType string)
+
+// emits UserTaskEnd event.
+func userTaskEnd(id uint64)
+
+// emits UserRegion event.
+func userRegion(id, mode uint64, regionType string)
+
+// emits UserLog event.
+func userLog(id uint64, category, message string)
diff --git a/src/runtime/trace/annotation_test.go b/src/runtime/trace/annotation_test.go
new file mode 100644
index 0000000..69ea8f2
--- /dev/null
+++ b/src/runtime/trace/annotation_test.go
@@ -0,0 +1,156 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package trace_test
+
+import (
+ "bytes"
+ "context"
+ "fmt"
+ "internal/trace"
+ "reflect"
+ . "runtime/trace"
+ "strings"
+ "sync"
+ "testing"
+)
+
+func BenchmarkStartRegion(b *testing.B) {
+ b.ReportAllocs()
+ ctx, task := NewTask(context.Background(), "benchmark")
+ defer task.End()
+
+ b.RunParallel(func(pb *testing.PB) {
+ for pb.Next() {
+ StartRegion(ctx, "region").End()
+ }
+ })
+}
+
+func BenchmarkNewTask(b *testing.B) {
+ b.ReportAllocs()
+ pctx, task := NewTask(context.Background(), "benchmark")
+ defer task.End()
+
+ b.RunParallel(func(pb *testing.PB) {
+ for pb.Next() {
+ _, task := NewTask(pctx, "task")
+ task.End()
+ }
+ })
+}
+
+func TestUserTaskRegion(t *testing.T) {
+ if IsEnabled() {
+ t.Skip("skipping because -test.trace is set")
+ }
+ bgctx, cancel := context.WithCancel(context.Background())
+ defer cancel()
+
+ preExistingRegion := StartRegion(bgctx, "pre-existing region")
+
+ buf := new(bytes.Buffer)
+ if err := Start(buf); err != nil {
+ t.Fatalf("failed to start tracing: %v", err)
+ }
+
+ // Beginning of traced execution
+ var wg sync.WaitGroup
+ ctx, task := NewTask(bgctx, "task0") // EvUserTaskCreate("task0")
+ wg.Add(1)
+ go func() {
+ defer wg.Done()
+ defer task.End() // EvUserTaskEnd("task0")
+
+ WithRegion(ctx, "region0", func() {
+ // EvUserRegionCreate("region0", start)
+ WithRegion(ctx, "region1", func() {
+ Log(ctx, "key0", "0123456789abcdef") // EvUserLog("task0", "key0", "0....f")
+ })
+ // EvUserRegion("region0", end)
+ })
+ }()
+
+ wg.Wait()
+
+ preExistingRegion.End()
+ postExistingRegion := StartRegion(bgctx, "post-existing region")
+
+ // End of traced execution
+ Stop()
+
+ postExistingRegion.End()
+
+ saveTrace(t, buf, "TestUserTaskRegion")
+ res, err := trace.Parse(buf, "")
+ if err == trace.ErrTimeOrder {
+ // golang.org/issues/16755
+ t.Skipf("skipping trace: %v", err)
+ }
+ if err != nil {
+ t.Fatalf("Parse failed: %v", err)
+ }
+
+ // Check whether we see all user annotation related records in order
+ type testData struct {
+ typ byte
+ strs []string
+ args []uint64
+ setLink bool
+ }
+
+ var got []testData
+ tasks := map[uint64]string{}
+ for _, e := range res.Events {
+ t.Logf("%s", e)
+ switch e.Type {
+ case trace.EvUserTaskCreate:
+ taskName := e.SArgs[0]
+ got = append(got, testData{trace.EvUserTaskCreate, []string{taskName}, nil, e.Link != nil})
+ if e.Link != nil && e.Link.Type != trace.EvUserTaskEnd {
+ t.Errorf("Unexpected linked event %q->%q", e, e.Link)
+ }
+ tasks[e.Args[0]] = taskName
+ case trace.EvUserLog:
+ key, val := e.SArgs[0], e.SArgs[1]
+ taskName := tasks[e.Args[0]]
+ got = append(got, testData{trace.EvUserLog, []string{taskName, key, val}, nil, e.Link != nil})
+ case trace.EvUserTaskEnd:
+ taskName := tasks[e.Args[0]]
+ got = append(got, testData{trace.EvUserTaskEnd, []string{taskName}, nil, e.Link != nil})
+ if e.Link != nil && e.Link.Type != trace.EvUserTaskCreate {
+ t.Errorf("Unexpected linked event %q->%q", e, e.Link)
+ }
+ case trace.EvUserRegion:
+ taskName := tasks[e.Args[0]]
+ regionName := e.SArgs[0]
+ got = append(got, testData{trace.EvUserRegion, []string{taskName, regionName}, []uint64{e.Args[1]}, e.Link != nil})
+ if e.Link != nil && (e.Link.Type != trace.EvUserRegion || e.Link.SArgs[0] != regionName) {
+ t.Errorf("Unexpected linked event %q->%q", e, e.Link)
+ }
+ }
+ }
+ want := []testData{
+ {trace.EvUserTaskCreate, []string{"task0"}, nil, true},
+ {trace.EvUserRegion, []string{"task0", "region0"}, []uint64{0}, true},
+ {trace.EvUserRegion, []string{"task0", "region1"}, []uint64{0}, true},
+ {trace.EvUserLog, []string{"task0", "key0", "0123456789abcdef"}, nil, false},
+ {trace.EvUserRegion, []string{"task0", "region1"}, []uint64{1}, false},
+ {trace.EvUserRegion, []string{"task0", "region0"}, []uint64{1}, false},
+ {trace.EvUserTaskEnd, []string{"task0"}, nil, false},
+ // Currently, a pre-existing region is not recorded, to avoid allocations.
+ // {trace.EvUserRegion, []string{"", "pre-existing region"}, []uint64{1}, false},
+ {trace.EvUserRegion, []string{"", "post-existing region"}, []uint64{0}, false},
+ }
+ if !reflect.DeepEqual(got, want) {
+ pretty := func(data []testData) string {
+ var s strings.Builder
+ for _, d := range data {
+ fmt.Fprintf(&s, "\t%+v\n", d)
+ }
+ return s.String()
+ }
+ t.Errorf("Got user region related events\n%+v\nwant:\n%+v", pretty(got), pretty(want))
+ }
+}
diff --git a/src/runtime/trace/example_test.go b/src/runtime/trace/example_test.go
new file mode 100644
index 0000000..ba96a82
--- /dev/null
+++ b/src/runtime/trace/example_test.go
@@ -0,0 +1,39 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package trace_test
+
+import (
+ "fmt"
+ "log"
+ "os"
+ "runtime/trace"
+)
+
+// Example demonstrates the use of the trace package to trace
+// the execution of a Go program. The trace output will be
+// written to the file trace.out.
+func Example() {
+ f, err := os.Create("trace.out")
+ if err != nil {
+ log.Fatalf("failed to create trace output file: %v", err)
+ }
+ defer func() {
+ if err := f.Close(); err != nil {
+ log.Fatalf("failed to close trace file: %v", err)
+ }
+ }()
+
+ if err := trace.Start(f); err != nil {
+ log.Fatalf("failed to start trace: %v", err)
+ }
+ defer trace.Stop()
+
+ // your program here
+ RunMyProgram()
+}
+
+func RunMyProgram() {
+ fmt.Printf("this function will be traced")
+}
diff --git a/src/runtime/trace/trace.go b/src/runtime/trace/trace.go
new file mode 100644
index 0000000..86c97e2
--- /dev/null
+++ b/src/runtime/trace/trace.go
@@ -0,0 +1,154 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package trace contains facilities for programs to generate traces
+// for the Go execution tracer.
+//
+// # Tracing runtime activities
+//
+// The execution trace captures a wide range of execution events such as
+// goroutine creation/blocking/unblocking, syscall enter/exit/block,
+// GC-related events, changes of heap size, processor start/stop, etc.
+// When CPU profiling is active, the execution tracer makes an effort to
+// include those samples as well.
+// A precise nanosecond-precision timestamp and a stack trace are
+// captured for most events. The generated trace can be interpreted
+// using `go tool trace`.
+//
+// Support for tracing tests and benchmarks built with the standard
+// testing package is built into `go test`. For example, the following
+// command runs the test in the current directory and writes the trace
+// file (trace.out).
+//
+// go test -trace=trace.out
+//
+// This runtime/trace package provides APIs to add equivalent tracing
+// support to a standalone program. See the Example that demonstrates
+// how to use this API to enable tracing.
+//
+// There is also a standard HTTP interface to trace data. Adding the
+// following line will install a handler under the /debug/pprof/trace URL
+// to download a live trace:
+//
+// import _ "net/http/pprof"
+//
+// See the net/http/pprof package for more details about all of the
+// debug endpoints installed by this import.
+//
+// # User annotation
+//
+// Package trace provides user annotation APIs that can be used to
+// log interesting events during execution.
+//
+// There are three types of user annotations: log messages, regions,
+// and tasks.
+//
+// Log emits a timestamped message to the execution trace along with
+// additional information such as the category of the message and
+// which goroutine called Log. The execution tracer provides UIs to filter
+// and group goroutines using the log category and the message supplied
+// in Log.
+//
+// A region is for logging a time interval during a goroutine's execution.
+// By definition, a region starts and ends in the same goroutine.
+// Regions can be nested to represent subintervals.
+// For example, the following code records four regions in the execution
+// trace to trace the durations of sequential steps in a cappuccino making
+// operation.
+//
+// trace.WithRegion(ctx, "makeCappuccino", func() {
+//
+// // orderID allows identifying a specific order
+// // among many cappuccino order region records.
+// trace.Log(ctx, "orderID", orderID)
+//
+// trace.WithRegion(ctx, "steamMilk", steamMilk)
+// trace.WithRegion(ctx, "extractCoffee", extractCoffee)
+// trace.WithRegion(ctx, "mixMilkCoffee", mixMilkCoffee)
+// })
+//
+// A task is a higher-level component that aids tracing of logical
+// operations such as an RPC request, an HTTP request, or an
+// interesting local operation which may require multiple goroutines
+// working together. Since tasks can involve multiple goroutines,
+// they are tracked via a context.Context object. NewTask creates
+// a new task and embeds it in the returned context.Context object.
+// Log messages and regions are attached to the task, if any, in the
+// Context passed to Log and WithRegion.
+//
+// For example, assume that we decided to froth milk, extract coffee,
+// and mix milk and coffee in separate goroutines. With a task,
+// the trace tool can identify the goroutines involved in a specific
+// cappuccino order.
+//
+// ctx, task := trace.NewTask(ctx, "makeCappuccino")
+// trace.Log(ctx, "orderID", orderID)
+//
+// milk := make(chan bool)
+// espresso := make(chan bool)
+//
+// go func() {
+// trace.WithRegion(ctx, "steamMilk", steamMilk)
+// milk <- true
+// }()
+// go func() {
+// trace.WithRegion(ctx, "extractCoffee", extractCoffee)
+// espresso <- true
+// }()
+// go func() {
+// defer task.End() // When assembly is done, the order is complete.
+// <-espresso
+// <-milk
+// trace.WithRegion(ctx, "mixMilkCoffee", mixMilkCoffee)
+// }()
+//
+// The trace tool computes the latency of a task by measuring the
+// time between the task creation and the task end and provides
+// latency distributions for each task type found in the trace.
+package trace
+
+import (
+ "io"
+ "runtime"
+ "sync"
+ "sync/atomic"
+)
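The package comment above mentions the net/http/pprof handler for downloading a live trace; here is a minimal, self-contained sketch of serving it. The listen address and the seconds parameter in the example URL are illustrative.

package main

import (
	"log"
	"net/http"
	_ "net/http/pprof" // installs /debug/pprof/ handlers, including /debug/pprof/trace
)

func main() {
	// A live trace can then be downloaded with, for example:
	//   curl -o trace.out 'http://localhost:6060/debug/pprof/trace?seconds=5'
	// and inspected with: go tool trace trace.out
	log.Fatal(http.ListenAndServe("localhost:6060", nil))
}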
+
+// Start enables tracing for the current program.
+// While tracing, the trace will be buffered and written to w.
+// Start returns an error if tracing is already enabled.
+func Start(w io.Writer) error {
+ tracing.Lock()
+ defer tracing.Unlock()
+
+ if err := runtime.StartTrace(); err != nil {
+ return err
+ }
+ go func() {
+ for {
+ data := runtime.ReadTrace()
+ if data == nil {
+ break
+ }
+ w.Write(data)
+ }
+ }()
+ tracing.enabled.Store(true)
+ return nil
+}
+
+// Stop stops the current tracing, if any.
+// Stop only returns after all the writes for the trace have completed.
+func Stop() {
+ tracing.Lock()
+ defer tracing.Unlock()
+ tracing.enabled.Store(false)
+
+ runtime.StopTrace()
+}
+
+var tracing struct {
+ sync.Mutex // gate mutators (Start, Stop)
+ enabled atomic.Bool
+}
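The tracing variable pairs a mutex that gates the mutators with an atomic.Bool for cheap reads. A stand-alone sketch of the same pattern, using a hypothetical recorder type, to make the division of labor explicit.

package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

// recorder is a hypothetical stand-in: the mutex serializes the mutators
// (Start/Stop) while the atomic.Bool lets readers check the state cheaply,
// and only advisorily, without taking the lock.
type recorder struct {
	mu      sync.Mutex
	enabled atomic.Bool
}

func (r *recorder) Start() {
	r.mu.Lock()
	defer r.mu.Unlock()
	r.enabled.Store(true)
}

func (r *recorder) Stop() {
	r.mu.Lock()
	defer r.mu.Unlock()
	r.enabled.Store(false)
}

func (r *recorder) Enabled() bool {
	return r.enabled.Load()
}

func main() {
	var r recorder
	r.Start()
	fmt.Println(r.Enabled()) // true
	r.Stop()
	fmt.Println(r.Enabled()) // false
}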
diff --git a/src/runtime/trace/trace_stack_test.go b/src/runtime/trace/trace_stack_test.go
new file mode 100644
index 0000000..be3adc9
--- /dev/null
+++ b/src/runtime/trace/trace_stack_test.go
@@ -0,0 +1,333 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package trace_test
+
+import (
+ "bytes"
+ "fmt"
+ "internal/testenv"
+ "internal/trace"
+ "net"
+ "os"
+ "runtime"
+ . "runtime/trace"
+ "strings"
+ "sync"
+ "testing"
+ "text/tabwriter"
+ "time"
+)
+
+// TestTraceSymbolize tests symbolization and that events have proper stacks.
+// In particular, it checks that we strip uninteresting bottom frames like
+// goexit and uninteresting top frames (runtime guts).
+func TestTraceSymbolize(t *testing.T) {
+ skipTraceSymbolizeTestIfNecessary(t)
+
+ buf := new(bytes.Buffer)
+ if err := Start(buf); err != nil {
+ t.Fatalf("failed to start tracing: %v", err)
+ }
+ defer Stop() // in case of early return
+
+ // Now we will do a bunch of things for which we verify stacks later.
+ // It is impossible to ensure that a goroutine has actually blocked
+ // on a channel, in a select or otherwise. So we kick off goroutines
+ // that need to block first in the hope that while we are executing
+ // the rest of the test, they will block.
+ go func() { // func1
+ select {}
+ }()
+ go func() { // func2
+ var c chan int
+ c <- 0
+ }()
+ go func() { // func3
+ var c chan int
+ <-c
+ }()
+ done1 := make(chan bool)
+ go func() { // func4
+ <-done1
+ }()
+ done2 := make(chan bool)
+ go func() { // func5
+ done2 <- true
+ }()
+ c1 := make(chan int)
+ c2 := make(chan int)
+ go func() { // func6
+ select {
+ case <-c1:
+ case <-c2:
+ }
+ }()
+ var mu sync.Mutex
+ mu.Lock()
+ go func() { // func7
+ mu.Lock()
+ mu.Unlock()
+ }()
+ var wg sync.WaitGroup
+ wg.Add(1)
+ go func() { // func8
+ wg.Wait()
+ }()
+ cv := sync.NewCond(&sync.Mutex{})
+ go func() { // func9
+ cv.L.Lock()
+ cv.Wait()
+ cv.L.Unlock()
+ }()
+ ln, err := net.Listen("tcp", "127.0.0.1:0")
+ if err != nil {
+ t.Fatalf("failed to listen: %v", err)
+ }
+ go func() { // func10
+ c, err := ln.Accept()
+ if err != nil {
+ t.Errorf("failed to accept: %v", err)
+ return
+ }
+ c.Close()
+ }()
+ rp, wp, err := os.Pipe()
+ if err != nil {
+ t.Fatalf("failed to create a pipe: %v", err)
+ }
+ defer rp.Close()
+ defer wp.Close()
+ pipeReadDone := make(chan bool)
+ go func() { // func11
+ var data [1]byte
+ rp.Read(data[:])
+ pipeReadDone <- true
+ }()
+
+ time.Sleep(100 * time.Millisecond)
+ runtime.GC()
+ runtime.Gosched()
+ time.Sleep(100 * time.Millisecond) // the last chance for the goroutines above to block
+ done1 <- true
+ <-done2
+ select {
+ case c1 <- 0:
+ case c2 <- 0:
+ }
+ mu.Unlock()
+ wg.Done()
+ cv.Signal()
+ c, err := net.Dial("tcp", ln.Addr().String())
+ if err != nil {
+ t.Fatalf("failed to dial: %v", err)
+ }
+ c.Close()
+ var data [1]byte
+ wp.Write(data[:])
+ <-pipeReadDone
+
+ oldGoMaxProcs := runtime.GOMAXPROCS(0)
+ runtime.GOMAXPROCS(oldGoMaxProcs + 1)
+
+ Stop()
+
+ runtime.GOMAXPROCS(oldGoMaxProcs)
+
+ events, _ := parseTrace(t, buf)
+
+ // Now check that the stacks are correct.
+ type eventDesc struct {
+ Type byte
+ Stk []frame
+ }
+ want := []eventDesc{
+ {trace.EvGCStart, []frame{
+ {"runtime.GC", 0},
+ {"runtime/trace_test.TestTraceSymbolize", 0},
+ {"testing.tRunner", 0},
+ }},
+ {trace.EvGoStart, []frame{
+ {"runtime/trace_test.TestTraceSymbolize.func1", 0},
+ }},
+ {trace.EvGoSched, []frame{
+ {"runtime/trace_test.TestTraceSymbolize", 111},
+ {"testing.tRunner", 0},
+ }},
+ {trace.EvGoCreate, []frame{
+ {"runtime/trace_test.TestTraceSymbolize", 40},
+ {"testing.tRunner", 0},
+ }},
+ {trace.EvGoStop, []frame{
+ {"runtime.block", 0},
+ {"runtime/trace_test.TestTraceSymbolize.func1", 0},
+ }},
+ {trace.EvGoStop, []frame{
+ {"runtime.chansend1", 0},
+ {"runtime/trace_test.TestTraceSymbolize.func2", 0},
+ }},
+ {trace.EvGoStop, []frame{
+ {"runtime.chanrecv1", 0},
+ {"runtime/trace_test.TestTraceSymbolize.func3", 0},
+ }},
+ {trace.EvGoBlockRecv, []frame{
+ {"runtime.chanrecv1", 0},
+ {"runtime/trace_test.TestTraceSymbolize.func4", 0},
+ }},
+ {trace.EvGoUnblock, []frame{
+ {"runtime.chansend1", 0},
+ {"runtime/trace_test.TestTraceSymbolize", 113},
+ {"testing.tRunner", 0},
+ }},
+ {trace.EvGoBlockSend, []frame{
+ {"runtime.chansend1", 0},
+ {"runtime/trace_test.TestTraceSymbolize.func5", 0},
+ }},
+ {trace.EvGoUnblock, []frame{
+ {"runtime.chanrecv1", 0},
+ {"runtime/trace_test.TestTraceSymbolize", 114},
+ {"testing.tRunner", 0},
+ }},
+ {trace.EvGoBlockSelect, []frame{
+ {"runtime.selectgo", 0},
+ {"runtime/trace_test.TestTraceSymbolize.func6", 0},
+ }},
+ {trace.EvGoUnblock, []frame{
+ {"runtime.selectgo", 0},
+ {"runtime/trace_test.TestTraceSymbolize", 115},
+ {"testing.tRunner", 0},
+ }},
+ {trace.EvGoBlockSync, []frame{
+ {"sync.(*Mutex).Lock", 0},
+ {"runtime/trace_test.TestTraceSymbolize.func7", 0},
+ }},
+ {trace.EvGoUnblock, []frame{
+ {"sync.(*Mutex).Unlock", 0},
+ {"runtime/trace_test.TestTraceSymbolize", 0},
+ {"testing.tRunner", 0},
+ }},
+ {trace.EvGoBlockSync, []frame{
+ {"sync.(*WaitGroup).Wait", 0},
+ {"runtime/trace_test.TestTraceSymbolize.func8", 0},
+ }},
+ {trace.EvGoUnblock, []frame{
+ {"sync.(*WaitGroup).Add", 0},
+ {"sync.(*WaitGroup).Done", 0},
+ {"runtime/trace_test.TestTraceSymbolize", 120},
+ {"testing.tRunner", 0},
+ }},
+ {trace.EvGoBlockCond, []frame{
+ {"sync.(*Cond).Wait", 0},
+ {"runtime/trace_test.TestTraceSymbolize.func9", 0},
+ }},
+ {trace.EvGoUnblock, []frame{
+ {"sync.(*Cond).Signal", 0},
+ {"runtime/trace_test.TestTraceSymbolize", 0},
+ {"testing.tRunner", 0},
+ }},
+ {trace.EvGoSleep, []frame{
+ {"time.Sleep", 0},
+ {"runtime/trace_test.TestTraceSymbolize", 0},
+ {"testing.tRunner", 0},
+ }},
+ {trace.EvGomaxprocs, []frame{
+ {"runtime.startTheWorld", 0}, // this is when the current gomaxprocs is logged.
+ {"runtime.startTheWorldGC", 0},
+ {"runtime.GOMAXPROCS", 0},
+ {"runtime/trace_test.TestTraceSymbolize", 0},
+ {"testing.tRunner", 0},
+ }},
+ }
+ // Stacks for the following events are OS-dependent due to OS-specific code in net package.
+ if runtime.GOOS != "windows" && runtime.GOOS != "plan9" {
+ want = append(want, []eventDesc{
+ {trace.EvGoBlockNet, []frame{
+ {"internal/poll.(*FD).Accept", 0},
+ {"net.(*netFD).accept", 0},
+ {"net.(*TCPListener).accept", 0},
+ {"net.(*TCPListener).Accept", 0},
+ {"runtime/trace_test.TestTraceSymbolize.func10", 0},
+ }},
+ {trace.EvGoSysCall, []frame{
+ {"syscall.read", 0},
+ {"syscall.Read", 0},
+ {"internal/poll.ignoringEINTRIO", 0},
+ {"internal/poll.(*FD).Read", 0},
+ {"os.(*File).read", 0},
+ {"os.(*File).Read", 0},
+ {"runtime/trace_test.TestTraceSymbolize.func11", 0},
+ }},
+ }...)
+ }
+ matched := make([]bool, len(want))
+ for _, ev := range events {
+ wantLoop:
+ for i, w := range want {
+ if matched[i] || w.Type != ev.Type || len(w.Stk) != len(ev.Stk) {
+ continue
+ }
+
+ for fi, f := range ev.Stk {
+ wf := w.Stk[fi]
+ if wf.Fn != f.Fn || wf.Line != 0 && wf.Line != f.Line {
+ continue wantLoop
+ }
+ }
+ matched[i] = true
+ }
+ }
+ for i, w := range want {
+ if matched[i] {
+ continue
+ }
+ seen, n := dumpEventStacks(w.Type, events)
+ t.Errorf("Did not match event %v with stack\n%s\nSeen %d events of the type\n%s",
+ trace.EventDescriptions[w.Type].Name, dumpFrames(w.Stk), n, seen)
+ }
+}
+
+func skipTraceSymbolizeTestIfNecessary(t *testing.T) {
+ testenv.MustHaveGoBuild(t)
+ if IsEnabled() {
+ t.Skip("skipping because -test.trace is set")
+ }
+}
+
+func dumpEventStacks(typ byte, events []*trace.Event) ([]byte, int) {
+ matched := 0
+ o := new(bytes.Buffer)
+ tw := tabwriter.NewWriter(o, 0, 8, 0, '\t', 0)
+ for _, ev := range events {
+ if ev.Type != typ {
+ continue
+ }
+ matched++
+ fmt.Fprintf(tw, "Offset %d\n", ev.Off)
+ for _, f := range ev.Stk {
+ fname := f.File
+ if idx := strings.Index(fname, "/go/src/"); idx > 0 {
+ fname = fname[idx:]
+ }
+ fmt.Fprintf(tw, " %v\t%s:%d\n", f.Fn, fname, f.Line)
+ }
+ }
+ tw.Flush()
+ return o.Bytes(), matched
+}
+
+type frame struct {
+ Fn string
+ Line int
+}
+
+func dumpFrames(frames []frame) []byte {
+ o := new(bytes.Buffer)
+ tw := tabwriter.NewWriter(o, 0, 8, 0, '\t', 0)
+
+ for _, f := range frames {
+ fmt.Fprintf(tw, " %v\t :%d\n", f.Fn, f.Line)
+ }
+ tw.Flush()
+ return o.Bytes()
+}
diff --git a/src/runtime/trace/trace_test.go b/src/runtime/trace/trace_test.go
new file mode 100644
index 0000000..19f7dbe
--- /dev/null
+++ b/src/runtime/trace/trace_test.go
@@ -0,0 +1,792 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package trace_test
+
+import (
+ "bytes"
+ "context"
+ "flag"
+ "fmt"
+ "internal/profile"
+ "internal/race"
+ "internal/trace"
+ "io"
+ "net"
+ "os"
+ "runtime"
+ "runtime/pprof"
+ . "runtime/trace"
+ "strconv"
+ "strings"
+ "sync"
+ "testing"
+ "time"
+)
+
+var (
+ saveTraces = flag.Bool("savetraces", false, "save traces collected by tests")
+)
+
+// TestEventBatch tests that Flush calls happening during Start
+// don't produce corrupted traces.
+func TestEventBatch(t *testing.T) {
+ if race.Enabled {
+ t.Skip("skipping in race mode")
+ }
+ if IsEnabled() {
+ t.Skip("skipping because -test.trace is set")
+ }
+ if testing.Short() {
+ t.Skip("skipping in short mode")
+ }
+ // During Start, a bunch of records are written to reflect the current
+ // snapshot of the program, including the state of each goroutine.
+ // Some string constants are also written to the trace to aid trace
+ // parsing. This test checks that a Flush of the buffer occurring during
+ // this process doesn't produce a corrupted trace.
+ // Exactly when a Flush happens during Start is hard to control,
+ // so we test with a range of goroutine counts, hoping that one
+ // of them triggers a Flush.
+ // The range was chosen to fill up a ~64KB buffer with traceEvGoCreate
+ // and traceEvGoWaiting events (12-13 bytes per goroutine):
+ // 64KB / 13 bytes is roughly 5040 goroutines, which the
+ // 4950-5050 loop below brackets.
+ for g := 4950; g < 5050; g++ {
+ n := g
+ t.Run("G="+strconv.Itoa(n), func(t *testing.T) {
+ var wg sync.WaitGroup
+ wg.Add(n)
+
+ in := make(chan bool, 1000)
+ for i := 0; i < n; i++ {
+ go func() {
+ <-in
+ wg.Done()
+ }()
+ }
+ buf := new(bytes.Buffer)
+ if err := Start(buf); err != nil {
+ t.Fatalf("failed to start tracing: %v", err)
+ }
+
+ for i := 0; i < n; i++ {
+ in <- true
+ }
+ wg.Wait()
+ Stop()
+
+ _, err := trace.Parse(buf, "")
+ if err == trace.ErrTimeOrder {
+ t.Skipf("skipping trace: %v", err)
+ }
+
+ if err != nil {
+ t.Fatalf("failed to parse trace: %v", err)
+ }
+ })
+ }
+}
+
+func TestTraceStartStop(t *testing.T) {
+ if IsEnabled() {
+ t.Skip("skipping because -test.trace is set")
+ }
+ buf := new(bytes.Buffer)
+ if err := Start(buf); err != nil {
+ t.Fatalf("failed to start tracing: %v", err)
+ }
+ Stop()
+ size := buf.Len()
+ if size == 0 {
+ t.Fatalf("trace is empty")
+ }
+ time.Sleep(100 * time.Millisecond)
+ if size != buf.Len() {
+ t.Fatalf("trace writes after stop: %v -> %v", size, buf.Len())
+ }
+ saveTrace(t, buf, "TestTraceStartStop")
+}
+
+func TestTraceDoubleStart(t *testing.T) {
+ if IsEnabled() {
+ t.Skip("skipping because -test.trace is set")
+ }
+ Stop()
+ buf := new(bytes.Buffer)
+ if err := Start(buf); err != nil {
+ t.Fatalf("failed to start tracing: %v", err)
+ }
+ if err := Start(buf); err == nil {
+ t.Fatalf("succeed to start tracing second time")
+ }
+ Stop()
+ Stop()
+}
+
+func TestTrace(t *testing.T) {
+ if IsEnabled() {
+ t.Skip("skipping because -test.trace is set")
+ }
+ buf := new(bytes.Buffer)
+ if err := Start(buf); err != nil {
+ t.Fatalf("failed to start tracing: %v", err)
+ }
+ Stop()
+ saveTrace(t, buf, "TestTrace")
+ _, err := trace.Parse(buf, "")
+ if err == trace.ErrTimeOrder {
+ t.Skipf("skipping trace: %v", err)
+ }
+ if err != nil {
+ t.Fatalf("failed to parse trace: %v", err)
+ }
+}
+
+func parseTrace(t *testing.T, r io.Reader) ([]*trace.Event, map[uint64]*trace.GDesc) {
+ res, err := trace.Parse(r, "")
+ if err == trace.ErrTimeOrder {
+ t.Skipf("skipping trace: %v", err)
+ }
+ if err != nil {
+ t.Fatalf("failed to parse trace: %v", err)
+ }
+ gs := trace.GoroutineStats(res.Events)
+ for goid := range gs {
+ // We don't do any particular checks on the result at the moment.
+ // But still check that RelatedGoroutines does not crash, hang, etc.
+ _ = trace.RelatedGoroutines(res.Events, goid)
+ }
+ return res.Events, gs
+}
+
+func testBrokenTimestamps(t *testing.T, data []byte) {
+ // On some processors, cputicks (used to generate trace timestamps)
+ // produces non-monotonic timestamps. It is important that the parser
+ // distinguishes logically inconsistent traces (e.g. missing, excessive
+ // or misordered events) from broken timestamps. The former is a bug
+ // in the tracer, the latter is a machine issue.
+ // So now that we have a consistent trace, test that (1) the parser does
+ // not return a logical error in the case of broken timestamps
+ // and (2) broken timestamps are eventually detected and reported.
+ trace.BreakTimestampsForTesting = true
+ defer func() {
+ trace.BreakTimestampsForTesting = false
+ }()
+ for i := 0; i < 1e4; i++ {
+ _, err := trace.Parse(bytes.NewReader(data), "")
+ if err == trace.ErrTimeOrder {
+ return
+ }
+ if err != nil {
+ t.Fatalf("failed to parse trace: %v", err)
+ }
+ }
+}
+
+func TestTraceStress(t *testing.T) {
+ if runtime.GOOS == "js" {
+ t.Skip("no os.Pipe on js")
+ }
+ if IsEnabled() {
+ t.Skip("skipping because -test.trace is set")
+ }
+ if testing.Short() {
+ t.Skip("skipping in -short mode")
+ }
+
+ var wg sync.WaitGroup
+ done := make(chan bool)
+
+ // Create a goroutine blocked before tracing.
+ wg.Add(1)
+ go func() {
+ <-done
+ wg.Done()
+ }()
+
+ // Create a goroutine blocked in syscall before tracing.
+ rp, wp, err := os.Pipe()
+ if err != nil {
+ t.Fatalf("failed to create pipe: %v", err)
+ }
+ defer func() {
+ rp.Close()
+ wp.Close()
+ }()
+ wg.Add(1)
+ go func() {
+ var tmp [1]byte
+ rp.Read(tmp[:])
+ <-done
+ wg.Done()
+ }()
+ time.Sleep(time.Millisecond) // give the goroutine above time to block
+
+ buf := new(bytes.Buffer)
+ if err := Start(buf); err != nil {
+ t.Fatalf("failed to start tracing: %v", err)
+ }
+
+ procs := runtime.GOMAXPROCS(10)
+ time.Sleep(50 * time.Millisecond) // test proc stop/start events
+
+ go func() {
+ runtime.LockOSThread()
+ for {
+ select {
+ case <-done:
+ return
+ default:
+ runtime.Gosched()
+ }
+ }
+ }()
+
+ runtime.GC()
+ // Trigger GC from malloc.
+ n := int(1e3)
+ if isMemoryConstrained() {
+ // Reduce allocation to avoid running out of
+ // memory on the builder - see issue/12032.
+ n = 512
+ }
+ for i := 0; i < n; i++ {
+ _ = make([]byte, 1<<20)
+ }
+
+ // Create a bunch of busy goroutines to load all Ps.
+ for p := 0; p < 10; p++ {
+ wg.Add(1)
+ go func() {
+ // Do something useful.
+ tmp := make([]byte, 1<<16)
+ for i := range tmp {
+ tmp[i]++
+ }
+ _ = tmp
+ <-done
+ wg.Done()
+ }()
+ }
+
+ // Block in syscall.
+ wg.Add(1)
+ go func() {
+ var tmp [1]byte
+ rp.Read(tmp[:])
+ <-done
+ wg.Done()
+ }()
+
+ // Test timers.
+ timerDone := make(chan bool)
+ go func() {
+ time.Sleep(time.Millisecond)
+ timerDone <- true
+ }()
+ <-timerDone
+
+ // A bit of network.
+ ln, err := net.Listen("tcp", "127.0.0.1:0")
+ if err != nil {
+ t.Fatalf("listen failed: %v", err)
+ }
+ defer ln.Close()
+ go func() {
+ c, err := ln.Accept()
+ if err != nil {
+ return
+ }
+ time.Sleep(time.Millisecond)
+ var buf [1]byte
+ c.Write(buf[:])
+ c.Close()
+ }()
+ c, err := net.Dial("tcp", ln.Addr().String())
+ if err != nil {
+ t.Fatalf("dial failed: %v", err)
+ }
+ var tmp [1]byte
+ c.Read(tmp[:])
+ c.Close()
+
+ go func() {
+ runtime.Gosched()
+ select {}
+ }()
+
+ // Unblock helper goroutines and wait for them to finish.
+ wp.Write(tmp[:])
+ wp.Write(tmp[:])
+ close(done)
+ wg.Wait()
+
+ runtime.GOMAXPROCS(procs)
+
+ Stop()
+ saveTrace(t, buf, "TestTraceStress")
+ trace := buf.Bytes()
+ parseTrace(t, buf)
+ testBrokenTimestamps(t, trace)
+}
+
+// isMemoryConstrained reports whether the current machine is likely
+// to be memory constrained.
+// This was originally for the openbsd/arm builder (Issue 12032).
+// TODO: move this to testenv? Make this look at memory? Look at GO_BUILDER_NAME?
+func isMemoryConstrained() bool {
+ if runtime.GOOS == "plan9" {
+ return true
+ }
+ switch runtime.GOARCH {
+ case "arm", "mips", "mipsle":
+ return true
+ }
+ return false
+}
+
+// TestTraceStressStartStop does a bunch of different things (timers, GC,
+// network, etc.) in a separate goroutine, and concurrently with all that
+// starts and stops tracing 3 times.
+func TestTraceStressStartStop(t *testing.T) {
+ if runtime.GOOS == "js" {
+ t.Skip("no os.Pipe on js")
+ }
+ if IsEnabled() {
+ t.Skip("skipping because -test.trace is set")
+ }
+ defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(8))
+ outerDone := make(chan bool)
+
+ go func() {
+ defer func() {
+ outerDone <- true
+ }()
+
+ var wg sync.WaitGroup
+ done := make(chan bool)
+
+ wg.Add(1)
+ go func() {
+ <-done
+ wg.Done()
+ }()
+
+ rp, wp, err := os.Pipe()
+ if err != nil {
+ t.Errorf("failed to create pipe: %v", err)
+ return
+ }
+ defer func() {
+ rp.Close()
+ wp.Close()
+ }()
+ wg.Add(1)
+ go func() {
+ var tmp [1]byte
+ rp.Read(tmp[:])
+ <-done
+ wg.Done()
+ }()
+ time.Sleep(time.Millisecond)
+
+ go func() {
+ runtime.LockOSThread()
+ for {
+ select {
+ case <-done:
+ return
+ default:
+ runtime.Gosched()
+ }
+ }
+ }()
+
+ runtime.GC()
+ // Trigger GC from malloc.
+ n := int(1e3)
+ if isMemoryConstrained() {
+ // Reduce allocation to avoid running out of
+ // memory on the builder.
+ n = 512
+ }
+ for i := 0; i < n; i++ {
+ _ = make([]byte, 1<<20)
+ }
+
+ // Create a bunch of busy goroutines to load all Ps.
+ for p := 0; p < 10; p++ {
+ wg.Add(1)
+ go func() {
+ // Do something useful.
+ tmp := make([]byte, 1<<16)
+ for i := range tmp {
+ tmp[i]++
+ }
+ _ = tmp
+ <-done
+ wg.Done()
+ }()
+ }
+
+ // Block in syscall.
+ wg.Add(1)
+ go func() {
+ var tmp [1]byte
+ rp.Read(tmp[:])
+ <-done
+ wg.Done()
+ }()
+
+ runtime.GOMAXPROCS(runtime.GOMAXPROCS(1))
+
+ // Test timers.
+ timerDone := make(chan bool)
+ go func() {
+ time.Sleep(time.Millisecond)
+ timerDone <- true
+ }()
+ <-timerDone
+
+ // A bit of network.
+ ln, err := net.Listen("tcp", "127.0.0.1:0")
+ if err != nil {
+ t.Errorf("listen failed: %v", err)
+ return
+ }
+ defer ln.Close()
+ go func() {
+ c, err := ln.Accept()
+ if err != nil {
+ return
+ }
+ time.Sleep(time.Millisecond)
+ var buf [1]byte
+ c.Write(buf[:])
+ c.Close()
+ }()
+ c, err := net.Dial("tcp", ln.Addr().String())
+ if err != nil {
+ t.Errorf("dial failed: %v", err)
+ return
+ }
+ var tmp [1]byte
+ c.Read(tmp[:])
+ c.Close()
+
+ go func() {
+ runtime.Gosched()
+ select {}
+ }()
+
+ // Unblock helper goroutines and wait for them to finish.
+ wp.Write(tmp[:])
+ wp.Write(tmp[:])
+ close(done)
+ wg.Wait()
+ }()
+
+ for i := 0; i < 3; i++ {
+ buf := new(bytes.Buffer)
+ if err := Start(buf); err != nil {
+ t.Fatalf("failed to start tracing: %v", err)
+ }
+ time.Sleep(time.Millisecond)
+ Stop()
+ saveTrace(t, buf, "TestTraceStressStartStop")
+ trace := buf.Bytes()
+ parseTrace(t, buf)
+ testBrokenTimestamps(t, trace)
+ }
+ <-outerDone
+}
+
+func TestTraceFutileWakeup(t *testing.T) {
+ if IsEnabled() {
+ t.Skip("skipping because -test.trace is set")
+ }
+ buf := new(bytes.Buffer)
+ if err := Start(buf); err != nil {
+ t.Fatalf("failed to start tracing: %v", err)
+ }
+
+ defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(8))
+ c0 := make(chan int, 1)
+ c1 := make(chan int, 1)
+ c2 := make(chan int, 1)
+ const procs = 2
+ var done sync.WaitGroup
+ done.Add(4 * procs)
+ for p := 0; p < procs; p++ {
+ const iters = 1e3
+ go func() {
+ for i := 0; i < iters; i++ {
+ runtime.Gosched()
+ c0 <- 0
+ }
+ done.Done()
+ }()
+ go func() {
+ for i := 0; i < iters; i++ {
+ runtime.Gosched()
+ <-c0
+ }
+ done.Done()
+ }()
+ go func() {
+ for i := 0; i < iters; i++ {
+ runtime.Gosched()
+ select {
+ case c1 <- 0:
+ case c2 <- 0:
+ }
+ }
+ done.Done()
+ }()
+ go func() {
+ for i := 0; i < iters; i++ {
+ runtime.Gosched()
+ select {
+ case <-c1:
+ case <-c2:
+ }
+ }
+ done.Done()
+ }()
+ }
+ done.Wait()
+
+ Stop()
+ saveTrace(t, buf, "TestTraceFutileWakeup")
+ events, _ := parseTrace(t, buf)
+ // Check that (1) trace does not contain EvFutileWakeup events and
+ // (2) there are no consecutive EvGoBlock/EvGCStart/EvGoBlock events
+ // (we call runtime.Gosched between all operations, so these would be futile wakeups).
+ gs := make(map[uint64]int)
+ for _, ev := range events {
+ switch ev.Type {
+ case trace.EvFutileWakeup:
+ t.Fatalf("found EvFutileWakeup event")
+ case trace.EvGoBlockSend, trace.EvGoBlockRecv, trace.EvGoBlockSelect:
+ if gs[ev.G] == 2 {
+ t.Fatalf("goroutine %v blocked on %v at %v right after start",
+ ev.G, trace.EventDescriptions[ev.Type].Name, ev.Ts)
+ }
+ if gs[ev.G] == 1 {
+ t.Fatalf("goroutine %v blocked on %v at %v while blocked",
+ ev.G, trace.EventDescriptions[ev.Type].Name, ev.Ts)
+ }
+ gs[ev.G] = 1
+ case trace.EvGoStart:
+ if gs[ev.G] == 1 {
+ gs[ev.G] = 2
+ }
+ default:
+ delete(gs, ev.G)
+ }
+ }
+}
+
+func TestTraceCPUProfile(t *testing.T) {
+ if IsEnabled() {
+ t.Skip("skipping because -test.trace is set")
+ }
+
+ cpuBuf := new(bytes.Buffer)
+ if err := pprof.StartCPUProfile(cpuBuf); err != nil {
+ t.Skipf("failed to start CPU profile: %v", err)
+ }
+
+ buf := new(bytes.Buffer)
+ if err := Start(buf); err != nil {
+ t.Fatalf("failed to start tracing: %v", err)
+ }
+
+ dur := 100 * time.Millisecond
+ func() {
+ // Create a region in the execution trace. Set and clear goroutine
+ // labels fully within that region, so we know that any CPU profile
+ // sample with the label must also be eligible for inclusion in the
+ // execution trace.
+ ctx := context.Background()
+ defer StartRegion(ctx, "cpuHogger").End()
+ pprof.Do(ctx, pprof.Labels("tracing", "on"), func(ctx context.Context) {
+ cpuHogger(cpuHog1, &salt1, dur)
+ })
+ // Be sure the execution trace's view, when filtered to this goroutine
+ // via the explicit goroutine ID in each event, gets many more samples
+ // than the CPU profiler when filtered to this goroutine via labels.
+ cpuHogger(cpuHog1, &salt1, dur)
+ }()
+
+ Stop()
+ pprof.StopCPUProfile()
+ saveTrace(t, buf, "TestTraceCPUProfile")
+
+ prof, err := profile.Parse(cpuBuf)
+ if err != nil {
+ t.Fatalf("failed to parse CPU profile: %v", err)
+ }
+ // Examine the CPU profiler's view. Filter it to only include samples from
+ // the single test goroutine. Use labels to execute that filter: they should
+ // apply to all work done while that goroutine is getg().m.curg, and they
+ // should apply to no other goroutines.
+ pprofSamples := 0
+ pprofStacks := make(map[string]int)
+ for _, s := range prof.Sample {
+ if s.Label["tracing"] != nil {
+ var fns []string
+ var leaf string
+ for _, loc := range s.Location {
+ for _, line := range loc.Line {
+ fns = append(fns, fmt.Sprintf("%s:%d", line.Function.Name, line.Line))
+ leaf = line.Function.Name
+ }
+ }
+ // runtime.sigprof synthesizes call stacks when "normal traceback is
+ // impossible or has failed", using particular placeholder functions
+ // to represent common failure cases. Look for those functions in
+ // the leaf position as a sign that the call stack and its
+ // symbolization are more complex than this test can handle.
+ //
+ // TODO: Make the symbolization done by the execution tracer and CPU
+ // profiler match up even in these harder cases. See #53378.
+ switch leaf {
+ case "runtime._System", "runtime._GC", "runtime._ExternalCode", "runtime._VDSO":
+ continue
+ }
+ stack := strings.Join(fns, " ")
+ samples := int(s.Value[0])
+ pprofSamples += samples
+ pprofStacks[stack] += samples
+ }
+ }
+ if pprofSamples == 0 {
+ t.Skipf("CPU profile did not include any samples while tracing was active\n%s", prof)
+ }
+
+ // Examine the execution tracer's view of the CPU profile samples. Filter it
+ // to only include samples from the single test goroutine. Use the goroutine
+ // ID that was recorded in the events: that should reflect getg().m.curg,
+ // same as the profiler's labels (even when the M is using its g0 stack).
+ totalTraceSamples := 0
+ traceSamples := 0
+ traceStacks := make(map[string]int)
+ events, _ := parseTrace(t, buf)
+ var hogRegion *trace.Event
+ for _, ev := range events {
+ if ev.Type == trace.EvUserRegion && ev.Args[1] == 0 && ev.SArgs[0] == "cpuHogger" {
+ // mode "0" indicates region start
+ hogRegion = ev
+ }
+ }
+ if hogRegion == nil {
+ t.Fatalf("execution trace did not identify cpuHogger goroutine")
+ } else if hogRegion.Link == nil {
+ t.Fatalf("execution trace did not close cpuHogger region")
+ }
+ for _, ev := range events {
+ if ev.Type == trace.EvCPUSample {
+ totalTraceSamples++
+ if ev.G == hogRegion.G {
+ traceSamples++
+ var fns []string
+ for _, frame := range ev.Stk {
+ if frame.Fn != "runtime.goexit" {
+ fns = append(fns, fmt.Sprintf("%s:%d", frame.Fn, frame.Line))
+ }
+ }
+ stack := strings.Join(fns, " ")
+ traceStacks[stack]++
+ }
+ }
+ }
+
+ // The execution trace may drop CPU profile samples if the profiling buffer
+ // overflows. Based on the size of profBufWordCount, that takes a bit over
+ // 1900 CPU samples or 19 thread-seconds at a 100 Hz sample rate. If we've
+ // hit that case, then we definitely have at least one full buffer's worth
+ // of CPU samples, so we'll call that success.
+ overflowed := totalTraceSamples >= 1900
+ if traceSamples < pprofSamples {
+ t.Logf("exectution trace did not include all CPU profile samples; %d in profile, %d in trace", pprofSamples, traceSamples)
+ if !overflowed {
+ t.Fail()
+ }
+ }
+
+ for stack, traceSamples := range traceStacks {
+ pprofSamples := pprofStacks[stack]
+ delete(pprofStacks, stack)
+ if traceSamples < pprofSamples {
+ t.Logf("execution trace did not include all CPU profile samples for stack %q; %d in profile, %d in trace",
+ stack, pprofSamples, traceSamples)
+ if !overflowed {
+ t.Fail()
+ }
+ }
+ }
+ for stack, pprofSamples := range pprofStacks {
+ t.Logf("CPU profile included %d samples at stack %q not present in execution trace", pprofSamples, stack)
+ if !overflowed {
+ t.Fail()
+ }
+ }
+
+ if t.Failed() {
+ t.Logf("execution trace CPU samples:")
+ for stack, samples := range traceStacks {
+ t.Logf("%d: %q", samples, stack)
+ }
+ t.Logf("CPU profile:\n%v", prof)
+ }
+}
+
+func cpuHogger(f func(x int) int, y *int, dur time.Duration) {
+ // We only need to get one 100 Hz clock tick, so we've got
+ // a large safety buffer.
+ // But do at least 500 iterations (which should take about 100ms),
+ // otherwise TestCPUProfileMultithreaded can fail if only one
+ // thread is scheduled during the testing period.
+ t0 := time.Now()
+ accum := *y
+ for i := 0; i < 500 || time.Since(t0) < dur; i++ {
+ accum = f(accum)
+ }
+ *y = accum
+}
+
+var (
+ salt1 = 0
+)
+
+// The actual CPU hogging function.
+// Must not call other functions nor access heap/globals in the loop,
+// otherwise under race detector the samples will be in the race runtime.
+func cpuHog1(x int) int {
+ return cpuHog0(x, 1e5)
+}
+
+func cpuHog0(x, n int) int {
+ foo := x
+ for i := 0; i < n; i++ {
+ if i%1000 == 0 {
+ // Spend time in mcall, stored as gp.m.curg, with g0 running
+ runtime.Gosched()
+ }
+ if foo > 0 {
+ foo *= foo
+ } else {
+ foo *= foo + 1
+ }
+ }
+ return foo
+}
+
+func saveTrace(t *testing.T, buf *bytes.Buffer, name string) {
+ if !*saveTraces {
+ return
+ }
+ if err := os.WriteFile(name+".trace", buf.Bytes(), 0600); err != nil {
+ t.Errorf("failed to write trace file: %s", err)
+ }
+}
diff --git a/src/runtime/traceback.go b/src/runtime/traceback.go
new file mode 100644
index 0000000..37f35d5
--- /dev/null
+++ b/src/runtime/traceback.go
@@ -0,0 +1,1377 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime
+
+import (
+ "internal/bytealg"
+ "internal/goarch"
+ "runtime/internal/sys"
+ "unsafe"
+)
+
+// The code in this file implements stack trace walking for all architectures.
+// The most important fact about a given architecture is whether it uses a link register.
+// On systems with link registers, the prologue for a non-leaf function stores the
+// incoming value of LR at the bottom of the newly allocated stack frame.
+// On systems without link registers (x86), the architecture pushes a return PC during
+// the call instruction, so the return PC ends up above the stack frame.
+// In this file, the return PC is always called LR, no matter how it was found.
+
+const usesLR = sys.MinFrameSize > 0
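The gentraceback comment below notes that the pcbuf path serves runtime.Callers, and the inlining handling later in the function expands a single PC into multiple logical frames. From user code the same machinery is visible through runtime.Callers plus runtime.CallersFrames, as in this small sketch; the skip count of 2 is specific to this example.

package main

import (
	"fmt"
	"runtime"
)

// dumpCallers prints its own call stack: Callers fills a buffer of PCs,
// and CallersFrames expands them into frames, including inlined calls.
func dumpCallers() {
	pcs := make([]uintptr, 32)
	n := runtime.Callers(2, pcs) // skip runtime.Callers and dumpCallers itself
	frames := runtime.CallersFrames(pcs[:n])
	for {
		f, more := frames.Next()
		fmt.Printf("%s\n\t%s:%d\n", f.Function, f.File, f.Line)
		if !more {
			break
		}
	}
}

func main() {
	dumpCallers()
}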
+
+// Generic traceback. Handles runtime stack prints (pcbuf == nil),
+// the runtime.Callers function (pcbuf != nil), as well as the garbage
+// collector (callback != nil). A little clunky to merge these, but avoids
+// duplicating the code and all its subtlety.
+//
+// The skip argument is only valid with pcbuf != nil and counts the number
+// of logical frames to skip rather than physical frames (with inlining, a
+// PC in pcbuf can represent multiple calls).
+func gentraceback(pc0, sp0, lr0 uintptr, gp *g, skip int, pcbuf *uintptr, max int, callback func(*stkframe, unsafe.Pointer) bool, v unsafe.Pointer, flags uint) int {
+ if skip > 0 && callback != nil {
+ throw("gentraceback callback cannot be used with non-zero skip")
+ }
+
+ // Don't call this "g"; it's too easy to get "g" and "gp" confused.
+ if ourg := getg(); ourg == gp && ourg == ourg.m.curg {
+ // The starting sp has been passed in as a uintptr, and the caller may
+ // have other uintptr-typed stack references as well.
+ // If during one of the calls that got us here or during one of the
+ // callbacks below the stack must be grown, all these uintptr references
+ // to the stack will not be updated, and gentraceback will continue
+ // to inspect the old stack memory, which may no longer be valid.
+ // Even if all the variables were updated correctly, it is not clear that
+ // we want to expose a traceback that begins on one stack and ends
+ // on another stack. That could confuse callers quite a bit.
+ // Instead, we require that gentraceback and any other function that
+ // accepts an sp for the current goroutine (typically obtained by
+ // calling getcallersp) must not run on that goroutine's stack but
+ // instead on the g0 stack.
+ throw("gentraceback cannot trace user goroutine on its own stack")
+ }
+ level, _, _ := gotraceback()
+
+ if pc0 == ^uintptr(0) && sp0 == ^uintptr(0) { // Signal to fetch saved values from gp.
+ if gp.syscallsp != 0 {
+ pc0 = gp.syscallpc
+ sp0 = gp.syscallsp
+ if usesLR {
+ lr0 = 0
+ }
+ } else {
+ pc0 = gp.sched.pc
+ sp0 = gp.sched.sp
+ if usesLR {
+ lr0 = gp.sched.lr
+ }
+ }
+ }
+
+ nprint := 0
+ var frame stkframe
+ frame.pc = pc0
+ frame.sp = sp0
+ if usesLR {
+ frame.lr = lr0
+ }
+ waspanic := false
+ cgoCtxt := gp.cgoCtxt
+ stack := gp.stack
+ printing := pcbuf == nil && callback == nil
+
+ // If the PC is zero, it's likely a nil function call.
+ // Start in the caller's frame.
+ if frame.pc == 0 {
+ if usesLR {
+ frame.pc = *(*uintptr)(unsafe.Pointer(frame.sp))
+ frame.lr = 0
+ } else {
+ frame.pc = uintptr(*(*uintptr)(unsafe.Pointer(frame.sp)))
+ frame.sp += goarch.PtrSize
+ }
+ }
+
+ // runtime/internal/atomic functions call into kernel helpers on
+ // arm < 7. See runtime/internal/atomic/sys_linux_arm.s.
+ //
+ // Start in the caller's frame.
+ if GOARCH == "arm" && goarm < 7 && GOOS == "linux" && frame.pc&0xffff0000 == 0xffff0000 {
+ // Note that the calls are simple BL without pushing the return
+ // address, so we use LR directly.
+ //
+ // The kernel helpers are frameless leaf functions, so SP and
+ // LR are not touched.
+ frame.pc = frame.lr
+ frame.lr = 0
+ }
+
+ f := findfunc(frame.pc)
+ if !f.valid() {
+ if callback != nil || printing {
+ print("runtime: g ", gp.goid, ": unknown pc ", hex(frame.pc), "\n")
+ tracebackHexdump(stack, &frame, 0)
+ }
+ if callback != nil {
+ throw("unknown pc")
+ }
+ return 0
+ }
+ frame.fn = f
+
+ var cache pcvalueCache
+
+ lastFuncID := funcID_normal
+ n := 0
+ for n < max {
+ // Typically:
+ // pc is the PC of the running function.
+ // sp is the stack pointer at that program counter.
+ // fp is the frame pointer (caller's stack pointer) at that program counter, or nil if unknown.
+ // stk is the stack containing sp.
+ // The caller's program counter is lr, unless lr is zero, in which case it is *(uintptr*)sp.
+ f = frame.fn
+ if f.pcsp == 0 {
+ // No frame information, must be external function, like race support.
+ // See golang.org/issue/13568.
+ break
+ }
+
+ // Compute function info flags.
+ flag := f.flag
+ if f.funcID == funcID_cgocallback {
+ // cgocallback does write SP to switch from the g0 to the curg stack,
+ // but it carefully arranges that during the transition BOTH stacks
+ // have a cgocallback frame valid for unwinding through.
+ // So we don't need to exclude it with the other SP-writing functions.
+ flag &^= funcFlag_SPWRITE
+ }
+ if frame.pc == pc0 && frame.sp == sp0 && pc0 == gp.syscallpc && sp0 == gp.syscallsp {
+ // Some Syscall functions write to SP, but they do so only after
+ // saving the entry PC/SP using entersyscall.
+ // Since we are using the entry PC/SP, the later SP write doesn't matter.
+ flag &^= funcFlag_SPWRITE
+ }
+
+ // Found an actual function.
+ // Derive frame pointer and link register.
+ if frame.fp == 0 {
+ // Jump over system stack transitions. If we're on g0 and there's a user
+ // goroutine, try to jump. Otherwise this is a regular call.
+ // We also defensively check that this won't switch M's on us,
+ // which could happen at critical points in the scheduler.
+ // This ensures gp.m doesn't change from a stack jump.
+ if flags&_TraceJumpStack != 0 && gp == gp.m.g0 && gp.m.curg != nil && gp.m.curg.m == gp.m {
+ switch f.funcID {
+ case funcID_morestack:
+ // morestack does not return normally -- newstack()
+ // gogo's to curg.sched. Match that.
+ // This keeps morestack() from showing up in the backtrace,
+ // but that makes some sense since it'll never be returned
+ // to.
+ gp = gp.m.curg
+ frame.pc = gp.sched.pc
+ frame.fn = findfunc(frame.pc)
+ f = frame.fn
+ flag = f.flag
+ frame.lr = gp.sched.lr
+ frame.sp = gp.sched.sp
+ stack = gp.stack
+ cgoCtxt = gp.cgoCtxt
+ case funcID_systemstack:
+ // systemstack returns normally, so just follow the
+ // stack transition.
+ if usesLR && funcspdelta(f, frame.pc, &cache) == 0 {
+ // We're at the function prologue and the stack
+ // switch hasn't happened, or epilogue where we're
+ // about to return. Just unwind normally.
+ // Do this only on LR machines because on x86
+ // systemstack doesn't have an SP delta (the CALL
+ // instruction opens the frame), therefore no way
+ // to check.
+ flag &^= funcFlag_SPWRITE
+ break
+ }
+ gp = gp.m.curg
+ frame.sp = gp.sched.sp
+ stack = gp.stack
+ cgoCtxt = gp.cgoCtxt
+ flag &^= funcFlag_SPWRITE
+ }
+ }
+ frame.fp = frame.sp + uintptr(funcspdelta(f, frame.pc, &cache))
+ if !usesLR {
+ // On x86, call instruction pushes return PC before entering new function.
+ frame.fp += goarch.PtrSize
+ }
+ }
+ var flr funcInfo
+ if flag&funcFlag_TOPFRAME != 0 {
+ // This function marks the top of the stack. Stop the traceback.
+ frame.lr = 0
+ flr = funcInfo{}
+ } else if flag&funcFlag_SPWRITE != 0 && (callback == nil || n > 0) {
+ // The function we are in does a write to SP that we don't know
+ // how to encode in the spdelta table. Examples include context
+ // switch routines like runtime.gogo but also any code that switches
+ // to the g0 stack to run host C code. Since we can't reliably unwind
+ // the SP (we might not even be on the stack we think we are),
+ // we stop the traceback here.
+ // This only applies for profiling signals (callback == nil).
+ //
+ // For a GC stack traversal (callback != nil), we should only see
+ // a function when it has voluntarily preempted itself on entry
+ // during the stack growth check. In that case, the function has
+ // not yet had a chance to do any writes to SP and is safe to unwind.
+ // isAsyncSafePoint does not allow assembly functions to be async preempted,
+ // and preemptPark double-checks that SPWRITE functions are not async preempted.
+ // So for GC stack traversal we leave things alone (this if body does not execute for n == 0)
+ // at the bottom frame of the stack. But farther up the stack we'd better not
+ // find any.
+ if callback != nil {
+ println("traceback: unexpected SPWRITE function", funcname(f))
+ throw("traceback")
+ }
+ frame.lr = 0
+ flr = funcInfo{}
+ } else {
+ var lrPtr uintptr
+ if usesLR {
+ if n == 0 && frame.sp < frame.fp || frame.lr == 0 {
+ lrPtr = frame.sp
+ frame.lr = *(*uintptr)(unsafe.Pointer(lrPtr))
+ }
+ } else {
+ if frame.lr == 0 {
+ lrPtr = frame.fp - goarch.PtrSize
+ frame.lr = uintptr(*(*uintptr)(unsafe.Pointer(lrPtr)))
+ }
+ }
+ flr = findfunc(frame.lr)
+ if !flr.valid() {
+ // This happens if you get a profiling interrupt at just the wrong time.
+ // In that context it is okay to stop early.
+ // But if callback is set, we're doing a garbage collection and must
+ // get everything, so crash loudly.
+ doPrint := printing
+ if doPrint && gp.m.incgo && f.funcID == funcID_sigpanic {
+ // We can inject sigpanic
+ // calls directly into C code,
+ // in which case we'll see a C
+ // return PC. Don't complain.
+ doPrint = false
+ }
+ if callback != nil || doPrint {
+ print("runtime: g ", gp.goid, ": unexpected return pc for ", funcname(f), " called from ", hex(frame.lr), "\n")
+ tracebackHexdump(stack, &frame, lrPtr)
+ }
+ if callback != nil {
+ throw("unknown caller pc")
+ }
+ }
+ }
+
+ frame.varp = frame.fp
+ if !usesLR {
+ // On x86, call instruction pushes return PC before entering new function.
+ frame.varp -= goarch.PtrSize
+ }
+
+ // For architectures with frame pointers, if there's
+ // a frame, then there's a saved frame pointer here.
+ //
+ // NOTE: This code is not as general as it looks.
+ // On x86, the ABI is to save the frame pointer word at the
+ // top of the stack frame, so we have to back down over it.
+ // On arm64, the frame pointer should be at the bottom of
+ // the stack (with R29 (aka FP) = RSP), in which case we would
+ // not want to do the subtraction here. But we started out without
+ // any frame pointer, and when we wanted to add it, we didn't
+ // want to break all the assembly doing direct writes to 8(RSP)
+ // to set the first parameter to a called function.
+ // So we decided to write the FP link *below* the stack pointer
+ // (with R29 = RSP - 8 in Go functions).
+ // This is technically ABI-compatible but not standard.
+ // And it happens to end up mimicking the x86 layout.
+ // Other architectures may make different decisions.
+ if frame.varp > frame.sp && framepointer_enabled {
+ frame.varp -= goarch.PtrSize
+ }
+
+ frame.argp = frame.fp + sys.MinFrameSize
+
+ // Determine frame's 'continuation PC', where it can continue.
+ // Normally this is the return address on the stack, but if sigpanic
+ // is immediately below this function on the stack, then the frame
+ // stopped executing due to a trap, and frame.pc is probably not
+ // a safe point for looking up liveness information. In this panicking case,
+ // the function either doesn't return at all (if it has no defers or if the
+ // defers do not recover) or it returns from one of the calls to
+ // deferproc a second time (if the corresponding deferred func recovers).
+ // In the latter case, use a deferreturn call site as the continuation pc.
+ frame.continpc = frame.pc
+ if waspanic {
+ if frame.fn.deferreturn != 0 {
+ frame.continpc = frame.fn.entry() + uintptr(frame.fn.deferreturn) + 1
+ // Note: this may perhaps keep return variables alive longer than
+ // strictly necessary, as we are using "function has a defer statement"
+ // as a proxy for "function actually deferred something". It seems
+ // to be a minor drawback. (We used to actually look through the
+ // gp._defer for a defer corresponding to this function, but that
+ // is hard to do with defer records on the stack during a stack copy.)
+ // Note: the +1 is to offset the -1 that
+ // stack.go:getStackMap does to back up a return
+ // address to make sure the pc is in the CALL instruction.
+ } else {
+ frame.continpc = 0
+ }
+ }
+
+ if callback != nil {
+ if !callback((*stkframe)(noescape(unsafe.Pointer(&frame))), v) {
+ return n
+ }
+ }
+
+ if pcbuf != nil {
+ pc := frame.pc
+ // backup to CALL instruction to read inlining info (same logic as below)
+ tracepc := pc
+ // Normally, pc is a return address. In that case, we want to look up
+ // file/line information using pc-1, because that is the pc of the
+ // call instruction (more precisely, the last byte of the call instruction).
+ // Callers expect the pc buffer to contain return addresses and do the
+ // same -1 themselves, so we keep pc unchanged.
+ // When the pc is from a signal (e.g. profiler or segv) then we want
+ // to look up file/line information using pc, and we store pc+1 in the
+ // pc buffer so callers can unconditionally subtract 1 before looking up.
+ // See issue 34123.
+ // The pc can be at function entry when the frame is initialized without
+ // actually running code, like runtime.mstart.
+ if (n == 0 && flags&_TraceTrap != 0) || waspanic || pc == f.entry() {
+ pc++
+ } else {
+ tracepc--
+ }
+
+ // If there is inlining info, record the inner frames.
+ if inldata := funcdata(f, _FUNCDATA_InlTree); inldata != nil {
+ inltree := (*[1 << 20]inlinedCall)(inldata)
+ for {
+ ix := pcdatavalue(f, _PCDATA_InlTreeIndex, tracepc, &cache)
+ if ix < 0 {
+ break
+ }
+ if inltree[ix].funcID == funcID_wrapper && elideWrapperCalling(lastFuncID) {
+ // ignore wrappers
+ } else if skip > 0 {
+ skip--
+ } else if n < max {
+ (*[1 << 20]uintptr)(unsafe.Pointer(pcbuf))[n] = pc
+ n++
+ }
+ lastFuncID = inltree[ix].funcID
+ // Back up to an instruction in the "caller".
+ tracepc = frame.fn.entry() + uintptr(inltree[ix].parentPc)
+ pc = tracepc + 1
+ }
+ }
+ // Record the main frame.
+ if f.funcID == funcID_wrapper && elideWrapperCalling(lastFuncID) {
+ // Ignore wrapper functions (except when they trigger panics).
+ } else if skip > 0 {
+ skip--
+ } else if n < max {
+ (*[1 << 20]uintptr)(unsafe.Pointer(pcbuf))[n] = pc
+ n++
+ }
+ lastFuncID = f.funcID
+ n-- // offset n++ below
+ }
+
+ if printing {
+ // assume skip=0 for printing.
+ //
+ // Never elide wrappers if we haven't printed
+ // any frames. And don't elide wrappers that
+ // called panic rather than the wrapped
+ // function. Otherwise, leave them out.
+
+ // backup to CALL instruction to read inlining info (same logic as above)
+ tracepc := frame.pc
+ if (n > 0 || flags&_TraceTrap == 0) && frame.pc > f.entry() && !waspanic {
+ tracepc--
+ }
+ // If there is inlining info, print the inner frames.
+ if inldata := funcdata(f, _FUNCDATA_InlTree); inldata != nil {
+ inltree := (*[1 << 20]inlinedCall)(inldata)
+ var inlFunc _func
+ inlFuncInfo := funcInfo{&inlFunc, f.datap}
+ for {
+ ix := pcdatavalue(f, _PCDATA_InlTreeIndex, tracepc, nil)
+ if ix < 0 {
+ break
+ }
+
+ // Create a fake _func for the
+ // inlined function.
+ inlFunc.nameOff = inltree[ix].nameOff
+ inlFunc.funcID = inltree[ix].funcID
+ inlFunc.startLine = inltree[ix].startLine
+
+ if (flags&_TraceRuntimeFrames) != 0 || showframe(inlFuncInfo, gp, nprint == 0, inlFuncInfo.funcID, lastFuncID) {
+ name := funcname(inlFuncInfo)
+ file, line := funcline(f, tracepc)
+ print(name, "(...)\n")
+ print("\t", file, ":", line, "\n")
+ nprint++
+ }
+ lastFuncID = inltree[ix].funcID
+ // Back up to an instruction in the "caller".
+ tracepc = frame.fn.entry() + uintptr(inltree[ix].parentPc)
+ }
+ }
+ if (flags&_TraceRuntimeFrames) != 0 || showframe(f, gp, nprint == 0, f.funcID, lastFuncID) {
+ // Print during crash.
+ // main(0x1, 0x2, 0x3)
+ // /home/rsc/go/src/runtime/x.go:23 +0xf
+ //
+ name := funcname(f)
+ file, line := funcline(f, tracepc)
+ if name == "runtime.gopanic" {
+ name = "panic"
+ }
+ print(name, "(")
+ argp := unsafe.Pointer(frame.argp)
+ printArgs(f, argp, tracepc)
+ print(")\n")
+ print("\t", file, ":", line)
+ if frame.pc > f.entry() {
+ print(" +", hex(frame.pc-f.entry()))
+ }
+ if gp.m != nil && gp.m.throwing >= throwTypeRuntime && gp == gp.m.curg || level >= 2 {
+ print(" fp=", hex(frame.fp), " sp=", hex(frame.sp), " pc=", hex(frame.pc))
+ }
+ print("\n")
+ nprint++
+ }
+ lastFuncID = f.funcID
+ }
+ n++
+
+ if f.funcID == funcID_cgocallback && len(cgoCtxt) > 0 {
+ ctxt := cgoCtxt[len(cgoCtxt)-1]
+ cgoCtxt = cgoCtxt[:len(cgoCtxt)-1]
+
+ // skip only applies to Go frames.
+ // callback != nil is only used when we care
+ // only about Go frames.
+ if skip == 0 && callback == nil {
+ n = tracebackCgoContext(pcbuf, printing, ctxt, n, max)
+ }
+ }
+
+ waspanic = f.funcID == funcID_sigpanic
+ injectedCall := waspanic || f.funcID == funcID_asyncPreempt || f.funcID == funcID_debugCallV2
+
+ // Do not unwind past the bottom of the stack.
+ if !flr.valid() {
+ break
+ }
+
+ if frame.pc == frame.lr && frame.sp == frame.fp {
+ // If the next frame is identical to the current frame, we cannot make progress.
+ print("runtime: traceback stuck. pc=", hex(frame.pc), " sp=", hex(frame.sp), "\n")
+ tracebackHexdump(stack, &frame, frame.sp)
+ throw("traceback stuck")
+ }
+
+ // Unwind to next frame.
+ frame.fn = flr
+ frame.pc = frame.lr
+ frame.lr = 0
+ frame.sp = frame.fp
+ frame.fp = 0
+
+ // On link register architectures, sighandler saves the LR on stack
+ // before faking a call.
+ if usesLR && injectedCall {
+ x := *(*uintptr)(unsafe.Pointer(frame.sp))
+ frame.sp += alignUp(sys.MinFrameSize, sys.StackAlign)
+ f = findfunc(frame.pc)
+ frame.fn = f
+ if !f.valid() {
+ frame.pc = x
+ } else if funcspdelta(f, frame.pc, &cache) == 0 {
+ frame.lr = x
+ }
+ }
+ }
+
+ if printing {
+ n = nprint
+ }
+
+ // Note that panic != nil is okay here: there can be leftover panics,
+ // because the defers on the panic stack do not nest in frame order as
+ // they do on the defer stack. If you have:
+ //
+ // frame 1 defers d1
+ // frame 2 defers d2
+ // frame 3 defers d3
+ // frame 4 panics
+ // frame 4's panic starts running defers
+ // frame 5, running d3, defers d4
+ // frame 5 panics
+ // frame 5's panic starts running defers
+ // frame 6, running d4, garbage collects
+ // frame 6, running d2, garbage collects
+ //
+ // During the execution of d4, the panic stack is d4 -> d3, which
+ // is nested properly, and we'll treat frame 3 as resumable, because we
+ // can find d3. (And in fact frame 3 is resumable. If d4 recovers
+ // and frame 5 continues running d3, then d3 can recover and we'll
+ // resume execution in (returning from) frame 3.)
+ //
+ // During the execution of d2, however, the panic stack is d2 -> d3,
+ // which is inverted. The scan will match d2 to frame 2 but having
+ // d2 on the stack until then means it will not match d3 to frame 3.
+ // This is okay: if we're running d2, then all the defers after d2 have
+ // completed and their corresponding frames are dead. Not finding d3
+ // for frame 3 means we'll set frame 3's continpc == 0, which is correct
+ // (frame 3 is dead). At the end of the walk the panic stack can thus
+ // contain defers (d3 in this case) for dead frames. The inversion here
+ // always indicates a dead frame, and the effect of the inversion on the
+ // scan is to hide those dead frames, so the scan is still okay:
+ // what's left on the panic stack are exactly (and only) the dead frames.
+ //
+ // We require callback != nil here because only when callback != nil
+ // do we know that gentraceback is being called in a "must be correct"
+ // context as opposed to a "best effort" context. The tracebacks with
+ // callbacks only happen when everything is stopped nicely.
+ // At other times, such as when gathering a stack for a profiling signal
+ // or when printing a traceback during a crash, everything may not be
+ // stopped nicely, and the stack walk may not be able to complete.
+ if callback != nil && n < max && frame.sp != gp.stktopsp {
+ print("runtime: g", gp.goid, ": frame.sp=", hex(frame.sp), " top=", hex(gp.stktopsp), "\n")
+ print("\tstack=[", hex(gp.stack.lo), "-", hex(gp.stack.hi), "] n=", n, " max=", max, "\n")
+ throw("traceback did not unwind completely")
+ }
+
+ return n
+}
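An illustrative, runnable sketch (not part of this patch) of the frame/defer scenario walked through in the long comment near the end of gentraceback above. The frame and defer names match the comment; runtime.GC stands in for "garbage collects":

    package main

    import "runtime"

    func d1() {}
    func d2() { runtime.GC() } // runs while frames 3-5 are already dead; d3 is still on the panic stack
    func d3() {
        defer d4()
        panic("second panic") // "frame 5" panics while running d3
    }
    func d4() { runtime.GC() } // here the panic stack is d4 -> d3, properly nested

    func frame4() { panic("first panic") } // frame 4 panics and starts running defers
    func frame3() { defer d3(); frame4() }
    func frame2() { defer d2(); frame3() }
    func frame1() { defer d1(); frame2() }

    func main() {
        defer func() { recover() }() // keep the sketch from crashing
        frame1()
    }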
+
+// printArgs prints function arguments in traceback.
+func printArgs(f funcInfo, argp unsafe.Pointer, pc uintptr) {
+ // The "instruction" of argument printing is encoded in _FUNCDATA_ArgInfo.
+ // See cmd/compile/internal/ssagen.emitArgInfo for the description of the
+ // encoding.
+ // These constants need to be in sync with the compiler.
+ const (
+ _endSeq = 0xff
+ _startAgg = 0xfe
+ _endAgg = 0xfd
+ _dotdotdot = 0xfc
+ _offsetTooLarge = 0xfb
+ )
+
+ const (
+ limit = 10 // print no more than 10 args/components
+ maxDepth = 5 // no more than 5 layers of nesting
+ maxLen = (maxDepth*3+2)*limit + 1 // max length of _FUNCDATA_ArgInfo (see the compiler side for reasoning)
+ )
+
+ p := (*[maxLen]uint8)(funcdata(f, _FUNCDATA_ArgInfo))
+ if p == nil {
+ return
+ }
+
+ liveInfo := funcdata(f, _FUNCDATA_ArgLiveInfo)
+ liveIdx := pcdatavalue(f, _PCDATA_ArgLiveIndex, pc, nil)
+ startOffset := uint8(0xff) // smallest offset that needs liveness info (slots with a lower offset are always live)
+ if liveInfo != nil {
+ startOffset = *(*uint8)(liveInfo)
+ }
+
+ isLive := func(off, slotIdx uint8) bool {
+ if liveInfo == nil || liveIdx <= 0 {
+ return true // no liveness info, always live
+ }
+ if off < startOffset {
+ return true
+ }
+ bits := *(*uint8)(add(liveInfo, uintptr(liveIdx)+uintptr(slotIdx/8)))
+ return bits&(1<<(slotIdx%8)) != 0
+ }
+
+ print1 := func(off, sz, slotIdx uint8) {
+ x := readUnaligned64(add(argp, uintptr(off)))
+ // mask out irrelevant bits
+ if sz < 8 {
+ shift := 64 - sz*8
+ if goarch.BigEndian {
+ x = x >> shift
+ } else {
+ x = x << shift >> shift
+ }
+ }
+ print(hex(x))
+ if !isLive(off, slotIdx) {
+ print("?")
+ }
+ }
+
+ start := true
+ printcomma := func() {
+ if !start {
+ print(", ")
+ }
+ }
+ pi := 0
+ slotIdx := uint8(0) // register arg spill slot index
+printloop:
+ for {
+ o := p[pi]
+ pi++
+ switch o {
+ case _endSeq:
+ break printloop
+ case _startAgg:
+ printcomma()
+ print("{")
+ start = true
+ continue
+ case _endAgg:
+ print("}")
+ case _dotdotdot:
+ printcomma()
+ print("...")
+ case _offsetTooLarge:
+ printcomma()
+ print("_")
+ default:
+ printcomma()
+ sz := p[pi]
+ pi++
+ print1(o, sz, slotIdx)
+ if o >= startOffset {
+ slotIdx++
+ }
+ }
+ start = false
+ }
+}
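A self-contained sketch (not runtime code) of the same instruction walk, which may help in seeing how the byte-coded stream maps to the printed argument list. The constants mirror the ones in printArgs; the sequence in main is made up for illustration, and the decoder ignores liveness and real argument memory:

    package main

    import "fmt"

    const (
        endSeq         = 0xff
        startAgg       = 0xfe
        endAgg         = 0xfd
        dotdotdot      = 0xfc
        offsetTooLarge = 0xfb
    )

    // decodeArgInfo renders a _FUNCDATA_ArgInfo-style byte stream as a string
    // instead of reading and printing real argument words.
    func decodeArgInfo(p []byte) string {
        out := ""
        start := true
        comma := func() {
            if !start {
                out += ", "
            }
        }
        for i := 0; i < len(p); {
            o := p[i]
            i++
            switch o {
            case endSeq:
                return out
            case startAgg:
                comma()
                out += "{"
                start = true
                continue
            case endAgg:
                out += "}"
            case dotdotdot:
                comma()
                out += "..."
            case offsetTooLarge:
                comma()
                out += "_"
            default:
                sz := p[i] // a size byte follows each offset byte
                i++
                comma()
                out += fmt.Sprintf("<%d bytes at +%d>", sz, o)
            }
            start = false
        }
        return out
    }

    func main() {
        // Roughly what f(a int64, b struct{ x, y int32 }, ...) might encode as.
        seq := []byte{0, 8, startAgg, 8, 4, 12, 4, endAgg, dotdotdot, endSeq}
        fmt.Println(decodeArgInfo(seq)) // <8 bytes at +0>, {<4 bytes at +8>, <4 bytes at +12>}, ...
    }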
+
+// tracebackCgoContext handles tracing back a cgo context value, from
+// the context argument to setCgoTraceback, for the gentraceback
+// function. It returns the new value of n.
+func tracebackCgoContext(pcbuf *uintptr, printing bool, ctxt uintptr, n, max int) int {
+ var cgoPCs [32]uintptr
+ cgoContextPCs(ctxt, cgoPCs[:])
+ var arg cgoSymbolizerArg
+ anySymbolized := false
+ for _, pc := range cgoPCs {
+ if pc == 0 || n >= max {
+ break
+ }
+ if pcbuf != nil {
+ (*[1 << 20]uintptr)(unsafe.Pointer(pcbuf))[n] = pc
+ }
+ if printing {
+ if cgoSymbolizer == nil {
+ print("non-Go function at pc=", hex(pc), "\n")
+ } else {
+ c := printOneCgoTraceback(pc, max-n, &arg)
+ n += c - 1 // +1 a few lines down
+ anySymbolized = true
+ }
+ }
+ n++
+ }
+ if anySymbolized {
+ arg.pc = 0
+ callCgoSymbolizer(&arg)
+ }
+ return n
+}
+
+func printcreatedby(gp *g) {
+ // Show what created the goroutine, except for the main goroutine (goid 1).
+ pc := gp.gopc
+ f := findfunc(pc)
+ if f.valid() && showframe(f, gp, false, funcID_normal, funcID_normal) && gp.goid != 1 {
+ printcreatedby1(f, pc)
+ }
+}
+
+func printcreatedby1(f funcInfo, pc uintptr) {
+ print("created by ", funcname(f), "\n")
+ tracepc := pc // back up to CALL instruction for funcline.
+ if pc > f.entry() {
+ tracepc -= sys.PCQuantum
+ }
+ file, line := funcline(f, tracepc)
+ print("\t", file, ":", line)
+ if pc > f.entry() {
+ print(" +", hex(pc-f.entry()))
+ }
+ print("\n")
+}
+
+func traceback(pc, sp, lr uintptr, gp *g) {
+ traceback1(pc, sp, lr, gp, 0)
+}
+
+// tracebacktrap is like traceback but expects that the PC and SP were obtained
+// from a trap, not from gp->sched or gp->syscallpc/gp->syscallsp or getcallerpc/getcallersp.
+// Because they are from a trap instead of from a saved pair,
+// the initial PC must not be rewound to the previous instruction.
+// (All the saved pairs record a PC that is a return address, so we
+// rewind it into the CALL instruction.)
+// If gp.m.libcall{g,pc,sp} information is available, it uses that information in preference to
+// the pc/sp/lr passed in.
+func tracebacktrap(pc, sp, lr uintptr, gp *g) {
+ if gp.m.libcallsp != 0 {
+ // We're in C code somewhere, traceback from the saved position.
+ traceback1(gp.m.libcallpc, gp.m.libcallsp, 0, gp.m.libcallg.ptr(), 0)
+ return
+ }
+ traceback1(pc, sp, lr, gp, _TraceTrap)
+}
+
+func traceback1(pc, sp, lr uintptr, gp *g, flags uint) {
+ // If the goroutine is in cgo, and we have a cgo traceback, print that.
+ if iscgo && gp.m != nil && gp.m.ncgo > 0 && gp.syscallsp != 0 && gp.m.cgoCallers != nil && gp.m.cgoCallers[0] != 0 {
+ // Lock cgoCallers so that a signal handler won't
+ // change it, copy the array, reset it, unlock it.
+ // We are locked to the thread and are not running
+ // concurrently with a signal handler.
+ // We just have to stop a signal handler from interrupting
+ // in the middle of our copy.
+ gp.m.cgoCallersUse.Store(1)
+ cgoCallers := *gp.m.cgoCallers
+ gp.m.cgoCallers[0] = 0
+ gp.m.cgoCallersUse.Store(0)
+
+ printCgoTraceback(&cgoCallers)
+ }
+
+ if readgstatus(gp)&^_Gscan == _Gsyscall {
+ // Override registers if blocked in system call.
+ pc = gp.syscallpc
+ sp = gp.syscallsp
+ flags &^= _TraceTrap
+ }
+ if gp.m != nil && gp.m.vdsoSP != 0 {
+ // Override registers if running in VDSO. This comes after the
+ // _Gsyscall check to cover VDSO calls after entersyscall.
+ pc = gp.m.vdsoPC
+ sp = gp.m.vdsoSP
+ flags &^= _TraceTrap
+ }
+
+ // Print traceback. By default, omits runtime frames.
+ // If that means we print nothing at all, repeat forcing all frames printed.
+ n := gentraceback(pc, sp, lr, gp, 0, nil, _TracebackMaxFrames, nil, nil, flags)
+ if n == 0 && (flags&_TraceRuntimeFrames) == 0 {
+ n = gentraceback(pc, sp, lr, gp, 0, nil, _TracebackMaxFrames, nil, nil, flags|_TraceRuntimeFrames)
+ }
+ if n == _TracebackMaxFrames {
+ print("...additional frames elided...\n")
+ }
+ printcreatedby(gp)
+
+ if gp.ancestors == nil {
+ return
+ }
+ for _, ancestor := range *gp.ancestors {
+ printAncestorTraceback(ancestor)
+ }
+}
+
+// printAncestorTraceback prints the traceback of the given ancestor.
+// TODO: Unify this with gentraceback and CallersFrames.
+func printAncestorTraceback(ancestor ancestorInfo) {
+ print("[originating from goroutine ", ancestor.goid, "]:\n")
+ for fidx, pc := range ancestor.pcs {
+ f := findfunc(pc) // f previously validated
+ if showfuncinfo(f, fidx == 0, funcID_normal, funcID_normal) {
+ printAncestorTracebackFuncInfo(f, pc)
+ }
+ }
+ if len(ancestor.pcs) == _TracebackMaxFrames {
+ print("...additional frames elided...\n")
+ }
+ // Show what created the goroutine, except for the main goroutine (goid 1).
+ f := findfunc(ancestor.gopc)
+ if f.valid() && showfuncinfo(f, false, funcID_normal, funcID_normal) && ancestor.goid != 1 {
+ printcreatedby1(f, ancestor.gopc)
+ }
+}
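Note: gp.ancestors (printed via printAncestorTraceback above) is only populated when ancestor tracking is enabled, which is controlled by the GODEBUG setting tracebackancestors=N. For example, running a program as GODEBUG=tracebackancestors=5 ./prog (illustrative invocation) records the creation stacks of up to 5 ancestor goroutines.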
+
+// printAncestorTracebackFuncInfo prints the given function info at a given pc
+// within an ancestor traceback. The precision of this info is reduced
+// because we only have access to the pcs captured at the time the caller
+// goroutine was created.
+func printAncestorTracebackFuncInfo(f funcInfo, pc uintptr) {
+ name := funcname(f)
+ if inldata := funcdata(f, _FUNCDATA_InlTree); inldata != nil {
+ inltree := (*[1 << 20]inlinedCall)(inldata)
+ ix := pcdatavalue(f, _PCDATA_InlTreeIndex, pc, nil)
+ if ix >= 0 {
+ name = funcnameFromNameOff(f, inltree[ix].nameOff)
+ }
+ }
+ file, line := funcline(f, pc)
+ if name == "runtime.gopanic" {
+ name = "panic"
+ }
+ print(name, "(...)\n")
+ print("\t", file, ":", line)
+ if pc > f.entry() {
+ print(" +", hex(pc-f.entry()))
+ }
+ print("\n")
+}
+
+func callers(skip int, pcbuf []uintptr) int {
+ sp := getcallersp()
+ pc := getcallerpc()
+ gp := getg()
+ var n int
+ systemstack(func() {
+ n = gentraceback(pc, sp, 0, gp, skip, &pcbuf[0], len(pcbuf), nil, nil, 0)
+ })
+ return n
+}
+
+func gcallers(gp *g, skip int, pcbuf []uintptr) int {
+ return gentraceback(^uintptr(0), ^uintptr(0), 0, gp, skip, &pcbuf[0], len(pcbuf), nil, nil, 0)
+}
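callers above is the internal path behind the exported runtime.Callers. A minimal caller-side sketch of the standard pattern (ordinary library usage, not part of this file):

    package main

    import (
        "fmt"
        "runtime"
    )

    func whoCalledMe() {
        pcs := make([]uintptr, 16)
        n := runtime.Callers(2, pcs) // skip runtime.Callers and whoCalledMe itself
        frames := runtime.CallersFrames(pcs[:n])
        for {
            f, more := frames.Next()
            fmt.Printf("%s\n\t%s:%d\n", f.Function, f.File, f.Line)
            if !more {
                break
            }
        }
    }

    func main() { whoCalledMe() }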
+
+// showframe reports whether the frame with the given characteristics should
+// be printed during a traceback.
+func showframe(f funcInfo, gp *g, firstFrame bool, funcID, childID funcID) bool {
+ mp := getg().m
+ if mp.throwing >= throwTypeRuntime && gp != nil && (gp == mp.curg || gp == mp.caughtsig.ptr()) {
+ return true
+ }
+ return showfuncinfo(f, firstFrame, funcID, childID)
+}
+
+// showfuncinfo reports whether a function with the given characteristics should
+// be printed during a traceback.
+func showfuncinfo(f funcInfo, firstFrame bool, funcID, childID funcID) bool {
+ // Note that f may be a synthesized funcInfo for an inlined
+ // function, in which case only nameOff and funcID are set.
+
+ level, _, _ := gotraceback()
+ if level > 1 {
+ // Show all frames.
+ return true
+ }
+
+ if !f.valid() {
+ return false
+ }
+
+ if funcID == funcID_wrapper && elideWrapperCalling(childID) {
+ return false
+ }
+
+ name := funcname(f)
+
+ // Special case: always show runtime.gopanic frame
+ // in the middle of a stack trace, so that we can
+ // see the boundary between ordinary code and
+ // panic-induced deferred code.
+ // See golang.org/issue/5832.
+ if name == "runtime.gopanic" && !firstFrame {
+ return true
+ }
+
+ return bytealg.IndexByteString(name, '.') >= 0 && (!hasPrefix(name, "runtime.") || isExportedRuntime(name))
+}
+
+// isExportedRuntime reports whether name is an exported runtime function.
+// It is only for runtime functions, so ASCII A-Z is fine.
+func isExportedRuntime(name string) bool {
+ const n = len("runtime.")
+ return len(name) > n && name[:n] == "runtime." && 'A' <= name[n] && name[n] <= 'Z'
+}
+
+// elideWrapperCalling reports whether a wrapper function that called
+// function id should be elided from stack traces.
+func elideWrapperCalling(id funcID) bool {
+ // If the wrapper called a panic function instead of the
+ // wrapped function, we want to include it in stacks.
+ return !(id == funcID_gopanic || id == funcID_sigpanic || id == funcID_panicwrap)
+}
+
+var gStatusStrings = [...]string{
+ _Gidle: "idle",
+ _Grunnable: "runnable",
+ _Grunning: "running",
+ _Gsyscall: "syscall",
+ _Gwaiting: "waiting",
+ _Gdead: "dead",
+ _Gcopystack: "copystack",
+ _Gpreempted: "preempted",
+}
+
+func goroutineheader(gp *g) {
+ gpstatus := readgstatus(gp)
+
+ isScan := gpstatus&_Gscan != 0
+ gpstatus &^= _Gscan // drop the scan bit
+
+ // Basic string status
+ var status string
+ if 0 <= gpstatus && gpstatus < uint32(len(gStatusStrings)) {
+ status = gStatusStrings[gpstatus]
+ } else {
+ status = "???"
+ }
+
+ // Override.
+ if gpstatus == _Gwaiting && gp.waitreason != waitReasonZero {
+ status = gp.waitreason.String()
+ }
+
+ // approx time the G is blocked, in minutes
+ var waitfor int64
+ if (gpstatus == _Gwaiting || gpstatus == _Gsyscall) && gp.waitsince != 0 {
+ waitfor = (nanotime() - gp.waitsince) / 60e9
+ }
+ print("goroutine ", gp.goid, " [", status)
+ if isScan {
+ print(" (scan)")
+ }
+ if waitfor >= 1 {
+ print(", ", waitfor, " minutes")
+ }
+ if gp.lockedm != 0 {
+ print(", locked to thread")
+ }
+ print("]:\n")
+}
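For reference, goroutineheader emits the first line of each goroutine block in a stack dump. Illustrative examples of its output:

    goroutine 1 [running]:
    goroutine 17 [chan receive, 3 minutes, locked to thread]:
    goroutine 42 [GC worker (idle)]: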
+
+func tracebackothers(me *g) {
+ level, _, _ := gotraceback()
+
+ // Show the current goroutine first, if we haven't already.
+ curgp := getg().m.curg
+ if curgp != nil && curgp != me {
+ print("\n")
+ goroutineheader(curgp)
+ traceback(^uintptr(0), ^uintptr(0), 0, curgp)
+ }
+
+ // We can't call locking forEachG here because this may be during fatal
+ // throw/panic, where locking could be out-of-order or a direct
+ // deadlock.
+ //
+ // Instead, use forEachGRace, which requires no locking. We don't lock
+ // against concurrent creation of new Gs, but even with allglock we may
+ // miss Gs created after this loop.
+ forEachGRace(func(gp *g) {
+ if gp == me || gp == curgp || readgstatus(gp) == _Gdead || isSystemGoroutine(gp, false) && level < 2 {
+ return
+ }
+ print("\n")
+ goroutineheader(gp)
+ // Note: gp.m == getg().m occurs when tracebackothers is called
+ // from a signal handler initiated during a systemstack call.
+ // The original G is still in the running state, and we want to
+ // print its stack.
+ if gp.m != getg().m && readgstatus(gp)&^_Gscan == _Grunning {
+ print("\tgoroutine running on other thread; stack unavailable\n")
+ printcreatedby(gp)
+ } else {
+ traceback(^uintptr(0), ^uintptr(0), 0, gp)
+ }
+ })
+}
+
+// tracebackHexdump hexdumps part of stk around frame.sp and frame.fp
+// for debugging purposes. If the address bad is included in the
+// hexdumped range, it will mark it as well.
+func tracebackHexdump(stk stack, frame *stkframe, bad uintptr) {
+ const expand = 32 * goarch.PtrSize
+ const maxExpand = 256 * goarch.PtrSize
+ // Start around frame.sp.
+ lo, hi := frame.sp, frame.sp
+ // Expand to include frame.fp.
+ if frame.fp != 0 && frame.fp < lo {
+ lo = frame.fp
+ }
+ if frame.fp != 0 && frame.fp > hi {
+ hi = frame.fp
+ }
+ // Expand a bit more.
+ lo, hi = lo-expand, hi+expand
+ // But don't go too far from frame.sp.
+ if lo < frame.sp-maxExpand {
+ lo = frame.sp - maxExpand
+ }
+ if hi > frame.sp+maxExpand {
+ hi = frame.sp + maxExpand
+ }
+ // And don't go outside the stack bounds.
+ if lo < stk.lo {
+ lo = stk.lo
+ }
+ if hi > stk.hi {
+ hi = stk.hi
+ }
+
+ // Print the hex dump.
+ print("stack: frame={sp:", hex(frame.sp), ", fp:", hex(frame.fp), "} stack=[", hex(stk.lo), ",", hex(stk.hi), ")\n")
+ hexdumpWords(lo, hi, func(p uintptr) byte {
+ switch p {
+ case frame.fp:
+ return '>'
+ case frame.sp:
+ return '<'
+ case bad:
+ return '!'
+ }
+ return 0
+ })
+}
+
+// isSystemGoroutine reports whether the goroutine g must be omitted
+// in stack dumps and deadlock detector. This is any goroutine that
+// starts at a runtime.* entry point, except for runtime.main,
+// runtime.handleAsyncEvent (wasm only) and sometimes runtime.runfinq.
+//
+// If fixed is true, any goroutine that can vary between user and
+// system (that is, the finalizer goroutine) is considered a user
+// goroutine.
+func isSystemGoroutine(gp *g, fixed bool) bool {
+ // Keep this in sync with internal/trace.IsSystemGoroutine.
+ f := findfunc(gp.startpc)
+ if !f.valid() {
+ return false
+ }
+ if f.funcID == funcID_runtime_main || f.funcID == funcID_handleAsyncEvent {
+ return false
+ }
+ if f.funcID == funcID_runfinq {
+ // We include the finalizer goroutine if it's calling
+ // back into user code.
+ if fixed {
+ // This goroutine can vary. In fixed mode,
+ // always consider it a user goroutine.
+ return false
+ }
+ return fingStatus.Load()&fingRunningFinalizer == 0
+ }
+ return hasPrefix(funcname(f), "runtime.")
+}
+
+// SetCgoTraceback records three C functions to use to gather
+// traceback information from C code and to convert that traceback
+// information into symbolic information. These are used when printing
+// stack traces for a program that uses cgo.
+//
+// The traceback and context functions may be called from a signal
+// handler, and must therefore use only async-signal safe functions.
+// The symbolizer function may be called while the program is
+// crashing, and so must be cautious about using memory. None of the
+// functions may call back into Go.
+//
+// The context function will be called with a single argument, a
+// pointer to a struct:
+//
+// struct {
+// Context uintptr
+// }
+//
+// In C syntax, this struct will be
+//
+// struct {
+// uintptr_t Context;
+// };
+//
+// If the Context field is 0, the context function is being called to
+// record the current traceback context. It should record in the
+// Context field whatever information is needed about the current
+// point of execution to later produce a stack trace, probably the
+// stack pointer and PC. In this case the context function will be
+// called from C code.
+//
+// If the Context field is not 0, then it is a value returned by a
+// previous call to the context function. This case is called when the
+// context is no longer needed; that is, when the Go code is returning
+// to its C code caller. This permits the context function to release
+// any associated resources.
+//
+// While it would be correct for the context function to record a
+// complete stack trace whenever it is called, and simply copy that
+// out in the traceback function, in a typical program the context
+// function will be called many times without ever recording a
+// traceback for that context. Recording a complete stack trace in a
+// call to the context function is likely to be inefficient.
+//
+// The traceback function will be called with a single argument, a
+// pointer to a struct:
+//
+// struct {
+// Context uintptr
+// SigContext uintptr
+// Buf *uintptr
+// Max uintptr
+// }
+//
+// In C syntax, this struct will be
+//
+// struct {
+// uintptr_t Context;
+// uintptr_t SigContext;
+// uintptr_t* Buf;
+// uintptr_t Max;
+// };
+//
+// The Context field will be zero to gather a traceback from the
+// current program execution point. In this case, the traceback
+// function will be called from C code.
+//
+// Otherwise Context will be a value previously returned by a call to
+// the context function. The traceback function should gather a stack
+// trace from that saved point in the program execution. The traceback
+// function may be called from an execution thread other than the one
+// that recorded the context, but only when the context is known to be
+// valid and unchanging. The traceback function may also be called
+// deeper in the call stack on the same thread that recorded the
+// context. The traceback function may be called multiple times with
+// the same Context value; it will usually be appropriate to cache the
+// result, if possible, the first time this is called for a specific
+// context value.
+//
+// If the traceback function is called from a signal handler on a Unix
+// system, SigContext will be the signal context argument passed to
+// the signal handler (a C ucontext_t* cast to uintptr_t). This may be
+// used to start tracing at the point where the signal occurred. If
+// the traceback function is not called from a signal handler,
+// SigContext will be zero.
+//
+// Buf is where the traceback information should be stored. It should
+// be PC values, such that Buf[0] is the PC of the caller, Buf[1] is
+// the PC of that function's caller, and so on. Max is the maximum
+// number of entries to store. The function should store a zero to
+// indicate the top of the stack, or that the caller is on a different
+// stack, presumably a Go stack.
+//
+// Unlike runtime.Callers, the PC values returned should, when passed
+// to the symbolizer function, return the file/line of the call
+// instruction. No additional subtraction is required or appropriate.
+//
+// On all platforms, the traceback function is invoked when a call from
+// Go to C to Go requests a stack trace. On linux/amd64, linux/ppc64le,
+// linux/arm64, and freebsd/amd64, the traceback function is also invoked
+// when a signal is received by a thread that is executing a cgo call.
+// The traceback function should not make assumptions about when it is
+// called, as future versions of Go may make additional calls.
+//
+// The symbolizer function will be called with a single argument, a
+// pointer to a struct:
+//
+// struct {
+// PC uintptr // program counter to fetch information for
+// File *byte // file name (NUL terminated)
+// Lineno uintptr // line number
+// Func *byte // function name (NUL terminated)
+// Entry uintptr // function entry point
+// More uintptr // set non-zero if more info for this PC
+// Data uintptr // unused by runtime, available for function
+// }
+//
+// In C syntax, this struct will be
+//
+// struct {
+// uintptr_t PC;
+// char* File;
+// uintptr_t Lineno;
+// char* Func;
+// uintptr_t Entry;
+// uintptr_t More;
+// uintptr_t Data;
+// };
+//
+// The PC field will be a value returned by a call to the traceback
+// function.
+//
+// The first time the function is called for a particular traceback,
+// all the fields except PC will be 0. The function should fill in the
+// other fields if possible, setting them to 0/nil if the information
+// is not available. The Data field may be used to store any useful
+// information across calls. The More field should be set to non-zero
+// if there is more information for this PC, zero otherwise. If More
+// is set non-zero, the function will be called again with the same
+// PC, and may return different information (this is intended for use
+// with inlined functions). If More is zero, the function will be
+// called with the next PC value in the traceback. When the traceback
+// is complete, the function will be called once more with PC set to
+// zero; this may be used to free any information. Each call will
+// leave the fields of the struct set to the same values they had upon
+// return, except for the PC field when the More field is zero. The
+// function must not keep a copy of the struct pointer between calls.
+//
+// When calling SetCgoTraceback, the version argument is the version
+// number of the structs that the functions expect to receive.
+// Currently this must be zero.
+//
+// The symbolizer function may be nil, in which case the results of
+// the traceback function will be displayed as numbers. If the
+// traceback function is nil, the symbolizer function will never be
+// called. The context function may be nil, in which case the
+// traceback function will only be called with the context field set
+// to zero. If the context function is nil, then calls from Go to C
+// to Go will not show a traceback for the C portion of the call stack.
+//
+// SetCgoTraceback should be called only once, ideally from an init function.
+func SetCgoTraceback(version int, traceback, context, symbolizer unsafe.Pointer) {
+ if version != 0 {
+ panic("unsupported version")
+ }
+
+ if cgoTraceback != nil && cgoTraceback != traceback ||
+ cgoContext != nil && cgoContext != context ||
+ cgoSymbolizer != nil && cgoSymbolizer != symbolizer {
+ panic("call SetCgoTraceback only once")
+ }
+
+ cgoTraceback = traceback
+ cgoContext = context
+ cgoSymbolizer = symbolizer
+
+ // The context function is called when a C function calls a Go
+ // function. As such it is only called by C code in runtime/cgo.
+ if _cgo_set_context_function != nil {
+ cgocall(_cgo_set_context_function, context)
+ }
+}
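A minimal registration sketch of how a cgo package might wire this up. The C functions myTraceback, myContext and mySymbolizer are placeholders; their implementations (following the struct layouts documented above) are assumed to live in a .c file in the same package:

    package ctrace // hypothetical package name

    /*
    extern void myTraceback(void*);
    extern void myContext(void*);
    extern void mySymbolizer(void*);
    */
    import "C"

    import (
        "runtime"
        "unsafe"
    )

    func init() {
        // Register the three C callbacks; version 0 is the only supported version.
        runtime.SetCgoTraceback(0,
            unsafe.Pointer(C.myTraceback),
            unsafe.Pointer(C.myContext),
            unsafe.Pointer(C.mySymbolizer))
    }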
+
+var cgoTraceback unsafe.Pointer
+var cgoContext unsafe.Pointer
+var cgoSymbolizer unsafe.Pointer
+
+// cgoTracebackArg is the type passed to cgoTraceback.
+type cgoTracebackArg struct {
+ context uintptr
+ sigContext uintptr
+ buf *uintptr
+ max uintptr
+}
+
+// cgoContextArg is the type passed to the context function.
+type cgoContextArg struct {
+ context uintptr
+}
+
+// cgoSymbolizerArg is the type passed to cgoSymbolizer.
+type cgoSymbolizerArg struct {
+ pc uintptr
+ file *byte
+ lineno uintptr
+ funcName *byte
+ entry uintptr
+ more uintptr
+ data uintptr
+}
+
+// printCgoTraceback prints a traceback of callers.
+func printCgoTraceback(callers *cgoCallers) {
+ if cgoSymbolizer == nil {
+ for _, c := range callers {
+ if c == 0 {
+ break
+ }
+ print("non-Go function at pc=", hex(c), "\n")
+ }
+ return
+ }
+
+ var arg cgoSymbolizerArg
+ for _, c := range callers {
+ if c == 0 {
+ break
+ }
+ printOneCgoTraceback(c, 0x7fffffff, &arg)
+ }
+ arg.pc = 0
+ callCgoSymbolizer(&arg)
+}
+
+// printOneCgoTraceback prints the traceback of a single cgo caller.
+// This can print more than one line because of inlining.
+// Returns the number of frames printed.
+func printOneCgoTraceback(pc uintptr, max int, arg *cgoSymbolizerArg) int {
+ c := 0
+ arg.pc = pc
+ for c <= max {
+ callCgoSymbolizer(arg)
+ if arg.funcName != nil {
+ // Note that we don't print any argument
+ // information here, not even parentheses.
+ // The symbolizer must add that if appropriate.
+ println(gostringnocopy(arg.funcName))
+ } else {
+ println("non-Go function")
+ }
+ print("\t")
+ if arg.file != nil {
+ print(gostringnocopy(arg.file), ":", arg.lineno, " ")
+ }
+ print("pc=", hex(pc), "\n")
+ c++
+ if arg.more == 0 {
+ break
+ }
+ }
+ return c
+}
+
+// callCgoSymbolizer calls the cgoSymbolizer function.
+func callCgoSymbolizer(arg *cgoSymbolizerArg) {
+ call := cgocall
+ if panicking.Load() > 0 || getg().m.curg != getg() {
+ // We do not want to call into the scheduler when panicking
+ // or when on the system stack.
+ call = asmcgocall
+ }
+ if msanenabled {
+ msanwrite(unsafe.Pointer(arg), unsafe.Sizeof(cgoSymbolizerArg{}))
+ }
+ if asanenabled {
+ asanwrite(unsafe.Pointer(arg), unsafe.Sizeof(cgoSymbolizerArg{}))
+ }
+ call(cgoSymbolizer, noescape(unsafe.Pointer(arg)))
+}
+
+// cgoContextPCs gets the PC values from a cgo traceback.
+func cgoContextPCs(ctxt uintptr, buf []uintptr) {
+ if cgoTraceback == nil {
+ return
+ }
+ call := cgocall
+ if panicking.Load() > 0 || getg().m.curg != getg() {
+ // We do not want to call into the scheduler when panicking
+ // or when on the system stack.
+ call = asmcgocall
+ }
+ arg := cgoTracebackArg{
+ context: ctxt,
+ buf: (*uintptr)(noescape(unsafe.Pointer(&buf[0]))),
+ max: uintptr(len(buf)),
+ }
+ if msanenabled {
+ msanwrite(unsafe.Pointer(&arg), unsafe.Sizeof(arg))
+ }
+ if asanenabled {
+ asanwrite(unsafe.Pointer(&arg), unsafe.Sizeof(arg))
+ }
+ call(cgoTraceback, noescape(unsafe.Pointer(&arg)))
+}
diff --git a/src/runtime/traceback_test.go b/src/runtime/traceback_test.go
new file mode 100644
index 0000000..97eb921
--- /dev/null
+++ b/src/runtime/traceback_test.go
@@ -0,0 +1,422 @@
+// Copyright 2021 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime_test
+
+import (
+ "bytes"
+ "internal/abi"
+ "internal/testenv"
+ "runtime"
+ "testing"
+)
+
+var testTracebackArgsBuf [1000]byte
+
+func TestTracebackArgs(t *testing.T) {
+ if *flagQuick {
+ t.Skip("-quick")
+ }
+ optimized := !testenv.OptimizationOff()
+ abiSel := func(x, y string) string {
+ // select expected output based on ABI
+ // In noopt build we always spill arguments so the output is the same as stack ABI.
+ if optimized && abi.IntArgRegs > 0 {
+ return x
+ }
+ return y
+ }
+
+ tests := []struct {
+ fn func() int
+ expect string
+ }{
+ // simple ints
+ {
+ func() int { return testTracebackArgs1(1, 2, 3, 4, 5) },
+ "testTracebackArgs1(0x1, 0x2, 0x3, 0x4, 0x5)",
+ },
+ // some aggregates
+ {
+ func() int {
+ return testTracebackArgs2(false, struct {
+ a, b, c int
+ x [2]int
+ }{1, 2, 3, [2]int{4, 5}}, [0]int{}, [3]byte{6, 7, 8})
+ },
+ "testTracebackArgs2(0x0, {0x1, 0x2, 0x3, {0x4, 0x5}}, {}, {0x6, 0x7, 0x8})",
+ },
+ {
+ func() int { return testTracebackArgs3([3]byte{1, 2, 3}, 4, 5, 6, [3]byte{7, 8, 9}) },
+ "testTracebackArgs3({0x1, 0x2, 0x3}, 0x4, 0x5, 0x6, {0x7, 0x8, 0x9})",
+ },
+ // too deeply nested type
+ {
+ func() int { return testTracebackArgs4(false, [1][1][1][1][1][1][1][1][1][1]int{}) },
+ "testTracebackArgs4(0x0, {{{{{...}}}}})",
+ },
+ // a lot of zero-sized types
+ {
+ func() int {
+ z := [0]int{}
+ return testTracebackArgs5(false, struct {
+ x int
+ y [0]int
+ z [2][0]int
+ }{1, z, [2][0]int{}}, z, z, z, z, z, z, z, z, z, z, z, z)
+ },
+ "testTracebackArgs5(0x0, {0x1, {}, {{}, {}}}, {}, {}, {}, {}, {}, ...)",
+ },
+
+ // edge cases for ...
+ // no ... for 10 args
+ {
+ func() int { return testTracebackArgs6a(1, 2, 3, 4, 5, 6, 7, 8, 9, 10) },
+ "testTracebackArgs6a(0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa)",
+ },
+ // has ... for 11 args
+ {
+ func() int { return testTracebackArgs6b(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11) },
+ "testTracebackArgs6b(0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, ...)",
+ },
+ // no ... for aggregates with 10 words
+ {
+ func() int { return testTracebackArgs7a([10]int{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}) },
+ "testTracebackArgs7a({0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa})",
+ },
+ // has ... for aggregates with 11 words
+ {
+ func() int { return testTracebackArgs7b([11]int{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}) },
+ "testTracebackArgs7b({0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, ...})",
+ },
+ // no ... for aggregates, but with more args
+ {
+ func() int { return testTracebackArgs7c([10]int{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, 11) },
+ "testTracebackArgs7c({0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa}, ...)",
+ },
+ // has ... for aggregates and also for more args
+ {
+ func() int { return testTracebackArgs7d([11]int{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}, 12) },
+ "testTracebackArgs7d({0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, ...}, ...)",
+ },
+ // nested aggregates, no ...
+ {
+ func() int { return testTracebackArgs8a(testArgsType8a{1, 2, 3, 4, 5, 6, 7, 8, [2]int{9, 10}}) },
+ "testTracebackArgs8a({0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, {0x9, 0xa}})",
+ },
+ // nested aggregates, ... in inner but not outer
+ {
+ func() int { return testTracebackArgs8b(testArgsType8b{1, 2, 3, 4, 5, 6, 7, 8, [3]int{9, 10, 11}}) },
+ "testTracebackArgs8b({0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, {0x9, 0xa, ...}})",
+ },
+ // nested aggregates, ... in outer but not inner
+ {
+ func() int { return testTracebackArgs8c(testArgsType8c{1, 2, 3, 4, 5, 6, 7, 8, [2]int{9, 10}, 11}) },
+ "testTracebackArgs8c({0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, {0x9, 0xa}, ...})",
+ },
+ // nested aggregates, ... in both inner and outer
+ {
+ func() int { return testTracebackArgs8d(testArgsType8d{1, 2, 3, 4, 5, 6, 7, 8, [3]int{9, 10, 11}, 12}) },
+ "testTracebackArgs8d({0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, {0x9, 0xa, ...}, ...})",
+ },
+
+ // Register argument liveness.
+ // 1, 3 are used and live, 2, 4 are dead (in register ABI).
+ // Address-taken (7) and stack ({5, 6}) args are always live.
+ {
+ func() int {
+ poisonStack() // poison arg area to make output deterministic
+ return testTracebackArgs9(1, 2, 3, 4, [2]int{5, 6}, 7)
+ },
+ abiSel(
+ "testTracebackArgs9(0x1, 0xffffffff?, 0x3, 0xff?, {0x5, 0x6}, 0x7)",
+ "testTracebackArgs9(0x1, 0x2, 0x3, 0x4, {0x5, 0x6}, 0x7)"),
+ },
+ // No live.
+ // (Note: this assumes at least 5 int registers if the register ABI is used.)
+ {
+ func() int {
+ poisonStack() // poison arg area to make output deterministic
+ return testTracebackArgs10(1, 2, 3, 4, 5)
+ },
+ abiSel(
+ "testTracebackArgs10(0xffffffff?, 0xffffffff?, 0xffffffff?, 0xffffffff?, 0xffffffff?)",
+ "testTracebackArgs10(0x1, 0x2, 0x3, 0x4, 0x5)"),
+ },
+ // Conditional spills.
+ // Spill in conditional, not executed.
+ {
+ func() int {
+ poisonStack() // poison arg area to make output deterministic
+ return testTracebackArgs11a(1, 2, 3)
+ },
+ abiSel(
+ "testTracebackArgs11a(0xffffffff?, 0xffffffff?, 0xffffffff?)",
+ "testTracebackArgs11a(0x1, 0x2, 0x3)"),
+ },
+ // Arg 2 spills in a conditional that is not executed; arg 3 spills in a
+ // conditional that is executed, but that fact is not statically known.
+ // So arg 3 prints as 0x3?.
+ {
+ func() int {
+ poisonStack() // poison arg area to make output deterministic
+ return testTracebackArgs11b(1, 2, 3, 4)
+ },
+ abiSel(
+ "testTracebackArgs11b(0xffffffff?, 0xffffffff?, 0x3?, 0x4)",
+ "testTracebackArgs11b(0x1, 0x2, 0x3, 0x4)"),
+ },
+ }
+ for _, test := range tests {
+ n := test.fn()
+ got := testTracebackArgsBuf[:n]
+ if !bytes.Contains(got, []byte(test.expect)) {
+ t.Errorf("traceback does not contain expected string: want %q, got\n%s", test.expect, got)
+ }
+ }
+}
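These cases can be run in isolation with go test -run TestTracebackArgs runtime; note that the expected strings selected by abiSel differ depending on whether the register ABI is in effect and whether optimizations are enabled.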
+
+//go:noinline
+func testTracebackArgs1(a, b, c, d, e int) int {
+ n := runtime.Stack(testTracebackArgsBuf[:], false)
+ if a < 0 {
+ // use in-reg args to keep them alive
+ return a + b + c + d + e
+ }
+ return n
+}
+
+//go:noinline
+func testTracebackArgs2(a bool, b struct {
+ a, b, c int
+ x [2]int
+}, _ [0]int, d [3]byte) int {
+ n := runtime.Stack(testTracebackArgsBuf[:], false)
+ if a {
+ // use in-reg args to keep them alive
+ return b.a + b.b + b.c + b.x[0] + b.x[1] + int(d[0]) + int(d[1]) + int(d[2])
+ }
+ return n
+}
+
+//go:noinline
+//go:registerparams
+func testTracebackArgs3(x [3]byte, a, b, c int, y [3]byte) int {
+ n := runtime.Stack(testTracebackArgsBuf[:], false)
+ if a < 0 {
+ // use in-reg args to keep them alive
+ return int(x[0]) + int(x[1]) + int(x[2]) + a + b + c + int(y[0]) + int(y[1]) + int(y[2])
+ }
+ return n
+}
+
+//go:noinline
+func testTracebackArgs4(a bool, x [1][1][1][1][1][1][1][1][1][1]int) int {
+ n := runtime.Stack(testTracebackArgsBuf[:], false)
+ if a {
+ panic(x) // use args to keep them alive
+ }
+ return n
+}
+
+//go:noinline
+func testTracebackArgs5(a bool, x struct {
+ x int
+ y [0]int
+ z [2][0]int
+}, _, _, _, _, _, _, _, _, _, _, _, _ [0]int) int {
+ n := runtime.Stack(testTracebackArgsBuf[:], false)
+ if a {
+ panic(x) // use args to keep them alive
+ }
+ return n
+}
+
+//go:noinline
+func testTracebackArgs6a(a, b, c, d, e, f, g, h, i, j int) int {
+ n := runtime.Stack(testTracebackArgsBuf[:], false)
+ if a < 0 {
+ // use in-reg args to keep them alive
+ return a + b + c + d + e + f + g + h + i + j
+ }
+ return n
+}
+
+//go:noinline
+func testTracebackArgs6b(a, b, c, d, e, f, g, h, i, j, k int) int {
+ n := runtime.Stack(testTracebackArgsBuf[:], false)
+ if a < 0 {
+ // use in-reg args to keep them alive
+ return a + b + c + d + e + f + g + h + i + j + k
+ }
+ return n
+}
+
+//go:noinline
+func testTracebackArgs7a(a [10]int) int {
+ n := runtime.Stack(testTracebackArgsBuf[:], false)
+ if a[0] < 0 {
+ // use in-reg args to keep them alive
+ return a[1] + a[2] + a[3] + a[4] + a[5] + a[6] + a[7] + a[8] + a[9]
+ }
+ return n
+}
+
+//go:noinline
+func testTracebackArgs7b(a [11]int) int {
+ n := runtime.Stack(testTracebackArgsBuf[:], false)
+ if a[0] < 0 {
+ // use in-reg args to keep them alive
+ return a[1] + a[2] + a[3] + a[4] + a[5] + a[6] + a[7] + a[8] + a[9] + a[10]
+ }
+ return n
+}
+
+//go:noinline
+func testTracebackArgs7c(a [10]int, b int) int {
+ n := runtime.Stack(testTracebackArgsBuf[:], false)
+ if a[0] < 0 {
+ // use in-reg args to keep them alive
+ return a[1] + a[2] + a[3] + a[4] + a[5] + a[6] + a[7] + a[8] + a[9] + b
+ }
+ return n
+}
+
+//go:noinline
+func testTracebackArgs7d(a [11]int, b int) int {
+ n := runtime.Stack(testTracebackArgsBuf[:], false)
+ if a[0] < 0 {
+ // use in-reg args to keep them alive
+ return a[1] + a[2] + a[3] + a[4] + a[5] + a[6] + a[7] + a[8] + a[9] + a[10] + b
+ }
+ return n
+}
+
+type testArgsType8a struct {
+ a, b, c, d, e, f, g, h int
+ i [2]int
+}
+type testArgsType8b struct {
+ a, b, c, d, e, f, g, h int
+ i [3]int
+}
+type testArgsType8c struct {
+ a, b, c, d, e, f, g, h int
+ i [2]int
+ j int
+}
+type testArgsType8d struct {
+ a, b, c, d, e, f, g, h int
+ i [3]int
+ j int
+}
+
+//go:noinline
+func testTracebackArgs8a(a testArgsType8a) int {
+ n := runtime.Stack(testTracebackArgsBuf[:], false)
+ if a.a < 0 {
+ // use in-reg args to keep them alive
+ return a.b + a.c + a.d + a.e + a.f + a.g + a.h + a.i[0] + a.i[1]
+ }
+ return n
+}
+
+//go:noinline
+func testTracebackArgs8b(a testArgsType8b) int {
+ n := runtime.Stack(testTracebackArgsBuf[:], false)
+ if a.a < 0 {
+ // use in-reg args to keep them alive
+ return a.b + a.c + a.d + a.e + a.f + a.g + a.h + a.i[0] + a.i[1] + a.i[2]
+ }
+ return n
+}
+
+//go:noinline
+func testTracebackArgs8c(a testArgsType8c) int {
+ n := runtime.Stack(testTracebackArgsBuf[:], false)
+ if a.a < 0 {
+ // use in-reg args to keep them alive
+ return a.b + a.c + a.d + a.e + a.f + a.g + a.h + a.i[0] + a.i[1] + a.j
+ }
+ return n
+}
+
+//go:noinline
+func testTracebackArgs8d(a testArgsType8d) int {
+ n := runtime.Stack(testTracebackArgsBuf[:], false)
+ if a.a < 0 {
+ // use in-reg args to keep them alive
+ return a.b + a.c + a.d + a.e + a.f + a.g + a.h + a.i[0] + a.i[1] + a.i[2] + a.j
+ }
+ return n
+}
+
+// nosplit to avoid preemption or morestack spilling registers.
+//
+//go:nosplit
+//go:noinline
+func testTracebackArgs9(a int64, b int32, c int16, d int8, x [2]int, y int) int {
+ if a < 0 {
+ println(&y) // take address, make y live, even if no longer used at traceback
+ }
+ n := runtime.Stack(testTracebackArgsBuf[:], false)
+ if a < 0 {
+ // use half of in-reg args to keep them alive, the other half are dead
+ return int(a) + int(c)
+ }
+ return n
+}
+
+// nosplit to avoid preemption or morestack spilling registers.
+//
+//go:nosplit
+//go:noinline
+func testTracebackArgs10(a, b, c, d, e int32) int {
+ // no use of any args
+ return runtime.Stack(testTracebackArgsBuf[:], false)
+}
+
+// norace to avoid race instrumentation changing spill locations.
+// nosplit to avoid preemption or morestack spilling registers.
+//
+//go:norace
+//go:nosplit
+//go:noinline
+func testTracebackArgs11a(a, b, c int32) int {
+ if a < 0 {
+ println(a, b, c) // spill in a conditional, may not execute
+ }
+ if b < 0 {
+ return int(a + b + c)
+ }
+ return runtime.Stack(testTracebackArgsBuf[:], false)
+}
+
+// norace to avoid race instrumentation changing spill locations.
+// nosplit to avoid preemption or morestack spilling registers.
+//
+//go:norace
+//go:nosplit
+//go:noinline
+func testTracebackArgs11b(a, b, c, d int32) int {
+ var x int32
+ if a < 0 {
+ print() // spill b in a conditional
+ x = b
+ } else {
+ print() // spill c in a conditional
+ x = c
+ }
+ if d < 0 { // d is always needed
+ return int(x + d)
+ }
+ return runtime.Stack(testTracebackArgsBuf[:], false)
+}
+
+// Poison the arg area with deterministic values.
+//
+//go:noinline
+func poisonStack() [20]int {
+ return [20]int{-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}
+}