summaryrefslogtreecommitdiffstats
path: root/src/runtime/pagetrace_on.go
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--src/runtime/pagetrace_on.go358
1 files changed, 358 insertions, 0 deletions
diff --git a/src/runtime/pagetrace_on.go b/src/runtime/pagetrace_on.go
new file mode 100644
index 0000000..f82521c
--- /dev/null
+++ b/src/runtime/pagetrace_on.go
@@ -0,0 +1,358 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build goexperiment.pagetrace
+
+// Page tracer.
+//
+// This file contains an implementation of page trace instrumentation for tracking
+// the way the Go runtime manages pages of memory. The trace may be enabled at program
+// startup with the GODEBUG option pagetrace.
+//
+// Each page trace event is either 8 or 16 bytes wide. The first
+// 8 bytes follow this format for non-sync events:
+//
+// [16 timestamp delta][35 base address][10 npages][1 isLarge][2 pageTraceEventType]
+//
+// If the "large" bit is set then the event is 16 bytes wide with the second 8 byte word
+// containing the full npages value (the npages bitfield is 0).
+//
+// The bottom pageShift bits of the base address are always zero, which is why we can
+// pack other data in there. We ignore the top 16 bits, assuming a 48-bit address space
+// for the heap.
+//
+// The timestamp delta is the difference between the current nanotime timestamp and
+// the last sync event's timestamp. The bottom pageTraceTimeLostBits bits of this delta
+// are dropped and only the next pageTraceTimeDeltaBits bits are kept.
+//
+// A sync event is emitted at the beginning of each trace buffer and whenever the
+// timestamp delta would not fit in an event.
+//
+// Sync events have the following structure:
+//
+// [61 timestamp or P ID][1 isPID][2 pageTraceSyncEvent]
+//
+// In essence, the "large" bit is repurposed to indicate whether the event carries a
+// timestamp or a P ID (P IDs fit in 32 bits). Note that we only have 61 bits for the
+// 64-bit timestamp, but as with the delta we drop the bottom pageTraceTimeLostBits bits.
+
+package runtime
+
+import (
+ "runtime/internal/sys"
+ "unsafe"
+)
+
+// pageTraceAlloc emits an allocation event for the npages pages starting at base.
+//
+// pp is the current P and may be nil. A zero now means "take the timestamp
+// here" using nanotime. Callers should only invoke this if
+// debug.pagetracefd != 0; the function is also a no-op unless
+// pageTrace.enabled is set.
+//
+// Must run on the system stack as a crude way to prevent preemption.
+//
+//go:systemstack
+func pageTraceAlloc(pp *p, now int64, base, npages uintptr) {
+	if !pageTrace.enabled {
+		return
+	}
+	if now == 0 {
+		now = nanotime()
+	}
+	pageTraceEmit(pp, now, base, npages, pageTraceAllocEvent)
+}
+
+// pageTraceFree emits a free event for the npages pages starting at base.
+//
+// pp is the current P and may be nil. A zero now means "take the timestamp
+// here" using nanotime. Callers should only invoke this if
+// debug.pagetracefd != 0; the function is also a no-op unless
+// pageTrace.enabled is set.
+//
+// Must run on the system stack as a crude way to prevent preemption.
+//
+//go:systemstack
+func pageTraceFree(pp *p, now int64, base, npages uintptr) {
+	if !pageTrace.enabled {
+		return
+	}
+	if now == 0 {
+		now = nanotime()
+	}
+	pageTraceEmit(pp, now, base, npages, pageTraceFreeEvent)
+}
+
+// pageTraceScav emits a scavenge event for the npages pages starting at base.
+//
+// pp is the current P and may be nil. A zero now means "take the timestamp
+// here" using nanotime. Callers should only invoke this if
+// debug.pagetracefd != 0; the function is also a no-op unless
+// pageTrace.enabled is set.
+//
+// Must run on the system stack as a crude way to prevent preemption.
+//
+//go:systemstack
+func pageTraceScav(pp *p, now int64, base, npages uintptr) {
+	if !pageTrace.enabled {
+		return
+	}
+	if now == 0 {
+		now = nanotime()
+	}
+	pageTraceEmit(pp, now, base, npages, pageTraceScavEvent)
+}
+
+// pageTraceEventType identifies the kind of a page trace event; it is stored in the low 2 bits of each event word.
+type pageTraceEventType uint8
+
+const (
+ pageTraceSyncEvent pageTraceEventType = iota // Timestamp or P ID emission (see writeSync and writePid).
+ pageTraceAllocEvent // Allocation of pages.
+ pageTraceFreeEvent // Freeing of pages.
+ pageTraceScavEvent // Scavenging of pages.
+)
+
+// pageTraceEmit appends one event of type typ, covering the npages pages at base,
+// to pp's trace buffer, or to the lock-protected global buffer when pp is nil.
+// Must run on the system stack as a crude way to prevent preemption.
+//
+//go:systemstack
+func pageTraceEmit(pp *p, now int64, base, npages uintptr, typ pageTraceEventType) {
+ // Get a buffer: the per-P one when we have a P, otherwise the shared one.
+ var tbp *pageTraceBuf
+ pid := int32(-1) // -1 is the P ID recorded when there is no P.
+ if pp == nil {
+ // We have no P, so take the global buffer, which pageTrace.lock protects.
+ lock(&pageTrace.lock)
+ tbp = &pageTrace.buf
+ } else {
+ tbp = &pp.pageTraceBuf
+ pid = pp.id
+ }
+
+ // Initialize the buffer if necessary; a fresh buffer always starts with a P ID event.
+ tb := *tbp // Work on a copy; it is stored back through tbp below.
+ if tb.buf == nil {
+ tb.buf = (*pageTraceEvents)(sysAlloc(pageTraceBufSize, &memstats.other_sys))
+ tb = tb.writePid(pid)
+ }
+
+ // Handle timestamp and emit a sync event if necessary. Clamp now first so the delta is never negative.
+ if now < tb.timeBase {
+ now = tb.timeBase
+ }
+ if now-tb.timeBase >= pageTraceTimeMaxDelta { // The delta would not fit in an event word.
+ tb.timeBase = now
+ tb = tb.writeSync(pid)
+ }
+
+ // Emit the event.
+ tb = tb.writeEvent(pid, now, base, npages, typ)
+
+ // Write back the buffer, then release the global lock if we took it above.
+ *tbp = tb
+ if pp == nil {
+ unlock(&pageTrace.lock)
+ }
+}
+
+const (
+ pageTraceBufSize = 32 << 10 // Size of each trace buffer in bytes (32 KiB).
+
+ // These constants describe the per-event timestamp delta encoding.
+ pageTraceTimeLostBits = 7 // How many low-order bits of precision we lose in the delta.
+ pageTraceTimeDeltaBits = 16 // Size of the stored delta in bits.
+ pageTraceTimeMaxDelta = 1 << (pageTraceTimeLostBits + pageTraceTimeDeltaBits) // Deltas this large or larger force a new sync event.
+)
+
+// pageTraceEvents is the low-level buffer containing the trace data, stored as 8-byte words.
+type pageTraceEvents struct {
+ _ sys.NotInHeap // Buffers are obtained from sysAlloc rather than the Go heap.
+ events [pageTraceBufSize / 8]uint64
+}
+
+// pageTraceBuf is a wrapper around pageTraceEvents that knows how to write events
+// to the buffer. It tracks state necessary to do so.
+type pageTraceBuf struct {
+ buf *pageTraceEvents // Backing storage; nil until the first event is emitted.
+ len int // How many 8-byte words have been written so far (a large event takes two).
+ timeBase int64 // The current timestamp base from which deltas are produced.
+ finished bool // Whether this trace buf should no longer flush anything out.
+}
+
+// writePid writes a P ID event indicating which P we're running on.
+//
+// Assumes there's always space in the buffer since this is only called at the
+// beginning of a new buffer.
+//
+// Must run on the system stack as a crude way to prevent preemption.
+//
+//go:systemstack
+func (tb pageTraceBuf) writePid(pid int32) pageTraceBuf {
+ e := uint64(int64(pid))<<3 | 0b100 | uint64(pageTraceSyncEvent)
+ tb.buf.events[tb.len] = e
+ tb.len++
+ return tb
+}
+
+// writeSync appends a sync event carrying tb.timeBase, flushing first if the buffer is full.
+//
+// pid is only needed in case a flush has to start a new buffer.
+//
+// Must run on the system stack as a crude way to prevent preemption.
+//
+//go:systemstack
+func (tb pageTraceBuf) writeSync(pid int32) pageTraceBuf {
+	if tb.len >= len(tb.buf.events) {
+		// No room left. flush resets the buffer and finishes by calling
+		// writeSync again, so the sync event still gets written.
+		return tb.flush(pid, tb.timeBase)
+	}
+	// Drop the low pageTraceTimeLostBits bits of the timestamp, then make room
+	// for the isPID bit (left clear) and the 2-bit event type.
+	stamp := uint64(tb.timeBase) >> pageTraceTimeLostBits
+	tb.buf.events[tb.len] = stamp<<3 | uint64(pageTraceSyncEvent)
+	tb.len++
+	return tb
+}
+
+// writeEvent writes one non-sync, non-pid event (one word, or two if npages >= 1024), flushing first if needed.
+//
+// pid indicates the P we're currently running on. Necessary in case we need to flush.
+// now is the current nanotime timestamp; pageTraceEmit keeps now-tb.timeBase within [0, pageTraceTimeMaxDelta).
+// base is the base address of whatever group of pages this event is happening to; it must be page aligned.
+// npages is the length of the group of pages this event is happening to.
+// typ is the event that's happening to these pages.
+//
+// Must run on the system stack as a crude way to prevent preemption.
+//
+//go:systemstack
+func (tb pageTraceBuf) writeEvent(pid int32, now int64, base, npages uintptr, typ pageTraceEventType) pageTraceBuf {
+ large := 0 // 1 if a second word holding the full npages is needed.
+ np := npages
+ if npages >= 1024 { // Too big for the 10-bit npages field.
+ large = 1
+ np = 0
+ }
+ if tb.len+1+large > len(tb.buf.events) {
+ tb = tb.flush(pid, now) // flush resets tb.timeBase to now, so the delta below becomes 0.
+ }
+ if base%pageSize != 0 {
+ throw("base address not page aligned")
+ }
+ e := uint64(base) // NOTE(review): the top 16 bits are not masked; this relies on base fitting in 48 bits.
+ // The pageShift low-order bits of base are zero, so the fields below are packed into them.
+ e |= uint64(typ) // 2 bits
+ e |= uint64(large) << 2 // 1 bit
+ e |= uint64(np) << 3 // 10 bits
+ // Write the timestamp delta in the upper pageTraceTimeDeltaBits.
+ e |= uint64((now-tb.timeBase)>>pageTraceTimeLostBits) << (64 - pageTraceTimeDeltaBits)
+ tb.buf.events[tb.len] = e
+ if large != 0 {
+ // npages doesn't fit in 10 bits, so write an additional word with that data.
+ tb.buf.events[tb.len+1] = uint64(npages)
+ }
+ tb.len += 1 + large
+ return tb
+}
+
+// flush writes out the contents of the buffer to pageTrace.fd and resets the buffer, using now as the new time base.
+// It then writes out a P ID event and the first sync event for the new buffer.
+//
+// Must run on the system stack as a crude way to prevent preemption.
+//
+//go:systemstack
+func (tb pageTraceBuf) flush(pid int32, now int64) pageTraceBuf {
+ if !tb.finished { // Once finished is set, the contents are discarded instead of written.
+ lock(&pageTrace.fdLock)
+ writeFull(uintptr(pageTrace.fd), (*byte)(unsafe.Pointer(&tb.buf.events[0])), tb.len*8) // tb.len words of 8 bytes each.
+ unlock(&pageTrace.fdLock)
+ }
+ tb.len = 0
+ tb.timeBase = now
+ return tb.writePid(pid).writeSync(pid)
+}
+
+var pageTrace struct { // Global page tracer state.
+ // enabled indicates whether tracing is enabled. If true, fd >= 0.
+ //
+ // Read without synchronization: it is set once by initPageTrace and
+ // otherwise only cleared, best-effort, by finishPageTrace.
+ enabled bool
+
+ // buf is the page trace buffer used if there is no P.
+ //
+ // lock protects buf.
+ lock mutex
+ buf pageTraceBuf
+
+ // fdLock protects writing to fd.
+ //
+ // fd is the file descriptor to write the page trace to; it is -1 when no trace file is open.
+ fdLock mutex
+ fd int32
+}
+
+// initPageTrace sets up page tracing according to GODEBUG.
+//
+// env must be the value of the GODEBUG environment variable. The first
+// comma-separated element of the form pagetrace=<path> selects the output
+// file; paths of 4096 bytes or more are ignored. Tracing is enabled only if
+// create returns a non-negative file descriptor.
+func initPageTrace(env string) {
+	const key = "pagetrace="
+
+	// Walk the comma-separated elements of env until the first one with our key.
+	var path string
+	for env != "" {
+		end := 0
+		for end < len(env) && env[end] != ',' {
+			end++
+		}
+		elt := env[:end]
+		if end < len(env) {
+			env = env[end+1:] // Skip the comma.
+		} else {
+			env = ""
+		}
+		if hasPrefix(elt, key) {
+			path = elt[len(key):]
+			break
+		}
+	}
+
+	pageTrace.fd = -1
+	// The path is copied into a zeroed buffer that it must not fill completely,
+	// presumably so that create sees a NUL-terminated string.
+	var tmp [4096]byte
+	if canCreateFile && path != "" && len(path) < len(tmp) {
+		copy(tmp[:], path)
+		pageTrace.fd = create(&tmp[0], 0o664)
+	}
+	pageTrace.enabled = pageTrace.fd >= 0
+}
+
+// finishPageTrace flushes every P's trace buffer and the global buffer, closes the trace file, and disables page tracing.
+func finishPageTrace() {
+ if !pageTrace.enabled {
+ return
+ }
+ // Grab worldsema as we're about to execute a ragged barrier.
+ semacquire(&worldsema)
+ systemstack(func() {
+ // Disable tracing. This isn't strictly necessary and it's best-effort.
+ pageTrace.enabled = false
+
+ // Execute a ragged barrier, flushing each trace buffer and marking it finished.
+ forEachP(waitReasonPageTraceFlush, func(pp *p) {
+ if pp.pageTraceBuf.buf != nil {
+ pp.pageTraceBuf = pp.pageTraceBuf.flush(pp.id, nanotime())
+ }
+ pp.pageTraceBuf.finished = true // Any later flush of this buffer writes nothing.
+ })
+
+ // Write the global have-no-P buffer.
+ lock(&pageTrace.lock)
+ if pageTrace.buf.buf != nil {
+ pageTrace.buf = pageTrace.buf.flush(-1, nanotime())
+ }
+ pageTrace.buf.finished = true
+ unlock(&pageTrace.lock)
+
+ // Safely close the file as nothing else should be allowed to write to the fd.
+ lock(&pageTrace.fdLock)
+ closefd(pageTrace.fd)
+ pageTrace.fd = -1
+ unlock(&pageTrace.fdLock)
+ })
+ semrelease(&worldsema)
+}
+
+// writeFull writes all bn bytes starting at b to fd, looping over short writes.
+//
+// Writes that fail with EINTR or EAGAIN are retried. Any other failure is
+// fatal: the errno is printed and the runtime throws.
+func writeFull(fd uintptr, b *byte, bn int) {
+	for bn > 0 {
+		n := write(fd, unsafe.Pointer(b), int32(bn))
+		if n == -_EINTR || n == -_EAGAIN {
+			// Transient failure; try the same range again.
+			continue
+		}
+		if n < 0 {
+			// A negative result is treated as a negated errno.
+			print("errno=", -n, "\n")
+			throw("writeFull: bad write")
+		}
+		// Advance past the bytes that were written and loop for the rest.
+		bn -= int(n)
+		b = addb(b, uintptr(n))
+	}
+}