Diffstat
-rw-r--r--   src/runtime/pagetrace_on.go   358
1 file changed, 358 insertions, 0 deletions
diff --git a/src/runtime/pagetrace_on.go b/src/runtime/pagetrace_on.go
new file mode 100644
index 0000000..f82521c
--- /dev/null
+++ b/src/runtime/pagetrace_on.go
@@ -0,0 +1,358 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build goexperiment.pagetrace
+
+// Page tracer.
+//
+// This file contains an implementation of page trace instrumentation for tracking
+// the way the Go runtime manages pages of memory. The trace may be enabled at program
+// startup with the GODEBUG option pagetrace.
+//
+// Each page trace event is either 8 or 16 bytes wide. The first
+// 8 bytes follow this format for non-sync events:
+//
+//	[16 timestamp delta][35 base address][10 npages][1 isLarge][2 pageTraceEventType]
+//
+// If the "large" bit is set, then the event is 16 bytes wide with the second 8-byte word
+// containing the full npages value (the npages bitfield is 0).
+//
+// The base address's bottom pageShift bits are always zero, which is why we can pack other
+// data in there. We ignore the top 16 bits, assuming a 48-bit address space for the
+// heap.
+//
+// The timestamp delta is computed from the difference between the current nanotime
+// timestamp and the last sync event's timestamp. The bottom pageTraceTimeLostBits of
+// this delta are removed and only the next pageTraceTimeDeltaBits are kept.
+//
+// A sync event is emitted at the beginning of each trace buffer and whenever the
+// timestamp delta would not fit in an event.
+//
+// Sync events have the following structure:
+//
+//	[61 timestamp or P ID][1 isPID][2 pageTraceSyncEvent]
+//
+// In essence, the "large" bit is repurposed to indicate whether it's a timestamp or a P ID
+// (these are typically uint32). Note that we only have 61 bits for the 64-bit timestamp,
+// but like for the delta we drop the bottom pageTraceTimeLostBits here as well.
+
+package runtime
+
+import (
+	"runtime/internal/sys"
+	"unsafe"
+)
+
+// pageTraceAlloc records a page trace allocation event.
+// pp may be nil. Call only if debug.pagetracefd != 0.
+//
+// Must run on the system stack as a crude way to prevent preemption.
+//
+//go:systemstack
+func pageTraceAlloc(pp *p, now int64, base, npages uintptr) {
+	if pageTrace.enabled {
+		if now == 0 {
+			now = nanotime()
+		}
+		pageTraceEmit(pp, now, base, npages, pageTraceAllocEvent)
+	}
+}
+
+// pageTraceFree records a page trace free event.
+// pp may be nil. Call only if debug.pagetracefd != 0.
+//
+// Must run on the system stack as a crude way to prevent preemption.
+//
+//go:systemstack
+func pageTraceFree(pp *p, now int64, base, npages uintptr) {
+	if pageTrace.enabled {
+		if now == 0 {
+			now = nanotime()
+		}
+		pageTraceEmit(pp, now, base, npages, pageTraceFreeEvent)
+	}
+}
+
+// pageTraceScav records a page trace scavenge event.
+// pp may be nil. Call only if debug.pagetracefd != 0.
+//
+// Must run on the system stack as a crude way to prevent preemption.
+//
+//go:systemstack
+func pageTraceScav(pp *p, now int64, base, npages uintptr) {
+	if pageTrace.enabled {
+		if now == 0 {
+			now = nanotime()
+		}
+		pageTraceEmit(pp, now, base, npages, pageTraceScavEvent)
+	}
+}
+
+// pageTraceEventType is a page trace event type.
+type pageTraceEventType uint8
+
+const (
+	pageTraceSyncEvent  pageTraceEventType = iota // Timestamp emission.
+	pageTraceAllocEvent                           // Allocation of pages.
+	pageTraceFreeEvent                            // Freeing pages.
+	pageTraceScavEvent                            // Scavenging pages.
+)
+
+// pageTraceEmit emits a page trace event.
+//
+// Must run on the system stack as a crude way to prevent preemption.
+//
+//go:systemstack
+func pageTraceEmit(pp *p, now int64, base, npages uintptr, typ pageTraceEventType) {
+	// Get a buffer.
+	var tbp *pageTraceBuf
+	pid := int32(-1)
+	if pp == nil {
+		// We have no P, so take the global buffer.
+		lock(&pageTrace.lock)
+		tbp = &pageTrace.buf
+	} else {
+		tbp = &pp.pageTraceBuf
+		pid = pp.id
+	}
+
+	// Initialize the buffer if necessary.
+	tb := *tbp
+	if tb.buf == nil {
+		tb.buf = (*pageTraceEvents)(sysAlloc(pageTraceBufSize, &memstats.other_sys))
+		tb = tb.writePid(pid)
+	}
+
+	// Handle timestamp and emit a sync event if necessary.
+	if now < tb.timeBase {
+		now = tb.timeBase
+	}
+	if now-tb.timeBase >= pageTraceTimeMaxDelta {
+		tb.timeBase = now
+		tb = tb.writeSync(pid)
+	}
+
+	// Emit the event.
+	tb = tb.writeEvent(pid, now, base, npages, typ)
+
+	// Write back the buffer.
+	*tbp = tb
+	if pp == nil {
+		unlock(&pageTrace.lock)
+	}
+}
+
+const (
+	pageTraceBufSize = 32 << 10
+
+	// These constants describe the per-event timestamp delta encoding.
+	pageTraceTimeLostBits  = 7  // How many bits of precision we lose in the delta.
+	pageTraceTimeDeltaBits = 16 // Size of the delta in bits.
+	pageTraceTimeMaxDelta  = 1 << (pageTraceTimeLostBits + pageTraceTimeDeltaBits)
+)
+
+// pageTraceEvents is the low-level buffer containing the trace data.
+type pageTraceEvents struct {
+	_      sys.NotInHeap
+	events [pageTraceBufSize / 8]uint64
+}
+
+// pageTraceBuf is a wrapper around pageTraceEvents that knows how to write events
+// to the buffer. It tracks state necessary to do so.
+type pageTraceBuf struct {
+	buf      *pageTraceEvents
+	len      int   // How many events have been written so far.
+	timeBase int64 // The current timestamp base from which deltas are produced.
+	finished bool  // Whether this trace buf should no longer flush anything out.
+}
+
+// writePid writes a P ID event indicating which P we're running on.
+//
+// Assumes there's always space in the buffer since this is only called at the
+// beginning of a new buffer.
+//
+// Must run on the system stack as a crude way to prevent preemption.
+//
+//go:systemstack
+func (tb pageTraceBuf) writePid(pid int32) pageTraceBuf {
+	e := uint64(int64(pid))<<3 | 0b100 | uint64(pageTraceSyncEvent)
+	tb.buf.events[tb.len] = e
+	tb.len++
+	return tb
+}
+
+// writeSync writes a sync event, which is just a timestamp. Handles flushing.
+//
+// Must run on the system stack as a crude way to prevent preemption.
+//
+//go:systemstack
+func (tb pageTraceBuf) writeSync(pid int32) pageTraceBuf {
+	if tb.len+1 > len(tb.buf.events) {
+		// N.B. flush will writeSync again.
+		return tb.flush(pid, tb.timeBase)
+	}
+	e := ((uint64(tb.timeBase) >> pageTraceTimeLostBits) << 3) | uint64(pageTraceSyncEvent)
+	tb.buf.events[tb.len] = e
+	tb.len++
+	return tb
+}
+
+// writeEvent handles writing all non-sync and non-pid events. Handles flushing if necessary.
+//
+// pid indicates the P we're currently running on. Necessary in case we need to flush.
+// now is the current nanotime timestamp.
+// base is the base address of whatever group of pages this event is happening to.
+// npages is the length of the group of pages this event is happening to.
+// typ is the event that's happening to these pages.
+//
+// Must run on the system stack as a crude way to prevent preemption.
+//
+//go:systemstack
+func (tb pageTraceBuf) writeEvent(pid int32, now int64, base, npages uintptr, typ pageTraceEventType) pageTraceBuf {
+	large := 0
+	np := npages
+	if npages >= 1024 {
+		large = 1
+		np = 0
+	}
+	if tb.len+1+large > len(tb.buf.events) {
+		tb = tb.flush(pid, now)
+	}
+	if base%pageSize != 0 {
+		throw("base address not page aligned")
+	}
+	e := uint64(base)
+	// The pageShift low-order bits are zero.
+	e |= uint64(typ)        // 2 bits
+	e |= uint64(large) << 2 // 1 bit
+	e |= uint64(np) << 3    // 10 bits
+	// Write the timestamp delta in the upper pageTraceTimeDeltaBits.
+	e |= uint64((now-tb.timeBase)>>pageTraceTimeLostBits) << (64 - pageTraceTimeDeltaBits)
+	tb.buf.events[tb.len] = e
+	if large != 0 {
+		// npages doesn't fit in 10 bits, so write an additional word with that data.
+		tb.buf.events[tb.len+1] = uint64(npages)
+	}
+	tb.len += 1 + large
+	return tb
+}
+
+// flush writes out the contents of the buffer to pageTrace.fd and resets the buffer.
+// It then writes out a P ID event and the first sync event for the new buffer.
+//
+// Must run on the system stack as a crude way to prevent preemption.
+//
+//go:systemstack
+func (tb pageTraceBuf) flush(pid int32, now int64) pageTraceBuf {
+	if !tb.finished {
+		lock(&pageTrace.fdLock)
+		writeFull(uintptr(pageTrace.fd), (*byte)(unsafe.Pointer(&tb.buf.events[0])), tb.len*8)
+		unlock(&pageTrace.fdLock)
+	}
+	tb.len = 0
+	tb.timeBase = now
+	return tb.writePid(pid).writeSync(pid)
+}
+
+var pageTrace struct {
+	// enabled indicates whether tracing is enabled. If true, fd >= 0.
+	//
+	// Safe to read without synchronization because it's only set once
+	// at program initialization.
+	enabled bool
+
+	// buf is the page trace buffer used if there is no P.
+	//
+	// lock protects buf.
+	lock mutex
+	buf  pageTraceBuf
+
+	// fdLock protects writing to fd.
+	//
+	// fd is the file to write the page trace to.
+	fdLock mutex
+	fd     int32
+}
+
+// initPageTrace initializes the page tracing infrastructure from GODEBUG.
+//
+// env must be the value of the GODEBUG environment variable.
+func initPageTrace(env string) {
+	var value string
+	for env != "" {
+		elt, rest := env, ""
+		for i := 0; i < len(env); i++ {
+			if env[i] == ',' {
+				elt, rest = env[:i], env[i+1:]
+				break
+			}
+		}
+		env = rest
+		if hasPrefix(elt, "pagetrace=") {
+			value = elt[len("pagetrace="):]
+			break
+		}
+	}
+	pageTrace.fd = -1
+	if canCreateFile && value != "" {
+		var tmp [4096]byte
+		if len(value) != 0 && len(value) < 4096 {
+			copy(tmp[:], value)
+			pageTrace.fd = create(&tmp[0], 0o664)
+		}
+	}
+	pageTrace.enabled = pageTrace.fd >= 0
+}
+
+// finishPageTrace flushes all P's trace buffers and disables page tracing.
+func finishPageTrace() {
+	if !pageTrace.enabled {
+		return
+	}
+	// Grab worldsema as we're about to execute a ragged barrier.
+	semacquire(&worldsema)
+	systemstack(func() {
+		// Disable tracing. This isn't strictly necessary and it's best-effort.
+		pageTrace.enabled = false
+
+		// Execute a ragged barrier, flushing each trace buffer.
+		forEachP(waitReasonPageTraceFlush, func(pp *p) {
+			if pp.pageTraceBuf.buf != nil {
+				pp.pageTraceBuf = pp.pageTraceBuf.flush(pp.id, nanotime())
+			}
+			pp.pageTraceBuf.finished = true
+		})
+
+		// Write the global have-no-P buffer.
+		lock(&pageTrace.lock)
+		if pageTrace.buf.buf != nil {
+			pageTrace.buf = pageTrace.buf.flush(-1, nanotime())
+		}
+		pageTrace.buf.finished = true
+		unlock(&pageTrace.lock)
+
+		// Safely close the file as nothing else should be allowed to write to the fd.
+		lock(&pageTrace.fdLock)
+		closefd(pageTrace.fd)
+		pageTrace.fd = -1
+		unlock(&pageTrace.fdLock)
+	})
+	semrelease(&worldsema)
+}
+
+// writeFull ensures that a complete write of bn bytes from b is made to fd.
+func writeFull(fd uintptr, b *byte, bn int) {
+	for bn > 0 {
+		n := write(fd, unsafe.Pointer(b), int32(bn))
+		if n == -_EINTR || n == -_EAGAIN {
+			continue
+		}
+		if n < 0 {
+			print("errno=", -n, "\n")
+			throw("writeBytes: bad write")
+		}
+		bn -= int(n)
+		b = addb(b, uintptr(n))
+	}
+}
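A note on enabling this: the file is compiled only when the runtime is built with GOEXPERIMENT=pagetrace (see the go:build constraint above), and tracing is switched on at startup by passing GODEBUG=pagetrace=<path>, which initPageTrace parses to open the output file.

The commit ships no reader for the trace, so as a rough illustration of the encoding documented in the file comment, here is a minimal, hypothetical decoder sketch. It is not part of this change and makes a few assumptions: the trace was written by a little-endian machine (the runtime writes raw native-endian uint64s), heap pages are 8 KiB (pageShift = 13), and the local timeLostBits/timeDeltaBits constants mirror pageTraceTimeLostBits and pageTraceTimeDeltaBits above.

package main

import (
	"encoding/binary"
	"fmt"
	"os"
)

const (
	pageShift     = 13 // assumed: 8 KiB heap pages, matching the runtime's pageSize on most platforms
	timeLostBits  = 7  // mirrors pageTraceTimeLostBits
	timeDeltaBits = 16 // mirrors pageTraceTimeDeltaBits
)

var eventNames = [4]string{"sync", "alloc", "free", "scav"}

func main() {
	// The argument is the path that was passed via GODEBUG=pagetrace=<path>.
	raw, err := os.ReadFile(os.Args[1])
	if err != nil {
		panic(err)
	}
	var lastSync int64 // timestamp of the most recent sync event
	for off := 0; off+8 <= len(raw); off += 8 {
		e := binary.LittleEndian.Uint64(raw[off:])
		if e&0b11 == 0 { // sync event: [61 timestamp or P ID][1 isPID][2 type]
			if e&0b100 != 0 {
				fmt.Printf("sync: P %d\n", int64(e)>>3) // signed; the no-P buffer logs pid -1
			} else {
				lastSync = int64(e>>3) << timeLostBits
				fmt.Printf("sync: t=%d\n", lastSync)
			}
			continue
		}
		// Non-sync event: [16 delta][35 base][10 npages][1 isLarge][2 type].
		npages := (e >> 3) & (1<<10 - 1)
		base := e & ((1<<48 - 1) &^ (1<<pageShift - 1))
		delta := int64(e>>(64-timeDeltaBits)) << timeLostBits
		if e&0b100 != 0 && off+16 <= len(raw) {
			// "Large" event: the full npages value lives in the next word.
			npages = binary.LittleEndian.Uint64(raw[off+8:])
			off += 8
		}
		fmt.Printf("%s: base=%#x npages=%d t=%d\n", eventNames[e&0b11], base, npages, lastSync+delta)
	}
}

Running a GOEXPERIMENT=pagetrace-built program with GODEBUG=pagetrace=/tmp/page.trace and then feeding that file to the sketch would print one line per event. Note that timestamps reconstructed this way are only accurate to 2^pageTraceTimeLostBits nanoseconds, since the encoder deliberately drops the low bits of every delta and sync timestamp.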