diff options
Diffstat (limited to 'src/runtime/mgclimit.go')
-rw-r--r-- | src/runtime/mgclimit.go | 484 |
1 files changed, 484 insertions, 0 deletions
diff --git a/src/runtime/mgclimit.go b/src/runtime/mgclimit.go new file mode 100644 index 0000000..ef3cc08 --- /dev/null +++ b/src/runtime/mgclimit.go @@ -0,0 +1,484 @@ +// Copyright 2022 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +import "runtime/internal/atomic" + +// gcCPULimiter is a mechanism to limit GC CPU utilization in situations +// where it might become excessive and inhibit application progress (e.g. +// a death spiral). +// +// The core of the limiter is a leaky bucket mechanism that fills with GC +// CPU time and drains with mutator time. Because the bucket fills and +// drains with time directly (i.e. without any weighting), this effectively +// sets a very conservative limit of 50%. This limit could be enforced directly, +// however, but the purpose of the bucket is to accommodate spikes in GC CPU +// utilization without hurting throughput. +// +// Note that the bucket in the leaky bucket mechanism can never go negative, +// so the GC never gets credit for a lot of CPU time spent without the GC +// running. This is intentional, as an application that stays idle for, say, +// an entire day, could build up enough credit to fail to prevent a death +// spiral the following day. The bucket's capacity is the GC's only leeway. +// +// The capacity thus also sets the window the limiter considers. For example, +// if the capacity of the bucket is 1 cpu-second, then the limiter will not +// kick in until at least 1 full cpu-second in the last 2 cpu-second window +// is spent on GC CPU time. +var gcCPULimiter gcCPULimiterState + +type gcCPULimiterState struct { + lock atomic.Uint32 + + enabled atomic.Bool + bucket struct { + // Invariants: + // - fill >= 0 + // - capacity >= 0 + // - fill <= capacity + fill, capacity uint64 + } + // overflow is the cumulative amount of GC CPU time that we tried to fill the + // bucket with but exceeded its capacity. + overflow uint64 + + // gcEnabled is an internal copy of gcBlackenEnabled that determines + // whether the limiter tracks total assist time. + // + // gcBlackenEnabled isn't used directly so as to keep this structure + // unit-testable. + gcEnabled bool + + // transitioning is true when the GC is in a STW and transitioning between + // the mark and sweep phases. + transitioning bool + + // assistTimePool is the accumulated assist time since the last update. + assistTimePool atomic.Int64 + + // idleMarkTimePool is the accumulated idle mark time since the last update. + idleMarkTimePool atomic.Int64 + + // idleTimePool is the accumulated time Ps spent on the idle list since the last update. + idleTimePool atomic.Int64 + + // lastUpdate is the nanotime timestamp of the last time update was called. + // + // Updated under lock, but may be read concurrently. + lastUpdate atomic.Int64 + + // lastEnabledCycle is the GC cycle that last had the limiter enabled. + lastEnabledCycle atomic.Uint32 + + // nprocs is an internal copy of gomaxprocs, used to determine total available + // CPU time. + // + // gomaxprocs isn't used directly so as to keep this structure unit-testable. + nprocs int32 + + // test indicates whether this instance of the struct was made for testing purposes. + test bool +} + +// limiting returns true if the CPU limiter is currently enabled, meaning the Go GC +// should take action to limit CPU utilization. +// +// It is safe to call concurrently with other operations. +func (l *gcCPULimiterState) limiting() bool { + return l.enabled.Load() +} + +// startGCTransition notifies the limiter of a GC transition. +// +// This call takes ownership of the limiter and disables all other means of +// updating the limiter. Release ownership by calling finishGCTransition. +// +// It is safe to call concurrently with other operations. +func (l *gcCPULimiterState) startGCTransition(enableGC bool, now int64) { + if !l.tryLock() { + // This must happen during a STW, so we can't fail to acquire the lock. + // If we did, something went wrong. Throw. + throw("failed to acquire lock to start a GC transition") + } + if l.gcEnabled == enableGC { + throw("transitioning GC to the same state as before?") + } + // Flush whatever was left between the last update and now. + l.updateLocked(now) + l.gcEnabled = enableGC + l.transitioning = true + // N.B. finishGCTransition releases the lock. + // + // We don't release here to increase the chance that if there's a failure + // to finish the transition, that we throw on failing to acquire the lock. +} + +// finishGCTransition notifies the limiter that the GC transition is complete +// and releases ownership of it. It also accumulates STW time in the bucket. +// now must be the timestamp from the end of the STW pause. +func (l *gcCPULimiterState) finishGCTransition(now int64) { + if !l.transitioning { + throw("finishGCTransition called without starting one?") + } + // Count the full nprocs set of CPU time because the world is stopped + // between startGCTransition and finishGCTransition. Even though the GC + // isn't running on all CPUs, it is preventing user code from doing so, + // so it might as well be. + if lastUpdate := l.lastUpdate.Load(); now >= lastUpdate { + l.accumulate(0, (now-lastUpdate)*int64(l.nprocs)) + } + l.lastUpdate.Store(now) + l.transitioning = false + l.unlock() +} + +// gcCPULimiterUpdatePeriod dictates the maximum amount of wall-clock time +// we can go before updating the limiter. +const gcCPULimiterUpdatePeriod = 10e6 // 10ms + +// needUpdate returns true if the limiter's maximum update period has been +// exceeded, and so would benefit from an update. +func (l *gcCPULimiterState) needUpdate(now int64) bool { + return now-l.lastUpdate.Load() > gcCPULimiterUpdatePeriod +} + +// addAssistTime notifies the limiter of additional assist time. It will be +// included in the next update. +func (l *gcCPULimiterState) addAssistTime(t int64) { + l.assistTimePool.Add(t) +} + +// addIdleTime notifies the limiter of additional time a P spent on the idle list. It will be +// subtracted from the total CPU time in the next update. +func (l *gcCPULimiterState) addIdleTime(t int64) { + l.idleTimePool.Add(t) +} + +// update updates the bucket given runtime-specific information. now is the +// current monotonic time in nanoseconds. +// +// This is safe to call concurrently with other operations, except *GCTransition. +func (l *gcCPULimiterState) update(now int64) { + if !l.tryLock() { + // We failed to acquire the lock, which means something else is currently + // updating. Just drop our update, the next one to update will include + // our total assist time. + return + } + if l.transitioning { + throw("update during transition") + } + l.updateLocked(now) + l.unlock() +} + +// updateLocked is the implementation of update. l.lock must be held. +func (l *gcCPULimiterState) updateLocked(now int64) { + lastUpdate := l.lastUpdate.Load() + if now < lastUpdate { + // Defensively avoid overflow. This isn't even the latest update anyway. + return + } + windowTotalTime := (now - lastUpdate) * int64(l.nprocs) + l.lastUpdate.Store(now) + + // Drain the pool of assist time. + assistTime := l.assistTimePool.Load() + if assistTime != 0 { + l.assistTimePool.Add(-assistTime) + } + + // Drain the pool of idle time. + idleTime := l.idleTimePool.Load() + if idleTime != 0 { + l.idleTimePool.Add(-idleTime) + } + + if !l.test { + // Consume time from in-flight events. Make sure we're not preemptible so allp can't change. + // + // The reason we do this instead of just waiting for those events to finish and push updates + // is to ensure that all the time we're accounting for happened sometime between lastUpdate + // and now. This dramatically simplifies reasoning about the limiter because we're not at + // risk of extra time being accounted for in this window than actually happened in this window, + // leading to all sorts of weird transient behavior. + mp := acquirem() + for _, pp := range allp { + typ, duration := pp.limiterEvent.consume(now) + switch typ { + case limiterEventIdleMarkWork: + fallthrough + case limiterEventIdle: + idleTime += duration + sched.idleTime.Add(duration) + case limiterEventMarkAssist: + fallthrough + case limiterEventScavengeAssist: + assistTime += duration + case limiterEventNone: + break + default: + throw("invalid limiter event type found") + } + } + releasem(mp) + } + + // Compute total GC time. + windowGCTime := assistTime + if l.gcEnabled { + windowGCTime += int64(float64(windowTotalTime) * gcBackgroundUtilization) + } + + // Subtract out all idle time from the total time. Do this after computing + // GC time, because the background utilization is dependent on the *real* + // total time, not the total time after idle time is subtracted. + // + // Idle time is counted as any time that a P is on the P idle list plus idle mark + // time. Idle mark workers soak up time that the application spends idle. + // + // On a heavily undersubscribed system, any additional idle time can skew GC CPU + // utilization, because the GC might be executing continuously and thrashing, + // yet the CPU utilization with respect to GOMAXPROCS will be quite low, so + // the limiter fails to turn on. By subtracting idle time, we're removing time that + // we know the application was idle giving a more accurate picture of whether + // the GC is thrashing. + // + // Note that this can cause the limiter to turn on even if it's not needed. For + // instance, on a system with 32 Ps but only 1 running goroutine, each GC will have + // 8 dedicated GC workers. Assuming the GC cycle is half mark phase and half sweep + // phase, then the GC CPU utilization over that cycle, with idle time removed, will + // be 8/(8+2) = 80%. Even though the limiter turns on, though, assist should be + // unnecessary, as the GC has way more CPU time to outpace the 1 goroutine that's + // running. + windowTotalTime -= idleTime + + l.accumulate(windowTotalTime-windowGCTime, windowGCTime) +} + +// accumulate adds time to the bucket and signals whether the limiter is enabled. +// +// This is an internal function that deals just with the bucket. Prefer update. +// l.lock must be held. +func (l *gcCPULimiterState) accumulate(mutatorTime, gcTime int64) { + headroom := l.bucket.capacity - l.bucket.fill + enabled := headroom == 0 + + // Let's be careful about three things here: + // 1. The addition and subtraction, for the invariants. + // 2. Overflow. + // 3. Excessive mutation of l.enabled, which is accessed + // by all assists, potentially more than once. + change := gcTime - mutatorTime + + // Handle limiting case. + if change > 0 && headroom <= uint64(change) { + l.overflow += uint64(change) - headroom + l.bucket.fill = l.bucket.capacity + if !enabled { + l.enabled.Store(true) + l.lastEnabledCycle.Store(memstats.numgc + 1) + } + return + } + + // Handle non-limiting cases. + if change < 0 && l.bucket.fill <= uint64(-change) { + // Bucket emptied. + l.bucket.fill = 0 + } else { + // All other cases. + l.bucket.fill -= uint64(-change) + } + if change != 0 && enabled { + l.enabled.Store(false) + } +} + +// tryLock attempts to lock l. Returns true on success. +func (l *gcCPULimiterState) tryLock() bool { + return l.lock.CompareAndSwap(0, 1) +} + +// unlock releases the lock on l. Must be called if tryLock returns true. +func (l *gcCPULimiterState) unlock() { + old := l.lock.Swap(0) + if old != 1 { + throw("double unlock") + } +} + +// capacityPerProc is the limiter's bucket capacity for each P in GOMAXPROCS. +const capacityPerProc = 1e9 // 1 second in nanoseconds + +// resetCapacity updates the capacity based on GOMAXPROCS. Must not be called +// while the GC is enabled. +// +// It is safe to call concurrently with other operations. +func (l *gcCPULimiterState) resetCapacity(now int64, nprocs int32) { + if !l.tryLock() { + // This must happen during a STW, so we can't fail to acquire the lock. + // If we did, something went wrong. Throw. + throw("failed to acquire lock to reset capacity") + } + // Flush the rest of the time for this period. + l.updateLocked(now) + l.nprocs = nprocs + + l.bucket.capacity = uint64(nprocs) * capacityPerProc + if l.bucket.fill > l.bucket.capacity { + l.bucket.fill = l.bucket.capacity + l.enabled.Store(true) + l.lastEnabledCycle.Store(memstats.numgc + 1) + } else if l.bucket.fill < l.bucket.capacity { + l.enabled.Store(false) + } + l.unlock() +} + +// limiterEventType indicates the type of an event occurring on some P. +// +// These events represent the full set of events that the GC CPU limiter tracks +// to execute its function. +// +// This type may use no more than limiterEventBits bits of information. +type limiterEventType uint8 + +const ( + limiterEventNone limiterEventType = iota // None of the following events. + limiterEventIdleMarkWork // Refers to an idle mark worker (see gcMarkWorkerMode). + limiterEventMarkAssist // Refers to mark assist (see gcAssistAlloc). + limiterEventScavengeAssist // Refers to a scavenge assist (see allocSpan). + limiterEventIdle // Refers to time a P spent on the idle list. + + limiterEventBits = 3 +) + +// limiterEventTypeMask is a mask for the bits in p.limiterEventStart that represent +// the event type. The rest of the bits of that field represent a timestamp. +const ( + limiterEventTypeMask = uint64((1<<limiterEventBits)-1) << (64 - limiterEventBits) + limiterEventStampNone = limiterEventStamp(0) +) + +// limiterEventStamp is a nanotime timestamp packed with a limiterEventType. +type limiterEventStamp uint64 + +// makeLimiterEventStamp creates a new stamp from the event type and the current timestamp. +func makeLimiterEventStamp(typ limiterEventType, now int64) limiterEventStamp { + return limiterEventStamp(uint64(typ)<<(64-limiterEventBits) | (uint64(now) &^ limiterEventTypeMask)) +} + +// duration computes the difference between now and the start time stored in the stamp. +// +// Returns 0 if the difference is negative, which may happen if now is stale or if the +// before and after timestamps cross a 2^(64-limiterEventBits) boundary. +func (s limiterEventStamp) duration(now int64) int64 { + // The top limiterEventBits bits of the timestamp are derived from the current time + // when computing a duration. + start := int64((uint64(now) & limiterEventTypeMask) | (uint64(s) &^ limiterEventTypeMask)) + if now < start { + return 0 + } + return now - start +} + +// type extracts the event type from the stamp. +func (s limiterEventStamp) typ() limiterEventType { + return limiterEventType(s >> (64 - limiterEventBits)) +} + +// limiterEvent represents tracking state for an event tracked by the GC CPU limiter. +type limiterEvent struct { + stamp atomic.Uint64 // Stores a limiterEventStamp. +} + +// start begins tracking a new limiter event of the current type. If an event +// is already in flight, then a new event cannot begin because the current time is +// already being attributed to that event. In this case, this function returns false. +// Otherwise, it returns true. +// +// The caller must be non-preemptible until at least stop is called or this function +// returns false. Because this is trying to measure "on-CPU" time of some event, getting +// scheduled away during it can mean that whatever we're measuring isn't a reflection +// of "on-CPU" time. The OS could deschedule us at any time, but we want to maintain as +// close of an approximation as we can. +func (e *limiterEvent) start(typ limiterEventType, now int64) bool { + if limiterEventStamp(e.stamp.Load()).typ() != limiterEventNone { + return false + } + e.stamp.Store(uint64(makeLimiterEventStamp(typ, now))) + return true +} + +// consume acquires the partial event CPU time from any in-flight event. +// It achieves this by storing the current time as the new event time. +// +// Returns the type of the in-flight event, as well as how long it's currently been +// executing for. Returns limiterEventNone if no event is active. +func (e *limiterEvent) consume(now int64) (typ limiterEventType, duration int64) { + // Read the limiter event timestamp and update it to now. + for { + old := limiterEventStamp(e.stamp.Load()) + typ = old.typ() + if typ == limiterEventNone { + // There's no in-flight event, so just push that up. + return + } + duration = old.duration(now) + if duration == 0 { + // We might have a stale now value, or this crossed the + // 2^(64-limiterEventBits) boundary in the clock readings. + // Just ignore it. + return limiterEventNone, 0 + } + new := makeLimiterEventStamp(typ, now) + if e.stamp.CompareAndSwap(uint64(old), uint64(new)) { + break + } + } + return +} + +// stop stops the active limiter event. Throws if the +// +// The caller must be non-preemptible across the event. See start as to why. +func (e *limiterEvent) stop(typ limiterEventType, now int64) { + var stamp limiterEventStamp + for { + stamp = limiterEventStamp(e.stamp.Load()) + if stamp.typ() != typ { + print("runtime: want=", typ, " got=", stamp.typ(), "\n") + throw("limiterEvent.stop: found wrong event in p's limiter event slot") + } + if e.stamp.CompareAndSwap(uint64(stamp), uint64(limiterEventStampNone)) { + break + } + } + duration := stamp.duration(now) + if duration == 0 { + // It's possible that we're missing time because we crossed a + // 2^(64-limiterEventBits) boundary between the start and end. + // In this case, we're dropping that information. This is OK because + // at worst it'll cause a transient hiccup that will quickly resolve + // itself as all new timestamps begin on the other side of the boundary. + // Such a hiccup should be incredibly rare. + return + } + // Account for the event. + switch typ { + case limiterEventIdleMarkWork: + gcCPULimiter.addIdleTime(duration) + case limiterEventIdle: + gcCPULimiter.addIdleTime(duration) + sched.idleTime.Add(duration) + case limiterEventMarkAssist: + fallthrough + case limiterEventScavengeAssist: + gcCPULimiter.addAssistTime(duration) + default: + throw("limiterEvent.stop: invalid limiter event type found") + } +} |