diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 19:33:14 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 19:33:14 +0000 |
commit | 36d22d82aa202bb199967e9512281e9a53db42c9 (patch) | |
tree | 105e8c98ddea1c1e4784a60a5a6410fa416be2de /third_party/jpeg-xl/lib/jxl/base/profiler.cc | |
parent | Initial commit. (diff) | |
download | firefox-esr-upstream.tar.xz firefox-esr-upstream.zip |
Adding upstream version 115.7.0esr.upstream/115.7.0esrupstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/jpeg-xl/lib/jxl/base/profiler.cc')
-rw-r--r-- | third_party/jpeg-xl/lib/jxl/base/profiler.cc | 540 |
1 files changed, 540 insertions, 0 deletions
diff --git a/third_party/jpeg-xl/lib/jxl/base/profiler.cc b/third_party/jpeg-xl/lib/jxl/base/profiler.cc new file mode 100644 index 0000000000..a38d9b82b7 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/base/profiler.cc @@ -0,0 +1,540 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/base/profiler.h" + +#if JXL_PROFILER_ENABLED + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> // memcpy + +#include <algorithm> // sort +#include <atomic> +#include <cinttypes> // PRIu64 +#include <hwy/cache_control.h> +#include <limits> +#include <new> + +// Optionally use SIMD in StreamCacheLine if available. +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/base/profiler.cc" +#include <hwy/foreach_target.h> +#include <hwy/highway.h> + +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace profiler { +namespace HWY_NAMESPACE { + +// Overwrites `to` without loading it into cache (read-for-ownership). +// Copies 64 bytes from/to naturally aligned addresses. +void StreamCacheLine(const Packet* HWY_RESTRICT from, Packet* HWY_RESTRICT to) { +#if HWY_TARGET == HWY_SCALAR + hwy::CopyBytes<64>(from, to); +#else + const HWY_CAPPED(uint64_t, 2) d; + HWY_FENCE; + const uint64_t* HWY_RESTRICT from64 = reinterpret_cast<const uint64_t*>(from); + const auto v0 = Load(d, from64 + 0); + const auto v1 = Load(d, from64 + 2); + const auto v2 = Load(d, from64 + 4); + const auto v3 = Load(d, from64 + 6); + // Fences prevent the compiler from reordering loads/stores, which may + // interfere with write-combining. + HWY_FENCE; + uint64_t* HWY_RESTRICT to64 = reinterpret_cast<uint64_t*>(to); + Stream(v0, d, to64 + 0); + Stream(v1, d, to64 + 2); + Stream(v2, d, to64 + 4); + Stream(v3, d, to64 + 6); + HWY_FENCE; +#endif +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace profiler +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { +namespace profiler { + +HWY_EXPORT(StreamCacheLine); + +namespace { + +// How many mebibytes to allocate (if JXL_PROFILER_ENABLED) per thread that +// enters at least one zone. Once this buffer is full, the thread will analyze +// packets (two per zone), which introduces observer overhead. +#ifndef PROFILER_THREAD_STORAGE +#define PROFILER_THREAD_STORAGE 32ULL +#endif + +#define PROFILER_PRINT_OVERHEAD 0 + +// Upper bounds for fixed-size data structures (guarded via HWY_ASSERT): +constexpr size_t kMaxDepth = 64; // Maximum nesting of zones. +constexpr size_t kMaxZones = 256; // Total number of zones. + +// Stack of active (entered but not exited) zones. POD, uninitialized. +// Used to deduct child duration from the parent's self time. +struct ActiveZone { + const char* name; + uint64_t entry_timestamp; + uint64_t child_total; +}; + +// Totals for all Zones with the same name. POD, must be zero-initialized. +struct ZoneTotals { + uint64_t total_duration; + const char* name; + uint64_t num_calls; +}; + +template <typename T> +inline T ClampedSubtract(const T minuend, const T subtrahend) { + if (subtrahend > minuend) { + return 0; + } + return minuend - subtrahend; +} + +} // namespace + +// Per-thread call graph (stack) and ZoneTotals for each zone. +class Results { + public: + Results() { + // Zero-initialize all accumulators (avoids a check for num_zones_ == 0). + memset(zones_, 0, sizeof(zones_)); + } + + // Used for computing overhead when this thread encounters its first Zone. + // This has no observable effect apart from increasing "analyze_elapsed_". + uint64_t ZoneDuration(const Packet* packets) { + HWY_ASSERT(depth_ == 0); + HWY_ASSERT(num_zones_ == 0); + AnalyzePackets(packets, 2); + const uint64_t duration = zones_[0].total_duration; + zones_[0].num_calls = 0; + zones_[0].total_duration = 0; + HWY_ASSERT(depth_ == 0); + num_zones_ = 0; + return duration; + } + + void SetSelfOverhead(const uint64_t self_overhead) { + self_overhead_ = self_overhead; + } + + void SetChildOverhead(const uint64_t child_overhead) { + child_overhead_ = child_overhead; + } + + // Draw all required information from the packets, which can be discarded + // afterwards. Called whenever this thread's storage is full. + void AnalyzePackets(const Packet* HWY_RESTRICT packets, + const size_t num_packets) { + // Ensures prior weakly-ordered streaming stores are globally visible. + hwy::FlushStream(); + + const uint64_t t0 = TicksBefore(); + + for (size_t i = 0; i < num_packets; ++i) { + const uint64_t timestamp = packets[i].timestamp; + // Entering a zone + if (packets[i].name != nullptr) { + HWY_ASSERT(depth_ < kMaxDepth); + zone_stack_[depth_].name = packets[i].name; + zone_stack_[depth_].entry_timestamp = timestamp; + zone_stack_[depth_].child_total = 0; + ++depth_; + continue; + } + + HWY_ASSERT(depth_ != 0); + const ActiveZone& active = zone_stack_[depth_ - 1]; + const uint64_t duration = timestamp - active.entry_timestamp; + const uint64_t self_duration = ClampedSubtract( + duration, self_overhead_ + child_overhead_ + active.child_total); + + UpdateOrAdd(active.name, 1, self_duration); + --depth_; + + // "Deduct" the nested time from its parent's self_duration. + if (depth_ != 0) { + zone_stack_[depth_ - 1].child_total += duration + child_overhead_; + } + } + + const uint64_t t1 = TicksAfter(); + analyze_elapsed_ += t1 - t0; + } + + // Incorporates results from another thread. Call after all threads have + // exited any zones. + void Assimilate(const Results& other) { + const uint64_t t0 = TicksBefore(); + HWY_ASSERT(depth_ == 0); + HWY_ASSERT(other.depth_ == 0); + + for (size_t i = 0; i < other.num_zones_; ++i) { + const ZoneTotals& zone = other.zones_[i]; + UpdateOrAdd(zone.name, zone.num_calls, zone.total_duration); + } + const uint64_t t1 = TicksAfter(); + analyze_elapsed_ += t1 - t0 + other.analyze_elapsed_; + } + + // Single-threaded. + void Print() { + const uint64_t t0 = TicksBefore(); + MergeDuplicates(); + + // Sort by decreasing total (self) cost. + std::sort(zones_, zones_ + num_zones_, + [](const ZoneTotals& r1, const ZoneTotals& r2) { + return r1.total_duration > r2.total_duration; + }); + + uint64_t total_visible_duration = 0; + for (size_t i = 0; i < num_zones_; ++i) { + const ZoneTotals& r = zones_[i]; + if (r.name[0] != '@') { + total_visible_duration += r.total_duration; + printf("%-40s: %10" PRIu64 " x %15" PRIu64 "= %15" PRIu64 "\n", r.name, + r.num_calls, r.total_duration / r.num_calls, r.total_duration); + } + } + + const uint64_t t1 = TicksAfter(); + analyze_elapsed_ += t1 - t0; + printf("Total clocks during analysis: %" PRIu64 "\n", analyze_elapsed_); + printf("Total clocks measured: %" PRIu64 "\n", total_visible_duration); + } + + // Single-threaded. Clears all results as if no zones had been recorded. + void Reset() { + analyze_elapsed_ = 0; + HWY_ASSERT(depth_ == 0); + num_zones_ = 0; + memset(zone_stack_, 0, sizeof(zone_stack_)); + memset(zones_, 0, sizeof(zones_)); + } + + private: + // Updates ZoneTotals of the same name, or inserts a new one if this thread + // has not yet seen that name. Uses a self-organizing list data structure, + // which avoids dynamic memory allocations and is faster than unordered_map. + void UpdateOrAdd(const char* name, const uint64_t num_calls, + const uint64_t duration) { + // Special case for first zone: (maybe) update, without swapping. + if (zones_[0].name == name) { + zones_[0].total_duration += duration; + zones_[0].num_calls += num_calls; + return; + } + + // Look for a zone with the same name. + for (size_t i = 1; i < num_zones_; ++i) { + if (zones_[i].name == name) { + zones_[i].total_duration += duration; + zones_[i].num_calls += num_calls; + // Swap with predecessor (more conservative than move to front, + // but at least as successful). + std::swap(zones_[i - 1], zones_[i]); + return; + } + } + + // Not found; create a new ZoneTotals. + HWY_ASSERT(num_zones_ < kMaxZones); + ZoneTotals* HWY_RESTRICT zone = zones_ + num_zones_; + zone->name = name; + zone->num_calls = num_calls; + zone->total_duration = duration; + ++num_zones_; + } + + // Each instantiation of a function template seems to get its own copy of + // __func__ and GCC doesn't merge them. An N^2 search for duplicates is + // acceptable because we only expect a few dozen zones. + void MergeDuplicates() { + for (size_t i = 0; i < num_zones_; ++i) { + // Add any subsequent duplicates to num_calls and total_duration. + for (size_t j = i + 1; j < num_zones_;) { + if (!strcmp(zones_[i].name, zones_[j].name)) { + zones_[i].num_calls += zones_[j].num_calls; + zones_[i].total_duration += zones_[j].total_duration; + // Fill hole with last item. + zones_[j] = zones_[--num_zones_]; + } else { // Name differed, try next ZoneTotals. + ++j; + } + } + } + } + + uint64_t analyze_elapsed_ = 0; + uint64_t self_overhead_ = 0; + uint64_t child_overhead_ = 0; + + size_t depth_ = 0; // Number of active zones <= kMaxDepth. + size_t num_zones_ = 0; // Number of unique zones <= kMaxZones. + + // After other members to avoid large pointer offsets. + alignas(64) ActiveZone zone_stack_[kMaxDepth]; // Last = newest + alignas(64) ZoneTotals zones_[kMaxZones]; // Self-organizing list +}; + +ThreadSpecific::ThreadSpecific() + : max_packets_(PROFILER_THREAD_STORAGE << 16), // MiB / sizeof(Packet) + packets_(hwy::AllocateAligned<Packet>(max_packets_)), + num_packets_(0), + results_(hwy::MakeUniqueAligned<Results>()) {} + +ThreadSpecific::~ThreadSpecific() {} + +void ThreadSpecific::FlushBuffer() { + if (num_packets_ + kBufferCapacity > max_packets_) { + results_->AnalyzePackets(packets_.get(), num_packets_); + num_packets_ = 0; + } + // This buffering halves observer overhead and decreases the overall + // runtime by about 3%. + HWY_DYNAMIC_DISPATCH(StreamCacheLine) + (buffer_, packets_.get() + num_packets_); + num_packets_ += kBufferCapacity; + buffer_size_ = 0; +} + +void ThreadSpecific::AnalyzeRemainingPackets() { + // Storage full => empty it. + if (num_packets_ + buffer_size_ > max_packets_) { + results_->AnalyzePackets(packets_.get(), num_packets_); + num_packets_ = 0; + } + + // Move buffer to storage + memcpy(packets_.get() + num_packets_, buffer_, buffer_size_ * sizeof(Packet)); + num_packets_ += buffer_size_; + buffer_size_ = 0; + + results_->AnalyzePackets(packets_.get(), num_packets_); + num_packets_ = 0; +} + +namespace { + +class HalfSampleMode { + public: + // Returns mode. "sorted" must be in ascending order. + template <typename T> + T operator()(const T* const HWY_RESTRICT sorted, + const size_t num_values) const { + int64_t center = num_values / 2; + int64_t width = num_values; + + // Zoom in on modal intervals of decreasing width. Stop before we reach + // width=1, i.e. single values, for which there is no "slope". + while (width > 2) { + // Round up so we can still reach the outer edges of odd widths. + width = (width + 1) / 2; + + center = CenterOfIntervalWithMinSlope(sorted, num_values, center, width); + } + + return sorted[center]; // mode := middle value in modal interval. + } + + private: + // Returns center of the densest region [c-radius, c+radius]. + template <typename T> + static HWY_INLINE int64_t CenterOfIntervalWithMinSlope( + const T* HWY_RESTRICT sorted, const int64_t total_values, + const int64_t center, const int64_t width) { + const int64_t radius = (width + 1) / 2; + + auto compute_slope = [radius, total_values, sorted]( + int64_t c, int64_t* actual_center = nullptr) { + // For symmetry, check 2*radius+1 values, i.e. [min, max]. + const int64_t min = std::max(c - radius, int64_t(0)); + const int64_t max = std::min(c + radius, total_values - 1); + HWY_ASSERT(min < max); + HWY_ASSERT(sorted[min] <= + sorted[max] + std::numeric_limits<float>::epsilon()); + const float dx = max - min + 1; + const float slope = (sorted[max] - sorted[min]) / dx; + + if (actual_center != nullptr) { + // c may be out of bounds, so return center of the clamped bounds. + *actual_center = (min + max + 1) / 2; + } + return slope; + }; + + // First find min_slope for all centers. + float min_slope = std::numeric_limits<float>::max(); + for (int64_t c = center - radius; c <= center + radius; ++c) { + min_slope = std::min(min_slope, compute_slope(c)); + } + + // Candidates := centers with slope ~= min_slope. + std::vector<int64_t> candidates; + for (int64_t c = center - radius; c <= center + radius; ++c) { + int64_t actual_center; + const float slope = compute_slope(c, &actual_center); + if (slope <= min_slope * 1.001f) { + candidates.push_back(actual_center); + } + } + + // Keep the median. + HWY_ASSERT(!candidates.empty()); + if (candidates.size() == 1) return candidates[0]; + std::nth_element(candidates.begin(), + candidates.begin() + candidates.size() / 2, + candidates.end()); + return candidates[candidates.size() / 2]; + } +}; + +} // namespace + +void ThreadSpecific::ComputeOverhead() { + // Delay after capturing timestamps before/after the actual zone runs. Even + // with frequency throttling disabled, this has a multimodal distribution, + // including 32, 34, 48, 52, 59, 62. + uint64_t self_overhead; + { + const size_t kNumSamples = 32; + uint32_t samples[kNumSamples]; + for (size_t idx_sample = 0; idx_sample < kNumSamples; ++idx_sample) { + const size_t kNumDurations = 1024; + uint32_t durations[kNumDurations]; + + for (size_t idx_duration = 0; idx_duration < kNumDurations; + ++idx_duration) { + { // + PROFILER_ZONE("Dummy Zone (never shown)"); + } + const uint64_t duration = results_->ZoneDuration(buffer_); + buffer_size_ = 0; + durations[idx_duration] = static_cast<uint32_t>(duration); + HWY_ASSERT(num_packets_ == 0); + } + std::sort(durations, durations + kNumDurations); + samples[idx_sample] = HalfSampleMode()(durations, kNumDurations); + } + // Median. + std::sort(samples, samples + kNumSamples); + self_overhead = samples[kNumSamples / 2]; +#if PROFILER_PRINT_OVERHEAD + printf("Overhead: %" PRIu64 "\n", static_cast<uint64_t>(self_overhead)); +#endif + results_->SetSelfOverhead(self_overhead); + } + + // Delay before capturing start timestamp / after end timestamp. + const size_t kNumSamples = 32; + uint32_t samples[kNumSamples]; + for (size_t idx_sample = 0; idx_sample < kNumSamples; ++idx_sample) { + const size_t kNumDurations = 16; + uint32_t durations[kNumDurations]; + for (size_t idx_duration = 0; idx_duration < kNumDurations; + ++idx_duration) { + const size_t kReps = 10000; + // Analysis time should not be included => must fit within buffer. + HWY_ASSERT(kReps * 2 < max_packets_); + hwy::FlushStream(); + const uint64_t t0 = TicksBefore(); + for (size_t i = 0; i < kReps; ++i) { + PROFILER_ZONE("Dummy"); + } + hwy::FlushStream(); + const uint64_t t1 = TicksAfter(); + HWY_ASSERT(num_packets_ + buffer_size_ == kReps * 2); + buffer_size_ = 0; + num_packets_ = 0; + const uint64_t avg_duration = (t1 - t0 + kReps / 2) / kReps; + durations[idx_duration] = + static_cast<uint32_t>(ClampedSubtract(avg_duration, self_overhead)); + } + std::sort(durations, durations + kNumDurations); + samples[idx_sample] = HalfSampleMode()(durations, kNumDurations); + } + std::sort(samples, samples + kNumSamples); + const uint64_t child_overhead = samples[9 * kNumSamples / 10]; +#if PROFILER_PRINT_OVERHEAD + printf("Child overhead: %" PRIu64 "\n", + static_cast<uint64_t>(child_overhead)); +#endif + results_->SetChildOverhead(child_overhead); +} + +namespace { + +// Could be a static member of Zone, but that would expose <atomic> in header. +std::atomic<ThreadSpecific*>& GetHead() { + static std::atomic<ThreadSpecific*> head_{nullptr}; // Owning + return head_; +} + +} // namespace + +// Thread-safe. +ThreadSpecific* Zone::InitThreadSpecific() { + ThreadSpecific* thread_specific = + hwy::MakeUniqueAligned<ThreadSpecific>().release(); + + // Insert into unordered list + std::atomic<ThreadSpecific*>& head = GetHead(); + ThreadSpecific* old_head = head.load(std::memory_order_relaxed); + thread_specific->SetNext(old_head); + while (!head.compare_exchange_weak(old_head, thread_specific, + std::memory_order_release, + std::memory_order_relaxed)) { + thread_specific->SetNext(old_head); + // TODO(janwas): pause + } + + // ComputeOverhead also creates a Zone, so this needs to be set before that + // to prevent infinite recursion. + GetThreadSpecific() = thread_specific; + + thread_specific->ComputeOverhead(); + return thread_specific; +} + +// Single-threaded. +/*static*/ void Zone::PrintResults() { + ThreadSpecific* head = GetHead().load(std::memory_order_relaxed); + ThreadSpecific* p = head; + while (p) { + p->AnalyzeRemainingPackets(); + + // Combine all threads into a single Result. + if (p != head) { + head->GetResults().Assimilate(p->GetResults()); + p->GetResults().Reset(); + } + + p = p->GetNext(); + } + + if (head != nullptr) { + head->GetResults().Print(); + head->GetResults().Reset(); + } +} + +} // namespace profiler +} // namespace jxl + +#endif // HWY_ONCE +#endif // JXL_PROFILER_ENABLED |