Diffstat (limited to 'third_party/jpeg-xl/lib/jxl/base/profiler.cc')
-rw-r--r--  third_party/jpeg-xl/lib/jxl/base/profiler.cc | 540
1 file changed, 540 insertions(+), 0 deletions(-)
diff --git a/third_party/jpeg-xl/lib/jxl/base/profiler.cc b/third_party/jpeg-xl/lib/jxl/base/profiler.cc
new file mode 100644
index 0000000000..a38d9b82b7
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/base/profiler.cc
@@ -0,0 +1,540 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/base/profiler.h"
+
+#if JXL_PROFILER_ENABLED
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h> // memcpy
+
+#include <algorithm> // sort
+#include <atomic>
+#include <cinttypes> // PRIu64
+#include <hwy/cache_control.h>
+#include <limits>
+#include <new>
+#include <vector> // std::vector in HalfSampleMode
+
+// Optionally use SIMD in StreamCacheLine if available.
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/base/profiler.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace profiler {
+namespace HWY_NAMESPACE {
+
+// Overwrites `to` without loading it into cache (avoids the
+// read-for-ownership a normal store would trigger).
+// Copies 64 bytes from/to naturally aligned addresses.
+void StreamCacheLine(const Packet* HWY_RESTRICT from, Packet* HWY_RESTRICT to) {
+#if HWY_TARGET == HWY_SCALAR
+ hwy::CopyBytes<64>(from, to);
+#else
+ const HWY_CAPPED(uint64_t, 2) d;
+ HWY_FENCE;
+ const uint64_t* HWY_RESTRICT from64 = reinterpret_cast<const uint64_t*>(from);
+ const auto v0 = Load(d, from64 + 0);
+ const auto v1 = Load(d, from64 + 2);
+ const auto v2 = Load(d, from64 + 4);
+ const auto v3 = Load(d, from64 + 6);
+ // Fences prevent the compiler from reordering loads/stores, which may
+ // interfere with write-combining.
+ HWY_FENCE;
+ uint64_t* HWY_RESTRICT to64 = reinterpret_cast<uint64_t*>(to);
+ Stream(v0, d, to64 + 0);
+ Stream(v1, d, to64 + 2);
+ Stream(v2, d, to64 + 4);
+ Stream(v3, d, to64 + 6);
+ HWY_FENCE;
+#endif
+}
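+
+// Note (editorial, illustrative): Stream() issues weakly-ordered,
+// non-temporal stores, so the written cache line bypasses the cache
+// hierarchy and does not evict the caller's working set. Such stores must
+// be made globally visible before the data is read back, which is why
+// AnalyzePackets() below begins with hwy::FlushStream().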
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace profiler
+} // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+namespace profiler {
+
+HWY_EXPORT(StreamCacheLine);
+
+namespace {
+
+// How many mebibytes to allocate (if JXL_PROFILER_ENABLED) per thread that
+// enters at least one zone. Once this buffer is full, the thread will analyze
+// packets (two per zone), which introduces observer overhead.
+#ifndef PROFILER_THREAD_STORAGE
+#define PROFILER_THREAD_STORAGE 32ULL
+#endif
+
+#define PROFILER_PRINT_OVERHEAD 0
+
+// Upper bounds for fixed-size data structures (guarded via HWY_ASSERT):
+constexpr size_t kMaxDepth = 64; // Maximum nesting of zones.
+constexpr size_t kMaxZones = 256; // Total number of zones.
+
+// Stack of active (entered but not exited) zones. POD, uninitialized.
+// Used to deduct child duration from the parent's self time.
+struct ActiveZone {
+ const char* name;
+ uint64_t entry_timestamp;
+ uint64_t child_total;
+};
+
+// Totals for all Zones with the same name. POD, must be zero-initialized.
+struct ZoneTotals {
+ uint64_t total_duration;
+ const char* name;
+ uint64_t num_calls;
+};
+
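+// Saturating subtraction: returns 0 instead of wrapping when the result
+// would be negative, e.g. ClampedSubtract<uint64_t>(3, 10) == 0 rather
+// than 2^64 - 7.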
+template <typename T>
+inline T ClampedSubtract(const T minuend, const T subtrahend) {
+ if (subtrahend > minuend) {
+ return 0;
+ }
+ return minuend - subtrahend;
+}
+
+} // namespace
+
+// Per-thread call graph (stack) and ZoneTotals for each zone.
+class Results {
+ public:
+ Results() {
+ // Zero-initialize all accumulators (avoids a check for num_zones_ == 0).
+ memset(zones_, 0, sizeof(zones_));
+ }
+
+ // Used for computing overhead when this thread encounters its first Zone.
+ // This has no observable effect apart from increasing "analyze_elapsed_".
+ uint64_t ZoneDuration(const Packet* packets) {
+ HWY_ASSERT(depth_ == 0);
+ HWY_ASSERT(num_zones_ == 0);
+ AnalyzePackets(packets, 2);
+ const uint64_t duration = zones_[0].total_duration;
+ zones_[0].num_calls = 0;
+ zones_[0].total_duration = 0;
+ HWY_ASSERT(depth_ == 0);
+ num_zones_ = 0;
+ return duration;
+ }
+
+ void SetSelfOverhead(const uint64_t self_overhead) {
+ self_overhead_ = self_overhead;
+ }
+
+ void SetChildOverhead(const uint64_t child_overhead) {
+ child_overhead_ = child_overhead;
+ }
+
+ // Draw all required information from the packets, which can be discarded
+ // afterwards. Called whenever this thread's storage is full.
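+  // For a zone spanning [entry, exit], the self time credited to it is
+  //   exit - entry - self_overhead_ - child_overhead_ - child_total
+  // (clamped at zero), and the full duration plus child_overhead_ is
+  // charged to the parent's child_total.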
+ void AnalyzePackets(const Packet* HWY_RESTRICT packets,
+ const size_t num_packets) {
+ // Ensures prior weakly-ordered streaming stores are globally visible.
+ hwy::FlushStream();
+
+ const uint64_t t0 = TicksBefore();
+
+ for (size_t i = 0; i < num_packets; ++i) {
+ const uint64_t timestamp = packets[i].timestamp;
+ // Entering a zone
+ if (packets[i].name != nullptr) {
+ HWY_ASSERT(depth_ < kMaxDepth);
+ zone_stack_[depth_].name = packets[i].name;
+ zone_stack_[depth_].entry_timestamp = timestamp;
+ zone_stack_[depth_].child_total = 0;
+ ++depth_;
+ continue;
+ }
+
+ HWY_ASSERT(depth_ != 0);
+ const ActiveZone& active = zone_stack_[depth_ - 1];
+ const uint64_t duration = timestamp - active.entry_timestamp;
+ const uint64_t self_duration = ClampedSubtract(
+ duration, self_overhead_ + child_overhead_ + active.child_total);
+
+ UpdateOrAdd(active.name, 1, self_duration);
+ --depth_;
+
+ // "Deduct" the nested time from its parent's self_duration.
+ if (depth_ != 0) {
+ zone_stack_[depth_ - 1].child_total += duration + child_overhead_;
+ }
+ }
+
+ const uint64_t t1 = TicksAfter();
+ analyze_elapsed_ += t1 - t0;
+ }
+
+ // Incorporates results from another thread. Call after all threads have
+ // exited any zones.
+ void Assimilate(const Results& other) {
+ const uint64_t t0 = TicksBefore();
+ HWY_ASSERT(depth_ == 0);
+ HWY_ASSERT(other.depth_ == 0);
+
+ for (size_t i = 0; i < other.num_zones_; ++i) {
+ const ZoneTotals& zone = other.zones_[i];
+ UpdateOrAdd(zone.name, zone.num_calls, zone.total_duration);
+ }
+ const uint64_t t1 = TicksAfter();
+ analyze_elapsed_ += t1 - t0 + other.analyze_elapsed_;
+ }
+
+ // Single-threaded.
+ void Print() {
+ const uint64_t t0 = TicksBefore();
+ MergeDuplicates();
+
+ // Sort by decreasing total (self) cost.
+ std::sort(zones_, zones_ + num_zones_,
+ [](const ZoneTotals& r1, const ZoneTotals& r2) {
+ return r1.total_duration > r2.total_duration;
+ });
+
+ uint64_t total_visible_duration = 0;
+ for (size_t i = 0; i < num_zones_; ++i) {
+ const ZoneTotals& r = zones_[i];
+ if (r.name[0] != '@') {
+ total_visible_duration += r.total_duration;
+ printf("%-40s: %10" PRIu64 " x %15" PRIu64 "= %15" PRIu64 "\n", r.name,
+ r.num_calls, r.total_duration / r.num_calls, r.total_duration);
+ }
+ }
+
+ const uint64_t t1 = TicksAfter();
+ analyze_elapsed_ += t1 - t0;
+ printf("Total clocks during analysis: %" PRIu64 "\n", analyze_elapsed_);
+ printf("Total clocks measured: %" PRIu64 "\n", total_visible_duration);
+ }
+
+ // Single-threaded. Clears all results as if no zones had been recorded.
+ void Reset() {
+ analyze_elapsed_ = 0;
+ HWY_ASSERT(depth_ == 0);
+ num_zones_ = 0;
+ memset(zone_stack_, 0, sizeof(zone_stack_));
+ memset(zones_, 0, sizeof(zones_));
+ }
+
+ private:
+ // Updates ZoneTotals of the same name, or inserts a new one if this thread
+ // has not yet seen that name. Uses a self-organizing list data structure,
+ // which avoids dynamic memory allocations and is faster than unordered_map.
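+  // Example (illustrative): with zones_ == [A, B, C], a lookup that hits C
+  // swaps it one position forward, yielding [A, C, B]; frequently-seen
+  // names thus migrate toward the front, where the linear search begins.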
+ void UpdateOrAdd(const char* name, const uint64_t num_calls,
+ const uint64_t duration) {
+ // Special case for first zone: (maybe) update, without swapping.
+ if (zones_[0].name == name) {
+ zones_[0].total_duration += duration;
+ zones_[0].num_calls += num_calls;
+ return;
+ }
+
+ // Look for a zone with the same name.
+ for (size_t i = 1; i < num_zones_; ++i) {
+ if (zones_[i].name == name) {
+ zones_[i].total_duration += duration;
+ zones_[i].num_calls += num_calls;
+ // Swap with predecessor (more conservative than move to front,
+ // but at least as successful).
+ std::swap(zones_[i - 1], zones_[i]);
+ return;
+ }
+ }
+
+ // Not found; create a new ZoneTotals.
+ HWY_ASSERT(num_zones_ < kMaxZones);
+ ZoneTotals* HWY_RESTRICT zone = zones_ + num_zones_;
+ zone->name = name;
+ zone->num_calls = num_calls;
+ zone->total_duration = duration;
+ ++num_zones_;
+ }
+
+ // Each instantiation of a function template seems to get its own copy of
+ // __func__ and GCC doesn't merge them. An N^2 search for duplicates is
+ // acceptable because we only expect a few dozen zones.
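+  // Example (illustrative): [{Foo, 2 calls, 100 ticks}, {Bar, 1, 50},
+  // {Foo, 3, 200}] merges to [{Foo, 5, 300}, {Bar, 1, 50}]; the hole left
+  // by a removed duplicate is filled with the last element, so ordering is
+  // not preserved.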
+ void MergeDuplicates() {
+ for (size_t i = 0; i < num_zones_; ++i) {
+ // Add any subsequent duplicates to num_calls and total_duration.
+ for (size_t j = i + 1; j < num_zones_;) {
+ if (!strcmp(zones_[i].name, zones_[j].name)) {
+ zones_[i].num_calls += zones_[j].num_calls;
+ zones_[i].total_duration += zones_[j].total_duration;
+ // Fill hole with last item.
+ zones_[j] = zones_[--num_zones_];
+ } else { // Name differed, try next ZoneTotals.
+ ++j;
+ }
+ }
+ }
+ }
+
+ uint64_t analyze_elapsed_ = 0;
+ uint64_t self_overhead_ = 0;
+ uint64_t child_overhead_ = 0;
+
+ size_t depth_ = 0; // Number of active zones <= kMaxDepth.
+ size_t num_zones_ = 0; // Number of unique zones <= kMaxZones.
+
+ // After other members to avoid large pointer offsets.
+ alignas(64) ActiveZone zone_stack_[kMaxDepth]; // Last = newest
+ alignas(64) ZoneTotals zones_[kMaxZones]; // Self-organizing list
+};
+
+ThreadSpecific::ThreadSpecific()
+    : max_packets_(PROFILER_THREAD_STORAGE << 16), // MiB to packets: 2^20 / sizeof(Packet) == 2^16
+ packets_(hwy::AllocateAligned<Packet>(max_packets_)),
+ num_packets_(0),
+ results_(hwy::MakeUniqueAligned<Results>()) {}
+
+ThreadSpecific::~ThreadSpecific() {}
+
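+// Data flow (editorial summary): zone entry/exit appends packets to the
+// small cache-line-sized buffer_; when it fills, FlushBuffer() streams the
+// whole line into the large packets_ array, and only once packets_ is
+// nearly full does the expensive AnalyzePackets() pass run.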
+void ThreadSpecific::FlushBuffer() {
+ if (num_packets_ + kBufferCapacity > max_packets_) {
+ results_->AnalyzePackets(packets_.get(), num_packets_);
+ num_packets_ = 0;
+ }
+ // This buffering halves observer overhead and decreases the overall
+ // runtime by about 3%.
+ HWY_DYNAMIC_DISPATCH(StreamCacheLine)
+ (buffer_, packets_.get() + num_packets_);
+ num_packets_ += kBufferCapacity;
+ buffer_size_ = 0;
+}
+
+void ThreadSpecific::AnalyzeRemainingPackets() {
+ // Storage full => empty it.
+ if (num_packets_ + buffer_size_ > max_packets_) {
+ results_->AnalyzePackets(packets_.get(), num_packets_);
+ num_packets_ = 0;
+ }
+
+  // Move remaining buffered packets to storage.
+ memcpy(packets_.get() + num_packets_, buffer_, buffer_size_ * sizeof(Packet));
+ num_packets_ += buffer_size_;
+ buffer_size_ = 0;
+
+ results_->AnalyzePackets(packets_.get(), num_packets_);
+ num_packets_ = 0;
+}
+
+namespace {
+
+class HalfSampleMode {
+ public:
+ // Returns mode. "sorted" must be in ascending order.
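+  // Example (illustrative): for sorted = {1, 2, 2, 2, 3, 9} the densest
+  // half-interval is the run of 2s, so the estimator returns 2, whereas
+  // the mean (~3.2) would be dragged upward by the outlier 9.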
+ template <typename T>
+ T operator()(const T* const HWY_RESTRICT sorted,
+ const size_t num_values) const {
+ int64_t center = num_values / 2;
+ int64_t width = num_values;
+
+ // Zoom in on modal intervals of decreasing width. Stop before we reach
+ // width=1, i.e. single values, for which there is no "slope".
+ while (width > 2) {
+ // Round up so we can still reach the outer edges of odd widths.
+ width = (width + 1) / 2;
+
+ center = CenterOfIntervalWithMinSlope(sorted, num_values, center, width);
+ }
+
+ return sorted[center]; // mode := middle value in modal interval.
+ }
+
+ private:
+ // Returns center of the densest region [c-radius, c+radius].
+ template <typename T>
+ static HWY_INLINE int64_t CenterOfIntervalWithMinSlope(
+ const T* HWY_RESTRICT sorted, const int64_t total_values,
+ const int64_t center, const int64_t width) {
+ const int64_t radius = (width + 1) / 2;
+
+ auto compute_slope = [radius, total_values, sorted](
+ int64_t c, int64_t* actual_center = nullptr) {
+ // For symmetry, check 2*radius+1 values, i.e. [min, max].
+ const int64_t min = std::max(c - radius, int64_t(0));
+ const int64_t max = std::min(c + radius, total_values - 1);
+ HWY_ASSERT(min < max);
+ HWY_ASSERT(sorted[min] <=
+ sorted[max] + std::numeric_limits<float>::epsilon());
+ const float dx = max - min + 1;
+ const float slope = (sorted[max] - sorted[min]) / dx;
+
+ if (actual_center != nullptr) {
+ // c may be out of bounds, so return center of the clamped bounds.
+ *actual_center = (min + max + 1) / 2;
+ }
+ return slope;
+ };
+
+ // First find min_slope for all centers.
+ float min_slope = std::numeric_limits<float>::max();
+ for (int64_t c = center - radius; c <= center + radius; ++c) {
+ min_slope = std::min(min_slope, compute_slope(c));
+ }
+
+ // Candidates := centers with slope ~= min_slope.
+ std::vector<int64_t> candidates;
+ for (int64_t c = center - radius; c <= center + radius; ++c) {
+ int64_t actual_center;
+ const float slope = compute_slope(c, &actual_center);
+ if (slope <= min_slope * 1.001f) {
+ candidates.push_back(actual_center);
+ }
+ }
+
+ // Keep the median.
+ HWY_ASSERT(!candidates.empty());
+ if (candidates.size() == 1) return candidates[0];
+ std::nth_element(candidates.begin(),
+ candidates.begin() + candidates.size() / 2,
+ candidates.end());
+ return candidates[candidates.size() / 2];
+ }
+};
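+
+// Rationale (editorial): timing noise from interrupts and frequency
+// transitions only ever inflates samples, so the distribution has a heavy
+// upper tail; the mode of the densest cluster tracks the true cost better
+// than the mean or median would.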
+
+} // namespace
+
+void ThreadSpecific::ComputeOverhead() {
+ // Delay after capturing timestamps before/after the actual zone runs. Even
+ // with frequency throttling disabled, this has a multimodal distribution,
+ // including 32, 34, 48, 52, 59, 62.
+ uint64_t self_overhead;
+ {
+ const size_t kNumSamples = 32;
+ uint32_t samples[kNumSamples];
+ for (size_t idx_sample = 0; idx_sample < kNumSamples; ++idx_sample) {
+ const size_t kNumDurations = 1024;
+ uint32_t durations[kNumDurations];
+
+ for (size_t idx_duration = 0; idx_duration < kNumDurations;
+ ++idx_duration) {
+      { // Scope ensures the zone exits immediately, emitting its packets.
+ PROFILER_ZONE("Dummy Zone (never shown)");
+ }
+ const uint64_t duration = results_->ZoneDuration(buffer_);
+ buffer_size_ = 0;
+ durations[idx_duration] = static_cast<uint32_t>(duration);
+ HWY_ASSERT(num_packets_ == 0);
+ }
+ std::sort(durations, durations + kNumDurations);
+ samples[idx_sample] = HalfSampleMode()(durations, kNumDurations);
+ }
+ // Median.
+ std::sort(samples, samples + kNumSamples);
+ self_overhead = samples[kNumSamples / 2];
+#if PROFILER_PRINT_OVERHEAD
+ printf("Overhead: %" PRIu64 "\n", static_cast<uint64_t>(self_overhead));
+#endif
+ results_->SetSelfOverhead(self_overhead);
+ }
+
+ // Delay before capturing start timestamp / after end timestamp.
+ const size_t kNumSamples = 32;
+ uint32_t samples[kNumSamples];
+ for (size_t idx_sample = 0; idx_sample < kNumSamples; ++idx_sample) {
+ const size_t kNumDurations = 16;
+ uint32_t durations[kNumDurations];
+ for (size_t idx_duration = 0; idx_duration < kNumDurations;
+ ++idx_duration) {
+ const size_t kReps = 10000;
+ // Analysis time should not be included => must fit within buffer.
+ HWY_ASSERT(kReps * 2 < max_packets_);
+ hwy::FlushStream();
+ const uint64_t t0 = TicksBefore();
+ for (size_t i = 0; i < kReps; ++i) {
+ PROFILER_ZONE("Dummy");
+ }
+ hwy::FlushStream();
+ const uint64_t t1 = TicksAfter();
+ HWY_ASSERT(num_packets_ + buffer_size_ == kReps * 2);
+ buffer_size_ = 0;
+ num_packets_ = 0;
+ const uint64_t avg_duration = (t1 - t0 + kReps / 2) / kReps;
+ durations[idx_duration] =
+ static_cast<uint32_t>(ClampedSubtract(avg_duration, self_overhead));
+ }
+ std::sort(durations, durations + kNumDurations);
+ samples[idx_sample] = HalfSampleMode()(durations, kNumDurations);
+ }
+ std::sort(samples, samples + kNumSamples);
+ const uint64_t child_overhead = samples[9 * kNumSamples / 10];
+#if PROFILER_PRINT_OVERHEAD
+ printf("Child overhead: %" PRIu64 "\n",
+ static_cast<uint64_t>(child_overhead));
+#endif
+ results_->SetChildOverhead(child_overhead);
+}
+
+namespace {
+
+// Could be a static member of Zone, but that would expose <atomic> in header.
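+// (C++11 guarantees thread-safe one-time initialization of the function-
+// local static, so concurrent first callers are safe.)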
+std::atomic<ThreadSpecific*>& GetHead() {
+ static std::atomic<ThreadSpecific*> head_{nullptr}; // Owning
+ return head_;
+}
+
+} // namespace
+
+// Thread-safe.
+ThreadSpecific* Zone::InitThreadSpecific() {
+ ThreadSpecific* thread_specific =
+ hwy::MakeUniqueAligned<ThreadSpecific>().release();
+
+ // Insert into unordered list
+ std::atomic<ThreadSpecific*>& head = GetHead();
+ ThreadSpecific* old_head = head.load(std::memory_order_relaxed);
+ thread_specific->SetNext(old_head);
+ while (!head.compare_exchange_weak(old_head, thread_specific,
+ std::memory_order_release,
+ std::memory_order_relaxed)) {
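+    // On failure, compare_exchange_weak reloaded old_head, so re-link
+    // before retrying.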
+ thread_specific->SetNext(old_head);
+ // TODO(janwas): pause
+ }
+
+ // ComputeOverhead also creates a Zone, so this needs to be set before that
+ // to prevent infinite recursion.
+ GetThreadSpecific() = thread_specific;
+
+ thread_specific->ComputeOverhead();
+ return thread_specific;
+}
+
+// Single-threaded.
+/*static*/ void Zone::PrintResults() {
+ ThreadSpecific* head = GetHead().load(std::memory_order_relaxed);
+ ThreadSpecific* p = head;
+ while (p) {
+ p->AnalyzeRemainingPackets();
+
+    // Combine all threads' results into a single Results object.
+ if (p != head) {
+ head->GetResults().Assimilate(p->GetResults());
+ p->GetResults().Reset();
+ }
+
+ p = p->GetNext();
+ }
+
+ if (head != nullptr) {
+ head->GetResults().Print();
+ head->GetResults().Reset();
+ }
+}
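+
+// Usage sketch (illustrative; Encode() is a hypothetical client): wrap
+// timed code in PROFILER_ZONE (as used above) and call PrintResults() from
+// a single thread once all workers have finished, e.g.
+//
+//   { PROFILER_ZONE("Encode"); Encode(); }
+//   profiler::Zone::PrintResults();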
+
+} // namespace profiler
+} // namespace jxl
+
+#endif // HWY_ONCE
+#endif // JXL_PROFILER_ENABLED