path: root/third_party/jpeg-xl/lib/jxl/base/profiler.cc
blob: a38d9b82b7fa046a2ceaa68887aa54fba941d2df
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "lib/jxl/base/profiler.h"

#if JXL_PROFILER_ENABLED

#include <stdio.h>
#include <stdlib.h>
#include <string.h>  // memcpy

#include <algorithm>  // sort
#include <atomic>
#include <cinttypes>  // PRIu64
#include <hwy/cache_control.h>
#include <limits>
#include <new>
#include <vector>  // std::vector in HalfSampleMode

// Optionally use SIMD in StreamCacheLine if available.
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "lib/jxl/base/profiler.cc"
#include <hwy/foreach_target.h>
#include <hwy/highway.h>

HWY_BEFORE_NAMESPACE();
namespace jxl {
namespace profiler {
namespace HWY_NAMESPACE {

// Overwrites `to` without loading it into cache (read-for-ownership).
// Copies 64 bytes from/to naturally aligned addresses.
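// Note: Stream() issues non-temporal stores where the target supports them,
// so appending packets avoids evicting the profiled code's data from cache.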
void StreamCacheLine(const Packet* HWY_RESTRICT from, Packet* HWY_RESTRICT to) {
#if HWY_TARGET == HWY_SCALAR
  hwy::CopyBytes<64>(from, to);
#else
  const HWY_CAPPED(uint64_t, 2) d;
  HWY_FENCE;
  const uint64_t* HWY_RESTRICT from64 = reinterpret_cast<const uint64_t*>(from);
  const auto v0 = Load(d, from64 + 0);
  const auto v1 = Load(d, from64 + 2);
  const auto v2 = Load(d, from64 + 4);
  const auto v3 = Load(d, from64 + 6);
  // Fences prevent the compiler from reordering loads/stores, which may
  // interfere with write-combining.
  HWY_FENCE;
  uint64_t* HWY_RESTRICT to64 = reinterpret_cast<uint64_t*>(to);
  Stream(v0, d, to64 + 0);
  Stream(v1, d, to64 + 2);
  Stream(v2, d, to64 + 4);
  Stream(v3, d, to64 + 6);
  HWY_FENCE;
#endif
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace profiler
}  // namespace jxl
HWY_AFTER_NAMESPACE();

#if HWY_ONCE
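// foreach_target.h recompiles the section above once per SIMD target; this
// HWY_ONCE block is compiled only on the final pass, so everything in it is
// defined exactly once per binary.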
namespace jxl {
namespace profiler {

HWY_EXPORT(StreamCacheLine);

namespace {

// How many mebibytes to allocate (if JXL_PROFILER_ENABLED) per thread that
// enters at least one zone. Once this buffer is full, the thread will analyze
// packets (two per zone), which introduces observer overhead.
#ifndef PROFILER_THREAD_STORAGE
#define PROFILER_THREAD_STORAGE 32ULL
#endif
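// The default can be overridden at build time, e.g. with
// -DPROFILER_THREAD_STORAGE=64 to trade memory for fewer analysis pauses.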

#define PROFILER_PRINT_OVERHEAD 0

// Upper bounds for fixed-size data structures (guarded via HWY_ASSERT):
constexpr size_t kMaxDepth = 64;   // Maximum nesting of zones.
constexpr size_t kMaxZones = 256;  // Total number of zones.

// Stack of active (entered but not exited) zones. POD, uninitialized.
// Used to deduct child duration from the parent's self time.
struct ActiveZone {
  const char* name;
  uint64_t entry_timestamp;
  uint64_t child_total;
};

// Totals for all Zones with the same name. POD, must be zero-initialized.
struct ZoneTotals {
  uint64_t total_duration;
  const char* name;
  uint64_t num_calls;
};

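// Saturating subtraction: returns 0 instead of wrapping when the (unsigned)
// subtrahend exceeds the minuend, e.g. when the measured overhead is larger
// than a very short zone's duration.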
template <typename T>
inline T ClampedSubtract(const T minuend, const T subtrahend) {
  if (subtrahend > minuend) {
    return 0;
  }
  return minuend - subtrahend;
}

}  // namespace

// Per-thread call graph (stack) and ZoneTotals for each zone.
class Results {
 public:
  Results() {
    // Zero-initialize all accumulators (avoids a check for num_zones_ == 0).
    memset(zones_, 0, sizeof(zones_));
  }

  // Used for computing overhead when this thread encounters its first Zone.
  // This has no observable effect apart from increasing "analyze_elapsed_".
  uint64_t ZoneDuration(const Packet* packets) {
    HWY_ASSERT(depth_ == 0);
    HWY_ASSERT(num_zones_ == 0);
    AnalyzePackets(packets, 2);
    const uint64_t duration = zones_[0].total_duration;
    zones_[0].num_calls = 0;
    zones_[0].total_duration = 0;
    HWY_ASSERT(depth_ == 0);
    num_zones_ = 0;
    return duration;
  }

  void SetSelfOverhead(const uint64_t self_overhead) {
    self_overhead_ = self_overhead;
  }

  void SetChildOverhead(const uint64_t child_overhead) {
    child_overhead_ = child_overhead;
  }

  // Draw all required information from the packets, which can be discarded
  // afterwards. Called whenever this thread's storage is full.
  void AnalyzePackets(const Packet* HWY_RESTRICT packets,
                      const size_t num_packets) {
    // Ensures prior weakly-ordered streaming stores are globally visible.
    hwy::FlushStream();

    const uint64_t t0 = TicksBefore();

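    // Packets arrive in pairs per zone: an entry packet carrying the zone
    // name, then an exit packet whose name is nullptr (timestamp only).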
    for (size_t i = 0; i < num_packets; ++i) {
      const uint64_t timestamp = packets[i].timestamp;
      // Entering a zone
      if (packets[i].name != nullptr) {
        HWY_ASSERT(depth_ < kMaxDepth);
        zone_stack_[depth_].name = packets[i].name;
        zone_stack_[depth_].entry_timestamp = timestamp;
        zone_stack_[depth_].child_total = 0;
        ++depth_;
        continue;
      }

      HWY_ASSERT(depth_ != 0);
      const ActiveZone& active = zone_stack_[depth_ - 1];
      const uint64_t duration = timestamp - active.entry_timestamp;
      const uint64_t self_duration = ClampedSubtract(
          duration, self_overhead_ + child_overhead_ + active.child_total);

      UpdateOrAdd(active.name, 1, self_duration);
      --depth_;

      // "Deduct" the nested time from its parent's self_duration.
      if (depth_ != 0) {
        zone_stack_[depth_ - 1].child_total += duration + child_overhead_;
      }
    }

    const uint64_t t1 = TicksAfter();
    analyze_elapsed_ += t1 - t0;
  }

  // Incorporates results from another thread. Call after all threads have
  // exited any zones.
  void Assimilate(const Results& other) {
    const uint64_t t0 = TicksBefore();
    HWY_ASSERT(depth_ == 0);
    HWY_ASSERT(other.depth_ == 0);

    for (size_t i = 0; i < other.num_zones_; ++i) {
      const ZoneTotals& zone = other.zones_[i];
      UpdateOrAdd(zone.name, zone.num_calls, zone.total_duration);
    }
    const uint64_t t1 = TicksAfter();
    analyze_elapsed_ += t1 - t0 + other.analyze_elapsed_;
  }

  // Single-threaded.
  void Print() {
    const uint64_t t0 = TicksBefore();
    MergeDuplicates();

    // Sort by decreasing total (self) cost.
    std::sort(zones_, zones_ + num_zones_,
              [](const ZoneTotals& r1, const ZoneTotals& r2) {
                return r1.total_duration > r2.total_duration;
              });

    uint64_t total_visible_duration = 0;
    for (size_t i = 0; i < num_zones_; ++i) {
      const ZoneTotals& r = zones_[i];
      if (r.name[0] != '@') {
        total_visible_duration += r.total_duration;
        printf("%-40s: %10" PRIu64 " x %15" PRIu64 "= %15" PRIu64 "\n", r.name,
               r.num_calls, r.total_duration / r.num_calls, r.total_duration);
      }
    }

    const uint64_t t1 = TicksAfter();
    analyze_elapsed_ += t1 - t0;
    printf("Total clocks during analysis: %" PRIu64 "\n", analyze_elapsed_);
    printf("Total clocks measured: %" PRIu64 "\n", total_visible_duration);
  }

  // Single-threaded. Clears all results as if no zones had been recorded.
  void Reset() {
    analyze_elapsed_ = 0;
    HWY_ASSERT(depth_ == 0);
    num_zones_ = 0;
    memset(zone_stack_, 0, sizeof(zone_stack_));
    memset(zones_, 0, sizeof(zones_));
  }

 private:
  // Updates ZoneTotals of the same name, or inserts a new one if this thread
  // has not yet seen that name. Uses a self-organizing list data structure,
  // which avoids dynamic memory allocations and is faster than unordered_map.
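  // `name` is compared by pointer, which suffices because zone names are
  // string literals; duplicated literals (e.g. per-instantiation __func__
  // copies) are later reconciled by MergeDuplicates() via strcmp.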
  void UpdateOrAdd(const char* name, const uint64_t num_calls,
                   const uint64_t duration) {
    // Special case for first zone: (maybe) update, without swapping.
    if (zones_[0].name == name) {
      zones_[0].total_duration += duration;
      zones_[0].num_calls += num_calls;
      return;
    }

    // Look for a zone with the same name.
    for (size_t i = 1; i < num_zones_; ++i) {
      if (zones_[i].name == name) {
        zones_[i].total_duration += duration;
        zones_[i].num_calls += num_calls;
        // Swap with predecessor (more conservative than move to front,
        // but at least as successful).
        std::swap(zones_[i - 1], zones_[i]);
        return;
      }
    }

    // Not found; create a new ZoneTotals.
    HWY_ASSERT(num_zones_ < kMaxZones);
    ZoneTotals* HWY_RESTRICT zone = zones_ + num_zones_;
    zone->name = name;
    zone->num_calls = num_calls;
    zone->total_duration = duration;
    ++num_zones_;
  }

  // Each instantiation of a function template seems to get its own copy of
  // __func__ and GCC doesn't merge them. An N^2 search for duplicates is
  // acceptable because we only expect a few dozen zones.
  void MergeDuplicates() {
    for (size_t i = 0; i < num_zones_; ++i) {
      // Add any subsequent duplicates to num_calls and total_duration.
      for (size_t j = i + 1; j < num_zones_;) {
        if (!strcmp(zones_[i].name, zones_[j].name)) {
          zones_[i].num_calls += zones_[j].num_calls;
          zones_[i].total_duration += zones_[j].total_duration;
          // Fill hole with last item.
          zones_[j] = zones_[--num_zones_];
        } else {  // Name differed, try next ZoneTotals.
          ++j;
        }
      }
    }
  }

  uint64_t analyze_elapsed_ = 0;
  uint64_t self_overhead_ = 0;
  uint64_t child_overhead_ = 0;

  size_t depth_ = 0;      // Number of active zones <= kMaxDepth.
  size_t num_zones_ = 0;  // Number of unique zones <= kMaxZones.

  // After other members to avoid large pointer offsets.
  alignas(64) ActiveZone zone_stack_[kMaxDepth];  // Last = newest
  alignas(64) ZoneTotals zones_[kMaxZones];       // Self-organizing list
};

ThreadSpecific::ThreadSpecific()
    // MiB to packet count: * (1 << 20) / sizeof(Packet); with 16-byte
    // packets this is a shift by 16.
    : max_packets_(PROFILER_THREAD_STORAGE << 16),
      packets_(hwy::AllocateAligned<Packet>(max_packets_)),
      num_packets_(0),
      results_(hwy::MakeUniqueAligned<Results>()) {}

ThreadSpecific::~ThreadSpecific() {}

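// Streams the cache line-sized `buffer_` into the main packet storage,
// analyzing and emptying the storage first if appending would overflow it.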
void ThreadSpecific::FlushBuffer() {
  if (num_packets_ + kBufferCapacity > max_packets_) {
    results_->AnalyzePackets(packets_.get(), num_packets_);
    num_packets_ = 0;
  }
  // This buffering halves observer overhead and decreases the overall
  // runtime by about 3%.
  HWY_DYNAMIC_DISPATCH(StreamCacheLine)
  (buffer_, packets_.get() + num_packets_);
  num_packets_ += kBufferCapacity;
  buffer_size_ = 0;
}

void ThreadSpecific::AnalyzeRemainingPackets() {
  // Storage full => empty it.
  if (num_packets_ + buffer_size_ > max_packets_) {
    results_->AnalyzePackets(packets_.get(), num_packets_);
    num_packets_ = 0;
  }

  // Move buffer to storage
  memcpy(packets_.get() + num_packets_, buffer_, buffer_size_ * sizeof(Packet));
  num_packets_ += buffer_size_;
  buffer_size_ = 0;

  results_->AnalyzePackets(packets_.get(), num_packets_);
  num_packets_ = 0;
}

namespace {

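// Robust mode estimator: repeatedly halve the search window, keeping the
// half whose values are densest (smallest slope). Unlike the mean or median,
// the result is largely insensitive to the long upper tail of timing noise.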
class HalfSampleMode {
 public:
  // Returns mode. "sorted" must be in ascending order.
  template <typename T>
  T operator()(const T* const HWY_RESTRICT sorted,
               const size_t num_values) const {
    int64_t center = num_values / 2;
    int64_t width = num_values;

    // Zoom in on modal intervals of decreasing width. Stop before we reach
    // width=1, i.e. single values, for which there is no "slope".
    while (width > 2) {
      // Round up so we can still reach the outer edges of odd widths.
      width = (width + 1) / 2;

      center = CenterOfIntervalWithMinSlope(sorted, num_values, center, width);
    }

    return sorted[center];  // mode := middle value in modal interval.
  }

 private:
  // Returns center of the densest region [c-radius, c+radius].
  template <typename T>
  static HWY_INLINE int64_t CenterOfIntervalWithMinSlope(
      const T* HWY_RESTRICT sorted, const int64_t total_values,
      const int64_t center, const int64_t width) {
    const int64_t radius = (width + 1) / 2;

    auto compute_slope = [radius, total_values, sorted](
                             int64_t c, int64_t* actual_center = nullptr) {
      // For symmetry, check 2*radius+1 values, i.e. [min, max].
      const int64_t min = std::max(c - radius, int64_t(0));
      const int64_t max = std::min(c + radius, total_values - 1);
      HWY_ASSERT(min < max);
      HWY_ASSERT(sorted[min] <=
                 sorted[max] + std::numeric_limits<float>::epsilon());
      const float dx = max - min + 1;
      const float slope = (sorted[max] - sorted[min]) / dx;

      if (actual_center != nullptr) {
        // c may be out of bounds, so return center of the clamped bounds.
        *actual_center = (min + max + 1) / 2;
      }
      return slope;
    };

    // First find min_slope for all centers.
    float min_slope = std::numeric_limits<float>::max();
    for (int64_t c = center - radius; c <= center + radius; ++c) {
      min_slope = std::min(min_slope, compute_slope(c));
    }

    // Candidates := centers with slope ~= min_slope.
    std::vector<int64_t> candidates;
    for (int64_t c = center - radius; c <= center + radius; ++c) {
      int64_t actual_center;
      const float slope = compute_slope(c, &actual_center);
      if (slope <= min_slope * 1.001f) {
        candidates.push_back(actual_center);
      }
    }

    // Keep the median.
    HWY_ASSERT(!candidates.empty());
    if (candidates.size() == 1) return candidates[0];
    std::nth_element(candidates.begin(),
                     candidates.begin() + candidates.size() / 2,
                     candidates.end());
    return candidates[candidates.size() / 2];
  }
};

}  // namespace

void ThreadSpecific::ComputeOverhead() {
  // Delay after capturing timestamps before/after the actual zone runs. Even
  // with frequency throttling disabled, this has a multimodal distribution,
  // including 32, 34, 48, 52, 59, 62.
  uint64_t self_overhead;
  {
    const size_t kNumSamples = 32;
    uint32_t samples[kNumSamples];
    for (size_t idx_sample = 0; idx_sample < kNumSamples; ++idx_sample) {
      const size_t kNumDurations = 1024;
      uint32_t durations[kNumDurations];

      for (size_t idx_duration = 0; idx_duration < kNumDurations;
           ++idx_duration) {
        {  //
          PROFILER_ZONE("Dummy Zone (never shown)");
        }
        const uint64_t duration = results_->ZoneDuration(buffer_);
        buffer_size_ = 0;
        durations[idx_duration] = static_cast<uint32_t>(duration);
        HWY_ASSERT(num_packets_ == 0);
      }
      std::sort(durations, durations + kNumDurations);
      samples[idx_sample] = HalfSampleMode()(durations, kNumDurations);
    }
    // Median.
    std::sort(samples, samples + kNumSamples);
    self_overhead = samples[kNumSamples / 2];
#if PROFILER_PRINT_OVERHEAD
    printf("Overhead: %" PRIu64 "\n", static_cast<uint64_t>(self_overhead));
#endif
    results_->SetSelfOverhead(self_overhead);
  }

  // Delay before capturing start timestamp / after end timestamp.
  const size_t kNumSamples = 32;
  uint32_t samples[kNumSamples];
  for (size_t idx_sample = 0; idx_sample < kNumSamples; ++idx_sample) {
    const size_t kNumDurations = 16;
    uint32_t durations[kNumDurations];
    for (size_t idx_duration = 0; idx_duration < kNumDurations;
         ++idx_duration) {
      const size_t kReps = 10000;
      // Analysis time should not be included => must fit within buffer.
      HWY_ASSERT(kReps * 2 < max_packets_);
      hwy::FlushStream();
      const uint64_t t0 = TicksBefore();
      for (size_t i = 0; i < kReps; ++i) {
        PROFILER_ZONE("Dummy");
      }
      hwy::FlushStream();
      const uint64_t t1 = TicksAfter();
      HWY_ASSERT(num_packets_ + buffer_size_ == kReps * 2);
      buffer_size_ = 0;
      num_packets_ = 0;
      const uint64_t avg_duration = (t1 - t0 + kReps / 2) / kReps;
      durations[idx_duration] =
          static_cast<uint32_t>(ClampedSubtract(avg_duration, self_overhead));
    }
    std::sort(durations, durations + kNumDurations);
    samples[idx_sample] = HalfSampleMode()(durations, kNumDurations);
  }
  std::sort(samples, samples + kNumSamples);
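  // Note: a high quantile rather than the median, presumably because
  // overestimating the child overhead merely clamps self-durations toward
  // zero, whereas underestimating it inflates every parent's self time.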
  const uint64_t child_overhead = samples[9 * kNumSamples / 10];
#if PROFILER_PRINT_OVERHEAD
  printf("Child overhead: %" PRIu64 "\n",
         static_cast<uint64_t>(child_overhead));
#endif
  results_->SetChildOverhead(child_overhead);
}

namespace {

// Could be a static member of Zone, but that would expose <atomic> in header.
std::atomic<ThreadSpecific*>& GetHead() {
  static std::atomic<ThreadSpecific*> head_{nullptr};  // Owning
  return head_;
}

}  // namespace

// Thread-safe.
ThreadSpecific* Zone::InitThreadSpecific() {
  ThreadSpecific* thread_specific =
      hwy::MakeUniqueAligned<ThreadSpecific>().release();

  // Insert into unordered list
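  // (lock-free push: on success, the release ordering publishes the fully
  // initialized node before other threads can observe the new head)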
  std::atomic<ThreadSpecific*>& head = GetHead();
  ThreadSpecific* old_head = head.load(std::memory_order_relaxed);
  thread_specific->SetNext(old_head);
  while (!head.compare_exchange_weak(old_head, thread_specific,
                                     std::memory_order_release,
                                     std::memory_order_relaxed)) {
    thread_specific->SetNext(old_head);
    // TODO(janwas): pause
  }

  // ComputeOverhead also creates a Zone, so this needs to be set before that
  // to prevent infinite recursion.
  GetThreadSpecific() = thread_specific;

  thread_specific->ComputeOverhead();
  return thread_specific;
}

// Single-threaded.
/*static*/ void Zone::PrintResults() {
  ThreadSpecific* head = GetHead().load(std::memory_order_relaxed);
  ThreadSpecific* p = head;
  while (p) {
    p->AnalyzeRemainingPackets();

    // Combine all threads into a single Result.
    if (p != head) {
      head->GetResults().Assimilate(p->GetResults());
      p->GetResults().Reset();
    }

    p = p->GetNext();
  }

  if (head != nullptr) {
    head->GetResults().Print();
    head->GetResults().Reset();
  }
}

}  // namespace profiler
}  // namespace jxl

#endif  // HWY_ONCE
#endif  // JXL_PROFILER_ENABLED