summaryrefslogtreecommitdiffstats
path: root/third_party/jpeg-xl/lib/jxl/base/profiler.h
blob: 4c0efa4b3a8928b44b083c8863a59e46763ba70f (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#ifndef LIB_JXL_BASE_PROFILER_H_
#define LIB_JXL_BASE_PROFILER_H_

// High precision, low overhead time measurements. Returns exact call counts and
// total elapsed time for user-defined 'zones' (code regions, i.e. C++ scopes).
//
// To use the profiler you must set the JPEGXL_ENABLE_PROFILER CMake flag, which
// defines JXL_PROFILER_ENABLED.
//
// Usage: instrument regions of interest: { PROFILER_ZONE("name"); /*code*/ } or
// void FuncToMeasure() { PROFILER_FUNC; /*code*/ }.
// After all threads have exited any zones, invoke PROFILER_PRINT_RESULTS() to
// print call counts and average durations [CPU cycles] to stdout, sorted in
// descending order of total duration.

// If zero, this file has no effect and no measurements will be recorded.
#ifndef JXL_PROFILER_ENABLED
#define JXL_PROFILER_ENABLED 0
#endif
#if JXL_PROFILER_ENABLED

#include <stddef.h>
#include <stdint.h>

#include <hwy/aligned_allocator.h>
#include <hwy/base.h>

#include "lib/jxl/base/tsc_timer.h"

#if HWY_COMPILER_MSVC
#define PROFILER_PUBLIC
#else
#define PROFILER_PUBLIC __attribute__((visibility("default")))
#endif

namespace jxl {
namespace profiler {

// Represents zone entry/exit events. POD.
// Packed to 1-byte alignment so that the size is exactly the sum of the
// declared fields; the static_assert after the struct pins it to 16 bytes.
#pragma pack(push, 1)
struct Packet {
  // Computing a hash or string table is likely too expensive, and offsets
  // from other libraries' string literals can be too large to combine them and
  // a full-resolution timestamp into 64 bits.
  uint64_t timestamp;  // Tick value passed to ThreadSpecific::Write.
  const char* name;  // nullptr for exit packets
#if UINTPTR_MAX <= 0xFFFFFFFFu
  // Pointers occupy at most 4 bytes on this target; pad so that
  // sizeof(Packet) is still 16.
  uint32_t padding;
#endif
};
#pragma pack(pop)
// Guards against targets where the fields above do not add up to 16 bytes.
static_assert(sizeof(Packet) == 16, "Wrong Packet size");

class Results;  // pImpl

// Per-thread packet storage, dynamically allocated and aligned.
class ThreadSpecific {
  static constexpr size_t kBufferCapacity = 64 / sizeof(Packet);

 public:
  PROFILER_PUBLIC explicit ThreadSpecific();
  PROFILER_PUBLIC ~ThreadSpecific();

  // Depends on Zone => defined out of line.
  PROFILER_PUBLIC void ComputeOverhead();

  HWY_INLINE void WriteEntry(const char* name) { Write(name, TicksBefore()); }
  HWY_INLINE void WriteExit() { Write(nullptr, TicksAfter()); }

  PROFILER_PUBLIC void AnalyzeRemainingPackets();

  // Accessors instead of public member for well-defined data layout.
  void SetNext(ThreadSpecific* next) { next_ = next; }
  ThreadSpecific* GetNext() const { return next_; }

  Results& GetResults() { return *results_; }

 private:
  PROFILER_PUBLIC void FlushBuffer();

  // Write packet to buffer/storage, emptying them as needed.
  void Write(const char* name, const uint64_t timestamp) {
    if (buffer_size_ == kBufferCapacity) {  // Full
      FlushBuffer();
    }
    buffer_[buffer_size_].name = name;
    buffer_[buffer_size_].timestamp = timestamp;
    ++buffer_size_;
  }

  // Write-combining buffer to avoid cache pollution. Must be the first
  // non-static member to ensure cache-line alignment.
  Packet buffer_[kBufferCapacity];
  size_t buffer_size_ = 0;

  // Contiguous storage for zone enter/exit packets.
  const size_t max_packets_;
  hwy::AlignedFreeUniquePtr<Packet[]> packets_;
  size_t num_packets_;

  // Linked list of all threads.
  ThreadSpecific* next_ = nullptr;  // Owned, never released.

  hwy::AlignedUniquePtr<Results> results_;
};

// RAII zone enter/exit recorder constructed by PROFILER_ZONE; also
// responsible for initializing ThreadSpecific.
//
// The constructor writes an entry packet and the destructor writes an exit
// packet to the calling thread's ThreadSpecific.
class Zone {
 public:
  // Records entry into the zone called `name`. Only the pointer is stored in
  // the packet, so `name` must outlive the recorded packet (PROFILER_ZONE
  // requires a string literal for this reason).
  HWY_NOINLINE explicit Zone(const char* name) {
    HWY_FENCE;
    ThreadSpecific* HWY_RESTRICT thread_specific = GetThreadSpecific();
    // No ThreadSpecific has been set up for this thread yet.
    if (HWY_UNLIKELY(thread_specific == nullptr)) {
      thread_specific = InitThreadSpecific();
    }

    thread_specific->WriteEntry(name);
  }

  // Records the zone exit. Dereferences the thread-local pointer without a
  // null check; NOTE(review): this presumably relies on InitThreadSpecific
  // (defined out of line) having stored its result there - confirm.
  HWY_NOINLINE ~Zone() { GetThreadSpecific()->WriteExit(); }

  // Not copyable: every Zone must contribute exactly one entry and one exit
  // packet. Destroying a copy would write an additional exit packet without a
  // matching entry.
  Zone(const Zone&) = delete;
  Zone& operator=(const Zone&) = delete;

  // Call exactly once after all threads have exited all zones.
  PROFILER_PUBLIC static void PrintResults();

 private:
  // Returns reference to the thread's ThreadSpecific pointer (initially null).
  // Function-local static avoids needing a separate definition.
  static ThreadSpecific*& GetThreadSpecific() {
    static thread_local ThreadSpecific* thread_specific;
    return thread_specific;
  }

  // Non time-critical. Returns the ThreadSpecific to use for the calling
  // thread; invoked only when GetThreadSpecific() is still null.
  PROFILER_PUBLIC ThreadSpecific* InitThreadSpecific();
};

// Creates a zone starting from here until the end of the current scope.
// Timestamps will be recorded when entering and exiting the zone.
// To ensure the name pointer remains valid, we require it to be a string
// literal (by merging with ""). We also compare strings by address.
//
// The expansion declares a local variable named `zone`, so a given scope can
// hold only one PROFILER_ZONE/PROFILER_FUNC. It also consists of several
// statements; do not use it as the unbraced body of an if/for/while.
#define PROFILER_ZONE(name)                  \
  HWY_FENCE;                                 \
  const ::jxl::profiler::Zone zone("" name); \
  HWY_FENCE

// Creates a zone for an entire function (when placed at its beginning).
// Shorter/more convenient than PROFILER_ZONE: the zone name is taken from
// __func__ instead of being passed explicitly.
//
// The expansion declares a local variable named `zone`, so it cannot share a
// scope with another PROFILER_FUNC/PROFILER_ZONE.
#define PROFILER_FUNC                         \
  HWY_FENCE;                                  \
  const ::jxl::profiler::Zone zone(__func__); \
  HWY_FENCE

// Expands to the name of Zone::PrintResults; the call site supplies the
// parentheses, i.e. write PROFILER_PRINT_RESULTS(). Zone::PrintResults must be
// called exactly once, after all threads have exited all zones.
#define PROFILER_PRINT_RESULTS ::jxl::profiler::Zone::PrintResults

}  // namespace profiler
}  // namespace jxl

#else  // !JXL_PROFILER_ENABLED
// Profiler disabled: all three macros expand to nothing, so instrumented code
// compiles unchanged and records no measurements.
#define PROFILER_ZONE(name)
#define PROFILER_FUNC
#define PROFILER_PRINT_RESULTS()
#endif

#endif  // LIB_JXL_BASE_PROFILER_H_