Diffstat (limited to 'third_party/jpeg-xl/lib/jxl/base')
-rw-r--r--  third_party/jpeg-xl/lib/jxl/base/arch_macros.h  33
-rw-r--r--  third_party/jpeg-xl/lib/jxl/base/bits.h  147
-rw-r--r--  third_party/jpeg-xl/lib/jxl/base/byte_order.h  274
-rw-r--r--  third_party/jpeg-xl/lib/jxl/base/cache_aligned.cc  157
-rw-r--r--  third_party/jpeg-xl/lib/jxl/base/cache_aligned.h  74
-rw-r--r--  third_party/jpeg-xl/lib/jxl/base/compiler_specific.h  157
-rw-r--r--  third_party/jpeg-xl/lib/jxl/base/data_parallel.cc  23
-rw-r--r--  third_party/jpeg-xl/lib/jxl/base/data_parallel.h  120
-rw-r--r--  third_party/jpeg-xl/lib/jxl/base/file_io.h  153
-rw-r--r--  third_party/jpeg-xl/lib/jxl/base/float.h  98
-rw-r--r--  third_party/jpeg-xl/lib/jxl/base/iaca.h  65
-rw-r--r--  third_party/jpeg-xl/lib/jxl/base/os_macros.h  50
-rw-r--r--  third_party/jpeg-xl/lib/jxl/base/override.h  29
-rw-r--r--  third_party/jpeg-xl/lib/jxl/base/padded_bytes.cc  63
-rw-r--r--  third_party/jpeg-xl/lib/jxl/base/padded_bytes.h  197
-rw-r--r--  third_party/jpeg-xl/lib/jxl/base/printf_macros.h  34
-rw-r--r--  third_party/jpeg-xl/lib/jxl/base/profiler.cc  540
-rw-r--r--  third_party/jpeg-xl/lib/jxl/base/profiler.h  170
-rw-r--r--  third_party/jpeg-xl/lib/jxl/base/random.cc  21
-rw-r--r--  third_party/jpeg-xl/lib/jxl/base/random.h  95
-rw-r--r--  third_party/jpeg-xl/lib/jxl/base/sanitizer_definitions.h  44
-rw-r--r--  third_party/jpeg-xl/lib/jxl/base/scope_guard.h  48
-rw-r--r--  third_party/jpeg-xl/lib/jxl/base/span.h  60
-rw-r--r--  third_party/jpeg-xl/lib/jxl/base/status.h  326
-rw-r--r--  third_party/jpeg-xl/lib/jxl/base/tsc_timer.h  172
25 files changed, 3150 insertions, 0 deletions
diff --git a/third_party/jpeg-xl/lib/jxl/base/arch_macros.h b/third_party/jpeg-xl/lib/jxl/base/arch_macros.h
new file mode 100644
index 0000000000..a98301915e
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/base/arch_macros.h
@@ -0,0 +1,33 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_BASE_ARCH_MACROS_H_
+#define LIB_JXL_BASE_ARCH_MACROS_H_
+
+// Defines the JXL_ARCH_* macros.
+
+namespace jxl {
+
+#if defined(__x86_64__) || defined(_M_X64)
+#define JXL_ARCH_X64 1
+#else
+#define JXL_ARCH_X64 0
+#endif
+
+#if defined(__powerpc64__) || defined(_M_PPC)
+#define JXL_ARCH_PPC 1
+#else
+#define JXL_ARCH_PPC 0
+#endif
+
+#if defined(__aarch64__) || defined(__arm__)
+#define JXL_ARCH_ARM 1
+#else
+#define JXL_ARCH_ARM 0
+#endif
+
+} // namespace jxl
+
+#endif // LIB_JXL_BASE_ARCH_MACROS_H_
diff --git a/third_party/jpeg-xl/lib/jxl/base/bits.h b/third_party/jpeg-xl/lib/jxl/base/bits.h
new file mode 100644
index 0000000000..9f86118e72
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/base/bits.h
@@ -0,0 +1,147 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_BASE_BITS_H_
+#define LIB_JXL_BASE_BITS_H_
+
+// Specialized instructions for processing register-sized bit arrays.
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/status.h"
+
+#if JXL_COMPILER_MSVC
+#include <intrin.h>
+#endif
+
+#include <stddef.h>
+#include <stdint.h>
+
+namespace jxl {
+
+// Empty struct used as a size tag type.
+template <size_t N>
+struct SizeTag {};
+
+template <typename T>
+constexpr bool IsSigned() {
+ return T(0) > T(-1);
+}
+
+// Undefined results for x == 0.
+static JXL_INLINE JXL_MAYBE_UNUSED size_t
+Num0BitsAboveMS1Bit_Nonzero(SizeTag<4> /* tag */, const uint32_t x) {
+ JXL_DASSERT(x != 0);
+#if JXL_COMPILER_MSVC
+ unsigned long index;
+ _BitScanReverse(&index, x);
+ return 31 - index;
+#else
+ return static_cast<size_t>(__builtin_clz(x));
+#endif
+}
+static JXL_INLINE JXL_MAYBE_UNUSED size_t
+Num0BitsAboveMS1Bit_Nonzero(SizeTag<8> /* tag */, const uint64_t x) {
+ JXL_DASSERT(x != 0);
+#if JXL_COMPILER_MSVC
+#if JXL_ARCH_X64
+ unsigned long index;
+ _BitScanReverse64(&index, x);
+ return 63 - index;
+#else // JXL_ARCH_X64
+ // _BitScanReverse64 not available
+ uint32_t msb = static_cast<uint32_t>(x >> 32u);
+ unsigned long index;
+ if (msb == 0) {
+ uint32_t lsb = static_cast<uint32_t>(x & 0xFFFFFFFF);
+ _BitScanReverse(&index, lsb);
+ return 63 - index;
+ } else {
+ _BitScanReverse(&index, msb);
+ return 31 - index;
+ }
+#endif // JXL_ARCH_X64
+#else
+ return static_cast<size_t>(__builtin_clzll(x));
+#endif
+}
+template <typename T>
+static JXL_INLINE JXL_MAYBE_UNUSED size_t
+Num0BitsAboveMS1Bit_Nonzero(const T x) {
+ static_assert(!IsSigned<T>(), "Num0BitsAboveMS1Bit_Nonzero: use unsigned");
+ return Num0BitsAboveMS1Bit_Nonzero(SizeTag<sizeof(T)>(), x);
+}
+
+// Undefined results for x == 0.
+static JXL_INLINE JXL_MAYBE_UNUSED size_t
+Num0BitsBelowLS1Bit_Nonzero(SizeTag<4> /* tag */, const uint32_t x) {
+ JXL_DASSERT(x != 0);
+#if JXL_COMPILER_MSVC
+ unsigned long index;
+ _BitScanForward(&index, x);
+ return index;
+#else
+ return static_cast<size_t>(__builtin_ctz(x));
+#endif
+}
+static JXL_INLINE JXL_MAYBE_UNUSED size_t
+Num0BitsBelowLS1Bit_Nonzero(SizeTag<8> /* tag */, const uint64_t x) {
+ JXL_DASSERT(x != 0);
+#if JXL_COMPILER_MSVC
+#if JXL_ARCH_X64
+ unsigned long index;
+ _BitScanForward64(&index, x);
+ return index;
+#else // JXL_ARCH_X64
+ // _BitScanForward64 not available
+ uint32_t lsb = static_cast<uint32_t>(x & 0xFFFFFFFF);
+ unsigned long index;
+ if (lsb == 0) {
+ uint32_t msb = static_cast<uint32_t>(x >> 32u);
+ _BitScanForward(&index, msb);
+ return 32 + index;
+ } else {
+ _BitScanForward(&index, lsb);
+ return index;
+ }
+#endif // JXL_ARCH_X64
+#else
+ return static_cast<size_t>(__builtin_ctzll(x));
+#endif
+}
+template <typename T>
+static JXL_INLINE JXL_MAYBE_UNUSED size_t Num0BitsBelowLS1Bit_Nonzero(T x) {
+ static_assert(!IsSigned<T>(), "Num0BitsBelowLS1Bit_Nonzero: use unsigned");
+ return Num0BitsBelowLS1Bit_Nonzero(SizeTag<sizeof(T)>(), x);
+}
+
+// Returns bit width for x == 0.
+template <typename T>
+static JXL_INLINE JXL_MAYBE_UNUSED size_t Num0BitsAboveMS1Bit(const T x) {
+ return (x == 0) ? sizeof(T) * 8 : Num0BitsAboveMS1Bit_Nonzero(x);
+}
+
+// Returns bit width for x == 0.
+template <typename T>
+static JXL_INLINE JXL_MAYBE_UNUSED size_t Num0BitsBelowLS1Bit(const T x) {
+ return (x == 0) ? sizeof(T) * 8 : Num0BitsBelowLS1Bit_Nonzero(x);
+}
+
+// Returns base-2 logarithm, rounded down.
+template <typename T>
+static JXL_INLINE JXL_MAYBE_UNUSED size_t FloorLog2Nonzero(const T x) {
+ return (sizeof(T) * 8 - 1) ^ Num0BitsAboveMS1Bit_Nonzero(x);
+}
+
+// Returns base-2 logarithm, rounded up.
+template <typename T>
+static JXL_INLINE JXL_MAYBE_UNUSED size_t CeilLog2Nonzero(const T x) {
+ const size_t floor_log2 = FloorLog2Nonzero(x);
+ if ((x & (x - 1)) == 0) return floor_log2; // power of two
+ return floor_log2 + 1;
+}
+
+} // namespace jxl
+
+#endif // LIB_JXL_BASE_BITS_H_
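
A minimal usage sketch (outside the patch; BitsExample is a hypothetical caller) of what the helpers above compute for one input:

    #include <stdint.h>

    #include "lib/jxl/base/bits.h"

    void BitsExample() {
      const uint32_t x = 0x50;  // 0b0101'0000: MS1 bit is bit 6, LS1 bit is bit 4
      size_t clz = jxl::Num0BitsAboveMS1Bit_Nonzero(x);  // 31 - 6 = 25
      size_t ctz = jxl::Num0BitsBelowLS1Bit_Nonzero(x);  // 4
      size_t fl2 = jxl::FloorLog2Nonzero(x);             // 6, via (32 - 1) ^ 25
      size_t cl2 = jxl::CeilLog2Nonzero(x);              // 7 (0x50 is not a power of two)
      (void)clz; (void)ctz; (void)fl2; (void)cl2;
    }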
diff --git a/third_party/jpeg-xl/lib/jxl/base/byte_order.h b/third_party/jpeg-xl/lib/jxl/base/byte_order.h
new file mode 100644
index 0000000000..8966834e08
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/base/byte_order.h
@@ -0,0 +1,274 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_BASE_BYTE_ORDER_H_
+#define LIB_JXL_BASE_BYTE_ORDER_H_
+
+#include <jxl/types.h>
+#include <stdint.h>
+#include <string.h> // memcpy
+
+#include "lib/jxl/base/compiler_specific.h"
+
+#if JXL_COMPILER_MSVC
+#include <intrin.h> // _byteswap_*
+#endif
+
+#if (defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__))
+#define JXL_BYTE_ORDER_LITTLE 1
+#else
+// This means that we don't know that the byte order is little endian; in
+// this case we use endian-neutral code that works for both little- and
+// big-endian machines.
+#define JXL_BYTE_ORDER_LITTLE 0
+#endif
+
+// Returns whether the system is little-endian (least-significant byte first).
+#if JXL_BYTE_ORDER_LITTLE
+static constexpr bool IsLittleEndian() { return true; }
+#else
+static inline bool IsLittleEndian() {
+ const uint32_t multibyte = 1;
+ uint8_t byte;
+ memcpy(&byte, &multibyte, 1);
+ return byte == 1;
+}
+#endif
+
+static inline bool SwapEndianness(JxlEndianness endianness) {
+ return ((endianness == JXL_BIG_ENDIAN && IsLittleEndian()) ||
+ (endianness == JXL_LITTLE_ENDIAN && !IsLittleEndian()));
+}
+
+#if JXL_COMPILER_MSVC
+#define JXL_BSWAP16(x) _byteswap_ushort(x)
+#define JXL_BSWAP32(x) _byteswap_ulong(x)
+#define JXL_BSWAP64(x) _byteswap_uint64(x)
+#else
+#define JXL_BSWAP16(x) __builtin_bswap16(x)
+#define JXL_BSWAP32(x) __builtin_bswap32(x)
+#define JXL_BSWAP64(x) __builtin_bswap64(x)
+#endif
+
+static JXL_INLINE uint32_t LoadBE16(const uint8_t* p) {
+ const uint32_t byte1 = p[0];
+ const uint32_t byte0 = p[1];
+ return (byte1 << 8) | byte0;
+}
+
+static JXL_INLINE uint32_t LoadLE16(const uint8_t* p) {
+ const uint32_t byte0 = p[0];
+ const uint32_t byte1 = p[1];
+ return (byte1 << 8) | byte0;
+}
+
+static JXL_INLINE uint32_t LoadBE32(const uint8_t* p) {
+#if JXL_BYTE_ORDER_LITTLE
+ uint32_t big;
+ memcpy(&big, p, 4);
+ return JXL_BSWAP32(big);
+#else
+ // Byte-order-independent - can't assume this machine is big endian.
+ const uint32_t byte3 = p[0];
+ const uint32_t byte2 = p[1];
+ const uint32_t byte1 = p[2];
+ const uint32_t byte0 = p[3];
+ return (byte3 << 24) | (byte2 << 16) | (byte1 << 8) | byte0;
+#endif
+}
+
+static JXL_INLINE uint64_t LoadBE64(const uint8_t* p) {
+#if JXL_BYTE_ORDER_LITTLE
+ uint64_t big;
+ memcpy(&big, p, 8);
+ return JXL_BSWAP64(big);
+#else
+ // Byte-order-independent - can't assume this machine is big endian.
+ const uint64_t byte7 = p[0];
+ const uint64_t byte6 = p[1];
+ const uint64_t byte5 = p[2];
+ const uint64_t byte4 = p[3];
+ const uint64_t byte3 = p[4];
+ const uint64_t byte2 = p[5];
+ const uint64_t byte1 = p[6];
+ const uint64_t byte0 = p[7];
+ return (byte7 << 56ull) | (byte6 << 48ull) | (byte5 << 40ull) |
+ (byte4 << 32ull) | (byte3 << 24ull) | (byte2 << 16ull) |
+ (byte1 << 8ull) | byte0;
+#endif
+}
+
+static JXL_INLINE uint32_t LoadLE32(const uint8_t* p) {
+#if JXL_BYTE_ORDER_LITTLE
+ uint32_t little;
+ memcpy(&little, p, 4);
+ return little;
+#else
+ // Byte-order-independent - can't assume this machine is big endian.
+ const uint32_t byte0 = p[0];
+ const uint32_t byte1 = p[1];
+ const uint32_t byte2 = p[2];
+ const uint32_t byte3 = p[3];
+ return (byte3 << 24) | (byte2 << 16) | (byte1 << 8) | byte0;
+#endif
+}
+
+static JXL_INLINE uint64_t LoadLE64(const uint8_t* p) {
+#if JXL_BYTE_ORDER_LITTLE
+ uint64_t little;
+ memcpy(&little, p, 8);
+ return little;
+#else
+ // Byte-order-independent - can't assume this machine is big endian.
+ const uint64_t byte0 = p[0];
+ const uint64_t byte1 = p[1];
+ const uint64_t byte2 = p[2];
+ const uint64_t byte3 = p[3];
+ const uint64_t byte4 = p[4];
+ const uint64_t byte5 = p[5];
+ const uint64_t byte6 = p[6];
+ const uint64_t byte7 = p[7];
+ return (byte7 << 56) | (byte6 << 48) | (byte5 << 40) | (byte4 << 32) |
+ (byte3 << 24) | (byte2 << 16) | (byte1 << 8) | byte0;
+#endif
+}
+
+// Loads a Big-Endian float
+static JXL_INLINE float LoadBEFloat(const uint8_t* p) {
+ uint32_t u = LoadBE32(p);
+ float result;
+ memcpy(&result, &u, 4);
+ return result;
+}
+
+// Loads a Little-Endian float
+static JXL_INLINE float LoadLEFloat(const uint8_t* p) {
+ uint32_t u = LoadLE32(p);
+ float result;
+ memcpy(&result, &u, 4);
+ return result;
+}
+
+static JXL_INLINE void StoreBE16(const uint32_t native, uint8_t* p) {
+ p[0] = (native >> 8) & 0xFF;
+ p[1] = native & 0xFF;
+}
+
+static JXL_INLINE void StoreLE16(const uint32_t native, uint8_t* p) {
+ p[1] = (native >> 8) & 0xFF;
+ p[0] = native & 0xFF;
+}
+
+static JXL_INLINE void StoreBE32(const uint32_t native, uint8_t* p) {
+#if JXL_BYTE_ORDER_LITTLE
+ const uint32_t big = JXL_BSWAP32(native);
+ memcpy(p, &big, 4);
+#else
+ // Byte-order-independent - can't assume this machine is big endian.
+ p[0] = native >> 24;
+ p[1] = (native >> 16) & 0xFF;
+ p[2] = (native >> 8) & 0xFF;
+ p[3] = native & 0xFF;
+#endif
+}
+
+static JXL_INLINE void StoreBE64(const uint64_t native, uint8_t* p) {
+#if JXL_BYTE_ORDER_LITTLE
+ const uint64_t big = JXL_BSWAP64(native);
+ memcpy(p, &big, 8);
+#else
+ // Byte-order-independent - can't assume this machine is big endian.
+ p[0] = native >> 56ull;
+ p[1] = (native >> 48ull) & 0xFF;
+ p[2] = (native >> 40ull) & 0xFF;
+ p[3] = (native >> 32ull) & 0xFF;
+ p[4] = (native >> 24ull) & 0xFF;
+ p[5] = (native >> 16ull) & 0xFF;
+ p[6] = (native >> 8ull) & 0xFF;
+ p[7] = native & 0xFF;
+#endif
+}
+
+static JXL_INLINE void StoreLE32(const uint32_t native, uint8_t* p) {
+#if JXL_BYTE_ORDER_LITTLE
+ const uint32_t little = native;
+ memcpy(p, &little, 4);
+#else
+ // Byte-order-independent - can't assume this machine is big endian.
+ p[3] = native >> 24;
+ p[2] = (native >> 16) & 0xFF;
+ p[1] = (native >> 8) & 0xFF;
+ p[0] = native & 0xFF;
+#endif
+}
+
+static JXL_INLINE void StoreLE64(const uint64_t native, uint8_t* p) {
+#if JXL_BYTE_ORDER_LITTLE
+ const uint64_t little = native;
+ memcpy(p, &little, 8);
+#else
+ // Byte-order-independent - can't assume this machine is big endian.
+ p[7] = native >> 56;
+ p[6] = (native >> 48) & 0xFF;
+ p[5] = (native >> 40) & 0xFF;
+ p[4] = (native >> 32) & 0xFF;
+ p[3] = (native >> 24) & 0xFF;
+ p[2] = (native >> 16) & 0xFF;
+ p[1] = (native >> 8) & 0xFF;
+ p[0] = native & 0xFF;
+#endif
+}
+
+static JXL_INLINE float BSwapFloat(float x) {
+ uint32_t u;
+ memcpy(&u, &x, 4);
+ uint32_t uswap = JXL_BSWAP32(u);
+ float xswap;
+ memcpy(&xswap, &uswap, 4);
+ return xswap;
+}
+
+// Big/Little Endian order.
+struct OrderBE {};
+struct OrderLE {};
+
+// Wrappers for calling from generic code.
+static JXL_INLINE void Store16(OrderBE /*tag*/, const uint32_t native,
+ uint8_t* p) {
+ return StoreBE16(native, p);
+}
+
+static JXL_INLINE void Store16(OrderLE /*tag*/, const uint32_t native,
+ uint8_t* p) {
+ return StoreLE16(native, p);
+}
+
+static JXL_INLINE void Store32(OrderBE /*tag*/, const uint32_t native,
+ uint8_t* p) {
+ return StoreBE32(native, p);
+}
+
+static JXL_INLINE void Store32(OrderLE /*tag*/, const uint32_t native,
+ uint8_t* p) {
+ return StoreLE32(native, p);
+}
+
+static JXL_INLINE uint32_t Load16(OrderBE /*tag*/, const uint8_t* p) {
+ return LoadBE16(p);
+}
+
+static JXL_INLINE uint32_t Load16(OrderLE /*tag*/, const uint8_t* p) {
+ return LoadLE16(p);
+}
+
+static JXL_INLINE uint32_t Load32(OrderBE /*tag*/, const uint8_t* p) {
+ return LoadBE32(p);
+}
+
+static JXL_INLINE uint32_t Load32(OrderLE /*tag*/, const uint8_t* p) {
+ return LoadLE32(p);
+}
+
+#endif // LIB_JXL_BASE_BYTE_ORDER_H_
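
A round-trip sketch (outside the patch; ByteOrderExample is hypothetical) showing that the load/store pairs agree regardless of host endianness:

    #include <stdint.h>

    #include "lib/jxl/base/byte_order.h"

    void ByteOrderExample() {
      uint8_t buf[4];
      StoreBE32(0x01020304u, buf);        // buf = {0x01, 0x02, 0x03, 0x04}
      const uint32_t be = LoadBE32(buf);  // 0x01020304 on any host
      const uint32_t le = LoadLE32(buf);  // 0x04030201: same bytes, LE view
      (void)be; (void)le;
    }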
diff --git a/third_party/jpeg-xl/lib/jxl/base/cache_aligned.cc b/third_party/jpeg-xl/lib/jxl/base/cache_aligned.cc
new file mode 100644
index 0000000000..9a9cc585a1
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/base/cache_aligned.cc
@@ -0,0 +1,157 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/base/cache_aligned.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+
+// Disabled: slower than malloc + alignment.
+#define JXL_USE_MMAP 0
+
+#if JXL_USE_MMAP
+#include <sys/mman.h>
+#endif
+
+#include <algorithm> // std::max
+#include <atomic>
+#include <hwy/base.h> // kMaxVectorSize
+#include <limits>
+
+#include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/base/status.h"
+
+namespace jxl {
+namespace {
+
+#pragma pack(push, 1)
+struct AllocationHeader {
+ void* allocated;
+ size_t allocated_size;
+ uint8_t left_padding[hwy::kMaxVectorSize];
+};
+#pragma pack(pop)
+
+std::atomic<uint64_t> num_allocations{0};
+std::atomic<uint64_t> bytes_in_use{0};
+std::atomic<uint64_t> max_bytes_in_use{0};
+
+} // namespace
+
+// Avoids linker errors in pre-C++17 builds.
+constexpr size_t CacheAligned::kPointerSize;
+constexpr size_t CacheAligned::kCacheLineSize;
+constexpr size_t CacheAligned::kAlignment;
+constexpr size_t CacheAligned::kAlias;
+
+void CacheAligned::PrintStats() {
+ fprintf(
+ stderr, "Allocations: %" PRIuS " (max bytes in use: %E)\n",
+ static_cast<size_t>(num_allocations.load(std::memory_order_relaxed)),
+ static_cast<double>(max_bytes_in_use.load(std::memory_order_relaxed)));
+}
+
+size_t CacheAligned::NextOffset() {
+ static std::atomic<uint32_t> next{0};
+ constexpr uint32_t kGroups = CacheAligned::kAlias / CacheAligned::kAlignment;
+ const uint32_t group = next.fetch_add(1, std::memory_order_relaxed) % kGroups;
+ return CacheAligned::kAlignment * group;
+}
+
+void* CacheAligned::Allocate(const size_t payload_size, size_t offset) {
+ JXL_ASSERT(payload_size <= std::numeric_limits<size_t>::max() / 2);
+ JXL_ASSERT((offset % kAlignment == 0) && offset <= kAlias);
+
+ // What: | misalign | unused | AllocationHeader |payload
+ // Size: |<= kAlias | offset | |payload_size
+ // ^allocated.^aligned.^header............^payload
+ // The header must immediately precede payload, which must remain aligned.
+ // To avoid wasting space, the header resides at the end of `unused`,
+ // which therefore cannot be empty (offset == 0).
+ if (offset == 0) {
+ // SVE/RVV vectors can be large, so we cannot rely on them (including the
+ // padding at the end of AllocationHeader) to fit in kAlignment.
+ offset = hwy::RoundUpTo(sizeof(AllocationHeader), kAlignment);
+ }
+
+#if JXL_USE_MMAP
+ const size_t allocated_size = offset + payload_size;
+ const int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE;
+ void* allocated =
+ mmap(nullptr, allocated_size, PROT_READ | PROT_WRITE, flags, -1, 0);
+ if (allocated == MAP_FAILED) return nullptr;
+ const uintptr_t aligned = reinterpret_cast<uintptr_t>(allocated);
+#else
+ const size_t allocated_size = kAlias + offset + payload_size;
+ void* allocated = malloc(allocated_size);
+ if (allocated == nullptr) return nullptr;
+ // Always round up even if already aligned - we already asked for kAlias
+ // extra bytes and there's no way to give them back.
+ uintptr_t aligned = reinterpret_cast<uintptr_t>(allocated) + kAlias;
+ static_assert((kAlias & (kAlias - 1)) == 0, "kAlias must be a power of 2");
+ static_assert(kAlias >= kAlignment, "Cannot align to more than kAlias");
+ aligned &= ~(kAlias - 1);
+#endif
+
+#if 0
+ // No effect.
+ uintptr_t page_aligned = reinterpret_cast<uintptr_t>(allocated);
+ page_aligned &= ~(4096 - 1);
+ if (madvise(reinterpret_cast<void*>(page_aligned), allocated_size,
+ MADV_WILLNEED) != 0) {
+ JXL_NOTIFY_ERROR("madvise failed");
+ }
+#elif 0
+ // INCREASES both first and subsequent decode times.
+ if (mlock(allocated, allocated_size) != 0) {
+ JXL_NOTIFY_ERROR("mlock failed");
+ }
+#endif
+
+ // Update statistics (#allocations and max bytes in use)
+ num_allocations.fetch_add(1, std::memory_order_relaxed);
+ const uint64_t prev_bytes =
+ bytes_in_use.fetch_add(allocated_size, std::memory_order_acq_rel);
+ uint64_t expected_max = max_bytes_in_use.load(std::memory_order_acquire);
+ for (;;) {
+ const uint64_t desired =
+ std::max(expected_max, prev_bytes + allocated_size);
+ if (max_bytes_in_use.compare_exchange_strong(expected_max, desired,
+ std::memory_order_acq_rel)) {
+ break;
+ }
+ }
+
+ const uintptr_t payload = aligned + offset; // still aligned
+
+ // Stash `allocated` and payload_size inside header for use by Free().
+ AllocationHeader* header = reinterpret_cast<AllocationHeader*>(payload) - 1;
+ header->allocated = allocated;
+ header->allocated_size = allocated_size;
+
+ return JXL_ASSUME_ALIGNED(reinterpret_cast<void*>(payload), 64);
+}
+
+void CacheAligned::Free(const void* aligned_pointer) {
+ if (aligned_pointer == nullptr) {
+ return;
+ }
+ const uintptr_t payload = reinterpret_cast<uintptr_t>(aligned_pointer);
+ JXL_ASSERT(payload % kAlignment == 0);
+ const AllocationHeader* header =
+ reinterpret_cast<const AllocationHeader*>(payload) - 1;
+
+ // Subtract (2's complement negation).
+ bytes_in_use.fetch_add(~header->allocated_size + 1,
+ std::memory_order_acq_rel);
+
+#if JXL_USE_MMAP
+ munmap(header->allocated, header->allocated_size);
+#else
+ free(header->allocated);
+#endif
+}
+
+} // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/base/cache_aligned.h b/third_party/jpeg-xl/lib/jxl/base/cache_aligned.h
new file mode 100644
index 0000000000..e57df14837
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/base/cache_aligned.h
@@ -0,0 +1,74 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_BASE_CACHE_ALIGNED_H_
+#define LIB_JXL_BASE_CACHE_ALIGNED_H_
+
+// Memory allocator with support for alignment + misalignment.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <memory>
+
+#include "lib/jxl/base/compiler_specific.h"
+
+namespace jxl {
+
+// Functions that depend on the cache line size.
+class CacheAligned {
+ public:
+ static void PrintStats();
+
+ static constexpr size_t kPointerSize = sizeof(void*);
+ static constexpr size_t kCacheLineSize = 64;
+ // To avoid RFOs, match L2 fill size (pairs of lines).
+ static constexpr size_t kAlignment = 2 * kCacheLineSize;
+ // Minimum multiple for which cache set conflicts and/or loads blocked by
+ // preceding stores can occur.
+ static constexpr size_t kAlias = 2048;
+
+ // Returns a 'random' (cyclical) offset suitable for Allocate.
+ static size_t NextOffset();
+
+ // Returns null or memory whose address is congruent to `offset` (mod kAlias).
+ // This reduces cache conflicts and load/store stalls, especially with large
+ // allocations that would otherwise have similar alignments. At least
+ // `payload_size` (which can be zero) bytes will be accessible.
+ static void* Allocate(size_t payload_size, size_t offset);
+
+ static void* Allocate(const size_t payload_size) {
+ return Allocate(payload_size, NextOffset());
+ }
+
+ static void Free(const void* aligned_pointer);
+};
+
+// Avoids the need for a function pointer (deleter) in CacheAlignedUniquePtr.
+struct CacheAlignedDeleter {
+ void operator()(uint8_t* aligned_pointer) const {
+ return CacheAligned::Free(aligned_pointer);
+ }
+};
+
+using CacheAlignedUniquePtr = std::unique_ptr<uint8_t[], CacheAlignedDeleter>;
+
+// Does not invoke constructors.
+static inline CacheAlignedUniquePtr AllocateArray(const size_t bytes) {
+ return CacheAlignedUniquePtr(
+ static_cast<uint8_t*>(CacheAligned::Allocate(bytes)),
+ CacheAlignedDeleter());
+}
+
+static inline CacheAlignedUniquePtr AllocateArray(const size_t bytes,
+ const size_t offset) {
+ return CacheAlignedUniquePtr(
+ static_cast<uint8_t*>(CacheAligned::Allocate(bytes, offset)),
+ CacheAlignedDeleter());
+}
+
+} // namespace jxl
+
+#endif // LIB_JXL_BASE_CACHE_ALIGNED_H_
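
A short sketch (outside the patch; names are hypothetical) of the typical allocation pattern. The deleter frees via CacheAligned::Free, and the result must be null-checked because allocation can fail:

    #include "lib/jxl/base/cache_aligned.h"

    bool CacheAlignedExample() {
      // 1 MiB scratch buffer, kAlignment-aligned, with a rotating kAlias offset.
      jxl::CacheAlignedUniquePtr buf = jxl::AllocateArray(1 << 20);
      if (buf == nullptr) return false;  // allocation failure
      buf[0] = 42;                       // at least `bytes` payload is accessible
      return true;                       // freed by CacheAlignedDeleter
    }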
diff --git a/third_party/jpeg-xl/lib/jxl/base/compiler_specific.h b/third_party/jpeg-xl/lib/jxl/base/compiler_specific.h
new file mode 100644
index 0000000000..abe1261f48
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/base/compiler_specific.h
@@ -0,0 +1,157 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_BASE_COMPILER_SPECIFIC_H_
+#define LIB_JXL_BASE_COMPILER_SPECIFIC_H_
+
+// Macros for compiler version + nonstandard keywords, e.g. __builtin_expect.
+
+#include <stdint.h>
+#include <sys/types.h>
+
+#include "lib/jxl/base/sanitizer_definitions.h"
+
+// #if is shorter and safer than #ifdef. *_VERSION are zero if not detected,
+// otherwise 100 * major + minor version. Note that other packages check for
+// #ifdef COMPILER_MSVC, so we cannot use that same name.
+
+#ifdef _MSC_VER
+#define JXL_COMPILER_MSVC _MSC_VER
+#else
+#define JXL_COMPILER_MSVC 0
+#endif
+
+#ifdef __GNUC__
+#define JXL_COMPILER_GCC (__GNUC__ * 100 + __GNUC_MINOR__)
+#else
+#define JXL_COMPILER_GCC 0
+#endif
+
+#ifdef __clang__
+#define JXL_COMPILER_CLANG (__clang_major__ * 100 + __clang_minor__)
+// Clang pretends to be GCC for compatibility.
+#undef JXL_COMPILER_GCC
+#define JXL_COMPILER_GCC 0
+#else
+#define JXL_COMPILER_CLANG 0
+#endif
+
+#if JXL_COMPILER_MSVC
+#define JXL_RESTRICT __restrict
+#elif JXL_COMPILER_GCC || JXL_COMPILER_CLANG
+#define JXL_RESTRICT __restrict__
+#else
+#define JXL_RESTRICT
+#endif
+
+#if JXL_COMPILER_MSVC
+#define JXL_INLINE __forceinline
+#define JXL_NOINLINE __declspec(noinline)
+#else
+#define JXL_INLINE inline __attribute__((always_inline))
+#define JXL_NOINLINE __attribute__((noinline))
+#endif
+
+#if JXL_COMPILER_MSVC
+#define JXL_NORETURN __declspec(noreturn)
+#elif JXL_COMPILER_GCC || JXL_COMPILER_CLANG
+#define JXL_NORETURN __attribute__((noreturn))
+#else
+#define JXL_NORETURN
+#endif
+
+#if JXL_COMPILER_MSVC
+#define JXL_UNREACHABLE __assume(false)
+#elif JXL_COMPILER_CLANG || JXL_COMPILER_GCC >= 405
+#define JXL_UNREACHABLE __builtin_unreachable()
+#else
+#define JXL_UNREACHABLE
+#endif
+
+#if JXL_COMPILER_MSVC
+#define JXL_MAYBE_UNUSED
+#else
+// Encountered "attribute list cannot appear here" when using the C++17
+// [[maybe_unused]], so only use the old style attribute for now.
+#define JXL_MAYBE_UNUSED __attribute__((unused))
+#endif
+
+// MSAN execution won't hurt if some code is not inlined, and skipping
+// inlining can greatly improve compilation time. Unfortunately this macro
+// cannot be used everywhere: inside header files it leads to "multiple
+// definition" errors, though it would be better not to have JXL_INLINE in
+// headers at all.
+#if JXL_MEMORY_SANITIZER || JXL_ADDRESS_SANITIZER || JXL_THREAD_SANITIZER
+#define JXL_MAYBE_INLINE JXL_MAYBE_UNUSED
+#else
+#define JXL_MAYBE_INLINE JXL_INLINE
+#endif
+
+#if JXL_COMPILER_MSVC
+// Unsupported, __assume is not the same.
+#define JXL_LIKELY(expr) expr
+#define JXL_UNLIKELY(expr) expr
+#else
+#define JXL_LIKELY(expr) __builtin_expect(!!(expr), 1)
+#define JXL_UNLIKELY(expr) __builtin_expect(!!(expr), 0)
+#endif
+
+// Returns a void* pointer which the compiler then assumes is N-byte aligned.
+// Example: float* JXL_RESTRICT aligned = (float*)JXL_ASSUME_ALIGNED(in, 32);
+//
+// The assignment semantics are required by GCC/Clang. ICC provides an in-place
+// __assume_aligned, whereas MSVC's __assume appears unsuitable.
+#if JXL_COMPILER_CLANG
+// Early versions of Clang did not support __builtin_assume_aligned.
+#define JXL_HAS_ASSUME_ALIGNED __has_builtin(__builtin_assume_aligned)
+#elif JXL_COMPILER_GCC
+#define JXL_HAS_ASSUME_ALIGNED 1
+#else
+#define JXL_HAS_ASSUME_ALIGNED 0
+#endif
+
+#if JXL_HAS_ASSUME_ALIGNED
+#define JXL_ASSUME_ALIGNED(ptr, align) __builtin_assume_aligned((ptr), (align))
+#else
+#define JXL_ASSUME_ALIGNED(ptr, align) (ptr) /* not supported */
+#endif
+
+#ifdef __has_attribute
+#define JXL_HAVE_ATTRIBUTE(x) __has_attribute(x)
+#else
+#define JXL_HAVE_ATTRIBUTE(x) 0
+#endif
+
+// Raises warnings if the function return value is unused. Should appear as the
+// first part of a function definition/declaration.
+#if JXL_HAVE_ATTRIBUTE(nodiscard)
+#define JXL_MUST_USE_RESULT [[nodiscard]]
+#elif JXL_COMPILER_CLANG && JXL_HAVE_ATTRIBUTE(warn_unused_result)
+#define JXL_MUST_USE_RESULT __attribute__((warn_unused_result))
+#else
+#define JXL_MUST_USE_RESULT
+#endif
+
+// Disable certain -fsanitize flags for functions that are expected to include
+// things like unsigned integer overflow. For example use in the function
+// declaration JXL_NO_SANITIZE("unsigned-integer-overflow") to silence unsigned
+// integer overflow ubsan messages.
+#if JXL_COMPILER_CLANG && JXL_HAVE_ATTRIBUTE(no_sanitize)
+#define JXL_NO_SANITIZE(X) __attribute__((no_sanitize(X)))
+#else
+#define JXL_NO_SANITIZE(X)
+#endif
+
+#if JXL_HAVE_ATTRIBUTE(__format__)
+#define JXL_FORMAT(idx_fmt, idx_arg) \
+ __attribute__((__format__(__printf__, idx_fmt, idx_arg)))
+#else
+#define JXL_FORMAT(idx_fmt, idx_arg)
+#endif
+
+#if JXL_COMPILER_MSVC
+using ssize_t = intptr_t;
+#endif
+
+#endif // LIB_JXL_BASE_COMPILER_SPECIFIC_H_
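
A sketch (outside the patch; SumAligned is hypothetical) combining several of the annotations above, in the style of the JXL_ASSUME_ALIGNED comment:

    #include <stddef.h>

    #include "lib/jxl/base/compiler_specific.h"

    float SumAligned(const float* JXL_RESTRICT in, size_t n) {
      // Promise the compiler that `in` is 32-byte aligned (e.g. for SIMD loads).
      const float* JXL_RESTRICT aligned =
          static_cast<const float*>(JXL_ASSUME_ALIGNED(in, 32));
      float sum = 0.0f;
      for (size_t i = 0; i < n; ++i) {
        if (JXL_UNLIKELY(aligned[i] < 0.0f)) continue;  // branch hint
        sum += aligned[i];
      }
      return sum;
    }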
diff --git a/third_party/jpeg-xl/lib/jxl/base/data_parallel.cc b/third_party/jpeg-xl/lib/jxl/base/data_parallel.cc
new file mode 100644
index 0000000000..20a911255c
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/base/data_parallel.cc
@@ -0,0 +1,23 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/base/data_parallel.h"
+
+namespace jxl {
+
+// static
+JxlParallelRetCode ThreadPool::SequentialRunnerStatic(
+ void* runner_opaque, void* jpegxl_opaque, JxlParallelRunInit init,
+ JxlParallelRunFunction func, uint32_t start_range, uint32_t end_range) {
+ JxlParallelRetCode init_ret = (*init)(jpegxl_opaque, 1);
+ if (init_ret != 0) return init_ret;
+
+ for (uint32_t i = start_range; i < end_range; i++) {
+ (*func)(jpegxl_opaque, i, 0);
+ }
+ return 0;
+}
+
+} // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/base/data_parallel.h b/third_party/jpeg-xl/lib/jxl/base/data_parallel.h
new file mode 100644
index 0000000000..ba7e7adfad
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/base/data_parallel.h
@@ -0,0 +1,120 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_BASE_DATA_PARALLEL_H_
+#define LIB_JXL_BASE_DATA_PARALLEL_H_
+
+// Portable, low-overhead C++11 ThreadPool alternative to OpenMP for
+// data-parallel computations.
+
+#include <jxl/parallel_runner.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include "lib/jxl/base/bits.h"
+#include "lib/jxl/base/status.h"
+#if JXL_COMPILER_MSVC
+// suppress warnings about the const & applied to function types
+#pragma warning(disable : 4180)
+#endif
+
+namespace jxl {
+
+class ThreadPool {
+ public:
+ ThreadPool(JxlParallelRunner runner, void* runner_opaque)
+ : runner_(runner ? runner : &ThreadPool::SequentialRunnerStatic),
+ runner_opaque_(runner ? runner_opaque : static_cast<void*>(this)) {}
+
+ ThreadPool(const ThreadPool&) = delete;
+ ThreadPool& operator=(const ThreadPool&) = delete;
+
+ JxlParallelRunner runner() const { return runner_; }
+ void* runner_opaque() const { return runner_opaque_; }
+
+ // Runs init_func(num_threads) followed by data_func(task, thread) on worker
+ // thread(s) for every task in [begin, end). init_func() must return a Status
+ // indicating whether the initialization succeeded.
+ // "thread" is an integer smaller than num_threads.
+ // Not thread-safe - no two calls to Run may overlap.
+ // Subsequent calls will reuse the same threads.
+ //
+ // Precondition: begin <= end.
+ template <class InitFunc, class DataFunc>
+ Status Run(uint32_t begin, uint32_t end, const InitFunc& init_func,
+ const DataFunc& data_func, const char* caller = "") {
+ JXL_ASSERT(begin <= end);
+ if (begin == end) return true;
+ RunCallState<InitFunc, DataFunc> call_state(init_func, data_func);
+ // The runner_ uses the C convention and returns 0 in case of error, so we
+ // convert it to a Status.
+ return (*runner_)(runner_opaque_, static_cast<void*>(&call_state),
+ &call_state.CallInitFunc, &call_state.CallDataFunc, begin,
+ end) == 0;
+ }
+
+ // Use this as init_func when no initialization is needed.
+ static Status NoInit(size_t num_threads) { return true; }
+
+ private:
+ // class holding the state of a Run() call to pass to the runner_ as an
+ // opaque_jpegxl pointer.
+ template <class InitFunc, class DataFunc>
+ class RunCallState final {
+ public:
+ RunCallState(const InitFunc& init_func, const DataFunc& data_func)
+ : init_func_(init_func), data_func_(data_func) {}
+
+ // JxlParallelRunInit interface.
+ static int CallInitFunc(void* jpegxl_opaque, size_t num_threads) {
+ const auto* self =
+ static_cast<RunCallState<InitFunc, DataFunc>*>(jpegxl_opaque);
+ // Returns -1 when the internal init function returns a false Status to
+ // indicate an error.
+ return self->init_func_(num_threads) ? 0 : -1;
+ }
+
+ // JxlParallelRunFunction interface.
+ static void CallDataFunc(void* jpegxl_opaque, uint32_t value,
+ size_t thread_id) {
+ const auto* self =
+ static_cast<RunCallState<InitFunc, DataFunc>*>(jpegxl_opaque);
+ return self->data_func_(value, thread_id);
+ }
+
+ private:
+ const InitFunc& init_func_;
+ const DataFunc& data_func_;
+ };
+
+ // Default JxlParallelRunner used when no runner is provided by the
+ // caller. This runner doesn't use any threading and thread_id is always 0.
+ static JxlParallelRetCode SequentialRunnerStatic(
+ void* runner_opaque, void* jpegxl_opaque, JxlParallelRunInit init,
+ JxlParallelRunFunction func, uint32_t start_range, uint32_t end_range);
+
+ // The caller supplied runner function and its opaque void*.
+ const JxlParallelRunner runner_;
+ void* const runner_opaque_;
+};
+
+template <class InitFunc, class DataFunc>
+Status RunOnPool(ThreadPool* pool, const uint32_t begin, const uint32_t end,
+ const InitFunc& init_func, const DataFunc& data_func,
+ const char* caller) {
+ if (pool == nullptr) {
+ ThreadPool default_pool(nullptr, nullptr);
+ return default_pool.Run(begin, end, init_func, data_func, caller);
+ } else {
+ return pool->Run(begin, end, init_func, data_func, caller);
+ }
+}
+
+} // namespace jxl
+#if JXL_COMPILER_MSVC
+#pragma warning(default : 4180)
+#endif
+
+#endif // LIB_JXL_BASE_DATA_PARALLEL_H_
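
A usage sketch (outside the patch; FillSquares is hypothetical). With a null pool, RunOnPool falls back to the sequential runner and `thread` is always 0:

    #include <stdint.h>

    #include <vector>

    #include "lib/jxl/base/data_parallel.h"

    jxl::Status FillSquares(jxl::ThreadPool* pool, std::vector<uint32_t>* out) {
      out->resize(100);
      return jxl::RunOnPool(
          pool, /*begin=*/0, /*end=*/100, jxl::ThreadPool::NoInit,
          [out](uint32_t task, size_t /*thread*/) { (*out)[task] = task * task; },
          /*caller=*/"FillSquares");
    }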
diff --git a/third_party/jpeg-xl/lib/jxl/base/file_io.h b/third_party/jpeg-xl/lib/jxl/base/file_io.h
new file mode 100644
index 0000000000..64d5860915
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/base/file_io.h
@@ -0,0 +1,153 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_BASE_FILE_IO_H_
+#define LIB_JXL_BASE_FILE_IO_H_
+
+// Helper functions for reading/writing files.
+
+#include <stdio.h>
+#include <sys/stat.h>
+
+#include <list>
+#include <string>
+#include <vector>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/padded_bytes.h"
+#include "lib/jxl/base/status.h"
+
+namespace jxl {
+
+// Returns extension including the dot, or empty string if none. Assumes
+// filename is not a hidden file (e.g. ".bashrc"). May be called with a pathname
+// if the filename contains a dot and/or no other path component does.
+static inline std::string Extension(const std::string& filename) {
+ const size_t pos = filename.rfind('.');
+ if (pos == std::string::npos) return std::string();
+ return filename.substr(pos);
+}
+
+// RAII, ensures files are closed even when returning early.
+class FileWrapper {
+ public:
+ FileWrapper(const FileWrapper& other) = delete;
+ FileWrapper& operator=(const FileWrapper& other) = delete;
+
+ explicit FileWrapper(const std::string& pathname, const char* mode)
+ : file_(pathname == "-" ? (mode[0] == 'r' ? stdin : stdout)
+ : fopen(pathname.c_str(), mode)),
+ close_on_delete_(pathname != "-") {
+#ifdef _WIN32
+ struct __stat64 s = {};
+ const int err = _stat64(pathname.c_str(), &s);
+ const bool is_file = (s.st_mode & S_IFREG) != 0;
+#else
+ struct stat s = {};
+ const int err = stat(pathname.c_str(), &s);
+ const bool is_file = S_ISREG(s.st_mode);
+#endif
+ if (err == 0 && is_file) {
+ size_ = s.st_size;
+ }
+ }
+
+ ~FileWrapper() {
+ if (file_ != nullptr && close_on_delete_) {
+ const int err = fclose(file_);
+ JXL_CHECK(err == 0);
+ }
+ }
+
+ // We intend to use FileWrapper as a replacement of FILE.
+ // NOLINTNEXTLINE(google-explicit-constructor)
+ operator FILE*() const { return file_; }
+
+ int64_t size() { return size_; }
+
+ private:
+ FILE* const file_;
+ bool close_on_delete_ = true;
+ int64_t size_ = -1;
+};
+
+template <typename ContainerType>
+static inline Status ReadFile(const std::string& pathname,
+ ContainerType* JXL_RESTRICT bytes) {
+ FileWrapper f(pathname, "rb");
+ if (f == nullptr)
+ return JXL_FAILURE("Failed to open file for reading: %s", pathname.c_str());
+
+ // Get size of file in bytes
+ const int64_t size = f.size();
+ if (size < 0) {
+ // Size is unknown, loop reading chunks until EOF.
+ bytes->clear();
+ std::list<std::vector<uint8_t>> chunks;
+
+ size_t total_size = 0;
+ while (true) {
+ std::vector<uint8_t> chunk(16 * 1024);
+ const size_t bytes_read = fread(chunk.data(), 1, chunk.size(), f);
+ if (ferror(f) || bytes_read > chunk.size()) {
+ return JXL_FAILURE("Error reading %s", pathname.c_str());
+ }
+
+ chunk.resize(bytes_read);
+ total_size += bytes_read;
+ if (bytes_read != 0) {
+ chunks.emplace_back(std::move(chunk));
+ }
+ if (feof(f)) {
+ break;
+ }
+ }
+ bytes->resize(total_size);
+ size_t pos = 0;
+ for (const auto& chunk : chunks) {
+ // Needed in case ContainerType is std::string, whose data() is const.
+ char* bytes_writable = reinterpret_cast<char*>(&(*bytes)[0]);
+ memcpy(bytes_writable + pos, chunk.data(), chunk.size());
+ pos += chunk.size();
+ }
+ } else {
+ // Size is known, read the file directly.
+ bytes->resize(static_cast<size_t>(size));
+ size_t pos = 0;
+ while (pos < bytes->size()) {
+ // Needed in case ContainerType is std::string, whose data() is const.
+ char* bytes_writable = reinterpret_cast<char*>(&(*bytes)[0]);
+ const size_t bytes_read =
+ fread(bytes_writable + pos, 1, bytes->size() - pos, f);
+ if (bytes_read == 0) return JXL_FAILURE("Failed to read");
+ pos += bytes_read;
+ }
+ JXL_ASSERT(pos == bytes->size());
+ }
+ return true;
+}
+
+template <typename ContainerType>
+static inline Status WriteFile(const ContainerType& bytes,
+ const std::string& pathname) {
+ FileWrapper f(pathname, "wb");
+ if (f == nullptr)
+ return JXL_FAILURE("Failed to open file for writing: %s", pathname.c_str());
+
+ size_t pos = 0;
+ while (pos < bytes.size()) {
+ const size_t bytes_written =
+ fwrite(bytes.data() + pos, 1, bytes.size() - pos, f);
+ if (bytes_written == 0) return JXL_FAILURE("Failed to write");
+ pos += bytes_written;
+ }
+ JXL_ASSERT(pos == bytes.size());
+
+ return true;
+}
+
+} // namespace jxl
+
+#endif // LIB_JXL_BASE_FILE_IO_H_
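
A sketch (outside the patch; CopyFileExample is hypothetical, and the JXL_RETURN_IF_ERROR macro is assumed to come from status.h) of the read-then-write pattern these helpers support:

    #include <string>

    #include "lib/jxl/base/file_io.h"
    #include "lib/jxl/base/padded_bytes.h"

    jxl::Status CopyFileExample(const std::string& in, const std::string& out) {
      jxl::PaddedBytes bytes;
      JXL_RETURN_IF_ERROR(jxl::ReadFile(in, &bytes));  // "-" selects stdin
      return jxl::WriteFile(bytes, out);               // "-" selects stdout
    }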
diff --git a/third_party/jpeg-xl/lib/jxl/base/float.h b/third_party/jpeg-xl/lib/jxl/base/float.h
new file mode 100644
index 0000000000..90bdeedf54
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/base/float.h
@@ -0,0 +1,98 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_BASE_FLOAT_H_
+#define LIB_JXL_BASE_FLOAT_H_
+
+#include <jxl/types.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "lib/jxl/base/byte_order.h"
+#include "lib/jxl/base/status.h"
+
+namespace jxl {
+
+namespace {
+// Based on highway scalar implementation, for testing
+float LoadFloat16(uint16_t bits16) {
+ const uint32_t sign = bits16 >> 15;
+ const uint32_t biased_exp = (bits16 >> 10) & 0x1F;
+ const uint32_t mantissa = bits16 & 0x3FF;
+
+ // Subnormal or zero
+ if (biased_exp == 0) {
+ const float subnormal = (1.0f / 16384) * (mantissa * (1.0f / 1024));
+ return sign ? -subnormal : subnormal;
+ }
+
+ // Normalized: convert the representation directly (faster than ldexp/tables).
+ const uint32_t biased_exp32 = biased_exp + (127 - 15);
+ const uint32_t mantissa32 = mantissa << (23 - 10);
+ const uint32_t bits32 = (sign << 31) | (biased_exp32 << 23) | mantissa32;
+
+ float result;
+ memcpy(&result, &bits32, 4);
+ return result;
+}
+} // namespace
+
+template <typename SaveFloatAtFn>
+static Status JXL_INLINE LoadFloatRow(const uint8_t* src, size_t count,
+ size_t stride, JxlDataType type,
+ bool little_endian, float scale,
+ SaveFloatAtFn callback) {
+ switch (type) {
+ case JXL_TYPE_FLOAT:
+ if (little_endian) {
+ for (size_t i = 0; i < count; ++i) {
+ callback(i, LoadLEFloat(src + stride * i));
+ }
+ } else {
+ for (size_t i = 0; i < count; ++i) {
+ callback(i, LoadBEFloat(src + stride * i));
+ }
+ }
+ return true;
+
+ case JXL_TYPE_UINT8:
+ for (size_t i = 0; i < count; ++i) {
+ callback(i, src[stride * i] * scale);
+ }
+ return true;
+
+ case JXL_TYPE_UINT16:
+ if (little_endian) {
+ for (size_t i = 0; i < count; ++i) {
+ callback(i, LoadLE16(src + stride * i) * scale);
+ }
+ } else {
+ for (size_t i = 0; i < count; ++i) {
+ callback(i, LoadBE16(src + stride * i) * scale);
+ }
+ }
+ return true;
+
+ case JXL_TYPE_FLOAT16:
+ if (little_endian) {
+ for (size_t i = 0; i < count; ++i) {
+ callback(i, LoadFloat16(LoadLE16(src + stride * i)));
+ }
+ } else {
+ for (size_t i = 0; i < count; ++i) {
+ callback(i, LoadFloat16(LoadBE16(src + stride * i)));
+ }
+ }
+ return true;
+
+ default:
+ return JXL_FAILURE("Unsupported sample format");
+ }
+}
+
+} // namespace jxl
+
+#endif // LIB_JXL_BASE_FLOAT_H_
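
For reference, 0x3C00 decodes to 1.0f (sign 0, biased exponent 15, mantissa 0) and 0x0400 to 2^-14, the smallest normal half-float. A sketch (outside the patch; HalfRowToFloats is hypothetical) of driving LoadFloatRow with a callback:

    #include <stdint.h>

    #include <vector>

    #include "lib/jxl/base/float.h"

    jxl::Status HalfRowToFloats(const uint8_t* src, size_t count,
                                std::vector<float>* out) {
      out->resize(count);
      // Contiguous little-endian FLOAT16 samples: 2 bytes per sample.
      // `scale` is ignored for FLOAT16.
      return jxl::LoadFloatRow(
          src, count, /*stride=*/2, JXL_TYPE_FLOAT16, /*little_endian=*/true,
          /*scale=*/1.0f, [out](size_t i, float v) { (*out)[i] = v; });
    }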
diff --git a/third_party/jpeg-xl/lib/jxl/base/iaca.h b/third_party/jpeg-xl/lib/jxl/base/iaca.h
new file mode 100644
index 0000000000..e5732dae5c
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/base/iaca.h
@@ -0,0 +1,65 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_BASE_IACA_H_
+#define LIB_JXL_BASE_IACA_H_
+
+#include "lib/jxl/base/compiler_specific.h"
+
+// IACA (Intel's Code Analyzer) analyzes instruction latencies, but only for
+// code between special markers. These functions embed such markers in an
+// executable, but only for reading via IACA - they deliberately trigger a
+// crash if executed to ensure they are removed in normal builds.
+
+#ifndef JXL_IACA_ENABLED
+#define JXL_IACA_ENABLED 0
+#endif
+
+namespace jxl {
+
+// Call before the region of interest.
+static JXL_INLINE void BeginIACA() {
+#if JXL_IACA_ENABLED && (JXL_COMPILER_GCC || JXL_COMPILER_CLANG)
+ asm volatile(
+ // UD2 "instruction" raises an invalid opcode exception.
+ ".byte 0x0F, 0x0B\n\t"
+ // Magic sequence recognized by IACA (MOV + addr32 fs:NOP). This actually
+ // clobbers EBX, but we don't care because the code won't be run, and we
+ // want IACA to observe the same code the compiler would have generated
+ // without this marker.
+ "movl $111, %%ebx\n\t"
+ ".byte 0x64, 0x67, 0x90\n\t"
+ :
+ :
+ // (Allegedly) clobbering memory may prevent reordering.
+ : "memory");
+#endif
+}
+
+// Call after the region of interest.
+static JXL_INLINE void EndIACA() {
+#if JXL_IACA_ENABLED && (JXL_COMPILER_GCC || JXL_COMPILER_CLANG)
+ asm volatile(
+ // See above.
+ "movl $222, %%ebx\n\t"
+ ".byte 0x64, 0x67, 0x90\n\t"
+ // UD2
+ ".byte 0x0F, 0x0B\n\t"
+ :
+ :
+ // (Allegedly) clobbering memory may prevent reordering.
+ : "memory");
+#endif
+}
+
+// Add to a scope to mark a region.
+struct ScopeIACA {
+ JXL_INLINE ScopeIACA() { BeginIACA(); }
+ JXL_INLINE ~ScopeIACA() { EndIACA(); }
+};
+
+} // namespace jxl
+
+#endif // LIB_JXL_BASE_IACA_H_
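
A sketch (outside the patch; DotIACA is hypothetical, and the build is assumed to define JXL_IACA_ENABLED=1) of marking a hot loop. The resulting binary is for IACA analysis only; executing the markers raises UD2:

    #include <stddef.h>

    #include "lib/jxl/base/iaca.h"

    float DotIACA(const float* a, const float* b, size_t n) {
      float sum = 0.0f;
      {
        jxl::ScopeIACA iaca;  // emits begin/end markers around this scope
        for (size_t i = 0; i < n; ++i) sum += a[i] * b[i];
      }
      return sum;
    }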
diff --git a/third_party/jpeg-xl/lib/jxl/base/os_macros.h b/third_party/jpeg-xl/lib/jxl/base/os_macros.h
new file mode 100644
index 0000000000..84d0b82bf5
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/base/os_macros.h
@@ -0,0 +1,50 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_BASE_OS_MACROS_H_
+#define LIB_JXL_BASE_OS_MACROS_H_
+
+// Defines the JXL_OS_* macros.
+
+#if defined(_WIN32) || defined(_WIN64)
+#define JXL_OS_WIN 1
+#else
+#define JXL_OS_WIN 0
+#endif
+
+#ifdef __linux__
+#define JXL_OS_LINUX 1
+#else
+#define JXL_OS_LINUX 0
+#endif
+
+#ifdef __APPLE__
+#define JXL_OS_MAC 1
+#else
+#define JXL_OS_MAC 0
+#endif
+
+#define JXL_OS_IOS 0
+#ifdef __APPLE__
+#include <TargetConditionals.h>
+#if TARGET_OS_IPHONE
+#undef JXL_OS_IOS
+#define JXL_OS_IOS 1
+#endif
+#endif
+
+#ifdef __FreeBSD__
+#define JXL_OS_FREEBSD 1
+#else
+#define JXL_OS_FREEBSD 0
+#endif
+
+#ifdef __HAIKU__
+#define JXL_OS_HAIKU 1
+#else
+#define JXL_OS_HAIKU 0
+#endif
+
+#endif // LIB_JXL_BASE_OS_MACROS_H_
diff --git a/third_party/jpeg-xl/lib/jxl/base/override.h b/third_party/jpeg-xl/lib/jxl/base/override.h
new file mode 100644
index 0000000000..1f8b657974
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/base/override.h
@@ -0,0 +1,29 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_BASE_OVERRIDE_H_
+#define LIB_JXL_BASE_OVERRIDE_H_
+
+// 'Trool' for command line arguments: force enable/disable, or use default.
+
+namespace jxl {
+
+// No effect if kDefault, otherwise forces a feature (typically a FrameHeader
+// flag) on or off.
+enum class Override : int { kOn = 1, kOff = 0, kDefault = -1 };
+
+static inline Override OverrideFromBool(bool flag) {
+ return flag ? Override::kOn : Override::kOff;
+}
+
+static inline bool ApplyOverride(Override o, bool default_condition) {
+ if (o == Override::kOn) return true;
+ if (o == Override::kOff) return false;
+ return default_condition;
+}
+
+} // namespace jxl
+
+#endif // LIB_JXL_BASE_OVERRIDE_H_
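
A sketch (outside the patch; ResolveFeature is hypothetical) of resolving the tri-state against a context-dependent default:

    #include "lib/jxl/base/override.h"

    bool ResolveFeature(jxl::Override flag, bool enabled_by_default) {
      // kOn/kOff force the decision; kDefault defers to the context.
      return jxl::ApplyOverride(flag, enabled_by_default);
    }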
diff --git a/third_party/jpeg-xl/lib/jxl/base/padded_bytes.cc b/third_party/jpeg-xl/lib/jxl/base/padded_bytes.cc
new file mode 100644
index 0000000000..11e4bff6fe
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/base/padded_bytes.cc
@@ -0,0 +1,63 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/base/padded_bytes.h"
+
+namespace jxl {
+
+void PaddedBytes::IncreaseCapacityTo(size_t capacity) {
+ JXL_ASSERT(capacity > capacity_);
+
+ size_t new_capacity = std::max(capacity, 3 * capacity_ / 2);
+ new_capacity = std::max<size_t>(64, new_capacity);
+
+ // BitWriter writes up to 7 bytes past the end.
+ CacheAlignedUniquePtr new_data = AllocateArray(new_capacity + 8);
+ if (new_data == nullptr) {
+ // Allocation failed, discard all data to ensure this is noticed.
+ size_ = capacity_ = 0;
+ return;
+ }
+
+ if (data_ == nullptr) {
+ // First allocation: ensure first byte is initialized (won't be copied).
+ new_data[0] = 0;
+ } else {
+ // Subsequent resize: copy existing data to new location.
+ memcpy(new_data.get(), data_.get(), size_);
+ // Ensure that the first new byte is initialized, to allow write_bits to
+ // safely append to the newly-resized PaddedBytes.
+ new_data[size_] = 0;
+ }
+
+ capacity_ = new_capacity;
+ std::swap(new_data, data_);
+}
+
+void PaddedBytes::assign(const uint8_t* new_begin, const uint8_t* new_end) {
+ JXL_DASSERT(new_begin <= new_end);
+ const size_t new_size = static_cast<size_t>(new_end - new_begin);
+
+ // memcpy requires non-overlapping ranges, and resizing might invalidate the
+ // new range. Neither happens if the new range is completely to the left or
+ // right of the _allocated_ range (irrespective of size_).
+ const uint8_t* allocated_end = begin() + capacity_;
+ const bool outside = new_end <= begin() || new_begin >= allocated_end;
+ if (outside) {
+ resize(new_size); // grow or shrink
+ memcpy(data(), new_begin, new_size);
+ return;
+ }
+
+ // There is overlap. The new size cannot be larger because we own the memory
+ // and the new range cannot include anything outside the allocated range.
+ JXL_ASSERT(new_size <= capacity_);
+
+ // memmove allows overlap and capacity_ is sufficient.
+ memmove(data(), new_begin, new_size);
+ size_ = new_size; // shrink
+}
+
+} // namespace jxl
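
A sketch (outside the patch; DropPrefix is hypothetical) of the overlapping-assign path described above: the source range lies inside the allocation, so assign() takes the memmove branch and shrinks in place:

    #include "lib/jxl/base/padded_bytes.h"

    void DropPrefix(jxl::PaddedBytes* bytes, size_t n) {
      if (n >= bytes->size()) {
        bytes->clear();
        return;
      }
      // Overlapping source: memmove is used and size() shrinks by n.
      bytes->assign(bytes->data() + n, bytes->data() + bytes->size());
    }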
diff --git a/third_party/jpeg-xl/lib/jxl/base/padded_bytes.h b/third_party/jpeg-xl/lib/jxl/base/padded_bytes.h
new file mode 100644
index 0000000000..4534ddf863
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/base/padded_bytes.h
@@ -0,0 +1,197 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_BASE_PADDED_BYTES_H_
+#define LIB_JXL_BASE_PADDED_BYTES_H_
+
+// std::vector replacement with padding to reduce bounds checks in WriteBits
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h> // memcpy
+
+#include <algorithm> // max
+#include <initializer_list>
+#include <utility> // swap
+
+#include "lib/jxl/base/cache_aligned.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/status.h"
+
+namespace jxl {
+
+// Provides a subset of the std::vector interface with some differences:
+// - allows BitWriter to write 64 bits at a time without bounds checking;
+// - ONLY zero-initializes the first byte (required by BitWriter);
+// - ensures cache-line alignment.
+class PaddedBytes {
+ public:
+ // Required for output params.
+ PaddedBytes() : size_(0), capacity_(0) {}
+
+ explicit PaddedBytes(size_t size) : size_(size), capacity_(0) {
+ if (size != 0) IncreaseCapacityTo(size);
+ }
+
+ PaddedBytes(size_t size, uint8_t value) : size_(size), capacity_(0) {
+ if (size != 0) {
+ IncreaseCapacityTo(size);
+ }
+ if (size_ != 0) {
+ memset(data(), value, size);
+ }
+ }
+
+ PaddedBytes(const PaddedBytes& other) : size_(other.size_), capacity_(0) {
+ if (size_ != 0) IncreaseCapacityTo(size_);
+ if (data() != nullptr) memcpy(data(), other.data(), size_);
+ }
+ PaddedBytes& operator=(const PaddedBytes& other) {
+ // Self-assignment is safe.
+ resize(other.size());
+ if (data() != nullptr) memmove(data(), other.data(), size_);
+ return *this;
+ }
+
+ // default is not OK - need to set other.size_ to 0!
+ PaddedBytes(PaddedBytes&& other) noexcept
+ : size_(other.size_),
+ capacity_(other.capacity_),
+ data_(std::move(other.data_)) {
+ other.size_ = other.capacity_ = 0;
+ }
+ PaddedBytes& operator=(PaddedBytes&& other) noexcept {
+ size_ = other.size_;
+ capacity_ = other.capacity_;
+ data_ = std::move(other.data_);
+
+ if (&other != this) {
+ other.size_ = other.capacity_ = 0;
+ }
+ return *this;
+ }
+
+ void swap(PaddedBytes& other) {
+ std::swap(size_, other.size_);
+ std::swap(capacity_, other.capacity_);
+ std::swap(data_, other.data_);
+ }
+
+ void reserve(size_t capacity) {
+ if (capacity > capacity_) IncreaseCapacityTo(capacity);
+ }
+
+ // NOTE: unlike vector, this does not initialize the new data!
+ // However, we guarantee that write_bits can safely append after
+ // the resize, as we zero-initialize the first new byte of data.
+ // If size < capacity(), does not invalidate the memory.
+ void resize(size_t size) {
+ if (size > capacity_) IncreaseCapacityTo(size);
+ size_ = (data() == nullptr) ? 0 : size;
+ }
+
+ // resize(size) plus explicit initialization of the new data with `value`.
+ void resize(size_t size, uint8_t value) {
+ size_t old_size = size_;
+ resize(size);
+ if (size_ > old_size) {
+ memset(data() + old_size, value, size_ - old_size);
+ }
+ }
+
+ // Amortized constant complexity due to exponential growth.
+ void push_back(uint8_t x) {
+ if (size_ == capacity_) {
+ IncreaseCapacityTo(capacity_ + 1);
+ if (data() == nullptr) return;
+ }
+
+ data_[size_++] = x;
+ }
+
+ size_t size() const { return size_; }
+ size_t capacity() const { return capacity_; }
+
+ uint8_t* data() { return data_.get(); }
+ const uint8_t* data() const { return data_.get(); }
+
+ // std::vector operations implemented in terms of the public interface above.
+
+ void clear() { resize(0); }
+ bool empty() const { return size() == 0; }
+
+ void assign(std::initializer_list<uint8_t> il) {
+ resize(il.size());
+ memcpy(data(), il.begin(), il.size());
+ }
+
+ // Replaces data() with [new_begin, new_end); potentially reallocates.
+ void assign(const uint8_t* new_begin, const uint8_t* new_end);
+
+ uint8_t* begin() { return data(); }
+ const uint8_t* begin() const { return data(); }
+ uint8_t* end() { return begin() + size(); }
+ const uint8_t* end() const { return begin() + size(); }
+
+ uint8_t& operator[](const size_t i) {
+ BoundsCheck(i);
+ return data()[i];
+ }
+ const uint8_t& operator[](const size_t i) const {
+ BoundsCheck(i);
+ return data()[i];
+ }
+
+ uint8_t& back() {
+ JXL_ASSERT(size() != 0);
+ return data()[size() - 1];
+ }
+ const uint8_t& back() const {
+ JXL_ASSERT(size() != 0);
+ return data()[size() - 1];
+ }
+
+ template <typename T>
+ void append(const T& other) {
+ append(reinterpret_cast<const uint8_t*>(other.data()),
+ reinterpret_cast<const uint8_t*>(other.data()) + other.size());
+ }
+
+ void append(const uint8_t* begin, const uint8_t* end) {
+ if (end - begin > 0) {
+ size_t old_size = size();
+ resize(size() + (end - begin));
+ memcpy(data() + old_size, begin, end - begin);
+ }
+ }
+
+ private:
+ void BoundsCheck(size_t i) const {
+ // <= is safe due to padding and required by BitWriter.
+ JXL_ASSERT(i <= size());
+ }
+
+ // Copies existing data to newly allocated "data_". If allocation fails,
+ // data() == nullptr and size_ = capacity_ = 0.
+ // The new capacity will be at least 1.5 times the old capacity. This ensures
+ // that we avoid quadratic behaviour.
+ void IncreaseCapacityTo(size_t capacity);
+
+ size_t size_;
+ size_t capacity_;
+ CacheAlignedUniquePtr data_;
+};
+
+template <typename T>
+static inline void Append(const T& s, PaddedBytes* out,
+ size_t* JXL_RESTRICT byte_pos) {
+ memcpy(out->data() + *byte_pos, s.data(), s.size());
+ *byte_pos += s.size();
+ JXL_CHECK(*byte_pos <= out->size());
+}
+
+} // namespace jxl
+
+#endif // LIB_JXL_BASE_PADDED_BYTES_H_
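
A sketch (outside the patch; hypothetical caller) of the vector-like surface. Plain resize() leaves new bytes uninitialized except the first, so the two-argument form is used for the padding here:

    #include <stdint.h>

    #include "lib/jxl/base/padded_bytes.h"

    void PaddedBytesExample() {
      jxl::PaddedBytes bytes;
      bytes.assign({0xFF, 0x0A});         // {0xFF, 0x0A}
      bytes.push_back(0x01);              // {0xFF, 0x0A, 0x01}
      bytes.resize(8, 0);                 // grow to 8 bytes, new bytes zeroed
      const uint8_t last = bytes.back();  // 0x00
      (void)last;
    }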
diff --git a/third_party/jpeg-xl/lib/jxl/base/printf_macros.h b/third_party/jpeg-xl/lib/jxl/base/printf_macros.h
new file mode 100644
index 0000000000..3215052afd
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/base/printf_macros.h
@@ -0,0 +1,34 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_BASE_PRINTF_MACROS_H_
+#define LIB_JXL_BASE_PRINTF_MACROS_H_
+
+// Format string macros. These should be included after any other system
+// library since those may unconditionally define these, depending on the
+// platform.
+
+// PRIuS and PRIdS macros to print size_t and ssize_t respectively.
+#if !defined(PRIdS)
+#if defined(_WIN64)
+#define PRIdS "lld"
+#elif defined(_WIN32)
+#define PRIdS "d"
+#else
+#define PRIdS "zd"
+#endif
+#endif // PRIdS
+
+#if !defined(PRIuS)
+#if defined(_WIN64)
+#define PRIuS "llu"
+#elif defined(_WIN32)
+#define PRIuS "u"
+#else
+#define PRIuS "zu"
+#endif
+#endif // PRIuS
+
+#endif // LIB_JXL_BASE_PRINTF_MACROS_H_
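
A sketch (outside the patch; PrintAllocation is hypothetical) of printing a size_t portably across Windows and POSIX with the macros above:

    #include <stdio.h>

    #include "lib/jxl/base/printf_macros.h"

    void PrintAllocation(size_t bytes) {
      printf("allocated %" PRIuS " bytes\n", bytes);
    }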
diff --git a/third_party/jpeg-xl/lib/jxl/base/profiler.cc b/third_party/jpeg-xl/lib/jxl/base/profiler.cc
new file mode 100644
index 0000000000..a38d9b82b7
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/base/profiler.cc
@@ -0,0 +1,540 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/base/profiler.h"
+
+#if JXL_PROFILER_ENABLED
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h> // memcpy
+
+#include <algorithm> // sort
+#include <atomic>
+#include <cinttypes> // PRIu64
+#include <hwy/cache_control.h>
+#include <limits>
+#include <new>
+
+// Optionally use SIMD in StreamCacheLine if available.
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/base/profiler.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace profiler {
+namespace HWY_NAMESPACE {
+
+// Overwrites `to` without loading it into cache (read-for-ownership).
+// Copies 64 bytes from/to naturally aligned addresses.
+void StreamCacheLine(const Packet* HWY_RESTRICT from, Packet* HWY_RESTRICT to) {
+#if HWY_TARGET == HWY_SCALAR
+ hwy::CopyBytes<64>(from, to);
+#else
+ const HWY_CAPPED(uint64_t, 2) d;
+ HWY_FENCE;
+ const uint64_t* HWY_RESTRICT from64 = reinterpret_cast<const uint64_t*>(from);
+ const auto v0 = Load(d, from64 + 0);
+ const auto v1 = Load(d, from64 + 2);
+ const auto v2 = Load(d, from64 + 4);
+ const auto v3 = Load(d, from64 + 6);
+ // Fences prevent the compiler from reordering loads/stores, which may
+ // interfere with write-combining.
+ HWY_FENCE;
+ uint64_t* HWY_RESTRICT to64 = reinterpret_cast<uint64_t*>(to);
+ Stream(v0, d, to64 + 0);
+ Stream(v1, d, to64 + 2);
+ Stream(v2, d, to64 + 4);
+ Stream(v3, d, to64 + 6);
+ HWY_FENCE;
+#endif
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace profiler
+} // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+namespace profiler {
+
+HWY_EXPORT(StreamCacheLine);
+
+namespace {
+
+// How many mebibytes to allocate (if JXL_PROFILER_ENABLED) per thread that
+// enters at least one zone. Once this buffer is full, the thread will analyze
+// packets (two per zone), which introduces observer overhead.
+#ifndef PROFILER_THREAD_STORAGE
+#define PROFILER_THREAD_STORAGE 32ULL
+#endif
+
+#define PROFILER_PRINT_OVERHEAD 0
+
+// Upper bounds for fixed-size data structures (guarded via HWY_ASSERT):
+constexpr size_t kMaxDepth = 64; // Maximum nesting of zones.
+constexpr size_t kMaxZones = 256; // Total number of zones.
+
+// Stack of active (entered but not exited) zones. POD, uninitialized.
+// Used to deduct child duration from the parent's self time.
+struct ActiveZone {
+ const char* name;
+ uint64_t entry_timestamp;
+ uint64_t child_total;
+};
+
+// Totals for all Zones with the same name. POD, must be zero-initialized.
+struct ZoneTotals {
+ uint64_t total_duration;
+ const char* name;
+ uint64_t num_calls;
+};
+
+template <typename T>
+inline T ClampedSubtract(const T minuend, const T subtrahend) {
+ if (subtrahend > minuend) {
+ return 0;
+ }
+ return minuend - subtrahend;
+}
+
+} // namespace
+
+// Per-thread call graph (stack) and ZoneTotals for each zone.
+class Results {
+ public:
+ Results() {
+ // Zero-initialize all accumulators (avoids a check for num_zones_ == 0).
+ memset(zones_, 0, sizeof(zones_));
+ }
+
+ // Used for computing overhead when this thread encounters its first Zone.
+ // This has no observable effect apart from increasing "analyze_elapsed_".
+ uint64_t ZoneDuration(const Packet* packets) {
+ HWY_ASSERT(depth_ == 0);
+ HWY_ASSERT(num_zones_ == 0);
+ AnalyzePackets(packets, 2);
+ const uint64_t duration = zones_[0].total_duration;
+ zones_[0].num_calls = 0;
+ zones_[0].total_duration = 0;
+ HWY_ASSERT(depth_ == 0);
+ num_zones_ = 0;
+ return duration;
+ }
+
+ void SetSelfOverhead(const uint64_t self_overhead) {
+ self_overhead_ = self_overhead;
+ }
+
+ void SetChildOverhead(const uint64_t child_overhead) {
+ child_overhead_ = child_overhead;
+ }
+
+ // Draw all required information from the packets, which can be discarded
+ // afterwards. Called whenever this thread's storage is full.
+ void AnalyzePackets(const Packet* HWY_RESTRICT packets,
+ const size_t num_packets) {
+ // Ensures prior weakly-ordered streaming stores are globally visible.
+ hwy::FlushStream();
+
+ const uint64_t t0 = TicksBefore();
+
+ for (size_t i = 0; i < num_packets; ++i) {
+ const uint64_t timestamp = packets[i].timestamp;
+ // Entering a zone
+ if (packets[i].name != nullptr) {
+ HWY_ASSERT(depth_ < kMaxDepth);
+ zone_stack_[depth_].name = packets[i].name;
+ zone_stack_[depth_].entry_timestamp = timestamp;
+ zone_stack_[depth_].child_total = 0;
+ ++depth_;
+ continue;
+ }
+
+ HWY_ASSERT(depth_ != 0);
+ const ActiveZone& active = zone_stack_[depth_ - 1];
+ const uint64_t duration = timestamp - active.entry_timestamp;
+ const uint64_t self_duration = ClampedSubtract(
+ duration, self_overhead_ + child_overhead_ + active.child_total);
+
+ UpdateOrAdd(active.name, 1, self_duration);
+ --depth_;
+
+ // "Deduct" the nested time from its parent's self_duration.
+ if (depth_ != 0) {
+ zone_stack_[depth_ - 1].child_total += duration + child_overhead_;
+ }
+ }
+
+ const uint64_t t1 = TicksAfter();
+ analyze_elapsed_ += t1 - t0;
+ }
+
+ // Incorporates results from another thread. Call after all threads have
+ // exited any zones.
+ void Assimilate(const Results& other) {
+ const uint64_t t0 = TicksBefore();
+ HWY_ASSERT(depth_ == 0);
+ HWY_ASSERT(other.depth_ == 0);
+
+ for (size_t i = 0; i < other.num_zones_; ++i) {
+ const ZoneTotals& zone = other.zones_[i];
+ UpdateOrAdd(zone.name, zone.num_calls, zone.total_duration);
+ }
+ const uint64_t t1 = TicksAfter();
+ analyze_elapsed_ += t1 - t0 + other.analyze_elapsed_;
+ }
+
+ // Single-threaded.
+ void Print() {
+ const uint64_t t0 = TicksBefore();
+ MergeDuplicates();
+
+ // Sort by decreasing total (self) cost.
+ std::sort(zones_, zones_ + num_zones_,
+ [](const ZoneTotals& r1, const ZoneTotals& r2) {
+ return r1.total_duration > r2.total_duration;
+ });
+
+ uint64_t total_visible_duration = 0;
+ for (size_t i = 0; i < num_zones_; ++i) {
+ const ZoneTotals& r = zones_[i];
+ if (r.name[0] != '@') {
+ total_visible_duration += r.total_duration;
+ printf("%-40s: %10" PRIu64 " x %15" PRIu64 "= %15" PRIu64 "\n", r.name,
+ r.num_calls, r.total_duration / r.num_calls, r.total_duration);
+ }
+ }
+
+ const uint64_t t1 = TicksAfter();
+ analyze_elapsed_ += t1 - t0;
+ printf("Total clocks during analysis: %" PRIu64 "\n", analyze_elapsed_);
+ printf("Total clocks measured: %" PRIu64 "\n", total_visible_duration);
+ }
+
+ // Single-threaded. Clears all results as if no zones had been recorded.
+ void Reset() {
+ analyze_elapsed_ = 0;
+ HWY_ASSERT(depth_ == 0);
+ num_zones_ = 0;
+ memset(zone_stack_, 0, sizeof(zone_stack_));
+ memset(zones_, 0, sizeof(zones_));
+ }
+
+ private:
+ // Updates ZoneTotals of the same name, or inserts a new one if this thread
+ // has not yet seen that name. Uses a self-organizing list data structure,
+ // which avoids dynamic memory allocations and is faster than unordered_map.
+ void UpdateOrAdd(const char* name, const uint64_t num_calls,
+ const uint64_t duration) {
+ // Special case for first zone: (maybe) update, without swapping.
+ if (zones_[0].name == name) {
+ zones_[0].total_duration += duration;
+ zones_[0].num_calls += num_calls;
+ return;
+ }
+
+ // Look for a zone with the same name.
+ for (size_t i = 1; i < num_zones_; ++i) {
+ if (zones_[i].name == name) {
+ zones_[i].total_duration += duration;
+ zones_[i].num_calls += num_calls;
+ // Swap with predecessor (more conservative than move to front,
+ // but at least as successful).
+ std::swap(zones_[i - 1], zones_[i]);
+ return;
+ }
+ }
+
+ // Not found; create a new ZoneTotals.
+ HWY_ASSERT(num_zones_ < kMaxZones);
+ ZoneTotals* HWY_RESTRICT zone = zones_ + num_zones_;
+ zone->name = name;
+ zone->num_calls = num_calls;
+ zone->total_duration = duration;
+ ++num_zones_;
+ }
+
+ // Each instantiation of a function template seems to get its own copy of
+ // __func__ and GCC doesn't merge them. An N^2 search for duplicates is
+ // acceptable because we only expect a few dozen zones.
+ void MergeDuplicates() {
+ for (size_t i = 0; i < num_zones_; ++i) {
+ // Add any subsequent duplicates to num_calls and total_duration.
+ for (size_t j = i + 1; j < num_zones_;) {
+ if (!strcmp(zones_[i].name, zones_[j].name)) {
+ zones_[i].num_calls += zones_[j].num_calls;
+ zones_[i].total_duration += zones_[j].total_duration;
+ // Fill hole with last item.
+ zones_[j] = zones_[--num_zones_];
+ } else { // Name differed, try next ZoneTotals.
+ ++j;
+ }
+ }
+ }
+ }
+
+ uint64_t analyze_elapsed_ = 0;
+ uint64_t self_overhead_ = 0;
+ uint64_t child_overhead_ = 0;
+
+ size_t depth_ = 0; // Number of active zones <= kMaxDepth.
+ size_t num_zones_ = 0; // Number of unique zones <= kMaxZones.
+
+ // After other members to avoid large pointer offsets.
+ alignas(64) ActiveZone zone_stack_[kMaxDepth]; // Last = newest
+ alignas(64) ZoneTotals zones_[kMaxZones]; // Self-organizing list
+};
+
+ThreadSpecific::ThreadSpecific()
+ : max_packets_(PROFILER_THREAD_STORAGE << 16), // MiB / sizeof(Packet)
+ packets_(hwy::AllocateAligned<Packet>(max_packets_)),
+ num_packets_(0),
+ results_(hwy::MakeUniqueAligned<Results>()) {}
+
+ThreadSpecific::~ThreadSpecific() {}
+
+void ThreadSpecific::FlushBuffer() {
+ if (num_packets_ + kBufferCapacity > max_packets_) {
+ results_->AnalyzePackets(packets_.get(), num_packets_);
+ num_packets_ = 0;
+ }
+ // This buffering halves observer overhead and decreases the overall
+ // runtime by about 3%.
+ HWY_DYNAMIC_DISPATCH(StreamCacheLine)
+ (buffer_, packets_.get() + num_packets_);
+ num_packets_ += kBufferCapacity;
+ buffer_size_ = 0;
+}
+
+void ThreadSpecific::AnalyzeRemainingPackets() {
+ // Storage full => empty it.
+ if (num_packets_ + buffer_size_ > max_packets_) {
+ results_->AnalyzePackets(packets_.get(), num_packets_);
+ num_packets_ = 0;
+ }
+
+ // Move buffer to storage
+ memcpy(packets_.get() + num_packets_, buffer_, buffer_size_ * sizeof(Packet));
+ num_packets_ += buffer_size_;
+ buffer_size_ = 0;
+
+ results_->AnalyzePackets(packets_.get(), num_packets_);
+ num_packets_ = 0;
+}
+
+namespace {
+
+class HalfSampleMode {
+ public:
+ // Returns mode. "sorted" must be in ascending order.
+ template <typename T>
+ T operator()(const T* const HWY_RESTRICT sorted,
+ const size_t num_values) const {
+ int64_t center = num_values / 2;
+ int64_t width = num_values;
+
+ // Zoom in on modal intervals of decreasing width. Stop before we reach
+ // width=1, i.e. single values, for which there is no "slope".
+ while (width > 2) {
+ // Round up so we can still reach the outer edges of odd widths.
+ width = (width + 1) / 2;
+
+ center = CenterOfIntervalWithMinSlope(sorted, num_values, center, width);
+ }
+
+ return sorted[center]; // mode := middle value in modal interval.
+ }
+
+ private:
+ // Returns center of the densest region [c-radius, c+radius].
+ template <typename T>
+ static HWY_INLINE int64_t CenterOfIntervalWithMinSlope(
+ const T* HWY_RESTRICT sorted, const int64_t total_values,
+ const int64_t center, const int64_t width) {
+ const int64_t radius = (width + 1) / 2;
+
+ auto compute_slope = [radius, total_values, sorted](
+ int64_t c, int64_t* actual_center = nullptr) {
+ // For symmetry, check 2*radius+1 values, i.e. [min, max].
+ const int64_t min = std::max(c - radius, int64_t(0));
+ const int64_t max = std::min(c + radius, total_values - 1);
+ HWY_ASSERT(min < max);
+ HWY_ASSERT(sorted[min] <=
+ sorted[max] + std::numeric_limits<float>::epsilon());
+ const float dx = max - min + 1;
+ const float slope = (sorted[max] - sorted[min]) / dx;
+
+ if (actual_center != nullptr) {
+ // c may be out of bounds, so return center of the clamped bounds.
+ *actual_center = (min + max + 1) / 2;
+ }
+ return slope;
+ };
+
+ // First find min_slope for all centers.
+ float min_slope = std::numeric_limits<float>::max();
+ for (int64_t c = center - radius; c <= center + radius; ++c) {
+ min_slope = std::min(min_slope, compute_slope(c));
+ }
+
+ // Candidates := centers with slope ~= min_slope.
+ std::vector<int64_t> candidates;
+ for (int64_t c = center - radius; c <= center + radius; ++c) {
+ int64_t actual_center;
+ const float slope = compute_slope(c, &actual_center);
+ if (slope <= min_slope * 1.001f) {
+ candidates.push_back(actual_center);
+ }
+ }
+
+ // Keep the median.
+ HWY_ASSERT(!candidates.empty());
+ if (candidates.size() == 1) return candidates[0];
+ std::nth_element(candidates.begin(),
+ candidates.begin() + candidates.size() / 2,
+ candidates.end());
+ return candidates[candidates.size() / 2];
+ }
+};
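+
+// Illustrative sketch of the intended call pattern (sample count is
+// hypothetical; see ComputeOverhead below for the real call sites):
+//   uint32_t durations[1024] = {/* measured timings */};
+//   std::sort(durations, durations + 1024);  // ascending order required
+//   const uint32_t mode = HalfSampleMode()(durations, 1024);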
+
+} // namespace
+
+void ThreadSpecific::ComputeOverhead() {
+ // Delay after capturing timestamps before/after the actual zone runs. Even
+ // with frequency throttling disabled, this has a multimodal distribution,
+ // including 32, 34, 48, 52, 59, 62.
+ uint64_t self_overhead;
+ {
+ const size_t kNumSamples = 32;
+ uint32_t samples[kNumSamples];
+ for (size_t idx_sample = 0; idx_sample < kNumSamples; ++idx_sample) {
+ const size_t kNumDurations = 1024;
+ uint32_t durations[kNumDurations];
+
+ for (size_t idx_duration = 0; idx_duration < kNumDurations;
+ ++idx_duration) {
+ { //
+ PROFILER_ZONE("Dummy Zone (never shown)");
+ }
+ const uint64_t duration = results_->ZoneDuration(buffer_);
+ buffer_size_ = 0;
+ durations[idx_duration] = static_cast<uint32_t>(duration);
+ HWY_ASSERT(num_packets_ == 0);
+ }
+ std::sort(durations, durations + kNumDurations);
+ samples[idx_sample] = HalfSampleMode()(durations, kNumDurations);
+ }
+ // Median.
+ std::sort(samples, samples + kNumSamples);
+ self_overhead = samples[kNumSamples / 2];
+#if PROFILER_PRINT_OVERHEAD
+ printf("Overhead: %" PRIu64 "\n", static_cast<uint64_t>(self_overhead));
+#endif
+ results_->SetSelfOverhead(self_overhead);
+ }
+
+ // Delay before capturing start timestamp / after end timestamp.
+ const size_t kNumSamples = 32;
+ uint32_t samples[kNumSamples];
+ for (size_t idx_sample = 0; idx_sample < kNumSamples; ++idx_sample) {
+ const size_t kNumDurations = 16;
+ uint32_t durations[kNumDurations];
+ for (size_t idx_duration = 0; idx_duration < kNumDurations;
+ ++idx_duration) {
+ const size_t kReps = 10000;
+ // Analysis time should not be included => must fit within buffer.
+ HWY_ASSERT(kReps * 2 < max_packets_);
+ hwy::FlushStream();
+ const uint64_t t0 = TicksBefore();
+ for (size_t i = 0; i < kReps; ++i) {
+ PROFILER_ZONE("Dummy");
+ }
+ hwy::FlushStream();
+ const uint64_t t1 = TicksAfter();
+ HWY_ASSERT(num_packets_ + buffer_size_ == kReps * 2);
+ buffer_size_ = 0;
+ num_packets_ = 0;
+ const uint64_t avg_duration = (t1 - t0 + kReps / 2) / kReps;
+ durations[idx_duration] =
+ static_cast<uint32_t>(ClampedSubtract(avg_duration, self_overhead));
+ }
+ std::sort(durations, durations + kNumDurations);
+ samples[idx_sample] = HalfSampleMode()(durations, kNumDurations);
+ }
+ std::sort(samples, samples + kNumSamples);
+ const uint64_t child_overhead = samples[9 * kNumSamples / 10];
+#if PROFILER_PRINT_OVERHEAD
+ printf("Child overhead: %" PRIu64 "\n",
+ static_cast<uint64_t>(child_overhead));
+#endif
+ results_->SetChildOverhead(child_overhead);
+}
+
+namespace {
+
+// Could be a static member of Zone, but that would expose <atomic> in header.
+std::atomic<ThreadSpecific*>& GetHead() {
+ static std::atomic<ThreadSpecific*> head_{nullptr}; // Owning
+ return head_;
+}
+
+} // namespace
+
+// Thread-safe.
+ThreadSpecific* Zone::InitThreadSpecific() {
+ ThreadSpecific* thread_specific =
+ hwy::MakeUniqueAligned<ThreadSpecific>().release();
+
+ // Insert into unordered list
+ std::atomic<ThreadSpecific*>& head = GetHead();
+ ThreadSpecific* old_head = head.load(std::memory_order_relaxed);
+ thread_specific->SetNext(old_head);
+ while (!head.compare_exchange_weak(old_head, thread_specific,
+ std::memory_order_release,
+ std::memory_order_relaxed)) {
+ thread_specific->SetNext(old_head);
+ // TODO(janwas): pause
+ }
+
+ // ComputeOverhead also creates a Zone, so this needs to be set before that
+ // to prevent infinite recursion.
+ GetThreadSpecific() = thread_specific;
+
+ thread_specific->ComputeOverhead();
+ return thread_specific;
+}
+
+// Single-threaded.
+/*static*/ void Zone::PrintResults() {
+ ThreadSpecific* head = GetHead().load(std::memory_order_relaxed);
+ ThreadSpecific* p = head;
+ while (p) {
+ p->AnalyzeRemainingPackets();
+
+    // Combine all threads into a single Results object.
+ if (p != head) {
+ head->GetResults().Assimilate(p->GetResults());
+ p->GetResults().Reset();
+ }
+
+ p = p->GetNext();
+ }
+
+ if (head != nullptr) {
+ head->GetResults().Print();
+ head->GetResults().Reset();
+ }
+}
+
+} // namespace profiler
+} // namespace jxl
+
+#endif // HWY_ONCE
+#endif // JXL_PROFILER_ENABLED
diff --git a/third_party/jpeg-xl/lib/jxl/base/profiler.h b/third_party/jpeg-xl/lib/jxl/base/profiler.h
new file mode 100644
index 0000000000..4c0efa4b3a
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/base/profiler.h
@@ -0,0 +1,170 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_BASE_PROFILER_H_
+#define LIB_JXL_BASE_PROFILER_H_
+
+// High-precision, low-overhead time measurements. Reports exact call counts
+// and total elapsed time for user-defined 'zones' (code regions, i.e. C++
+// scopes).
+//
+// To use the profiler you must set the JPEGXL_ENABLE_PROFILER CMake flag, which
+// defines JXL_PROFILER_ENABLED.
+//
+// Usage: instrument regions of interest: { PROFILER_ZONE("name"); /*code*/ } or
+// void FuncToMeasure() { PROFILER_FUNC; /*code*/ }.
+// After all threads have exited any zones, invoke PROFILER_PRINT_RESULTS() to
+// print call counts and average durations [CPU cycles] to stdout, sorted in
+// descending order of total duration.
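+//
+// Illustrative example (function and zone names are hypothetical):
+//   void DecodeAllFrames(size_t num_frames) {
+//     PROFILER_FUNC;  // zone covering the whole function
+//     for (size_t i = 0; i < num_frames; ++i) {
+//       PROFILER_ZONE("decode frame");
+//       // per-frame work
+//     }
+//   }
+//   // After all threads have exited their zones:
+//   PROFILER_PRINT_RESULTS();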
+
+// If zero, this file has no effect and no measurements will be recorded.
+#ifndef JXL_PROFILER_ENABLED
+#define JXL_PROFILER_ENABLED 0
+#endif
+#if JXL_PROFILER_ENABLED
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <hwy/aligned_allocator.h>
+#include <hwy/base.h>
+
+#include "lib/jxl/base/tsc_timer.h"
+
+#if HWY_COMPILER_MSVC
+#define PROFILER_PUBLIC
+#else
+#define PROFILER_PUBLIC __attribute__((visibility("default")))
+#endif
+
+namespace jxl {
+namespace profiler {
+
+// Represents zone entry/exit events. POD.
+#pragma pack(push, 1)
+struct Packet {
+  // Computing a hash or string table is likely too expensive, and offsets
+  // from other libraries' string literals can be too large to pack both an
+  // offset and a full-resolution timestamp into 64 bits.
+ uint64_t timestamp;
+ const char* name; // nullptr for exit packets
+#if UINTPTR_MAX <= 0xFFFFFFFFu
+ uint32_t padding;
+#endif
+};
+#pragma pack(pop)
+static_assert(sizeof(Packet) == 16, "Wrong Packet size");
+
+class Results; // pImpl
+
+// Per-thread packet storage, dynamically allocated and aligned.
+class ThreadSpecific {
+ static constexpr size_t kBufferCapacity = 64 / sizeof(Packet);
+
+ public:
+ PROFILER_PUBLIC explicit ThreadSpecific();
+ PROFILER_PUBLIC ~ThreadSpecific();
+
+ // Depends on Zone => defined out of line.
+ PROFILER_PUBLIC void ComputeOverhead();
+
+ HWY_INLINE void WriteEntry(const char* name) { Write(name, TicksBefore()); }
+ HWY_INLINE void WriteExit() { Write(nullptr, TicksAfter()); }
+
+ PROFILER_PUBLIC void AnalyzeRemainingPackets();
+
+ // Accessors instead of public member for well-defined data layout.
+ void SetNext(ThreadSpecific* next) { next_ = next; }
+ ThreadSpecific* GetNext() const { return next_; }
+
+ Results& GetResults() { return *results_; }
+
+ private:
+ PROFILER_PUBLIC void FlushBuffer();
+
+ // Write packet to buffer/storage, emptying them as needed.
+ void Write(const char* name, const uint64_t timestamp) {
+ if (buffer_size_ == kBufferCapacity) { // Full
+ FlushBuffer();
+ }
+ buffer_[buffer_size_].name = name;
+ buffer_[buffer_size_].timestamp = timestamp;
+ ++buffer_size_;
+ }
+
+ // Write-combining buffer to avoid cache pollution. Must be the first
+ // non-static member to ensure cache-line alignment.
+ Packet buffer_[kBufferCapacity];
+ size_t buffer_size_ = 0;
+
+ // Contiguous storage for zone enter/exit packets.
+ const size_t max_packets_;
+ hwy::AlignedFreeUniquePtr<Packet[]> packets_;
+ size_t num_packets_;
+
+ // Linked list of all threads.
+ ThreadSpecific* next_ = nullptr; // Owned, never released.
+
+ hwy::AlignedUniquePtr<Results> results_;
+};
+
+// RAII zone enter/exit recorder constructed by PROFILER_ZONE; also
+// responsible for initializing ThreadSpecific.
+class Zone {
+ public:
+ HWY_NOINLINE explicit Zone(const char* name) {
+ HWY_FENCE;
+ ThreadSpecific* HWY_RESTRICT thread_specific = GetThreadSpecific();
+ if (HWY_UNLIKELY(thread_specific == nullptr)) {
+ thread_specific = InitThreadSpecific();
+ }
+
+ thread_specific->WriteEntry(name);
+ }
+
+ HWY_NOINLINE ~Zone() { GetThreadSpecific()->WriteExit(); }
+
+ // Call exactly once after all threads have exited all zones.
+ PROFILER_PUBLIC static void PrintResults();
+
+ private:
+ // Returns reference to the thread's ThreadSpecific pointer (initially null).
+ // Function-local static avoids needing a separate definition.
+ static ThreadSpecific*& GetThreadSpecific() {
+ static thread_local ThreadSpecific* thread_specific;
+ return thread_specific;
+ }
+
+ // Non time-critical.
+ PROFILER_PUBLIC ThreadSpecific* InitThreadSpecific();
+};
+
+// Creates a zone starting from here until the end of the current scope.
+// Timestamps will be recorded when entering and exiting the zone.
+// To ensure the name pointer remains valid, we require it to be a string
+// literal (by merging with ""). We also compare strings by address.
+#define PROFILER_ZONE(name) \
+ HWY_FENCE; \
+ const ::jxl::profiler::Zone zone("" name); \
+ HWY_FENCE
+
+// Creates a zone for an entire function (when placed at its beginning).
+// Shorter/more convenient than PROFILER_ZONE.
+#define PROFILER_FUNC \
+ HWY_FENCE; \
+ const ::jxl::profiler::Zone zone(__func__); \
+ HWY_FENCE
+
+#define PROFILER_PRINT_RESULTS ::jxl::profiler::Zone::PrintResults
+
+} // namespace profiler
+} // namespace jxl
+
+#else // !JXL_PROFILER_ENABLED
+#define PROFILER_ZONE(name)
+#define PROFILER_FUNC
+#define PROFILER_PRINT_RESULTS()
+#endif
+
+#endif // LIB_JXL_BASE_PROFILER_H_
diff --git a/third_party/jpeg-xl/lib/jxl/base/random.cc b/third_party/jpeg-xl/lib/jxl/base/random.cc
new file mode 100644
index 0000000000..c99f88921c
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/base/random.cc
@@ -0,0 +1,21 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/base/random.h"
+
+#include <cmath>
+
+namespace jxl {
+
+Rng::GeometricDistribution::GeometricDistribution(float p)
+ : inv_log_1mp(1.0 / std::log(1 - p)) {}
+
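+// Inverse-CDF sampling: for a geometric distribution with success probability
+// p, P(X >= k) = (1 - p)^k, so k = floor(log(1 - U) / log(1 - p)) with U
+// uniform in [0, 1) has the desired distribution.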
+uint32_t Rng::Geometric(const GeometricDistribution& dist) {
+ float f = UniformF(0, 1);
+ float log = std::log(1 - f) * dist.inv_log_1mp;
+ return static_cast<uint32_t>(log);
+}
+
+} // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/base/random.h b/third_party/jpeg-xl/lib/jxl/base/random.h
new file mode 100644
index 0000000000..663b88c95d
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/base/random.h
@@ -0,0 +1,95 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_BASE_RANDOM_
+#define LIB_JXL_BASE_RANDOM_
+
+// Random number generator + distributions.
+// We don't use <random> because the implementation (and thus results) differs
+// between libstdc++ and libc++.
+
+#include <stdint.h>
+#include <string.h>
+
+#include <algorithm>
+
+#include "lib/jxl/base/status.h"
+
+namespace jxl {
+struct Rng {
+ explicit Rng(size_t seed)
+ : s{static_cast<uint64_t>(0x94D049BB133111EBull),
+ static_cast<uint64_t>(0xBF58476D1CE4E5B9ull) + seed} {}
+
+ // Xorshift128+ adapted from xorshift128+-inl.h
+ uint64_t operator()() {
+ uint64_t s1 = s[0];
+ const uint64_t s0 = s[1];
+ const uint64_t bits = s1 + s0; // b, c
+ s[0] = s0;
+ s1 ^= s1 << 23;
+ s1 ^= s0 ^ (s1 >> 18) ^ (s0 >> 5);
+ s[1] = s1;
+ return bits;
+ }
+
+  // Uniformly distributed int64_t in [begin, end), under the assumption that
+  // `end-begin` is significantly smaller than 1<<64; otherwise there is some
+  // modulo bias.
+ int64_t UniformI(int64_t begin, int64_t end) {
+ JXL_DASSERT(end > begin);
+ return static_cast<int64_t>((*this)() %
+ static_cast<uint64_t>(end - begin)) +
+ begin;
+ }
+
+ // Same as UniformI, but for uint64_t.
+ uint64_t UniformU(uint64_t begin, uint64_t end) {
+ JXL_DASSERT(end > begin);
+ return (*this)() % (end - begin) + begin;
+ }
+
+  // Uniformly distributed float in the [begin, end) range. Note: only 23 bits
+  // of randomness.
+ float UniformF(float begin, float end) {
+ float f;
+ // Bits of a random [1, 2) float.
+ uint32_t u = ((*this)() >> (64 - 23)) | 0x3F800000;
+ static_assert(sizeof(f) == sizeof(u),
+ "Float and U32 must have the same size");
+ memcpy(&f, &u, sizeof(f));
+    // Note: the expanded form (end-begin) * f + (2*begin-end) may fail to
+    // return a number >= begin due to rounding, hence the formulation below.
+ return (end - begin) * (f - 1.0f) + begin;
+ }
+
+ // Bernoulli trial
+ bool Bernoulli(float p) { return UniformF(0, 1) < p; }
+
+ // State for geometric distributions.
+ struct GeometricDistribution {
+ explicit GeometricDistribution(float p);
+
+ private:
+ float inv_log_1mp;
+ friend struct Rng;
+ };
+
+ uint32_t Geometric(const GeometricDistribution& dist);
+
+ template <typename T>
+ void Shuffle(T* t, size_t n) {
+ for (size_t i = 0; i + 1 < n; i++) {
+ size_t a = UniformU(i, n);
+ std::swap(t[a], t[i]);
+ }
+ }
+
+ private:
+ uint64_t s[2];
+};
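+
+// Illustrative usage sketch (values are hypothetical):
+//   Rng rng(/*seed=*/42);
+//   const uint64_t raw = rng();               // raw 64-bit draw
+//   const int64_t roll = rng.UniformI(1, 7);  // uniform in [1, 7)
+//   Rng::GeometricDistribution geo(0.25f);
+//   const uint32_t failures = rng.Geometric(geo);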
+
+} // namespace jxl
+#endif // LIB_JXL_BASE_RANDOM_
diff --git a/third_party/jpeg-xl/lib/jxl/base/sanitizer_definitions.h b/third_party/jpeg-xl/lib/jxl/base/sanitizer_definitions.h
new file mode 100644
index 0000000000..315f3bd003
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/base/sanitizer_definitions.h
@@ -0,0 +1,44 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_BASE_SANITIZER_DEFINITIONS_H_
+#define LIB_JXL_BASE_SANITIZER_DEFINITIONS_H_
+
+#ifdef MEMORY_SANITIZER
+#define JXL_MEMORY_SANITIZER 1
+#elif defined(__has_feature)
+#if __has_feature(memory_sanitizer)
+#define JXL_MEMORY_SANITIZER 1
+#else
+#define JXL_MEMORY_SANITIZER 0
+#endif
+#else
+#define JXL_MEMORY_SANITIZER 0
+#endif
+
+#ifdef ADDRESS_SANITIZER
+#define JXL_ADDRESS_SANITIZER 1
+#elif defined(__has_feature)
+#if __has_feature(address_sanitizer)
+#define JXL_ADDRESS_SANITIZER 1
+#else
+#define JXL_ADDRESS_SANITIZER 0
+#endif
+#else
+#define JXL_ADDRESS_SANITIZER 0
+#endif
+
+#ifdef THREAD_SANITIZER
+#define JXL_THREAD_SANITIZER 1
+#elif defined(__has_feature)
+#if __has_feature(thread_sanitizer)
+#define JXL_THREAD_SANITIZER 1
+#else
+#define JXL_THREAD_SANITIZER 0
+#endif
+#else
+#define JXL_THREAD_SANITIZER 0
+#endif
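+
+// Illustrative usage sketch: guard sanitizer-specific code, e.g. (assuming
+// MSan's public interface header is available):
+//   #if JXL_MEMORY_SANITIZER
+//   #include "sanitizer/msan_interface.h"
+//   __msan_unpoison(ptr, size);
+//   #endif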
+#endif // LIB_JXL_BASE_SANITIZER_DEFINITIONS_H_
diff --git a/third_party/jpeg-xl/lib/jxl/base/scope_guard.h b/third_party/jpeg-xl/lib/jxl/base/scope_guard.h
new file mode 100644
index 0000000000..a18a44cb79
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/base/scope_guard.h
@@ -0,0 +1,48 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_BASE_SCOPE_GUARD_H_
+#define LIB_JXL_BASE_SCOPE_GUARD_H_
+
+#include <utility>
+
+namespace jxl {
+
+template <typename Callback>
+class ScopeGuard {
+ public:
+ // Discourage unnecessary moves / copies.
+ ScopeGuard(const ScopeGuard &) = delete;
+ ScopeGuard &operator=(const ScopeGuard &) = delete;
+ ScopeGuard &operator=(ScopeGuard &&) = delete;
+
+ // Pre-C++17 does not guarantee RVO -> require move constructor.
+ ScopeGuard(ScopeGuard &&other) : callback_(std::move(other.callback_)) {
+ other.armed_ = false;
+ }
+
+ template <typename CallbackParam>
+ explicit ScopeGuard(CallbackParam &&callback)
+ : callback_(std::forward<CallbackParam>(callback)), armed_(true) {}
+
+ ~ScopeGuard() {
+ if (armed_) callback_();
+ }
+
+ void Disarm() { armed_ = false; }
+
+ private:
+ Callback callback_;
+ bool armed_;
+};
+
+template <typename Callback>
+ScopeGuard<Callback> MakeScopeGuard(Callback &&callback) {
+ return ScopeGuard<Callback>{std::forward<Callback>(callback)};
+}
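+
+// Illustrative usage sketch (file-handling code is hypothetical):
+//   FILE* f = fopen(path, "wb");
+//   auto guard = MakeScopeGuard([&] { fclose(f); remove(path); });
+//   if (!WriteEverything(f)) return false;  // guard removes the partial file
+//   guard.Disarm();                         // success: keep the file
+//   fclose(f);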
+
+} // namespace jxl
+
+#endif // LIB_JXL_BASE_SCOPE_GUARD_H_
diff --git a/third_party/jpeg-xl/lib/jxl/base/span.h b/third_party/jpeg-xl/lib/jxl/base/span.h
new file mode 100644
index 0000000000..41c3623a4b
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/base/span.h
@@ -0,0 +1,60 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_BASE_SPAN_H_
+#define LIB_JXL_BASE_SPAN_H_
+
+// Span (array view) is a non-owning container that provides cheap "cut"
+// operations and can be used as an "ArrayLike" data source for PaddedBytes.
+
+#include <stddef.h>
+
+#include "lib/jxl/base/status.h"
+
+namespace jxl {
+
+template <typename T>
+class Span {
+ public:
+ constexpr Span() noexcept : Span(nullptr, 0) {}
+
+ constexpr Span(T* array, size_t length) noexcept
+ : ptr_(array), len_(length) {}
+
+ template <size_t N>
+ explicit constexpr Span(T (&a)[N]) noexcept : Span(a, N) {}
+
+ template <typename ArrayLike>
+ explicit constexpr Span(const ArrayLike& other) noexcept
+ : Span(reinterpret_cast<T*>(other.data()), other.size()) {
+ static_assert(sizeof(*other.data()) == sizeof(T),
+ "Incompatible type of source.");
+ }
+
+ constexpr T* data() const noexcept { return ptr_; }
+
+ constexpr size_t size() const noexcept { return len_; }
+
+ constexpr bool empty() const noexcept { return len_ == 0; }
+
+ constexpr T& operator[](size_t i) const noexcept {
+ // MSVC 2015 accepts this as constexpr, but not ptr_[i]
+ return *(data() + i);
+ }
+
+ void remove_prefix(size_t n) noexcept {
+ JXL_ASSERT(size() >= n);
+ ptr_ += n;
+ len_ -= n;
+ }
+
+ private:
+ T* ptr_;
+ size_t len_;
+};
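+
+// Illustrative usage sketch:
+//   const uint8_t buf[4] = {1, 2, 3, 4};
+//   Span<const uint8_t> bytes(buf);  // views the whole array
+//   bytes.remove_prefix(1);          // now views {2, 3, 4}
+//   const size_t n = bytes.size();   // 3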
+
+} // namespace jxl
+
+#endif // LIB_JXL_BASE_SPAN_H_
diff --git a/third_party/jpeg-xl/lib/jxl/base/status.h b/third_party/jpeg-xl/lib/jxl/base/status.h
new file mode 100644
index 0000000000..f40be0c434
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/base/status.h
@@ -0,0 +1,326 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_BASE_STATUS_H_
+#define LIB_JXL_BASE_STATUS_H_
+
+// Error handling: Status return type + helper macros.
+
+#include <stdarg.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/sanitizer_definitions.h"
+
+#if JXL_ADDRESS_SANITIZER || JXL_MEMORY_SANITIZER || JXL_THREAD_SANITIZER
+#include "sanitizer/common_interface_defs.h" // __sanitizer_print_stack_trace
+#endif // defined(*_SANITIZER)
+
+namespace jxl {
+
+// Uncomment to abort when JXL_FAILURE or JXL_STATUS with a fatal error is
+// reached:
+// #define JXL_CRASH_ON_ERROR
+
+#ifndef JXL_ENABLE_ASSERT
+#define JXL_ENABLE_ASSERT 1
+#endif
+
+#ifndef JXL_ENABLE_CHECK
+#define JXL_ENABLE_CHECK 1
+#endif
+
+// Pass -DJXL_DEBUG_ON_ERROR at compile time to print debug messages when a
+// function returns JXL_FAILURE or calls JXL_NOTIFY_ERROR. Note that this is
+// irrelevant if you also pass -DJXL_CRASH_ON_ERROR.
+#if defined(JXL_DEBUG_ON_ERROR) || defined(JXL_CRASH_ON_ERROR)
+#undef JXL_DEBUG_ON_ERROR
+#define JXL_DEBUG_ON_ERROR 1
+#else // JXL_DEBUG_ON_ERROR || JXL_CRASH_ON_ERROR
+#ifdef NDEBUG
+#define JXL_DEBUG_ON_ERROR 0
+#else // NDEBUG
+#define JXL_DEBUG_ON_ERROR 1
+#endif // NDEBUG
+#endif // JXL_DEBUG_ON_ERROR || JXL_CRASH_ON_ERROR
+
+// Pass -DJXL_DEBUG_ON_ALL_ERROR at compile time to print debug messages for
+// all error statuses (fatal and non-fatal). This implies JXL_DEBUG_ON_ERROR.
+#if defined(JXL_DEBUG_ON_ALL_ERROR)
+#undef JXL_DEBUG_ON_ALL_ERROR
+#define JXL_DEBUG_ON_ALL_ERROR 1
+// JXL_DEBUG_ON_ALL_ERROR implies JXL_DEBUG_ON_ERROR too.
+#undef JXL_DEBUG_ON_ERROR
+#define JXL_DEBUG_ON_ERROR 1
+#else // JXL_DEBUG_ON_ALL_ERROR
+#define JXL_DEBUG_ON_ALL_ERROR 0
+#endif // JXL_DEBUG_ON_ALL_ERROR
+
+// The Verbose level for the library
+#ifndef JXL_DEBUG_V_LEVEL
+#define JXL_DEBUG_V_LEVEL 0
+#endif // JXL_DEBUG_V_LEVEL
+
+// Pass -DJXL_DEBUG_ON_ABORT=0 to disable the debug messages on JXL_ASSERT,
+// JXL_CHECK and JXL_ABORT.
+#ifndef JXL_DEBUG_ON_ABORT
+#define JXL_DEBUG_ON_ABORT 1
+#endif // JXL_DEBUG_ON_ABORT
+
+// Print a debug message on standard error. You should use the JXL_DEBUG macro
+// instead of calling Debug directly. This function returns false, so it can be
+// used as a return value in JXL_FAILURE.
+JXL_FORMAT(1, 2)
+inline JXL_NOINLINE bool Debug(const char* format, ...) {
+ va_list args;
+ va_start(args, format);
+ vfprintf(stderr, format, args);
+ va_end(args);
+ return false;
+}
+
+// Print a debug message on standard error if "enabled" is true. "enabled" is
+// normally a macro that evaluates to 0 or 1 at compile time, so the Debug
+// function is never called and optimized out in release builds. Note that the
+// arguments are compiled but not evaluated when enabled is false. The format
+// string must be an explicit string in the call, for example:
+// JXL_DEBUG(JXL_DEBUG_MYMODULE, "my module message: %d", some_var);
+// Add a header at the top of your module's .cc or .h file (depending on whether
+// you have JXL_DEBUG calls from the .h as well) like this:
+// #ifndef JXL_DEBUG_MYMODULE
+// #define JXL_DEBUG_MYMODULE 0
+// #endif // JXL_DEBUG_MYMODULE
+#define JXL_DEBUG_TMP(format, ...) \
+ ::jxl::Debug(("%s:%d: " format "\n"), __FILE__, __LINE__, ##__VA_ARGS__)
+
+#define JXL_DEBUG(enabled, format, ...) \
+ do { \
+ if (enabled) { \
+ JXL_DEBUG_TMP(format, ##__VA_ARGS__); \
+ } \
+ } while (0)
+
+// JXL_DEBUG version that prints the debug message if the global verbose level
+// defined at compile time by JXL_DEBUG_V_LEVEL is greater than or equal to the
+// passed level.
+#define JXL_DEBUG_V(level, format, ...) \
+ JXL_DEBUG(level <= JXL_DEBUG_V_LEVEL, format, ##__VA_ARGS__)
+
+// Warnings (via JXL_WARNING) are enabled by default in debug builds (opt and
+// debug).
+#ifdef JXL_DEBUG_WARNING
+#undef JXL_DEBUG_WARNING
+#define JXL_DEBUG_WARNING 1
+#else // JXL_DEBUG_WARNING
+#ifdef NDEBUG
+#define JXL_DEBUG_WARNING 0
+#else // JXL_DEBUG_WARNING
+#define JXL_DEBUG_WARNING 1
+#endif // NDEBUG
+#endif // JXL_DEBUG_WARNING
+#define JXL_WARNING(format, ...) \
+ JXL_DEBUG(JXL_DEBUG_WARNING, format, ##__VA_ARGS__)
+
+// Exits the program after printing a stack trace when possible.
+JXL_NORETURN inline JXL_NOINLINE bool Abort() {
+#if JXL_ADDRESS_SANITIZER || JXL_MEMORY_SANITIZER || JXL_THREAD_SANITIZER
+  // If compiled with any sanitizer, print a stack trace. This call doesn't
+  // crash the program itself; the trap below does, which also allows gdb to
+  // break there.
+ __sanitizer_print_stack_trace();
+#endif // *_SANITIZER)
+
+#if JXL_COMPILER_MSVC
+ __debugbreak();
+ abort();
+#else
+ __builtin_trap();
+#endif
+}
+
+// Exits the program after printing file/line plus a formatted string.
+#define JXL_ABORT(format, ...) \
+ ((JXL_DEBUG_ON_ABORT) && ::jxl::Debug(("%s:%d: JXL_ABORT: " format "\n"), \
+ __FILE__, __LINE__, ##__VA_ARGS__), \
+ ::jxl::Abort())
+
+// Does not guarantee running the code, use only for debug mode checks.
+#if JXL_ENABLE_ASSERT
+#define JXL_ASSERT(condition) \
+ do { \
+ if (!(condition)) { \
+ JXL_DEBUG(JXL_DEBUG_ON_ABORT, "JXL_ASSERT: %s", #condition); \
+ ::jxl::Abort(); \
+ } \
+ } while (0)
+#else
+#define JXL_ASSERT(condition) \
+ do { \
+ } while (0)
+#endif
+
+// Define JXL_IS_DEBUG_BUILD that denotes asan, msan and other debug builds,
+// but not opt or release.
+#ifndef JXL_IS_DEBUG_BUILD
+#if !defined(NDEBUG) || defined(ADDRESS_SANITIZER) || \
+ defined(MEMORY_SANITIZER) || defined(THREAD_SANITIZER) || \
+ defined(__clang_analyzer__)
+#define JXL_IS_DEBUG_BUILD 1
+#else
+#define JXL_IS_DEBUG_BUILD 0
+#endif
+#endif // JXL_IS_DEBUG_BUILD
+
+// Same as above, but only runs in debug builds (builds where NDEBUG is not
+// defined). This is useful for slower asserts that we want to run more rarely
+// than usual. These will run on asan, msan and other debug builds, but not in
+// opt or release.
+#if JXL_IS_DEBUG_BUILD
+#define JXL_DASSERT(condition) \
+ do { \
+ if (!(condition)) { \
+ JXL_DEBUG(JXL_DEBUG_ON_ABORT, "JXL_DASSERT: %s", #condition); \
+ ::jxl::Abort(); \
+ } \
+ } while (0)
+#else
+#define JXL_DASSERT(condition) \
+ do { \
+ } while (0)
+#endif
+
+// Always runs the condition, so can be used for non-debug calls.
+#if JXL_ENABLE_CHECK
+#define JXL_CHECK(condition) \
+ do { \
+ if (!(condition)) { \
+ JXL_DEBUG(JXL_DEBUG_ON_ABORT, "JXL_CHECK: %s", #condition); \
+ ::jxl::Abort(); \
+ } \
+ } while (0)
+#else
+#define JXL_CHECK(condition) \
+ do { \
+ (void)(condition); \
+ } while (0)
+#endif
+
+// A jxl::Status value from a StatusCode or Status which prints a debug message
+// when enabled.
+#define JXL_STATUS(status, format, ...) \
+ ::jxl::StatusMessage(::jxl::Status(status), "%s:%d: " format "\n", __FILE__, \
+ __LINE__, ##__VA_ARGS__)
+
+// Notify of an error but discard the resulting Status value. This is only
+// useful for debug builds or when building with JXL_CRASH_ON_ERROR.
+#define JXL_NOTIFY_ERROR(format, ...) \
+ (void)JXL_STATUS(::jxl::StatusCode::kGenericError, "JXL_ERROR: " format, \
+ ##__VA_ARGS__)
+
+// An error Status with a message. The JXL_STATUS() macro will return a Status
+// object with a kGenericError code, but the comma operator helps with
+// clang-tidy inference and potentially with optimizations.
+#define JXL_FAILURE(format, ...) \
+ ((void)JXL_STATUS(::jxl::StatusCode::kGenericError, "JXL_FAILURE: " format, \
+ ##__VA_ARGS__), \
+ ::jxl::Status(::jxl::StatusCode::kGenericError))
+
+// Always evaluates the status exactly once, so can be used for non-debug calls.
+// Returns from the current context if the passed Status expression is an error
+// (fatal or non-fatal). The return value is the passed Status.
+#define JXL_RETURN_IF_ERROR(status) \
+ do { \
+ ::jxl::Status jxl_return_if_error_status = (status); \
+ if (!jxl_return_if_error_status) { \
+ (void)::jxl::StatusMessage( \
+ jxl_return_if_error_status, \
+ "%s:%d: JXL_RETURN_IF_ERROR code=%d: %s\n", __FILE__, __LINE__, \
+ static_cast<int>(jxl_return_if_error_status.code()), #status); \
+ return jxl_return_if_error_status; \
+ } \
+ } while (0)
+
+// As above, but without calling StatusMessage. Intended for bundles (see
+// fields.h), which have numerous call sites (-> relevant for code size) and do
+// not want to generate excessive messages when decoding partial headers.
+#define JXL_QUIET_RETURN_IF_ERROR(status) \
+ do { \
+ ::jxl::Status jxl_return_if_error_status = (status); \
+ if (!jxl_return_if_error_status) { \
+ return jxl_return_if_error_status; \
+ } \
+ } while (0)
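+
+// Illustrative usage sketch (function names are hypothetical):
+//   Status DecodeHeader(Reader* reader) {
+//     if (!reader->HasEnoughBytes()) {
+//       return JXL_FAILURE("truncated header");
+//     }
+//     JXL_RETURN_IF_ERROR(DecodeSignature(reader));
+//     return true;  // implicitly converts to StatusCode::kOk
+//   }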
+
+enum class StatusCode : int32_t {
+ // Non-fatal errors (negative values).
+ kNotEnoughBytes = -1,
+
+ // The only non-error status code.
+ kOk = 0,
+
+  // Fatal errors (positive values).
+ kGenericError = 1,
+};
+
+// Drop-in replacement for bool that raises compiler warnings if not used
+// after being returned from a function. Example:
+// Status LoadFile(...) { return true; } is more compact than
+// bool JXL_MUST_USE_RESULT LoadFile(...) { return true; }
+// In case of error, the status can carry an extra error code in its value which
+// is split between fatal and non-fatal error codes.
+class JXL_MUST_USE_RESULT Status {
+ public:
+  // We want an implicit constructor from bool to allow returning "true" or
+  // "false" from a function when using Status. "true" means kOk while "false"
+  // means a generic fatal error.
+ // NOLINTNEXTLINE(google-explicit-constructor)
+ constexpr Status(bool ok)
+ : code_(ok ? StatusCode::kOk : StatusCode::kGenericError) {}
+
+ // NOLINTNEXTLINE(google-explicit-constructor)
+ constexpr Status(StatusCode code) : code_(code) {}
+
+  // We also want an implicit cast to bool to check function return values.
+ // NOLINTNEXTLINE(google-explicit-constructor)
+ constexpr operator bool() const { return code_ == StatusCode::kOk; }
+
+ constexpr StatusCode code() const { return code_; }
+
+ // Returns whether the status code is a fatal error.
+ constexpr bool IsFatalError() const {
+ return static_cast<int32_t>(code_) > 0;
+ }
+
+ private:
+ StatusCode code_;
+};
+
+// Helper function to create a Status and print the debug message or abort when
+// needed.
+inline JXL_FORMAT(2, 3) Status
+ StatusMessage(const Status status, const char* format, ...) {
+ // This block will be optimized out when JXL_DEBUG_ON_ERROR and
+ // JXL_DEBUG_ON_ALL_ERROR are both disabled.
+ if ((JXL_DEBUG_ON_ERROR && status.IsFatalError()) ||
+ (JXL_DEBUG_ON_ALL_ERROR && !status)) {
+ va_list args;
+ va_start(args, format);
+ vfprintf(stderr, format, args);
+ va_end(args);
+ }
+#ifdef JXL_CRASH_ON_ERROR
+  // JXL_CRASH_ON_ERROR means to Abort() only on fatal errors.
+ if (status.IsFatalError()) {
+ Abort();
+ }
+#endif // JXL_CRASH_ON_ERROR
+ return status;
+}
+
+} // namespace jxl
+
+#endif // LIB_JXL_BASE_STATUS_H_
diff --git a/third_party/jpeg-xl/lib/jxl/base/tsc_timer.h b/third_party/jpeg-xl/lib/jxl/base/tsc_timer.h
new file mode 100644
index 0000000000..74d51f72d1
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/base/tsc_timer.h
@@ -0,0 +1,172 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_BASE_TSC_TIMER_H_
+#define LIB_JXL_BASE_TSC_TIMER_H_
+
+// High-resolution (~10 ns) timestamps, using fences to prevent reordering and
+// ensure exactly the desired regions are measured.
+
+#include <stdint.h>
+#include <time.h> // clock_gettime
+
+#if defined(_WIN32) || defined(_WIN64)
+#ifndef WIN32_LEAN_AND_MEAN
+#define WIN32_LEAN_AND_MEAN
+#endif // WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
+#define NOMINMAX
+#endif // NOMINMAX
+#ifndef NOGDI
+#define NOGDI
+#endif // NOGDI
+#include <windows.h>
+// Undef macros to avoid collisions
+#undef LoadFence
+#endif
+
+#if defined(__APPLE__)
+#include <mach/mach.h>
+#include <mach/mach_time.h>
+#endif
+
+#if defined(__HAIKU__)
+#include <OS.h>
+#endif
+
+#include <ctime>
+#include <hwy/base.h>
+#include <hwy/cache_control.h> // LoadFence
+
+namespace jxl {
+namespace profiler {
+
+// Ticks := platform-specific timer values (CPU cycles on x86). Must be
+// unsigned to guarantee wraparound on overflow.
+using Ticks = uint64_t;
+
+// TicksBefore/After return absolute timestamps and must be placed immediately
+// before and after the region to measure. We provide separate Before/After
+// functions because they use different fences.
+//
+// Background: RDTSC is not 'serializing'; earlier instructions may complete
+// after it, and/or later instructions may complete before it. 'Fences' ensure
+// regions' elapsed times are independent of such reordering. The only
+// documented unprivileged serializing instruction is CPUID, which acts as a
+// full fence (no reordering across it in either direction). Unfortunately
+// the latency of CPUID varies wildly (perhaps made worse by not initializing
+// its EAX input). Because it cannot reliably be deducted from the region's
+// elapsed time, it must not be included in the region to measure (i.e.
+// between the two RDTSC).
+//
+// The newer RDTSCP is sometimes described as serializing, but it actually
+// only serves as a half-fence with release semantics. Although all
+// instructions in the region will complete before the final timestamp is
+// captured, subsequent instructions may leak into the region and increase the
+// elapsed time. Inserting another fence after the final RDTSCP would prevent
+// such reordering without affecting the measured region.
+//
+// Fortunately, such a fence exists. The LFENCE instruction is only documented
+// to delay later loads until earlier loads are visible. However, Intel's
+// reference manual says it acts as a full fence (waiting until all earlier
+// instructions have completed, and delaying later instructions until it
+// completes). AMD assigns the same behavior to MFENCE.
+//
+// We need a fence before the initial RDTSC to prevent earlier instructions
+// from leaking into the region, and arguably another after RDTSC to prevent
+// region instructions from completing before the timestamp is recorded.
+// When surrounded by fences, the additional RDTSCP half-fence provides no
+// benefit, so the initial timestamp can be recorded via RDTSC, which has
+// lower overhead than RDTSCP because it does not read TSC_AUX. In summary,
+// we define Before = LFENCE/RDTSC/LFENCE; After = RDTSCP/LFENCE.
+//
+// Using Before+Before leads to higher variance and overhead than After+After.
+// However, After+After includes an LFENCE in the region measurements, which
+// adds a delay dependent on earlier loads. The combination of Before+After
+// is faster than Before+Before and more consistent than After+After because
+// the first LFENCE already delayed subsequent loads before the measured
+// region. This combination seems not to have been considered in prior work:
+// http://akaros.cs.berkeley.edu/lxr/akaros/kern/arch/x86/rdtsc_test.c
+//
+// Note: performance counters can measure 'exact' instructions-retired or
+// (unhalted) cycle counts. The RDPMC instruction is not serializing and also
+// requires fences. Unfortunately, it is not accessible on all OSes and we
+// prefer to avoid kernel-mode drivers. Performance counters are also affected
+// by several under/over-count errata, so we use the TSC instead.
+
+// Returns a 64-bit timestamp in unit of 'ticks'; to convert to seconds,
+// divide by InvariantTicksPerSecond.
+static HWY_INLINE HWY_MAYBE_UNUSED Ticks TicksBefore() {
+ Ticks t;
+#if HWY_ARCH_PPC && defined(__GLIBC__)
+ asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268));
+#elif HWY_ARCH_X86 && HWY_COMPILER_MSVC
+ hwy::LoadFence();
+ HWY_FENCE;
+ t = __rdtsc();
+ hwy::LoadFence();
+ HWY_FENCE;
+#elif HWY_ARCH_X86_64
+ asm volatile(
+ "lfence\n\t"
+ "rdtsc\n\t"
+ "shl $32, %%rdx\n\t"
+ "or %%rdx, %0\n\t"
+ "lfence"
+ : "=a"(t)
+ :
+ // "memory" avoids reordering. rdx = TSC >> 32.
+ // "cc" = flags modified by SHL.
+ : "rdx", "memory", "cc");
+#elif HWY_ARCH_RVV
+ asm volatile("rdcycle %0" : "=r"(t));
+#elif defined(_WIN32) || defined(_WIN64)
+ LARGE_INTEGER counter;
+ (void)QueryPerformanceCounter(&counter);
+ t = counter.QuadPart;
+#elif defined(__APPLE__)
+ t = mach_absolute_time();
+#elif defined(__HAIKU__)
+ t = system_time_nsecs(); // since boot
+#else // POSIX
+ timespec ts;
+ clock_gettime(CLOCK_MONOTONIC, &ts);
+ t = static_cast<Ticks>(ts.tv_sec * 1000000000LL + ts.tv_nsec);
+#endif
+ return t;
+}
+
+static HWY_INLINE HWY_MAYBE_UNUSED Ticks TicksAfter() {
+ Ticks t;
+#if HWY_ARCH_PPC && defined(__GLIBC__)
+ asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268));
+#elif HWY_ARCH_X86 && HWY_COMPILER_MSVC
+ HWY_FENCE;
+ unsigned aux;
+ t = __rdtscp(&aux);
+ hwy::LoadFence();
+ HWY_FENCE;
+#elif HWY_ARCH_X86_64
+ // Use inline asm because __rdtscp generates code to store TSC_AUX (ecx).
+ asm volatile(
+ "rdtscp\n\t"
+ "shl $32, %%rdx\n\t"
+ "or %%rdx, %0\n\t"
+ "lfence"
+ : "=a"(t)
+ :
+ // "memory" avoids reordering. rcx = TSC_AUX. rdx = TSC >> 32.
+ // "cc" = flags modified by SHL.
+ : "rcx", "rdx", "memory", "cc");
+#else
+ t = TicksBefore(); // no difference on other platforms.
+#endif
+ return t;
+}
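+
+// Illustrative measurement sketch:
+//   const Ticks t0 = TicksBefore();
+//   // ... region to measure ...
+//   const Ticks t1 = TicksAfter();
+//   const Ticks elapsed = t1 - t0;  // well-defined on wraparound (unsigned)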
+
+} // namespace profiler
+} // namespace jxl
+
+#endif // LIB_JXL_BASE_TSC_TIMER_H_