25 files changed, 3150 insertions, 0 deletions
diff --git a/third_party/jpeg-xl/lib/jxl/base/arch_macros.h b/third_party/jpeg-xl/lib/jxl/base/arch_macros.h new file mode 100644 index 0000000000..a98301915e --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/base/arch_macros.h @@ -0,0 +1,33 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_BASE_ARCH_MACROS_H_ +#define LIB_JXL_BASE_ARCH_MACROS_H_ + +// Defines the JXL_ARCH_* macros. + +namespace jxl { + +#if defined(__x86_64__) || defined(_M_X64) +#define JXL_ARCH_X64 1 +#else +#define JXL_ARCH_X64 0 +#endif + +#if defined(__powerpc64__) || defined(_M_PPC) +#define JXL_ARCH_PPC 1 +#else +#define JXL_ARCH_PPC 0 +#endif + +#if defined(__aarch64__) || defined(__arm__) +#define JXL_ARCH_ARM 1 +#else +#define JXL_ARCH_ARM 0 +#endif + +} // namespace jxl + +#endif // LIB_JXL_BASE_ARCH_MACROS_H_ diff --git a/third_party/jpeg-xl/lib/jxl/base/bits.h b/third_party/jpeg-xl/lib/jxl/base/bits.h new file mode 100644 index 0000000000..9f86118e72 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/base/bits.h @@ -0,0 +1,147 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_BASE_BITS_H_ +#define LIB_JXL_BASE_BITS_H_ + +// Specialized instructions for processing register-sized bit arrays. + +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/status.h" + +#if JXL_COMPILER_MSVC +#include <intrin.h> +#endif + +#include <stddef.h> +#include <stdint.h> + +namespace jxl { + +// Empty struct used as a size tag type. +template <size_t N> +struct SizeTag {}; + +template <typename T> +constexpr bool IsSigned() { + return T(0) > T(-1); +} + +// Undefined results for x == 0. +static JXL_INLINE JXL_MAYBE_UNUSED size_t +Num0BitsAboveMS1Bit_Nonzero(SizeTag<4> /* tag */, const uint32_t x) { + JXL_DASSERT(x != 0); +#if JXL_COMPILER_MSVC + unsigned long index; + _BitScanReverse(&index, x); + return 31 - index; +#else + return static_cast<size_t>(__builtin_clz(x)); +#endif +} +static JXL_INLINE JXL_MAYBE_UNUSED size_t +Num0BitsAboveMS1Bit_Nonzero(SizeTag<8> /* tag */, const uint64_t x) { + JXL_DASSERT(x != 0); +#if JXL_COMPILER_MSVC +#if JXL_ARCH_X64 + unsigned long index; + _BitScanReverse64(&index, x); + return 63 - index; +#else // JXL_ARCH_X64 + // _BitScanReverse64 not available + uint32_t msb = static_cast<uint32_t>(x >> 32u); + unsigned long index; + if (msb == 0) { + uint32_t lsb = static_cast<uint32_t>(x & 0xFFFFFFFF); + _BitScanReverse(&index, lsb); + return 63 - index; + } else { + _BitScanReverse(&index, msb); + return 31 - index; + } +#endif // JXL_ARCH_X64 +#else + return static_cast<size_t>(__builtin_clzll(x)); +#endif +} +template <typename T> +static JXL_INLINE JXL_MAYBE_UNUSED size_t +Num0BitsAboveMS1Bit_Nonzero(const T x) { + static_assert(!IsSigned<T>(), "Num0BitsAboveMS1Bit_Nonzero: use unsigned"); + return Num0BitsAboveMS1Bit_Nonzero(SizeTag<sizeof(T)>(), x); +} + +// Undefined results for x == 0. 
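+// Worked example: Num0BitsBelowLS1Bit_Nonzero(uint32_t{0x50}) returns 4,
+// since 0x50 == 0b0101'0000 and the lowest set bit is bit 4. Calling with
+// x == 0 trips the JXL_DASSERT below; in release builds the result is
+// whatever the underlying bit-scan instruction yields for zero input.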
+static JXL_INLINE JXL_MAYBE_UNUSED size_t
+Num0BitsBelowLS1Bit_Nonzero(SizeTag<4> /* tag */, const uint32_t x) {
+  JXL_DASSERT(x != 0);
+#if JXL_COMPILER_MSVC
+  unsigned long index;
+  _BitScanForward(&index, x);
+  return index;
+#else
+  return static_cast<size_t>(__builtin_ctz(x));
+#endif
+}
+static JXL_INLINE JXL_MAYBE_UNUSED size_t
+Num0BitsBelowLS1Bit_Nonzero(SizeTag<8> /* tag */, const uint64_t x) {
+  JXL_DASSERT(x != 0);
+#if JXL_COMPILER_MSVC
+#if JXL_ARCH_X64
+  unsigned long index;
+  _BitScanForward64(&index, x);
+  return index;
+#else  // JXL_ARCH_X64
+  // _BitScanForward64 not available
+  uint32_t lsb = static_cast<uint32_t>(x & 0xFFFFFFFF);
+  unsigned long index;
+  if (lsb == 0) {
+    uint32_t msb = static_cast<uint32_t>(x >> 32u);
+    _BitScanForward(&index, msb);
+    return 32 + index;
+  } else {
+    _BitScanForward(&index, lsb);
+    return index;
+  }
+#endif  // JXL_ARCH_X64
+#else
+  return static_cast<size_t>(__builtin_ctzll(x));
+#endif
+}
+template <typename T>
+static JXL_INLINE JXL_MAYBE_UNUSED size_t Num0BitsBelowLS1Bit_Nonzero(T x) {
+  static_assert(!IsSigned<T>(), "Num0BitsBelowLS1Bit_Nonzero: use unsigned");
+  return Num0BitsBelowLS1Bit_Nonzero(SizeTag<sizeof(T)>(), x);
+}
+
+// Returns bit width for x == 0.
+template <typename T>
+static JXL_INLINE JXL_MAYBE_UNUSED size_t Num0BitsAboveMS1Bit(const T x) {
+  return (x == 0) ? sizeof(T) * 8 : Num0BitsAboveMS1Bit_Nonzero(x);
+}
+
+// Returns bit width for x == 0.
+template <typename T>
+static JXL_INLINE JXL_MAYBE_UNUSED size_t Num0BitsBelowLS1Bit(const T x) {
+  return (x == 0) ? sizeof(T) * 8 : Num0BitsBelowLS1Bit_Nonzero(x);
+}
+
+// Returns base-2 logarithm, rounded down.
+template <typename T>
+static JXL_INLINE JXL_MAYBE_UNUSED size_t FloorLog2Nonzero(const T x) {
+  return (sizeof(T) * 8 - 1) ^ Num0BitsAboveMS1Bit_Nonzero(x);
+}
+
+// Returns base-2 logarithm, rounded up.
+template <typename T>
+static JXL_INLINE JXL_MAYBE_UNUSED size_t CeilLog2Nonzero(const T x) {
+  const size_t floor_log2 = FloorLog2Nonzero(x);
+  if ((x & (x - 1)) == 0) return floor_log2;  // power of two
+  return floor_log2 + 1;
+}
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_BASE_BITS_H_
diff --git a/third_party/jpeg-xl/lib/jxl/base/byte_order.h b/third_party/jpeg-xl/lib/jxl/base/byte_order.h
new file mode 100644
index 0000000000..8966834e08
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/base/byte_order.h
@@ -0,0 +1,274 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_BASE_BYTE_ORDER_H_
+#define LIB_JXL_BASE_BYTE_ORDER_H_
+
+#include <jxl/types.h>
+#include <stdint.h>
+#include <string.h>  // memcpy
+
+#include "lib/jxl/base/compiler_specific.h"
+
+#if JXL_COMPILER_MSVC
+#include <intrin.h>  // _byteswap_*
+#endif
+
+#if (defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__))
+#define JXL_BYTE_ORDER_LITTLE 1
+#else
+// This means that we don't know that the byte order is little endian; in
+// that case we use endian-neutral code that works for both little- and
+// big-endian systems.
+#define JXL_BYTE_ORDER_LITTLE 0
+#endif
+
+// Returns whether the system is little-endian (least-significant byte first).
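+// Example: the four bytes of uint32_t{1} are stored as 01 00 00 00 on a
+// little-endian machine, so reading the first byte yields 1; on big-endian
+// it yields 0. The runtime fallback below uses exactly this memcpy probe.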
+#if JXL_BYTE_ORDER_LITTLE +static constexpr bool IsLittleEndian() { return true; } +#else +static inline bool IsLittleEndian() { + const uint32_t multibyte = 1; + uint8_t byte; + memcpy(&byte, &multibyte, 1); + return byte == 1; +} +#endif + +static inline bool SwapEndianness(JxlEndianness endianness) { + return ((endianness == JXL_BIG_ENDIAN && IsLittleEndian()) || + (endianness == JXL_LITTLE_ENDIAN && !IsLittleEndian())); +} + +#if JXL_COMPILER_MSVC +#define JXL_BSWAP16(x) _byteswap_ushort(x) +#define JXL_BSWAP32(x) _byteswap_ulong(x) +#define JXL_BSWAP64(x) _byteswap_uint64(x) +#else +#define JXL_BSWAP16(x) __builtin_bswap16(x) +#define JXL_BSWAP32(x) __builtin_bswap32(x) +#define JXL_BSWAP64(x) __builtin_bswap64(x) +#endif + +static JXL_INLINE uint32_t LoadBE16(const uint8_t* p) { + const uint32_t byte1 = p[0]; + const uint32_t byte0 = p[1]; + return (byte1 << 8) | byte0; +} + +static JXL_INLINE uint32_t LoadLE16(const uint8_t* p) { + const uint32_t byte0 = p[0]; + const uint32_t byte1 = p[1]; + return (byte1 << 8) | byte0; +} + +static JXL_INLINE uint32_t LoadBE32(const uint8_t* p) { +#if JXL_BYTE_ORDER_LITTLE + uint32_t big; + memcpy(&big, p, 4); + return JXL_BSWAP32(big); +#else + // Byte-order-independent - can't assume this machine is big endian. + const uint32_t byte3 = p[0]; + const uint32_t byte2 = p[1]; + const uint32_t byte1 = p[2]; + const uint32_t byte0 = p[3]; + return (byte3 << 24) | (byte2 << 16) | (byte1 << 8) | byte0; +#endif +} + +static JXL_INLINE uint64_t LoadBE64(const uint8_t* p) { +#if JXL_BYTE_ORDER_LITTLE + uint64_t big; + memcpy(&big, p, 8); + return JXL_BSWAP64(big); +#else + // Byte-order-independent - can't assume this machine is big endian. + const uint64_t byte7 = p[0]; + const uint64_t byte6 = p[1]; + const uint64_t byte5 = p[2]; + const uint64_t byte4 = p[3]; + const uint64_t byte3 = p[4]; + const uint64_t byte2 = p[5]; + const uint64_t byte1 = p[6]; + const uint64_t byte0 = p[7]; + return (byte7 << 56ull) | (byte6 << 48ull) | (byte5 << 40ull) | + (byte4 << 32ull) | (byte3 << 24ull) | (byte2 << 16ull) | + (byte1 << 8ull) | byte0; +#endif +} + +static JXL_INLINE uint32_t LoadLE32(const uint8_t* p) { +#if JXL_BYTE_ORDER_LITTLE + uint32_t little; + memcpy(&little, p, 4); + return little; +#else + // Byte-order-independent - can't assume this machine is big endian. + const uint32_t byte0 = p[0]; + const uint32_t byte1 = p[1]; + const uint32_t byte2 = p[2]; + const uint32_t byte3 = p[3]; + return (byte3 << 24) | (byte2 << 16) | (byte1 << 8) | byte0; +#endif +} + +static JXL_INLINE uint64_t LoadLE64(const uint8_t* p) { +#if JXL_BYTE_ORDER_LITTLE + uint64_t little; + memcpy(&little, p, 8); + return little; +#else + // Byte-order-independent - can't assume this machine is big endian. 
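+  // Example: p = {0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08} decodes
+  // to 0x0807060504030201: p[0] is the least significant byte, so each
+  // p[i] is shifted left by 8*i bits.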
+ const uint64_t byte0 = p[0]; + const uint64_t byte1 = p[1]; + const uint64_t byte2 = p[2]; + const uint64_t byte3 = p[3]; + const uint64_t byte4 = p[4]; + const uint64_t byte5 = p[5]; + const uint64_t byte6 = p[6]; + const uint64_t byte7 = p[7]; + return (byte7 << 56) | (byte6 << 48) | (byte5 << 40) | (byte4 << 32) | + (byte3 << 24) | (byte2 << 16) | (byte1 << 8) | byte0; +#endif +} + +// Loads a Big-Endian float +static JXL_INLINE float LoadBEFloat(const uint8_t* p) { + uint32_t u = LoadBE32(p); + float result; + memcpy(&result, &u, 4); + return result; +} + +// Loads a Little-Endian float +static JXL_INLINE float LoadLEFloat(const uint8_t* p) { + uint32_t u = LoadLE32(p); + float result; + memcpy(&result, &u, 4); + return result; +} + +static JXL_INLINE void StoreBE16(const uint32_t native, uint8_t* p) { + p[0] = (native >> 8) & 0xFF; + p[1] = native & 0xFF; +} + +static JXL_INLINE void StoreLE16(const uint32_t native, uint8_t* p) { + p[1] = (native >> 8) & 0xFF; + p[0] = native & 0xFF; +} + +static JXL_INLINE void StoreBE32(const uint32_t native, uint8_t* p) { +#if JXL_BYTE_ORDER_LITTLE + const uint32_t big = JXL_BSWAP32(native); + memcpy(p, &big, 4); +#else + // Byte-order-independent - can't assume this machine is big endian. + p[0] = native >> 24; + p[1] = (native >> 16) & 0xFF; + p[2] = (native >> 8) & 0xFF; + p[3] = native & 0xFF; +#endif +} + +static JXL_INLINE void StoreBE64(const uint64_t native, uint8_t* p) { +#if JXL_BYTE_ORDER_LITTLE + const uint64_t big = JXL_BSWAP64(native); + memcpy(p, &big, 8); +#else + // Byte-order-independent - can't assume this machine is big endian. + p[0] = native >> 56ull; + p[1] = (native >> 48ull) & 0xFF; + p[2] = (native >> 40ull) & 0xFF; + p[3] = (native >> 32ull) & 0xFF; + p[4] = (native >> 24ull) & 0xFF; + p[5] = (native >> 16ull) & 0xFF; + p[6] = (native >> 8ull) & 0xFF; + p[7] = native & 0xFF; +#endif +} + +static JXL_INLINE void StoreLE32(const uint32_t native, uint8_t* p) { +#if JXL_BYTE_ORDER_LITTLE + const uint32_t little = native; + memcpy(p, &little, 4); +#else + // Byte-order-independent - can't assume this machine is big endian. + p[3] = native >> 24; + p[2] = (native >> 16) & 0xFF; + p[1] = (native >> 8) & 0xFF; + p[0] = native & 0xFF; +#endif +} + +static JXL_INLINE void StoreLE64(const uint64_t native, uint8_t* p) { +#if JXL_BYTE_ORDER_LITTLE + const uint64_t little = native; + memcpy(p, &little, 8); +#else + // Byte-order-independent - can't assume this machine is big endian. + p[7] = native >> 56; + p[6] = (native >> 48) & 0xFF; + p[5] = (native >> 40) & 0xFF; + p[4] = (native >> 32) & 0xFF; + p[3] = (native >> 24) & 0xFF; + p[2] = (native >> 16) & 0xFF; + p[1] = (native >> 8) & 0xFF; + p[0] = native & 0xFF; +#endif +} + +static JXL_INLINE float BSwapFloat(float x) { + uint32_t u; + memcpy(&u, &x, 4); + uint32_t uswap = JXL_BSWAP32(u); + float xswap; + memcpy(&xswap, &uswap, 4); + return xswap; +} + +// Big/Little Endian order. +struct OrderBE {}; +struct OrderLE {}; + +// Wrappers for calling from generic code. 
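+// Usage sketch (WriteU16 is a hypothetical caller, not part of this header):
+// generic code can pick the byte order at compile time via tag dispatch:
+//   template <class Order>
+//   void WriteU16(uint32_t v, uint8_t* p) { Store16(Order(), v, p); }
+// WriteU16<OrderLE>(0x1234, p) stores the bytes 0x34, 0x12.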
+static JXL_INLINE void Store16(OrderBE /*tag*/, const uint32_t native, + uint8_t* p) { + return StoreBE16(native, p); +} + +static JXL_INLINE void Store16(OrderLE /*tag*/, const uint32_t native, + uint8_t* p) { + return StoreLE16(native, p); +} + +static JXL_INLINE void Store32(OrderBE /*tag*/, const uint32_t native, + uint8_t* p) { + return StoreBE32(native, p); +} + +static JXL_INLINE void Store32(OrderLE /*tag*/, const uint32_t native, + uint8_t* p) { + return StoreLE32(native, p); +} + +static JXL_INLINE uint32_t Load16(OrderBE /*tag*/, const uint8_t* p) { + return LoadBE16(p); +} + +static JXL_INLINE uint32_t Load16(OrderLE /*tag*/, const uint8_t* p) { + return LoadLE16(p); +} + +static JXL_INLINE uint32_t Load32(OrderBE /*tag*/, const uint8_t* p) { + return LoadBE32(p); +} + +static JXL_INLINE uint32_t Load32(OrderLE /*tag*/, const uint8_t* p) { + return LoadLE32(p); +} + +#endif // LIB_JXL_BASE_BYTE_ORDER_H_ diff --git a/third_party/jpeg-xl/lib/jxl/base/cache_aligned.cc b/third_party/jpeg-xl/lib/jxl/base/cache_aligned.cc new file mode 100644 index 0000000000..9a9cc585a1 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/base/cache_aligned.cc @@ -0,0 +1,157 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/base/cache_aligned.h" + +#include <stdio.h> +#include <stdlib.h> + +// Disabled: slower than malloc + alignment. +#define JXL_USE_MMAP 0 + +#if JXL_USE_MMAP +#include <sys/mman.h> +#endif + +#include <algorithm> // std::max +#include <atomic> +#include <hwy/base.h> // kMaxVectorSize +#include <limits> + +#include "lib/jxl/base/printf_macros.h" +#include "lib/jxl/base/status.h" + +namespace jxl { +namespace { + +#pragma pack(push, 1) +struct AllocationHeader { + void* allocated; + size_t allocated_size; + uint8_t left_padding[hwy::kMaxVectorSize]; +}; +#pragma pack(pop) + +std::atomic<uint64_t> num_allocations{0}; +std::atomic<uint64_t> bytes_in_use{0}; +std::atomic<uint64_t> max_bytes_in_use{0}; + +} // namespace + +// Avoids linker errors in pre-C++17 builds. +constexpr size_t CacheAligned::kPointerSize; +constexpr size_t CacheAligned::kCacheLineSize; +constexpr size_t CacheAligned::kAlignment; +constexpr size_t CacheAligned::kAlias; + +void CacheAligned::PrintStats() { + fprintf( + stderr, "Allocations: %" PRIuS " (max bytes in use: %E)\n", + static_cast<size_t>(num_allocations.load(std::memory_order_relaxed)), + static_cast<double>(max_bytes_in_use.load(std::memory_order_relaxed))); +} + +size_t CacheAligned::NextOffset() { + static std::atomic<uint32_t> next{0}; + constexpr uint32_t kGroups = CacheAligned::kAlias / CacheAligned::kAlignment; + const uint32_t group = next.fetch_add(1, std::memory_order_relaxed) % kGroups; + return CacheAligned::kAlignment * group; +} + +void* CacheAligned::Allocate(const size_t payload_size, size_t offset) { + JXL_ASSERT(payload_size <= std::numeric_limits<size_t>::max() / 2); + JXL_ASSERT((offset % kAlignment == 0) && offset <= kAlias); + + // What: | misalign | unused | AllocationHeader |payload + // Size: |<= kAlias | offset | |payload_size + // ^allocated.^aligned.^header............^payload + // The header must immediately precede payload, which must remain aligned. + // To avoid wasting space, the header resides at the end of `unused`, + // which therefore cannot be empty (offset == 0). 
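+  // Worked example (kAlignment == 128, kAlias == 2048, offset == 256): if
+  // malloc returns 0x70000123, then aligned == (0x70000123 + 2048) & ~2047
+  // == 0x70000800, and payload == 0x70000900, which is kAlignment-aligned
+  // and congruent to 256 (mod kAlias); the header occupies the bytes just
+  // below payload.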
+ if (offset == 0) { + // SVE/RVV vectors can be large, so we cannot rely on them (including the + // padding at the end of AllocationHeader) to fit in kAlignment. + offset = hwy::RoundUpTo(sizeof(AllocationHeader), kAlignment); + } + +#if JXL_USE_MMAP + const size_t allocated_size = offset + payload_size; + const int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE; + void* allocated = + mmap(nullptr, allocated_size, PROT_READ | PROT_WRITE, flags, -1, 0); + if (allocated == MAP_FAILED) return nullptr; + const uintptr_t aligned = reinterpret_cast<uintptr_t>(allocated); +#else + const size_t allocated_size = kAlias + offset + payload_size; + void* allocated = malloc(allocated_size); + if (allocated == nullptr) return nullptr; + // Always round up even if already aligned - we already asked for kAlias + // extra bytes and there's no way to give them back. + uintptr_t aligned = reinterpret_cast<uintptr_t>(allocated) + kAlias; + static_assert((kAlias & (kAlias - 1)) == 0, "kAlias must be a power of 2"); + static_assert(kAlias >= kAlignment, "Cannot align to more than kAlias"); + aligned &= ~(kAlias - 1); +#endif + +#if 0 + // No effect. + uintptr_t page_aligned = reinterpret_cast<uintptr_t>(allocated); + page_aligned &= ~(4096 - 1); + if (madvise(reinterpret_cast<void*>(page_aligned), allocated_size, + MADV_WILLNEED) != 0) { + JXL_NOTIFY_ERROR("madvise failed"); + } +#elif 0 + // INCREASES both first and subsequent decode times. + if (mlock(allocated, allocated_size) != 0) { + JXL_NOTIFY_ERROR("mlock failed"); + } +#endif + + // Update statistics (#allocations and max bytes in use) + num_allocations.fetch_add(1, std::memory_order_relaxed); + const uint64_t prev_bytes = + bytes_in_use.fetch_add(allocated_size, std::memory_order_acq_rel); + uint64_t expected_max = max_bytes_in_use.load(std::memory_order_acquire); + for (;;) { + const uint64_t desired = + std::max(expected_max, prev_bytes + allocated_size); + if (max_bytes_in_use.compare_exchange_strong(expected_max, desired, + std::memory_order_acq_rel)) { + break; + } + } + + const uintptr_t payload = aligned + offset; // still aligned + + // Stash `allocated` and payload_size inside header for use by Free(). + AllocationHeader* header = reinterpret_cast<AllocationHeader*>(payload) - 1; + header->allocated = allocated; + header->allocated_size = allocated_size; + + return JXL_ASSUME_ALIGNED(reinterpret_cast<void*>(payload), 64); +} + +void CacheAligned::Free(const void* aligned_pointer) { + if (aligned_pointer == nullptr) { + return; + } + const uintptr_t payload = reinterpret_cast<uintptr_t>(aligned_pointer); + JXL_ASSERT(payload % kAlignment == 0); + const AllocationHeader* header = + reinterpret_cast<const AllocationHeader*>(payload) - 1; + + // Subtract (2's complement negation). + bytes_in_use.fetch_add(~header->allocated_size + 1, + std::memory_order_acq_rel); + +#if JXL_USE_MMAP + munmap(header->allocated, header->allocated_size); +#else + free(header->allocated); +#endif +} + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/base/cache_aligned.h b/third_party/jpeg-xl/lib/jxl/base/cache_aligned.h new file mode 100644 index 0000000000..e57df14837 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/base/cache_aligned.h @@ -0,0 +1,74 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#ifndef LIB_JXL_BASE_CACHE_ALIGNED_H_ +#define LIB_JXL_BASE_CACHE_ALIGNED_H_ + +// Memory allocator with support for alignment + misalignment. + +#include <stddef.h> +#include <stdint.h> + +#include <memory> + +#include "lib/jxl/base/compiler_specific.h" + +namespace jxl { + +// Functions that depend on the cache line size. +class CacheAligned { + public: + static void PrintStats(); + + static constexpr size_t kPointerSize = sizeof(void*); + static constexpr size_t kCacheLineSize = 64; + // To avoid RFOs, match L2 fill size (pairs of lines). + static constexpr size_t kAlignment = 2 * kCacheLineSize; + // Minimum multiple for which cache set conflicts and/or loads blocked by + // preceding stores can occur. + static constexpr size_t kAlias = 2048; + + // Returns a 'random' (cyclical) offset suitable for Allocate. + static size_t NextOffset(); + + // Returns null or memory whose address is congruent to `offset` (mod kAlias). + // This reduces cache conflicts and load/store stalls, especially with large + // allocations that would otherwise have similar alignments. At least + // `payload_size` (which can be zero) bytes will be accessible. + static void* Allocate(size_t payload_size, size_t offset); + + static void* Allocate(const size_t payload_size) { + return Allocate(payload_size, NextOffset()); + } + + static void Free(const void* aligned_pointer); +}; + +// Avoids the need for a function pointer (deleter) in CacheAlignedUniquePtr. +struct CacheAlignedDeleter { + void operator()(uint8_t* aligned_pointer) const { + return CacheAligned::Free(aligned_pointer); + } +}; + +using CacheAlignedUniquePtr = std::unique_ptr<uint8_t[], CacheAlignedDeleter>; + +// Does not invoke constructors. +static inline CacheAlignedUniquePtr AllocateArray(const size_t bytes) { + return CacheAlignedUniquePtr( + static_cast<uint8_t*>(CacheAligned::Allocate(bytes)), + CacheAlignedDeleter()); +} + +static inline CacheAlignedUniquePtr AllocateArray(const size_t bytes, + const size_t offset) { + return CacheAlignedUniquePtr( + static_cast<uint8_t*>(CacheAligned::Allocate(bytes, offset)), + CacheAlignedDeleter()); +} + +} // namespace jxl + +#endif // LIB_JXL_BASE_CACHE_ALIGNED_H_ diff --git a/third_party/jpeg-xl/lib/jxl/base/compiler_specific.h b/third_party/jpeg-xl/lib/jxl/base/compiler_specific.h new file mode 100644 index 0000000000..abe1261f48 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/base/compiler_specific.h @@ -0,0 +1,157 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_BASE_COMPILER_SPECIFIC_H_ +#define LIB_JXL_BASE_COMPILER_SPECIFIC_H_ + +// Macros for compiler version + nonstandard keywords, e.g. __builtin_expect. + +#include <stdint.h> +#include <sys/types.h> + +#include "lib/jxl/base/sanitizer_definitions.h" + +// #if is shorter and safer than #ifdef. *_VERSION are zero if not detected, +// otherwise 100 * major + minor version. Note that other packages check for +// #ifdef COMPILER_MSVC, so we cannot use that same name. + +#ifdef _MSC_VER +#define JXL_COMPILER_MSVC _MSC_VER +#else +#define JXL_COMPILER_MSVC 0 +#endif + +#ifdef __GNUC__ +#define JXL_COMPILER_GCC (__GNUC__ * 100 + __GNUC_MINOR__) +#else +#define JXL_COMPILER_GCC 0 +#endif + +#ifdef __clang__ +#define JXL_COMPILER_CLANG (__clang_major__ * 100 + __clang_minor__) +// Clang pretends to be GCC for compatibility. 
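+// (Clang also defines __GNUC__, so JXL_COMPILER_GCC was set above; resetting
+// it to 0 keeps JXL_COMPILER_GCC and JXL_COMPILER_CLANG mutually exclusive.)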
+#undef JXL_COMPILER_GCC
+#define JXL_COMPILER_GCC 0
+#else
+#define JXL_COMPILER_CLANG 0
+#endif
+
+#if JXL_COMPILER_MSVC
+#define JXL_RESTRICT __restrict
+#elif JXL_COMPILER_GCC || JXL_COMPILER_CLANG
+#define JXL_RESTRICT __restrict__
+#else
+#define JXL_RESTRICT
+#endif
+
+#if JXL_COMPILER_MSVC
+#define JXL_INLINE __forceinline
+#define JXL_NOINLINE __declspec(noinline)
+#else
+#define JXL_INLINE inline __attribute__((always_inline))
+#define JXL_NOINLINE __attribute__((noinline))
+#endif
+
+#if JXL_COMPILER_MSVC
+#define JXL_NORETURN __declspec(noreturn)
+#elif JXL_COMPILER_GCC || JXL_COMPILER_CLANG
+#define JXL_NORETURN __attribute__((noreturn))
+#else
+#define JXL_NORETURN
+#endif
+
+#if JXL_COMPILER_MSVC
+#define JXL_UNREACHABLE __assume(false)
+#elif JXL_COMPILER_CLANG || JXL_COMPILER_GCC >= 405
+#define JXL_UNREACHABLE __builtin_unreachable()
+#else
+#define JXL_UNREACHABLE
+#endif
+
+#if JXL_COMPILER_MSVC
+#define JXL_MAYBE_UNUSED
+#else
+// Encountered "attribute list cannot appear here" when using the C++17
+// [[maybe_unused]], so only use the old style attribute for now.
+#define JXL_MAYBE_UNUSED __attribute__((unused))
+#endif
+
+// MSAN execution won't be hurt if some code is not inlined, but skipping
+// inlining can greatly improve compilation time. Unfortunately this macro
+// cannot be used everywhere: inside header files it leads to "multiple
+// definition" errors, though it would be better not to have JXL_INLINE in
+// headers at all.
+#if JXL_MEMORY_SANITIZER || JXL_ADDRESS_SANITIZER || JXL_THREAD_SANITIZER
+#define JXL_MAYBE_INLINE JXL_MAYBE_UNUSED
+#else
+#define JXL_MAYBE_INLINE JXL_INLINE
+#endif
+
+#if JXL_COMPILER_MSVC
+// Unsupported, __assume is not the same.
+#define JXL_LIKELY(expr) expr
+#define JXL_UNLIKELY(expr) expr
+#else
+#define JXL_LIKELY(expr) __builtin_expect(!!(expr), 1)
+#define JXL_UNLIKELY(expr) __builtin_expect(!!(expr), 0)
+#endif
+
+// Returns a void* pointer which the compiler then assumes is N-byte aligned.
+// Example: float* JXL_RESTRICT aligned = (float*)JXL_ASSUME_ALIGNED(in, 32);
+//
+// The assignment semantics are required by GCC/Clang. ICC provides an in-place
+// __assume_aligned, whereas MSVC's __assume appears unsuitable.
+#if JXL_COMPILER_CLANG
+// Early versions of Clang did not support __builtin_assume_aligned.
+#define JXL_HAS_ASSUME_ALIGNED __has_builtin(__builtin_assume_aligned)
+#elif JXL_COMPILER_GCC
+#define JXL_HAS_ASSUME_ALIGNED 1
+#else
+#define JXL_HAS_ASSUME_ALIGNED 0
+#endif
+
+#if JXL_HAS_ASSUME_ALIGNED
+#define JXL_ASSUME_ALIGNED(ptr, align) __builtin_assume_aligned((ptr), (align))
+#else
+#define JXL_ASSUME_ALIGNED(ptr, align) (ptr) /* not supported */
+#endif
+
+#ifdef __has_attribute
+#define JXL_HAVE_ATTRIBUTE(x) __has_attribute(x)
+#else
+#define JXL_HAVE_ATTRIBUTE(x) 0
+#endif
+
+// Raises warnings if the function return value is unused. Should appear as the
+// first part of a function definition/declaration.
+#if JXL_HAVE_ATTRIBUTE(nodiscard)
+#define JXL_MUST_USE_RESULT [[nodiscard]]
+#elif JXL_COMPILER_CLANG && JXL_HAVE_ATTRIBUTE(warn_unused_result)
+#define JXL_MUST_USE_RESULT __attribute__((warn_unused_result))
+#else
+#define JXL_MUST_USE_RESULT
+#endif
+
+// Disables certain -fsanitize checks for functions that intentionally rely on
+// behavior such as unsigned integer overflow. For example, add
+// JXL_NO_SANITIZE("unsigned-integer-overflow") to the function declaration to
+// silence unsigned-integer-overflow UBSan messages.
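+// Usage sketch (WrappingAdd is a hypothetical function, not part of this
+// header):
+//   JXL_NO_SANITIZE("unsigned-integer-overflow")
+//   static uint32_t WrappingAdd(uint32_t a, uint32_t b) { return a + b; }
+// Under Clang with UBSan enabled the attribute suppresses the report; with
+// other compilers the macro expands to nothing.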
+#if JXL_COMPILER_CLANG && JXL_HAVE_ATTRIBUTE(no_sanitize)
+#define JXL_NO_SANITIZE(X) __attribute__((no_sanitize(X)))
+#else
+#define JXL_NO_SANITIZE(X)
+#endif
+
+#if JXL_HAVE_ATTRIBUTE(__format__)
+#define JXL_FORMAT(idx_fmt, idx_arg) \
+  __attribute__((__format__(__printf__, idx_fmt, idx_arg)))
+#else
+#define JXL_FORMAT(idx_fmt, idx_arg)
+#endif
+
+#if JXL_COMPILER_MSVC
+using ssize_t = intptr_t;
+#endif
+
+#endif  // LIB_JXL_BASE_COMPILER_SPECIFIC_H_
diff --git a/third_party/jpeg-xl/lib/jxl/base/data_parallel.cc b/third_party/jpeg-xl/lib/jxl/base/data_parallel.cc
new file mode 100644
index 0000000000..20a911255c
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/base/data_parallel.cc
@@ -0,0 +1,23 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/base/data_parallel.h"
+
+namespace jxl {
+
+// static
+JxlParallelRetCode ThreadPool::SequentialRunnerStatic(
+    void* runner_opaque, void* jpegxl_opaque, JxlParallelRunInit init,
+    JxlParallelRunFunction func, uint32_t start_range, uint32_t end_range) {
+  JxlParallelRetCode init_ret = (*init)(jpegxl_opaque, 1);
+  if (init_ret != 0) return init_ret;
+
+  for (uint32_t i = start_range; i < end_range; i++) {
+    (*func)(jpegxl_opaque, i, 0);
+  }
+  return 0;
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/base/data_parallel.h b/third_party/jpeg-xl/lib/jxl/base/data_parallel.h
new file mode 100644
index 0000000000..ba7e7adfad
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/base/data_parallel.h
@@ -0,0 +1,120 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_BASE_DATA_PARALLEL_H_
+#define LIB_JXL_BASE_DATA_PARALLEL_H_
+
+// Portable, low-overhead C++11 ThreadPool alternative to OpenMP for
+// data-parallel computations.
+
+#include <jxl/parallel_runner.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include "lib/jxl/base/bits.h"
+#include "lib/jxl/base/status.h"
+#if JXL_COMPILER_MSVC
+// suppress warnings about the const & applied to function types
+#pragma warning(disable : 4180)
+#endif
+
+namespace jxl {
+
+class ThreadPool {
+ public:
+  ThreadPool(JxlParallelRunner runner, void* runner_opaque)
+      : runner_(runner ? runner : &ThreadPool::SequentialRunnerStatic),
+        runner_opaque_(runner ? runner_opaque : static_cast<void*>(this)) {}
+
+  ThreadPool(const ThreadPool&) = delete;
+  ThreadPool& operator=(const ThreadPool&) = delete;
+
+  JxlParallelRunner runner() const { return runner_; }
+  void* runner_opaque() const { return runner_opaque_; }
+
+  // Runs init_func(num_threads) followed by data_func(task, thread) on worker
+  // thread(s) for every task in [begin, end). init_func() must return a Status
+  // indicating whether the initialization succeeded.
+  // "thread" is an integer smaller than num_threads.
+  // Not thread-safe - no two calls to Run may overlap.
+  // Subsequent calls will reuse the same threads.
+  //
+  // Precondition: begin <= end.
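+  // Usage sketch (hypothetical call site; Scratch and ProcessRow are
+  // illustrative): give each worker its own scratch buffer, then process
+  // rows 0..height-1 in parallel:
+  //   std::vector<Scratch> scratch;
+  //   JXL_RETURN_IF_ERROR(RunOnPool(
+  //       pool, 0, height,
+  //       [&](size_t num_threads) { scratch.resize(num_threads); return true; },
+  //       [&](uint32_t y, size_t thread) { ProcessRow(y, &scratch[thread]); },
+  //       "ProcessRows"));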
+  template <class InitFunc, class DataFunc>
+  Status Run(uint32_t begin, uint32_t end, const InitFunc& init_func,
+             const DataFunc& data_func, const char* caller = "") {
+    JXL_ASSERT(begin <= end);
+    if (begin == end) return true;
+    RunCallState<InitFunc, DataFunc> call_state(init_func, data_func);
+    // The runner_ uses the C convention and returns 0 in case of success, so
+    // we convert it to a Status.
+    return (*runner_)(runner_opaque_, static_cast<void*>(&call_state),
+                      &call_state.CallInitFunc, &call_state.CallDataFunc, begin,
+                      end) == 0;
+  }
+
+  // Use this as init_func when no initialization is needed.
+  static Status NoInit(size_t num_threads) { return true; }
+
+ private:
+  // Class holding the state of a Run() call to pass to the runner_ as an
+  // opaque_jpegxl pointer.
+  template <class InitFunc, class DataFunc>
+  class RunCallState final {
+   public:
+    RunCallState(const InitFunc& init_func, const DataFunc& data_func)
+        : init_func_(init_func), data_func_(data_func) {}
+
+    // JxlParallelRunInit interface.
+    static int CallInitFunc(void* jpegxl_opaque, size_t num_threads) {
+      const auto* self =
+          static_cast<RunCallState<InitFunc, DataFunc>*>(jpegxl_opaque);
+      // Returns -1 when the internal init function returns a false Status,
+      // to indicate an error.
+      return self->init_func_(num_threads) ? 0 : -1;
+    }
+
+    // JxlParallelRunFunction interface.
+    static void CallDataFunc(void* jpegxl_opaque, uint32_t value,
+                             size_t thread_id) {
+      const auto* self =
+          static_cast<RunCallState<InitFunc, DataFunc>*>(jpegxl_opaque);
+      return self->data_func_(value, thread_id);
+    }
+
+   private:
+    const InitFunc& init_func_;
+    const DataFunc& data_func_;
+  };
+
+  // Default JxlParallelRunner used when no runner is provided by the
+  // caller. This runner doesn't use any threading and thread_id is always 0.
+  static JxlParallelRetCode SequentialRunnerStatic(
+      void* runner_opaque, void* jpegxl_opaque, JxlParallelRunInit init,
+      JxlParallelRunFunction func, uint32_t start_range, uint32_t end_range);
+
+  // The caller supplied runner function and its opaque void*.
+  const JxlParallelRunner runner_;
+  void* const runner_opaque_;
};
+
+template <class InitFunc, class DataFunc>
+Status RunOnPool(ThreadPool* pool, const uint32_t begin, const uint32_t end,
+                 const InitFunc& init_func, const DataFunc& data_func,
+                 const char* caller) {
+  if (pool == nullptr) {
+    ThreadPool default_pool(nullptr, nullptr);
+    return default_pool.Run(begin, end, init_func, data_func, caller);
+  } else {
+    return pool->Run(begin, end, init_func, data_func, caller);
+  }
+}
+
+}  // namespace jxl
+#if JXL_COMPILER_MSVC
+#pragma warning(default : 4180)
+#endif
+
+#endif  // LIB_JXL_BASE_DATA_PARALLEL_H_
diff --git a/third_party/jpeg-xl/lib/jxl/base/file_io.h b/third_party/jpeg-xl/lib/jxl/base/file_io.h
new file mode 100644
index 0000000000..64d5860915
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/base/file_io.h
@@ -0,0 +1,153 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_BASE_FILE_IO_H_
+#define LIB_JXL_BASE_FILE_IO_H_
+
+// Helper functions for reading/writing files.
+
+#include <stdio.h>
+#include <sys/stat.h>
+
+#include <list>
+#include <string>
+#include <vector>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/padded_bytes.h"
+#include "lib/jxl/base/status.h"
+
+namespace jxl {
+
+// Returns extension including the dot, or empty string if none.
Assumes +// filename is not a hidden file (e.g. ".bashrc"). May be called with a pathname +// if the filename contains a dot and/or no other path component does. +static inline std::string Extension(const std::string& filename) { + const size_t pos = filename.rfind('.'); + if (pos == std::string::npos) return std::string(); + return filename.substr(pos); +} + +// RAII, ensures files are closed even when returning early. +class FileWrapper { + public: + FileWrapper(const FileWrapper& other) = delete; + FileWrapper& operator=(const FileWrapper& other) = delete; + + explicit FileWrapper(const std::string& pathname, const char* mode) + : file_(pathname == "-" ? (mode[0] == 'r' ? stdin : stdout) + : fopen(pathname.c_str(), mode)), + close_on_delete_(pathname != "-") { +#ifdef _WIN32 + struct __stat64 s = {}; + const int err = _stat64(pathname.c_str(), &s); + const bool is_file = (s.st_mode & S_IFREG) != 0; +#else + struct stat s = {}; + const int err = stat(pathname.c_str(), &s); + const bool is_file = S_ISREG(s.st_mode); +#endif + if (err == 0 && is_file) { + size_ = s.st_size; + } + } + + ~FileWrapper() { + if (file_ != nullptr && close_on_delete_) { + const int err = fclose(file_); + JXL_CHECK(err == 0); + } + } + + // We intend to use FileWrapper as a replacement of FILE. + // NOLINTNEXTLINE(google-explicit-constructor) + operator FILE*() const { return file_; } + + int64_t size() { return size_; } + + private: + FILE* const file_; + bool close_on_delete_ = true; + int64_t size_ = -1; +}; + +template <typename ContainerType> +static inline Status ReadFile(const std::string& pathname, + ContainerType* JXL_RESTRICT bytes) { + FileWrapper f(pathname, "rb"); + if (f == nullptr) + return JXL_FAILURE("Failed to open file for reading: %s", pathname.c_str()); + + // Get size of file in bytes + const int64_t size = f.size(); + if (size < 0) { + // Size is unknown, loop reading chunks until EOF. + bytes->clear(); + std::list<std::vector<uint8_t>> chunks; + + size_t total_size = 0; + while (true) { + std::vector<uint8_t> chunk(16 * 1024); + const size_t bytes_read = fread(chunk.data(), 1, chunk.size(), f); + if (ferror(f) || bytes_read > chunk.size()) { + return JXL_FAILURE("Error reading %s", pathname.c_str()); + } + + chunk.resize(bytes_read); + total_size += bytes_read; + if (bytes_read != 0) { + chunks.emplace_back(std::move(chunk)); + } + if (feof(f)) { + break; + } + } + bytes->resize(total_size); + size_t pos = 0; + for (const auto& chunk : chunks) { + // Needed in case ContainerType is std::string, whose data() is const. + char* bytes_writable = reinterpret_cast<char*>(&(*bytes)[0]); + memcpy(bytes_writable + pos, chunk.data(), chunk.size()); + pos += chunk.size(); + } + } else { + // Size is known, read the file directly. + bytes->resize(static_cast<size_t>(size)); + size_t pos = 0; + while (pos < bytes->size()) { + // Needed in case ContainerType is std::string, whose data() is const. 
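+      // (Before C++17, std::string::data() only returns const char*, hence
+      // the writable pointer is obtained via &(*bytes)[0] and a cast.)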
+ char* bytes_writable = reinterpret_cast<char*>(&(*bytes)[0]); + const size_t bytes_read = + fread(bytes_writable + pos, 1, bytes->size() - pos, f); + if (bytes_read == 0) return JXL_FAILURE("Failed to read"); + pos += bytes_read; + } + JXL_ASSERT(pos == bytes->size()); + } + return true; +} + +template <typename ContainerType> +static inline Status WriteFile(const ContainerType& bytes, + const std::string& pathname) { + FileWrapper f(pathname, "wb"); + if (f == nullptr) + return JXL_FAILURE("Failed to open file for writing: %s", pathname.c_str()); + + size_t pos = 0; + while (pos < bytes.size()) { + const size_t bytes_written = + fwrite(bytes.data() + pos, 1, bytes.size() - pos, f); + if (bytes_written == 0) return JXL_FAILURE("Failed to write"); + pos += bytes_written; + } + JXL_ASSERT(pos == bytes.size()); + + return true; +} + +} // namespace jxl + +#endif // LIB_JXL_BASE_FILE_IO_H_ diff --git a/third_party/jpeg-xl/lib/jxl/base/float.h b/third_party/jpeg-xl/lib/jxl/base/float.h new file mode 100644 index 0000000000..90bdeedf54 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/base/float.h @@ -0,0 +1,98 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_BASE_FLOAT_H_ +#define LIB_JXL_BASE_FLOAT_H_ + +#include <jxl/types.h> +#include <stddef.h> +#include <stdint.h> +#include <string.h> + +#include "lib/jxl/base/byte_order.h" +#include "lib/jxl/base/status.h" + +namespace jxl { + +namespace { +// Based on highway scalar implementation, for testing +float LoadFloat16(uint16_t bits16) { + const uint32_t sign = bits16 >> 15; + const uint32_t biased_exp = (bits16 >> 10) & 0x1F; + const uint32_t mantissa = bits16 & 0x3FF; + + // Subnormal or zero + if (biased_exp == 0) { + const float subnormal = (1.0f / 16384) * (mantissa * (1.0f / 1024)); + return sign ? -subnormal : subnormal; + } + + // Normalized: convert the representation directly (faster than ldexp/tables). 
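+  // Example: bits16 = 0x3C00 (half-precision 1.0) has sign 0, biased_exp 15
+  // and mantissa 0; below, biased_exp32 = 15 + 112 = 127 and
+  // bits32 = 0x3F800000, which is 1.0f. Likewise 0xC000 (-2.0) maps to
+  // 0xC0000000.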
+ const uint32_t biased_exp32 = biased_exp + (127 - 15); + const uint32_t mantissa32 = mantissa << (23 - 10); + const uint32_t bits32 = (sign << 31) | (biased_exp32 << 23) | mantissa32; + + float result; + memcpy(&result, &bits32, 4); + return result; +} +} // namespace + +template <typename SaveFloatAtFn> +static Status JXL_INLINE LoadFloatRow(const uint8_t* src, size_t count, + size_t stride, JxlDataType type, + bool little_endian, float scale, + SaveFloatAtFn callback) { + switch (type) { + case JXL_TYPE_FLOAT: + if (little_endian) { + for (size_t i = 0; i < count; ++i) { + callback(i, LoadLEFloat(src + stride * i)); + } + } else { + for (size_t i = 0; i < count; ++i) { + callback(i, LoadBEFloat(src + stride * i)); + } + } + return true; + + case JXL_TYPE_UINT8: + for (size_t i = 0; i < count; ++i) { + callback(i, src[stride * i] * scale); + } + return true; + + case JXL_TYPE_UINT16: + if (little_endian) { + for (size_t i = 0; i < count; ++i) { + callback(i, LoadLE16(src + stride * i) * scale); + } + } else { + for (size_t i = 0; i < count; ++i) { + callback(i, LoadBE16(src + stride * i) * scale); + } + } + return true; + + case JXL_TYPE_FLOAT16: + if (little_endian) { + for (size_t i = 0; i < count; ++i) { + callback(i, LoadFloat16(LoadLE16(src + stride * i))); + } + } else { + for (size_t i = 0; i < count; ++i) { + callback(i, LoadFloat16(LoadBE16(src + stride * i))); + } + } + return true; + + default: + return JXL_FAILURE("Unsupported sample format"); + } +} + +} // namespace jxl + +#endif // LIB_JXL_BASE_FLOAT_H_ diff --git a/third_party/jpeg-xl/lib/jxl/base/iaca.h b/third_party/jpeg-xl/lib/jxl/base/iaca.h new file mode 100644 index 0000000000..e5732dae5c --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/base/iaca.h @@ -0,0 +1,65 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_BASE_IACA_H_ +#define LIB_JXL_BASE_IACA_H_ + +#include "lib/jxl/base/compiler_specific.h" + +// IACA (Intel's Code Analyzer) analyzes instruction latencies, but only for +// code between special markers. These functions embed such markers in an +// executable, but only for reading via IACA - they deliberately trigger a +// crash if executed to ensure they are removed in normal builds. + +#ifndef JXL_IACA_ENABLED +#define JXL_IACA_ENABLED 0 +#endif + +namespace jxl { + +// Call before the region of interest. +static JXL_INLINE void BeginIACA() { +#if JXL_IACA_ENABLED && (JXL_COMPILER_GCC || JXL_COMPILER_CLANG) + asm volatile( + // UD2 "instruction" raises an invalid opcode exception. + ".byte 0x0F, 0x0B\n\t" + // Magic sequence recognized by IACA (MOV + addr32 fs:NOP). This actually + // clobbers EBX, but we don't care because the code won't be run, and we + // want IACA to observe the same code the compiler would have generated + // without this marker. + "movl $111, %%ebx\n\t" + ".byte 0x64, 0x67, 0x90\n\t" + : + : + // (Allegedly) clobbering memory may prevent reordering. + : "memory"); +#endif +} + +// Call after the region of interest. +static JXL_INLINE void EndIACA() { +#if JXL_IACA_ENABLED && (JXL_COMPILER_GCC || JXL_COMPILER_CLANG) + asm volatile( + // See above. + "movl $222, %%ebx\n\t" + ".byte 0x64, 0x67, 0x90\n\t" + // UD2 + ".byte 0x0F, 0x0B\n\t" + : + : + // (Allegedly) clobbering memory may prevent reordering. + : "memory"); +#endif +} + +// Add to a scope to mark a region. 
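+// Usage sketch (hypothetical hot loop): brace the region of interest so only
+// it is analyzed:
+//   {
+//     ScopeIACA iaca;
+//     for (size_t i = 0; i < n; ++i) out[i] = in[i] * 2.0f;
+//   }
+// The constructor emits the begin marker and the destructor the end marker.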
+struct ScopeIACA { + JXL_INLINE ScopeIACA() { BeginIACA(); } + JXL_INLINE ~ScopeIACA() { EndIACA(); } +}; + +} // namespace jxl + +#endif // LIB_JXL_BASE_IACA_H_ diff --git a/third_party/jpeg-xl/lib/jxl/base/os_macros.h b/third_party/jpeg-xl/lib/jxl/base/os_macros.h new file mode 100644 index 0000000000..84d0b82bf5 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/base/os_macros.h @@ -0,0 +1,50 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_BASE_OS_MACROS_H_ +#define LIB_JXL_BASE_OS_MACROS_H_ + +// Defines the JXL_OS_* macros. + +#if defined(_WIN32) || defined(_WIN64) +#define JXL_OS_WIN 1 +#else +#define JXL_OS_WIN 0 +#endif + +#ifdef __linux__ +#define JXL_OS_LINUX 1 +#else +#define JXL_OS_LINUX 0 +#endif + +#ifdef __APPLE__ +#define JXL_OS_MAC 1 +#else +#define JXL_OS_MAC 0 +#endif + +#define JXL_OS_IOS 0 +#ifdef __APPLE__ +#include <TargetConditionals.h> +#if TARGET_OS_IPHONE +#undef JXL_OS_IOS +#define JXL_OS_IOS 1 +#endif +#endif + +#ifdef __FreeBSD__ +#define JXL_OS_FREEBSD 1 +#else +#define JXL_OS_FREEBSD 0 +#endif + +#ifdef __HAIKU__ +#define JXL_OS_HAIKU 1 +#else +#define JXL_OS_HAIKU 0 +#endif + +#endif // LIB_JXL_BASE_OS_MACROS_H_ diff --git a/third_party/jpeg-xl/lib/jxl/base/override.h b/third_party/jpeg-xl/lib/jxl/base/override.h new file mode 100644 index 0000000000..1f8b657974 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/base/override.h @@ -0,0 +1,29 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_BASE_OVERRIDE_H_ +#define LIB_JXL_BASE_OVERRIDE_H_ + +// 'Trool' for command line arguments: force enable/disable, or use default. + +namespace jxl { + +// No effect if kDefault, otherwise forces a feature (typically a FrameHeader +// flag) on or off. +enum class Override : int { kOn = 1, kOff = 0, kDefault = -1 }; + +static inline Override OverrideFromBool(bool flag) { + return flag ? Override::kOn : Override::kOff; +} + +static inline bool ApplyOverride(Override o, bool default_condition) { + if (o == Override::kOn) return true; + if (o == Override::kOff) return false; + return default_condition; +} + +} // namespace jxl + +#endif // LIB_JXL_BASE_OVERRIDE_H_ diff --git a/third_party/jpeg-xl/lib/jxl/base/padded_bytes.cc b/third_party/jpeg-xl/lib/jxl/base/padded_bytes.cc new file mode 100644 index 0000000000..11e4bff6fe --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/base/padded_bytes.cc @@ -0,0 +1,63 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/base/padded_bytes.h" + +namespace jxl { + +void PaddedBytes::IncreaseCapacityTo(size_t capacity) { + JXL_ASSERT(capacity > capacity_); + + size_t new_capacity = std::max(capacity, 3 * capacity_ / 2); + new_capacity = std::max<size_t>(64, new_capacity); + + // BitWriter writes up to 7 bytes past the end. + CacheAlignedUniquePtr new_data = AllocateArray(new_capacity + 8); + if (new_data == nullptr) { + // Allocation failed, discard all data to ensure this is noticed. + size_ = capacity_ = 0; + return; + } + + if (data_ == nullptr) { + // First allocation: ensure first byte is initialized (won't be copied). 
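+    // (BitWriter appends by OR-ing bits into the byte at the current write
+    // position, so that byte must start out zero - hence this guarantee.)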
+ new_data[0] = 0; + } else { + // Subsequent resize: copy existing data to new location. + memcpy(new_data.get(), data_.get(), size_); + // Ensure that the first new byte is initialized, to allow write_bits to + // safely append to the newly-resized PaddedBytes. + new_data[size_] = 0; + } + + capacity_ = new_capacity; + std::swap(new_data, data_); +} + +void PaddedBytes::assign(const uint8_t* new_begin, const uint8_t* new_end) { + JXL_DASSERT(new_begin <= new_end); + const size_t new_size = static_cast<size_t>(new_end - new_begin); + + // memcpy requires non-overlapping ranges, and resizing might invalidate the + // new range. Neither happens if the new range is completely to the left or + // right of the _allocated_ range (irrespective of size_). + const uint8_t* allocated_end = begin() + capacity_; + const bool outside = new_end <= begin() || new_begin >= allocated_end; + if (outside) { + resize(new_size); // grow or shrink + memcpy(data(), new_begin, new_size); + return; + } + + // There is overlap. The new size cannot be larger because we own the memory + // and the new range cannot include anything outside the allocated range. + JXL_ASSERT(new_size <= capacity_); + + // memmove allows overlap and capacity_ is sufficient. + memmove(data(), new_begin, new_size); + size_ = new_size; // shrink +} + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/base/padded_bytes.h b/third_party/jpeg-xl/lib/jxl/base/padded_bytes.h new file mode 100644 index 0000000000..4534ddf863 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/base/padded_bytes.h @@ -0,0 +1,197 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_BASE_PADDED_BYTES_H_ +#define LIB_JXL_BASE_PADDED_BYTES_H_ + +// std::vector replacement with padding to reduce bounds checks in WriteBits + +#include <stddef.h> +#include <stdint.h> +#include <string.h> // memcpy + +#include <algorithm> // max +#include <initializer_list> +#include <utility> // swap + +#include "lib/jxl/base/cache_aligned.h" +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/status.h" + +namespace jxl { + +// Provides a subset of the std::vector interface with some differences: +// - allows BitWriter to write 64 bits at a time without bounds checking; +// - ONLY zero-initializes the first byte (required by BitWriter); +// - ensures cache-line alignment. +class PaddedBytes { + public: + // Required for output params. + PaddedBytes() : size_(0), capacity_(0) {} + + explicit PaddedBytes(size_t size) : size_(size), capacity_(0) { + if (size != 0) IncreaseCapacityTo(size); + } + + PaddedBytes(size_t size, uint8_t value) : size_(size), capacity_(0) { + if (size != 0) { + IncreaseCapacityTo(size); + } + if (size_ != 0) { + memset(data(), value, size); + } + } + + PaddedBytes(const PaddedBytes& other) : size_(other.size_), capacity_(0) { + if (size_ != 0) IncreaseCapacityTo(size_); + if (data() != nullptr) memcpy(data(), other.data(), size_); + } + PaddedBytes& operator=(const PaddedBytes& other) { + // Self-assignment is safe. + resize(other.size()); + if (data() != nullptr) memmove(data(), other.data(), size_); + return *this; + } + + // default is not OK - need to set other.size_ to 0! 
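+  // Example of the failure mode: a defaulted move would null out other.data_
+  // yet leave other.size_ unchanged, so other.size() would still claim
+  // elements while other.data() is nullptr, and a later other[0] would
+  // dereference null.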
+ PaddedBytes(PaddedBytes&& other) noexcept + : size_(other.size_), + capacity_(other.capacity_), + data_(std::move(other.data_)) { + other.size_ = other.capacity_ = 0; + } + PaddedBytes& operator=(PaddedBytes&& other) noexcept { + size_ = other.size_; + capacity_ = other.capacity_; + data_ = std::move(other.data_); + + if (&other != this) { + other.size_ = other.capacity_ = 0; + } + return *this; + } + + void swap(PaddedBytes& other) { + std::swap(size_, other.size_); + std::swap(capacity_, other.capacity_); + std::swap(data_, other.data_); + } + + void reserve(size_t capacity) { + if (capacity > capacity_) IncreaseCapacityTo(capacity); + } + + // NOTE: unlike vector, this does not initialize the new data! + // However, we guarantee that write_bits can safely append after + // the resize, as we zero-initialize the first new byte of data. + // If size < capacity(), does not invalidate the memory. + void resize(size_t size) { + if (size > capacity_) IncreaseCapacityTo(size); + size_ = (data() == nullptr) ? 0 : size; + } + + // resize(size) plus explicit initialization of the new data with `value`. + void resize(size_t size, uint8_t value) { + size_t old_size = size_; + resize(size); + if (size_ > old_size) { + memset(data() + old_size, value, size_ - old_size); + } + } + + // Amortized constant complexity due to exponential growth. + void push_back(uint8_t x) { + if (size_ == capacity_) { + IncreaseCapacityTo(capacity_ + 1); + if (data() == nullptr) return; + } + + data_[size_++] = x; + } + + size_t size() const { return size_; } + size_t capacity() const { return capacity_; } + + uint8_t* data() { return data_.get(); } + const uint8_t* data() const { return data_.get(); } + + // std::vector operations implemented in terms of the public interface above. + + void clear() { resize(0); } + bool empty() const { return size() == 0; } + + void assign(std::initializer_list<uint8_t> il) { + resize(il.size()); + memcpy(data(), il.begin(), il.size()); + } + + // Replaces data() with [new_begin, new_end); potentially reallocates. + void assign(const uint8_t* new_begin, const uint8_t* new_end); + + uint8_t* begin() { return data(); } + const uint8_t* begin() const { return data(); } + uint8_t* end() { return begin() + size(); } + const uint8_t* end() const { return begin() + size(); } + + uint8_t& operator[](const size_t i) { + BoundsCheck(i); + return data()[i]; + } + const uint8_t& operator[](const size_t i) const { + BoundsCheck(i); + return data()[i]; + } + + uint8_t& back() { + JXL_ASSERT(size() != 0); + return data()[size() - 1]; + } + const uint8_t& back() const { + JXL_ASSERT(size() != 0); + return data()[size() - 1]; + } + + template <typename T> + void append(const T& other) { + append(reinterpret_cast<const uint8_t*>(other.data()), + reinterpret_cast<const uint8_t*>(other.data()) + other.size()); + } + + void append(const uint8_t* begin, const uint8_t* end) { + if (end - begin > 0) { + size_t old_size = size(); + resize(size() + (end - begin)); + memcpy(data() + old_size, begin, end - begin); + } + } + + private: + void BoundsCheck(size_t i) const { + // <= is safe due to padding and required by BitWriter. + JXL_ASSERT(i <= size()); + } + + // Copies existing data to newly allocated "data_". If allocation fails, + // data() == nullptr and size_ = capacity_ = 0. + // The new capacity will be at least 1.5 times the old capacity. This ensures + // that we avoid quadratic behaviour. 
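+  // Example: repeated push_back grows the capacity 64 -> 96 -> 144 -> 216
+  // -> ... (each step is max(requested, 3 * capacity_ / 2), with a floor of
+  // 64), so n appends trigger only O(log n) reallocations and O(n) copied
+  // bytes in total, versus O(n^2) for a fixed increment.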
+ void IncreaseCapacityTo(size_t capacity); + + size_t size_; + size_t capacity_; + CacheAlignedUniquePtr data_; +}; + +template <typename T> +static inline void Append(const T& s, PaddedBytes* out, + size_t* JXL_RESTRICT byte_pos) { + memcpy(out->data() + *byte_pos, s.data(), s.size()); + *byte_pos += s.size(); + JXL_CHECK(*byte_pos <= out->size()); +} + +} // namespace jxl + +#endif // LIB_JXL_BASE_PADDED_BYTES_H_ diff --git a/third_party/jpeg-xl/lib/jxl/base/printf_macros.h b/third_party/jpeg-xl/lib/jxl/base/printf_macros.h new file mode 100644 index 0000000000..3215052afd --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/base/printf_macros.h @@ -0,0 +1,34 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_BASE_PRINTF_MACROS_H_ +#define LIB_JXL_BASE_PRINTF_MACROS_H_ + +// Format string macros. These should be included after any other system +// library since those may unconditionally define these, depending on the +// platform. + +// PRIuS and PRIdS macros to print size_t and ssize_t respectively. +#if !defined(PRIdS) +#if defined(_WIN64) +#define PRIdS "lld" +#elif defined(_WIN32) +#define PRIdS "d" +#else +#define PRIdS "zd" +#endif +#endif // PRIdS + +#if !defined(PRIuS) +#if defined(_WIN64) +#define PRIuS "llu" +#elif defined(_WIN32) +#define PRIuS "u" +#else +#define PRIuS "zu" +#endif +#endif // PRIuS + +#endif // LIB_JXL_BASE_PRINTF_MACROS_H_ diff --git a/third_party/jpeg-xl/lib/jxl/base/profiler.cc b/third_party/jpeg-xl/lib/jxl/base/profiler.cc new file mode 100644 index 0000000000..a38d9b82b7 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/base/profiler.cc @@ -0,0 +1,540 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/base/profiler.h" + +#if JXL_PROFILER_ENABLED + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> // memcpy + +#include <algorithm> // sort +#include <atomic> +#include <cinttypes> // PRIu64 +#include <hwy/cache_control.h> +#include <limits> +#include <new> + +// Optionally use SIMD in StreamCacheLine if available. +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jxl/base/profiler.cc" +#include <hwy/foreach_target.h> +#include <hwy/highway.h> + +HWY_BEFORE_NAMESPACE(); +namespace jxl { +namespace profiler { +namespace HWY_NAMESPACE { + +// Overwrites `to` without loading it into cache (read-for-ownership). +// Copies 64 bytes from/to naturally aligned addresses. +void StreamCacheLine(const Packet* HWY_RESTRICT from, Packet* HWY_RESTRICT to) { +#if HWY_TARGET == HWY_SCALAR + hwy::CopyBytes<64>(from, to); +#else + const HWY_CAPPED(uint64_t, 2) d; + HWY_FENCE; + const uint64_t* HWY_RESTRICT from64 = reinterpret_cast<const uint64_t*>(from); + const auto v0 = Load(d, from64 + 0); + const auto v1 = Load(d, from64 + 2); + const auto v2 = Load(d, from64 + 4); + const auto v3 = Load(d, from64 + 6); + // Fences prevent the compiler from reordering loads/stores, which may + // interfere with write-combining. 
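+  // (The four 16-byte Stream() stores below fill exactly one 64-byte cache
+  // line with non-temporal writes, which the write-combining buffer can
+  // flush as a single unit.)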
+ HWY_FENCE; + uint64_t* HWY_RESTRICT to64 = reinterpret_cast<uint64_t*>(to); + Stream(v0, d, to64 + 0); + Stream(v1, d, to64 + 2); + Stream(v2, d, to64 + 4); + Stream(v3, d, to64 + 6); + HWY_FENCE; +#endif +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace profiler +} // namespace jxl +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jxl { +namespace profiler { + +HWY_EXPORT(StreamCacheLine); + +namespace { + +// How many mebibytes to allocate (if JXL_PROFILER_ENABLED) per thread that +// enters at least one zone. Once this buffer is full, the thread will analyze +// packets (two per zone), which introduces observer overhead. +#ifndef PROFILER_THREAD_STORAGE +#define PROFILER_THREAD_STORAGE 32ULL +#endif + +#define PROFILER_PRINT_OVERHEAD 0 + +// Upper bounds for fixed-size data structures (guarded via HWY_ASSERT): +constexpr size_t kMaxDepth = 64; // Maximum nesting of zones. +constexpr size_t kMaxZones = 256; // Total number of zones. + +// Stack of active (entered but not exited) zones. POD, uninitialized. +// Used to deduct child duration from the parent's self time. +struct ActiveZone { + const char* name; + uint64_t entry_timestamp; + uint64_t child_total; +}; + +// Totals for all Zones with the same name. POD, must be zero-initialized. +struct ZoneTotals { + uint64_t total_duration; + const char* name; + uint64_t num_calls; +}; + +template <typename T> +inline T ClampedSubtract(const T minuend, const T subtrahend) { + if (subtrahend > minuend) { + return 0; + } + return minuend - subtrahend; +} + +} // namespace + +// Per-thread call graph (stack) and ZoneTotals for each zone. +class Results { + public: + Results() { + // Zero-initialize all accumulators (avoids a check for num_zones_ == 0). + memset(zones_, 0, sizeof(zones_)); + } + + // Used for computing overhead when this thread encounters its first Zone. + // This has no observable effect apart from increasing "analyze_elapsed_". + uint64_t ZoneDuration(const Packet* packets) { + HWY_ASSERT(depth_ == 0); + HWY_ASSERT(num_zones_ == 0); + AnalyzePackets(packets, 2); + const uint64_t duration = zones_[0].total_duration; + zones_[0].num_calls = 0; + zones_[0].total_duration = 0; + HWY_ASSERT(depth_ == 0); + num_zones_ = 0; + return duration; + } + + void SetSelfOverhead(const uint64_t self_overhead) { + self_overhead_ = self_overhead; + } + + void SetChildOverhead(const uint64_t child_overhead) { + child_overhead_ = child_overhead; + } + + // Draw all required information from the packets, which can be discarded + // afterwards. Called whenever this thread's storage is full. + void AnalyzePackets(const Packet* HWY_RESTRICT packets, + const size_t num_packets) { + // Ensures prior weakly-ordered streaming stores are globally visible. 
+ hwy::FlushStream(); + + const uint64_t t0 = TicksBefore(); + + for (size_t i = 0; i < num_packets; ++i) { + const uint64_t timestamp = packets[i].timestamp; + // Entering a zone + if (packets[i].name != nullptr) { + HWY_ASSERT(depth_ < kMaxDepth); + zone_stack_[depth_].name = packets[i].name; + zone_stack_[depth_].entry_timestamp = timestamp; + zone_stack_[depth_].child_total = 0; + ++depth_; + continue; + } + + HWY_ASSERT(depth_ != 0); + const ActiveZone& active = zone_stack_[depth_ - 1]; + const uint64_t duration = timestamp - active.entry_timestamp; + const uint64_t self_duration = ClampedSubtract( + duration, self_overhead_ + child_overhead_ + active.child_total); + + UpdateOrAdd(active.name, 1, self_duration); + --depth_; + + // "Deduct" the nested time from its parent's self_duration. + if (depth_ != 0) { + zone_stack_[depth_ - 1].child_total += duration + child_overhead_; + } + } + + const uint64_t t1 = TicksAfter(); + analyze_elapsed_ += t1 - t0; + } + + // Incorporates results from another thread. Call after all threads have + // exited any zones. + void Assimilate(const Results& other) { + const uint64_t t0 = TicksBefore(); + HWY_ASSERT(depth_ == 0); + HWY_ASSERT(other.depth_ == 0); + + for (size_t i = 0; i < other.num_zones_; ++i) { + const ZoneTotals& zone = other.zones_[i]; + UpdateOrAdd(zone.name, zone.num_calls, zone.total_duration); + } + const uint64_t t1 = TicksAfter(); + analyze_elapsed_ += t1 - t0 + other.analyze_elapsed_; + } + + // Single-threaded. + void Print() { + const uint64_t t0 = TicksBefore(); + MergeDuplicates(); + + // Sort by decreasing total (self) cost. + std::sort(zones_, zones_ + num_zones_, + [](const ZoneTotals& r1, const ZoneTotals& r2) { + return r1.total_duration > r2.total_duration; + }); + + uint64_t total_visible_duration = 0; + for (size_t i = 0; i < num_zones_; ++i) { + const ZoneTotals& r = zones_[i]; + if (r.name[0] != '@') { + total_visible_duration += r.total_duration; + printf("%-40s: %10" PRIu64 " x %15" PRIu64 "= %15" PRIu64 "\n", r.name, + r.num_calls, r.total_duration / r.num_calls, r.total_duration); + } + } + + const uint64_t t1 = TicksAfter(); + analyze_elapsed_ += t1 - t0; + printf("Total clocks during analysis: %" PRIu64 "\n", analyze_elapsed_); + printf("Total clocks measured: %" PRIu64 "\n", total_visible_duration); + } + + // Single-threaded. Clears all results as if no zones had been recorded. + void Reset() { + analyze_elapsed_ = 0; + HWY_ASSERT(depth_ == 0); + num_zones_ = 0; + memset(zone_stack_, 0, sizeof(zone_stack_)); + memset(zones_, 0, sizeof(zones_)); + } + + private: + // Updates ZoneTotals of the same name, or inserts a new one if this thread + // has not yet seen that name. Uses a self-organizing list data structure, + // which avoids dynamic memory allocations and is faster than unordered_map. + void UpdateOrAdd(const char* name, const uint64_t num_calls, + const uint64_t duration) { + // Special case for first zone: (maybe) update, without swapping. + if (zones_[0].name == name) { + zones_[0].total_duration += duration; + zones_[0].num_calls += num_calls; + return; + } + + // Look for a zone with the same name. + for (size_t i = 1; i < num_zones_; ++i) { + if (zones_[i].name == name) { + zones_[i].total_duration += duration; + zones_[i].num_calls += num_calls; + // Swap with predecessor (more conservative than move to front, + // but at least as successful). + std::swap(zones_[i - 1], zones_[i]); + return; + } + } + + // Not found; create a new ZoneTotals. 
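+    // The new zone starts at the back; each later hit swaps it one slot
+    // toward the front, so frequently-seen zones stay near the start of the
+    // linear scan above.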
+ HWY_ASSERT(num_zones_ < kMaxZones); + ZoneTotals* HWY_RESTRICT zone = zones_ + num_zones_; + zone->name = name; + zone->num_calls = num_calls; + zone->total_duration = duration; + ++num_zones_; + } + + // Each instantiation of a function template seems to get its own copy of + // __func__ and GCC doesn't merge them. An N^2 search for duplicates is + // acceptable because we only expect a few dozen zones. + void MergeDuplicates() { + for (size_t i = 0; i < num_zones_; ++i) { + // Add any subsequent duplicates to num_calls and total_duration. + for (size_t j = i + 1; j < num_zones_;) { + if (!strcmp(zones_[i].name, zones_[j].name)) { + zones_[i].num_calls += zones_[j].num_calls; + zones_[i].total_duration += zones_[j].total_duration; + // Fill hole with last item. + zones_[j] = zones_[--num_zones_]; + } else { // Name differed, try next ZoneTotals. + ++j; + } + } + } + } + + uint64_t analyze_elapsed_ = 0; + uint64_t self_overhead_ = 0; + uint64_t child_overhead_ = 0; + + size_t depth_ = 0; // Number of active zones <= kMaxDepth. + size_t num_zones_ = 0; // Number of unique zones <= kMaxZones. + + // After other members to avoid large pointer offsets. + alignas(64) ActiveZone zone_stack_[kMaxDepth]; // Last = newest + alignas(64) ZoneTotals zones_[kMaxZones]; // Self-organizing list +}; + +ThreadSpecific::ThreadSpecific() + : max_packets_(PROFILER_THREAD_STORAGE << 16), // MiB / sizeof(Packet) + packets_(hwy::AllocateAligned<Packet>(max_packets_)), + num_packets_(0), + results_(hwy::MakeUniqueAligned<Results>()) {} + +ThreadSpecific::~ThreadSpecific() {} + +void ThreadSpecific::FlushBuffer() { + if (num_packets_ + kBufferCapacity > max_packets_) { + results_->AnalyzePackets(packets_.get(), num_packets_); + num_packets_ = 0; + } + // This buffering halves observer overhead and decreases the overall + // runtime by about 3%. + HWY_DYNAMIC_DISPATCH(StreamCacheLine) + (buffer_, packets_.get() + num_packets_); + num_packets_ += kBufferCapacity; + buffer_size_ = 0; +} + +void ThreadSpecific::AnalyzeRemainingPackets() { + // Storage full => empty it. + if (num_packets_ + buffer_size_ > max_packets_) { + results_->AnalyzePackets(packets_.get(), num_packets_); + num_packets_ = 0; + } + + // Move buffer to storage + memcpy(packets_.get() + num_packets_, buffer_, buffer_size_ * sizeof(Packet)); + num_packets_ += buffer_size_; + buffer_size_ = 0; + + results_->AnalyzePackets(packets_.get(), num_packets_); + num_packets_ = 0; +} + +namespace { + +class HalfSampleMode { + public: + // Returns mode. "sorted" must be in ascending order. + template <typename T> + T operator()(const T* const HWY_RESTRICT sorted, + const size_t num_values) const { + int64_t center = num_values / 2; + int64_t width = num_values; + + // Zoom in on modal intervals of decreasing width. Stop before we reach + // width=1, i.e. single values, for which there is no "slope". + while (width > 2) { + // Round up so we can still reach the outer edges of odd widths. + width = (width + 1) / 2; + + center = CenterOfIntervalWithMinSlope(sorted, num_values, center, width); + } + + return sorted[center]; // mode := middle value in modal interval. + } + + private: + // Returns center of the densest region [c-radius, c+radius]. 
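+  // Worked example (illustrative values): for sorted = {10, 11, 11, 11, 40}
+  // and radius 1, the window spanning the three 11s has slope 0, the minimum,
+  // so its center (the middle 11) is chosen as the modal interval's center.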
+ template <typename T> + static HWY_INLINE int64_t CenterOfIntervalWithMinSlope( + const T* HWY_RESTRICT sorted, const int64_t total_values, + const int64_t center, const int64_t width) { + const int64_t radius = (width + 1) / 2; + + auto compute_slope = [radius, total_values, sorted]( + int64_t c, int64_t* actual_center = nullptr) { + // For symmetry, check 2*radius+1 values, i.e. [min, max]. + const int64_t min = std::max(c - radius, int64_t(0)); + const int64_t max = std::min(c + radius, total_values - 1); + HWY_ASSERT(min < max); + HWY_ASSERT(sorted[min] <= + sorted[max] + std::numeric_limits<float>::epsilon()); + const float dx = max - min + 1; + const float slope = (sorted[max] - sorted[min]) / dx; + + if (actual_center != nullptr) { + // c may be out of bounds, so return center of the clamped bounds. + *actual_center = (min + max + 1) / 2; + } + return slope; + }; + + // First find min_slope for all centers. + float min_slope = std::numeric_limits<float>::max(); + for (int64_t c = center - radius; c <= center + radius; ++c) { + min_slope = std::min(min_slope, compute_slope(c)); + } + + // Candidates := centers with slope ~= min_slope. + std::vector<int64_t> candidates; + for (int64_t c = center - radius; c <= center + radius; ++c) { + int64_t actual_center; + const float slope = compute_slope(c, &actual_center); + if (slope <= min_slope * 1.001f) { + candidates.push_back(actual_center); + } + } + + // Keep the median. + HWY_ASSERT(!candidates.empty()); + if (candidates.size() == 1) return candidates[0]; + std::nth_element(candidates.begin(), + candidates.begin() + candidates.size() / 2, + candidates.end()); + return candidates[candidates.size() / 2]; + } +}; + +} // namespace + +void ThreadSpecific::ComputeOverhead() { + // Delay after capturing timestamps before/after the actual zone runs. Even + // with frequency throttling disabled, this has a multimodal distribution, + // including 32, 34, 48, 52, 59, 62. + uint64_t self_overhead; + { + const size_t kNumSamples = 32; + uint32_t samples[kNumSamples]; + for (size_t idx_sample = 0; idx_sample < kNumSamples; ++idx_sample) { + const size_t kNumDurations = 1024; + uint32_t durations[kNumDurations]; + + for (size_t idx_duration = 0; idx_duration < kNumDurations; + ++idx_duration) { + { // + PROFILER_ZONE("Dummy Zone (never shown)"); + } + const uint64_t duration = results_->ZoneDuration(buffer_); + buffer_size_ = 0; + durations[idx_duration] = static_cast<uint32_t>(duration); + HWY_ASSERT(num_packets_ == 0); + } + std::sort(durations, durations + kNumDurations); + samples[idx_sample] = HalfSampleMode()(durations, kNumDurations); + } + // Median. + std::sort(samples, samples + kNumSamples); + self_overhead = samples[kNumSamples / 2]; +#if PROFILER_PRINT_OVERHEAD + printf("Overhead: %" PRIu64 "\n", static_cast<uint64_t>(self_overhead)); +#endif + results_->SetSelfOverhead(self_overhead); + } + + // Delay before capturing start timestamp / after end timestamp. + const size_t kNumSamples = 32; + uint32_t samples[kNumSamples]; + for (size_t idx_sample = 0; idx_sample < kNumSamples; ++idx_sample) { + const size_t kNumDurations = 16; + uint32_t durations[kNumDurations]; + for (size_t idx_duration = 0; idx_duration < kNumDurations; + ++idx_duration) { + const size_t kReps = 10000; + // Analysis time should not be included => must fit within buffer. 
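+      // (Each zone contributes exactly two packets: one entry plus one exit,
+      // hence kReps * 2.)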
+ HWY_ASSERT(kReps * 2 < max_packets_); + hwy::FlushStream(); + const uint64_t t0 = TicksBefore(); + for (size_t i = 0; i < kReps; ++i) { + PROFILER_ZONE("Dummy"); + } + hwy::FlushStream(); + const uint64_t t1 = TicksAfter(); + HWY_ASSERT(num_packets_ + buffer_size_ == kReps * 2); + buffer_size_ = 0; + num_packets_ = 0; + const uint64_t avg_duration = (t1 - t0 + kReps / 2) / kReps; + durations[idx_duration] = + static_cast<uint32_t>(ClampedSubtract(avg_duration, self_overhead)); + } + std::sort(durations, durations + kNumDurations); + samples[idx_sample] = HalfSampleMode()(durations, kNumDurations); + } + std::sort(samples, samples + kNumSamples); + const uint64_t child_overhead = samples[9 * kNumSamples / 10]; +#if PROFILER_PRINT_OVERHEAD + printf("Child overhead: %" PRIu64 "\n", + static_cast<uint64_t>(child_overhead)); +#endif + results_->SetChildOverhead(child_overhead); +} + +namespace { + +// Could be a static member of Zone, but that would expose <atomic> in header. +std::atomic<ThreadSpecific*>& GetHead() { + static std::atomic<ThreadSpecific*> head_{nullptr}; // Owning + return head_; +} + +} // namespace + +// Thread-safe. +ThreadSpecific* Zone::InitThreadSpecific() { + ThreadSpecific* thread_specific = + hwy::MakeUniqueAligned<ThreadSpecific>().release(); + + // Insert into unordered list + std::atomic<ThreadSpecific*>& head = GetHead(); + ThreadSpecific* old_head = head.load(std::memory_order_relaxed); + thread_specific->SetNext(old_head); + while (!head.compare_exchange_weak(old_head, thread_specific, + std::memory_order_release, + std::memory_order_relaxed)) { + thread_specific->SetNext(old_head); + // TODO(janwas): pause + } + + // ComputeOverhead also creates a Zone, so this needs to be set before that + // to prevent infinite recursion. + GetThreadSpecific() = thread_specific; + + thread_specific->ComputeOverhead(); + return thread_specific; +} + +// Single-threaded. +/*static*/ void Zone::PrintResults() { + ThreadSpecific* head = GetHead().load(std::memory_order_relaxed); + ThreadSpecific* p = head; + while (p) { + p->AnalyzeRemainingPackets(); + + // Combine all threads into a single Result. + if (p != head) { + head->GetResults().Assimilate(p->GetResults()); + p->GetResults().Reset(); + } + + p = p->GetNext(); + } + + if (head != nullptr) { + head->GetResults().Print(); + head->GetResults().Reset(); + } +} + +} // namespace profiler +} // namespace jxl + +#endif // HWY_ONCE +#endif // JXL_PROFILER_ENABLED diff --git a/third_party/jpeg-xl/lib/jxl/base/profiler.h b/third_party/jpeg-xl/lib/jxl/base/profiler.h new file mode 100644 index 0000000000..4c0efa4b3a --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/base/profiler.h @@ -0,0 +1,170 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_BASE_PROFILER_H_ +#define LIB_JXL_BASE_PROFILER_H_ + +// High precision, low overhead time measurements. Returns exact call counts and +// total elapsed time for user-defined 'zones' (code regions, i.e. C++ scopes). +// +// To use the profiler you must set the JPEGXL_ENABLE_PROFILER CMake flag, which +// defines JXL_PROFILER_ENABLED. +// +// Usage: instrument regions of interest: { PROFILER_ZONE("name"); /*code*/ } or +// void FuncToMeasure() { PROFILER_FUNC; /*code*/ }. 
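+//
+// A sketch of a typical instrumented call site (names are illustrative, not
+// part of the library):
+//   void DecodeAllFrames() {
+//     PROFILER_FUNC;  // zone named after __func__
+//     for (/* each frame */) {
+//       PROFILER_ZONE("DecodeFrame");  // nested zone; its duration is
+//     }                                // deducted from the parent's self-time
+//   }
+//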
+// After all threads have exited any zones, invoke PROFILER_PRINT_RESULTS() to +// print call counts and average durations [CPU cycles] to stdout, sorted in +// descending order of total duration. + +// If zero, this file has no effect and no measurements will be recorded. +#ifndef JXL_PROFILER_ENABLED +#define JXL_PROFILER_ENABLED 0 +#endif +#if JXL_PROFILER_ENABLED + +#include <stddef.h> +#include <stdint.h> + +#include <hwy/aligned_allocator.h> +#include <hwy/base.h> + +#include "lib/jxl/base/tsc_timer.h" + +#if HWY_COMPILER_MSVC +#define PROFILER_PUBLIC +#else +#define PROFILER_PUBLIC __attribute__((visibility("default"))) +#endif + +namespace jxl { +namespace profiler { + +// Represents zone entry/exit events. POD. +#pragma pack(push, 1) +struct Packet { + // Computing a hash or string table is likely too expensive, and offsets + // from other libraries' string literals can be too large to combine them and + // a full-resolution timestamp into 64 bits. + uint64_t timestamp; + const char* name; // nullptr for exit packets +#if UINTPTR_MAX <= 0xFFFFFFFFu + uint32_t padding; +#endif +}; +#pragma pack(pop) +static_assert(sizeof(Packet) == 16, "Wrong Packet size"); + +class Results; // pImpl + +// Per-thread packet storage, dynamically allocated and aligned. +class ThreadSpecific { + static constexpr size_t kBufferCapacity = 64 / sizeof(Packet); + + public: + PROFILER_PUBLIC explicit ThreadSpecific(); + PROFILER_PUBLIC ~ThreadSpecific(); + + // Depends on Zone => defined out of line. + PROFILER_PUBLIC void ComputeOverhead(); + + HWY_INLINE void WriteEntry(const char* name) { Write(name, TicksBefore()); } + HWY_INLINE void WriteExit() { Write(nullptr, TicksAfter()); } + + PROFILER_PUBLIC void AnalyzeRemainingPackets(); + + // Accessors instead of public member for well-defined data layout. + void SetNext(ThreadSpecific* next) { next_ = next; } + ThreadSpecific* GetNext() const { return next_; } + + Results& GetResults() { return *results_; } + + private: + PROFILER_PUBLIC void FlushBuffer(); + + // Write packet to buffer/storage, emptying them as needed. + void Write(const char* name, const uint64_t timestamp) { + if (buffer_size_ == kBufferCapacity) { // Full + FlushBuffer(); + } + buffer_[buffer_size_].name = name; + buffer_[buffer_size_].timestamp = timestamp; + ++buffer_size_; + } + + // Write-combining buffer to avoid cache pollution. Must be the first + // non-static member to ensure cache-line alignment. + Packet buffer_[kBufferCapacity]; + size_t buffer_size_ = 0; + + // Contiguous storage for zone enter/exit packets. + const size_t max_packets_; + hwy::AlignedFreeUniquePtr<Packet[]> packets_; + size_t num_packets_; + + // Linked list of all threads. + ThreadSpecific* next_ = nullptr; // Owned, never released. + + hwy::AlignedUniquePtr<Results> results_; +}; + +// RAII zone enter/exit recorder constructed by PROFILER_ZONE; also +// responsible for initializing ThreadSpecific. +class Zone { + public: + HWY_NOINLINE explicit Zone(const char* name) { + HWY_FENCE; + ThreadSpecific* HWY_RESTRICT thread_specific = GetThreadSpecific(); + if (HWY_UNLIKELY(thread_specific == nullptr)) { + thread_specific = InitThreadSpecific(); + } + + thread_specific->WriteEntry(name); + } + + HWY_NOINLINE ~Zone() { GetThreadSpecific()->WriteExit(); } + + // Call exactly once after all threads have exited all zones. + PROFILER_PUBLIC static void PrintResults(); + + private: + // Returns reference to the thread's ThreadSpecific pointer (initially null). 
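+  // (On each thread, the first Zone constructor sees nullptr here and calls
+  // InitThreadSpecific exactly once.)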
+ // Function-local static avoids needing a separate definition. + static ThreadSpecific*& GetThreadSpecific() { + static thread_local ThreadSpecific* thread_specific; + return thread_specific; + } + + // Non time-critical. + PROFILER_PUBLIC ThreadSpecific* InitThreadSpecific(); +}; + +// Creates a zone starting from here until the end of the current scope. +// Timestamps will be recorded when entering and exiting the zone. +// To ensure the name pointer remains valid, we require it to be a string +// literal (by merging with ""). We also compare strings by address. +#define PROFILER_ZONE(name) \ + HWY_FENCE; \ + const ::jxl::profiler::Zone zone("" name); \ + HWY_FENCE + +// Creates a zone for an entire function (when placed at its beginning). +// Shorter/more convenient than ZONE. +#define PROFILER_FUNC \ + HWY_FENCE; \ + const ::jxl::profiler::Zone zone(__func__); \ + HWY_FENCE + +#define PROFILER_PRINT_RESULTS ::jxl::profiler::Zone::PrintResults + +} // namespace profiler +} // namespace jxl + +#else // !JXL_PROFILER_ENABLED +#define PROFILER_ZONE(name) +#define PROFILER_FUNC +#define PROFILER_PRINT_RESULTS() +#endif + +#endif // LIB_JXL_BASE_PROFILER_H_ diff --git a/third_party/jpeg-xl/lib/jxl/base/random.cc b/third_party/jpeg-xl/lib/jxl/base/random.cc new file mode 100644 index 0000000000..c99f88921c --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/base/random.cc @@ -0,0 +1,21 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/jxl/base/random.h" + +#include <cmath> + +namespace jxl { + +Rng::GeometricDistribution::GeometricDistribution(float p) + : inv_log_1mp(1.0 / std::log(1 - p)) {} + +uint32_t Rng::Geometric(const GeometricDistribution& dist) { + float f = UniformF(0, 1); + float log = std::log(1 - f) * dist.inv_log_1mp; + return static_cast<uint32_t>(log); +} + +} // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/base/random.h b/third_party/jpeg-xl/lib/jxl/base/random.h new file mode 100644 index 0000000000..663b88c95d --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/base/random.h @@ -0,0 +1,95 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_BASE_RANDOM_ +#define LIB_JXL_BASE_RANDOM_ + +// Random number generator + distributions. +// We don't use <random> because the implementation (and thus results) differs +// between libstdc++ and libc++. + +#include <stdint.h> +#include <string.h> + +#include <algorithm> + +#include "lib/jxl/base/status.h" + +namespace jxl { +struct Rng { + explicit Rng(size_t seed) + : s{static_cast<uint64_t>(0x94D049BB133111EBull), + static_cast<uint64_t>(0xBF58476D1CE4E5B9ull) + seed} {} + + // Xorshift128+ adapted from xorshift128+-inl.h + uint64_t operator()() { + uint64_t s1 = s[0]; + const uint64_t s0 = s[1]; + const uint64_t bits = s1 + s0; // b, c + s[0] = s0; + s1 ^= s1 << 23; + s1 ^= s0 ^ (s1 >> 18) ^ (s0 >> 5); + s[1] = s1; + return bits; + } + + // Uniformly distributed int64_t in [begin, end), under the assumption that + // `end-begin` is significantly smaller than 1<<64, otherwise there is some + // bias. + int64_t UniformI(int64_t begin, int64_t end) { + JXL_DASSERT(end > begin); + return static_cast<int64_t>((*this)() % + static_cast<uint64_t>(end - begin)) + + begin; + } + + // Same as UniformI, but for uint64_t. 
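+  // The same modulo-bias caveat applies when (end - begin) approaches 1 << 64.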
+ uint64_t UniformU(uint64_t begin, uint64_t end) { + JXL_DASSERT(end > begin); + return (*this)() % (end - begin) + begin; + } + + // Uniformly distributed float in [begin, end) range. Note: only 23 bits of + // randomness. + float UniformF(float begin, float end) { + float f; + // Bits of a random [1, 2) float. + uint32_t u = ((*this)() >> (64 - 23)) | 0x3F800000; + static_assert(sizeof(f) == sizeof(u), + "Float and U32 must have the same size"); + memcpy(&f, &u, sizeof(f)); + // Note: (end-begin) * f + (2*begin-end) may fail to return a number >= + // begin. + return (end - begin) * (f - 1.0f) + begin; + } + + // Bernoulli trial + bool Bernoulli(float p) { return UniformF(0, 1) < p; } + + // State for geometric distributions. + struct GeometricDistribution { + explicit GeometricDistribution(float p); + + private: + float inv_log_1mp; + friend struct Rng; + }; + + uint32_t Geometric(const GeometricDistribution& dist); + + template <typename T> + void Shuffle(T* t, size_t n) { + for (size_t i = 0; i + 1 < n; i++) { + size_t a = UniformU(i, n); + std::swap(t[a], t[i]); + } + } + + private: + uint64_t s[2]; +}; + +} // namespace jxl +#endif // LIB_JXL_BASE_RANDOM_ diff --git a/third_party/jpeg-xl/lib/jxl/base/sanitizer_definitions.h b/third_party/jpeg-xl/lib/jxl/base/sanitizer_definitions.h new file mode 100644 index 0000000000..315f3bd003 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/base/sanitizer_definitions.h @@ -0,0 +1,44 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_BASE_SANITIZER_DEFINITIONS_H_
+#define LIB_JXL_BASE_SANITIZER_DEFINITIONS_H_
+
+#ifdef MEMORY_SANITIZER
+#define JXL_MEMORY_SANITIZER 1
+#elif defined(__has_feature)
+#if __has_feature(memory_sanitizer)
+#define JXL_MEMORY_SANITIZER 1
+#else
+#define JXL_MEMORY_SANITIZER 0
+#endif
+#else
+#define JXL_MEMORY_SANITIZER 0
+#endif
+
+#ifdef ADDRESS_SANITIZER
+#define JXL_ADDRESS_SANITIZER 1
+#elif defined(__has_feature)
+#if __has_feature(address_sanitizer)
+#define JXL_ADDRESS_SANITIZER 1
+#else
+#define JXL_ADDRESS_SANITIZER 0
+#endif
+#else
+#define JXL_ADDRESS_SANITIZER 0
+#endif
+
+#ifdef THREAD_SANITIZER
+#define JXL_THREAD_SANITIZER 1
+#elif defined(__has_feature)
+#if __has_feature(thread_sanitizer)
+#define JXL_THREAD_SANITIZER 1
+#else
+#define JXL_THREAD_SANITIZER 0
+#endif
+#else
+#define JXL_THREAD_SANITIZER 0
+#endif
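+
+// Example (hypothetical usage, not from this patch): these macros allow
+// uniform feature tests across compilers, e.g.
+//   #if JXL_MEMORY_SANITIZER
+//   __msan_unpoison(ptr, size);  // from <sanitizer/msan_interface.h>
+//   #endif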
+#endif // LIB_JXL_BASE_SANITIZER_DEFINITIONS_H_
diff --git a/third_party/jpeg-xl/lib/jxl/base/scope_guard.h b/third_party/jpeg-xl/lib/jxl/base/scope_guard.h new file mode 100644 index 0000000000..a18a44cb79 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/base/scope_guard.h @@ -0,0 +1,48 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_BASE_SCOPE_GUARD_H_ +#define LIB_JXL_BASE_SCOPE_GUARD_H_ + +#include <utility> + +namespace jxl { + +template <typename Callback> +class ScopeGuard { + public: + // Discourage unnecessary moves / copies. + ScopeGuard(const ScopeGuard &) = delete; + ScopeGuard &operator=(const ScopeGuard &) = delete; + ScopeGuard &operator=(ScopeGuard &&) = delete; + + // Pre-C++17 does not guarantee RVO -> require move constructor. + ScopeGuard(ScopeGuard &&other) : callback_(std::move(other.callback_)) { + other.armed_ = false; + } + + template <typename CallbackParam> + explicit ScopeGuard(CallbackParam &&callback) + : callback_(std::forward<CallbackParam>(callback)), armed_(true) {} + + ~ScopeGuard() { + if (armed_) callback_(); + } + + void Disarm() { armed_ = false; } + + private: + Callback callback_; + bool armed_; +}; + +template <typename Callback> +ScopeGuard<Callback> MakeScopeGuard(Callback &&callback) { + return ScopeGuard<Callback>{std::forward<Callback>(callback)}; +} + +} // namespace jxl + +#endif // LIB_JXL_BASE_SCOPE_GUARD_H_ diff --git a/third_party/jpeg-xl/lib/jxl/base/span.h b/third_party/jpeg-xl/lib/jxl/base/span.h new file mode 100644 index 0000000000..41c3623a4b --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/base/span.h @@ -0,0 +1,60 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_BASE_SPAN_H_ +#define LIB_JXL_BASE_SPAN_H_ + +// Span (array view) is a non-owning container that provides cheap "cut" +// operations and could be used as "ArrayLike" data source for PaddedBytes. + +#include <stddef.h> + +#include "lib/jxl/base/status.h" + +namespace jxl { + +template <typename T> +class Span { + public: + constexpr Span() noexcept : Span(nullptr, 0) {} + + constexpr Span(T* array, size_t length) noexcept + : ptr_(array), len_(length) {} + + template <size_t N> + explicit constexpr Span(T (&a)[N]) noexcept : Span(a, N) {} + + template <typename ArrayLike> + explicit constexpr Span(const ArrayLike& other) noexcept + : Span(reinterpret_cast<T*>(other.data()), other.size()) { + static_assert(sizeof(*other.data()) == sizeof(T), + "Incompatible type of source."); + } + + constexpr T* data() const noexcept { return ptr_; } + + constexpr size_t size() const noexcept { return len_; } + + constexpr bool empty() const noexcept { return len_ == 0; } + + constexpr T& operator[](size_t i) const noexcept { + // MSVC 2015 accepts this as constexpr, but not ptr_[i] + return *(data() + i); + } + + void remove_prefix(size_t n) noexcept { + JXL_ASSERT(size() >= n); + ptr_ += n; + len_ -= n; + } + + private: + T* ptr_; + size_t len_; +}; + +} // namespace jxl + +#endif // LIB_JXL_BASE_SPAN_H_ diff --git a/third_party/jpeg-xl/lib/jxl/base/status.h b/third_party/jpeg-xl/lib/jxl/base/status.h new file mode 100644 index 0000000000..f40be0c434 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/base/status.h @@ -0,0 +1,326 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. 
+// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_BASE_STATUS_H_ +#define LIB_JXL_BASE_STATUS_H_ + +// Error handling: Status return type + helper macros. + +#include <stdarg.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> + +#include "lib/jxl/base/compiler_specific.h" +#include "lib/jxl/base/sanitizer_definitions.h" + +#if JXL_ADDRESS_SANITIZER || JXL_MEMORY_SANITIZER || JXL_THREAD_SANITIZER +#include "sanitizer/common_interface_defs.h" // __sanitizer_print_stack_trace +#endif // defined(*_SANITIZER) + +namespace jxl { + +// Uncomment to abort when JXL_FAILURE or JXL_STATUS with a fatal error is +// reached: +// #define JXL_CRASH_ON_ERROR + +#ifndef JXL_ENABLE_ASSERT +#define JXL_ENABLE_ASSERT 1 +#endif + +#ifndef JXL_ENABLE_CHECK +#define JXL_ENABLE_CHECK 1 +#endif + +// Pass -DJXL_DEBUG_ON_ERROR at compile time to print debug messages when a +// function returns JXL_FAILURE or calls JXL_NOTIFY_ERROR. Note that this is +// irrelevant if you also pass -DJXL_CRASH_ON_ERROR. +#if defined(JXL_DEBUG_ON_ERROR) || defined(JXL_CRASH_ON_ERROR) +#undef JXL_DEBUG_ON_ERROR +#define JXL_DEBUG_ON_ERROR 1 +#else // JXL_DEBUG_ON_ERROR || JXL_CRASH_ON_ERROR +#ifdef NDEBUG +#define JXL_DEBUG_ON_ERROR 0 +#else // NDEBUG +#define JXL_DEBUG_ON_ERROR 1 +#endif // NDEBUG +#endif // JXL_DEBUG_ON_ERROR || JXL_CRASH_ON_ERROR + +// Pass -DJXL_DEBUG_ON_ALL_ERROR at compile time to print debug messages on +// all error (fatal and non-fatal) status. This implies JXL_DEBUG_ON_ERROR. +#if defined(JXL_DEBUG_ON_ALL_ERROR) +#undef JXL_DEBUG_ON_ALL_ERROR +#define JXL_DEBUG_ON_ALL_ERROR 1 +// JXL_DEBUG_ON_ALL_ERROR implies JXL_DEBUG_ON_ERROR too. +#undef JXL_DEBUG_ON_ERROR +#define JXL_DEBUG_ON_ERROR 1 +#else // JXL_DEBUG_ON_ALL_ERROR +#define JXL_DEBUG_ON_ALL_ERROR 0 +#endif // JXL_DEBUG_ON_ALL_ERROR + +// The Verbose level for the library +#ifndef JXL_DEBUG_V_LEVEL +#define JXL_DEBUG_V_LEVEL 0 +#endif // JXL_DEBUG_V_LEVEL + +// Pass -DJXL_DEBUG_ON_ABORT=0 to disable the debug messages on JXL_ASSERT, +// JXL_CHECK and JXL_ABORT. +#ifndef JXL_DEBUG_ON_ABORT +#define JXL_DEBUG_ON_ABORT 1 +#endif // JXL_DEBUG_ON_ABORT + +// Print a debug message on standard error. You should use the JXL_DEBUG macro +// instead of calling Debug directly. This function returns false, so it can be +// used as a return value in JXL_FAILURE. +JXL_FORMAT(1, 2) +inline JXL_NOINLINE bool Debug(const char* format, ...) { + va_list args; + va_start(args, format); + vfprintf(stderr, format, args); + va_end(args); + return false; +} + +// Print a debug message on standard error if "enabled" is true. "enabled" is +// normally a macro that evaluates to 0 or 1 at compile time, so the Debug +// function is never called and optimized out in release builds. Note that the +// arguments are compiled but not evaluated when enabled is false. The format +// string must be a explicit string in the call, for example: +// JXL_DEBUG(JXL_DEBUG_MYMODULE, "my module message: %d", some_var); +// Add a header at the top of your module's .cc or .h file (depending on whether +// you have JXL_DEBUG calls from the .h as well) like this: +// #ifndef JXL_DEBUG_MYMODULE +// #define JXL_DEBUG_MYMODULE 0 +// #endif JXL_DEBUG_MYMODULE +#define JXL_DEBUG_TMP(format, ...) \ + ::jxl::Debug(("%s:%d: " format "\n"), __FILE__, __LINE__, ##__VA_ARGS__) + +#define JXL_DEBUG(enabled, format, ...) 
\ + do { \ + if (enabled) { \ + JXL_DEBUG_TMP(format, ##__VA_ARGS__); \ + } \ + } while (0) + +// JXL_DEBUG version that prints the debug message if the global verbose level +// defined at compile time by JXL_DEBUG_V_LEVEL is greater or equal than the +// passed level. +#define JXL_DEBUG_V(level, format, ...) \ + JXL_DEBUG(level <= JXL_DEBUG_V_LEVEL, format, ##__VA_ARGS__) + +// Warnings (via JXL_WARNING) are enabled by default in debug builds (opt and +// debug). +#ifdef JXL_DEBUG_WARNING +#undef JXL_DEBUG_WARNING +#define JXL_DEBUG_WARNING 1 +#else // JXL_DEBUG_WARNING +#ifdef NDEBUG +#define JXL_DEBUG_WARNING 0 +#else // JXL_DEBUG_WARNING +#define JXL_DEBUG_WARNING 1 +#endif // NDEBUG +#endif // JXL_DEBUG_WARNING +#define JXL_WARNING(format, ...) \ + JXL_DEBUG(JXL_DEBUG_WARNING, format, ##__VA_ARGS__) + +// Exits the program after printing a stack trace when possible. +JXL_NORETURN inline JXL_NOINLINE bool Abort() { +#if JXL_ADDRESS_SANITIZER || JXL_MEMORY_SANITIZER || JXL_THREAD_SANITIZER + // If compiled with any sanitizer print a stack trace. This call doesn't crash + // the program, instead the trap below will crash it also allowing gdb to + // break there. + __sanitizer_print_stack_trace(); +#endif // *_SANITIZER) + +#if JXL_COMPILER_MSVC + __debugbreak(); + abort(); +#else + __builtin_trap(); +#endif +} + +// Exits the program after printing file/line plus a formatted string. +#define JXL_ABORT(format, ...) \ + ((JXL_DEBUG_ON_ABORT) && ::jxl::Debug(("%s:%d: JXL_ABORT: " format "\n"), \ + __FILE__, __LINE__, ##__VA_ARGS__), \ + ::jxl::Abort()) + +// Does not guarantee running the code, use only for debug mode checks. +#if JXL_ENABLE_ASSERT +#define JXL_ASSERT(condition) \ + do { \ + if (!(condition)) { \ + JXL_DEBUG(JXL_DEBUG_ON_ABORT, "JXL_ASSERT: %s", #condition); \ + ::jxl::Abort(); \ + } \ + } while (0) +#else +#define JXL_ASSERT(condition) \ + do { \ + } while (0) +#endif + +// Define JXL_IS_DEBUG_BUILD that denotes asan, msan and other debug builds, +// but not opt or release. +#ifndef JXL_IS_DEBUG_BUILD +#if !defined(NDEBUG) || defined(ADDRESS_SANITIZER) || \ + defined(MEMORY_SANITIZER) || defined(THREAD_SANITIZER) || \ + defined(__clang_analyzer__) +#define JXL_IS_DEBUG_BUILD 1 +#else +#define JXL_IS_DEBUG_BUILD 0 +#endif +#endif // JXL_IS_DEBUG_BUILD + +// Same as above, but only runs in debug builds (builds where NDEBUG is not +// defined). This is useful for slower asserts that we want to run more rarely +// than usual. These will run on asan, msan and other debug builds, but not in +// opt or release. +#if JXL_IS_DEBUG_BUILD +#define JXL_DASSERT(condition) \ + do { \ + if (!(condition)) { \ + JXL_DEBUG(JXL_DEBUG_ON_ABORT, "JXL_DASSERT: %s", #condition); \ + ::jxl::Abort(); \ + } \ + } while (0) +#else +#define JXL_DASSERT(condition) \ + do { \ + } while (0) +#endif + +// Always runs the condition, so can be used for non-debug calls. +#if JXL_ENABLE_CHECK +#define JXL_CHECK(condition) \ + do { \ + if (!(condition)) { \ + JXL_DEBUG(JXL_DEBUG_ON_ABORT, "JXL_CHECK: %s", #condition); \ + ::jxl::Abort(); \ + } \ + } while (0) +#else +#define JXL_CHECK(condition) \ + do { \ + (void)(condition); \ + } while (0) +#endif + +// A jxl::Status value from a StatusCode or Status which prints a debug message +// when enabled. +#define JXL_STATUS(status, format, ...) \ + ::jxl::StatusMessage(::jxl::Status(status), "%s:%d: " format "\n", __FILE__, \ + __LINE__, ##__VA_ARGS__) + +// Notify of an error but discard the resulting Status value. 
This is only +// useful for debug builds or when building with JXL_CRASH_ON_ERROR. +#define JXL_NOTIFY_ERROR(format, ...) \ + (void)JXL_STATUS(::jxl::StatusCode::kGenericError, "JXL_ERROR: " format, \ + ##__VA_ARGS__) + +// An error Status with a message. The JXL_STATUS() macro will return a Status +// object with a kGenericError code, but the comma operator helps with +// clang-tidy inference and potentially with optimizations. +#define JXL_FAILURE(format, ...) \ + ((void)JXL_STATUS(::jxl::StatusCode::kGenericError, "JXL_FAILURE: " format, \ + ##__VA_ARGS__), \ + ::jxl::Status(::jxl::StatusCode::kGenericError)) + +// Always evaluates the status exactly once, so can be used for non-debug calls. +// Returns from the current context if the passed Status expression is an error +// (fatal or non-fatal). The return value is the passed Status. +#define JXL_RETURN_IF_ERROR(status) \ + do { \ + ::jxl::Status jxl_return_if_error_status = (status); \ + if (!jxl_return_if_error_status) { \ + (void)::jxl::StatusMessage( \ + jxl_return_if_error_status, \ + "%s:%d: JXL_RETURN_IF_ERROR code=%d: %s\n", __FILE__, __LINE__, \ + static_cast<int>(jxl_return_if_error_status.code()), #status); \ + return jxl_return_if_error_status; \ + } \ + } while (0) + +// As above, but without calling StatusMessage. Intended for bundles (see +// fields.h), which have numerous call sites (-> relevant for code size) and do +// not want to generate excessive messages when decoding partial headers. +#define JXL_QUIET_RETURN_IF_ERROR(status) \ + do { \ + ::jxl::Status jxl_return_if_error_status = (status); \ + if (!jxl_return_if_error_status) { \ + return jxl_return_if_error_status; \ + } \ + } while (0) + +enum class StatusCode : int32_t { + // Non-fatal errors (negative values). + kNotEnoughBytes = -1, + + // The only non-error status code. + kOk = 0, + + // Fatal-errors (positive values) + kGenericError = 1, +}; + +// Drop-in replacement for bool that raises compiler warnings if not used +// after being returned from a function. Example: +// Status LoadFile(...) { return true; } is more compact than +// bool JXL_MUST_USE_RESULT LoadFile(...) { return true; } +// In case of error, the status can carry an extra error code in its value which +// is split between fatal and non-fatal error codes. +class JXL_MUST_USE_RESULT Status { + public: + // We want implicit constructor from bool to allow returning "true" or "false" + // on a function when using Status. "true" means kOk while "false" means a + // generic fatal error. + // NOLINTNEXTLINE(google-explicit-constructor) + constexpr Status(bool ok) + : code_(ok ? StatusCode::kOk : StatusCode::kGenericError) {} + + // NOLINTNEXTLINE(google-explicit-constructor) + constexpr Status(StatusCode code) : code_(code) {} + + // We also want implicit cast to bool to check for return values of functions. + // NOLINTNEXTLINE(google-explicit-constructor) + constexpr operator bool() const { return code_ == StatusCode::kOk; } + + constexpr StatusCode code() const { return code_; } + + // Returns whether the status code is a fatal error. + constexpr bool IsFatalError() const { + return static_cast<int32_t>(code_) > 0; + } + + private: + StatusCode code_; +}; + +// Helper function to create a Status and print the debug message or abort when +// needed. +inline JXL_FORMAT(2, 3) Status + StatusMessage(const Status status, const char* format, ...) { + // This block will be optimized out when JXL_DEBUG_ON_ERROR and + // JXL_DEBUG_ON_ALL_ERROR are both disabled. 
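+  // (Both macros expand to compile-time 0/1 constants, so the branch below
+  // folds away entirely in that configuration.)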
+ if ((JXL_DEBUG_ON_ERROR && status.IsFatalError()) || + (JXL_DEBUG_ON_ALL_ERROR && !status)) { + va_list args; + va_start(args, format); + vfprintf(stderr, format, args); + va_end(args); + } +#ifdef JXL_CRASH_ON_ERROR + // JXL_CRASH_ON_ERROR means to Abort() only on non-fatal errors. + if (status.IsFatalError()) { + Abort(); + } +#endif // JXL_CRASH_ON_ERROR + return status; +} + +} // namespace jxl + +#endif // LIB_JXL_BASE_STATUS_H_ diff --git a/third_party/jpeg-xl/lib/jxl/base/tsc_timer.h b/third_party/jpeg-xl/lib/jxl/base/tsc_timer.h new file mode 100644 index 0000000000..74d51f72d1 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/base/tsc_timer.h @@ -0,0 +1,172 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef LIB_JXL_BASE_TSC_TIMER_H_ +#define LIB_JXL_BASE_TSC_TIMER_H_ + +// High-resolution (~10 ns) timestamps, using fences to prevent reordering and +// ensure exactly the desired regions are measured. + +#include <stdint.h> +#include <time.h> // clock_gettime + +#if defined(_WIN32) || defined(_WIN64) +#ifndef WIN32_LEAN_AND_MEAN +#define WIN32_LEAN_AND_MEAN +#endif // WIN32_LEAN_AND_MEAN +#ifndef NOMINMAX +#define NOMINMAX +#endif // NOMINMAX +#ifndef NOGDI +#define NOGDI +#endif // NOGDI +#include <windows.h> +// Undef macros to avoid collisions +#undef LoadFence +#endif + +#if defined(__APPLE__) +#include <mach/mach.h> +#include <mach/mach_time.h> +#endif + +#if defined(__HAIKU__) +#include <OS.h> +#endif + +#include <ctime> +#include <hwy/base.h> +#include <hwy/cache_control.h> // LoadFence + +namespace jxl { +namespace profiler { + +// Ticks := platform-specific timer values (CPU cycles on x86). Must be +// unsigned to guarantee wraparound on overflow. +using Ticks = uint64_t; + +// TicksBefore/After return absolute timestamps and must be placed immediately +// before and after the region to measure. We provide separate Before/After +// functions because they use different fences. +// +// Background: RDTSC is not 'serializing'; earlier instructions may complete +// after it, and/or later instructions may complete before it. 'Fences' ensure +// regions' elapsed times are independent of such reordering. The only +// documented unprivileged serializing instruction is CPUID, which acts as a +// full fence (no reordering across it in either direction). Unfortunately +// the latency of CPUID varies wildly (perhaps made worse by not initializing +// its EAX input). Because it cannot reliably be deducted from the region's +// elapsed time, it must not be included in the region to measure (i.e. +// between the two RDTSC). +// +// The newer RDTSCP is sometimes described as serializing, but it actually +// only serves as a half-fence with release semantics. Although all +// instructions in the region will complete before the final timestamp is +// captured, subsequent instructions may leak into the region and increase the +// elapsed time. Inserting another fence after the final RDTSCP would prevent +// such reordering without affecting the measured region. +// +// Fortunately, such a fence exists. The LFENCE instruction is only documented +// to delay later loads until earlier loads are visible. However, Intel's +// reference manual says it acts as a full fence (waiting until all earlier +// instructions have completed, and delaying later instructions until it +// completes). AMD assigns the same behavior to MFENCE. 
+// +// We need a fence before the initial RDTSC to prevent earlier instructions +// from leaking into the region, and arguably another after RDTSC to avoid +// region instructions from completing before the timestamp is recorded. +// When surrounded by fences, the additional RDTSCP half-fence provides no +// benefit, so the initial timestamp can be recorded via RDTSC, which has +// lower overhead than RDTSCP because it does not read TSC_AUX. In summary, +// we define Before = LFENCE/RDTSC/LFENCE; After = RDTSCP/LFENCE. +// +// Using Before+Before leads to higher variance and overhead than After+After. +// However, After+After includes an LFENCE in the region measurements, which +// adds a delay dependent on earlier loads. The combination of Before+After +// is faster than Before+Before and more consistent than After+After because +// the first LFENCE already delayed subsequent loads before the measured +// region. This combination seems not to have been considered in prior work: +// http://akaros.cs.berkeley.edu/lxr/akaros/kern/arch/x86/rdtsc_test.c +// +// Note: performance counters can measure 'exact' instructions-retired or +// (unhalted) cycle counts. The RDPMC instruction is not serializing and also +// requires fences. Unfortunately, it is not accessible on all OSes and we +// prefer to avoid kernel-mode drivers. Performance counters are also affected +// by several under/over-count errata, so we use the TSC instead. + +// Returns a 64-bit timestamp in unit of 'ticks'; to convert to seconds, +// divide by InvariantTicksPerSecond. +static HWY_INLINE HWY_MAYBE_UNUSED Ticks TicksBefore() { + Ticks t; +#if HWY_ARCH_PPC && defined(__GLIBC__) + asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268)); +#elif HWY_ARCH_X86 && HWY_COMPILER_MSVC + hwy::LoadFence(); + HWY_FENCE; + t = __rdtsc(); + hwy::LoadFence(); + HWY_FENCE; +#elif HWY_ARCH_X86_64 + asm volatile( + "lfence\n\t" + "rdtsc\n\t" + "shl $32, %%rdx\n\t" + "or %%rdx, %0\n\t" + "lfence" + : "=a"(t) + : + // "memory" avoids reordering. rdx = TSC >> 32. + // "cc" = flags modified by SHL. + : "rdx", "memory", "cc"); +#elif HWY_ARCH_RVV + asm volatile("rdcycle %0" : "=r"(t)); +#elif defined(_WIN32) || defined(_WIN64) + LARGE_INTEGER counter; + (void)QueryPerformanceCounter(&counter); + t = counter.QuadPart; +#elif defined(__APPLE__) + t = mach_absolute_time(); +#elif defined(__HAIKU__) + t = system_time_nsecs(); // since boot +#else // POSIX + timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + t = static_cast<Ticks>(ts.tv_sec * 1000000000LL + ts.tv_nsec); +#endif + return t; +} + +static HWY_INLINE HWY_MAYBE_UNUSED Ticks TicksAfter() { + Ticks t; +#if HWY_ARCH_PPC && defined(__GLIBC__) + asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268)); +#elif HWY_ARCH_X86 && HWY_COMPILER_MSVC + HWY_FENCE; + unsigned aux; + t = __rdtscp(&aux); + hwy::LoadFence(); + HWY_FENCE; +#elif HWY_ARCH_X86_64 + // Use inline asm because __rdtscp generates code to store TSC_AUX (ecx). + asm volatile( + "rdtscp\n\t" + "shl $32, %%rdx\n\t" + "or %%rdx, %0\n\t" + "lfence" + : "=a"(t) + : + // "memory" avoids reordering. rcx = TSC_AUX. rdx = TSC >> 32. + // "cc" = flags modified by SHL. + : "rcx", "rdx", "memory", "cc"); +#else + t = TicksBefore(); // no difference on other platforms. +#endif + return t; +} + +} // namespace profiler +} // namespace jxl + +#endif // LIB_JXL_BASE_TSC_TIMER_H_ |