diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-19 01:47:29 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-19 01:47:29 +0000 |
commit | 0ebf5bdf043a27fd3dfb7f92e0cb63d88954c44d (patch) | |
tree | a31f07c9bcca9d56ce61e9a1ffd30ef350d513aa /third_party/jpeg-xl/lib/threads | |
parent | Initial commit. (diff) | |
download | firefox-esr-0ebf5bdf043a27fd3dfb7f92e0cb63d88954c44d.tar.xz firefox-esr-0ebf5bdf043a27fd3dfb7f92e0cb63d88954c44d.zip |
Adding upstream version 115.8.0esr.upstream/115.8.0esr
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/jpeg-xl/lib/threads')
6 files changed, 813 insertions, 0 deletions
diff --git a/third_party/jpeg-xl/lib/threads/libjxl_threads.pc.in b/third_party/jpeg-xl/lib/threads/libjxl_threads.pc.in new file mode 100644 index 0000000000..50b937a840 --- /dev/null +++ b/third_party/jpeg-xl/lib/threads/libjxl_threads.pc.in @@ -0,0 +1,13 @@ +prefix=@CMAKE_INSTALL_PREFIX@ +exec_prefix=${prefix} +libdir=@PKGCONFIG_TARGET_LIBS@ +includedir=@PKGCONFIG_TARGET_INCLUDES@ + +Name: libjxl_threads +Description: JPEG XL multi-thread runner using std::threads. +Version: @JPEGXL_LIBRARY_VERSION@ +Requires.private: @JPEGXL_THREADS_LIBRARY_REQUIRES@ +Libs: -L${libdir} -ljxl_threads +Libs.private: -lm +Cflags: -I${includedir} +Cflags.private: -DJXL_THREADS_STATIC_DEFINE diff --git a/third_party/jpeg-xl/lib/threads/resizable_parallel_runner.cc b/third_party/jpeg-xl/lib/threads/resizable_parallel_runner.cc new file mode 100644 index 0000000000..db27286dea --- /dev/null +++ b/third_party/jpeg-xl/lib/threads/resizable_parallel_runner.cc @@ -0,0 +1,195 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include <jxl/resizable_parallel_runner.h> + +#include <algorithm> +#include <atomic> +#include <condition_variable> +#include <mutex> +#include <thread> +#include <vector> + +namespace jpegxl { +namespace { + +// A thread pool that allows changing the number of threads it runs. It also +// runs tasks on the calling thread, which can work better on schedulers for +// heterogeneous architectures. +struct ResizeableParallelRunner { + void SetNumThreads(size_t num) { + if (num > 0) { + num -= 1; + } + { + std::unique_lock<std::mutex> l(state_mutex_); + num_desired_workers_ = num; + workers_can_proceed_.notify_all(); + } + if (workers_.size() < num) { + for (size_t i = workers_.size(); i < num; i++) { + workers_.emplace_back([this, i]() { WorkerBody(i); }); + } + } + if (workers_.size() > num) { + for (size_t i = num; i < workers_.size(); i++) { + workers_[i].join(); + } + workers_.resize(num); + } + } + + ~ResizeableParallelRunner() { SetNumThreads(0); } + + JxlParallelRetCode Run(void* jxl_opaque, JxlParallelRunInit init, + JxlParallelRunFunction func, uint32_t start, + uint32_t end) { + if (start + 1 == end) { + JxlParallelRetCode ret = init(jxl_opaque, 1); + if (ret != 0) return ret; + + func(jxl_opaque, start, 0); + return ret; + } + + size_t num_workers = std::min<size_t>(workers_.size() + 1, end - start); + JxlParallelRetCode ret = init(jxl_opaque, num_workers); + if (ret != 0) { + return ret; + } + + { + std::unique_lock<std::mutex> l(state_mutex_); + // Avoid waking up more workers than needed. + max_running_workers_ = end - start - 1; + next_task_ = start; + end_task_ = end; + func_ = func; + jxl_opaque_ = jxl_opaque; + work_available_ = true; + num_running_workers_++; + workers_can_proceed_.notify_all(); + } + + DequeueTasks(0); + + while (true) { + std::unique_lock<std::mutex> l(state_mutex_); + if (num_running_workers_ == 0) break; + work_done_.wait(l); + } + + return ret; + } + + private: + void WorkerBody(size_t worker_id) { + while (true) { + { + std::unique_lock<std::mutex> l(state_mutex_); + // Worker pool was reduced, resize down. + if (worker_id >= num_desired_workers_) { + return; + } + // Nothing to do this time. + if (!work_available_ || worker_id >= max_running_workers_) { + workers_can_proceed_.wait(l); + continue; + } + num_running_workers_++; + } + DequeueTasks(worker_id + 1); + } + } + + void DequeueTasks(size_t thread_id) { + while (true) { + uint32_t task = next_task_++; + if (task >= end_task_) { + std::unique_lock<std::mutex> l(state_mutex_); + num_running_workers_--; + work_available_ = false; + if (num_running_workers_ == 0) { + work_done_.notify_all(); + } + break; + } + func_(jxl_opaque_, task, thread_id); + } + } + + // Checks when the worker has something to do, which can be one of: + // - quitting (when worker_id >= num_desired_workers_) + // - having work available for them (work_available_ is true and worker_id >= + // max_running_workers_) + std::condition_variable workers_can_proceed_; + + // Workers are done, and the main thread can proceed (num_running_workers_ == + // 0) + std::condition_variable work_done_; + + std::vector<std::thread> workers_; + + // Protects all the remaining variables, except for func_, jxl_opaque_ and + // end_task_ (for which only the write by the main thread is protected, and + // subsequent uses by workers happen-after it) and next_task_ (which is + // atomic). + std::mutex state_mutex_; + + // Range of tasks still need to be done. + std::atomic<uint32_t> next_task_; + uint32_t end_task_; + + // Function to run and its argument. + JxlParallelRunFunction func_; + void* jxl_opaque_; // not owned + + // Variables that control the workers: + // - work_available_ is set to true after a call to Run() and to false at the + // end of it. + // - num_desired_workers_ represents the number of workers that should be + // present. + // - max_running_workers_ represents the number of workers that should be + // executing tasks. + // - num_running_workers_ represents the number of workers that are executing + // tasks. + size_t num_desired_workers_ = 0; + size_t max_running_workers_ = 0; + size_t num_running_workers_ = 0; + bool work_available_ = false; +}; +} // namespace +} // namespace jpegxl + +extern "C" { +JXL_THREADS_EXPORT JxlParallelRetCode JxlResizableParallelRunner( + void* runner_opaque, void* jpegxl_opaque, JxlParallelRunInit init, + JxlParallelRunFunction func, uint32_t start_range, uint32_t end_range) { + return static_cast<jpegxl::ResizeableParallelRunner*>(runner_opaque) + ->Run(jpegxl_opaque, init, func, start_range, end_range); +} + +JXL_THREADS_EXPORT void* JxlResizableParallelRunnerCreate( + const JxlMemoryManager* memory_manager) { + return new jpegxl::ResizeableParallelRunner(); +} + +JXL_THREADS_EXPORT void JxlResizableParallelRunnerSetThreads( + void* runner_opaque, size_t num_threads) { + static_cast<jpegxl::ResizeableParallelRunner*>(runner_opaque) + ->SetNumThreads(num_threads); +} + +JXL_THREADS_EXPORT void JxlResizableParallelRunnerDestroy(void* runner_opaque) { + delete static_cast<jpegxl::ResizeableParallelRunner*>(runner_opaque); +} + +JXL_THREADS_EXPORT uint32_t +JxlResizableParallelRunnerSuggestThreads(uint64_t xsize, uint64_t ysize) { + // ~one thread per group. + return std::min<uint64_t>(std::thread::hardware_concurrency(), + xsize * ysize / (256 * 256)); +} +} diff --git a/third_party/jpeg-xl/lib/threads/thread_parallel_runner.cc b/third_party/jpeg-xl/lib/threads/thread_parallel_runner.cc new file mode 100644 index 0000000000..47b81bdb16 --- /dev/null +++ b/third_party/jpeg-xl/lib/threads/thread_parallel_runner.cc @@ -0,0 +1,101 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include <jxl/thread_parallel_runner.h> +#include <string.h> + +#include "lib/threads/thread_parallel_runner_internal.h" + +namespace { + +// Default JxlMemoryManager using malloc and free for the jpegxl_threads +// library. Same as the default JxlMemoryManager for the jpegxl library +// itself. + +// Default alloc and free functions. +void* ThreadMemoryManagerDefaultAlloc(void* opaque, size_t size) { + return malloc(size); +} + +void ThreadMemoryManagerDefaultFree(void* opaque, void* address) { + free(address); +} + +// Initializes the memory manager instance with the passed one. The +// MemoryManager passed in |memory_manager| may be NULL or contain NULL +// functions which will be initialized with the default ones. If either alloc +// or free are NULL, then both must be NULL, otherwise this function returns an +// error. +bool ThreadMemoryManagerInit(JxlMemoryManager* self, + const JxlMemoryManager* memory_manager) { + if (memory_manager) { + *self = *memory_manager; + } else { + memset(self, 0, sizeof(*self)); + } + if (!self->alloc != !self->free) { + return false; + } + if (!self->alloc) self->alloc = ThreadMemoryManagerDefaultAlloc; + if (!self->free) self->free = ThreadMemoryManagerDefaultFree; + + return true; +} + +void* ThreadMemoryManagerAlloc(const JxlMemoryManager* memory_manager, + size_t size) { + return memory_manager->alloc(memory_manager->opaque, size); +} + +void ThreadMemoryManagerFree(const JxlMemoryManager* memory_manager, + void* address) { + return memory_manager->free(memory_manager->opaque, address); +} + +} // namespace + +JxlParallelRetCode JxlThreadParallelRunner( + void* runner_opaque, void* jpegxl_opaque, JxlParallelRunInit init, + JxlParallelRunFunction func, uint32_t start_range, uint32_t end_range) { + return jpegxl::ThreadParallelRunner::Runner( + runner_opaque, jpegxl_opaque, init, func, start_range, end_range); +} + +/// Starts the given number of worker threads and blocks until they are ready. +/// "num_worker_threads" defaults to one per hyperthread. If zero, all tasks +/// run on the main thread. +void* JxlThreadParallelRunnerCreate(const JxlMemoryManager* memory_manager, + size_t num_worker_threads) { + JxlMemoryManager local_memory_manager; + if (!ThreadMemoryManagerInit(&local_memory_manager, memory_manager)) + return nullptr; + + void* alloc = ThreadMemoryManagerAlloc(&local_memory_manager, + sizeof(jpegxl::ThreadParallelRunner)); + if (!alloc) return nullptr; + // Placement new constructor on allocated memory + jpegxl::ThreadParallelRunner* runner = + new (alloc) jpegxl::ThreadParallelRunner(num_worker_threads); + runner->memory_manager = local_memory_manager; + + return runner; +} + +void JxlThreadParallelRunnerDestroy(void* runner_opaque) { + jpegxl::ThreadParallelRunner* runner = + reinterpret_cast<jpegxl::ThreadParallelRunner*>(runner_opaque); + if (runner) { + JxlMemoryManager local_memory_manager = runner->memory_manager; + // Call destructor directly since custom free function is used. + runner->~ThreadParallelRunner(); + ThreadMemoryManagerFree(&local_memory_manager, runner); + } +} + +// Get default value for num_worker_threads parameter of +// InitJxlThreadParallelRunner. +size_t JxlThreadParallelRunnerDefaultNumWorkerThreads() { + return std::thread::hardware_concurrency(); +} diff --git a/third_party/jpeg-xl/lib/threads/thread_parallel_runner_internal.cc b/third_party/jpeg-xl/lib/threads/thread_parallel_runner_internal.cc new file mode 100644 index 0000000000..f26a9ba263 --- /dev/null +++ b/third_party/jpeg-xl/lib/threads/thread_parallel_runner_internal.cc @@ -0,0 +1,215 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "lib/threads/thread_parallel_runner_internal.h" + +#include <algorithm> + +#if defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER) || \ + defined(THREAD_SANITIZER) +#include "sanitizer/common_interface_defs.h" // __sanitizer_print_stack_trace +#endif // defined(*_SANITIZER) + +#include <jxl/thread_parallel_runner.h> + +#include "lib/jxl/base/profiler.h" + +namespace { + +// Important: JXL_ASSERT does not guarantee running the `condition` code, +// use only for debug mode checks. + +#if JXL_ENABLE_ASSERT +// Exits the program after printing a stack trace when possible. +bool Abort() { +#if defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER) || \ + defined(THREAD_SANITIZER) + // If compiled with any sanitizer print a stack trace. This call doesn't crash + // the program, instead the trap below will crash it also allowing gdb to + // break there. + __sanitizer_print_stack_trace(); +#endif // defined(*_SANITIZER) + +#ifdef _MSC_VER + __debugbreak(); + abort(); +#else + __builtin_trap(); +#endif +} +#define JXL_ASSERT(condition) \ + do { \ + if (!(condition)) { \ + Abort(); \ + } \ + } while (0) +#else +#define JXL_ASSERT(condition) \ + do { \ + } while (0) +#endif +} // namespace + +namespace jpegxl { + +// static +JxlParallelRetCode ThreadParallelRunner::Runner( + void* runner_opaque, void* jpegxl_opaque, JxlParallelRunInit init, + JxlParallelRunFunction func, uint32_t start_range, uint32_t end_range) { + ThreadParallelRunner* self = + static_cast<ThreadParallelRunner*>(runner_opaque); + if (start_range > end_range) return -1; + if (start_range == end_range) return 0; + + int ret = init(jpegxl_opaque, std::max<size_t>(self->num_worker_threads_, 1)); + if (ret != 0) return ret; + + // Use a sequential run when num_worker_threads_ is zero since we have no + // worker threads. + if (self->num_worker_threads_ == 0) { + const size_t thread = 0; + for (uint32_t task = start_range; task < end_range; ++task) { + func(jpegxl_opaque, task, thread); + } + return 0; + } + + if (self->depth_.fetch_add(1, std::memory_order_acq_rel) != 0) { + return -1; // Must not re-enter. + } + + const WorkerCommand worker_command = + (static_cast<WorkerCommand>(start_range) << 32) + end_range; + // Ensure the inputs do not result in a reserved command. + JXL_ASSERT(worker_command != kWorkerWait); + JXL_ASSERT(worker_command != kWorkerOnce); + JXL_ASSERT(worker_command != kWorkerExit); + + self->data_func_ = func; + self->jpegxl_opaque_ = jpegxl_opaque; + self->num_reserved_.store(0, std::memory_order_relaxed); + + self->StartWorkers(worker_command); + self->WorkersReadyBarrier(); + + if (self->depth_.fetch_add(-1, std::memory_order_acq_rel) != 1) { + return -1; + } + return 0; +} + +// static +void ThreadParallelRunner::RunRange(ThreadParallelRunner* self, + const WorkerCommand command, + const int thread) { + const uint32_t begin = command >> 32; + const uint32_t end = command & 0xFFFFFFFF; + const uint32_t num_tasks = end - begin; + const uint32_t num_worker_threads = self->num_worker_threads_; + + // OpenMP introduced several "schedule" strategies: + // "single" (static assignment of exactly one chunk per thread): slower. + // "dynamic" (allocates k tasks at a time): competitive for well-chosen k. + // "guided" (allocates k tasks, decreases k): computing k = remaining/n + // is faster than halving k each iteration. We prefer this strategy + // because it avoids user-specified parameters. + + for (;;) { +#if 0 + // dynamic + const uint32_t my_size = std::max(num_tasks / (num_worker_threads * 4), 1); +#else + // guided + const uint32_t num_reserved = + self->num_reserved_.load(std::memory_order_relaxed); + // It is possible that more tasks are reserved than ready to run. + const uint32_t num_remaining = + num_tasks - std::min(num_reserved, num_tasks); + const uint32_t my_size = + std::max(num_remaining / (num_worker_threads * 4), 1u); +#endif + const uint32_t my_begin = begin + self->num_reserved_.fetch_add( + my_size, std::memory_order_relaxed); + const uint32_t my_end = std::min(my_begin + my_size, begin + num_tasks); + // Another thread already reserved the last task. + if (my_begin >= my_end) { + break; + } + for (uint32_t task = my_begin; task < my_end; ++task) { + self->data_func_(self->jpegxl_opaque_, task, thread); + } + } +} + +// static +void ThreadParallelRunner::ThreadFunc(ThreadParallelRunner* self, + const int thread) { + // Until kWorkerExit command received: + for (;;) { + std::unique_lock<std::mutex> lock(self->mutex_); + // Notify main thread that this thread is ready. + if (++self->workers_ready_ == self->num_threads_) { + self->workers_ready_cv_.notify_one(); + } + RESUME_WAIT: + // Wait for a command. + self->worker_start_cv_.wait(lock); + const WorkerCommand command = self->worker_start_command_; + switch (command) { + case kWorkerWait: // spurious wakeup: + goto RESUME_WAIT; // lock still held, avoid incrementing ready. + case kWorkerOnce: + lock.unlock(); + self->data_func_(self->jpegxl_opaque_, thread, thread); + break; + case kWorkerExit: + return; // exits thread + default: + lock.unlock(); + RunRange(self, command, thread); + break; + } + } +} + +ThreadParallelRunner::ThreadParallelRunner(const int num_worker_threads) + : num_worker_threads_(num_worker_threads), + num_threads_(std::max(num_worker_threads, 1)) { + PROFILER_ZONE("ThreadParallelRunner ctor"); + + threads_.reserve(num_worker_threads_); + + // Suppress "unused-private-field" warning. + (void)padding1; + (void)padding2; + + // Safely handle spurious worker wakeups. + worker_start_command_ = kWorkerWait; + + for (uint32_t i = 0; i < num_worker_threads_; ++i) { + threads_.emplace_back(ThreadFunc, this, i); + } + + if (num_worker_threads_ != 0) { + WorkersReadyBarrier(); + } + + // Warm up profiler on worker threads so its expensive initialization + // doesn't count towards other timer measurements. + RunOnEachThread( + [](const int task, const int thread) { PROFILER_ZONE("@InitWorkers"); }); +} + +ThreadParallelRunner::~ThreadParallelRunner() { + if (num_worker_threads_ != 0) { + StartWorkers(kWorkerExit); + } + + for (std::thread& thread : threads_) { + JXL_ASSERT(thread.joinable()); + thread.join(); + } +} +} // namespace jpegxl diff --git a/third_party/jpeg-xl/lib/threads/thread_parallel_runner_internal.h b/third_party/jpeg-xl/lib/threads/thread_parallel_runner_internal.h new file mode 100644 index 0000000000..199a5f2a8b --- /dev/null +++ b/third_party/jpeg-xl/lib/threads/thread_parallel_runner_internal.h @@ -0,0 +1,166 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. +// + +// C++ implementation using std::thread of a ::JxlParallelRunner. + +// The main class in this module, ThreadParallelRunner, implements a static +// method ThreadParallelRunner::Runner than can be passed as a +// JxlParallelRunner when using the JPEG XL library. This uses std::thread +// internally and related synchronization functions. The number of threads +// created is fixed at construction time and the threads are re-used for every +// ThreadParallelRunner::Runner call. Only one concurrent Runner() call per +// instance is allowed at a time. +// +// This is a scalable, lower-overhead thread pool runner, especially suitable +// for data-parallel computations in the fork-join model, where clients need to +// know when all tasks have completed. +// +// This thread pool can efficiently load-balance millions of tasks using an +// atomic counter, thus avoiding per-task virtual or system calls. With 48 +// hyperthreads and 1M tasks that add to an atomic counter, overall runtime is +// 10-20x higher when using std::async, and ~200x for a queue-based thread +// pool. +// +// Usage: +// ThreadParallelRunner runner; +// JxlDecode( +// ... , &ThreadParallelRunner::Runner, static_cast<void*>(&runner)); + +#ifndef LIB_THREADS_THREAD_PARALLEL_RUNNER_INTERNAL_H_ +#define LIB_THREADS_THREAD_PARALLEL_RUNNER_INTERNAL_H_ + +#include <jxl/memory_manager.h> +#include <jxl/parallel_runner.h> +#include <stddef.h> +#include <stdint.h> +#include <stdlib.h> + +#include <atomic> +#include <condition_variable> //NOLINT +#include <mutex> //NOLINT +#include <thread> //NOLINT +#include <vector> + +namespace jpegxl { + +// Main helper class implementing the ::JxlParallelRunner interface. +class ThreadParallelRunner { + public: + // ::JxlParallelRunner interface. + static JxlParallelRetCode Runner(void* runner_opaque, void* jpegxl_opaque, + JxlParallelRunInit init, + JxlParallelRunFunction func, + uint32_t start_range, uint32_t end_range); + + // Starts the given number of worker threads and blocks until they are ready. + // "num_worker_threads" defaults to one per hyperthread. If zero, all tasks + // run on the main thread. + explicit ThreadParallelRunner( + int num_worker_threads = std::thread::hardware_concurrency()); + + // Waits for all threads to exit. + ~ThreadParallelRunner(); + + // Returns maximum number of main/worker threads that may call Func. Useful + // for allocating per-thread storage. + size_t NumThreads() const { return num_threads_; } + + // Runs func(thread, thread) on all thread(s) that may participate in Run. + // If NumThreads() == 0, runs on the main thread with thread == 0, otherwise + // concurrently called by each worker thread in [0, NumThreads()). + template <class Func> + void RunOnEachThread(const Func& func) { + if (num_worker_threads_ == 0) { + const int thread = 0; + func(thread, thread); + return; + } + + data_func_ = reinterpret_cast<JxlParallelRunFunction>(&CallClosure<Func>); + jpegxl_opaque_ = const_cast<void*>(static_cast<const void*>(&func)); + StartWorkers(kWorkerOnce); + WorkersReadyBarrier(); + } + + JxlMemoryManager memory_manager; + + private: + // After construction and between calls to Run, workers are "ready", i.e. + // waiting on worker_start_cv_. They are "started" by sending a "command" + // and notifying all worker_start_cv_ waiters. (That is why all workers + // must be ready/waiting - otherwise, the notification will not reach all of + // them and the main thread waits in vain for them to report readiness.) + using WorkerCommand = uint64_t; + + // Special values; all others encode the begin/end parameters. Note that all + // these are no-op ranges (begin >= end) and therefore never used to encode + // ranges. + static constexpr WorkerCommand kWorkerWait = ~1ULL; + static constexpr WorkerCommand kWorkerOnce = ~2ULL; + static constexpr WorkerCommand kWorkerExit = ~3ULL; + + // Calls f(task, thread). Used for type erasure of Func arguments. The + // signature must match JxlParallelRunFunction, hence a void* argument. + template <class Closure> + static void CallClosure(void* f, const uint32_t task, const size_t thread) { + (*reinterpret_cast<const Closure*>(f))(task, thread); + } + + void WorkersReadyBarrier() { + std::unique_lock<std::mutex> lock(mutex_); + // Typically only a single iteration. + while (workers_ready_ != threads_.size()) { + workers_ready_cv_.wait(lock); + } + workers_ready_ = 0; + + // Safely handle spurious worker wakeups. + worker_start_command_ = kWorkerWait; + } + + // Precondition: all workers are ready. + void StartWorkers(const WorkerCommand worker_command) { + mutex_.lock(); + worker_start_command_ = worker_command; + // Workers will need this lock, so release it before they wake up. + mutex_.unlock(); + worker_start_cv_.notify_all(); + } + + // Attempts to reserve and perform some work from the global range of tasks, + // which is encoded within "command". Returns after all tasks are reserved. + static void RunRange(ThreadParallelRunner* self, const WorkerCommand command, + const int thread); + + static void ThreadFunc(ThreadParallelRunner* self, int thread); + + // Unmodified after ctor, but cannot be const because we call thread::join(). + std::vector<std::thread> threads_; + + const uint32_t num_worker_threads_; // == threads_.size() + const uint32_t num_threads_; + + std::atomic<int> depth_{0}; // detects if Run is re-entered (not supported). + + std::mutex mutex_; // guards both cv and their variables. + std::condition_variable workers_ready_cv_; + uint32_t workers_ready_ = 0; + std::condition_variable worker_start_cv_; + WorkerCommand worker_start_command_; + + // Written by main thread, read by workers (after mutex lock/unlock). + JxlParallelRunFunction data_func_; + void* jpegxl_opaque_; + + // Updated by workers; padding avoids false sharing. + uint8_t padding1[64]; + std::atomic<uint32_t> num_reserved_{0}; + uint8_t padding2[64]; +}; + +} // namespace jpegxl + +#endif // LIB_THREADS_THREAD_PARALLEL_RUNNER_INTERNAL_H_ diff --git a/third_party/jpeg-xl/lib/threads/thread_parallel_runner_test.cc b/third_party/jpeg-xl/lib/threads/thread_parallel_runner_test.cc new file mode 100644 index 0000000000..a757c3018b --- /dev/null +++ b/third_party/jpeg-xl/lib/threads/thread_parallel_runner_test.cc @@ -0,0 +1,123 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include <atomic> + +#include "lib/jxl/base/data_parallel.h" +#include "lib/jxl/test_utils.h" +#include "lib/jxl/testing.h" + +using jxl::test::ThreadPoolForTests; + +namespace jpegxl { +namespace { + +int PopulationCount(uint64_t bits) { + int num_set = 0; + while (bits != 0) { + num_set += bits & 1; + bits >>= 1; + } + return num_set; +} + +// Ensures task parameter is in bounds, every parameter is reached, +// pool can be reused (multiple consecutive Run calls), pool can be destroyed +// (joining with its threads), num_threads=0 works (runs on current thread). +TEST(ThreadParallelRunnerTest, TestPool) { + for (int num_threads = 0; num_threads <= 18; ++num_threads) { + ThreadPoolForTests pool(num_threads); + for (int num_tasks = 0; num_tasks < 32; ++num_tasks) { + std::vector<int> mementos(num_tasks); + for (int begin = 0; begin < 32; ++begin) { + std::fill(mementos.begin(), mementos.end(), 0); + EXPECT_TRUE(RunOnPool( + &pool, begin, begin + num_tasks, jxl::ThreadPool::NoInit, + [begin, num_tasks, &mementos](const int task, const int thread) { + // Parameter is in the given range + EXPECT_GE(task, begin); + EXPECT_LT(task, begin + num_tasks); + + // Store mementos to be sure we visited each task. + mementos.at(task - begin) = 1000 + task; + }, + "TestPool")); + for (int task = begin; task < begin + num_tasks; ++task) { + EXPECT_EQ(1000 + task, mementos.at(task - begin)); + } + } + } + } +} + +// Verify "thread" parameter when processing few tasks. +TEST(ThreadParallelRunnerTest, TestSmallAssignments) { + // WARNING: cumulative total threads must not exceed profiler.h kMaxThreads. + const int kMaxThreads = 8; + for (int num_threads = 1; num_threads <= kMaxThreads; ++num_threads) { + ThreadPoolForTests pool(num_threads); + + // (Avoid mutex because it may perturb the worker thread scheduling) + std::atomic<uint64_t> id_bits{0}; + std::atomic<int> num_calls{0}; + + EXPECT_TRUE(RunOnPool( + &pool, 0, num_threads, jxl::ThreadPool::NoInit, + [&num_calls, num_threads, &id_bits](const int task, const int thread) { + num_calls.fetch_add(1, std::memory_order_relaxed); + + EXPECT_LT(thread, num_threads); + uint64_t bits = id_bits.load(std::memory_order_relaxed); + while ( + !id_bits.compare_exchange_weak(bits, bits | (1ULL << thread))) { + } + }, + "TestSmallAssignments")); + + // Correct number of tasks. + EXPECT_EQ(num_threads, num_calls.load()); + + const int num_participants = PopulationCount(id_bits.load()); + // Can't expect equality because other workers may have woken up too late. + EXPECT_LE(num_participants, num_threads); + } +} + +struct Counter { + Counter() { + // Suppress "unused-field" warning. + (void)padding; + } + void Assimilate(const Counter& victim) { counter += victim.counter; } + int counter = 0; + int padding[31]; +}; + +TEST(ThreadParallelRunnerTest, TestCounter) { + const int kNumThreads = 12; + ThreadPoolForTests pool(kNumThreads); + alignas(128) Counter counters[kNumThreads]; + + const int kNumTasks = kNumThreads * 19; + EXPECT_TRUE(RunOnPool( + &pool, 0, kNumTasks, jxl::ThreadPool::NoInit, + [&counters](const int task, const int thread) { + counters[thread].counter += task; + }, + "TestCounter")); + + int expected = 0; + for (int i = 0; i < kNumTasks; ++i) { + expected += i; + } + + for (int i = 1; i < kNumThreads; ++i) { + counters[0].Assimilate(counters[i]); + } + EXPECT_EQ(expected, counters[0].counter); +} + +} // namespace +} // namespace jpegxl |