Adding upstream version 1:115.7.0.upstream/1%115.7.0 upstream

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-07 17:32:43 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-07 17:32:43 +0000
commit: 6bf0a5cb5034a7e684dcc3500e841785237ce2dd (patch)
tree: a68f146d7fa01f0134297619fbe7e33db084e0aa /third_party/jpeg-xl
parent: Initial commit. (diff)
download: thunderbird-6bf0a5cb5034a7e684dcc3500e841785237ce2dd.tar.xz
thunderbird-6bf0a5cb5034a7e684dcc3500e841785237ce2dd.zip
669 files changed, 158999 insertions, 0 deletions
diff --git a/third_party/jpeg-xl/.bazelignore b/third_party/jpeg-xl/.bazelignore
new file mode 100644
index 0000000000..912eacc1b5
--- /dev/null
+++ b/third_party/jpeg-xl/.bazelignore
@@ -0,0 +1 @@
+third_party
diff --git a/third_party/jpeg-xl/.clang-format b/third_party/jpeg-xl/.clang-format
new file mode 100644
index 0000000000..a61b61c569
--- /dev/null
+++ b/third_party/jpeg-xl/.clang-format
@@ -0,0 +1,4 @@
+BasedOnStyle: Google
+IncludeCategories:
+  - Regex:           '^<hwy/'
+    Priority:        2
diff --git a/third_party/jpeg-xl/.clang-tidy b/third_party/jpeg-xl/.clang-tidy
new file mode 100644
index 0000000000..abccf4ed47
--- /dev/null
+++ b/third_party/jpeg-xl/.clang-tidy
@@ -0,0 +1,70 @@
+# Disabled checks:
+# - google-readability-todo: We don't use the google TODO format.
+#
+# - modernize-deprecated-headers: We don't use std:: versions of the standard
+#   types and functions like size_t or printf, so we should include <stdio.h>
+#   instead <cstdio>.
+# - modernize-return-braced-init-list: this often doesn't improve readability.
+# - modernize-use-auto: is too aggressive towards using auto.
+# - modernize-use-default-member-init: with a mix of constructors and default
+#   member initialization this can be confusing if enforced.
+# - modernize-use-trailing-return-type: does not improve readability when used
+#   systematically.
+# - modernize-use-using: typedefs are ok.
+#
+# - readability-else-after-return: It doesn't always improve readability.
+# - readability-static-accessed-through-instance
+#   It is often more useful and readable to access a constant of a passed
+#   variable (like d.N) instead of using the type of the variable that could be
+#   long and complex.
+# - readability-uppercase-literal-suffix: we write 1.0f, not 1.0F.
+
+Checks: >-
+  bugprone-*,
+  clang-*,
+  -clang-diagnostic-unused-command-line-argument,
+  google-*,
+  modernize-*,
+  performance-*,
+  readability-*,
+  -google-readability-todo,
+  -modernize-deprecated-headers,
+  -modernize-return-braced-init-list,
+  -modernize-use-auto,
+  -modernize-use-default-member-init,
+  -modernize-use-trailing-return-type,
+  -modernize-use-using,
+  -readability-else-after-return,
+  -readability-function-cognitive-complexity,
+  -readability-static-accessed-through-instance,
+  -readability-uppercase-literal-suffix,
+
+
+WarningsAsErrors: >-
+  bugprone-argument-comment,
+  bugprone-macro-parentheses,
+  bugprone-suspicious-string-compare,
+  bugprone-use-after-move,
+  clang-*,
+  clang-analyzer-*,
+  -clang-diagnostic-unused-command-line-argument,
+  google-build-using-namespace,
+  google-explicit-constructor,
+  google-readability-braces-around-statements,
+  google-readability-namespace-comments,
+  modernize-use-override,
+  readability-inconsistent-declaration-parameter-name
+
+# We are only interested in the headers from this projects, excluding
+# third_party/ and build/.
+HeaderFilterRegex: '^.*/(lib|tools)/.*\.h$'
+
+CheckOptions:
+  - key:             readability-braces-around-statements.ShortStatementLines
+    value:           '2'
+  - key:             google-readability-braces-around-statements.ShortStatementLines
+    value:           '2'
+  - key:             readability-implicit-bool-conversion.AllowPointerConditions
+    value:           '1'
+  - key:             readability-implicit-bool-conversion.AllowIntegerConditions
+    value:           '1'
diff --git a/third_party/jpeg-xl/.github/ISSUE_TEMPLATE/bug_report.md b/third_party/jpeg-xl/.github/ISSUE_TEMPLATE/bug_report.md
new file mode 100644
index 0000000000..a7a1429861
--- /dev/null
+++ b/third_party/jpeg-xl/.github/ISSUE_TEMPLATE/bug_report.md
@@ -0,0 +1,37 @@
+---
+name: Bug report
+about: Create a report to help us improve
+title: ''
+labels: ''
+assignees: ''
+
+---
+
+**Describe the bug**
+A clear and concise description of what the bug is.
+
+**To Reproduce**
+Steps to reproduce the behavior:
+
+**Expected behavior**
+A clear and concise description of what you expected to happen.
+
+**Screenshots**
+If applicable, add screenshots or example input/output images to help explain your problem.
+
+**Environment**
+ - OS: [e.g. Windows]
+ - Compiler version: [e.g. clang 11.0.1]
+ - CPU type: [e.g. x86_64]
+ - cjxl/djxl version string: [e.g. cjxl [v0.3.7 | SIMD supported: SSE4,Scalar]]
+
+**Additional context**
+Add any other context about the problem here.
+
+<!--
+Currently github does not allow uploading files that end in `.jxl`, but when you
+rename them for example as `image.jxl.jpg`, it will be possible to upload them
+and also view them in browsers that are configured to support it.
+
+See https://github.com/orgs/github-community/discussions/18139
+-->
diff --git a/third_party/jpeg-xl/.github/ISSUE_TEMPLATE/feature_request.md b/third_party/jpeg-xl/.github/ISSUE_TEMPLATE/feature_request.md
new file mode 100644
index 0000000000..bbcbbe7d61
--- /dev/null
+++ b/third_party/jpeg-xl/.github/ISSUE_TEMPLATE/feature_request.md
@@ -0,0 +1,20 @@
+---
+name: Feature request
+about: Suggest an idea for this project
+title: ''
+labels: ''
+assignees: ''
+
+---
+
+**Is your feature request related to a problem? Please describe.**
+A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
+
+**Describe the solution you'd like**
+A clear and concise description of what you want to happen.
+
+**Describe alternatives you've considered**
+A clear and concise description of any alternative solutions or features you've considered.
+
+**Additional context**
+Add any other context or screenshots about the feature request here.
diff --git a/third_party/jpeg-xl/.github/workflows/build_test.yml b/third_party/jpeg-xl/.github/workflows/build_test.yml
new file mode 100644
index 0000000000..ddc81b6fb0
--- /dev/null
+++ b/third_party/jpeg-xl/.github/workflows/build_test.yml
@@ -0,0 +1,433 @@
+# Copyright (c) the JPEG XL Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+# Workflow for building and running tests.
+
+name: Build/Test
+on:
+  push:
+    branches:
+      - main
+      - v*.*.x
+  pull_request:
+    types: [opened, reopened, labeled, synchronize]
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }}
+  cancel-in-progress: ${{ github.event_name == 'pull_request' }}
+
+jobs:
+  ubuntu_build:
+    name: Ubuntu Build ${{ matrix.name }}
+    runs-on: ${{ matrix.os || 'ubuntu-latest' }}
+    strategy:
+      matrix:
+        # We have one job per "name" in the matrix. Attributes are set on the
+        # specific job names.
+        name: [release, debug, asan, msan, scalar]
+        include:
+          - name: release
+            test_in_pr: true
+            # Track static stack size on build and check it doesn't exceed 3 kB.
+            env_stack_size: 1
+            max_stack: 3000
+            # Conformance tooling test requires numpy.
+            apt_pkgs: graphviz python3-numpy
+          - name: lowprecision
+            mode: release
+            test_in_pr: true
+            cmake_args: -DCMAKE_CXX_FLAGS=-DJXL_HIGH_PRECISION=0
+          - name: debug
+            # Runs on AVX3 CPUs require more stack than others. Make sure to
+            # test on AVX3-enabled CPUs when changing this value.
+            env_test_stack_size: 4000
+          # Build scalar-only hwy instructions.
+          - name: scalar
+            mode: release
+            cxxflags: -DHWY_COMPILE_ONLY_SCALAR
+          # Disabling optional features to speed up msan build a little bit.
+          - name: msan
+            skip_install: true
+            cmake_args: >-
+              -DJPEGXL_ENABLE_DEVTOOLS=OFF -DJPEGXL_ENABLE_PLUGINS=OFF
+              -DJPEGXL_ENABLE_VIEWERS=OFF
+          - name: asan
+            skip_install: true
+          - name: coverage
+            apt_pkgs: gcovr
+            # Coverage builds require a bit more RAM.
+            env_test_stack_size: 2048
+          # Build with support for decoding to JPEG bytes disabled. Produces a
+          # smaller build if only decoding to pixels is needed.
+          - name: release-nojpeg
+            mode: release
+            cxxflags: -DJXL_DEBUG_ON_ABORT=0
+            cmake_args: >-
+              -DJPEGXL_ENABLE_TRANSCODE_JPEG=OFF
+              -DJPEGXL_ENABLE_PLUGINS=OFF
+              -DJPEGXL_ENABLE_VIEWERS=OFF
+          # Build optimized for binary size, all features not needed for
+          # reconstructing pixels is disabled.
+          - name: release:minimal
+            mode: release
+            cxxflags: -DJXL_DEBUG_ON_ABORT=0
+            cmake_args: >-
+              -DJPEGXL_ENABLE_TRANSCODE_JPEG=OFF
+              -DJPEGXL_ENABLE_BOXES=OFF
+              -DJPEGXL_ENABLE_PLUGINS=OFF
+              -DJPEGXL_ENABLE_VIEWERS=OFF
+          # Builds with gcc in release mode
+          - name: release:gcc8
+            mode: release
+            apt_pkgs: gcc-8 g++-8
+            cmake_args: >-
+              -DCMAKE_C_COMPILER=gcc-8 -DCMAKE_CXX_COMPILER=g++-8
+          # Builds with clang-5 in release mode
+          - name: release:clang-5
+            os: ubuntu-18.04
+            mode: release
+            # TODO(eustas): investigate, why static brotli library is not found.
+            skip_install: true
+            apt_pkgs: clang-5.0
+            cmake_args: >-
+              -DCMAKE_C_COMPILER=clang-5.0 -DCMAKE_CXX_COMPILER=clang++-5.0
+              -DJPEGXL_ENABLE_PLUGINS=OFF
+
+    env:
+      CCACHE_DIR: ${{ github.workspace }}/.ccache
+      # Whether we track the stack size.
+      STACK_SIZE: ${{ matrix.env_stack_size }}
+      TEST_STACK_LIMIT: ${{ matrix.env_test_stack_size }}
+      WILL_RUN_TESTS: ${{ github.event_name == 'push' || (github.event_name == 'pull_request' && (matrix.test_in_pr || contains(github.event.pull_request.labels.*.name, 'CI:full'))) }}
+
+    steps:
+    - name: Install build deps
+      run: |
+        sudo apt update
+        sudo apt install -y \
+          ccache \
+          clang-7 \
+          cmake \
+          doxygen \
+          libbenchmark-dev \
+          libbenchmark-tools \
+          libbrotli-dev \
+          libgdk-pixbuf2.0-dev \
+          libgif-dev \
+          libgtest-dev \
+          libgtk2.0-dev  \
+          libjpeg-dev \
+          libopenexr-dev \
+          libpng-dev \
+          libwebp-dev \
+          ninja-build \
+          pkg-config \
+          xvfb \
+          ${{ matrix.apt_pkgs }} \
+        #
+        echo "CC=clang-7" >> $GITHUB_ENV
+        echo "CXX=clang++-7" >> $GITHUB_ENV
+    - name: Checkout the source
+      uses: actions/checkout@v2
+      with:
+        submodules: true
+        fetch-depth: 2
+
+    - name: Setup the LLVM source path
+      if: matrix.name == 'msan'
+      run: |
+        LLVM_ROOT=${GITHUB_WORKSPACE}/llvm_root
+        mkdir -p ${LLVM_ROOT}
+        echo "LLVM_ROOT=${LLVM_ROOT}" >> $GITHUB_ENV
+    - name: Cache LLVM sources
+      if: matrix.name == 'msan'
+      uses: actions/cache@v2
+      with:
+        path: ${{ env.LLVM_ROOT }}
+        key: llvm
+    - name: Checkout the LLVM source
+      if: matrix.name == 'msan'
+      uses: actions/checkout@v2
+      with:
+        submodules: false
+        repository: llvm/llvm-project
+        ref: llvmorg-7.0.1
+        path: llvm_root
+
+    - name: Sphinx dependencies
+      # Dependencies for sphinx HTML documentation
+      if: matrix.name == 'release'
+      run: |
+        pip3 install -r doc/sphinx/requirements.txt
+    - name: Git environment
+      id: git-env
+      run: |
+        echo "::set-output name=parent::$(git rev-parse ${{ github.sha }}^)"
+      shell: bash
+    - name: ccache
+      uses: actions/cache@v2
+      with:
+        path: ${{ env.CCACHE_DIR }}
+        # When the cache hits the key it is not updated, so if this is a rebuild
+        # of the same Pull Request it will reuse the cache if still around. For
+        # either Pull Requests or new pushes to main, this will use the parent
+        # hash as the starting point from the restore-keys entry.
+        key: build-${{ runner.os }}-${{ github.sha }}-${{ matrix.name }}
+        restore-keys: |
+          build-${{ runner.os }}-${{ steps.git-env.outputs.parent }}-${{ matrix.name }}
+    - name: Build
+      if: matrix.name != 'coverage' || env.WILL_RUN_TESTS == 'true'
+      run: |
+        mkdir -p ${CCACHE_DIR}
+        echo "max_size = 200M" > ${CCACHE_DIR}/ccache.conf
+        mode="${{ matrix.mode }}"
+        build_tests=$([ "$WILL_RUN_TESTS" == "true" ] && echo "ON" || echo "OFF")
+        [[ -n "${mode}" ]] || mode="${{ matrix.name }}"
+        ./ci.sh ${mode} -DJPEGXL_FORCE_SYSTEM_BROTLI=ON \
+          -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
+          -DCMAKE_C_COMPILER_LAUNCHER=ccache \
+          -DBUILD_TESTING=${build_tests} \
+          ${{ matrix.cmake_args }}
+      env:
+        SKIP_TEST: 1
+        CMAKE_CXX_FLAGS: ${{ matrix.cxxflags }}
+    - name: Build stats
+      run: |
+        awk '!/^#/ {total[$4]+=($2-$1);cntr[$4]+=1} END {for (key in total) print total[key]/cntr[key] " " key}' build/.ninja_log | sort -n | tail -n 25
+    - name: ccache stats
+      run: ccache --show-stats
+    - name: Build stats ${{ matrix.name }}
+      if: matrix.mode == 'release' || matrix.name == 'release'
+      run: |
+        tools/build_stats.py --save build/stats.json \
+          --max-stack ${{ matrix.max_stack || '0' }} \
+          cjxl djxl libjxl.so libjxl_dec.so
+    # Check that we can build the example project against the installed libs.
+    - name: Install and build examples
+      if: |
+        (matrix.mode == 'release' || matrix.name == 'release') &&
+        !matrix.skip_install
+      run: |
+        set -x
+        sudo cmake --build build -- install
+        cmake -Bbuild-example -Hexamples -G Ninja
+        cmake --build build-example
+        if ldd build-example/decode_oneshot_static | grep libjxl; then
+          echo "decode_oneshot_static is not using the static lib" >&2
+          exit 1
+        fi
+        # Test that the built binaries run.
+        echo -e -n "PF\n1 1\n-1.0\n\0\0\x80\x3f\0\0\x80\x3f\0\0\x80\x3f" > test.pfm
+        build-example/encode_oneshot test.pfm test.jxl
+        build-example/encode_oneshot_static test.pfm test-static.jxl
+        build-example/decode_oneshot test.jxl dec.pfm dec.icc
+        build-example/decode_oneshot_static test.jxl dec-static.pfm dec-static.icc
+    # Run the tests on push and when requested in pull_request.
+    - name: Test ${{ matrix.mode }}
+      if: env.WILL_RUN_TESTS == 'true'
+      run: |
+        ./ci.sh test ${{ matrix.ctest_args }}
+    # Print the running time summary for the slowest tests.
+    - name: Test runtime stats
+      run: |
+        sort build/Testing/Temporary/CTestCostData.txt -k 3 -n | tail -n 20 || true
+    - name: Build HTML documentation (sphinx/readthetdocs)
+      if: matrix.name == 'release'
+      run: |
+        cmake --build build -- rtd-html
+    - name: Coverage report
+      if: github.event_name == 'push' && matrix.name == 'coverage'
+      run: |
+        ./ci.sh coverage_report
+    - name: Coverage upload to Codecov
+      if: github.event_name == 'push' && matrix.name == 'coverage'
+      uses: codecov/codecov-action@v2
+      with:
+        flags: unittests
+        files: build/coverage.xml
+    - name: Fast benchmark ${{ matrix.mode }}
+      if: |
+        matrix.name != 'coverage' && (github.event_name == 'push' ||
+        (github.event_name == 'pull_request' && (
+         matrix.test_in_pr ||
+         contains(github.event.pull_request.labels.*.name, 'CI:full'))))
+      run: |
+        STORE_IMAGES=0 ./ci.sh fast_benchmark
+    # Run gbench once, just to make sure it runs, not for actual benchmarking.
+    # This doesn't work on msan because we use gbench library from the system
+    # which is not instrumented by msan.
+    - name: gbench check
+      if: |
+        matrix.name == 'release' || (
+          github.event_name == 'push' && matrix.name != 'msan')
+      run: |
+        ./ci.sh gbench --benchmark_min_time=0
+
+  windows_msys:
+    name: Windows MSYS2 / ${{ matrix.msystem }}
+    runs-on: windows-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+         - msystem: mingw64
+         - msystem: clang64
+         - msystem: mingw32
+           # TODO(eustas): investigate HWY Mul failures
+           disable_tests:
+             - HwyMulTestGroup/HwyMulTest\.TestAllMulHigh/EMU128
+             - HwyMulTestGroup/HwyMulTest\.TestAllMulFixedPoint15/EMU128
+             - DecodeTest/DecodeTestParam\.PixelTest/280x12RGBAtoRGBAf32BECallbackEarlyBufferO2
+             - DecodeTest/DecodeTestParam\.PixelTest/280x12RGBAtoRGBAf32BECallbackEarlyBufferO3
+             - DecodeTest/DecodeTestParam\.PixelTest/280x12RGBAtoRGBAf32BECallbackEarlyBufferO7
+             - DecodeTest/DecodeTestParam\.PixelTest/280x12RGBAtoRGBAf32BECallbackEarlyBufferO8
+         - msystem: clang32
+           # TODO(eustas): investigate HWY Sort and JXL ANS failures
+           disable_tests:
+             - SortTestGroup/SortTest\.TestAllSort/.*
+             - ANSTest\.RandomUnbalancedStreamRoundtrip3
+             - ANSTest\.RandomUnbalancedStreamRoundtripBig
+
+    defaults:
+      run:
+        shell: msys2 {0}
+    steps:
+      - name: Checkout the source
+        uses: actions/checkout@v2
+        with:
+          submodules: true
+          fetch-depth: 1
+      - uses: msys2/setup-msys2@v2
+        with:
+          msystem: ${{ matrix.msystem }}
+          update: true
+          path-type: inherit
+          install: >-
+            base-devel
+            git
+          pacboy: >-
+            brotli:p
+            cmake:p
+            giflib:p
+            gtest:p
+            libavif:p
+            libjpeg-turbo:p
+            libpng:p
+            libwebp:p
+            ninja:p
+            toolchain:p
+
+      - name: CMake configure
+        run: |
+          cmake \
+            -DCMAKE_BUILD_TYPE=Release \
+            -DJPEGXL_ENABLE_JNI=OFF \
+            -DJPEGXL_ENABLE_MANPAGES=OFF \
+            -DJPEGXL_FORCE_SYSTEM_BROTLI=ON \
+            -DJPEGXL_FORCE_SYSTEM_GTEST=ON \
+            -B build \
+            -G Ninja
+      - name: CMake build
+        run: cmake --build build
+      - name: Test
+        if: |
+          github.event_name == 'push' ||
+          (github.event_name == 'pull_request' &&
+           contains(github.event.pull_request.labels.*.name, 'CI:full'))
+        run: ctest --test-dir build --parallel 2 --output-on-failure -E "${{ join(matrix.disable_tests, '|') }}"
+
+  wasm32_build:
+    name: WASM wasm32/${{ matrix.variant }}
+    runs-on: ubuntu-latest
+    env:
+      CCACHE_DIR: ${{ github.workspace }}/.ccache
+      BUILD_TARGET: wasm32
+      EM_VERSION: 3.1.1
+      NODE_VERSION: 18
+
+    strategy:
+      matrix:
+        include:
+          - variant: scalar
+          - variant: simd
+
+    steps:
+    - uses: actions/checkout@v2
+      with:
+        submodules: true
+        fetch-depth: 1
+    - name: Install build deps
+      shell: bash
+      run: |
+        set -x
+        sudo apt update
+        pkgs=(
+          # Build dependencies
+          ccache
+          cmake
+          doxygen
+          ninja-build
+          pkg-config
+        )
+        DEBIAN_FRONTEND=noninteractive sudo apt install -y "${pkgs[@]}"
+
+    - name: Git environment
+      id: git-env
+      run: |
+        echo "::set-output name=parent::$(git rev-parse ${{ github.sha }}^)"
+      shell: bash
+    - name: ccache
+      uses: actions/cache@v2
+      with:
+        path: ${{ env.CCACHE_DIR }}
+        key: build-wasm-${{ runner.os }}-${{ github.sha }}-${{ matrix.variant }}
+        restore-keys: |
+          build-wasm-${{ runner.os }}-${{ steps.git-env.outputs.parent }}-${{ matrix.variant }}
+
+    - name: Install node
+      uses: actions/setup-node@v3
+      with:
+        node-version: ${{env.NODE_VERSION}}
+
+    - name: Get non-EMSDK node path
+      run: which node >> $HOME/.base_node_path
+
+    - name: Install emsdk
+      uses: mymindstorm/setup-emsdk@v11
+      # TODO(deymo): We could cache this action but it doesn't work when running
+      # in a matrix.
+      with:
+        version: ${{env.EM_VERSION}}
+        no-cache: true
+
+    - name: Set EMSDK node version
+      run: |
+        echo "NODE_JS='$(cat $HOME/.base_node_path)'" >> $EMSDK/.emscripten
+        emsdk construct_env
+
+    # TODO(deymo): Build and install other dependencies like libpng, libjpeg,
+    # etc.
+    - name: Build
+      run: |
+        mkdir -p ${CCACHE_DIR}
+        echo "max_size = 200M" > ${CCACHE_DIR}/ccache.conf
+        if [[ "${{ matrix.variant }}" == "simd" ]]; then
+          export ENABLE_WASM_SIMD=1
+        fi
+        ./ci.sh release \
+          -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
+          -DCMAKE_C_COMPILER_LAUNCHER=ccache
+      env:
+        SKIP_TEST: 1
+    - name: ccache stats
+      run: ccache --show-stats
+
+    - name: Test
+      if: |
+        github.event_name == 'push' ||
+        (github.event_name == 'pull_request' &&
+         contains(github.event.pull_request.labels.*.name, 'CI:full'))
+      run: |
+        ./ci.sh test
diff --git a/third_party/jpeg-xl/.github/workflows/build_test_cross.yml b/third_party/jpeg-xl/.github/workflows/build_test_cross.yml
new file mode 100644
index 0000000000..5b537205c4
--- /dev/null
+++ b/third_party/jpeg-xl/.github/workflows/build_test_cross.yml
@@ -0,0 +1,183 @@
+# Copyright (c) the JPEG XL Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+# Workflow for building and running tests.
+
+name: Build/Test Cross
+on:
+  push:
+    branches:
+      - main
+      - v*.*.x
+  pull_request:
+    types: [opened, reopened, labeled, synchronize]
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }}
+  cancel-in-progress: ${{ github.event_name == 'pull_request' }}
+
+jobs:
+  cross_compile_ubuntu:
+    name: Cross-compiling ${{ matrix.build_target }} ${{ matrix.variant }} 
+    runs-on: [ubuntu-22.04]
+    container:
+      image: debian:bullseye
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - arch: arm64
+            build_target: aarch64-linux-gnu
+            cmake_args:
+             - -DCMAKE_CROSSCOMPILING_EMULATOR=/usr/bin/qemu-aarch64-static
+
+          - arch: arm64
+            variant: SVE
+            build_target: aarch64-linux-gnu
+            cmake_args:
+             - -DCMAKE_CROSSCOMPILING_EMULATOR=/usr/bin/qemu-aarch64-static
+             - -DJPEGXL_ENABLE_OPENEXR=off
+             - -DJPEGXL_ENABLE_SIZELESS_VECTORS=on
+            cmake_flags: -march=armv8-a+sve
+            c_compiler: aarch64-linux-gnu-gcc
+            cxx_compiler: aarch64-linux-gnu-g++
+            disable_tests: true
+
+          - arch: arm64
+            variant: lowprecision
+            build_target: aarch64-linux-gnu
+            cmake_args:
+             - -DCMAKE_CROSSCOMPILING_EMULATOR=/usr/bin/qemu-aarch64-static
+             - -DCMAKE_CXX_FLAGS=-DJXL_HIGH_PRECISION=0
+
+          - arch: armhf
+            build_target: arm-linux-gnueabihf
+            cmake_args: [-DCMAKE_CROSSCOMPILING_EMULATOR=/usr/bin/qemu-arm-static]
+
+          - arch: i386
+            test_in_pr: true
+            build_target: i686-linux-gnu
+
+    env:
+      BUILD_DIR: build
+      WILL_RUN_TESTS: ${{ (github.event_name == 'push' || (github.event_name == 'pull_request' && (matrix.test_in_pr || contains(github.event.pull_request.labels.*.name, 'CI:full')))) && !matrix.disable_tests }}
+
+    steps:
+    - name: Setup apt
+      shell: bash
+      run: |
+        set -x
+        apt-get update -y
+        apt-get install -y ca-certificates debian-ports-archive-keyring
+
+        dpkg --add-architecture "${{ matrix.arch }}"
+
+        # Update the sources.list with the split of supported architectures.
+        bkplist="/etc/apt/sources.list.bkp"
+        mv /etc/apt/sources.list "${bkplist}"
+
+        newlist="/etc/apt/sources.list"
+        rm -f "${newlist}"
+
+        main_list="amd64,${{ matrix.arch }}"
+        port_list=""
+        if [[ "${{ matrix.arch }}" == "i386" ]]; then
+          main_list="amd64,i386"
+        else
+          port_list="${{ matrix.arch }}"
+        fi
+
+        grep -v -E '^#' "${bkplist}" |
+          sed -E "s;^deb (http[^ ]+) (.*)\$;deb [arch=${main_list}] \\1 \\2\ndeb-src [arch=${main_list}] \\1 \\2;" \
+          | tee -a "${newlist}"
+
+    - name: Install build deps
+      shell: bash
+      run: |
+        set -x
+        apt update
+        pkgs=(
+          # Build dependencies
+          cmake
+          doxygen
+          git
+          graphviz
+          ninja-build
+          pkg-config
+          qemu-user-static
+          xdg-utils
+          xvfb
+
+          # Toolchain for cross-compiling.
+          clang-11
+          g++-aarch64-linux-gnu
+          libc6-dev-${{ matrix.arch }}-cross
+          libstdc++-10-dev-${{ matrix.arch }}-cross
+          libstdc++-10-dev:${{ matrix.arch }}
+
+          # Dependencies
+          libbrotli-dev:${{ matrix.arch }}
+          libgif-dev:${{ matrix.arch }}
+          libjpeg-dev:${{ matrix.arch }}
+          libpng-dev:${{ matrix.arch }}
+          libwebp-dev:${{ matrix.arch }}
+
+          # For OpenEXR:
+          libilmbase-dev:${{ matrix.arch }}
+          libopenexr-dev:${{ matrix.arch }}
+
+          # GTK plugins
+          libgdk-pixbuf2.0-dev:${{ matrix.arch }}
+          libgtk2.0-dev:${{ matrix.arch }}
+
+          # QT
+          libqt5x11extras5-dev:${{ matrix.arch }}
+          qtbase5-dev:${{ matrix.arch }}
+        )
+        if [[ "${{ matrix.build_target }}" != "x86_64-linux-gnu" ]]; then
+          pkgs+=(
+            binutils-${{ matrix.build_target }}
+            gcc-${{ matrix.build_target }}
+          )
+        fi
+        if [[ "${{ matrix.arch }}" != "i386" ]]; then
+          pkgs+=(
+            # TCMalloc
+            libgoogle-perftools-dev:${{ matrix.arch }}
+            libgoogle-perftools4:${{ matrix.arch }}
+            libtcmalloc-minimal4:${{ matrix.arch }}
+            libunwind-dev:${{ matrix.arch }}
+          )
+        fi
+        DEBIAN_FRONTEND=noninteractive apt install -y "${pkgs[@]}"
+        echo "CC=${{ matrix.c_compiler || 'clang-11' }}" >> $GITHUB_ENV
+        echo "CXX=${{ matrix.cxx_compiler || 'clang++-11' }}" >> $GITHUB_ENV
+    - name: Checkout the source
+      uses: actions/checkout@v2
+      with:
+        submodules: true
+        fetch-depth: 1
+    - name: Build
+      run: |
+        CMAKE_FLAGS="${{ matrix.cmake_flags }}" ./ci.sh release \
+          -DJPEGXL_FORCE_SYSTEM_BROTLI=ON \
+          -DJPEGXL_ENABLE_JNI=OFF \
+          ${{ join(matrix.cmake_args, ' ') }}
+      env:
+        SKIP_TEST: 1
+        BUILD_TARGET: ${{ matrix.build_target }}
+    - name: Build stats ${{ matrix.build_target }}
+      run: |
+        tools/build_stats.py --save build/stats.json \
+          --binutils ${{ matrix.build_target }}- \
+          --max-stack ${{ matrix.max_stack || '0' }} \
+          cjxl djxl libjxl.so libjxl_dec.so
+    # Run the tests on push and when requested in pull_request.
+    - name: Test
+      if: env.WILL_RUN_TESTS == 'true'
+      run: |
+        ./ci.sh test
+      env:
+        BUILD_TARGET: ${{ matrix.build_target }}
diff --git a/third_party/jpeg-xl/.github/workflows/conformance.yml b/third_party/jpeg-xl/.github/workflows/conformance.yml
new file mode 100644
index 0000000000..9c85878f98
--- /dev/null
+++ b/third_party/jpeg-xl/.github/workflows/conformance.yml
@@ -0,0 +1,185 @@
+# Copyright (c) the JPEG XL Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+# Workflow for running conformance tests.
+
+name: Conformance
+on:
+  push:
+    branches:
+      - main
+      - v*.*.x
+  pull_request:
+    types: [opened, reopened, labeled, synchronize]
+
+env:
+  LIBJXL_VERSION: 0.8.0
+  LIBJXL_ABI_VERSION: 0.8
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }}
+  cancel-in-progress: ${{ github.event_name == 'pull_request' }}
+
+jobs:
+  warmup: # If necessary, fetch files just once, before tests are run. 
+    name: Warmup caches
+    runs-on: ubuntu-latest
+    steps:
+    - name: Checkout the conformance source
+      uses: actions/checkout@v2
+      with:
+        repository: libjxl/conformance
+        # TODO(eustas): move ref to a global variable / file?
+        ref: 43d8135b50e53167ee6fee0b842b9eb15cfc4aa2
+        path: conformance
+    - name: Cache
+      uses: actions/cache@v2
+      with:
+        path: ${{ github.workspace }}/conformance/.objects
+        key: conformance-refs
+    - name: Download and link conformance files
+      run: |
+        ${{ github.workspace }}/conformance/scripts/download_and_symlink.sh
+
+  build:
+    name: Conformance Build ${{ matrix.name }}
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - name: AVX3
+            cflags: -DHWY_DISABLED_TARGETS=HWY_AVX3-1
+          - name: AVX2
+            cflags: -DHWY_DISABLED_TARGETS=HWY_AVX2-1
+          - name: SSE4
+            cflags: -DHWY_DISABLED_TARGETS=HWY_SSE4-1
+          - name: SSSE3
+            cflags: -DHWY_DISABLED_TARGETS=HWY_SSSE3-1
+          - name: EMU128
+            cflags: -DHWY_COMPILE_ONLY_EMU128=1
+          - name: SCALAR
+            cflags: -DHWY_COMPILE_ONLY_SCALAR=1
+          - name: SCALAR_ASAN
+            cflags: -DHWY_COMPILE_ONLY_SCALAR=1
+            build_type: asan
+    env:
+      CCACHE_DIR: ${{ github.workspace }}/.ccache
+    steps:
+    - name: Install build deps
+      run: |
+        sudo apt update
+        sudo apt install -y \
+          ccache \
+          clang-7 \
+          cmake \
+          doxygen \
+          libbenchmark-dev \
+          libbenchmark-tools \
+          libbrotli-dev \
+          libgdk-pixbuf2.0-dev \
+          libgif-dev \
+          libgtest-dev \
+          libgtk2.0-dev  \
+          libjpeg-dev \
+          libopenexr-dev \
+          libpng-dev \
+          libwebp-dev \
+          ninja-build \
+          pkg-config \
+          xvfb \
+          ${{ matrix.apt_pkgs }} \
+        #
+        echo "CC=clang-7" >> $GITHUB_ENV
+        echo "CXX=clang++-7" >> $GITHUB_ENV
+    - name: Checkout the jxl source
+      uses: actions/checkout@v2
+      with:
+        submodules: true
+        fetch-depth: 2
+    - name: Git environment
+      id: git-env
+      run: |
+        echo "::set-output name=parent::$(git rev-parse ${{ github.sha }}^)"
+      shell: bash
+    - name: ccache
+      uses: actions/cache@v2
+      with:
+        path: ${{ env.CCACHE_DIR }}
+        # When the cache hits the key it is not updated, so if this is a rebuild
+        # of the same Pull Request it will reuse the cache if still around. For
+        # either Pull Requests or new pushes to main, this will use the parent
+        # hash as the starting point from the restore-keys entry.
+        key: conformance-${{ runner.os }}-${{ github.sha }}-${{ matrix.name }}
+        restore-keys: |
+          conformance-${{ runner.os }}-${{ steps.git-env.outputs.parent }}-${{ matrix.name }}
+    - name: Build
+      run: |
+        mkdir -p ${CCACHE_DIR}
+        echo "max_size = 200M" > ${CCACHE_DIR}/ccache.conf
+        CMAKE_FLAGS="${{ matrix.cflags }}" \
+        ./ci.sh ${{ matrix.build_type || 'release' }} -DJPEGXL_FORCE_SYSTEM_BROTLI=ON \
+          -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
+          -DCMAKE_C_COMPILER_LAUNCHER=ccache \
+          -DBUILD_TESTING=OFF
+        # Flatten the artifacts directory structure
+        cp tools/conformance/conformance.py build/tools/conformance
+        cp tools/conformance/lcms2.py build/tools/conformance
+        cp build/tools/djxl build/tools/conformance
+        cp build/libjxl.so.${{ env.LIBJXL_VERSION }} build/tools/conformance
+        cp build/libjxl_threads.so.${{ env.LIBJXL_VERSION }} build/tools/conformance
+      env:
+        SKIP_TEST: 1
+    - uses: actions/upload-artifact@v2
+      with:
+        name: conformance_binary-${{ matrix.name }}
+        path: |
+          build/tools/conformance/conformance.py
+          build/tools/conformance/lcms2.py
+          build/tools/conformance/djxl
+          build/tools/conformance/libjxl.so.${{ env.LIBJXL_VERSION }}
+          build/tools/conformance/libjxl_threads.so.${{ env.LIBJXL_VERSION }}
+    - name: ccache stats
+      run: ccache --show-stats
+
+  run:
+    name: Conformance Test ${{ matrix.name }} on ${{ matrix.target }}
+    needs: [warmup, build]
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        name: [main_level5, main_level10]
+        target: [AVX3, AVX2, SSE4, SSSE3, EMU128, SCALAR, SCALAR_ASAN]
+    steps:
+    - name: Install deps
+      run: |
+        pip install numpy
+    - name: Checkout the conformance source
+      uses: actions/checkout@v2
+      with:
+        repository: libjxl/conformance
+        ref: 43d8135b50e53167ee6fee0b842b9eb15cfc4aa2
+        path: conformance
+    - name: Cache
+      uses: actions/cache@v2
+      with:
+        path: ${{ github.workspace }}/conformance/.objects
+        key: conformance-refs
+    - name: Download and link conformance files
+      run: |
+        ${{ github.workspace }}/conformance/scripts/download_and_symlink.sh
+    - uses: actions/download-artifact@v2
+      with:
+        name: conformance_binary-${{ matrix.target }}
+    - name: Run conformance tests
+      run: |
+        chmod +x djxl
+        ln -s libjxl.so.${{ env.LIBJXL_VERSION }} libjxl.so.${{ env.LIBJXL_ABI_VERSION }}
+        ln -s libjxl_threads.so.${{ env.LIBJXL_VERSION }} libjxl_threads.so.${{ env.LIBJXL_ABI_VERSION }}
+        export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:`pwd`
+        python conformance.py \
+          --decoder=`pwd`/djxl \
+          --corpus=`pwd`/conformance/testcases/${{ matrix.name }}.txt
diff --git a/third_party/jpeg-xl/.github/workflows/debug_ci.yml b/third_party/jpeg-xl/.github/workflows/debug_ci.yml
new file mode 100644
index 0000000000..fb3522eb28
--- /dev/null
+++ b/third_party/jpeg-xl/.github/workflows/debug_ci.yml
@@ -0,0 +1,59 @@
+# Copyright (c) the JPEG XL Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+# Workflow for building and then debugging on a specific commit.
+
+name: Build and Test debugging
+on:
+  push:
+    branches:
+      - ci-*-debug
+
+jobs:
+  ubuntu_build:
+    name: Ubuntu Build and SSH
+    runs-on: [ubuntu-latest]
+
+    steps:
+    - name: Install build deps
+      run: |
+        sudo apt update
+        sudo apt install -y \
+          ccache \
+          clang-7 \
+          cmake \
+          doxygen \
+          libbrotli-dev \
+          libgdk-pixbuf2.0-dev \
+          libgif-dev \
+          libgtest-dev \
+          libgtk2.0-dev  \
+          libjpeg-dev \
+          libopenexr-dev \
+          libpng-dev \
+          libwebp-dev \
+          ninja-build \
+          pkg-config \
+          xvfb \
+          ${{ matrix.apt_pkgs }} \
+        #
+        echo "CC=clang-7" >> $GITHUB_ENV
+        echo "CXX=clang++-7" >> $GITHUB_ENV
+    - name: Checkout the source
+      uses: actions/checkout@v2
+      with:
+        submodules: true
+        fetch-depth: 2
+    - name: Build
+      run: |
+        ./ci.sh $(echo ${{ github.ref }} | sed 's_refs/heads/ci-\([a-z_]*\)-debug_\1_') \
+          -DJPEGXL_FORCE_SYSTEM_BROTLI=ON
+      env:
+        SKIP_TEST: 1
+    - name: Setup tmate session
+      uses: mxschmitt/action-tmate@v3
+
+
+
diff --git a/third_party/jpeg-xl/.github/workflows/fuzz.yml b/third_party/jpeg-xl/.github/workflows/fuzz.yml
new file mode 100644
index 0000000000..188a4c79c7
--- /dev/null
+++ b/third_party/jpeg-xl/.github/workflows/fuzz.yml
@@ -0,0 +1,56 @@
+# Copyright (c) the JPEG XL Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+# CI on pull-requests to run the fuzzer from oss-fuzz. See:
+#
+#   https://google.github.io/oss-fuzz/getting-started/continuous-integration/
+
+name: CIFuzz
+on:
+  pull_request:
+    types: [opened, reopened, synchronize]
+    paths:
+      - '**.c'
+      - '**.cc'
+      - '**.cmake'
+      - '**.h'
+      - '**CMakeLists.txt'
+      - .github/workflows/fuzz.yml
+
+concurrency: 
+  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }}
+  cancel-in-progress: ${{ github.event_name == 'pull_request' }}
+
+jobs:
+  fuzzing:
+    runs-on: ubuntu-latest
+    steps:
+    - name: Checkout source
+      uses: actions/checkout@v2
+      id: checkout
+      with:
+        # The build_fuzzers action checks out the code to the storage/libjxl
+        # directory already, but doesn't check out the submodules. This step
+        # is a workaround for checking out the submodules.
+        path: storage/libjxl
+        submodules: true
+    - name: Build Fuzzers
+      id: build
+      uses: google/oss-fuzz/infra/cifuzz/actions/build_fuzzers@master
+      with:
+        oss-fuzz-project-name: 'libjxl'
+        language: c++
+    - name: Run Fuzzers
+      uses: google/oss-fuzz/infra/cifuzz/actions/run_fuzzers@master
+      with:
+        oss-fuzz-project-name: 'libjxl'
+        language: c++
+        fuzz-seconds: 600
+    - name: Upload Crash
+      uses: actions/upload-artifact@v1
+      if: failure() && steps.build.outcome == 'success'
+      with:
+        name: artifacts
+        path: ./out/artifacts
diff --git a/third_party/jpeg-xl/.github/workflows/pull_request.yml b/third_party/jpeg-xl/.github/workflows/pull_request.yml
new file mode 100644
index 0000000000..b1214e1061
--- /dev/null
+++ b/third_party/jpeg-xl/.github/workflows/pull_request.yml
@@ -0,0 +1,42 @@
+# Copyright (c) the JPEG XL Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+# Workflow to run pull-requests specific checks.
+
+name: PR
+on:
+  pull_request:
+    types: [opened, reopened, synchronize]
+
+jobs:
+  # Checks that the AUTHORS files is updated with new contributors.
+  authors:
+    runs-on: [ubuntu-latest]
+    steps:
+    - name: Checkout the source
+      uses: actions/checkout@v2
+    - name: Check AUTHORS file
+      run:
+        ./ci.sh authors
+
+  format:
+    runs-on: [ubuntu-latest]
+    steps:
+    - name: Install build deps
+      run: |
+        sudo apt update
+        sudo apt install -y \
+          clang-format \
+          clang-format-7 \
+          clang-format-8 \
+          clang-format-9 \
+          clang-format-10 \
+          clang-format-11 \
+        #
+    - name: Checkout the source
+      uses: actions/checkout@v2
+    - name: clang-format
+      run:
+        ./ci.sh lint >&2
diff --git a/third_party/jpeg-xl/.github/workflows/release.yaml b/third_party/jpeg-xl/.github/workflows/release.yaml
new file mode 100644
index 0000000000..4222266598
--- /dev/null
+++ b/third_party/jpeg-xl/.github/workflows/release.yaml
@@ -0,0 +1,378 @@
+# Copyright (c) the JPEG XL Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+# Workflow for building the release binaries.
+#
+# This workflow runs as a post-submit step, when pushing to main or the release
+# branches (v*.*.x), and when creating a release in GitHub.
+#
+# In the GitHub release case, in addition to build the release binaries it also
+# uploads the binaries to the given release automatically.
+
+name: Release build / deploy
+on:
+  push:
+    branches:
+      - main
+      - v*.*.x
+  release:
+    types: [ published ]
+
+jobs:
+  ubuntu_static_x86_64:
+    name: Release linux x86_64 static
+    runs-on: [ubuntu-latest]
+    steps:
+    - name: Install build deps
+      run: |
+        sudo apt update
+        sudo apt install -y \
+          asciidoc \
+          clang \
+          cmake \
+          doxygen \
+          libbrotli-dev \
+          libgdk-pixbuf2.0-dev \
+          libgif-dev \
+          libgtest-dev \
+          libgtk2.0-dev  \
+          libjpeg-dev \
+          libopenexr-dev \
+          libpng-dev \
+          libwebp-dev \
+          ninja-build \
+          pkg-config \
+        #
+        echo "CC=clang" >> $GITHUB_ENV
+        echo "CXX=clang++" >> $GITHUB_ENV
+
+    - name: Checkout the source
+      uses: actions/checkout@v2
+      with:
+        submodules: true
+        fetch-depth: 1
+
+    - name: Build
+      env:
+        SKIP_TEST: 1
+      run: |
+        ./ci.sh release \
+          -DJPEGXL_DEP_LICENSE_DIR=/usr/share/doc \
+          -DJPEGXL_STATIC=ON \
+          -DBUILD_TESTING=OFF \
+          -DJPEGXL_ENABLE_VIEWERS=OFF \
+          -DJPEGXL_ENABLE_PLUGINS=OFF \
+          -DJPEGXL_ENABLE_OPENEXR=OFF \
+
+    - name: Package release tarball
+      run: |
+        cd build
+        tar -zcvf ${{ runner.workspace }}/release_file.tar.gz \
+          LICENSE* tools/{cjxl,djxl,benchmark_xl}
+        ln -s ${{ runner.workspace }}/release_file.tar.gz \
+          ${{ runner.workspace }}/jxl-linux-x86_64-static-${{ github.event.release.tag_name }}.tar.gz
+
+    - name: Upload artifacts
+      uses: actions/upload-artifact@v2
+      with:
+        name: jxl-linux-x86_64-static
+        path: ${{ runner.workspace }}/release_file.tar.gz
+
+    - name: Upload binaries to release
+      if: github.event_name == 'release'
+      uses: AButler/upload-release-assets@v2.0
+      with:
+        files: ${{ runner.workspace }}/jxl-linux-x86_64-static-${{ github.event.release.tag_name }}.tar.gz
+        repo-token: ${{ secrets.GITHUB_TOKEN }}
+
+
+  # Build .deb packages Ubuntu/Debian
+  release_ubuntu_pkg:
+    name: .deb packages / ${{ matrix.os }}
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        os:
+        - ubuntu:20.04
+        - ubuntu:18.04
+        - debian:buster
+        - debian:bullseye
+        - debian:bookworm
+        - debian:sid
+
+    container:
+      image: ${{ matrix.os }}
+
+    steps:
+    - name: Set env
+      shell: 'bash'
+      id: 'env'
+      run: |
+        artifact_name="jxl-debs-amd64-${matrix_os/:/-}"
+        echo ${artifact_name}
+        echo "::set-output name=artifact_name::${artifact_name}"
+      env:
+        matrix_os: ${{ matrix.os }}
+
+    - name: Install build deps
+      run: |
+        apt update
+        DEBIAN_FRONTEND=noninteractive apt install -y \
+          build-essential \
+          devscripts \
+        #
+
+    - name: Install git (only 18.04)
+      if: matrix.os == 'ubuntu:18.04'
+        # Ubuntu 18.04 ships with git 2.17 but we need 2.18 or newer for
+        # actions/checkout@v2 to work
+      shell: 'bash'
+      run: |
+        apt install -y \
+          libcurl4-openssl-dev \
+          libexpat1-dev \
+          libssl-dev \
+          wget \
+          zlib1g-dev \
+        #
+        git_version="2.32.0"
+        wget -nv \
+          "https://github.com/git/git/archive/refs/tags/v${git_version}.tar.gz"
+        tar -zxf "v${git_version}.tar.gz"
+        cd "git-${git_version}"
+        make prefix=/usr -j4 install
+
+    - name: Install gcc-8 (only 18.04)
+      if: matrix.os == 'ubuntu:18.04'
+        # Compiler bug workaround: install and use gcc-8
+      shell: 'bash'
+      run: |
+        apt install -y \
+          gcc-8 \
+          g++-8 \
+        #
+        update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-8 100
+        update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-8 100
+        update-alternatives --set g++ /usr/bin/g++-8
+        update-alternatives --set gcc /usr/bin/gcc-8
+
+    - name: Set git safe dir
+      run: |
+        export GIT_CEILING_DIRECTORIES=/__w # only work before git v2.35.2
+        git config --global --add safe.directory /__w/libjxl/libjxl
+
+    - name: Checkout the source
+      uses: actions/checkout@v2
+      with:
+        submodules: true
+        fetch-depth: 1
+
+    - name: Stamp non-release versions
+      # Stamps the built package with the commit date as part of the version
+      # after the version number so newer release candidates can override older
+      # ones.
+      if: github.event_name != 'release'
+      shell: 'bash'
+      run: |
+        # Committer timestamp.
+        set -x
+        commit_timestamp=$(git show -s --format=%ct)
+        commit_datetime=$(date --utc "--date=@${commit_timestamp}" '+%Y%m%d%H%M%S')
+        commit_ref=$(git rev-parse --short HEAD)
+        sem_version=$(dpkg-parsechangelog --show-field Version)
+        sem_version="${sem_version%%-*}"
+        deb_version="${sem_version}~alpha${commit_datetime}-0+git${commit_ref}"
+        dch -M --distribution unstable -b --newversion "${deb_version}" \
+          "Stamping build with version ${deb_version}"
+
+    - name: Stamp release versions
+      # Mark the version as released
+      if: github.event_name == 'release'
+      shell: 'bash'
+      run: |
+        if head -n1 debian/changelog | grep UNRELEASED; then
+          dch -M --distribution unstable --release ''
+        fi
+
+    - name: Install gtest (only 18.04)
+      if: matrix.os == 'ubuntu:18.04'
+        # In Ubuntu 18.04 no package installed the libgtest.a. libgtest-dev
+        # installs the source files only.
+      run: |
+        apt install -y libgtest-dev cmake
+        for prj in googletest googlemock; do
+          (cd /usr/src/googletest/${prj}/ &&
+           cmake CMakeLists.txt -DCMAKE_INSTALL_PREFIX=/usr &&
+           make all install)
+        done
+        # Remove libgmock-dev dependency in Ubuntu 18.04. It doesn't exist there.
+        sed '/libgmock-dev,/d' -i debian/control
+
+    - name: Install gmock-dev (debian:sid)
+      # gtest-dev cmake depends on gmock-dev, but it is not installed by the
+      # package.
+      if: matrix.os == 'debian:sid'
+      run: |
+        apt install -y libgmock-dev
+
+    - name: Remove libjxl-gimp-plugin package (only 18.04)
+      if: matrix.os == 'ubuntu:18.04'
+      run: |
+        # Gimp 2.8 is not supported.
+        sed -i '/Package: libjxl-gimp-plugin/,/^$/d' debian/control
+
+    - name: Build hwy
+      run: |
+        apt build-dep -y ./third_party/highway
+        ./ci.sh debian_build highway
+        dpkg -i build/debs/libhwy-dev_*_amd64.deb
+
+    - name: Build libjxl
+      run: |
+        apt build-dep -y .
+        ./ci.sh debian_build jpeg-xl
+
+    - name: Stats
+      run: |
+        ./ci.sh debian_stats
+
+    - name: Upload artifacts
+      uses: actions/upload-artifact@v2
+      with:
+        name: ${{ steps.env.outputs.artifact_name }}
+        path: |
+          build/debs/*jxl*.*
+
+    - name: Package release tarball
+      if: github.event_name == 'release'
+      run: |
+        (cd build/debs/; find -maxdepth 1 -name '*jxl*.*') | \
+        tar -zcvf release_file.tar.gz -C build/debs/ -T -
+        ln -s release_file.tar.gz \
+          ${{ steps.env.outputs.artifact_name }}-${{ github.event.release.tag_name }}.tar.gz
+
+    - name: Upload binaries to release
+      if: github.event_name == 'release'
+      uses: AButler/upload-release-assets@v2.0
+      with:
+        files: ${{ steps.env.outputs.artifact_name }}-${{ github.event.release.tag_name }}.tar.gz
+        repo-token: ${{ secrets.GITHUB_TOKEN }}
+
+
+  windows_build:
+    name: Windows Build (vcpkg / ${{ matrix.triplet }})
+    runs-on: [windows-2019]
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - triplet: x86-windows-static
+            arch: '-A Win32'
+          - triplet: x64-windows-static
+            arch: '-A x64'
+
+    env:
+      VCPKG_VERSION: '2022.06.16.1'
+      VCPKG_ROOT: vcpkg
+      VCPKG_DISABLE_METRICS: 1
+
+    steps:
+    - name: Checkout the source
+      uses: actions/checkout@v2
+      with:
+        submodules: true
+        fetch-depth: 2
+
+    - uses: actions/cache@v2
+      id: cache-vcpkg
+      with:
+        path: vcpkg
+        key: release-${{ runner.os }}-vcpkg-${{ env.VCPKG_VERSION }}-${{ matrix.triplet }}
+
+    - name: Download vcpkg
+      if: steps.cache-vcpkg.outputs.cache-hit != 'true'
+      # wget doesn't seem to work under bash.
+      shell: 'powershell'
+      run: |
+        C:\msys64\usr\bin\wget.exe -nv `
+           https://github.com/microsoft/vcpkg/archive/refs/tags/${{ env.VCPKG_VERSION }}.zip `
+          -O vcpkg.zip
+    - name: Bootstrap vcpkg
+      if: steps.cache-vcpkg.outputs.cache-hit != 'true'
+      shell: 'bash'
+      run: |
+        set -x
+        unzip -q vcpkg.zip
+        rm -rf ${VCPKG_ROOT}
+        mv vcpkg-${VCPKG_VERSION} ${VCPKG_ROOT}
+        ${VCPKG_ROOT}/bootstrap-vcpkg.sh
+
+    - name: Install libraries with vcpkg
+      shell: 'bash'
+      run: |
+        set -x
+        ${VCPKG_ROOT}/vcpkg --triplet ${{ matrix.triplet }} install \
+          giflib \
+          libjpeg-turbo \
+          libpng \
+          libwebp \
+        #
+
+    - name: Configure
+      shell: 'bash'
+      run: |
+        set -x
+        mkdir build
+        cmake -Bbuild -H. ${{ matrix.arch }} \
+          -DBUILD_TESTING=OFF \
+          -DCMAKE_BUILD_TYPE=Release \
+          -DCMAKE_INSTALL_PREFIX=`pwd`/prefix \
+          -DCMAKE_TOOLCHAIN_FILE=${VCPKG_ROOT}/scripts/buildsystems/vcpkg.cmake \
+          -DJPEGXL_ENABLE_OPENEXR=OFF \
+          -DJPEGXL_ENABLE_PLUGINS=OFF \
+          -DJPEGXL_ENABLE_TCMALLOC=OFF \
+          -DJPEGXL_ENABLE_VIEWERS=OFF \
+          -DVCPKG_TARGET_TRIPLET=${{ matrix.triplet }} \
+        #
+    - name: Build
+      shell: 'bash'
+      run: |
+        set -x
+        cmake --build build --config Release
+    - name: Install
+      shell: 'bash'
+      run: |
+        set -x
+        cmake --build build --config Release --target install
+        for pkg in giflib libjpeg-turbo libpng libwebp zlib; do
+          cp vcpkg/installed/${{matrix.triplet}}/share/${pkg}/copyright \
+            prefix/bin/LICENSE.${pkg}
+        done
+        cp third_party/sjpeg/COPYING prefix/bin/LICENSE.sjpeg
+        cp third_party/skcms/LICENSE prefix/bin/LICENSE.skcms
+        cp third_party/highway/LICENSE prefix/bin/LICENSE.highway
+        cp third_party/brotli/LICENSE prefix/bin/LICENSE.brotli
+        cp LICENSE prefix/bin/LICENSE.libjxl
+    - name: Upload artifacts
+      uses: actions/upload-artifact@v2
+      with:
+        name: jxl-${{matrix.triplet}}
+        path: |
+          prefix/bin/*
+
+    - name: Package release zip
+      if: github.event_name == 'release'
+      shell: 'powershell'
+      run: |
+        Compress-Archive -Path prefix\bin\* `
+          -DestinationPath jxl-${{matrix.triplet}}.zip
+
+    - name: Upload binaries to release
+      if: github.event_name == 'release'
+      uses: AButler/upload-release-assets@v2.0
+      with:
+        files: jxl-${{matrix.triplet}}.zip
+        repo-token: ${{ secrets.GITHUB_TOKEN }}
diff --git a/third_party/jpeg-xl/.readthedocs.yaml b/third_party/jpeg-xl/.readthedocs.yaml
new file mode 100644
index 0000000000..6d714ba1aa
--- /dev/null
+++ b/third_party/jpeg-xl/.readthedocs.yaml
@@ -0,0 +1,17 @@
+# Copyright (c) the JPEG XL Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+#
+# readthedocs.io configuration file. See:
+#   https://docs.readthedocs.io/en/stable/config-file/v2.html
+
+version: 2
+
+sphinx:
+   configuration: doc/sphinx/conf.py
+
+python:
+   version: "3.7"
+   install:
+   - requirements: doc/sphinx/requirements.txt
diff --git a/third_party/jpeg-xl/AUTHORS b/third_party/jpeg-xl/AUTHORS
new file mode 100644
index 0000000000..2ff5d04f67
--- /dev/null
+++ b/third_party/jpeg-xl/AUTHORS
@@ -0,0 +1,67 @@
+# List of the project authors for copyright purposes. When contributing to the
+# project add your name or your organization's name to this list. See
+# CONTRIBUTING.md for details.
+#
+# For organizations:
+#   Organization <email pattern: *@domain>
+#
+# For individuals:
+#   Name <email address>
+#
+# Please keep each list sorted. If you wish to change your email address please
+# send a pull request.
+
+# Organizations:
+Cloudinary Ltd. <*@cloudinary.com>
+Google LLC <*@google.com>
+
+# Individuals:
+a-shvedov
+Alex Xu (Hello71) <alex_y_xu@yahoo.ca>
+Alexander Sago <cagelight@gmail.com>
+Alistair Barrow
+Andrius Lukas Narbutas <andrius4669@gmail.com>
+Aous Naman <aous@unsw.edu.au>
+Artem Selishchev
+Biswapriyo Nath <nathbappai@gmail.com>
+CanadianBaconBoi <beamconnor@gmail.com>
+Damiano Albani <damiano.albani@gmail.com>
+Daniel Novomeský <dnovomesky@gmail.com>
+David Burnett <vargolsoft@gmail.com>
+Dirk Lemstra <dirk@lemstra.org>
+Don Olmstead <don.j.olmstead@gmail.com>
+Dong Xu <xdong181@gmail.com>
+Even Rouault <even.rouault@spatialys.com>
+Fred Brennan <copypaste@kittens.ph>
+gi-man
+Heiko Becker <heirecka@exherbo.org>
+Jim Robinson <jimbo2150@gmail.com>
+Jon Sneyers <jon@cloudinary.com>
+Joshua Root <jmr@macports.org>
+Kai Hollberg <Schweinepriester@users.noreply.github.com>
+Kleis Auke Wolthuizen <github@kleisauke.nl>
+L. E. Segovia
+Leo Izen <leo.izen@gmail.com>
+Lovell Fuller
+Maarten DB <anonymous.maarten@gmail.com>
+Marcin Konicki <ahwayakchih@gmail.com>
+Martin Strunz
+Mathieu Malaterre <mathieu.malaterre@gmail.com>
+Mikk Leini <mikk.leini@krakul.eu>
+Misaki Kasumi <misakikasumi@outlook.com>
+Nicholas Hayes <0xC0000054@users.noreply.github.com>
+Nigel Tao <nigeltao@golang.org>
+Petr Diblík
+Pieter Wuille
+roland-rollo
+Samuel Leong <wvvwvvvvwvvw@gmail.com>
+Sandro <sandro.jaeckel@gmail.com>
+Sergey Fedorov <vital.had@gmail.com>
+Stephan T. Lavavej <stl@nuwen.net>
+Thomas Bonfort <thomas.bonfort@airbus.com>
+tmkk <tmkkmac@gmail.com>
+Vincent Torri <vincent.torri@gmail.com>
+xiota
+Yonatan Nebenzhal <yonatan.nebenzhl@gmail.com>
+Ziemowit Zabawa <ziemek.zabawa@outlook.com>
+源文雨 <41315874+fumiama@users.noreply.github.com>
diff --git a/third_party/jpeg-xl/BUILD.bazel b/third_party/jpeg-xl/BUILD.bazel
new file mode 100644
index 0000000000..0b81fc7b8a
--- /dev/null
+++ b/third_party/jpeg-xl/BUILD.bazel
@@ -0,0 +1,22 @@
+package(default_visibility = ["//:__subpackages__"])
+
+filegroup(
+    name = "testdata",
+    srcs = glob([
+        "testdata/**/*.icc",
+        "testdata/**/*.pam",
+        "testdata/**/*.pfm",
+        "testdata/**/*.pgm",
+        "testdata/**/*.pnm",
+        "testdata/**/*.ppm",
+        "testdata/**/*.png",
+        "testdata/**/*.jpg",
+        "testdata/**/*.jxl",
+        "testdata/**/*.gif",
+        "testdata/**/*.y4m",
+        "testdata/**/*.jxl",
+        "testdata/**/*.png",
+        "testdata/**/*.jpg",
+        "testdata/position_encoding/*.txt",
+    ]),
+)
diff --git a/third_party/jpeg-xl/BUILDING.md b/third_party/jpeg-xl/BUILDING.md
new file mode 100644
index 0000000000..8fc4561aa0
--- /dev/null
+++ b/third_party/jpeg-xl/BUILDING.md
@@ -0,0 +1,98 @@
+# Compilation
+
+For more details and other workflows see the "Advanced guide" below.
+
+## Checking out the code
+
+```bash
+git clone https://github.com/libjxl/libjxl.git --recursive --shallow-submodules
+```
+
+This repository uses git submodules to handle some third party dependencies
+under `third_party`, that's why it is important to pass `--recursive`. If you
+didn't check out with `--recursive`, or any submodule has changed, run:
+
+```bash
+git submodule update --init --recursive --depth 1 --recommend-shallow
+```
+
+The `--shallow-submodules` and `--depth 1 --recommend-shallow` options create
+shallow clones which only downloads the commits requested, and is all that is
+needed to build `libjxl`. Should full clones be necessary, you could always run:
+
+```bash
+git submodule foreach git fetch --unshallow
+git submodule update --init --recursive
+```
+
+which pulls the rest of the commits in the submodules.
+
+Important: If you downloaded a zip file or tarball from the web interface you
+won't get the needed submodules and the code will not compile. You can download
+these external dependencies from source running `./deps.sh`. The git workflow
+described above is recommended instead.
+
+## Installing dependencies
+
+Required dependencies for compiling the code, in a Debian/Ubuntu based
+distribution run:
+
+```bash
+sudo apt install cmake pkg-config libbrotli-dev
+```
+
+Optional dependencies for supporting other formats in the `cjxl`/`djxl` tools,
+in a Debian/Ubuntu based distribution run:
+
+```bash
+sudo apt install libgif-dev libjpeg-dev libopenexr-dev libpng-dev libwebp-dev
+```
+
+We recommend using a recent Clang compiler (version 7 or newer), for that
+install clang and set `CC` and `CXX` variables.
+
+```bash
+sudo apt install clang
+export CC=clang CXX=clang++
+```
+
+## Building
+
+```bash
+cd libjxl
+mkdir build
+cd build
+cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTING=OFF ..
+cmake --build . -- -j$(nproc)
+```
+
+The encoder/decoder tools will be available in the `build/tools` directory.
+
+## <a name="installing"></a> Installing
+
+```bash
+sudo cmake --install .
+```
+
+
+
+## Advanced guide
+
+### Building with Docker
+
+We build a common environment based on Debian/Ubuntu using Docker. Other
+systems may have different combinations of versions and dependencies that
+have not been tested and may not work. For those cases we recommend using the
+Docker container as explained in the
+[step by step guide](doc/developing_in_docker.md).
+
+### Building JPEG XL for developers
+
+For experienced developers, we provide build instructions for several other environments:
+
+*   [Building on Debian](doc/developing_in_debian.md)
+*   Building on Windows with [vcpkg](doc/developing_in_windows_vcpkg.md) (Visual Studio 2019)
+*   Building on Windows with [MSYS2](doc/developing_in_windows_msys.md)
+*   [Cross Compiling for Windows with Crossroad](doc/developing_with_crossroad.md)
+
+If you encounter any difficulties, please use Docker instead.
diff --git a/third_party/jpeg-xl/BUILDING_Haiku.md b/third_party/jpeg-xl/BUILDING_Haiku.md
new file mode 100644
index 0000000000..1ffca1453c
--- /dev/null
+++ b/third_party/jpeg-xl/BUILDING_Haiku.md
@@ -0,0 +1,20 @@
+## Disclaimer
+
+Haiku builds are not officially supported, i.e. the build might not work at all,
+some tests may fail and some sub-projects are excluded from build.
+
+This manual outlines Haiku-specific setup. For general building and testing
+instructions see "[BUILDING](BUILDING.md)" and
+"[Building and Testing changes](doc/building_and_testing.md)".
+
+## Dependencies
+
+```shell
+pkgman install llvm9_clang ninja cmake doxygen libjpeg_turbo_devel giflib_devel
+```
+
+## Building
+
+```shell
+TEST_STACK_LIMIT=none CMAKE_FLAGS="-I/boot/system/develop/tools/lib/gcc/x86_64-unknown-haiku/8.3.0/include/c++ -I/boot/system/develop/tools/lib/gcc/x86_64-unknown-haiku/8.3.0/include/c++/x86_64-unknown-haiku" CMAKE_SHARED_LINKER_FLAGS="-shared -Xlinker -soname=libjpegxl.so -lpthread" ./ci.sh opt
+```
diff --git a/third_party/jpeg-xl/BUILDING_OSX.md b/third_party/jpeg-xl/BUILDING_OSX.md
new file mode 100644
index 0000000000..b5f5e34db7
--- /dev/null
+++ b/third_party/jpeg-xl/BUILDING_OSX.md
@@ -0,0 +1,41 @@
+## Disclaimer
+
+OSX builds have "best effort" support, i.e. build might not work at all, some
+tests may fail and some sub-projects are excluded from build.
+
+This manual outlines OSX specific setup. For general building and testing
+instructions see "[BUILDING](BUILDING.md)" and
+"[Building and Testing changes](doc/building_and_testing.md)".
+
+[Homebrew](https://brew.sh/) is a popular package manager. JPEG XL library and
+binaries could be installed using it:
+
+```bash
+brew install jpeg-xl
+```
+
+## Dependencies
+
+Make sure that `brew doctor` does not report serious problems and up-to-date
+version of XCode is installed.
+
+Installing (actually, building) `clang` might take a couple hours.
+
+```bash
+brew install llvm
+```
+
+```bash
+brew install coreutils cmake giflib jpeg-turbo libpng ninja zlib
+```
+
+Before building the project check that `which clang` is
+`/usr/local/opt/llvm/bin/clang`, not the one provided by XCode. If not, update
+`PATH` environment variable.
+
+Also, setting `CMAKE_PREFIX_PATH` might be necessary for correct include paths
+resolving, e.g.:
+
+```bash
+export CMAKE_PREFIX_PATH=`brew --prefix giflib`:`brew --prefix jpeg-turbo`:`brew --prefix libpng`:`brew --prefix zlib`
+```
+\ No newline at end of file
diff --git a/third_party/jpeg-xl/CHANGELOG.md b/third_party/jpeg-xl/CHANGELOG.md
new file mode 100644
index 0000000000..20c974c55c
--- /dev/null
+++ b/third_party/jpeg-xl/CHANGELOG.md
@@ -0,0 +1,294 @@
+# Changelog
+
+All notable changes to this project will be documented in this file.
+
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
+and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+
+## Unreleased
+
+### Added
+ - encoder API: add `JxlEncoderSetExtraChannelDistance` to adjust the quality
+   of extra channels (like alpha) separately.
+
+### Removed
+
+### Changed 
+ - changed the name of the cjxl flag `photon_noise` to `photon_noise_iso`
+
+## [0.8.0] - 2023-01-18
+
+### Added
+ - decoder API: new function `JxlDecoderSetImageBitDepth` to set the bit depth
+   of the output buffer.
+ - decoder API proposal: add `JxlDecoderSetOutputColorProfile` and
+   `JxlDecoderSetCms` to enable decoding to desired colorspace; NB: not
+   implemented yet.
+ - encoder API: new function `JxlEncoderSetFrameBitDepth` to set the bit depth
+   of the input buffer.
+ - encoder API: add an effort 10 option for lossless compression; using this
+   setting requires calling `JxlEncoderAllowExpertOptions`.
+ - encoder API: new `JXL_ENC_FRAME_SETTING_JPEG_COMPRESS_BOXES` enum value to
+   allow explicit control of metadata compression
+
+### Removed
+ - common API: removed `JxlIntrinsicSizeHeader`
+ - decoder API: removed deprecated `JXL_DEC_NEED_DC_OUT_BUFFER` and
+   `JXL_DEC_DC_IMAGE` events, `JxlDecoderDCOutBufferSize` and
+   `JxlDecoderSetDCOutBuffer` functions
+
+### Changed / clarified
+ - encoder API: `JxlEncoderProcessOutput` requires at least 32 bytes of output
+   space to proceed and guarantees that at least one byte will be written
+## [0.7] - 2022-07-21
+
+### Added
+ - Export version information in headers.
+ - decoder API: Ability to decode the content of metadata boxes:
+   `JXL_DEC_BOX`, `JXL_DEC_BOX_NEED_MORE_OUTPUT`, `JxlDecoderSetBoxBuffer`,
+   `JxlDecoderGetBoxType`, `JxlDecoderGetBoxSizeRaw` and
+   `JxlDecoderSetDecompressBoxes`.
+ - decoder API: ability to mark the input is finished: `JxlDecoderCloseInput`.
+ - decoder API: ability to request updates on different progressive events using
+   `JxlDecoderSetProgressiveDetail`; currently supported events are
+   `kDC`, `kLastPasses` and `kPasses`.
+ - decoder API: ability to specify desired intensity target using
+   `JxlDecoderSetDesiredIntensityTarget`
+ - decoder API: new function `JxlDecoderSetCoalesced` to allow decoding
+   non-coalesced (unblended) frames, e.g. layers of a composite still image
+   or the cropped frames of a recompressed GIF/APNG.
+ - decoder API: new function `JxlDecoderSetUnpremultiplyAlpha` to set
+   preference for getting an associated alpha channel with premultiplied or
+   unpremultiplied colors.
+ - decoder API: field added to `JxlFrameHeader`: a `JxlLayerInfo` struct
+   that contains crop dimensions and offsets and blending information for
+   the non-coalesced case.
+ - decoder API: new function `JxlDecoderGetExtraChannelBlendInfo` to get
+   the blending information for extra channels in the non-coalesced case.
+ - decoder API: new function `JxlDecoderSetMultithreadedImageOutCallback`,
+   allowing output callbacks to receive more information about the number of
+   threads on which they are running.
+ - decoder API: new function `JxlDecoderSkipCurrentFrame` to skip processing
+   the current frame after a progressive detail is reached.
+ - decoder API: new function `JxlDecoderGetIntendedDownsamplingRatio` to get
+   the intended downsampling ratio of progressive steps, based on the
+   information in the frame header.
+ - decoder API: new function `JxlDecoderSetRenderSpotcolors` to allow disabling
+   rendering of spot colors.
+ - decoder/encoder API: add two fields to `JXLBasicInfo`: `intrinsic_xsize`
+   and `intrinsic_ysize` to signal the intrinsic size.
+ - encoder API: ability to add metadata boxes, added new functions
+   `JxlEncoderAddBox`, `JxlEncoderUseBoxes`, `JxlEncoderCloseBoxes` and
+   `JxlEncoderCloseFrames`.
+ - encoder API: added ability to set several encoder options / extra fields to
+   frames using `JxlEncoderSetFrameName`, `JxlEncoderFrameSettingsSetOption`,
+   `JxlEncoderFrameSettingsSetFloatOption`.
+ - encoder API: added ability to check required codestream compatibility level
+   and force specified using `JxlEncoderGetRequiredCodestreamLevel` and
+   `JxlEncoderSetCodestreamLevel`.
+ - encoder API: added ability to force emitting box-based container format
+   using `JxlEncoderUseContainer`.
+ - encoder API: added ability to store JPEG metadata for lossless reconstruction
+   using `JxlEncoderStoreJPEGMetadata`
+ - encoder API: new functions `JxlEncoderSetFrameHeader` and
+   `JxlEncoderSetExtraChannelBlendInfo` to set animation
+   and blending parameters of the frame, and `JxlEncoderInitFrameHeader` and
+   `JxlEncoderInitBlendInfo` to initialize the structs to set.
+ - encoder API: ability to encode arbitrary extra channels:
+  `JxlEncoderInitExtraChannelInfo`, `JxlEncoderSetExtraChannelInfo`,
+  `JxlEncoderSetExtraChannelName` and `JxlEncoderSetExtraChannelBuffer`.
+ - encoder API: ability to plug custom CMS implementation using
+   `JxlEncoderSetCms(JxlEncoder* enc, JxlCmsInterface cms)`
+ - encoder API: added `JxlEncoderGetError` to retrieve last encoder error.
+
+### Changed
+- decoder API: using `JxlDecoderCloseInput` at the end of all input is required
+  when using JXL_DEC_BOX, and is now also encouraged in other cases, but not
+  required in those other cases for backwards compatibility.
+- encoder API: `JxlEncoderCloseInput` now closes both frames and boxes input.
+- CLI: `cjxl` and `djxl` have been reimplemented on the base of public decoder
+  and encoder API; dropped dependency on `gflags` for argument parsing.
+
+### Deprecated
+- decoder API: `JXL_DEC_EXTENSIONS` event: use `JXL_DEC_BASIC_INFO`
+- decoder / encoder API: pixel types `JXL_TYPE_BOOLEAN` and `JXL_TYPE_UINT32`:
+  consider using `JXL_TYPE_UINT8` and `JXL_TYPE_FLOAT` correspondingly.
+- decoder API: pixel format parameter for `JxlDecoderGetColorAsEncodedProfile`
+  and `JxlDecoderGetICCProfileSize`: pass `NULL`.
+- decoder API: `JxlDecoderDefaultPixelFormat`
+- encoder API: `JxlEncoderOptions`: use `JxlEncoderFrameSettings` instead.
+- encoder API: `JxlEncoderOptionsCreate`: use `JxlEncoderFrameSettingsCreate`
+  instead.
+- encoder API: `JxlEncoderOptionsSetDistance`: use `JxlEncoderSetFrameDistance`
+  instead.
+- encoder API: `JxlEncoderOptionsSetLossless`: use `JxlEncoderSetFrameLossless`
+  instead.
+- encoder API: `JxlEncoderOptionsSetEffort`: use
+  `JxlEncoderFrameSettingsSetOption(frame_settings, JXL_ENC_FRAME_SETTING_EFFORT, effort)`
+  instead.
+- encoder API: `JxlEncoderOptionsSetDecodingSpeed`: use
+  `JxlEncoderFrameSettingsSetOption(frame_settings, JXL_ENC_FRAME_SETTING_DECODING_SPEED, tier)`
+  instead.
+- encoder API: deprecated `JXL_ENC_NOT_SUPPORTED`, the encoder returns
+  `JXL_ENC_ERROR` instead and there is no need to handle
+  `JXL_ENC_NOT_SUPPORTED`.
+
+## [0.6.1] - 2021-10-29
+### Changed
+ - Security: Fix OOB read in splines rendering (#735 -
+   [CVE-2021-22563](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2021-22563))
+ - Security: Fix OOB copy (read/write) in out-of-order/multi-threaded decoding
+   (#708 - [CVE-2021-22564](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2021-22564))
+ - Fix segfault in `djxl` tool with `--allow_partial_files` flag (#781).
+ - Fix border in extra channels when using upsampling (#796)
+
+## [0.6] - 2021-10-04
+### Added
+ - API: New functions to decode extra channels:
+   `JxlDecoderExtraChannelBufferSize` and `JxlDecoderSetExtraChannelBuffer`.
+ - API: New function `JxlEncoderInitBasicInfo` to initialize `JxlBasicInfo`
+   (only needed when encoding). NOTE: it is now required to call this function
+   when using the encoder. Padding was added to the struct for forward
+   compatibility.
+ - API: Support for encoding oriented images.
+ - API: FLOAT16 support in the encoder API.
+ - Rewrite of the GDK pixbuf loader plugin. Added proper color management and
+   animation support.
+ - Rewrite of GIMP plugin. Added compression parameters dialog and switched to
+   using the public C API.
+ - Debian packages for GDK pixbuf loader (`libjxl-gdk-pixbuf`) and GIMP
+   (`libjxl-gimp-plugin`) plugins.
+ - `cjxl`/`djxl` support for `stdin` and `stdout`.
+
+### Changed
+ - API: Renamed the field `alpha_associated` in `JxlExtraChannelInfo` to
+   `alpha_premultiplied`, to match the corresponding name in `JxlBasicInfo`.
+ - Improved the 2x2 downscaling method in the encoder for the optional color
+   channel resampling for low bit rates.
+ - Fixed: the combination of floating point original data, XYB color encoding,
+   and Modular mode was broken (in both encoder and decoder). It now works.
+   NOTE: this can cause the current encoder to write jxl bitstreams that do
+   not decode with the old decoder. In particular this will happen when using
+   cjxl with PFM, EXR, or floating point PSD input, and a combination of XYB
+   and modular mode is used (which caused an encoder error before), e.g.
+   using options like `-m -q 80` (lossy modular), `-d 4.5` or `--progressive_dc=1`
+   (modular DC frame), or default lossy encoding on an image where patches
+   end up being used. There is no problem when using cjxl with PNG, JPEG, GIF,
+   APNG, PPM, PGM, PGX, or integer (8-bit or 16-bit) PSD input.
+ - `libjxl` static library now bundles skcms, fixing static linking in
+   downstream projects when skcms is used.
+ - Spline rendering performance improvements.
+ - Butteraugli changes for less visual masking.
+
+## [0.5] - 2021-08-02
+### Added
+ - API: New function to decode the image using a callback outputting a part of a
+   row per call.
+ - API: 16-bit float output support.
+ - API: `JxlDecoderRewind` and `JxlDecoderSkipFrames` functions to skip more
+   efficiently to earlier animation frames.
+ - API: `JxlDecoderSetPreferredColorProfile` function to choose color profile in
+   certain circumstances.
+ - encoder: Adding `center_x` and `center_y` flags for more control of the tile
+   order.
+ - New encoder speeds `lightning` (1) and `thunder` (2).
+
+### Changed
+ - Re-licensed the project under a BSD 3-Clause license. See the
+   [LICENSE](LICENSE) and [PATENTS](PATENTS) files for details.
+ - Full JPEG XL part 1 specification support: Implemented all the spec required
+   to decode files to pixels, including cases that are not used by the encoder
+   yet. Part 2 of the spec (container format) is final but not fully implemented
+   here.
+ - Butteraugli metric improvements. Exact numbers are different from previous
+   versions.
+ - Memory reductions during decoding.
+ - Reduce the size of the jxl_dec library by removing dependencies.
+ - A few encoding speedups.
+ - Clarify the security policy.
+ - Significant encoding improvements (~5 %) and less ringing.
+ - Butteraugli metric to have some less masking.
+ - `cjxl` flag `--speed` is deprecated and replaced by the `--effort` synonym.
+
+### Removed
+- API for returning a downsampled DC was deprecated
+  (`JxlDecoderDCOutBufferSize` and `JxlDecoderSetDCOutBuffer`) and will be
+  removed in the next release.
+
+## [0.3.7] - 2021-03-29
+### Changed
+ - Fix a rounding issue in 8-bit decoding.
+
+## [0.3.6] - 2021-03-25
+### Changed
+ - Fix a bug that could result in the generation of invalid codestreams as
+   well as failure to decode valid streams.
+
+## [0.3.5] - 2021-03-23
+### Added
+ - New encode-time options for faster decoding at the cost of quality.
+ - Man pages for cjxl and djxl.
+
+### Changed
+ - Memory usage improvements.
+ - Faster decoding to 8-bit output with the C API.
+ - GIMP plugin: avoid the sRGB conversion dialog for sRGB images, do not show
+   a console window on Windows.
+ - Various bug fixes.
+
+## [0.3.4] - 2021-03-16
+### Changed
+ - Improved box parsing.
+ - Improved metadata handling.
+ - Performance and memory usage improvements.
+
+## [0.3.3] - 2021-03-05
+### Changed
+ - Performance improvements for small images.
+ - Add a (flag-protected) non-high-precision mode with better speed.
+ - Significantly speed up the PQ EOTF.
+ - Allow optional HDR tone mapping in djxl (--tone_map, --display_nits).
+ - Change the behavior of djxl -j to make it consistent with cjxl (#153).
+ - Improve image quality.
+ - Improve EXIF handling.
+
+## [0.3.2] - 2021-02-12
+### Changed
+ - Fix embedded ICC encoding regression
+   [#149](https://gitlab.com/wg1/jpeg-xl/-/issues/149).
+
+## [0.3.1] - 2021-02-10
+### Changed
+ - New experimental Butteraugli API (`jxl/butteraugli.h`).
+ - Encoder improvements to low quality settings.
+ - Bug fixes, including fuzzer-found potential security bug fixes.
+ - Fixed `-q 100` and `-d 0` not triggering lossless modes.
+
+## [0.3] - 2021-01-29
+### Changed
+ - Minor change to the Decoder C API to accommodate future work for other ways
+   to provide input.
+ - Future decoder C API changes will be backwards compatible.
+ - Lots of bug fixes since the previous version.
+
+## [0.2] - 2020-12-24
+### Added
+ - JPEG XL bitstream format is frozen. Files encoded with 0.2 will be supported
+   by future versions.
+
+### Changed
+ - Files encoded with previous versions are not supported.
+
+## [0.1.1] - 2020-12-01
+
+## [0.1] - 2020-11-14
+### Added
+ - Initial release of an encoder (`cjxl`) and decoder (`djxl`) that work
+   together as well as a benchmark tool for comparison with other codecs
+   (`benchmark_xl`).
+ - Note: JPEG XL format is in the final stages of standardization, minor changes
+   to the codestream format are still possible but we are not expecting any
+   changes beyond what is required by bug fixing.
+ - API: new decoder API in C, check the `examples/` directory for its example
+   usage. The C API is a work in progress and likely to change both in API and
+   ABI in future releases.
diff --git a/third_party/jpeg-xl/CMakeLists.txt b/third_party/jpeg-xl/CMakeLists.txt
new file mode 100644
index 0000000000..02af13f78e
--- /dev/null
+++ b/third_party/jpeg-xl/CMakeLists.txt
@@ -0,0 +1,494 @@
+# Copyright (c) the JPEG XL Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+# Ubuntu bionic ships with cmake 3.10.
+cmake_minimum_required(VERSION 3.10)
+list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
+
+# Honor VISIBILITY_INLINES_HIDDEN on all types of targets.
+if(POLICY CMP0063)
+  cmake_policy(SET CMP0063 NEW)
+endif()
+# Pass CMAKE_EXE_LINKER_FLAGS to CC and CXX compilers when testing if they work.
+if(POLICY CMP0065)
+  cmake_policy(SET CMP0065 NEW)
+endif()
+
+# Set PIE flags for POSITION_INDEPENDENT_CODE targets, added in 3.14.
+if(POLICY CMP0083)
+  cmake_policy(SET CMP0083 NEW)
+endif()
+
+project(LIBJXL LANGUAGES C CXX)
+
+include(CheckCXXSourceCompiles)
+check_cxx_source_compiles(
+   "int main() {
+      #if !defined(__EMSCRIPTEN__)
+      static_assert(false, \"__EMSCRIPTEN__ is not defined\");
+      #endif
+      return 0;
+    }"
+  JPEGXL_EMSCRIPTEN
+)
+
+message(STATUS "CMAKE_SYSTEM_PROCESSOR is ${CMAKE_SYSTEM_PROCESSOR}")
+include(CheckCXXCompilerFlag)
+check_cxx_compiler_flag("-fsanitize=fuzzer-no-link" CXX_FUZZERS_SUPPORTED)
+check_cxx_compiler_flag("-Xclang -mconstructor-aliases" CXX_CONSTRUCTOR_ALIASES_SUPPORTED)
+check_cxx_compiler_flag("-fmacro-prefix-map=OLD=NEW" CXX_MACRO_PREFIX_MAP)
+check_cxx_compiler_flag("-fno-rtti" CXX_NO_RTTI_SUPPORTED)
+
+# Enabled PIE binaries by default if supported.
+include(CheckPIESupported OPTIONAL RESULT_VARIABLE CHECK_PIE_SUPPORTED)
+if(CHECK_PIE_SUPPORTED)
+  check_pie_supported(LANGUAGES CXX)
+  if(CMAKE_CXX_LINK_PIE_SUPPORTED)
+    set(CMAKE_POSITION_INDEPENDENT_CODE TRUE)
+  endif()
+endif()
+
+### Project build options:
+if(CXX_FUZZERS_SUPPORTED)
+  # Enabled by default except on arm64, Windows and Apple builds.
+  set(ENABLE_FUZZERS_DEFAULT true)
+endif()
+find_package(PkgConfig)
+if(NOT APPLE AND NOT WIN32 AND NOT HAIKU AND CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64")
+  pkg_check_modules(TCMallocMinimalVersionCheck QUIET IMPORTED_TARGET
+      libtcmalloc_minimal)
+  if(TCMallocMinimalVersionCheck_FOUND AND
+     NOT TCMallocMinimalVersionCheck_VERSION VERSION_EQUAL 2.8.0)
+    # Enabled by default except on Windows and Apple builds for
+    # tcmalloc != 2.8.0. tcmalloc 2.8.1 already has a fix for this issue.
+    set(ENABLE_TCMALLOC_DEFAULT true)
+  else()
+    message(STATUS
+        "tcmalloc version ${TCMallocMinimalVersionCheck_VERSION} -- "
+        "tcmalloc 2.8.0 disabled due to "
+        "https://github.com/gperftools/gperftools/issues/1204")
+  endif()
+endif()
+
+check_cxx_source_compiles(
+   "int main() {
+      #if !defined(HWY_DISABLED_TARGETS)
+      static_assert(false, \"HWY_DISABLED_TARGETS is not defined\");
+      #endif
+      return 0;
+    }"
+  JXL_HWY_DISABLED_TARGETS_FORCED
+)
+
+set(WARNINGS_AS_ERRORS_DEFAULT false)
+
+if((SANITIZER STREQUAL "msan") OR JPEGXL_EMSCRIPTEN)
+  set(BUNDLE_LIBPNG_DEFAULT YES)
+else()
+  set(BUNDLE_LIBPNG_DEFAULT NO)
+endif()
+
+# Standard cmake naming for building shared libraries.
+get_property(SHARED_LIBS_SUPPORTED GLOBAL PROPERTY TARGET_SUPPORTS_SHARED_LIBS)
+option(BUILD_SHARED_LIBS "Build shared libraries instead of static ones" ${SHARED_LIBS_SUPPORTED})
+
+set(JPEGXL_ENABLE_FUZZERS ${ENABLE_FUZZERS_DEFAULT} CACHE BOOL
+    "Build JPEGXL fuzzer targets.")
+set(JPEGXL_ENABLE_DEVTOOLS false CACHE BOOL
+    "Build JPEGXL developer tools.")
+set(JPEGXL_ENABLE_TOOLS true CACHE BOOL
+    "Build JPEGXL user tools: cjxl and djxl.")
+set(JPEGXL_ENABLE_JPEGLI true CACHE BOOL
+    "Build jpegli library.")
+set(JPEGXL_ENABLE_JPEGLI_LIBJPEG true CACHE BOOL
+    "Build libjpeg.so shared library based on jpegli.")
+set(JPEGLI_LIBJPEG_LIBRARY_VERSION "62.3.0" CACHE STRING
+    "Library version of the libjpeg.so shared library that we build.")
+set(JPEGLI_LIBJPEG_LIBRARY_SOVERSION "62" CACHE STRING
+    "Library so-version of the libjpeg.so shared library that we build.")
+set(JPEGXL_ENABLE_DOXYGEN true CACHE BOOL
+    "Generate C API documentation using Doxygen.")
+set(JPEGXL_ENABLE_MANPAGES true CACHE BOOL
+    "Build and install man pages for the command-line tools.")
+set(JPEGXL_ENABLE_BENCHMARK true CACHE BOOL
+    "Build JPEGXL benchmark tools.")
+set(JPEGXL_ENABLE_EXAMPLES true CACHE BOOL
+    "Build JPEGXL library usage examples.")
+set(JPEGXL_BUNDLE_LIBPNG ${BUNDLE_LIBPNG_DEFAULT} CACHE BOOL
+    "Build libpng from source and link it statically.")
+set(JPEGXL_ENABLE_JNI true CACHE BOOL
+    "Build JPEGXL JNI Java wrapper, if Java dependencies are installed.")
+set(JPEGXL_ENABLE_SJPEG true CACHE BOOL
+    "Build JPEGXL with support for encoding with sjpeg.")
+set(JPEGXL_ENABLE_OPENEXR true CACHE BOOL
+    "Build JPEGXL with support for OpenEXR if available.")
+set(JPEGXL_ENABLE_SKCMS true CACHE BOOL
+    "Build with skcms instead of lcms2.")
+set(JPEGXL_BUNDLE_SKCMS true CACHE BOOL
+    "When building with skcms, bundle it into libjxl.a.")
+set(JPEGXL_ENABLE_VIEWERS false CACHE BOOL
+    "Build JPEGXL viewer tools for evaluation.")
+set(JPEGXL_ENABLE_TCMALLOC ${ENABLE_TCMALLOC_DEFAULT} CACHE BOOL
+    "Build JPEGXL using gperftools (tcmalloc) allocator.")
+set(JPEGXL_ENABLE_PLUGINS false CACHE BOOL
+    "Build third-party plugins to support JPEG XL in other applications.")
+set(JPEGXL_ENABLE_COVERAGE false CACHE BOOL
+    "Enable code coverage tracking for libjxl. This also enables debug and disables optimizations.")
+set(JPEGXL_ENABLE_PROFILER false CACHE BOOL
+    "Builds in support for profiling (printed by tools if extra flags given)")
+set(JPEGXL_ENABLE_SIZELESS_VECTORS false CACHE BOOL
+    "Builds in support for SVE/RVV vectorization")
+set(JPEGXL_ENABLE_TRANSCODE_JPEG true CACHE BOOL
+    "Builds in support for decoding transcoded JXL files back to JPEG,\
+ disabling it makes the decoder reject JXL_DEC_JPEG_RECONSTRUCTION events,\
+ (default enabled)")
+set(JPEGXL_ENABLE_BOXES true CACHE BOOL
+    "Builds in support for decoding boxes in JXL files,\
+ disabling it makes the decoder reject JXL_DEC_BOX events,\
+ (default enabled)")
+set(JPEGXL_STATIC false CACHE BOOL
+    "Build tools as static binaries.")
+set(JPEGXL_WARNINGS_AS_ERRORS ${WARNINGS_AS_ERRORS_DEFAULT} CACHE BOOL
+    "Treat warnings as errors during compilation.")
+set(JPEGXL_DEP_LICENSE_DIR "" CACHE STRING
+    "Directory where to search for system dependencies \"copyright\" files.")
+set(JPEGXL_FORCE_NEON false CACHE BOOL
+    "Set flags to enable NEON in arm if not enabled by your toolchain.")
+set(JPEGXL_TEST_TOOLS false CACHE BOOL
+    "Run scripts that test the encoding / decoding tools.")
+
+# Force system dependencies.
+set(JPEGXL_FORCE_SYSTEM_BROTLI false CACHE BOOL
+    "Force using system installed brotli instead of third_party/brotli source.")
+set(JPEGXL_FORCE_SYSTEM_GTEST false CACHE BOOL
+    "Force using system installed googletest (gtest/gmock) instead of third_party/googletest source.")
+set(JPEGXL_FORCE_SYSTEM_LCMS2 false CACHE BOOL
+    "Force using system installed lcms2 instead of third_party/lcms source.")
+set(JPEGXL_FORCE_SYSTEM_HWY false CACHE BOOL
+    "Force using system installed highway (libhwy-dev) instead of third_party/highway source.")
+
+# Check minimum compiler versions. Older compilers are not supported and fail
+# with hard to understand errors.
+if (NOT CMAKE_C_COMPILER_ID STREQUAL CMAKE_CXX_COMPILER_ID)
+  message(FATAL_ERROR "Different C/C++ compilers set: "
+          "${CMAKE_C_COMPILER_ID} vs ${CMAKE_CXX_COMPILER_ID}")
+endif()
+if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+  # Android NDK's toolchain.cmake fakes the clang version in
+  # CMAKE_CXX_COMPILER_VERSION with an incorrect number, so ignore this.
+  if (NOT CMAKE_ANDROID_NDK_TOOLCHAIN_VERSION MATCHES "clang"
+      AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5)
+    message(FATAL_ERROR
+      "Minimum Clang version required is Clang 5, please update.")
+  endif()
+elseif (CMAKE_CXX_COMPILER_ID MATCHES "GNU")
+  if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 7)
+    message(FATAL_ERROR
+      "Minimum GCC version required is 7, please update.")
+  endif()
+endif()
+
+message(STATUS
+    "Compiled IDs C:${CMAKE_C_COMPILER_ID}, C++:${CMAKE_CXX_COMPILER_ID}")
+
+# CMAKE_EXPORT_COMPILE_COMMANDS is used to generate the compilation database
+# used by clang-tidy.
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+
+if(JPEGXL_STATIC)
+  set(BUILD_SHARED_LIBS 0)
+  # Clang developers say that in case to use "static" we have to build stdlib
+  # ourselves; for real use case we don't care about stdlib, as it is "granted",
+  # so just linking all other libraries is fine.
+  if (NOT MSVC AND NOT APPLE)
+    set(CMAKE_FIND_LIBRARY_SUFFIXES .a)
+    set(CMAKE_EXE_LINKER_FLAGS
+        "${CMAKE_EXE_LINKER_FLAGS} -static -static-libgcc -static-libstdc++")
+  endif()
+endif()  # JPEGXL_STATIC
+
+# Threads
+set(THREADS_PREFER_PTHREAD_FLAG YES)
+find_package(Threads REQUIRED)
+
+# These settings are important to drive check_cxx_source_compiles
+# See CMP0067 (min cmake version is 3.10 anyway)
+set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_CXX_EXTENSIONS OFF)
+set(CMAKE_CXX_STANDARD_REQUIRED YES)
+
+# Atomics
+find_package(Atomics REQUIRED)
+
+if(JPEGXL_STATIC)
+  if (MINGW)
+    # In MINGW libstdc++ uses pthreads directly. When building statically a
+    # program (regardless of whether the source code uses pthread or not) the
+    # toolchain will add stdc++ and pthread to the linking step but stdc++ will
+    # be linked statically while pthread will be linked dynamically.
+    # To avoid this and have pthread statically linked with need to pass it in
+    # the command line with "-Wl,-Bstatic -lpthread -Wl,-Bdynamic" but the
+    # linker will discard it if not used by anything else up to that point in
+    # the linker command line. If the program or any dependency don't use
+    # pthread directly -lpthread is discarded and libstdc++ (added by the
+    # toolchain later) will then use the dynamic version. For this we also need
+    # to pass -lstdc++ explicitly before -lpthread. For pure C programs -lstdc++
+    # will be discarded anyway.
+    # This adds these flags as dependencies for *all* targets. Adding this to
+    # CMAKE_EXE_LINKER_FLAGS instead would cause them to be included before any
+    # object files and therefore discarded. This should be set in the
+    # INTERFACE_LINK_LIBRARIES of Threads::Threads but some third_part targets
+    # don't depend on it.
+    link_libraries(-Wl,-Bstatic -lstdc++ -lpthread -Wl,-Bdynamic)
+  elseif(CMAKE_USE_PTHREADS_INIT)
+    # "whole-archive" is not supported on OSX.
+    if (NOT APPLE)
+      # Set pthreads as a whole-archive, otherwise weak symbols in the static
+      # libraries will discard pthreads symbols leading to segmentation fault at
+      # runtime.
+      message(STATUS "Using -lpthread as --whole-archive")
+      set_target_properties(Threads::Threads PROPERTIES
+        INTERFACE_LINK_LIBRARIES
+            "-Wl,--whole-archive;-lpthread;-Wl,--no-whole-archive")
+    endif()
+  endif()
+endif()  # JPEGXL_STATIC
+
+if (JPEGXL_EMSCRIPTEN)
+  set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -pthread")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")
+  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -pthread")
+endif()
+
+if (CXX_MACRO_PREFIX_MAP)
+  add_compile_options(-fmacro-prefix-map=${CMAKE_CURRENT_SOURCE_DIR}=.)
+endif()
+
+if (CXX_NO_RTTI_SUPPORTED)
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-rtti")
+endif()
+
+# Internal flags for coverage builds:
+set(JPEGXL_COVERAGE_FLAGS)
+set(JPEGXL_COVERAGE_LINK_FLAGS)
+
+if (MSVC)
+  # TODO(janwas): add flags
+  add_definitions(-D_CRT_SECURE_NO_WARNINGS)
+else ()
+  # Global compiler flags for all targets here and in subdirectories.
+  add_definitions(
+    # Avoid changing the binary based on the current time and date.
+    -D__DATE__="redacted"
+    -D__TIMESTAMP__="redacted"
+    -D__TIME__="redacted"
+  )
+
+  # TODO(eustas): JXL currently compiles, but does not pass tests...
+  if (NOT JXL_HWY_DISABLED_TARGETS_FORCED AND NOT JPEGXL_ENABLE_SIZELESS_VECTORS)
+    add_definitions(-DHWY_DISABLED_TARGETS=\(HWY_SVE|HWY_SVE2|HWY_SVE_256|HWY_SVE2_128|HWY_RVV\))
+    message("Warning: HWY_SVE, HWY_SVE2, HWY_SVE_256, HWY_SVE2_128 and HWY_RVV CPU targets are disabled")
+  endif()
+
+  # In CMake before 3.12 it is problematic to pass repeated flags like -Xclang.
+  # For this reason we place them in CMAKE_CXX_FLAGS instead.
+  # See https://gitlab.kitware.com/cmake/cmake/issues/15826
+
+  # Machine flags.
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -funwind-tables")
+  if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Xclang -mrelax-all")
+  endif()
+  if (CXX_CONSTRUCTOR_ALIASES_SUPPORTED)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Xclang -mconstructor-aliases")
+  endif()
+
+  if(WIN32)
+    # Not supported by clang-cl, but frame pointers are default on Windows
+  else()
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-omit-frame-pointer")
+  endif()
+
+  # CPU flags - remove once we have NEON dynamic dispatch
+
+  # TODO(janwas): this also matches M1, but only ARMv7 is intended/needed.
+  if(CMAKE_SYSTEM_PROCESSOR MATCHES "arm")
+    if(JPEGXL_FORCE_NEON)
+      # GCC requires these flags, otherwise __ARM_NEON is undefined.
+      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} \
+        -mfpu=neon-vfpv4 -mfloat-abi=hard")
+    endif()
+  endif()
+
+  # Force build with optimizations in release mode.
+  set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O2")
+
+  add_compile_options(
+    # Ignore this to allow redefining __DATE__ and others.
+    -Wno-builtin-macro-redefined
+
+    # Global warning settings.
+    -Wall
+  )
+
+  if (JPEGXL_WARNINGS_AS_ERRORS)
+    add_compile_options(-Werror)
+  endif ()
+
+  if(JPEGXL_ENABLE_COVERAGE)
+    set(JPEGXL_COVERAGE_FLAGS
+        -g -O0 -fprofile-arcs -ftest-coverage
+        -DJXL_ENABLE_ASSERT=0 -DJXL_ENABLE_CHECK=0
+    )
+    set(JPEGXL_COVERAGE_LINK_FLAGS
+        --coverage
+    )
+  endif()  # JPEGXL_ENABLE_COVERAGE
+endif ()  # !MSVC
+
+include(GNUInstallDirs)
+
+# Separately build/configure testing frameworks and other third_party libraries
+# to allow disabling tests in those libraries.
+include(third_party/testing.cmake)
+add_subdirectory(third_party)
+# Copy the JXL license file to the output build directory.
+configure_file("${CMAKE_CURRENT_SOURCE_DIR}/LICENSE"
+               ${PROJECT_BINARY_DIR}/LICENSE.jpeg-xl COPYONLY)
+
+# Enable tests regardless of where they are defined.
+enable_testing()
+include(CTest)
+# Specify default location of `testdata`:
+if(NOT DEFINED JPEGXL_TEST_DATA_PATH)
+  set(JPEGXL_TEST_DATA_PATH "${PROJECT_SOURCE_DIR}/testdata")
+endif()
+
+# Libraries.
+add_subdirectory(lib)
+
+if(BUILD_TESTING)
+  # Script to run tests over the source code in bash.
+  find_program (BASH_PROGRAM bash)
+  if(BASH_PROGRAM)
+    add_test(
+      NAME bash_test
+      COMMAND ${BASH_PROGRAM} ${CMAKE_CURRENT_SOURCE_DIR}/bash_test.sh)
+  endif()
+endif() # BUILD_TESTING
+
+# Documentation generated by Doxygen
+if(JPEGXL_ENABLE_DOXYGEN)
+  find_package(Doxygen)
+  if(DOXYGEN_FOUND)
+    set(DOXYGEN_GENERATE_HTML "YES")
+    set(DOXYGEN_GENERATE_XML "YES")
+    set(DOXYGEN_STRIP_FROM_PATH "${CMAKE_CURRENT_SOURCE_DIR}/lib/include")
+    set(DOXYGEN_USE_MDFILE_AS_MAINPAGE "README.md")
+    if(JPEGXL_WARNINGS_AS_ERRORS)
+      set(DOXYGEN_WARN_AS_ERROR "YES")
+    endif()
+    set(DOXYGEN_QUIET "YES")
+    doxygen_add_docs(doc
+      "${CMAKE_CURRENT_SOURCE_DIR}/lib/include"
+      "${CMAKE_CURRENT_SOURCE_DIR}/doc/api.txt"
+      WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}"
+      COMMENT "Generating C API documentation")
+
+    # Add sphinx doc build step for readthedocs.io (requires doxygen too).
+    find_program(SPHINX_BUILD_PROGRAM sphinx-build)
+    if(SPHINX_BUILD_PROGRAM)
+      add_custom_command(
+        OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/rtd/nonexistent"
+        COMMENT "Generating readthedocs.io output on ${CMAKE_CURRENT_BINARY_DIR}/rtd"
+        COMMAND ${SPHINX_BUILD_PROGRAM} -q -W -b html -j auto
+          ${CMAKE_SOURCE_DIR}/doc/sphinx
+          ${CMAKE_CURRENT_BINARY_DIR}/rtd
+        DEPENDS doc
+      )
+      # This command runs the documentation generation every time since the output
+      # target file doesn't exist.
+      add_custom_target(rtd-html
+        DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/rtd/nonexistent
+      )
+    else() # SPHINX_BUILD_PROGRAM\
+      message(WARNING "sphinx-build not found, skipping rtd documentation")
+    endif() # SPHINX_BUILD_PROGRAM
+
+  else()
+    # Create a "doc" target for compatibility since "doc" is not otherwise added to
+    # the build when doxygen is not installed.
+    add_custom_target(doc false
+      COMMENT "Error: Can't generate doc since Doxygen not installed.")
+  endif() # DOXYGEN_FOUND
+endif() # JPEGXL_ENABLE_DOXYGEN
+
+if(JPEGXL_ENABLE_MANPAGES)
+  find_program(ASCIIDOC a2x)
+  if(ASCIIDOC)
+    file(STRINGS "${ASCIIDOC}" ASCIIDOC_SHEBANG LIMIT_COUNT 1)
+    if(ASCIIDOC_SHEBANG MATCHES "/sh|/bash" OR MINGW)
+      set(ASCIIDOC_PY_FOUND ON)
+      # Run the program directly and set ASCIIDOC as empty.
+      set(ASCIIDOC_PY "${ASCIIDOC}")
+      set(ASCIIDOC "")
+    elseif(ASCIIDOC_SHEBANG MATCHES "python2")
+      find_package(Python2 COMPONENTS Interpreter)
+      set(ASCIIDOC_PY_FOUND "${Python2_Interpreter_FOUND}")
+      set(ASCIIDOC_PY Python2::Interpreter)
+    elseif(ASCIIDOC_SHEBANG MATCHES "python3")
+      find_package(Python3 COMPONENTS Interpreter)
+      set(ASCIIDOC_PY_FOUND "${Python3_Interpreter_FOUND}")
+      set(ASCIIDOC_PY Python3::Interpreter)
+    else()
+      find_package(Python COMPONENTS Interpreter QUIET)
+      if(NOT Python_Interpreter_FOUND)
+        find_program(ASCIIDOC_PY python)
+        if(ASCIIDOC_PY)
+          set(ASCIIDOC_PY_FOUND ON)
+        endif()
+      else()
+        set(ASCIIDOC_PY_FOUND "${Python_Interpreter_FOUND}")
+        set(ASCIIDOC_PY Python::Interpreter)
+      endif()
+    endif()
+
+    if (ASCIIDOC_PY_FOUND)
+      set(MANPAGE_FILES "")
+      set(MANPAGES "")
+      foreach(PAGE IN ITEMS cjxl djxl)
+        # Invoking the Python interpreter ourselves instead of running the a2x binary
+        # directly is necessary on MSYS2, otherwise it is run through cmd.exe which
+        # does not recognize it.
+        add_custom_command(
+          OUTPUT "${PAGE}.1"
+          COMMAND "${ASCIIDOC_PY}"
+          ARGS ${ASCIIDOC}
+            --format manpage --destination-dir="${CMAKE_CURRENT_BINARY_DIR}"
+            "${CMAKE_CURRENT_SOURCE_DIR}/doc/man/${PAGE}.txt"
+          MAIN_DEPENDENCY "${CMAKE_CURRENT_SOURCE_DIR}/doc/man/${PAGE}.txt")
+        list(APPEND MANPAGE_FILES "${CMAKE_CURRENT_BINARY_DIR}/${PAGE}.1")
+        list(APPEND MANPAGES "${PAGE}.1")
+      endforeach()
+      add_custom_target(manpages ALL DEPENDS ${MANPAGES})
+      install(FILES ${MANPAGE_FILES} DESTINATION ${CMAKE_INSTALL_MANDIR}/man1)
+    endif()  # ASCIIDOC_PY_FOUND
+  else()
+    message(WARNING "asciidoc was not found, the man pages will not be installed.")
+  endif()  # ASCIIDOC
+endif()  # JPEGXL_ENABLE_MANPAGES
+
+# Example usage code.
+if (JPEGXL_ENABLE_EXAMPLES)
+  include(examples/examples.cmake)
+endif ()
+
+# Plugins for third-party software
+if (JPEGXL_ENABLE_PLUGINS)
+  add_subdirectory(plugins)
+endif ()
+
+# Binary tools
+add_subdirectory(tools)
diff --git a/third_party/jpeg-xl/CODE_OF_CONDUCT.md b/third_party/jpeg-xl/CODE_OF_CONDUCT.md
new file mode 100644
index 0000000000..b2d81a3214
--- /dev/null
+++ b/third_party/jpeg-xl/CODE_OF_CONDUCT.md
@@ -0,0 +1,93 @@
+# Code of Conduct
+
+## Our Pledge
+
+In the interest of fostering an open and welcoming environment, we as
+contributors and maintainers pledge to making participation in our project and
+our community a harassment-free experience for everyone, regardless of age, body
+size, disability, ethnicity, gender identity and expression, level of
+experience, education, socio-economic status, nationality, personal appearance,
+race, religion, or sexual identity and orientation.
+
+## Our Standards
+
+Examples of behavior that contributes to creating a positive environment
+include:
+
+*   Using welcoming and inclusive language
+*   Being respectful of differing viewpoints and experiences
+*   Gracefully accepting constructive criticism
+*   Focusing on what is best for the community
+*   Showing empathy towards other community members
+
+Examples of unacceptable behavior by participants include:
+
+*   The use of sexualized language or imagery and unwelcome sexual attention or
+    advances
+*   Trolling, insulting/derogatory comments, and personal or political attacks
+*   Public or private harassment
+*   Publishing others' private information, such as a physical or electronic
+    address, without explicit permission
+*   Other conduct which could reasonably be considered inappropriate in a
+    professional setting
+
+## Our Responsibilities
+
+Project maintainers are responsible for clarifying the standards of acceptable
+behavior and are expected to take appropriate and fair corrective action in
+response to any instances of unacceptable behavior.
+
+Project maintainers have the right and responsibility to remove, edit, or reject
+comments, commits, code, wiki edits, issues, and other contributions that are
+not aligned to this Code of Conduct, or to ban temporarily or permanently any
+contributor for other behaviors that they deem inappropriate, threatening,
+offensive, or harmful.
+
+## Scope
+
+This Code of Conduct applies both within project spaces and in public spaces
+when an individual is representing the project or its community. Examples of
+representing a project or community include using an official project e-mail
+address, posting via an official social media account, or acting as an appointed
+representative at an online or offline event. Representation of a project may be
+further defined and clarified by project maintainers.
+
+This Code of Conduct also applies outside the project spaces when the Project
+Steward has a reasonable belief that an individual's behavior may have a
+negative impact on the project or its community.
+
+## Conflict Resolution
+
+We do not believe that all conflict is bad; healthy debate and disagreement
+often yield positive results. However, it is never okay to be disrespectful or
+to engage in behavior that violates the project’s code of conduct.
+
+If you see someone violating the code of conduct, you are encouraged to address
+the behavior directly with those involved. Many issues can be resolved quickly
+and easily, and this gives people more control over the outcome of their
+dispute. If you are unable to resolve the matter for any reason, or if the
+behavior is threatening or harassing, report it. We are dedicated to providing
+an environment where participants feel welcome and safe.
+
+Reports should be directed to Jyrki Alakuijala <jyrki@google.com>, the
+Project Steward(s) for JPEG XL. It is the Project Steward’s duty to
+receive and address reported violations of the code of conduct. They will then
+work with a committee consisting of representatives from the Open Source
+Programs Office and the Google Open Source Strategy team. If for any reason you
+are uncomfortable reaching out to the Project Steward, please email
+opensource@google.com.
+
+We will investigate every complaint, but you may not receive a direct response.
+We will use our discretion in determining when and how to follow up on reported
+incidents, which may range from not taking action to permanent expulsion from
+the project and project-sponsored spaces. We will notify the accused of the
+report and provide them an opportunity to discuss it before any action is taken.
+The identity of the reporter will be omitted from the details of the report
+supplied to the accused. In potentially harmful situations, such as ongoing
+harassment or threats to anyone's safety, we may take action without notice.
+
+## Attribution
+
+This Code of Conduct is adapted from the Contributor Covenant, version 1.4,
+available at
+https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
diff --git a/third_party/jpeg-xl/CONTRIBUTING.md b/third_party/jpeg-xl/CONTRIBUTING.md
new file mode 100644
index 0000000000..cb6459797c
--- /dev/null
+++ b/third_party/jpeg-xl/CONTRIBUTING.md
@@ -0,0 +1,132 @@
+# Contributing to libjxl
+
+## Contributing with bug reports
+
+For security-related issues please see [SECURITY.md](SECURITY.md).
+
+We welcome suggestions, feature requests and bug reports. Before opening a new
+issue please take a look if there is already an existing one in the following
+link:
+
+ *  https://github.com/libjxl/libjxl/issues
+
+## Contributing with patches and Pull Requests
+
+We'd love to accept your contributions to the JPEG XL Project. Please read
+through this section before sending a Pull Request.
+
+### Contributor License Agreements
+
+Our project is open source under the terms outlined in the [LICENSE](LICENSE)
+and [PATENTS](PATENTS) files. Before we can accept your contributions, even for
+small changes, there are just a few small guidelines you need to follow:
+
+Please fill out either the individual or corporate Contributor License Agreement
+(CLA) with Google. JPEG XL Project is an an effort by multiple individuals and
+companies, including the initial contributors Cloudinary and Google, but Google
+is the legal entity in charge of receiving these CLA and relicensing this
+software:
+
+  * If you are an individual writing original source code and you're sure you
+  own the intellectual property, then you'll need to sign an [individual
+  CLA](https://code.google.com/legal/individual-cla-v1.0.html).
+
+  * If you work for a company that wants to allow you to contribute your work,
+  then you'll need to sign a [corporate
+  CLA](https://code.google.com/legal/corporate-cla-v1.0.html).
+
+Follow either of the two links above to access the appropriate CLA and
+instructions for how to sign and return it. Once we receive it, we'll be able
+to accept your pull requests.
+
+***NOTE***: Only original source code from you and other people that have signed
+the CLA can be accepted into the main repository.
+
+### License
+
+Contributions are licensed under the project's [LICENSE](LICENSE). Each new
+file must include the following header when possible, with comment style adapted
+to the language as needed:
+
+```
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+```
+
+### Code Reviews
+
+All submissions, including submissions by project members, require review. We
+use GitHub pull requests for this purpose. Consult
+[GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
+information on using pull requests.
+
+### Contribution philosophy
+
+  * Prefer small changes, even if they don't implement a complete feature. Small
+  changes are easier to review and can be submitted faster. Think about what's
+  the smallest unit you can send that makes sense to review and submit in
+  isolation. For example, new modules that are not yet used by the tools but
+  have their own unittests are ok. If you have unrelated changes that
+  you discovered while working on something else, please send them in a
+  different Pull Request. If your are refactoring code and changing
+  functionality try to send the refactor first without any change in
+  functionality. Reviewers may ask you to split a Pull Request and it is
+  easier to create a smaller change from the beginning.
+
+  * Describe your commits. Add a meaningful description to your commit message, explain what you are changing if it is not trivially obvious, but more importantly explain *why* you are making those changes. For example "Fix
+  build" is not a good commit message, describe what build and if it makes sense
+  why is this fixing it or why was it failing without this. It is very likely
+  that people far in the future without any context you have right now will be
+  looking at your commit trying to figure out why was the change introduced. If
+  related to an issue in this or another repository include a link to it.
+
+  * Code Style: We follow the [Google C++ Coding
+  Style](https://google.github.io/styleguide/cppguide.html). A
+  [clang-format](https://clang.llvm.org/docs/ClangFormat.html) configuration
+  file is available to automatically format your code, you can invoke it with
+  the `./ci.sh lint` helper tool.
+
+  * Testing: Test your change and explain in the commit message *how* your
+  commit was tested. For example adding unittests or in some cases just testing
+  with the existing ones is enough. In any case, mention what testing was
+  performed so reviewers can evaluate whether that's enough testing. In many
+  cases, testing that the Continuous Integration workflow passes is enough.
+
+  * Make one commit per Pull Request / review, unless there's a good reason not
+  to. If you have multiple changes send multiple Pull Requests and each one can
+  have its own review.
+
+  * When addressing comments from reviewers prefer to squash or fixup your
+  edits and force-push your commit. When merging changes into the repository we
+  don't want to include the history of code review back and forth changes or
+  typos. Reviewers can click on the "force-pushed" automatic comment on a Pull
+  Request to see the changes between versions. We use "Rebase and merge" policy
+  to keep a linear git history which is easier to reason about.
+
+  * Your change must pass the build and test workflows. There's a `ci.sh` script
+  to help building and testing these configurations. See [building and
+  testing](doc/building_and_testing.md) for more details.
+
+### Contributing checklist.
+
+  * Sign the CLA (only needed once per user, see above).
+
+  * AUTHORS: If this is your first contribution, add your name or your
+  company name to the [AUTHORS](AUTHORS) file for copyright tracking purposes.
+
+  * Style guide. Check `./ci.sh lint`.
+
+  * Meaningful commit description: What and *why*, links to issues, testing
+  procedure.
+
+  * Squashed multiple edits into a single commit.
+
+  * Upload your changes to your fork and [create a Pull
+  Request](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request).
+
+# Community Guidelines
+
+This project follows [Google's Open Source Community
+Guidelines](https://opensource.google.com/conduct/).
diff --git a/third_party/jpeg-xl/CONTRIBUTORS b/third_party/jpeg-xl/CONTRIBUTORS
new file mode 100644
index 0000000000..848096f921
--- /dev/null
+++ b/third_party/jpeg-xl/CONTRIBUTORS
@@ -0,0 +1,23 @@
+# This files lists individuals who made significant contributions to the JPEG XL
+# code base, such as design, adding features, performing experiments, ...
+# Small changes such as a small bugfix or fixing spelling errors are not
+# included. If you'd like to be included in this file thanks to a significant
+# contribution, feel free to send a pull request changing this file.
+Alex Deymo
+Alexander Rhatushnyak
+Evgenii Kliuchnikov
+Iulia-Maria Comșa
+Jan Wassenberg
+Jon Sneyers
+Jyrki Alakuijala
+Krzysztof Potempa
+Lode Vandevenne
+Luca Versari
+Martin Bruse
+Moritz Firsching
+Renata Khasanova
+Robert Obryk
+Sami Boukortt
+Sebastian Gomez-Gonzalez
+Thomas Fischbacher
+Zoltan Szabadka
diff --git a/third_party/jpeg-xl/LICENSE b/third_party/jpeg-xl/LICENSE
new file mode 100644
index 0000000000..c66034b105
--- /dev/null
+++ b/third_party/jpeg-xl/LICENSE
@@ -0,0 +1,27 @@
+Copyright (c) the JPEG XL Project Authors.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+   contributors may be used to endorse or promote products derived from
+   this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/third_party/jpeg-xl/PATENTS b/third_party/jpeg-xl/PATENTS
new file mode 100644
index 0000000000..c95b8f4105
--- /dev/null
+++ b/third_party/jpeg-xl/PATENTS
@@ -0,0 +1,22 @@
+Additional IP Rights Grant (Patents)
+
+"This implementation" means the copyrightable works distributed by
+Google as part of the JPEG XL project.
+
+Google hereby grants to You a perpetual, worldwide, non-exclusive,
+no-charge, royalty-free, irrevocable (except as stated in this section)
+patent license to make, have made, use, offer to sell, sell, import,
+transfer and otherwise run, modify and propagate the contents of this
+implementation of JPEG XL, where such license applies only to those patent
+claims, both currently owned or controlled by Google and acquired in
+the future, licensable by Google that are necessarily infringed by this
+implementation of JPEG XL.  This grant does not include claims that would be
+infringed only as a consequence of further modification of this
+implementation.  If you or your agent or exclusive licensee institute or
+order or agree to the institution of patent litigation against any
+entity (including a cross-claim or counterclaim in a lawsuit) alleging
+that this implementation of JPEG XL or any code incorporated within this
+implementation of JPEG XL constitutes direct or contributory patent
+infringement, or inducement of patent infringement, then any patent
+rights granted to you under this License for this implementation of JPEG XL
+shall terminate as of the date such litigation is filed.
diff --git a/third_party/jpeg-xl/README.md b/third_party/jpeg-xl/README.md
new file mode 100644
index 0000000000..1e9a9adbd1
--- /dev/null
+++ b/third_party/jpeg-xl/README.md
@@ -0,0 +1,133 @@
+# JPEG XL reference implementation
+
+[![Build/Test](https://github.com/libjxl/libjxl/actions/workflows/build_test.yml/badge.svg)](
+https://github.com/libjxl/libjxl/actions/workflows/build_test.yml)
+[![Build/Test Cross](https://github.com/libjxl/libjxl/actions/workflows/build_test_cross.yml/badge.svg)](
+https://github.com/libjxl/libjxl/actions/workflows/build_test_cross.yml)
+[![Conformance](https://github.com/libjxl/libjxl/actions/workflows/conformance.yml/badge.svg)](
+https://github.com/libjxl/libjxl/actions/workflows/conformance.yml)
+[![CIFuzz](https://github.com/libjxl/libjxl/actions/workflows/fuzz.yml/badge.svg)](
+https://github.com/libjxl/libjxl/actions/workflows/fuzz.yml)
+[![Releases](https://github.com/libjxl/libjxl/actions/workflows/release.yaml/badge.svg)](
+https://github.com/libjxl/libjxl/actions/workflows/release.yaml)
+[![Doc](https://readthedocs.org/projects/libjxl/badge/?version=latest)](
+https://libjxl.readthedocs.io/en/latest/?badge=latest)
+[![codecov](https://codecov.io/gh/libjxl/libjxl/branch/main/graph/badge.svg)](
+https://codecov.io/gh/libjxl/libjxl)
+
+<img src="doc/jxl.svg" width="100" align="right" alt="JXL logo">
+
+This repository contains a reference implementation of JPEG XL (encoder and
+decoder), called `libjxl`. This software library is
+[used by many applications that support JPEG XL](doc/software_support.md).
+
+JPEG XL was standardized in 2022 as [ISO/IEC 18181](https://jpeg.org/jpegxl/workplan.html).
+The [core codestream](doc/format_overview.md#codestream-features) is specified in 18181-1,
+the [file format](doc/format_overview.md#file-format-features) in 18181-2.
+[Decoder conformance](https://github.com/libjxl/conformance) is defined in 18181-3,
+and 18181-4 is the [reference software](https://github.com/libjxl/libjxl).
+
+The library API, command line options, and tools in this repository are subject
+to change, however files encoded with `cjxl` conform to the JPEG XL specification
+and can be decoded with current and future `djxl` decoders or the `libjxl` decoding library.
+
+## Installation
+
+In most Linux distributions, installing `libjxl` is just a matter of using the package management system.
+For example in Debian-based distributions: `apt install libjxl-tools` will install `cjxl` and `djxl`
+and other tools like `benchmark_xl` are available in the package `libjxl-devtools`.
+On MacOS, you can use [Homebrew](https://brew.sh/): `brew install jpeg-xl`.
+
+[![libjxl packaging status](https://repology.org/badge/vertical-allrepos/libjxl.svg?exclude_unsupported=1&columns=3&exclude_sources=modules,site&header=libjxl%20packaging%20status)](https://repology.org/project/libjxl/versions)
+
+From the [releases page](https://github.com/libjxl/libjxl/releases/) the following can be downloaded:
+ - Windows binaries 
+ - Debian and Ubuntu .deb packages 
+
+Of course you can also [build libjxl from sources](BUILDING.md).
+
+
+## Usage
+
+To encode a source image to JPEG XL with default settings:
+
+```bash
+cjxl input.png output.jxl
+```
+
+The desired visual fidelity can be selected using the `--distance` parameter
+(in units of just-noticeable difference, where 0 is lossless and the most useful lossy range is 0.5 .. 3.0),
+or using `--quality` (on a scale from 0 to 100, roughly matching libjpeg).
+The [encode effort](doc/encode_effort.md) can be selected using the `--effort` parameter.
+
+For more settings run `cjxl --help` or for a full list of options
+run `cjxl -v -v --help`.
+
+To decode a JPEG XL file run:
+
+```bash
+djxl input.jxl output.png
+```
+
+When possible `cjxl`/`djxl` are able to read/write the following
+image formats: .exr, .gif, .jpeg/.jpg, .pfm, .pgm/.ppm, .pgx, .png.
+Specifically for JPEG files, the default `cjxl` behavior is to apply lossless
+recompression and the default `djxl` behavior is to reconstruct the original
+JPEG file (when the extension of the output file is .jpg).
+
+### Benchmarking
+
+For speed benchmarks on single images in single or multi-threaded decoding
+`djxl` can print decoding speed information. See `djxl --help` for details
+on the decoding options and note that the output image is optional for
+benchmarking purposes.
+
+For more comprehensive benchmarking options, see the
+[benchmarking guide](doc/benchmarking.md).
+
+### Library API
+
+Besides the `libjxl` library [API documentation](https://libjxl.readthedocs.io/en/latest/),
+there are [example applications](examples/) and [plugins](plugins/) that can be used as a reference or
+starting point for developers who wish to integrate `libjxl` in their project.
+
+
+## License
+
+This software is available under a 3-clause BSD license which can be found in
+the [LICENSE](LICENSE) file, with an "Additional IP Rights Grant" as outlined in
+the [PATENTS](PATENTS) file.
+
+Please note that the PATENTS file only mentions Google since Google is the legal
+entity receiving the Contributor License Agreements (CLA) from all contributors
+to the JPEG XL Project, including the initial main contributors to the JPEG XL
+format: Cloudinary and Google.
+
+## Additional documentation
+
+### Codec description
+
+*   [JPEG XL Format Overview](doc/format_overview.md)
+*   [Introductory paper](https://www.spiedigitallibrary.org/proceedings/Download?fullDOI=10.1117%2F12.2529237) (open-access)
+*   [XL Overview](doc/xl_overview.md) - a brief introduction to the source code modules
+*   [JPEG XL white paper](https://ds.jpeg.org/whitepapers/jpeg-xl-whitepaper.pdf)
+*   [JPEG XL official website](https://jpeg.org/jpegxl)
+*   [JPEG XL community website](https://jpegxl.info)
+
+### Development process
+
+*   [More information on testing/build options](doc/building_and_testing.md)
+*   [Git guide for JPEG XL](doc/developing_in_github.md) - for developers
+*   [Fuzzing](doc/fuzzing.md) - for developers
+*   [Building Web Assembly artifacts](doc/building_wasm.md)
+*   [Test coverage on Codecov.io](https://app.codecov.io/gh/libjxl/libjxl) - for
+    developers
+*   [libjxl documentation on readthedocs.io](https://libjxl.readthedocs.io/)
+
+### Contact
+
+If you encounter a bug or other issue with the software, please open an Issue here.
+
+There is a [subreddit about JPEG XL](https://www.reddit.com/r/jpegxl/), and
+informal chatting with developers and early adopters of `libjxl` can be done on the
+[JPEG XL Discord server](https://discord.gg/DqkQgDRTFu).
diff --git a/third_party/jpeg-xl/SECURITY.md b/third_party/jpeg-xl/SECURITY.md
new file mode 100644
index 0000000000..d03012a63a
--- /dev/null
+++ b/third_party/jpeg-xl/SECURITY.md
@@ -0,0 +1,73 @@
+# Security and Vulnerability Policy for libjxl
+
+## TL;DR:
+
+CPE prefix: `cpe:2.3:a:libjxl_project:libjxl`
+
+To report a security issue, please email libjxl-security@google.com.
+
+Include in your email a description of the issue, the steps you took to create
+the issue, affected versions, and if known, mitigations for the issue. Our
+vulnerability management team will acknowledge receiving your email within 3
+working days.
+
+This project follows a 90 day disclosure timeline.
+
+For all other bugs, where there are no security implications about disclosing
+the unpatched bug, open a [new issue](https://github.com/libjxl/libjxl/issues)
+checking first for existing similar issues. If in doubt about the security
+impact of a bug you discovered, email first.
+
+## Policy overview
+
+libjxl's Security Policy is based on the [Google Open Source program
+guidelines](https://github.com/google/oss-vulnerability-guide) for coordinated
+vulnerability disclosure.
+
+Early versions of `libjxl` had a different security policy that didn't provide
+security and vulnerability disclosure support. Versions up to and including
+0.3.7 are not covered and won't receive any security advisory.
+
+Only released versions, starting from version 0.5, are covered by this policy.
+Development branches, arbitrary commits from `main` branch or even releases with
+backported features externally patched on top are not covered. Only those
+versions with a release tag in `libjxl`'s repository are covered, starting from
+version 0.5.
+
+## What's a "Security bug"
+
+A security bug is a bug that can potentially be exploited to let an attacker
+gain unauthorized access or privileges such as disclosing information or
+arbitrary code execution. Not all fuzzer-found bugs and not all assert()
+failures are considered security bugs in libjxl. For a detailed explanation and
+examples see our [Security Vulnerabilities Playbook](doc/vuln_playbook.md).
+
+## What to expect
+
+To report a security issue, please email libjxl-security@google.com with all the
+details about the bug you encountered.
+
+ * Include a description of the issue, steps to reproduce, etc. Compiler
+   versions, flags, exact version used and even CPU are often relevant given our
+   usage of SIMD and run-time dispatch of SIMD instructions.
+
+ * A member of our security team will reply to you within 3 business days. Note
+   that business days are different in different countries.
+
+ * We will evaluate the issue and we may require more input from your side to
+   reproduce it.
+
+ * If the issue fits in the description of a security bug, we will issue a
+   CVE, publish a fix and make a new minor or patch release with it. There is
+   a maximum of 90 day disclosure timeline, we ask you to not publish the
+   details before the 90 day deadline or the release date (whichever comes
+   first).
+
+ * In the case that we publish a CVE we will credit the external researcher who
+   reported the issue. When reporting security issues please let us know if you
+   need to include specific information while doing so, like for example a
+   company affiliation.
+
+Our security team follows the [Security Vulnerabilities
+Playbook](doc/vuln_playbook.md). For more details about the process and policies
+please take a look at it.
diff --git a/third_party/jpeg-xl/WORKSPACE b/third_party/jpeg-xl/WORKSPACE
new file mode 100644
index 0000000000..f0c63df47d
--- /dev/null
+++ b/third_party/jpeg-xl/WORKSPACE
@@ -0,0 +1,742 @@
+load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
+load("@bazel_tools//tools/build_defs/repo:git.bzl", "git_repository", "new_git_repository")
+
+http_archive(
+    name = "bazel_skylib",
+    sha256 = "74d544d96f4a5bb630d465ca8bbcfe231e3594e5aae57e1edbf17a6eb3ca2506",
+    urls = [
+        "https://mirror.bazel.build/github.com/bazelbuild/bazel-skylib/releases/download/1.3.0/bazel-skylib-1.3.0.tar.gz",
+        "https://github.com/bazelbuild/bazel-skylib/releases/download/1.3.0/bazel-skylib-1.3.0.tar.gz",
+    ],
+)
+
+load("@bazel_skylib//:workspace.bzl", "bazel_skylib_workspace")
+
+bazel_skylib_workspace()
+
+local_repository(
+    name = "highway",
+    path = "third_party/highway",
+)
+
+local_repository(
+    name = "brotli",
+    path = "third_party/brotli",
+)
+
+new_local_repository(
+    name = "googletest",
+    build_file = "third_party/googletest/BUILD.bazel",
+    path = "third_party/googletest",
+)
+
+new_local_repository(
+    name = "skcms",
+    build_file_content = """
+cc_library(
+    name = "skcms",
+    srcs = [
+        "skcms.cc",
+        "skcms_internal.h",
+        "src/Transform_inl.h",
+    ],
+    hdrs = ["skcms.h"],
+    visibility = ["//visibility:public"],
+)
+    """,
+    path = "third_party/skcms",
+)
+
+new_git_repository(
+    name = "zlib",
+    build_file_content = """
+cc_library(
+    name = "zlib",
+    defines = ["HAVE_UNISTD_H"],
+    srcs = [
+        "adler32.c",
+        "compress.c",
+        "crc32.c",
+        "crc32.h",
+        "deflate.c",
+        "deflate.h",
+        "gzclose.c",
+        "gzguts.h",
+        "gzlib.c",
+        "gzread.c",
+        "gzwrite.c",
+        "infback.c",
+        "inffast.c",
+        "inffast.h",
+        "inffixed.h",
+        "inflate.c",
+        "inflate.h",
+        "inftrees.c",
+        "inftrees.h",
+        "trees.c",
+        "trees.h",
+        "uncompr.c",
+        "zconf.h",
+        "zutil.c",
+        "zutil.h",
+    ],
+    hdrs = ["zlib.h"],
+    includes = ["."],
+    visibility = ["//visibility:public"],
+)
+    """,
+    remote = "https://github.com/madler/zlib",
+    tag = "v1.2.13",
+)
+
+new_local_repository(
+    name = "png",
+    build_file_content = """
+genrule(
+    name = "pnglibconf",
+    srcs = ["scripts/pnglibconf.h.prebuilt"],
+    outs = ["pnglibconf.h"],
+    cmd = "cp -f $< $@",
+)
+cc_library(
+    name = "png",
+    srcs = [
+        "png.c",
+        "pngconf.h",
+        "pngdebug.h",
+        "pngerror.c",
+        "pngget.c",
+        "pnginfo.h",
+        ":pnglibconf",
+        "pngmem.c",
+        "pngpread.c",
+        "pngpriv.h",
+        "pngread.c",
+        "pngrio.c",
+        "pngrtran.c",
+        "pngrutil.c",
+        "pngset.c",
+        "pngstruct.h",
+        "pngtrans.c",
+        "pngwio.c",
+        "pngwrite.c",
+        "pngwtran.c",
+        "pngwutil.c",
+    ],
+    hdrs = ["png.h"],
+    includes = ["."],
+    linkopts = ["-lm"],
+    visibility = ["//visibility:public"],
+    deps = ["@zlib//:zlib"],
+)
+    """,
+    path = "third_party/libpng",
+)
+
+new_git_repository(
+    name = "libjpeg_turbo",
+    build_file_content = """
+load("@bazel_skylib//rules:expand_template.bzl", "expand_template")
+SUBSTITUTIONS = {
+    "@BITS_IN_JSAMPLE@" : "8",
+    "@BUILD@" : "20230125",
+    "@CMAKE_PROJECT_NAME@" : "libjpeg-turbo",
+    "@COPYRIGHT_YEAR@" : "2023",
+    "@INLINE@" : "__inline__",
+    "@JPEG_LIB_VERSION@" : "62",
+    "@LIBJPEG_TURBO_VERSION_NUMBER@" : "2001005",
+    "@SIZE_T@" : "8",
+    "@THREAD_LOCAL@" : "__thread",
+    "@VERSION@" : "2.1.5",
+}
+YES_DEFINES = [
+    "C_ARITH_CODING_SUPPORTED", "D_ARITH_CODING_SUPPORTED",
+    "MEM_SRCDST_SUPPORTED", "HAVE_LOCALE_H", "HAVE_STDDEF_H", "HAVE_STDLIB_H",
+    "NEED_SYS_TYPES_H", "HAVE_UNSIGNED_CHAR", "HAVE_UNSIGNED_SHORT",
+    "HAVE_BUILTIN_CTZL"
+]
+NO_DEFINES = [
+    "WITH_SIMD", "NEED_BSD_STRINGS", "INCOMPLETE_TYPES_BROKEN",
+    "RIGHT_SHIFT_IS_UNSIGNED", "HAVE_INTRIN_H"
+]
+SUBSTITUTIONS.update({
+    "#cmakedefine " + key : "#define " + key for key in YES_DEFINES
+})
+SUBSTITUTIONS.update({
+    "#cmakedefine " + key : "// #define " + key for key in NO_DEFINES
+})
+[
+    expand_template(
+        name = "expand_" + src,
+        template = src + ".in",
+        out = src,
+        substitutions = SUBSTITUTIONS,
+    ) for src in ["jconfig.h", "jconfigint.h", "jversion.h"]
+]
+cc_library(
+    name = "jpeg",
+    srcs = [
+        "jaricom.c",
+        "jcapimin.c",
+        "jcapistd.c",
+        "jcarith.c",
+        "jccoefct.c",
+        "jccolor.c",
+        "jcdctmgr.c",
+        "jchuff.c",
+        "jchuff.h",
+        "jcicc.c",
+        "jcinit.c",
+        "jcmainct.c",
+        "jcmarker.c",
+        "jcmaster.c",
+        "jcomapi.c",
+        "jconfig.h",
+        "jconfigint.h",
+        "jcparam.c",
+        "jcphuff.c",
+        "jcprepct.c",
+        "jcsample.c",
+        "jctrans.c",
+        "jdapimin.c",
+        "jdapistd.c",
+        "jdarith.c",
+        "jdatadst.c",
+        "jdatasrc.c",
+        "jdcoefct.c",
+        "jdcoefct.h",
+        "jdcolor.c",
+        "jdct.h",
+        "jddctmgr.c",
+        "jdhuff.c",
+        "jdhuff.h",
+        "jdicc.c",
+        "jdinput.c",
+        "jdmainct.c",
+        "jdmainct.h",
+        "jdmarker.c",
+        "jdmaster.c",
+        "jdmaster.h",
+        "jdmerge.c",
+        "jdmerge.h",
+        "jdphuff.c",
+        "jdpostct.c",
+        "jdsample.c",
+        "jdsample.h",
+        "jdtrans.c",
+        "jerror.c",
+        "jerror.h",
+        "jfdctflt.c",
+        "jfdctfst.c",
+        "jfdctint.c",
+        "jidctflt.c",
+        "jidctfst.c",
+        "jidctint.c",
+        "jidctred.c",
+        "jinclude.h",
+        "jmemmgr.c",
+        "jmemnobs.c",
+        "jmemsys.h",
+        "jmorecfg.h",
+        "jpeg_nbits_table.h",
+        "jpegcomp.h",
+        "jpegint.h",
+        "jpeglib.h",
+        "jquant1.c",
+        "jquant2.c",
+        "jsimd_none.c",
+        "jsimd.h",
+        "jsimddct.h",
+        "jutils.c",
+        "jversion.h",
+    ],
+    hdrs = [
+        "jccolext.c",
+        "jdcol565.c",
+        "jdcolext.c",
+        "jdmrg565.c",
+        "jdmrgext.c",
+        "jerror.h",
+        "jinclude.h",
+        "jpeglib.h",
+        "jstdhuff.c",
+    ],
+    includes = ["."],
+    visibility = ["//visibility:public"],
+)
+    """,
+    remote = "https://github.com/libjpeg-turbo/libjpeg-turbo.git",
+    tag = "2.1.4",
+)
+
+http_archive(
+    name = "gif",
+    build_file_content = """
+cc_library(
+    name = "gif",
+    srcs = [
+        "dgif_lib.c", "egif_lib.c", "gifalloc.c", "gif_err.c", "gif_font.c",
+        "gif_hash.c", "openbsd-reallocarray.c", "gif_hash.h",
+        "gif_lib_private.h"
+    ],
+    hdrs = ["gif_lib.h"],
+    includes = ["."],
+    visibility = ["//visibility:public"],
+)
+    """,
+    sha256 = "31da5562f44c5f15d63340a09a4fd62b48c45620cd302f77a6d9acf0077879bd",
+    strip_prefix = "giflib-5.2.1",
+    url = "https://netcologne.dl.sourceforge.net/project/giflib/giflib-5.2.1.tar.gz",
+)
+
+new_git_repository(
+    name = "imath",
+    build_file_content = """
+load("@bazel_skylib//rules:expand_template.bzl", "expand_template")
+SUBSTITUTIONS = {
+    "@IMATH_INTERNAL_NAMESPACE@": "Imath_3_1",
+    "@IMATH_LIB_VERSION@": "3.1.4",
+    "@IMATH_NAMESPACE_CUSTOM@": "0",
+    "@IMATH_NAMESPACE@": "Imath",
+    "@IMATH_PACKAGE_NAME@": "Imath 3.1.4",
+    "@IMATH_VERSION_MAJOR@": "3",
+    "@IMATH_VERSION_MINOR@": "1",
+    "@IMATH_VERSION_PATCH@": "4",
+    "@IMATH_VERSION@": "3.1.4",
+}
+YES_DEFINES = [
+    "IMATH_HALF_USE_LOOKUP_TABLE", "IMATH_ENABLE_API_VISIBILITY",
+]
+NO_DEFINES = [
+    "IMATH_HAVE_LARGE_STACK",
+]
+ONE_DEFINES = [
+    "IMATH_USE_NOEXCEPT",
+]
+SUBSTITUTIONS.update({
+    "#cmakedefine " + key : "#define " + key for key in YES_DEFINES
+})
+SUBSTITUTIONS.update({
+    "#cmakedefine " + key : "// #define " + key for key in NO_DEFINES
+})
+SUBSTITUTIONS.update({
+    "#cmakedefine01 " + key : "#define " + key + " 1" for key in ONE_DEFINES
+})
+expand_template(
+    name = "expand_ImathConfig",
+    template = "config/ImathConfig.h.in",
+    out = "src/Imath/ImathConfig.h",
+    substitutions = SUBSTITUTIONS,
+)
+cc_library(
+    name = "Imath",
+    srcs = [
+        "src/Imath/ImathColorAlgo.cpp",
+        ":src/Imath/ImathConfig.h",
+        "src/Imath/ImathFun.cpp",
+        "src/Imath/ImathMatrixAlgo.cpp",
+        "src/Imath/ImathRandom.cpp",
+        "src/Imath/half.cpp",
+        "src/Imath/toFloat.h",
+    ],
+    hdrs = [
+        "src/Imath/ImathBox.h",
+        "src/Imath/ImathBoxAlgo.h",
+        "src/Imath/ImathColor.h",
+        "src/Imath/ImathColorAlgo.h",
+        "src/Imath/ImathEuler.h",
+        "src/Imath/ImathExport.h",
+        "src/Imath/ImathForward.h",
+        "src/Imath/ImathFrame.h",
+        "src/Imath/ImathFrustum.h",
+        "src/Imath/ImathFrustumTest.h",
+        "src/Imath/ImathFun.h",
+        "src/Imath/ImathGL.h",
+        "src/Imath/ImathGLU.h",
+        "src/Imath/ImathInt64.h",
+        "src/Imath/ImathInterval.h",
+        "src/Imath/ImathLine.h",
+        "src/Imath/ImathLineAlgo.h",
+        "src/Imath/ImathMath.h",
+        "src/Imath/ImathMatrix.h",
+        "src/Imath/ImathMatrixAlgo.h",
+        "src/Imath/ImathNamespace.h",
+        "src/Imath/ImathPlane.h",
+        "src/Imath/ImathPlatform.h",
+        "src/Imath/ImathQuat.h",
+        "src/Imath/ImathRandom.h",
+        "src/Imath/ImathRoots.h",
+        "src/Imath/ImathShear.h",
+        "src/Imath/ImathSphere.h",
+        "src/Imath/ImathTypeTraits.h",
+        "src/Imath/ImathVec.h",
+        "src/Imath/ImathVecAlgo.h",
+        "src/Imath/half.h",
+        "src/Imath/halfFunction.h",
+        "src/Imath/halfLimits.h",
+    ],
+    includes = ["src/Imath"],
+    visibility = ["//visibility:public"],
+)
+""",
+    remote = "https://github.com/AcademySoftwareFoundation/imath",
+    tag = "v3.1.5",
+)
+
+new_git_repository(
+    name = "openexr",
+    build_file_content = """
+load("@bazel_skylib//rules:expand_template.bzl", "expand_template")
+SUBSTITUTIONS = {
+    "@IEX_INTERNAL_NAMESPACE@": "Iex_3_0",
+    "@IEX_NAMESPACE_CUSTOM@": "0",
+    "@IEX_NAMESPACE@": "Iex",
+    "@ILMTHREAD_INTERNAL_NAMESPACE@": "IlmThread_3_0",
+    "@ILMTHREAD_NAMESPACE_CUSTOM@": "0",
+    "@ILMTHREAD_NAMESPACE@": "IlmThread",
+    "@OPENEXR_IMF_NAMESPACE@": "Imf",
+    "@OPENEXR_INTERNAL_IMF_NAMESPACE@": "Imf_3_0",
+    "@OPENEXR_LIB_VERSION@": "3.0.4",
+    "@OPENEXR_NAMESPACE_CUSTOM@": "0",
+    "@OPENEXR_PACKAGE_NAME@": "OpenEXR 3.0.4",
+    "@OPENEXR_VERSION_EXTRA@": "",
+    "@OPENEXR_VERSION_MAJOR@": "3",
+    "@OPENEXR_VERSION_MINOR@": "0",
+    "@OPENEXR_VERSION_PATCH@": "4",
+    "@OPENEXR_VERSION@": "3.0.4",
+}
+YES_DEFINES = [
+    "OPENEXR_ENABLE_API_VISIBILITY", "OPENEXR_IMF_HAVE_COMPLETE_IOMANIP",
+    "OPENEXR_HAVE_LARGE_STACK",
+]
+NO_DEFINES = [
+    "HAVE_UCONTEXT_H", "IEX_HAVE_CONTROL_REGISTER_SUPPORT",
+    "IEX_HAVE_SIGCONTEXT_CONTROL_REGISTER_SUPPORT", "OPENEXR_IMF_HAVE_DARWIN",
+    "OPENEXR_IMF_HAVE_GCC_INLINE_ASM_AVX", "OPENEXR_IMF_HAVE_LINUX_PROCFS",
+    "OPENEXR_IMF_HAVE_SYSCONF_NPROCESSORS_ONLN",
+]
+ONE_DEFINES = [
+    "ILMTHREAD_THREADING_ENABLED",
+]
+ZERO_DEFINES = [
+    "ILMTHREAD_HAVE_POSIX_SEMAPHORES",
+]
+SUBSTITUTIONS.update({
+    "#cmakedefine " + key : "#define " + key for key in YES_DEFINES
+})
+SUBSTITUTIONS.update({
+    "#cmakedefine " + key : "// #define " + key for key in NO_DEFINES
+})
+SUBSTITUTIONS.update({
+    "#cmakedefine01 " + key : "#define " + key + " 1" for key in ONE_DEFINES
+})
+SUBSTITUTIONS.update({
+    "#cmakedefine01 " + key : "#define " + key + " 0" for key in ZERO_DEFINES
+})
+[
+    expand_template(
+        name = "expand_" + item,
+        template = "cmake/" + item + ".h.in",
+        out = "src/lib/Iex/" + item + ".h",
+        substitutions = SUBSTITUTIONS,
+    ) for item in ["IexConfig", "IexConfigInternal"]
+]
+[
+expand_template(
+        name = "expand_" + item,
+        template = "cmake/" + item + ".h.in",
+        out = "src/lib/IlmThread/" + item + ".h",
+        substitutions = SUBSTITUTIONS,
+    ) for item in ["IlmThreadConfig"]
+]
+[
+expand_template(
+        name = "expand_" + item,
+        template = "cmake/" + item + ".h.in",
+        out = "src/lib/OpenEXR/" + item + ".h",
+        substitutions = SUBSTITUTIONS,
+    ) for item in ["OpenEXRConfig", "OpenEXRConfigInternal"]
+]
+cc_library(
+    name = "Iex",
+    srcs = [
+        "src/lib/Iex/IexBaseExc.cpp",
+        "src/lib/Iex/IexMathFloatExc.cpp",
+        "src/lib/Iex/IexMathFpu.cpp",
+        "src/lib/Iex/IexThrowErrnoExc.cpp",
+    ],
+    hdrs = [
+        "src/lib/Iex/Iex.h",
+        "src/lib/Iex/IexBaseExc.h",
+        ":src/lib/Iex/IexConfig.h",
+        ":src/lib/Iex/IexConfigInternal.h",
+        "src/lib/Iex/IexErrnoExc.h",
+        "src/lib/Iex/IexExport.h",
+        "src/lib/Iex/IexForward.h",
+        "src/lib/Iex/IexMacros.h",
+        "src/lib/Iex/IexMathExc.h",
+        "src/lib/Iex/IexMathFloatExc.h",
+        "src/lib/Iex/IexMathFpu.h",
+        "src/lib/Iex/IexMathIeeeExc.h",
+        "src/lib/Iex/IexNamespace.h",
+        "src/lib/Iex/IexThrowErrnoExc.h",
+        ":src/lib/OpenEXR/OpenEXRConfig.h",
+    ],
+    includes = [
+        "src/lib/Iex",
+        "src/lib/OpenEXR",
+    ],
+)
+
+cc_library(
+    name = "IlmThread",
+    srcs = [
+        "src/lib/IlmThread/IlmThread.cpp",
+        "src/lib/IlmThread/IlmThreadPool.cpp",
+        "src/lib/IlmThread/IlmThreadSemaphore.cpp",
+        "src/lib/IlmThread/IlmThreadSemaphoreOSX.cpp",
+        "src/lib/IlmThread/IlmThreadSemaphorePosix.cpp",
+        "src/lib/IlmThread/IlmThreadSemaphorePosixCompat.cpp",
+        "src/lib/IlmThread/IlmThreadSemaphoreWin32.cpp",
+    ],
+    hdrs = [
+        "src/lib/IlmThread/IlmThread.h",
+        ":src/lib/IlmThread/IlmThreadConfig.h",
+        "src/lib/IlmThread/IlmThreadExport.h",
+        "src/lib/IlmThread/IlmThreadForward.h",
+        "src/lib/IlmThread/IlmThreadMutex.h",
+        "src/lib/IlmThread/IlmThreadNamespace.h",
+        "src/lib/IlmThread/IlmThreadPool.h",
+        "src/lib/IlmThread/IlmThreadSemaphore.h",
+    ],
+    includes = ["src/lib/IlmThread"],
+    deps = [":Iex"],
+)
+cc_library(
+    name = "OpenEXR",
+    srcs = [
+        "src/lib/OpenEXR/ImfAcesFile.cpp",
+        "src/lib/OpenEXR/ImfAttribute.cpp",
+        "src/lib/OpenEXR/ImfB44Compressor.cpp",
+        "src/lib/OpenEXR/ImfBoxAttribute.cpp",
+        "src/lib/OpenEXR/ImfCRgbaFile.cpp",
+        "src/lib/OpenEXR/ImfChannelList.cpp",
+        "src/lib/OpenEXR/ImfChannelListAttribute.cpp",
+        "src/lib/OpenEXR/ImfChromaticities.cpp",
+        "src/lib/OpenEXR/ImfChromaticitiesAttribute.cpp",
+        "src/lib/OpenEXR/ImfCompositeDeepScanLine.cpp",
+        "src/lib/OpenEXR/ImfCompressionAttribute.cpp",
+        "src/lib/OpenEXR/ImfCompressor.cpp",
+        "src/lib/OpenEXR/ImfConvert.cpp",
+        "src/lib/OpenEXR/ImfDeepCompositing.cpp",
+        "src/lib/OpenEXR/ImfDeepFrameBuffer.cpp",
+        "src/lib/OpenEXR/ImfDeepImageStateAttribute.cpp",
+        "src/lib/OpenEXR/ImfDeepScanLineInputFile.cpp",
+        "src/lib/OpenEXR/ImfDeepScanLineInputPart.cpp",
+        "src/lib/OpenEXR/ImfDeepScanLineOutputFile.cpp",
+        "src/lib/OpenEXR/ImfDeepScanLineOutputPart.cpp",
+        "src/lib/OpenEXR/ImfDeepTiledInputFile.cpp",
+        "src/lib/OpenEXR/ImfDeepTiledInputPart.cpp",
+        "src/lib/OpenEXR/ImfDeepTiledOutputFile.cpp",
+        "src/lib/OpenEXR/ImfDeepTiledOutputPart.cpp",
+        "src/lib/OpenEXR/ImfDoubleAttribute.cpp",
+        "src/lib/OpenEXR/ImfDwaCompressor.cpp",
+        "src/lib/OpenEXR/ImfEnvmap.cpp",
+        "src/lib/OpenEXR/ImfEnvmapAttribute.cpp",
+        "src/lib/OpenEXR/ImfFastHuf.cpp",
+        "src/lib/OpenEXR/ImfFloatAttribute.cpp",
+        "src/lib/OpenEXR/ImfFloatVectorAttribute.cpp",
+        "src/lib/OpenEXR/ImfFrameBuffer.cpp",
+        "src/lib/OpenEXR/ImfFramesPerSecond.cpp",
+        "src/lib/OpenEXR/ImfGenericInputFile.cpp",
+        "src/lib/OpenEXR/ImfGenericOutputFile.cpp",
+        "src/lib/OpenEXR/ImfHeader.cpp",
+        "src/lib/OpenEXR/ImfHuf.cpp",
+        "src/lib/OpenEXR/ImfIDManifest.cpp",
+        "src/lib/OpenEXR/ImfIDManifestAttribute.cpp",
+        "src/lib/OpenEXR/ImfIO.cpp",
+        "src/lib/OpenEXR/ImfInputFile.cpp",
+        "src/lib/OpenEXR/ImfInputPart.cpp",
+        "src/lib/OpenEXR/ImfInputPartData.cpp",
+        "src/lib/OpenEXR/ImfIntAttribute.cpp",
+        "src/lib/OpenEXR/ImfKeyCode.cpp",
+        "src/lib/OpenEXR/ImfKeyCodeAttribute.cpp",
+        "src/lib/OpenEXR/ImfLineOrderAttribute.cpp",
+        "src/lib/OpenEXR/ImfLut.cpp",
+        "src/lib/OpenEXR/ImfMatrixAttribute.cpp",
+        "src/lib/OpenEXR/ImfMisc.cpp",
+        "src/lib/OpenEXR/ImfMultiPartInputFile.cpp",
+        "src/lib/OpenEXR/ImfMultiPartOutputFile.cpp",
+        "src/lib/OpenEXR/ImfMultiView.cpp",
+        "src/lib/OpenEXR/ImfOpaqueAttribute.cpp",
+        "src/lib/OpenEXR/ImfOutputFile.cpp",
+        "src/lib/OpenEXR/ImfOutputPart.cpp",
+        "src/lib/OpenEXR/ImfOutputPartData.cpp",
+        "src/lib/OpenEXR/ImfPartType.cpp",
+        "src/lib/OpenEXR/ImfPizCompressor.cpp",
+        "src/lib/OpenEXR/ImfPreviewImage.cpp",
+        "src/lib/OpenEXR/ImfPreviewImageAttribute.cpp",
+        "src/lib/OpenEXR/ImfPxr24Compressor.cpp",
+        "src/lib/OpenEXR/ImfRational.cpp",
+        "src/lib/OpenEXR/ImfRationalAttribute.cpp",
+        "src/lib/OpenEXR/ImfRgbaFile.cpp",
+        "src/lib/OpenEXR/ImfRgbaYca.cpp",
+        "src/lib/OpenEXR/ImfRle.cpp",
+        "src/lib/OpenEXR/ImfRleCompressor.cpp",
+        "src/lib/OpenEXR/ImfScanLineInputFile.cpp",
+        "src/lib/OpenEXR/ImfStandardAttributes.cpp",
+        "src/lib/OpenEXR/ImfStdIO.cpp",
+        "src/lib/OpenEXR/ImfStringAttribute.cpp",
+        "src/lib/OpenEXR/ImfStringVectorAttribute.cpp",
+        "src/lib/OpenEXR/ImfSystemSpecific.cpp",
+        "src/lib/OpenEXR/ImfTestFile.cpp",
+        "src/lib/OpenEXR/ImfThreading.cpp",
+        "src/lib/OpenEXR/ImfTileDescriptionAttribute.cpp",
+        "src/lib/OpenEXR/ImfTileOffsets.cpp",
+        "src/lib/OpenEXR/ImfTiledInputFile.cpp",
+        "src/lib/OpenEXR/ImfTiledInputPart.cpp",
+        "src/lib/OpenEXR/ImfTiledMisc.cpp",
+        "src/lib/OpenEXR/ImfTiledOutputFile.cpp",
+        "src/lib/OpenEXR/ImfTiledOutputPart.cpp",
+        "src/lib/OpenEXR/ImfTiledRgbaFile.cpp",
+        "src/lib/OpenEXR/ImfTimeCode.cpp",
+        "src/lib/OpenEXR/ImfTimeCodeAttribute.cpp",
+        "src/lib/OpenEXR/ImfVecAttribute.cpp",
+        "src/lib/OpenEXR/ImfVersion.cpp",
+        "src/lib/OpenEXR/ImfWav.cpp",
+        "src/lib/OpenEXR/ImfZip.cpp",
+        "src/lib/OpenEXR/ImfZipCompressor.cpp",
+        "src/lib/OpenEXR/b44ExpLogTable.h",
+        "src/lib/OpenEXR/dwaLookups.h",
+    ],
+    hdrs = [
+        ":src/lib/Iex/IexConfig.h",
+        ":src/lib/Iex/IexConfigInternal.h",
+        ":src/lib/IlmThread/IlmThreadConfig.h",
+        "src/lib/OpenEXR/ImfAcesFile.h",
+        "src/lib/OpenEXR/ImfArray.h",
+        "src/lib/OpenEXR/ImfAttribute.h",
+        "src/lib/OpenEXR/ImfAutoArray.h",
+        "src/lib/OpenEXR/ImfB44Compressor.h",
+        "src/lib/OpenEXR/ImfBoxAttribute.h",
+        "src/lib/OpenEXR/ImfCRgbaFile.h",
+        "src/lib/OpenEXR/ImfChannelList.h",
+        "src/lib/OpenEXR/ImfChannelListAttribute.h",
+        "src/lib/OpenEXR/ImfCheckedArithmetic.h",
+        "src/lib/OpenEXR/ImfChromaticities.h",
+        "src/lib/OpenEXR/ImfChromaticitiesAttribute.h",
+        "src/lib/OpenEXR/ImfCompositeDeepScanLine.h",
+        "src/lib/OpenEXR/ImfCompression.h",
+        "src/lib/OpenEXR/ImfCompressionAttribute.h",
+        "src/lib/OpenEXR/ImfCompressor.h",
+        "src/lib/OpenEXR/ImfConvert.h",
+        "src/lib/OpenEXR/ImfDeepCompositing.h",
+        "src/lib/OpenEXR/ImfDeepFrameBuffer.h",
+        "src/lib/OpenEXR/ImfDeepImageState.h",
+        "src/lib/OpenEXR/ImfDeepImageStateAttribute.h",
+        "src/lib/OpenEXR/ImfDeepScanLineInputFile.h",
+        "src/lib/OpenEXR/ImfDeepScanLineInputPart.h",
+        "src/lib/OpenEXR/ImfDeepScanLineOutputFile.h",
+        "src/lib/OpenEXR/ImfDeepScanLineOutputPart.h",
+        "src/lib/OpenEXR/ImfDeepTiledInputFile.h",
+        "src/lib/OpenEXR/ImfDeepTiledInputPart.h",
+        "src/lib/OpenEXR/ImfDeepTiledOutputFile.h",
+        "src/lib/OpenEXR/ImfDeepTiledOutputPart.h",
+        "src/lib/OpenEXR/ImfDoubleAttribute.h",
+        "src/lib/OpenEXR/ImfDwaCompressor.h",
+        "src/lib/OpenEXR/ImfDwaCompressorSimd.h",
+        "src/lib/OpenEXR/ImfEnvmap.h",
+        "src/lib/OpenEXR/ImfEnvmapAttribute.h",
+        "src/lib/OpenEXR/ImfExport.h",
+        "src/lib/OpenEXR/ImfFastHuf.h",
+        "src/lib/OpenEXR/ImfFloatAttribute.h",
+        "src/lib/OpenEXR/ImfFloatVectorAttribute.h",
+        "src/lib/OpenEXR/ImfForward.h",
+        "src/lib/OpenEXR/ImfFrameBuffer.h",
+        "src/lib/OpenEXR/ImfFramesPerSecond.h",
+        "src/lib/OpenEXR/ImfGenericInputFile.h",
+        "src/lib/OpenEXR/ImfGenericOutputFile.h",
+        "src/lib/OpenEXR/ImfHeader.h",
+        "src/lib/OpenEXR/ImfHuf.h",
+        "src/lib/OpenEXR/ImfIDManifest.h",
+        "src/lib/OpenEXR/ImfIDManifestAttribute.h",
+        "src/lib/OpenEXR/ImfIO.h",
+        "src/lib/OpenEXR/ImfInputFile.h",
+        "src/lib/OpenEXR/ImfInputPart.h",
+        "src/lib/OpenEXR/ImfInputPartData.h",
+        "src/lib/OpenEXR/ImfInputStreamMutex.h",
+        "src/lib/OpenEXR/ImfInt64.h",
+        "src/lib/OpenEXR/ImfIntAttribute.h",
+        "src/lib/OpenEXR/ImfKeyCode.h",
+        "src/lib/OpenEXR/ImfKeyCodeAttribute.h",
+        "src/lib/OpenEXR/ImfLineOrder.h",
+        "src/lib/OpenEXR/ImfLineOrderAttribute.h",
+        "src/lib/OpenEXR/ImfLut.h",
+        "src/lib/OpenEXR/ImfMatrixAttribute.h",
+        "src/lib/OpenEXR/ImfMisc.h",
+        "src/lib/OpenEXR/ImfMultiPartInputFile.h",
+        "src/lib/OpenEXR/ImfMultiPartOutputFile.h",
+        "src/lib/OpenEXR/ImfMultiView.h",
+        "src/lib/OpenEXR/ImfName.h",
+        "src/lib/OpenEXR/ImfNamespace.h",
+        "src/lib/OpenEXR/ImfOpaqueAttribute.h",
+        "src/lib/OpenEXR/ImfOptimizedPixelReading.h",
+        "src/lib/OpenEXR/ImfOutputFile.h",
+        "src/lib/OpenEXR/ImfOutputPart.h",
+        "src/lib/OpenEXR/ImfOutputPartData.h",
+        "src/lib/OpenEXR/ImfOutputStreamMutex.h",
+        "src/lib/OpenEXR/ImfPartHelper.h",
+        "src/lib/OpenEXR/ImfPartType.h",
+        "src/lib/OpenEXR/ImfPixelType.h",
+        "src/lib/OpenEXR/ImfPizCompressor.h",
+        "src/lib/OpenEXR/ImfPreviewImage.h",
+        "src/lib/OpenEXR/ImfPreviewImageAttribute.h",
+        "src/lib/OpenEXR/ImfPxr24Compressor.h",
+        "src/lib/OpenEXR/ImfRational.h",
+        "src/lib/OpenEXR/ImfRationalAttribute.h",
+        "src/lib/OpenEXR/ImfRgba.h",
+        "src/lib/OpenEXR/ImfRgbaFile.h",
+        "src/lib/OpenEXR/ImfRgbaYca.h",
+        "src/lib/OpenEXR/ImfRle.h",
+        "src/lib/OpenEXR/ImfRleCompressor.h",
+        "src/lib/OpenEXR/ImfScanLineInputFile.h",
+        "src/lib/OpenEXR/ImfSimd.h",
+        "src/lib/OpenEXR/ImfStandardAttributes.h",
+        "src/lib/OpenEXR/ImfStdIO.h",
+        "src/lib/OpenEXR/ImfStringAttribute.h",
+        "src/lib/OpenEXR/ImfStringVectorAttribute.h",
+        "src/lib/OpenEXR/ImfSystemSpecific.h",
+        "src/lib/OpenEXR/ImfTestFile.h",
+        "src/lib/OpenEXR/ImfThreading.h",
+        "src/lib/OpenEXR/ImfTileDescription.h",
+        "src/lib/OpenEXR/ImfTileDescriptionAttribute.h",
+        "src/lib/OpenEXR/ImfTileOffsets.h",
+        "src/lib/OpenEXR/ImfTiledInputFile.h",
+        "src/lib/OpenEXR/ImfTiledInputPart.h",
+        "src/lib/OpenEXR/ImfTiledMisc.h",
+        "src/lib/OpenEXR/ImfTiledOutputFile.h",
+        "src/lib/OpenEXR/ImfTiledOutputPart.h",
+        "src/lib/OpenEXR/ImfTiledRgbaFile.h",
+        "src/lib/OpenEXR/ImfTimeCode.h",
+        "src/lib/OpenEXR/ImfTimeCodeAttribute.h",
+        "src/lib/OpenEXR/ImfVecAttribute.h",
+        "src/lib/OpenEXR/ImfVersion.h",
+        "src/lib/OpenEXR/ImfWav.h",
+        "src/lib/OpenEXR/ImfXdr.h",
+        "src/lib/OpenEXR/ImfZip.h",
+        "src/lib/OpenEXR/ImfZipCompressor.h",
+        ":src/lib/OpenEXR/OpenEXRConfig.h",
+        ":src/lib/OpenEXR/OpenEXRConfigInternal.h",
+    ],
+    includes = ["src/lib/OpenEXR"],
+    deps = [
+        ":IlmThread",
+        "@imath//:Imath",
+        "@zlib//:zlib",
+    ],
+    visibility = ["//visibility:public"],
+)
+""",
+    remote = "https://github.com/AcademySoftwareFoundation/openexr",
+    tag = "v3.1.5",
+)
diff --git a/third_party/jpeg-xl/bash_test.sh b/third_party/jpeg-xl/bash_test.sh
new file mode 100755
index 0000000000..3dce72aa0b
--- /dev/null
+++ b/third_party/jpeg-xl/bash_test.sh
@@ -0,0 +1,320 @@
+#!/bin/bash
+# Copyright (c) the JPEG XL Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+# Tests implemented in bash. These typically will run checks about the source
+# code rather than the compiled one.
+
+MYDIR=$(dirname $(realpath "$0"))
+
+set -u
+
+test_includes() {
+  local ret=0
+  local f
+  for f in $(git ls-files | grep -E '(\.cc|\.cpp|\.h)$'); do
+    if [ ! -e "$f" ]; then
+      continue
+    fi
+    # Check that the public files (in lib/include/ directory) don't use the full
+    # path to the public header since users of the library will include the
+    # library as: #include "jxl/foobar.h".
+    if [[ "${f#lib/include/}" != "${f}" ]]; then
+      if grep -i -H -n -E '#include\s*[<"]lib/include/jxl' "$f" >&2; then
+        echo "Don't add \"include/\" to the include path of public headers." >&2
+        ret=1
+      fi
+    fi
+
+    if [[ "${f#third_party/}" == "$f" ]]; then
+      # $f is not in third_party/
+
+      # Check that local files don't use the full path to third_party/
+      # directory since the installed versions will not have that path.
+      # Add an exception for third_party/dirent.h.
+      if grep -v -F 'third_party/dirent.h' "$f" | \
+          grep -i -H -n -E '#include\s*[<"]third_party/' >&2 &&
+          [[ $ret -eq 0 ]]; then
+        cat >&2 <<EOF
+$f: Don't add third_party/ to the include path of third_party projects. This \
+makes it harder to use installed system libraries instead of the third_party/ \
+ones.
+EOF
+        ret=1
+      fi
+    fi
+
+  done
+  return ${ret}
+}
+
+test_include_collision() {
+  local ret=0
+  local f
+  for f in $(git ls-files | grep -E '^lib/include/'); do
+    if [ ! -e "$f" ]; then
+      continue
+    fi
+    local base=${f#lib/include/}
+    if [[ -e "lib/${base}" ]]; then
+      echo "$f: Name collision, both $f and lib/${base} exist." >&2
+      ret=1
+    fi
+  done
+  return ${ret}
+}
+
+test_copyright() {
+  local ret=0
+  local f
+  for f in $(
+      git ls-files | grep -E \
+      '(Dockerfile.*|\.c|\.cc|\.cpp|\.gni|\.h|\.java|\.sh|\.m|\.py|\.ui|\.yml)$'); do
+    if [ ! -e "$f" ]; then
+      continue
+    fi
+    if [[ "${f#third_party/}" == "$f" ]]; then
+      # $f is not in third_party/
+      if ! head -n 10 "$f" |
+          grep -F 'Copyright (c) the JPEG XL Project Authors.' >/dev/null ; then
+        echo "$f: Missing Copyright blob near the top of the file." >&2
+        ret=1
+      fi
+      if ! head -n 10 "$f" |
+          grep -F 'Use of this source code is governed by a BSD-style' \
+            >/dev/null ; then
+        echo "$f: Missing License blob near the top of the file." >&2
+        ret=1
+      fi
+    fi
+  done
+  return ${ret}
+}
+
+# Check that we don't use "%zu" or "%zd" in format string for size_t.
+test_printf_size_t() {
+  local ret=0
+  if grep -n -E '%[0-9]*z[udx]' \
+      $(git ls-files | grep -E '(\.c|\.cc|\.cpp|\.h)$'); then
+    echo "Don't use '%zu' or '%zd' in a format string, instead use " \
+      "'%\" PRIuS \"' or '%\" PRIdS \"'." >&2
+    ret=1
+  fi
+
+  if grep -n -E 'gtest\.h' \
+      $(git ls-files | grep -E '(\.c|\.cc|\.cpp|\.h)$' | grep -v -F /testing.h); then
+    echo "Don't include gtest directly, instead include 'testing.h'. " >&2
+    ret=1
+  fi
+
+  if grep -n -E 'gmock\.h' \
+      $(git ls-files | grep -E '(\.c|\.cc|\.cpp|\.h)$' | grep -v -F /testing.h); then
+    echo "Don't include gmock directly, instead include 'testing.h'. " >&2
+    ret=1
+  fi
+
+  local f
+  for f in $(git ls-files | grep -E "\.cc$" | xargs grep 'PRI[udx]S' |
+      cut -f 1 -d : | uniq); do
+    if [ ! -e "$f" ]; then
+      continue
+    fi
+    if ! grep -F printf_macros.h "$f" >/dev/null; then
+      echo "$f: Add lib/jxl/base/printf_macros.h for PRI.S, or use other " \
+        "types for code outside lib/jxl library." >&2
+      ret=1
+    fi
+  done
+
+  for f in $(git ls-files | grep -E "\.h$" | grep -v -E '(printf_macros\.h|testing\.h)' |
+      xargs grep -n 'PRI[udx]S'); do
+    # Having PRIuS / PRIdS in a header file means that printf_macros.h may
+    # be included before a system header, in particular before gtest headers.
+    # those may re-define PRIuS unconditionally causing a compile error.
+    echo "$f: Don't use PRI.S in header files. Sorry."
+    ret=1
+  done
+
+  return ${ret}
+}
+
+# Check that "dec_" code doesn't depend on "enc_" headers.
+test_dec_enc_deps() {
+  local ret=0
+  local f
+  for f in $(git ls-files | grep -E '/dec_'); do
+    if [ ! -e "$f" ]; then
+      continue
+    fi
+    if [[ "${f#third_party/}" == "$f" ]]; then
+      # $f is not in third_party/
+      if grep -n -H -E "#include.*/enc_" "$f" >&2; then
+        echo "$f: Don't include \"enc_*\" files from \"dec_*\" files." >&2
+        ret=1
+      fi
+    fi
+  done
+  return ${ret}
+}
+
+# Check for git merge conflict markers.
+test_merge_conflict() {
+  local ret=0
+  TEXT_FILES='(\.cc|\.cpp|\.h|\.sh|\.m|\.py|\.md|\.txt|\.cmake)$'
+  for f in $(git ls-files | grep -E "${TEXT_FILES}"); do
+    if [ ! -e "$f" ]; then
+      continue
+    fi
+    if grep -E '^<<<<<<< ' "$f"; then
+      echo "$f: Found git merge conflict marker. Please resolve." >&2
+      ret=1
+    fi
+  done
+  return ${ret}
+}
+
+# Check that the library and the package have the same version. This prevents
+# accidentally having them out of sync.
+get_version() {
+  local varname=$1
+  local line=$(grep -F "set(${varname} " lib/CMakeLists.txt | head -n 1)
+  [[ -n "${line}" ]]
+  line="${line#set(${varname} }"
+  line="${line%)}"
+  echo "${line}"
+}
+
+test_version() {
+  local major=$(get_version JPEGXL_MAJOR_VERSION)
+  local minor=$(get_version JPEGXL_MINOR_VERSION)
+  local patch=$(get_version JPEGXL_PATCH_VERSION)
+  # Check that the version is not empty
+  if [[ -z "${major}${minor}${patch}" ]]; then
+    echo "Couldn't parse version from CMakeLists.txt" >&2
+    return 1
+  fi
+  local pkg_version=$(head -n 1 debian/changelog)
+  # Get only the part between the first "jpeg-xl (" and the following ")".
+  pkg_version="${pkg_version#jpeg-xl (}"
+  pkg_version="${pkg_version%%)*}"
+  if [[ -z "${pkg_version}" ]]; then
+    echo "Couldn't parse version from debian package" >&2
+    return 1
+  fi
+
+  local lib_version="${major}.${minor}.${patch}"
+  lib_version="${lib_version%.0}"
+  if [[ "${pkg_version}" != "${lib_version}"* ]]; then
+    echo "Debian package version (${pkg_version}) doesn't match library" \
+      "version (${lib_version})." >&2
+    return 1
+  fi
+  return 0
+}
+
+# Check that the SHA versions in deps.sh matches the git submodules.
+test_deps_version() {
+  while IFS= read -r line; do
+    if [[ "${line:0:10}" != "[submodule" ]]; then
+      continue
+    fi
+    line="${line#[submodule \"}"
+    line="${line%\"]}"
+    local varname=$(tr '[:lower:]' '[:upper:]' <<< "${line}")
+    varname="${varname/\//_}"
+    if ! grep -F "${varname}=" deps.sh >/dev/null; then
+      # Ignoring submodule not in deps.sh
+      continue
+    fi
+    local deps_sha=$(grep -F "${varname}=" deps.sh | cut -f 2 -d '"')
+    [[ -n "${deps_sha}" ]]
+    local git_sha=$(git ls-tree -r HEAD "${line}" | cut -f 1 | cut -f 3 -d ' ')
+    if [[ "${deps_sha}" != "${git_sha}" ]]; then
+      cat >&2 <<EOF
+deps.sh: SHA for project ${line} is at ${deps_sha} but the git submodule is at
+${git_sha}. Please update deps.sh
+
+If you did not intend to change the submodule's SHA value, it is possible that
+you accidentally included this change in your commit after a rebase or checkout
+without running "git submodule --init". To revert the submodule change run from
+the top checkout directory:
+
+  git -C ${line} checkout ${deps_sha}
+  git commit --amend ${line}
+
+EOF
+      return 1
+    fi
+  done < .gitmodules
+}
+
+# Make sure that all the Fields objects are fuzzed directly.
+test_fuzz_fields() {
+  local ret=0
+  # List all the classes of the form "ClassName : public Fields".
+  # This doesn't catch class names that are too long to fit.
+  local field_classes=$( git ls-files |
+    grep -E '\.(cc|h)' | grep -v 'test\.cc$' |
+    xargs grep -h -o -E '\b[^ ]+ : public Fields' | cut -f 1 -d ' ')
+  local classname
+  for classname in ${field_classes}; do
+    if [ ! -e "$classname" ]; then
+      continue
+    fi
+    if ! grep -E "\\b${classname}\\b" tools/fields_fuzzer.cc >/dev/null; then
+      cat >&2 <<EOF
+tools/fields_fuzzer.cc: Class ${classname} not found in the fields_fuzzer.
+EOF
+      ret=1
+    fi
+  done
+  return $ret
+}
+
+# Test that we don't use %n in C++ code to avoid using it in printf and scanf.
+# This test is not very precise but in cases where "module n" is needed we would
+# normally have "% n" instead of "%n". Using %n is not allowed in Android 10+.
+test_percent_n() {
+  local ret=0
+  local f
+  for f in $(git ls-files | grep -E '(\.cc|\.cpp|\.h)$'); do
+    if [ ! -e "$f" ]; then
+      continue
+    fi
+    if grep -i -H -n -E '%h*n' "$f" >&2; then
+      echo "Don't use \"%n\"." >&2
+      ret=1
+    fi
+  done
+  return ${ret}
+}
+
+main() {
+  local ret=0
+  cd "${MYDIR}"
+
+  if ! git rev-parse >/dev/null 2>/dev/null; then
+    echo "Not a git checkout, skipping bash_test"
+    return 0
+  fi
+
+  IFS=$'\n'
+  for f in $(declare -F); do
+    local test_name=$(echo "$f" | cut -f 3 -d ' ')
+    # Runs all the local bash functions that start with "test_".
+    if [[ "${test_name}" == test_* ]]; then
+      echo "Test ${test_name}: Start"
+      if ${test_name}; then
+        echo "Test ${test_name}: PASS"
+      else
+        echo "Test ${test_name}: FAIL"
+        ret=1
+      fi
+    fi
+  done
+  return ${ret}
+}
+
+main "$@"
diff --git a/third_party/jpeg-xl/ci.sh b/third_party/jpeg-xl/ci.sh
new file mode 100755
index 0000000000..e55f0f8a89
--- /dev/null
+++ b/third_party/jpeg-xl/ci.sh
@@ -0,0 +1,1552 @@
+#!/usr/bin/env bash
+# Copyright (c) the JPEG XL Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+# Continuous integration helper module. This module is meant to be called from
+# the .gitlab-ci.yml file during the continuous integration build, as well as
+# from the command line for developers.
+
+set -eu
+
+OS=`uname -s`
+
+MYDIR=$(dirname $(realpath "$0"))
+
+### Environment parameters:
+TEST_STACK_LIMIT="${TEST_STACK_LIMIT:-256}"
+CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-RelWithDebInfo}
+CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH:-}
+CMAKE_C_COMPILER_LAUNCHER=${CMAKE_C_COMPILER_LAUNCHER:-}
+CMAKE_CXX_COMPILER_LAUNCHER=${CMAKE_CXX_COMPILER_LAUNCHER:-}
+CMAKE_MAKE_PROGRAM=${CMAKE_MAKE_PROGRAM:-}
+SKIP_BUILD="${SKIP_BUILD:-0}"
+SKIP_TEST="${SKIP_TEST:-0}"
+TARGETS="${TARGETS:-all doc}"
+TEST_SELECTOR="${TEST_SELECTOR:-}"
+BUILD_TARGET="${BUILD_TARGET:-}"
+ENABLE_WASM_SIMD="${ENABLE_WASM_SIMD:-0}"
+if [[ -n "${BUILD_TARGET}" ]]; then
+  BUILD_DIR="${BUILD_DIR:-${MYDIR}/build-${BUILD_TARGET%%-*}}"
+else
+  BUILD_DIR="${BUILD_DIR:-${MYDIR}/build}"
+fi
+# Whether we should post a message in the MR when the build fails.
+POST_MESSAGE_ON_ERROR="${POST_MESSAGE_ON_ERROR:-1}"
+
+# Set default compilers to clang if not already set
+export CC=${CC:-clang}
+export CXX=${CXX:-clang++}
+
+# Time limit for the "fuzz" command in seconds (0 means no limit).
+FUZZER_MAX_TIME="${FUZZER_MAX_TIME:-0}"
+
+SANITIZER="none"
+
+
+if [[ "${BUILD_TARGET%%-*}" == "x86_64" ||
+    "${BUILD_TARGET%%-*}" == "i686" ]]; then
+  # Default to building all targets, even if compiler baseline is SSE4
+  HWY_BASELINE_TARGETS=${HWY_BASELINE_TARGETS:-HWY_EMU128}
+else
+  HWY_BASELINE_TARGETS=${HWY_BASELINE_TARGETS:-}
+fi
+
+# Convenience flag to pass both CMAKE_C_FLAGS and CMAKE_CXX_FLAGS
+CMAKE_FLAGS=${CMAKE_FLAGS:-}
+CMAKE_C_FLAGS="${CMAKE_C_FLAGS:-} ${CMAKE_FLAGS}"
+CMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS:-} ${CMAKE_FLAGS}"
+
+CMAKE_CROSSCOMPILING_EMULATOR=${CMAKE_CROSSCOMPILING_EMULATOR:-}
+CMAKE_EXE_LINKER_FLAGS=${CMAKE_EXE_LINKER_FLAGS:-}
+CMAKE_FIND_ROOT_PATH=${CMAKE_FIND_ROOT_PATH:-}
+CMAKE_MODULE_LINKER_FLAGS=${CMAKE_MODULE_LINKER_FLAGS:-}
+CMAKE_SHARED_LINKER_FLAGS=${CMAKE_SHARED_LINKER_FLAGS:-}
+CMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE:-}
+
+if [[ "${ENABLE_WASM_SIMD}" -ne "0" ]]; then
+  CMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS} -msimd128"
+  CMAKE_C_FLAGS="${CMAKE_C_FLAGS} -msimd128"
+  CMAKE_EXE_LINKER_FLAGS="${CMAKE_EXE_LINKER_FLAGS} -msimd128"
+fi
+
+if [[ "${ENABLE_WASM_SIMD}" -eq "2" ]]; then
+  CMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS} -DHWY_WANT_WASM2"
+  CMAKE_C_FLAGS="${CMAKE_C_FLAGS} -DHWY_WANT_WASM2"
+fi
+
+if [[ ! -z "${HWY_BASELINE_TARGETS}" ]]; then
+  CMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS} -DHWY_BASELINE_TARGETS=${HWY_BASELINE_TARGETS}"
+fi
+
+# Version inferred from the CI variables.
+CI_COMMIT_SHA=${CI_COMMIT_SHA:-${GITHUB_SHA:-}}
+JPEGXL_VERSION=${JPEGXL_VERSION:-${CI_COMMIT_SHA:0:8}}
+
+# Benchmark parameters
+STORE_IMAGES=${STORE_IMAGES:-1}
+BENCHMARK_CORPORA="${MYDIR}/third_party/corpora"
+
+# Local flags passed to sanitizers.
+UBSAN_FLAGS=(
+  -fsanitize=alignment
+  -fsanitize=bool
+  -fsanitize=bounds
+  -fsanitize=builtin
+  -fsanitize=enum
+  -fsanitize=float-cast-overflow
+  -fsanitize=float-divide-by-zero
+  -fsanitize=integer-divide-by-zero
+  -fsanitize=null
+  -fsanitize=object-size
+  -fsanitize=pointer-overflow
+  -fsanitize=return
+  -fsanitize=returns-nonnull-attribute
+  -fsanitize=shift-base
+  -fsanitize=shift-exponent
+  -fsanitize=unreachable
+  -fsanitize=vla-bound
+
+  -fno-sanitize-recover=undefined
+  # Brunsli uses unaligned accesses to uint32_t, so alignment is just a warning.
+  -fsanitize-recover=alignment
+)
+# -fsanitize=function doesn't work on aarch64 and arm.
+if [[ "${BUILD_TARGET%%-*}" != "aarch64" &&
+    "${BUILD_TARGET%%-*}" != "arm" ]]; then
+  UBSAN_FLAGS+=(
+    -fsanitize=function
+  )
+fi
+if [[ "${BUILD_TARGET%%-*}" != "arm" ]]; then
+  UBSAN_FLAGS+=(
+    -fsanitize=signed-integer-overflow
+  )
+fi
+
+CLANG_TIDY_BIN=$(which clang-tidy-6.0 clang-tidy-7 clang-tidy-8 clang-tidy | head -n 1)
+# Default to "cat" if "colordiff" is not installed or if stdout is not a tty.
+if [[ -t 1 ]]; then
+  COLORDIFF_BIN=$(which colordiff cat | head -n 1)
+else
+  COLORDIFF_BIN="cat"
+fi
+FIND_BIN=$(which gfind find | head -n 1)
+# "false" will disable wine64 when not installed. This won't allow
+# cross-compiling.
+WINE_BIN=$(which wine64 false | head -n 1)
+
+CLANG_VERSION="${CLANG_VERSION:-}"
+# Detect the clang version suffix and store it in CLANG_VERSION. For example,
+# "6.0" for clang 6 or "7" for clang 7.
+detect_clang_version() {
+  if [[ -n "${CLANG_VERSION}" ]]; then
+    return 0
+  fi
+  local clang_version=$("${CC:-clang}" --version | head -n1)
+  clang_version=${clang_version#"Debian "}
+  clang_version=${clang_version#"Ubuntu "}
+  local llvm_tag
+  case "${clang_version}" in
+    "clang version 6."*)
+      CLANG_VERSION="6.0"
+      ;;
+    "clang version "*)
+      # Any other clang version uses just the major version number.
+      local suffix="${clang_version#clang version }"
+      CLANG_VERSION="${suffix%%.*}"
+      ;;
+    "emcc"*)
+      # We can't use asan or msan in the emcc case.
+      ;;
+    *)
+      echo "Unknown clang version: ${clang_version}" >&2
+      return 1
+  esac
+}
+
+# Temporary files cleanup hooks.
+CLEANUP_FILES=()
+cleanup() {
+  if [[ ${#CLEANUP_FILES[@]} -ne 0 ]]; then
+    rm -fr "${CLEANUP_FILES[@]}"
+  fi
+}
+
+# Executed on exit.
+on_exit() {
+  local retcode="$1"
+  # Always cleanup the CLEANUP_FILES.
+  cleanup
+
+  # Post a message in the MR when requested with POST_MESSAGE_ON_ERROR but only
+  # if the run failed and we are not running from a MR pipeline.
+  if [[ ${retcode} -ne 0 && -n "${CI_BUILD_NAME:-}" &&
+        -n "${POST_MESSAGE_ON_ERROR}" && -z "${CI_MERGE_REQUEST_ID:-}" &&
+        "${CI_BUILD_REF_NAME}" = "master" ]]; then
+    load_mr_vars_from_commit
+    { set +xeu; } 2>/dev/null
+    local message="**Run ${CI_BUILD_NAME} @ ${CI_COMMIT_SHORT_SHA} failed.**
+
+Check the output of the job at ${CI_JOB_URL:-} to see if this was your problem.
+If it was, please rollback this change or fix the problem ASAP, broken builds
+slow down development. Check if the error already existed in the previous build
+as well.
+
+Pipeline: ${CI_PIPELINE_URL}
+
+Previous build commit: ${CI_COMMIT_BEFORE_SHA}
+"
+    cmd_post_mr_comment "${message}"
+  fi
+}
+
+trap 'retcode=$?; { set +x; } 2>/dev/null; on_exit ${retcode}' INT TERM EXIT
+
+
+# These variables are populated when calling merge_request_commits().
+
+# The current hash at the top of the current branch or merge request branch (if
+# running from a merge request pipeline).
+MR_HEAD_SHA=""
+# The common ancestor between the current commit and the tracked branch, such
+# as master. This includes a list
+MR_ANCESTOR_SHA=""
+
+# Populate MR_HEAD_SHA and MR_ANCESTOR_SHA.
+merge_request_commits() {
+  { set +x; } 2>/dev/null
+  # GITHUB_SHA is the current reference being build in GitHub Actions.
+  if [[ -n "${GITHUB_SHA:-}" ]]; then
+    # GitHub normally does a checkout of a merge commit on a shallow repository
+    # by default. We want to get a bit more of the history to be able to diff
+    # changes on the Pull Request if needed. This fetches 10 more commits which
+    # should be enough given that PR normally should have 1 commit.
+    git -C "${MYDIR}" fetch -q origin "${GITHUB_SHA}" --depth 10
+    MR_HEAD_SHA="$(git rev-parse "FETCH_HEAD^2" 2>/dev/null ||
+                   echo "${GITHUB_SHA}")"
+  else
+    # CI_BUILD_REF is the reference currently being build in the CI workflow.
+    MR_HEAD_SHA=$(git -C "${MYDIR}" rev-parse -q "${CI_BUILD_REF:-HEAD}")
+  fi
+
+  if [[ -n "${CI_MERGE_REQUEST_IID:-}" ]]; then
+    # Merge request pipeline in CI. In this case the upstream is called "origin"
+    # but it refers to the forked project that's the source of the merge
+    # request. We need to get the target of the merge request, for which we need
+    # to query that repository using our CI_JOB_TOKEN.
+    echo "machine gitlab.com login gitlab-ci-token password ${CI_JOB_TOKEN}" \
+      >> "${HOME}/.netrc"
+    git -C "${MYDIR}" fetch "${CI_MERGE_REQUEST_PROJECT_URL}" \
+      "${CI_MERGE_REQUEST_TARGET_BRANCH_NAME}"
+    MR_ANCESTOR_SHA=$(git -C "${MYDIR}" rev-parse -q FETCH_HEAD)
+  elif [[ -n "${GITHUB_BASE_REF:-}" ]]; then
+    # Pull request workflow in GitHub Actions. GitHub checkout action uses
+    # "origin" as the remote for the git checkout.
+    git -C "${MYDIR}" fetch -q origin "${GITHUB_BASE_REF}"
+    MR_ANCESTOR_SHA=$(git -C "${MYDIR}" rev-parse -q FETCH_HEAD)
+  else
+    # We are in a local branch, not a merge request.
+    MR_ANCESTOR_SHA=$(git -C "${MYDIR}" rev-parse -q HEAD@{upstream} || true)
+  fi
+
+  if [[ -z "${MR_ANCESTOR_SHA}" ]]; then
+    echo "Warning, not tracking any branch, using the last commit in HEAD.">&2
+    # This prints the return value with just HEAD.
+    MR_ANCESTOR_SHA=$(git -C "${MYDIR}" rev-parse -q "${MR_HEAD_SHA}^")
+  else
+    # GitHub runs the pipeline on a merge commit, no need to look for the common
+    # ancestor in that case.
+    if [[ -z "${GITHUB_BASE_REF:-}" ]]; then
+      MR_ANCESTOR_SHA=$(git -C "${MYDIR}" merge-base \
+        "${MR_ANCESTOR_SHA}" "${MR_HEAD_SHA}")
+    fi
+  fi
+  set -x
+}
+
+# Load the MR iid from the landed commit message when running not from a
+# merge request workflow. This is useful to post back results at the merge
+# request when running pipelines from master.
+load_mr_vars_from_commit() {
+  { set +x; } 2>/dev/null
+  if [[ -z "${CI_MERGE_REQUEST_IID:-}" ]]; then
+    local mr_iid=$(git rev-list --format=%B --max-count=1 HEAD |
+      grep -F "${CI_PROJECT_URL}" | grep -F "/merge_requests" | head -n 1)
+    # mr_iid contains a string like this if it matched:
+    #  Part-of: <https://gitlab.com/wg1/jpeg-xlm/merge_requests/123456>
+    if [[ -n "${mr_iid}" ]]; then
+      mr_iid=$(echo "${mr_iid}" |
+        sed -E 's,^.*merge_requests/([0-9]+)>.*$,\1,')
+      CI_MERGE_REQUEST_IID="${mr_iid}"
+      CI_MERGE_REQUEST_PROJECT_ID=${CI_PROJECT_ID}
+    fi
+  fi
+  set -x
+}
+
+# Posts a comment to the current merge request.
+cmd_post_mr_comment() {
+  { set +x; } 2>/dev/null
+  local comment="$1"
+  if [[ -n "${BOT_TOKEN:-}" && -n "${CI_MERGE_REQUEST_IID:-}" ]]; then
+    local url="${CI_API_V4_URL}/projects/${CI_MERGE_REQUEST_PROJECT_ID}/merge_requests/${CI_MERGE_REQUEST_IID}/notes"
+    curl -X POST -g \
+      -H "PRIVATE-TOKEN: ${BOT_TOKEN}" \
+      --data-urlencode "body=${comment}" \
+      --output /dev/null \
+      "${url}"
+  fi
+  set -x
+}
+
+# Set up and export the environment variables needed by the child processes.
+export_env() {
+  if [[ "${BUILD_TARGET}" == *mingw32 ]]; then
+    # Wine needs to know the paths to the mingw dlls. These should be
+    # separated by ';'.
+    WINEPATH=$("${CC:-clang}" -print-search-dirs --target="${BUILD_TARGET}" \
+      | grep -F 'libraries: =' | cut -f 2- -d '=' | tr ':' ';')
+    # We also need our own libraries in the wine path.
+    local real_build_dir=$(realpath "${BUILD_DIR}")
+    # Some library .dll dependencies are installed in /bin:
+    export WINEPATH="${WINEPATH};${real_build_dir};${real_build_dir}/third_party/brotli;/usr/${BUILD_TARGET}/bin"
+
+    local prefix="${BUILD_DIR}/wineprefix"
+    mkdir -p "${prefix}"
+    export WINEPREFIX=$(realpath "${prefix}")
+  fi
+  # Sanitizers need these variables to print and properly format the stack
+  # traces:
+  LLVM_SYMBOLIZER=$("${CC:-clang}" -print-prog-name=llvm-symbolizer || true)
+  if [[ -n "${LLVM_SYMBOLIZER}" ]]; then
+    export ASAN_SYMBOLIZER_PATH="${LLVM_SYMBOLIZER}"
+    export MSAN_SYMBOLIZER_PATH="${LLVM_SYMBOLIZER}"
+    export UBSAN_SYMBOLIZER_PATH="${LLVM_SYMBOLIZER}"
+  fi
+}
+
+cmake_configure() {
+  export_env
+
+  if [[ "${STACK_SIZE:-0}" == 1 ]]; then
+    # Dump the stack size of each function in the .stack_sizes section for
+    # analysis.
+    CMAKE_C_FLAGS+=" -fstack-size-section"
+    CMAKE_CXX_FLAGS+=" -fstack-size-section"
+  fi
+
+  local args=(
+    -B"${BUILD_DIR}" -H"${MYDIR}"
+    -DCMAKE_BUILD_TYPE="${CMAKE_BUILD_TYPE}"
+    -G Ninja
+    -DCMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS}"
+    -DCMAKE_C_FLAGS="${CMAKE_C_FLAGS}"
+    -DCMAKE_EXE_LINKER_FLAGS="${CMAKE_EXE_LINKER_FLAGS}"
+    -DCMAKE_MODULE_LINKER_FLAGS="${CMAKE_MODULE_LINKER_FLAGS}"
+    -DCMAKE_SHARED_LINKER_FLAGS="${CMAKE_SHARED_LINKER_FLAGS}"
+    -DJPEGXL_VERSION="${JPEGXL_VERSION}"
+    -DSANITIZER="${SANITIZER}"
+    # These are not enabled by default in cmake.
+    -DJPEGXL_ENABLE_VIEWERS=ON
+    -DJPEGXL_ENABLE_PLUGINS=ON
+    -DJPEGXL_ENABLE_DEVTOOLS=ON
+    # We always use libfuzzer in the ci.sh wrapper.
+    -DJPEGXL_FUZZER_LINK_FLAGS="-fsanitize=fuzzer"
+  )
+  if [[ "${BUILD_TARGET}" != *mingw32 ]]; then
+    args+=(
+      -DJPEGXL_WARNINGS_AS_ERRORS=ON
+    )
+  fi
+  if [[ -n "${BUILD_TARGET}" ]]; then
+    local system_name="Linux"
+    if [[ "${BUILD_TARGET}" == *mingw32 ]]; then
+      # When cross-compiling with mingw the target must be set to Windows and
+      # run programs with wine.
+      system_name="Windows"
+      args+=(
+        -DCMAKE_CROSSCOMPILING_EMULATOR="${WINE_BIN}"
+        # Normally CMake automatically defines MINGW=1 when building with the
+        # mingw compiler (x86_64-w64-mingw32-gcc) but we are normally compiling
+        # with clang.
+        -DMINGW=1
+      )
+    fi
+    # EMSCRIPTEN toolchain sets the right values itself
+    if [[ "${BUILD_TARGET}" != wasm* ]]; then
+      # If set, BUILD_TARGET must be the target triplet such as
+      # x86_64-unknown-linux-gnu.
+      args+=(
+        -DCMAKE_C_COMPILER_TARGET="${BUILD_TARGET}"
+        -DCMAKE_CXX_COMPILER_TARGET="${BUILD_TARGET}"
+        # Only the first element of the target triplet.
+        -DCMAKE_SYSTEM_PROCESSOR="${BUILD_TARGET%%-*}"
+        -DCMAKE_SYSTEM_NAME="${system_name}"
+        -DCMAKE_TOOLCHAIN_FILE="${CMAKE_TOOLCHAIN_FILE}"
+      )
+    else
+      args+=(
+        # sjpeg confuses WASM SIMD with SSE.
+        -DSJPEG_ENABLE_SIMD=OFF
+        # Building shared libs is not very useful for WASM.
+        -DBUILD_SHARED_LIBS=OFF
+      )
+    fi
+    args+=(
+      # These are needed to make googletest work when cross-compiling.
+      -DCMAKE_CROSSCOMPILING=1
+      -DHAVE_STD_REGEX=0
+      -DHAVE_POSIX_REGEX=0
+      -DHAVE_GNU_POSIX_REGEX=0
+      -DHAVE_STEADY_CLOCK=0
+      -DHAVE_THREAD_SAFETY_ATTRIBUTES=0
+    )
+    if [[ -z "${CMAKE_FIND_ROOT_PATH}" ]]; then
+      # find_package() will look in this prefix for libraries.
+      CMAKE_FIND_ROOT_PATH="/usr/${BUILD_TARGET}"
+    fi
+    if [[ -z "${CMAKE_PREFIX_PATH}" ]]; then
+      CMAKE_PREFIX_PATH="/usr/${BUILD_TARGET}"
+    fi
+    # Use pkg-config for the target. If there's no pkg-config available for the
+    # target we can set the PKG_CONFIG_PATH to the appropriate path in most
+    # linux distributions.
+    local pkg_config=$(which "${BUILD_TARGET}-pkg-config" || true)
+    if [[ -z "${pkg_config}" ]]; then
+      pkg_config=$(which pkg-config)
+      export PKG_CONFIG_LIBDIR="/usr/${BUILD_TARGET}/lib/pkgconfig"
+    fi
+    if [[ -n "${pkg_config}" ]]; then
+      args+=(-DPKG_CONFIG_EXECUTABLE="${pkg_config}")
+    fi
+  fi
+  if [[ -n "${CMAKE_CROSSCOMPILING_EMULATOR}" ]]; then
+    args+=(
+      -DCMAKE_CROSSCOMPILING_EMULATOR="${CMAKE_CROSSCOMPILING_EMULATOR}"
+    )
+  fi
+  if [[ -n "${CMAKE_FIND_ROOT_PATH}" ]]; then
+    args+=(
+      -DCMAKE_FIND_ROOT_PATH="${CMAKE_FIND_ROOT_PATH}"
+    )
+  fi
+  if [[ -n "${CMAKE_PREFIX_PATH}" ]]; then
+    args+=(
+      -DCMAKE_PREFIX_PATH="${CMAKE_PREFIX_PATH}"
+    )
+  fi
+  if [[ -n "${CMAKE_C_COMPILER_LAUNCHER}" ]]; then
+    args+=(
+      -DCMAKE_C_COMPILER_LAUNCHER="${CMAKE_C_COMPILER_LAUNCHER}"
+    )
+  fi
+  if [[ -n "${CMAKE_CXX_COMPILER_LAUNCHER}" ]]; then
+    args+=(
+      -DCMAKE_CXX_COMPILER_LAUNCHER="${CMAKE_CXX_COMPILER_LAUNCHER}"
+    )
+  fi
+  if [[ -n "${CMAKE_MAKE_PROGRAM}" ]]; then
+    args+=(
+      -DCMAKE_MAKE_PROGRAM="${CMAKE_MAKE_PROGRAM}"
+    )
+  fi
+  if [[ "${BUILD_TARGET}" == wasm* ]]; then
+    emcmake cmake "${args[@]}" "$@"
+  else
+    cmake "${args[@]}" "$@"
+  fi
+}
+
+cmake_build_and_test() {
+  if [[ "${SKIP_BUILD}" -eq "1" ]]; then
+      return 0
+  fi
+  # gtest_discover_tests() runs the test binaries to discover the list of tests
+  # at build time, which fails under qemu.
+  ASAN_OPTIONS=detect_leaks=0 cmake --build "${BUILD_DIR}" -- $TARGETS
+  # Pack test binaries if requested.
+  if [[ "${PACK_TEST:-}" == "1" ]]; then
+    (cd "${BUILD_DIR}"
+     ${FIND_BIN} -name '*.cmake' -a '!' -path '*CMakeFiles*'
+     # gtest / gmock / gtest_main shared libs
+     ${FIND_BIN} lib/ -name 'libg*.so*'
+     ${FIND_BIN} -type d -name tests -a '!' -path '*CMakeFiles*'
+    ) | tar -C "${BUILD_DIR}" -cf "${BUILD_DIR}/tests.tar.xz" -T - \
+      --use-compress-program="xz --threads=$(nproc --all || echo 1) -6"
+    du -h "${BUILD_DIR}/tests.tar.xz"
+    # Pack coverage data if also available.
+    touch "${BUILD_DIR}/gcno.sentinel"
+    (cd "${BUILD_DIR}"; echo gcno.sentinel; ${FIND_BIN} -name '*gcno') | \
+      tar -C "${BUILD_DIR}" -cvf "${BUILD_DIR}/gcno.tar.xz" -T - \
+        --use-compress-program="xz --threads=$(nproc --all || echo 1) -6"
+  fi
+
+  if [[ "${SKIP_TEST}" -ne "1" ]]; then
+    (cd "${BUILD_DIR}"
+     export UBSAN_OPTIONS=print_stacktrace=1
+     [[ "${TEST_STACK_LIMIT}" == "none" ]] || ulimit -s "${TEST_STACK_LIMIT}"
+     ctest -j $(nproc --all || echo 1) ${TEST_SELECTOR} --output-on-failure)
+  fi
+}
+
+# Configure the build to strip unused functions. This considerably reduces the
+# output size, specially for tests which only use a small part of the whole
+# library.
+strip_dead_code() {
+  # Emscripten does tree shaking without any extra flags.
+  if [[ "${BUILD_TARGET}" == wasm* ]]; then
+    return 0
+  fi
+  # -ffunction-sections, -fdata-sections and -Wl,--gc-sections effectively
+  # discard all unreachable code, reducing the code size. For this to work, we
+  # need to also pass --no-export-dynamic to prevent it from exporting all the
+  # internal symbols (like functions) making them all reachable and thus not a
+  # candidate for removal.
+  CMAKE_CXX_FLAGS+=" -ffunction-sections -fdata-sections"
+  CMAKE_C_FLAGS+=" -ffunction-sections -fdata-sections"
+  if [[ "${OS}" == "Darwin" ]]; then
+    CMAKE_EXE_LINKER_FLAGS+=" -dead_strip"
+    CMAKE_SHARED_LINKER_FLAGS+=" -dead_strip"
+  else
+    CMAKE_EXE_LINKER_FLAGS+=" -Wl,--gc-sections -Wl,--no-export-dynamic"
+    CMAKE_SHARED_LINKER_FLAGS+=" -Wl,--gc-sections -Wl,--no-export-dynamic"
+  fi
+}
+
+### Externally visible commands
+
+cmd_debug() {
+  CMAKE_BUILD_TYPE="Debug"
+  cmake_configure "$@"
+  cmake_build_and_test
+}
+
+cmd_release() {
+  CMAKE_BUILD_TYPE="Release"
+  strip_dead_code
+  cmake_configure "$@"
+  cmake_build_and_test
+}
+
+cmd_opt() {
+  CMAKE_BUILD_TYPE="RelWithDebInfo"
+  CMAKE_CXX_FLAGS+=" -DJXL_DEBUG_WARNING -DJXL_DEBUG_ON_ERROR"
+  cmake_configure "$@"
+  cmake_build_and_test
+}
+
+cmd_coverage() {
+  # -O0 prohibits stack space reuse -> causes stack-overflow on dozens of tests.
+  TEST_STACK_LIMIT="none"
+
+  cmd_release -DJPEGXL_ENABLE_COVERAGE=ON "$@"
+
+  if [[ "${SKIP_TEST}" -ne "1" ]]; then
+    # If we didn't run the test we also don't print a coverage report.
+    cmd_coverage_report
+  fi
+}
+
+cmd_coverage_report() {
+  LLVM_COV=$("${CC:-clang}" -print-prog-name=llvm-cov)
+  local real_build_dir=$(realpath "${BUILD_DIR}")
+  local gcovr_args=(
+    -r "${real_build_dir}"
+    --gcov-executable "${LLVM_COV} gcov"
+    # Only print coverage information for the libjxl directories. The rest
+    # is not part of the code under test.
+    --filter '.*jxl/.*'
+    --exclude '.*_gbench.cc'
+    --exclude '.*_test.cc'
+    --exclude '.*_testonly..*'
+    --exclude '.*_debug.*'
+    --exclude '.*test_utils..*'
+    --object-directory "${real_build_dir}"
+  )
+
+  (
+   cd "${real_build_dir}"
+    gcovr "${gcovr_args[@]}" --html --html-details \
+      --output="${real_build_dir}/coverage.html"
+    gcovr "${gcovr_args[@]}" --print-summary |
+      tee "${real_build_dir}/coverage.txt"
+    gcovr "${gcovr_args[@]}" --xml --output="${real_build_dir}/coverage.xml"
+  )
+}
+
+cmd_test() {
+  export_env
+  # Unpack tests if needed.
+  if [[ -e "${BUILD_DIR}/tests.tar.xz" && ! -d "${BUILD_DIR}/tests" ]]; then
+    tar -C "${BUILD_DIR}" -Jxvf "${BUILD_DIR}/tests.tar.xz"
+  fi
+  if [[ -e "${BUILD_DIR}/gcno.tar.xz" && ! -d "${BUILD_DIR}/gcno.sentinel" ]]; then
+    tar -C "${BUILD_DIR}" -Jxvf "${BUILD_DIR}/gcno.tar.xz"
+  fi
+  (cd "${BUILD_DIR}"
+   export UBSAN_OPTIONS=print_stacktrace=1
+   [[ "${TEST_STACK_LIMIT}" == "none" ]] || ulimit -s "${TEST_STACK_LIMIT}"
+   ctest -j $(nproc --all || echo 1) ${TEST_SELECTOR} --output-on-failure "$@")
+}
+
+cmd_gbench() {
+  export_env
+  (cd "${BUILD_DIR}"
+   export UBSAN_OPTIONS=print_stacktrace=1
+   lib/jxl_gbench \
+     --benchmark_counters_tabular=true \
+     --benchmark_out_format=json \
+     --benchmark_out=gbench.json "$@"
+  )
+}
+
+cmd_asanfuzz() {
+  CMAKE_CXX_FLAGS+=" -fsanitize=fuzzer-no-link -DFUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION=1"
+  CMAKE_C_FLAGS+=" -fsanitize=fuzzer-no-link -DFUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION=1"
+  cmd_asan -DJPEGXL_ENABLE_FUZZERS=ON "$@"
+}
+
+cmd_msanfuzz() {
+  # Install msan if needed before changing the flags.
+  detect_clang_version
+  local msan_prefix="${HOME}/.msan/${CLANG_VERSION}"
+  if [[ ! -d "${msan_prefix}" || -e "${msan_prefix}/lib/libc++abi.a" ]]; then
+    # Install msan libraries for this version if needed or if an older version
+    # with libc++abi was installed.
+    cmd_msan_install
+  fi
+
+  CMAKE_CXX_FLAGS+=" -fsanitize=fuzzer-no-link -DFUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION=1"
+  CMAKE_C_FLAGS+=" -fsanitize=fuzzer-no-link -DFUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION=1"
+  cmd_msan -DJPEGXL_ENABLE_FUZZERS=ON "$@"
+}
+
+cmd_asan() {
+  SANITIZER="asan"
+  CMAKE_C_FLAGS+=" -DJXL_ENABLE_ASSERT=1 -g -DADDRESS_SANITIZER \
+    -fsanitize=address ${UBSAN_FLAGS[@]}"
+  CMAKE_CXX_FLAGS+=" -DJXL_ENABLE_ASSERT=1 -g -DADDRESS_SANITIZER \
+    -fsanitize=address ${UBSAN_FLAGS[@]}"
+  strip_dead_code
+  cmake_configure "$@" -DJPEGXL_ENABLE_TCMALLOC=OFF
+  cmake_build_and_test
+}
+
+cmd_tsan() {
+  SANITIZER="tsan"
+  local tsan_args=(
+    -DJXL_ENABLE_ASSERT=1
+    -g
+    -DTHREAD_SANITIZER
+    ${UBSAN_FLAGS[@]}
+    -fsanitize=thread
+  )
+  CMAKE_C_FLAGS+=" ${tsan_args[@]}"
+  CMAKE_CXX_FLAGS+=" ${tsan_args[@]}"
+
+  CMAKE_BUILD_TYPE="RelWithDebInfo"
+  cmake_configure "$@" -DJPEGXL_ENABLE_TCMALLOC=OFF
+  cmake_build_and_test
+}
+
+cmd_msan() {
+  SANITIZER="msan"
+  detect_clang_version
+  local msan_prefix="${HOME}/.msan/${CLANG_VERSION}"
+  if [[ ! -d "${msan_prefix}" || -e "${msan_prefix}/lib/libc++abi.a" ]]; then
+    # Install msan libraries for this version if needed or if an older version
+    # with libc++abi was installed.
+    cmd_msan_install
+  fi
+
+  local msan_c_flags=(
+    -fsanitize=memory
+    -fno-omit-frame-pointer
+    -fsanitize-memory-track-origins
+
+    -DJXL_ENABLE_ASSERT=1
+    -g
+    -DMEMORY_SANITIZER
+
+    # Force gtest to not use the cxxbai.
+    -DGTEST_HAS_CXXABI_H_=0
+  )
+  local msan_cxx_flags=(
+    "${msan_c_flags[@]}"
+
+    # Some C++ sources don't use the std at all, so the -stdlib=libc++ is unused
+    # in those cases. Ignore the warning.
+    -Wno-unused-command-line-argument
+    -stdlib=libc++
+
+    # We include the libc++ from the msan directory instead, so we don't want
+    # the std includes.
+    -nostdinc++
+    -cxx-isystem"${msan_prefix}/include/c++/v1"
+  )
+
+  local msan_linker_flags=(
+    -L"${msan_prefix}"/lib
+    -Wl,-rpath -Wl,"${msan_prefix}"/lib/
+  )
+
+  CMAKE_C_FLAGS+=" ${msan_c_flags[@]} ${UBSAN_FLAGS[@]}"
+  CMAKE_CXX_FLAGS+=" ${msan_cxx_flags[@]} ${UBSAN_FLAGS[@]}"
+  CMAKE_EXE_LINKER_FLAGS+=" ${msan_linker_flags[@]}"
+  CMAKE_MODULE_LINKER_FLAGS+=" ${msan_linker_flags[@]}"
+  CMAKE_SHARED_LINKER_FLAGS+=" ${msan_linker_flags[@]}"
+  strip_dead_code
+  cmake_configure "$@" \
+    -DCMAKE_CROSSCOMPILING=1 -DRUN_HAVE_STD_REGEX=0 -DRUN_HAVE_POSIX_REGEX=0 \
+    -DJPEGXL_ENABLE_TCMALLOC=OFF -DJPEGXL_WARNINGS_AS_ERRORS=OFF \
+    -DCMAKE_REQUIRED_LINK_OPTIONS="${msan_linker_flags[@]}"
+  cmake_build_and_test
+}
+
+# Install libc++ libraries compiled with msan in the msan_prefix for the current
+# compiler version.
+cmd_msan_install() {
+  local tmpdir=$(mktemp -d)
+  CLEANUP_FILES+=("${tmpdir}")
+  # Detect the llvm to install:
+  export CC="${CC:-clang}"
+  export CXX="${CXX:-clang++}"
+  detect_clang_version
+  # Allow overriding the LLVM checkout.
+  local llvm_root="${LLVM_ROOT:-}"
+  if [ -z "${llvm_root}" ]; then
+    local llvm_tag="llvmorg-${CLANG_VERSION}.0.0"
+    case "${CLANG_VERSION}" in
+      "6.0")
+        llvm_tag="llvmorg-6.0.1"
+        ;;
+      "7")
+        llvm_tag="llvmorg-7.0.1"
+        ;;
+    esac
+    local llvm_targz="${tmpdir}/${llvm_tag}.tar.gz"
+    curl -L --show-error -o "${llvm_targz}" \
+      "https://github.com/llvm/llvm-project/archive/${llvm_tag}.tar.gz"
+    tar -C "${tmpdir}" -zxf "${llvm_targz}"
+    llvm_root="${tmpdir}/llvm-project-${llvm_tag}"
+  fi
+
+  local msan_prefix="${HOME}/.msan/${CLANG_VERSION}"
+  rm -rf "${msan_prefix}"
+
+  declare -A CMAKE_EXTRAS
+  CMAKE_EXTRAS[libcxx]="\
+    -DLIBCXX_CXX_ABI=libstdc++ \
+    -DLIBCXX_INSTALL_EXPERIMENTAL_LIBRARY=ON"
+
+  for project in libcxx; do
+    local proj_build="${tmpdir}/build-${project}"
+    local proj_dir="${llvm_root}/${project}"
+    mkdir -p "${proj_build}"
+    cmake -B"${proj_build}" -H"${proj_dir}" \
+      -G Ninja \
+      -DCMAKE_BUILD_TYPE=Release \
+      -DLLVM_USE_SANITIZER=Memory \
+      -DLLVM_PATH="${llvm_root}/llvm" \
+      -DLLVM_CONFIG_PATH="$(which llvm-config llvm-config-7 llvm-config-6.0 | \
+                            head -n1)" \
+      -DCMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS}" \
+      -DCMAKE_C_FLAGS="${CMAKE_C_FLAGS}" \
+      -DCMAKE_EXE_LINKER_FLAGS="${CMAKE_EXE_LINKER_FLAGS}" \
+      -DCMAKE_SHARED_LINKER_FLAGS="${CMAKE_SHARED_LINKER_FLAGS}" \
+      -DCMAKE_INSTALL_PREFIX="${msan_prefix}" \
+      ${CMAKE_EXTRAS[${project}]}
+    cmake --build "${proj_build}"
+    ninja -C "${proj_build}" install
+  done
+}
+
+# Internal build step shared between all cmd_ossfuzz_* commands.
+_cmd_ossfuzz() {
+  local sanitizer="$1"
+  shift
+  mkdir -p "${BUILD_DIR}"
+  local real_build_dir=$(realpath "${BUILD_DIR}")
+
+  # oss-fuzz defines three directories:
+  # * /work, with the working directory to do re-builds
+  # * /src, with the source code to build
+  # * /out, with the output directory where to copy over the built files.
+  # We use $BUILD_DIR as the /work and the script directory as the /src. The
+  # /out directory is ignored as developers are used to look for the fuzzers in
+  # $BUILD_DIR/tools/ directly.
+
+  if [[ "${sanitizer}" = "memory" && ! -d "${BUILD_DIR}/msan" ]]; then
+    sudo docker run --rm -i \
+      --user $(id -u):$(id -g) \
+      -v "${real_build_dir}":/work \
+      gcr.io/oss-fuzz-base/msan-libs-builder \
+      bash -c "cp -r /msan /work"
+  fi
+
+  # Args passed to ninja. These will be evaluated as a string separated by
+  # spaces.
+  local jpegxl_extra_args="$@"
+
+  sudo docker run --rm -i \
+    -e JPEGXL_UID=$(id -u) \
+    -e JPEGXL_GID=$(id -g) \
+    -e FUZZING_ENGINE="${FUZZING_ENGINE:-libfuzzer}" \
+    -e SANITIZER="${sanitizer}" \
+    -e ARCHITECTURE=x86_64 \
+    -e FUZZING_LANGUAGE=c++ \
+    -e MSAN_LIBS_PATH="/work/msan" \
+    -e JPEGXL_EXTRA_ARGS="${jpegxl_extra_args}" \
+    -v "${MYDIR}":/src/libjxl \
+    -v "${MYDIR}/tools/scripts/ossfuzz-build.sh":/src/build.sh \
+    -v "${real_build_dir}":/work \
+    gcr.io/oss-fuzz/libjxl
+}
+
+cmd_ossfuzz_asan() {
+  _cmd_ossfuzz address "$@"
+}
+cmd_ossfuzz_msan() {
+  _cmd_ossfuzz memory "$@"
+}
+cmd_ossfuzz_ubsan() {
+  _cmd_ossfuzz undefined "$@"
+}
+
+cmd_ossfuzz_ninja() {
+  [[ -e "${BUILD_DIR}/build.ninja" ]]
+  local real_build_dir=$(realpath "${BUILD_DIR}")
+
+  if [[ -e "${BUILD_DIR}/msan" ]]; then
+    echo "ossfuzz_ninja doesn't work with msan builds. Use ossfuzz_msan." >&2
+    exit 1
+  fi
+
+  sudo docker run --rm -i \
+    --user $(id -u):$(id -g) \
+    -v "${MYDIR}":/src/libjxl \
+    -v "${real_build_dir}":/work \
+    gcr.io/oss-fuzz/libjxl \
+    ninja -C /work "$@"
+}
+
+cmd_fast_benchmark() {
+  local small_corpus_tar="${BENCHMARK_CORPORA}/jyrki-full.tar"
+  mkdir -p "${BENCHMARK_CORPORA}"
+  curl --show-error -o "${small_corpus_tar}" -z "${small_corpus_tar}" \
+    "https://storage.googleapis.com/artifacts.jpegxl.appspot.com/corpora/jyrki-full.tar"
+
+  local tmpdir=$(mktemp -d)
+  CLEANUP_FILES+=("${tmpdir}")
+  tar -xf "${small_corpus_tar}" -C "${tmpdir}"
+
+  run_benchmark "${tmpdir}" 1048576
+}
+
+cmd_benchmark() {
+  local nikon_corpus_tar="${BENCHMARK_CORPORA}/nikon-subset.tar"
+  mkdir -p "${BENCHMARK_CORPORA}"
+  curl --show-error -o "${nikon_corpus_tar}" -z "${nikon_corpus_tar}" \
+    "https://storage.googleapis.com/artifacts.jpegxl.appspot.com/corpora/nikon-subset.tar"
+
+  local tmpdir=$(mktemp -d)
+  CLEANUP_FILES+=("${tmpdir}")
+  tar -xvf "${nikon_corpus_tar}" -C "${tmpdir}"
+
+  local sem_id="jpegxl_benchmark-$$"
+  local nprocs=$(nproc --all || echo 1)
+  images=()
+  local filename
+  while IFS= read -r filename; do
+    # This removes the './'
+    filename="${filename:2}"
+    local mode
+    if [[ "${filename:0:4}" == "srgb" ]]; then
+      mode="RGB_D65_SRG_Rel_SRG"
+    elif [[ "${filename:0:5}" == "adobe" ]]; then
+      mode="RGB_D65_Ado_Rel_Ado"
+    else
+      echo "Unknown image colorspace: ${filename}" >&2
+      exit 1
+    fi
+    png_filename="${filename%.ppm}.png"
+    png_filename=$(echo "${png_filename}" | tr '/' '_')
+    sem --bg --id "${sem_id}" -j"${nprocs}" -- \
+      "${BUILD_DIR}/tools/decode_and_encode" \
+        "${tmpdir}/${filename}" "${mode}" "${tmpdir}/${png_filename}"
+    images+=( "${png_filename}" )
+  done < <(cd "${tmpdir}"; ${FIND_BIN} . -name '*.ppm' -type f)
+  sem --id "${sem_id}" --wait
+
+  # We need about 10 GiB per thread on these images.
+  run_benchmark "${tmpdir}" 10485760
+}
+
+get_mem_available() {
+  if [[ "${OS}" == "Darwin" ]]; then
+    echo $(vm_stat | grep -F 'Pages free:' | awk '{print $3 * 4}')
+  else
+    echo $(grep -F MemAvailable: /proc/meminfo | awk '{print $2}')
+  fi
+}
+
+run_benchmark() {
+  local src_img_dir="$1"
+  local mem_per_thread="${2:-10485760}"
+
+  local output_dir="${BUILD_DIR}/benchmark_results"
+  mkdir -p "${output_dir}"
+
+  # The memory available at the beginning of the benchmark run in kB. The number
+  # of threads depends on the available memory, and the passed memory per
+  # thread. We also add a 2 GiB of constant memory.
+  local mem_available="$(get_mem_available)"
+  # Check that we actually have a MemAvailable value.
+  [[ -n "${mem_available}" ]]
+  local num_threads=$(( (${mem_available} - 1048576) / ${mem_per_thread} ))
+  if [[ ${num_threads} -le 0 ]]; then
+    num_threads=1
+  fi
+
+  local benchmark_args=(
+    --input "${src_img_dir}/*.png"
+    --codec=jpeg:yuv420:q85,webp:q80,jxl:d1:6,jxl:d1:6:downsampling=8,jxl:d5:6,jxl:d5:6:downsampling=8,jxl:m:d0:2,jxl:m:d0:3,jxl:m:d2:2
+    --output_dir "${output_dir}"
+    --noprofiler --show_progress
+    --num_threads="${num_threads}"
+  )
+  if [[ "${STORE_IMAGES}" == "1" ]]; then
+    benchmark_args+=(--save_decompressed --save_compressed)
+  fi
+  (
+    [[ "${TEST_STACK_LIMIT}" == "none" ]] || ulimit -s "${TEST_STACK_LIMIT}"
+    "${BUILD_DIR}/tools/benchmark_xl" "${benchmark_args[@]}" | \
+       tee "${output_dir}/results.txt"
+
+    # Check error code for benckmark_xl command. This will exit if not.
+    return ${PIPESTATUS[0]}
+  )
+
+  if [[ -n "${CI_BUILD_NAME:-}" ]]; then
+    { set +x; } 2>/dev/null
+    local message="Results for ${CI_BUILD_NAME} @ ${CI_COMMIT_SHORT_SHA} (job ${CI_JOB_URL:-}):
+
+$(cat "${output_dir}/results.txt")
+"
+    cmd_post_mr_comment "${message}"
+    set -x
+  fi
+}
+
+# Helper function to wait for the CPU temperature to cool down on ARM.
+wait_for_temp() {
+  { set +x; } 2>/dev/null
+  local temp_limit=${1:-38000}
+  if [[ -z "${THERMAL_FILE:-}" ]]; then
+    echo "Must define the THERMAL_FILE with the thermal_zoneX/temp file" \
+      "to read the temperature from. This is normally set in the runner." >&2
+    exit 1
+  fi
+  local org_temp=$(cat "${THERMAL_FILE}")
+  if [[ "${org_temp}" -ge "${temp_limit}" ]]; then
+    echo -n "Waiting for temp to get down from ${org_temp}... "
+  fi
+  local temp="${org_temp}"
+  local secs=0
+  while [[ "${temp}" -ge "${temp_limit}" ]]; do
+    sleep 1
+    temp=$(cat "${THERMAL_FILE}")
+    echo -n "${temp} "
+    secs=$((secs + 1))
+    if [[ ${secs} -ge 5 ]]; then
+      break
+    fi
+  done
+  if [[ "${org_temp}" -ge "${temp_limit}" ]]; then
+    echo "Done, temp=${temp}"
+  fi
+  set -x
+}
+
+# Helper function to set the cpuset restriction of the current process.
+cmd_cpuset() {
+  [[ "${SKIP_CPUSET:-}" != "1" ]] || return 0
+  local newset="$1"
+  local mycpuset=$(cat /proc/self/cpuset)
+  mycpuset="/dev/cpuset${mycpuset}"
+  # Check that the directory exists:
+  [[ -d "${mycpuset}" ]]
+  if [[ -e "${mycpuset}/cpuset.cpus" ]]; then
+    echo "${newset}" >"${mycpuset}/cpuset.cpus"
+  else
+    echo "${newset}" >"${mycpuset}/cpus"
+  fi
+}
+
+# Return the encoding/decoding speed from the Stats output.
+_speed_from_output() {
+  local speed="$1"
+  local unit="${2:-MP/s}"
+  if [[ "${speed}" == *"${unit}"* ]]; then
+    speed="${speed%% ${unit}*}"
+    speed="${speed##* }"
+    echo "${speed}"
+  fi
+}
+
+
+# Run benchmarks on ARM for the big and little CPUs.
+cmd_arm_benchmark() {
+  # Flags used for cjxl encoder with .png inputs
+  local jxl_png_benchmarks=(
+    # Lossy options:
+    "--epf=0 --distance=1.0 --speed=cheetah"
+    "--epf=2 --distance=1.0 --speed=cheetah"
+    "--epf=0 --distance=8.0 --speed=cheetah"
+    "--epf=1 --distance=8.0 --speed=cheetah"
+    "--epf=2 --distance=8.0 --speed=cheetah"
+    "--epf=3 --distance=8.0 --speed=cheetah"
+    "--modular -Q 90"
+    "--modular -Q 50"
+    # Lossless options:
+    "--modular"
+    "--modular -E 0 -I 0"
+    "--modular -P 5"
+    "--modular --responsive=1"
+    # Near-lossless options:
+    "--epf=0 --distance=0.3 --speed=fast"
+    "--modular -Q 97"
+  )
+
+  # Flags used for cjxl encoder with .jpg inputs. These should do lossless
+  # JPEG recompression (of pixels or full jpeg).
+  local jxl_jpeg_benchmarks=(
+    "--num_reps=3"
+  )
+
+  local images=(
+    "testdata/jxl/flower/flower.png"
+  )
+
+  local jpg_images=(
+    "testdata/jxl/flower/flower.png.im_q85_420.jpg"
+  )
+
+  if [[ "${SKIP_CPUSET:-}" == "1" ]]; then
+    # Use a single cpu config in this case.
+    local cpu_confs=("?")
+  else
+    # Otherwise the CPU config comes from the environment:
+    local cpu_confs=(
+      "${RUNNER_CPU_LITTLE}"
+      "${RUNNER_CPU_BIG}"
+      # The CPU description is something like 3-7, so these configurations only
+      # take the first CPU of the group.
+      "${RUNNER_CPU_LITTLE%%-*}"
+      "${RUNNER_CPU_BIG%%-*}"
+    )
+    # Check that RUNNER_CPU_ALL is defined. In the SKIP_CPUSET=1 case this will
+    # be ignored but still evaluated when calling cmd_cpuset.
+    [[ -n "${RUNNER_CPU_ALL}" ]]
+  fi
+
+  local jpg_dirname="third_party/corpora/jpeg"
+  mkdir -p "${jpg_dirname}"
+  local jpg_qualities=( 50 80 95 )
+  for src_img in "${images[@]}"; do
+    for q in "${jpg_qualities[@]}"; do
+      local jpeg_name="${jpg_dirname}/"$(basename "${src_img}" .png)"-q${q}.jpg"
+      convert -sampling-factor 1x1 -quality "${q}" \
+        "${src_img}" "${jpeg_name}"
+      jpg_images+=("${jpeg_name}")
+    done
+  done
+
+  local output_dir="${BUILD_DIR}/benchmark_results"
+  mkdir -p "${output_dir}"
+  local runs_file="${output_dir}/runs.txt"
+
+  if [[ ! -e "${runs_file}" ]]; then
+    echo -e "binary\tflags\tsrc_img\tsrc size\tsrc pixels\tcpuset\tenc size (B)\tenc speed (MP/s)\tdec speed (MP/s)\tJPG dec speed (MP/s)\tJPG dec speed (MB/s)" |
+      tee -a "${runs_file}"
+  fi
+
+  mkdir -p "${BUILD_DIR}/arm_benchmark"
+  local flags
+  local src_img
+  for src_img in "${jpg_images[@]}" "${images[@]}"; do
+    local src_img_hash=$(sha1sum "${src_img}" | cut -f 1 -d ' ')
+    local enc_binaries=("${BUILD_DIR}/tools/cjxl")
+    local src_ext="${src_img##*.}"
+    for enc_binary in "${enc_binaries[@]}"; do
+      local enc_binary_base=$(basename "${enc_binary}")
+
+      # Select the list of flags to use for the current encoder/image pair.
+      local img_benchmarks
+      if [[ "${src_ext}" == "jpg" ]]; then
+        img_benchmarks=("${jxl_jpeg_benchmarks[@]}")
+      else
+        img_benchmarks=("${jxl_png_benchmarks[@]}")
+      fi
+
+      for flags in "${img_benchmarks[@]}"; do
+        # Encoding step.
+        local enc_file_hash="${enc_binary_base} || $flags || ${src_img} || ${src_img_hash}"
+        enc_file_hash=$(echo "${enc_file_hash}" | sha1sum | cut -f 1 -d ' ')
+        local enc_file="${BUILD_DIR}/arm_benchmark/${enc_file_hash}.jxl"
+
+        for cpu_conf in "${cpu_confs[@]}"; do
+          cmd_cpuset "${cpu_conf}"
+          # nproc returns the number of active CPUs, which is given by the cpuset
+          # mask.
+          local num_threads="$(nproc)"
+
+          echo "Encoding with: ${enc_binary_base} img=${src_img} cpus=${cpu_conf} enc_flags=${flags}"
+          local enc_output
+          if [[ "${flags}" == *"modular"* ]]; then
+            # We don't benchmark encoding speed in this case.
+            if [[ ! -f "${enc_file}" ]]; then
+              cmd_cpuset "${RUNNER_CPU_ALL:-}"
+              "${enc_binary}" ${flags} "${src_img}" "${enc_file}.tmp"
+              mv "${enc_file}.tmp" "${enc_file}"
+              cmd_cpuset "${cpu_conf}"
+            fi
+            enc_output=" ?? MP/s"
+          else
+            wait_for_temp
+            enc_output=$("${enc_binary}" ${flags} "${src_img}" "${enc_file}.tmp" \
+              2>&1 | tee /dev/stderr | grep -F "MP/s [")
+            mv "${enc_file}.tmp" "${enc_file}"
+          fi
+          local enc_speed=$(_speed_from_output "${enc_output}")
+          local enc_size=$(stat -c "%s" "${enc_file}")
+
+          echo "Decoding with: img=${src_img} cpus=${cpu_conf} enc_flags=${flags}"
+
+          local dec_output
+          wait_for_temp
+          dec_output=$("${BUILD_DIR}/tools/djxl" "${enc_file}" \
+            --num_reps=5 --num_threads="${num_threads}" 2>&1 | tee /dev/stderr |
+            grep -E "M[BP]/s \[")
+          local img_size=$(echo "${dec_output}" | cut -f 1 -d ',')
+          local img_size_x=$(echo "${img_size}" | cut -f 1 -d ' ')
+          local img_size_y=$(echo "${img_size}" | cut -f 3 -d ' ')
+          local img_size_px=$(( ${img_size_x} * ${img_size_y} ))
+          local dec_speed=$(_speed_from_output "${dec_output}")
+
+          # For JPEG lossless recompression modes (where the original is a JPEG)
+          # decode to JPG as well.
+          local jpeg_dec_mps_speed=""
+          local jpeg_dec_mbs_speed=""
+          if [[ "${src_ext}" == "jpg" ]]; then
+            wait_for_temp
+            local dec_file="${BUILD_DIR}/arm_benchmark/${enc_file_hash}.jpg"
+            dec_output=$("${BUILD_DIR}/tools/djxl" "${enc_file}" \
+              "${dec_file}" --num_reps=5 --num_threads="${num_threads}" 2>&1 | \
+                tee /dev/stderr | grep -E "M[BP]/s \[")
+            local jpeg_dec_mps_speed=$(_speed_from_output "${dec_output}")
+            local jpeg_dec_mbs_speed=$(_speed_from_output "${dec_output}" MB/s)
+            if ! cmp --quiet "${src_img}" "${dec_file}"; then
+              # Add a start at the end to signal that the files are different.
+              jpeg_dec_mbs_speed+="*"
+            fi
+          fi
+
+          # Record entry in a tab-separated file.
+          local src_img_base=$(basename "${src_img}")
+          echo -e "${enc_binary_base}\t${flags}\t${src_img_base}\t${img_size}\t${img_size_px}\t${cpu_conf}\t${enc_size}\t${enc_speed}\t${dec_speed}\t${jpeg_dec_mps_speed}\t${jpeg_dec_mbs_speed}" |
+            tee -a "${runs_file}"
+        done
+      done
+    done
+  done
+  cmd_cpuset "${RUNNER_CPU_ALL:-}"
+  cat "${runs_file}"
+
+  if [[ -n "${CI_BUILD_NAME:-}" ]]; then
+    load_mr_vars_from_commit
+    { set +x; } 2>/dev/null
+    local message="Results for ${CI_BUILD_NAME} @ ${CI_COMMIT_SHORT_SHA} (job ${CI_JOB_URL:-}):
+
+\`\`\`
+$(column -t -s "	" "${runs_file}")
+\`\`\`
+"
+    cmd_post_mr_comment "${message}"
+    set -x
+  fi
+}
+
+# Generate a corpus and run the fuzzer on that corpus.
+cmd_fuzz() {
+  local corpus_dir=$(realpath "${BUILD_DIR}/fuzzer_corpus")
+  local fuzzer_crash_dir=$(realpath "${BUILD_DIR}/fuzzer_crash")
+  mkdir -p "${corpus_dir}" "${fuzzer_crash_dir}"
+  # Generate step.
+  "${BUILD_DIR}/tools/fuzzer_corpus" "${corpus_dir}"
+  # Run step:
+  local nprocs=$(nproc --all || echo 1)
+  (
+   cd "${BUILD_DIR}"
+   "tools/djxl_fuzzer" "${fuzzer_crash_dir}" "${corpus_dir}" \
+     -max_total_time="${FUZZER_MAX_TIME}" -jobs=${nprocs} \
+     -artifact_prefix="${fuzzer_crash_dir}/"
+  )
+}
+
+# Runs the linters (clang-format, build_cleaner, buildirier) on the pending CLs.
+cmd_lint() {
+  merge_request_commits
+  { set +x; } 2>/dev/null
+  local versions=(${1:-16 15 14 13 12 11 10 9 8 7 6.0})
+  local clang_format_bins=("${versions[@]/#/clang-format-}" clang-format)
+  local tmpdir=$(mktemp -d)
+  CLEANUP_FILES+=("${tmpdir}")
+
+  local ret=0
+  local build_patch="${tmpdir}/build_cleaner.patch"
+  if ! "${MYDIR}/tools/scripts/build_cleaner.py" >"${build_patch}"; then
+    ret=1
+    echo "build_cleaner.py findings:" >&2
+    "${COLORDIFF_BIN}" <"${build_patch}"
+    echo "Run \`tools/scripts/build_cleaner.py --update\` to apply them" >&2
+  fi
+
+  # It is ok, if buildifier is not installed.
+  if which buildifier >/dev/null; then
+    local buildifier_patch="${tmpdir}/buildifier.patch"
+    local bazel_files=`git -C ${MYDIR} ls-files | grep -E "/BUILD$|WORKSPACE|.bzl$"`
+    set -x
+    buildifier -d ${bazel_files} >"${buildifier_patch}"|| true
+    { set +x; } 2>/dev/null
+    if [ -s "${buildifier_patch}" ]; then
+      ret=1
+      echo 'buildifier have found some problems in Bazel build files:' >&2
+      "${COLORDIFF_BIN}" <"${buildifier_patch}"
+      echo 'To fix them run (from the base directory):' >&2
+      echo '  buildifier `git ls-files | grep -E "/BUILD$|WORKSPACE|.bzl$"`' >&2
+    fi
+  fi
+
+  local installed=()
+  local clang_patch
+  local clang_format
+  for clang_format in "${clang_format_bins[@]}"; do
+    if ! which "${clang_format}" >/dev/null; then
+      continue
+    fi
+    installed+=("${clang_format}")
+    local tmppatch="${tmpdir}/${clang_format}.patch"
+    # We include in this linter all the changes including the uncommitted changes
+    # to avoid printing changes already applied.
+    set -x
+    # Ignoring the error that git-clang-format outputs.
+    git -C "${MYDIR}" "${clang_format}" --binary "${clang_format}" \
+      --style=file --diff "${MR_ANCESTOR_SHA}" -- >"${tmppatch}" || true
+    { set +x; } 2>/dev/null
+    if grep -E '^--- ' "${tmppatch}">/dev/null; then
+      if [[ -n "${LINT_OUTPUT:-}" ]]; then
+        cp "${tmppatch}" "${LINT_OUTPUT}"
+      fi
+      clang_patch="${tmppatch}"
+    else
+      echo "clang-format check OK" >&2
+      return ${ret}
+    fi
+  done
+
+  if [[ ${#installed[@]} -eq 0 ]]; then
+    echo "You must install clang-format for \"git clang-format\"" >&2
+    exit 1
+  fi
+
+  # clang-format is installed but found problems.
+  echo "clang-format findings:" >&2
+  "${COLORDIFF_BIN}" < "${clang_patch}"
+
+  echo "clang-format found issues in your patches from ${MR_ANCESTOR_SHA}" \
+    "to the current patch. Run \`./ci.sh lint | patch -p1\` from the base" \
+    "directory to apply them." >&2
+  exit 1
+}
+
+# Runs clang-tidy on the pending CLs. If the "all" argument is passed it runs
+# clang-tidy over all the source files instead.
+cmd_tidy() {
+  local what="${1:-}"
+
+  if [[ -z "${CLANG_TIDY_BIN}" ]]; then
+    echo "ERROR: You must install clang-tidy-7 or newer to use ci.sh tidy" >&2
+    exit 1
+  fi
+
+  local git_args=()
+  if [[ "${what}" == "all" ]]; then
+    git_args=(ls-files)
+    shift
+  else
+    merge_request_commits
+    git_args=(
+        diff-tree --no-commit-id --name-only -r "${MR_ANCESTOR_SHA}"
+        "${MR_HEAD_SHA}"
+    )
+  fi
+
+  # Clang-tidy needs the compilation database generated by cmake.
+  if [[ ! -e "${BUILD_DIR}/compile_commands.json" ]]; then
+    # Generate the build options in debug mode, since we need the debug asserts
+    # enabled for the clang-tidy analyzer to use them.
+    CMAKE_BUILD_TYPE="Debug"
+    cmake_configure
+    # Build the autogen targets to generate the .h files from the .ui files.
+    local autogen_targets=(
+        $(ninja -C "${BUILD_DIR}" -t targets | grep -F _autogen: |
+          cut -f 1 -d :)
+    )
+    if [[ ${#autogen_targets[@]} != 0 ]]; then
+      ninja -C "${BUILD_DIR}" "${autogen_targets[@]}"
+    fi
+  fi
+
+  cd "${MYDIR}"
+  local nprocs=$(nproc --all || echo 1)
+  local ret=0
+  if ! parallel -j"${nprocs}" --keep-order -- \
+      "${CLANG_TIDY_BIN}" -p "${BUILD_DIR}" -format-style=file -quiet "$@" {} \
+      < <(git "${git_args[@]}" | grep -E '(\.cc|\.cpp)$') \
+      >"${BUILD_DIR}/clang-tidy.txt"; then
+    ret=1
+  fi
+  { set +x; } 2>/dev/null
+  echo "Findings statistics:" >&2
+  grep -E ' \[[A-Za-z\.,\-]+\]' -o "${BUILD_DIR}/clang-tidy.txt" | sort \
+    | uniq -c >&2
+
+  if [[ $ret -ne 0 ]]; then
+    cat >&2 <<EOF
+Errors found, see ${BUILD_DIR}/clang-tidy.txt for details.
+To automatically fix them, run:
+
+  SKIP_TEST=1 ./ci.sh debug
+  ${CLANG_TIDY_BIN} -p ${BUILD_DIR} -fix -format-style=file -quiet $@ \$(git ${git_args[@]} | grep -E '(\.cc|\.cpp)\$')
+EOF
+  fi
+
+  return ${ret}
+}
+
+# Print stats about all the packages built in ${BUILD_DIR}/debs/.
+cmd_debian_stats() {
+  { set +x; } 2>/dev/null
+  local debsdir="${BUILD_DIR}/debs"
+  local f
+  while IFS='' read -r -d '' f; do
+    echo "====================================================================="
+    echo "Package $f:"
+    dpkg --info $f
+    dpkg --contents $f
+  done < <(find "${BUILD_DIR}/debs" -maxdepth 1 -mindepth 1 -type f \
+           -name '*.deb' -print0)
+}
+
+build_debian_pkg() {
+  local srcdir="$1"
+  local srcpkg="$2"
+
+  local debsdir="${BUILD_DIR}/debs"
+  local builddir="${debsdir}/${srcpkg}"
+
+  # debuild doesn't have an easy way to build out of tree, so we make a copy
+  # of with all symlinks on the first level.
+  mkdir -p "${builddir}"
+  for f in $(find "${srcdir}" -mindepth 1 -maxdepth 1 -printf '%P\n'); do
+    if [[ ! -L "${builddir}/$f" ]]; then
+      rm -f "${builddir}/$f"
+      ln -s "${srcdir}/$f" "${builddir}/$f"
+    fi
+  done
+  (
+    cd "${builddir}"
+    debuild -b -uc -us
+  )
+}
+
+cmd_debian_build() {
+  local srcpkg="${1:-}"
+
+  case "${srcpkg}" in
+    jpeg-xl)
+      build_debian_pkg "${MYDIR}" "jpeg-xl"
+      ;;
+    highway)
+      build_debian_pkg "${MYDIR}/third_party/highway" "highway"
+      ;;
+    *)
+      echo "ERROR: Must pass a valid source package name to build." >&2
+      ;;
+  esac
+}
+
+get_version() {
+  local varname=$1
+  local line=$(grep -F "set(${varname} " lib/CMakeLists.txt | head -n 1)
+  [[ -n "${line}" ]]
+  line="${line#set(${varname} }"
+  line="${line%)}"
+  echo "${line}"
+}
+
+cmd_bump_version() {
+  local newver="${1:-}"
+
+  if ! which dch >/dev/null; then
+    echo "Missing dch\nTo install it run:\n  sudo apt install devscripts"
+    exit 1
+  fi
+
+  if [[ -z "${newver}" ]]; then
+    local major=$(get_version JPEGXL_MAJOR_VERSION)
+    local minor=$(get_version JPEGXL_MINOR_VERSION)
+    local patch=0
+    minor=$(( ${minor}  + 1))
+  else
+    local major="${newver%%.*}"
+    newver="${newver#*.}"
+    local minor="${newver%%.*}"
+    newver="${newver#${minor}}"
+    local patch="${newver#.}"
+    if [[ -z "${patch}" ]]; then
+      patch=0
+    fi
+  fi
+
+  newver="${major}.${minor}.${patch}"
+
+  echo "Bumping version to ${newver} (${major}.${minor}.${patch})"
+  sed -E \
+    -e "s/(set\\(JPEGXL_MAJOR_VERSION) [0-9]+\\)/\\1 ${major})/" \
+    -e "s/(set\\(JPEGXL_MINOR_VERSION) [0-9]+\\)/\\1 ${minor})/" \
+    -e "s/(set\\(JPEGXL_PATCH_VERSION) [0-9]+\\)/\\1 ${patch})/" \
+    -i lib/CMakeLists.txt
+  sed -E \
+    -e "s/(LIBJXL_VERSION: )[0-9\\.]+/\\1 ${major}.${minor}.${patch}/" \
+    -e "s/(LIBJXL_ABI_VERSION: )[0-9\\.]+/\\1 ${major}.${minor}/" \
+    -i .github/workflows/conformance.yml
+
+  # Update lib.gni
+  tools/scripts/build_cleaner.py --update
+
+  # Mark the previous version as "unstable".
+  DEBCHANGE_RELEASE_HEURISTIC=log dch -M --distribution unstable --release ''
+  DEBCHANGE_RELEASE_HEURISTIC=log dch -M \
+    --newversion "${newver}" \
+    "Bump JPEG XL version to ${newver}."
+}
+
+# Check that the AUTHORS file contains the email of the committer.
+cmd_authors() {
+  merge_request_commits
+  local emails
+  local names
+  readarray -t emails < <(git log --format='%ae' "${MR_ANCESTOR_SHA}..${MR_HEAD_SHA}")
+  readarray -t names < <(git log --format='%an' "${MR_ANCESTOR_SHA}..${MR_HEAD_SHA}")
+  for i in "${!names[@]}"; do
+    echo "Checking name '${names[$i]}' with email '${emails[$i]}' ..."
+    "${MYDIR}"/tools/scripts/check_author.py "${emails[$i]}" "${names[$i]}"
+  done
+}
+
+main() {
+  local cmd="${1:-}"
+  if [[ -z "${cmd}" ]]; then
+    cat >&2 <<EOF
+Use: $0 CMD
+
+Where cmd is one of:
+ opt       Build and test a Release with symbols build.
+ debug     Build and test a Debug build (NDEBUG is not defined).
+ release   Build and test a striped Release binary without debug information.
+ asan      Build and test an ASan (AddressSanitizer) build.
+ msan      Build and test an MSan (MemorySanitizer) build. Needs to have msan
+           c++ libs installed with msan_install first.
+ tsan      Build and test a TSan (ThreadSanitizer) build.
+ asanfuzz  Build and test an ASan (AddressSanitizer) build for fuzzing.
+ msanfuzz  Build and test an MSan (MemorySanitizer) build for fuzzing.
+ test      Run the tests build by opt, debug, release, asan or msan. Useful when
+           building with SKIP_TEST=1.
+ gbench    Run the Google benchmark tests.
+ fuzz      Generate the fuzzer corpus and run the fuzzer on it. Useful after
+           building with asan or msan.
+ benchmark Run the benchmark over the default corpus.
+ fast_benchmark Run the benchmark over the small corpus.
+
+ coverage  Build and run tests with coverage support. Runs coverage_report as
+           well.
+ coverage_report Generate HTML, XML and text coverage report after a coverage
+           run.
+
+ lint      Run the linter checks on the current commit or merge request.
+ tidy      Run clang-tidy on the current commit or merge request.
+ authors   Check that the last commit's author is listed in the AUTHORS file.
+
+ msan_install Install the libc++ libraries required to build in msan mode. This
+              needs to be done once.
+
+ debian_build <srcpkg> Build the given source package.
+ debian_stats  Print stats about the built packages.
+
+oss-fuzz commands:
+ ossfuzz_asan   Build the local source inside oss-fuzz docker with asan.
+ ossfuzz_msan   Build the local source inside oss-fuzz docker with msan.
+ ossfuzz_ubsan  Build the local source inside oss-fuzz docker with ubsan.
+ ossfuzz_ninja  Run ninja on the BUILD_DIR inside the oss-fuzz docker. Extra
+                parameters are passed to ninja, for example "djxl_fuzzer" will
+                only build that ninja target. Use for faster build iteration
+                after one of the ossfuzz_*san commands.
+
+You can pass some optional environment variables as well:
+ - BUILD_DIR: The output build directory (by default "$$repo/build")
+ - BUILD_TARGET: The target triplet used when cross-compiling.
+ - CMAKE_FLAGS: Convenience flag to pass both CMAKE_C_FLAGS and CMAKE_CXX_FLAGS.
+ - CMAKE_PREFIX_PATH: Installation prefixes to be searched by the find_package.
+ - ENABLE_WASM_SIMD=1: enable experimental SIMD in WASM build (only).
+ - FUZZER_MAX_TIME: "fuzz" command fuzzer running timeout in seconds.
+ - LINT_OUTPUT: Path to the output patch from the "lint" command.
+ - SKIP_CPUSET=1: Skip modifying the cpuset in the arm_benchmark.
+ - SKIP_BUILD=1: Skip the build stage, cmake configure only.
+ - SKIP_TEST=1: Skip the test stage.
+ - STORE_IMAGES=0: Makes the benchmark discard the computed images.
+ - TEST_STACK_LIMIT: Stack size limit (ulimit -s) during tests, in KiB.
+ - TEST_SELECTOR: pass additional arguments to ctest, e.g. "-R .Resample.".
+ - STACK_SIZE=1: Generate binaries with the .stack_sizes sections.
+
+These optional environment variables are forwarded to the cmake call as
+parameters:
+ - CMAKE_BUILD_TYPE
+ - CMAKE_C_FLAGS
+ - CMAKE_CXX_FLAGS
+ - CMAKE_C_COMPILER_LAUNCHER
+ - CMAKE_CXX_COMPILER_LAUNCHER
+ - CMAKE_CROSSCOMPILING_EMULATOR
+ - CMAKE_FIND_ROOT_PATH
+ - CMAKE_EXE_LINKER_FLAGS
+ - CMAKE_MAKE_PROGRAM
+ - CMAKE_MODULE_LINKER_FLAGS
+ - CMAKE_SHARED_LINKER_FLAGS
+ - CMAKE_TOOLCHAIN_FILE
+
+Example:
+  BUILD_DIR=/tmp/build $0 opt
+EOF
+    exit 1
+  fi
+
+  cmd="cmd_${cmd}"
+  shift
+  set -x
+  "${cmd}" "$@"
+}
+
+main "$@"
diff --git a/third_party/jpeg-xl/cmake/FindAtomics.cmake b/third_party/jpeg-xl/cmake/FindAtomics.cmake
new file mode 100644
index 0000000000..9a6cdc39ec
--- /dev/null
+++ b/third_party/jpeg-xl/cmake/FindAtomics.cmake
@@ -0,0 +1,53 @@
+# Original issue:
+# * https://gitlab.kitware.com/cmake/cmake/-/issues/23021#note_1098733
+#
+# For reference:
+# * https://gcc.gnu.org/wiki/Atomic/GCCMM
+#
+# riscv64 specific:
+# * https://lists.debian.org/debian-riscv/2022/01/msg00009.html
+#
+# ATOMICS_FOUND        - system has c++ atomics
+# ATOMICS_LIBRARIES    - libraries needed to use c++ atomics
+
+include(CheckCXXSourceCompiles)
+
+# RISC-V only has 32-bit and 64-bit atomic instructions. GCC is supposed
+# to convert smaller atomics to those larger ones via masking and
+# shifting like LLVM, but it’s a known bug that it does not. This means
+# anything that wants to use atomics on 1-byte or 2-byte types needs
+# -latomic, but not 4-byte or 8-byte (though it does no harm).
+set(atomic_code
+    "
+     #include <atomic>
+     #include <cstdint>
+     std::atomic<uint8_t> n8 (0); // riscv64
+     std::atomic<uint64_t> n64 (0); // armel, mipsel, powerpc
+     int main() {
+       ++n8;
+       ++n64;
+       return 0;
+     }")
+
+check_cxx_source_compiles("${atomic_code}" ATOMICS_LOCK_FREE_INSTRUCTIONS)
+
+if(ATOMICS_LOCK_FREE_INSTRUCTIONS)
+  set(ATOMICS_FOUND TRUE)
+  set(ATOMICS_LIBRARIES)
+else()
+  set(CMAKE_REQUIRED_LIBRARIES "-latomic")
+  check_cxx_source_compiles("${atomic_code}" ATOMICS_IN_LIBRARY)
+  set(CMAKE_REQUIRED_LIBRARIES)
+  if(ATOMICS_IN_LIBRARY)
+    set(ATOMICS_LIBRARY atomic)
+    include(FindPackageHandleStandardArgs)
+    find_package_handle_standard_args(Atomics DEFAULT_MSG ATOMICS_LIBRARY)
+    set(ATOMICS_LIBRARIES ${ATOMICS_LIBRARY})
+    unset(ATOMICS_LIBRARY)
+  else()
+    if(Atomics_FIND_REQUIRED)
+      message(FATAL_ERROR "Neither lock free instructions nor -latomic found.")
+    endif()
+  endif()
+endif()
+unset(atomic_code)
diff --git a/third_party/jpeg-xl/cmake/FindBrotli.cmake b/third_party/jpeg-xl/cmake/FindBrotli.cmake
new file mode 100644
index 0000000000..9fb78e47d8
--- /dev/null
+++ b/third_party/jpeg-xl/cmake/FindBrotli.cmake
@@ -0,0 +1,75 @@
+# Copyright (c) the JPEG XL Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+set(brlibs brotlicommon brotlienc brotlidec)
+
+find_package(PkgConfig QUIET)
+if (PkgConfig_FOUND)
+  foreach(brlib IN ITEMS ${brlibs})
+    string(TOUPPER "${brlib}" BRPREFIX)
+    pkg_check_modules("PC_${BRPREFIX}" lib${brlib})
+  endforeach()
+endif()
+
+find_path(BROTLI_INCLUDE_DIR
+  NAMES brotli/decode.h
+  HINTS ${PC_BROTLICOMMON_INCLUDEDIR} ${PC_BROTLICOMMON_INCLUDE_DIRS}
+)
+
+foreach(brlib IN ITEMS ${brlibs})
+  string(TOUPPER "${brlib}" BRPREFIX)
+  find_library(${BRPREFIX}_LIBRARY
+    NAMES ${${BRPREFIX}_NAMES} ${brlib}
+    HINTS ${PC_${BRPREFIX}_LIBDIR} ${PC_${BRPREFIX}_LIBRARY_DIRS}
+  )
+
+  if (${BRPREFIX}_LIBRARY AND NOT TARGET ${brlib})
+    if(CMAKE_VERSION VERSION_LESS "3.13.5")
+    add_library(${brlib} INTERFACE IMPORTED GLOBAL)
+      set_property(TARGET ${brlib} PROPERTY INTERFACE_INCLUDE_DIRECTORIES ${BROTLI_INCLUDE_DIR})
+      target_link_libraries(${brlib} INTERFACE ${${BRPREFIX}_LIBRARY})
+      set_property(TARGET ${brlib} PROPERTY INTERFACE_COMPILE_OPTIONS ${PC_${BRPREFIX}_CFLAGS_OTHER})
+    else()
+    add_library(${brlib} INTERFACE IMPORTED GLOBAL)
+      target_include_directories(${brlib}
+        INTERFACE ${BROTLI_INCLUDE_DIR})
+      target_link_libraries(${brlib}
+        INTERFACE ${${BRPREFIX}_LIBRARY})
+      target_link_options(${brlib}
+        INTERFACE ${PC_${BRPREFIX}_LDFLAGS_OTHER})
+      target_compile_options(${brlib}
+        INTERFACE ${PC_${BRPREFIX}_CFLAGS_OTHER})
+    endif()
+  endif()
+endforeach()
+
+if (BROTLICOMMON_FOUND AND BROTLIENC_FOUND AND BROTLIDEC_FOUND)
+  set(Brotli_FOUND ON)
+else ()
+  set(Brotli_FOUND OFF)
+endif()
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(Brotli
+  FOUND_VAR Brotli_FOUND
+  REQUIRED_VARS
+    BROTLI_INCLUDE_DIR
+    BROTLICOMMON_LIBRARY
+    BROTLIENC_LIBRARY
+    BROTLIDEC_LIBRARY
+  VERSION_VAR Brotli_VERSION
+)
+
+mark_as_advanced(
+  BROTLI_INCLUDE_DIR
+  BROTLICOMMON_LIBRARY
+  BROTLIENC_LIBRARY
+  BROTLIDEC_LIBRARY
+)
+
+if (Brotli_FOUND)
+  set(Brotli_LIBRARIES ${BROTLICOMMON_LIBRARY} ${BROTLIENC_LIBRARY} ${BROTLIDEC_LIBRARY})
+  set(Brotli_INCLUDE_DIRS ${BROTLI_INCLUDE_DIR})
+endif()
diff --git a/third_party/jpeg-xl/cmake/FindHWY.cmake b/third_party/jpeg-xl/cmake/FindHWY.cmake
new file mode 100644
index 0000000000..c1deb9b851
--- /dev/null
+++ b/third_party/jpeg-xl/cmake/FindHWY.cmake
@@ -0,0 +1,66 @@
+# Copyright (c) the JPEG XL Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+find_package(PkgConfig QUIET)
+if (PkgConfig_FOUND)
+  pkg_check_modules(PC_HWY QUIET libhwy)
+  set(HWY_VERSION ${PC_HWY_VERSION})
+endif ()
+
+find_path(HWY_INCLUDE_DIR
+  NAMES hwy/highway.h
+  HINTS ${PC_HWY_INCLUDEDIR} ${PC_HWY_INCLUDE_DIRS}
+)
+
+find_library(HWY_LIBRARY
+  NAMES ${HWY_NAMES} hwy
+  HINTS ${PC_HWY_LIBDIR} ${PC_HWY_LIBRARY_DIRS}
+)
+
+if (HWY_INCLUDE_DIR AND NOT HWY_VERSION)
+  if (EXISTS "${HWY_INCLUDE_DIR}/hwy/highway.h")
+    file(READ "${HWY_INCLUDE_DIR}/hwy/highway.h" HWY_VERSION_CONTENT)
+
+    string(REGEX MATCH "#define HWY_MAJOR +([0-9]+)" _dummy "${HWY_VERSION_CONTENT}")
+    set(HWY_VERSION_MAJOR "${CMAKE_MATCH_1}")
+
+    string(REGEX MATCH "#define +HWY_MINOR +([0-9]+)" _dummy "${HWY_VERSION_CONTENT}")
+    set(HWY_VERSION_MINOR "${CMAKE_MATCH_1}")
+
+    string(REGEX MATCH "#define +HWY_PATCH +([0-9]+)" _dummy "${HWY_VERSION_CONTENT}")
+    set(HWY_VERSION_PATCH "${CMAKE_MATCH_1}")
+
+    set(HWY_VERSION "${HWY_VERSION_MAJOR}.${HWY_VERSION_MINOR}.${HWY_VERSION_PATCH}")
+  endif ()
+endif ()
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(HWY
+  FOUND_VAR HWY_FOUND
+  REQUIRED_VARS HWY_LIBRARY HWY_INCLUDE_DIR
+  VERSION_VAR HWY_VERSION
+)
+
+if (HWY_LIBRARY AND NOT TARGET hwy)
+  add_library(hwy INTERFACE IMPORTED GLOBAL)
+
+  if(CMAKE_VERSION VERSION_LESS "3.13.5")
+    set_property(TARGET hwy PROPERTY INTERFACE_INCLUDE_DIRECTORIES ${HWY_INCLUDE_DIR})
+    target_link_libraries(hwy INTERFACE ${HWY_LIBRARY})
+    set_property(TARGET hwy PROPERTY INTERFACE_COMPILE_OPTIONS ${PC_HWY_CFLAGS_OTHER})
+  else()
+    target_include_directories(hwy INTERFACE ${HWY_INCLUDE_DIR})
+    target_link_libraries(hwy INTERFACE ${HWY_LIBRARY})
+    target_link_options(hwy INTERFACE ${PC_HWY_LDFLAGS_OTHER})
+    target_compile_options(hwy INTERFACE ${PC_HWY_CFLAGS_OTHER})
+  endif()
+endif()
+
+mark_as_advanced(HWY_INCLUDE_DIR HWY_LIBRARY)
+
+if (HWY_FOUND)
+    set(HWY_LIBRARIES ${HWY_LIBRARY})
+    set(HWY_INCLUDE_DIRS ${HWY_INCLUDE_DIR})
+endif ()
diff --git a/third_party/jpeg-xl/cmake/FindLCMS2.cmake b/third_party/jpeg-xl/cmake/FindLCMS2.cmake
new file mode 100644
index 0000000000..0a7b54eb96
--- /dev/null
+++ b/third_party/jpeg-xl/cmake/FindLCMS2.cmake
@@ -0,0 +1,59 @@
+# Copyright (c) the JPEG XL Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+find_package(PkgConfig QUIET)
+if (PkgConfig_FOUND)
+  pkg_check_modules(PC_LCMS2 QUIET libLCMS2)
+  set(LCMS2_VERSION ${PC_LCMS2_VERSION})
+endif ()
+
+find_path(LCMS2_INCLUDE_DIR
+  NAMES lcms2.h
+  HINTS ${PC_LCMS2_INCLUDEDIR} ${PC_LCMS2_INCLUDE_DIRS}
+)
+
+find_library(LCMS2_LIBRARY
+  NAMES ${LCMS2_NAMES} lcms2 liblcms2 lcms-2 liblcms-2
+  HINTS ${PC_LCMS2_LIBDIR} ${PC_LCMS2_LIBRARY_DIRS}
+)
+
+if (LCMS2_INCLUDE_DIR AND NOT LCMS_VERSION)
+    file(READ ${LCMS2_INCLUDE_DIR}/lcms2.h LCMS2_VERSION_CONTENT)
+    string(REGEX MATCH "#define[ \t]+LCMS_VERSION[ \t]+([0-9]+)[ \t]*\n" LCMS2_VERSION_MATCH ${LCMS2_VERSION_CONTENT})
+    if (LCMS2_VERSION_MATCH)
+        string(SUBSTRING ${CMAKE_MATCH_1} 0 1 LCMS2_VERSION_MAJOR)
+        string(SUBSTRING ${CMAKE_MATCH_1} 1 2 LCMS2_VERSION_MINOR)
+        set(LCMS2_VERSION "${LCMS2_VERSION_MAJOR}.${LCMS2_VERSION_MINOR}")
+    endif ()
+endif ()
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(LCMS2
+  FOUND_VAR LCMS2_FOUND
+  REQUIRED_VARS LCMS2_LIBRARY LCMS2_INCLUDE_DIR
+  VERSION_VAR LCMS2_VERSION
+)
+
+if (LCMS2_LIBRARY AND NOT TARGET lcms2)
+  add_library(lcms2 INTERFACE IMPORTED GLOBAL)
+
+  if(CMAKE_VERSION VERSION_LESS "3.13.5")
+    set_property(TARGET lcms2 PROPERTY INTERFACE_INCLUDE_DIRECTORIES ${LCMS2_INCLUDE_DIR})
+    target_link_libraries(lcms2 INTERFACE ${LCMS2_LIBRARY})
+    set_property(TARGET lcms2 PROPERTY INTERFACE_COMPILE_OPTIONS ${PC_LCMS2_CFLAGS_OTHER})
+  else()
+    target_include_directories(lcms2 INTERFACE ${LCMS2_INCLUDE_DIR})
+    target_link_libraries(lcms2 INTERFACE ${LCMS2_LIBRARY})
+    target_link_options(lcms2 INTERFACE ${PC_LCMS2_LDFLAGS_OTHER})
+    target_compile_options(lcms2 INTERFACE ${PC_LCMS2_CFLAGS_OTHER})
+  endif()
+endif()
+
+mark_as_advanced(LCMS2_INCLUDE_DIR LCMS2_LIBRARY)
+
+if (LCMS2_FOUND)
+    set(LCMS2_LIBRARIES ${LCMS2_LIBRARY})
+    set(LCMS2_INCLUDE_DIRS ${LCMS2_INCLUDE_DIR})
+endif ()
diff --git a/third_party/jpeg-xl/debian/changelog b/third_party/jpeg-xl/debian/changelog
new file mode 100644
index 0000000000..6fbaddf68a
--- /dev/null
+++ b/third_party/jpeg-xl/debian/changelog
@@ -0,0 +1,95 @@
+jpeg-xl (0.9.0) UNRELEASED; urgency=medium
+
+  * Bump JPEG XL version to 0.9.0.
+
+ -- JPEG XL Maintainers <jpegxl@google.com>  Wed, 11 Jan 2023 16:12:35 +0000
+
+jpeg-xl (0.8) unstable; urgency=medium
+
+  * Bump JPEG XL version to 0.8.
+
+ -- JPEG XL Maintainers <jpegxl@google.com>  Wed, 11 Jan 2023 16:12:34 +0000
+
+jpeg-xl (0.7) unstable; urgency=medium
+
+  * Bump JPEG XL version to 0.7.
+
+ -- JPEG XL Maintainers <jpegxl@google.com>  Mon, 08 Aug 2022 14:43:58 +0000
+
+jpeg-xl (0.6) unstable; urgency=medium
+
+  * Bump JPEG XL version to 0.6.
+
+ -- JPEG XL Maintainers <jpegxl@google.com>  Fri, 10 Sep 2021 16:08:17 +0200
+
+jpeg-xl (0.5.0) unstable; urgency=medium
+
+  * Bump JPEG XL version to 0.5.0.
+
+ -- JPEG XL Maintainers <jpegxl@google.com>  Thu, 12 Aug 2021 23:49:40 +0200
+
+jpeg-xl (0.3.7) UNRELEASED; urgency=medium
+
+  * Bump JPEG XL version to 0.3.7.
+
+ -- Sami Boukortt <sboukortt@google.com>  Mon, 29 Mar 2021 12:14:20 +0200
+
+jpeg-xl (0.3.6) UNRELEASED; urgency=medium
+
+  * Bump JPEG XL version to 0.3.6.
+
+ -- Sami Boukortt <sboukortt@google.com>  Thu, 25 Mar 2021 17:40:58 +0100
+
+jpeg-xl (0.3.5) UNRELEASED; urgency=medium
+
+  * Bump JPEG XL version to 0.3.5.
+
+ -- Sami Boukortt <sboukortt@google.com>  Tue, 23 Mar 2021 15:20:44 +0100
+
+jpeg-xl (0.3.4) UNRELEASED; urgency=medium
+
+  * Bump JPEG XL version to 0.3.4.
+
+ -- Sami Boukortt <sboukortt@google.com>  Tue, 16 Mar 2021 12:13:59 +0100
+
+jpeg-xl (0.3.3) UNRELEASED; urgency=medium
+
+  * Bump JPEG XL version to 0.3.3.
+
+ -- Sami Boukortt <sboukortt@google.com>  Fri, 5 Mar 2021 19:15:26 +0100
+
+jpeg-xl (0.3.2) UNRELEASED; urgency=medium
+
+  * Bump JPEG XL version to 0.3.2.
+
+ -- Alex Deymo <deymo@google.com>  Fri, 12 Feb 2021 21:00:12 +0100
+
+jpeg-xl (0.3.1) UNRELEASED; urgency=medium
+
+  * Bump JPEG XL version to 0.3.1.
+
+ -- Alex Deymo <deymo@google.com>  Tue, 09 Feb 2021 09:48:43 +0100
+
+jpeg-xl (0.3) UNRELEASED; urgency=medium
+
+  * Bump JPEG XL version to 0.3.
+
+ -- Alex Deymo <deymo@google.com>  Wed, 27 Jan 2021 22:36:32 +0100
+
+jpeg-xl (0.2) UNRELEASED; urgency=medium
+
+  * Bump JPEG XL version to 0.2.
+
+ -- Alex Deymo <deymo@google.com>  Wed, 23 Nov 2020 20:42:10 +0100
+
+jpeg-xl (0.1) UNRELEASED; urgency=medium
+
+  * JPEG XL format release candidate.
+
+ -- Alex Deymo <deymo@google.com>  Fri, 13 Nov 2020 17:42:24 +0100
+
+jpeg-xl (0.0.2-1) UNRELEASED; urgency=medium
+
+  * Initial debian package.
+
+ -- Alex Deymo <deymo@google.com>  Tue, 27 Oct 2020 15:27:59 +0100
diff --git a/third_party/jpeg-xl/debian/compat b/third_party/jpeg-xl/debian/compat
new file mode 100644
index 0000000000..f599e28b8a
--- /dev/null
+++ b/third_party/jpeg-xl/debian/compat
@@ -0,0 +1 @@
+10
diff --git a/third_party/jpeg-xl/debian/control b/third_party/jpeg-xl/debian/control
new file mode 100644
index 0000000000..f5dc5ce0cc
--- /dev/null
+++ b/third_party/jpeg-xl/debian/control
@@ -0,0 +1,88 @@
+Source: jpeg-xl
+Maintainer: JPEG XL Maintainers <jpegxl@google.com>
+Section: misc
+Priority: optional
+Standards-Version: 3.9.8
+Build-Depends:
+ asciidoc,
+ cmake,
+ debhelper (>= 9),
+ libbrotli-dev,
+ libgdk-pixbuf-2.0-dev | libgdk-pixbuf2.0-dev,
+ libgif-dev,
+ libgimp2.0-dev,
+ libgmock-dev,
+ libgoogle-perftools-dev,
+ libgtest-dev,
+ libhwy-dev (>= 1.0.0),
+ libjpeg-dev,
+ libopenexr-dev,
+ libpng-dev,
+ libwebp-dev,
+ pkg-config,
+ xdg-utils,
+ xmlto,
+Homepage: https://github.com/libjxl/libjxl
+Rules-Requires-Root: no
+
+Package: jxl
+Architecture: any
+Section: utils
+Depends: ${misc:Depends}, ${shlibs:Depends}
+Description: JPEG XL Image Coding System - "JXL" (command line utility)
+ The JPEG XL Image Coding System (ISO/IEC 18181) is a lossy and
+ lossless image compression format. It has a rich feature set and is
+ particularly optimized for responsive web environments, so that
+ content renders well on a wide range of devices. Moreover, it includes
+ several features that help transition from the legacy JPEG format.
+ .
+ This package installs the command line utilities.
+
+Package: libjxl-dev
+Architecture: any
+Section: libdevel
+Depends: libjxl (= ${binary:Version}), ${misc:Depends}
+ libhwy-dev,
+Description: JPEG XL Image Coding System - "JXL" (development files)
+ The JPEG XL Image Coding System (ISO/IEC 18181) is a lossy and
+ lossless image compression format. It has a rich feature set and is
+ particularly optimized for responsive web environments, so that
+ content renders well on a wide range of devices. Moreover, it includes
+ several features that help transition from the legacy JPEG format.
+ .
+ This package installs development files.
+
+Package: libjxl
+Architecture: any
+Multi-Arch: same
+Section: libs
+Depends: ${shlibs:Depends}, ${misc:Depends}
+Pre-Depends: ${misc:Pre-Depends}
+Description: JPEG XL Image Coding System - "JXL" (shared libraries)
+ The JPEG XL Image Coding System (ISO/IEC 18181) is a lossy and
+ lossless image compression format. It has a rich feature set and is
+ particularly optimized for responsive web environments, so that
+ content renders well on a wide range of devices. Moreover, it includes
+ several features that help transition from the legacy JPEG format.
+ .
+ This package installs shared libraries.
+
+Package: libjxl-gdk-pixbuf
+Architecture: any
+Multi-Arch: same
+Section: libs
+Depends: ${shlibs:Depends}, ${misc:Depends}
+Pre-Depends: ${misc:Pre-Depends}
+Description: JPEG XL Plugin for gdk-pixbuf
+ This package installs the required files for reading JPEG XL files in
+ GTK applications.
+
+Package: libjxl-gimp-plugin
+Architecture: any
+Multi-Arch: same
+Section: graphics
+Depends: ${shlibs:Depends}, ${misc:Depends}
+Pre-Depends: ${misc:Pre-Depends}
+Enhances: gimp
+Description: JPEG XL Import and Export Plugin for GIMP
+ This is a plugin for GIMP version 2.10.x to import and export JPEG XL images.
diff --git a/third_party/jpeg-xl/debian/copyright b/third_party/jpeg-xl/debian/copyright
new file mode 100644
index 0000000000..20225a9209
--- /dev/null
+++ b/third_party/jpeg-xl/debian/copyright
@@ -0,0 +1,194 @@
+Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
+Upstream-Name: jpeg-xl
+
+Files: *
+Copyright: 2020 the JPEG XL Project
+License: BSD-3-clause
+
+Files: third_party/sjpeg/*
+Copyright: 2017 Google, Inc
+License: Apache-2.0
+
+Files: third_party/skcms/*
+Copyright: 2018 Google Inc.
+License: BSD-3-clause
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ .
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above
+ copyright notice, this list of conditions and the following disclaimer
+ in the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Google Inc. nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+ .
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+Files: testdata/external/pngsuite/*
+Copyright: Willem van Schaik, 1996, 2011
+License: PngSuite License
+ See http://www.schaik.com/pngsuite/ for details.
+ .
+ Permission to use, copy, modify and distribute these images for any
+ purpose and without fee is hereby granted.
+
+Files: testdata/external/raw.pixls/*
+Copyright: their respective owners listed in https://raw.pixls.us/
+License: CC0-1.0
+
+Files: testdata/external/wesaturate/*
+Copyright: their respective owners listed in https://www.wesaturate.com/
+License: CC0-1.0
+
+Files: testdata/external/wide-gamut-tests/
+Copyright: github.com/codelogic/wide-gamut-tests authors.
+License: Apache-2.0
+
+License: Apache-2.0
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ .
+      http://www.apache.org/licenses/LICENSE-2.0
+ .
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ .
+ On Debian systems, the complete text of the Apache License, Version 2
+ can be found in "/usr/share/common-licenses/Apache-2.0".
+
+License: CC0
+ Creative Commons Zero v1.0 Universal
+ .
+ CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE LEGAL
+ SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN ATTORNEY-CLIENT
+ RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS INFORMATION ON AN "AS-IS"
+ BASIS. CREATIVE COMMONS MAKES NO WARRANTIES REGARDING THE USE OF THIS
+ DOCUMENT OR THE INFORMATION OR WORKS PROVIDED HEREUNDER, AND DISCLAIMS
+ LIABILITY FOR DAMAGES RESULTING FROM THE USE OF THIS DOCUMENT OR THE
+ INFORMATION OR WORKS PROVIDED HEREUNDER.
+ .
+ Statement of Purpose
+ .
+ The laws of most jurisdictions throughout the world automatically confer
+ exclusive Copyright and Related Rights (defined below) upon the creator and
+ subsequent owner(s) (each and all, an "owner") of an original work of
+ authorship and/or a database (each, a "Work").
+ .
+ Certain owners wish to permanently relinquish those rights to a Work for the
+ purpose of contributing to a commons of creative, cultural and scientific
+ works ("Commons") that the public can reliably and without fear of later
+ claims of infringement build upon, modify, incorporate in other works, reuse
+ and redistribute as freely as possible in any form whatsoever and for any
+ purposes, including without limitation commercial purposes. These owners may
+ contribute to the Commons to promote the ideal of a free culture and the
+ further production of creative, cultural and scientific works, or to gain
+ reputation or greater distribution for their Work in part through the use
+ and efforts of others.
+ .
+ For these and/or other purposes and motivations, and without any expectation
+ of additional consideration or compensation, the person associating CC0 with
+ a Work (the "Affirmer"), to the extent that he or she is an owner of
+ Copyright and Related Rights in the Work, voluntarily elects to apply CC0 to
+ the Work and publicly distribute the Work under its terms, with knowledge of
+ his or her Copyright and Related Rights in the Work and the meaning and
+ intended legal effect of CC0 on those rights.
+ .
+ 1. Copyright and Related Rights. A Work made available under CC0 may be
+ protected by copyright and related or neighboring rights ("Copyright and
+ Related Rights"). Copyright and Related Rights include, but are not limited
+ to, the following:
+   i. the right to reproduce, adapt, distribute, perform, display,
+ communicate, and translate a Work;
+   ii. moral rights retained by the original author(s) and/or performer(s);
+   iii. publicity and privacy rights pertaining to a person's image or
+ likeness depicted in a Work;
+   iv. rights protecting against unfair competition in regards to a Work,
+ subject to the limitations in paragraph 4(a), below;
+   v. rights protecting the extraction, dissemination, use and reuse of data
+ in a Work;
+   vi. database rights (such as those arising under Directive 96/9/EC of the
+ European Parliament and of the Council of 11 March 1996 on the legal
+ protection of databases, and under any national implementation thereof,
+ including any amended or successor version of such directive); and
+   vii. other similar, equivalent or corresponding rights throughout the
+ world based on applicable law or treaty, and any national implementations
+ thereof.
+ .
+ 2. Waiver. To the greatest extent permitted by, but not in contravention of,
+ applicable law, Affirmer hereby overtly, fully, permanently, irrevocably and
+ unconditionally waives, abandons, and surrenders all of Affirmer's Copyright
+ and Related Rights and associated claims and causes of action, whether now
+ known or unknown (including existing as well as future claims and causes of
+ action), in the Work (i) in all territories worldwide, (ii) for the maximum
+ duration provided by applicable law or treaty (including future time
+ extensions), (iii) in any current or future medium and for any number of
+ copies, and (iv) for any purpose whatsoever, including without limitation
+ commercial, advertising or promotional purposes (the "Waiver"). Affirmer
+ makes the Waiver for the benefit of each member of the public at large and
+ to the detriment of Affirmer's heirs and successors, fully intending that
+ such Waiver shall not be subject to revocation, rescission, cancellation,
+ termination, or any other legal or equitable action to disrupt the quiet
+ enjoyment of the Work by the public as contemplated by Affirmer's express
+ Statement of Purpose.
+ .
+ 3. Public License Fallback. Should any part of the Waiver for any reason be
+ judged legally invalid or ineffective under applicable law, then the Waiver
+ shall be preserved to the maximum extent permitted taking into account
+ Affirmer's express Statement of Purpose. In addition, to the extent the
+ Waiver is so judged Affirmer hereby grants to each affected person a
+ royalty-free, non transferable, non sublicensable, non exclusive,
+ irrevocable and unconditional license to exercise Affirmer's Copyright and
+ Related Rights in the Work (i) in all territories worldwide, (ii) for the
+ maximum duration provided by applicable law or treaty (including future time
+ extensions), (iii) in any current or future medium and for any number of
+ copies, and (iv) for any purpose whatsoever, including without limitation
+ commercial, advertising or promotional purposes (the "License"). The License
+ shall be deemed effective as of the date CC0 was applied by Affirmer to the
+ Work. Should any part of the License for any reason be judged legally
+ invalid or ineffective under applicable law, such partial invalidity or
+ ineffectiveness shall not invalidate the remainder of the License, and in
+ such case Affirmer hereby affirms that he or she will not (i) exercise any
+ of his or her remaining Copyright and Related Rights in the Work or (ii)
+ assert any associated claims and causes of action with respect to the Work,
+ in either case contrary to Affirmer's express Statement of Purpose.
+ .
+ 4. Limitations and Disclaimers.
+   a. No trademark or patent rights held by Affirmer are waived, abandoned,
+ surrendered, licensed or otherwise affected by this document.
+   b. Affirmer offers the Work as-is and makes no representations or
+ warranties of any kind concerning the Work, express, implied, statutory or
+ otherwise, including without limitation warranties of title,
+ merchantability, fitness for a particular purpose, non infringement, or the
+ absence of latent or other defects, accuracy, or the present or absence of
+ errors, whether or not discoverable, all to the greatest extent permissible
+ under applicable law.
+   c. Affirmer disclaims responsibility for clearing rights of other persons
+ that may apply to the Work or any use thereof, including without limitation
+ any person's Copyright and Related Rights in the Work. Further, Affirmer
+ disclaims responsibility for obtaining any necessary consents, permissions
+ or other rights required for any use of the Work.
+   d. Affirmer understands and acknowledges that Creative Commons is not a
+ party to this document and has no duty or obligation with respect to this
+ CC0 or use of the Work.
+ .
+ For more information, please see:
+ http://creativecommons.org/publicdomain/zero/1.0/>
+
diff --git a/third_party/jpeg-xl/debian/jxl.install b/third_party/jpeg-xl/debian/jxl.install
new file mode 100644
index 0000000000..c3bae3ed10
--- /dev/null
+++ b/third_party/jpeg-xl/debian/jxl.install
@@ -0,0 +1,3 @@
+usr/bin/*
+usr/share/man/man1/cjxl.1
+usr/share/man/man1/djxl.1
diff --git a/third_party/jpeg-xl/debian/libjxl-dev.install b/third_party/jpeg-xl/debian/libjxl-dev.install
new file mode 100644
index 0000000000..b735ec2c26
--- /dev/null
+++ b/third_party/jpeg-xl/debian/libjxl-dev.install
@@ -0,0 +1,4 @@
+usr/include/jxl/*.h
+usr/lib/*/*.a
+usr/lib/*/*.so
+usr/lib/*/pkgconfig/*.pc
diff --git a/third_party/jpeg-xl/debian/libjxl-gdk-pixbuf.install b/third_party/jpeg-xl/debian/libjxl-gdk-pixbuf.install
new file mode 100644
index 0000000000..12d2ab250f
--- /dev/null
+++ b/third_party/jpeg-xl/debian/libjxl-gdk-pixbuf.install
@@ -0,0 +1,3 @@
+usr/lib/*/gdk-pixbuf-*/*/loaders/*
+usr/share/mime/packages/image-jxl.xml
+usr/share/thumbnailers/jxl.thumbnailer
diff --git a/third_party/jpeg-xl/debian/libjxl-gimp-plugin.install b/third_party/jpeg-xl/debian/libjxl-gimp-plugin.install
new file mode 100644
index 0000000000..353431dba3
--- /dev/null
+++ b/third_party/jpeg-xl/debian/libjxl-gimp-plugin.install
@@ -0,0 +1 @@
+usr/lib/gimp
diff --git a/third_party/jpeg-xl/debian/libjxl.install b/third_party/jpeg-xl/debian/libjxl.install
new file mode 100644
index 0000000000..cd157a7a5c
--- /dev/null
+++ b/third_party/jpeg-xl/debian/libjxl.install
@@ -0,0 +1 @@
+usr/lib/*/libjxl*.so.*
diff --git a/third_party/jpeg-xl/debian/rules b/third_party/jpeg-xl/debian/rules
new file mode 100755
index 0000000000..6259dbfc61
--- /dev/null
+++ b/third_party/jpeg-xl/debian/rules
@@ -0,0 +1,21 @@
+#!/usr/bin/make -f
+
+include /usr/share/dpkg/pkg-info.mk
+
+%:
+	dh $@ --buildsystem=cmake
+
+override_dh_auto_configure:
+	# TODO(deymo): Remove the DCMAKE_BUILD_TYPE once builds without NDEBUG
+	# are as useful as Release builds.
+        # TODO(szabadka) Re-enable jpegli after tests are fixed on Ubuntu 20.04,
+        # and debian:buster
+	dh_auto_configure -- \
+	  -DJPEGXL_VERSION=$(DEB_VERSION) \
+	  -DCMAKE_BUILD_TYPE=RelWithDebInfo \
+	  -DJPEGXL_FORCE_SYSTEM_GTEST=ON \
+	  -DJPEGXL_FORCE_SYSTEM_BROTLI=ON \
+	  -DJPEGXL_FORCE_SYSTEM_HWY=ON \
+	  -DJPEGXL_ENABLE_JPEGLI=OFF \
+	  -DJPEGXL_ENABLE_JPEGLI_LIBJPEG=OFF \
+	  -DJPEGXL_ENABLE_PLUGINS=ON
diff --git a/third_party/jpeg-xl/debian/source/format b/third_party/jpeg-xl/debian/source/format
new file mode 100644
index 0000000000..163aaf8d82
--- /dev/null
+++ b/third_party/jpeg-xl/debian/source/format
@@ -0,0 +1 @@
+3.0 (quilt)
diff --git a/third_party/jpeg-xl/deps.sh b/third_party/jpeg-xl/deps.sh
new file mode 100755
index 0000000000..b6555e30ca
--- /dev/null
+++ b/third_party/jpeg-xl/deps.sh
@@ -0,0 +1,84 @@
+#!/usr/bin/env bash
+# Copyright (c) the JPEG XL Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+# This file downloads the dependencies needed to build JPEG XL into third_party.
+# These dependencies are normally pulled by gtest.
+
+set -eu
+
+MYDIR=$(dirname $(realpath "$0"))
+
+# Git revisions we use for the given submodules. Update these whenever you
+# update a git submodule.
+THIRD_PARTY_BROTLI="36533a866ed1ca4b75cf049f4521e4ec5fe24727"
+THIRD_PARTY_HIGHWAY="46e365d6770f5d7a4240d8ac9d8e928a520478ea"
+THIRD_PARTY_SKCMS="b25b07b4b07990811de121c0356155b2ba0f4318"
+THIRD_PARTY_SJPEG="868ab558fad70fcbe8863ba4e85179eeb81cc840"
+THIRD_PARTY_ZLIB="cacf7f1d4e3d44d871b605da3b647f07d718623f"
+THIRD_PARTY_LIBPNG="a40189cf881e9f0db80511c382292a5604c3c3d1"
+
+# Download the target revision from GitHub.
+download_github() {
+  local path="$1"
+  local project="$2"
+
+  local varname=`echo "$path" | tr '[:lower:]' '[:upper:]'`
+  varname="${varname/\//_}"
+  local sha
+  eval "sha=\${${varname}}"
+
+  local down_dir="${MYDIR}/downloads"
+  local local_fn="${down_dir}/${sha}.tar.gz"
+  if [[ -e "${local_fn}" && -d "${MYDIR}/${path}" ]]; then
+    echo "${path} already up to date." >&2
+    return 0
+  fi
+
+  local url
+  local strip_components=0
+  if [[ "${project:0:4}" == "http" ]]; then
+    # "project" is a googlesource.com base url.
+    url="${project}${sha}.tar.gz"
+  else
+    # GitHub files have a top-level directory
+    strip_components=1
+    url="https://github.com/${project}/tarball/${sha}"
+  fi
+
+  echo "Downloading ${path} version ${sha}..." >&2
+  mkdir -p "${down_dir}"
+  curl -L --show-error -o "${local_fn}.tmp" "${url}"
+  mkdir -p "${MYDIR}/${path}"
+  tar -zxf "${local_fn}.tmp" -C "${MYDIR}/${path}" \
+    --strip-components="${strip_components}"
+  mv "${local_fn}.tmp" "${local_fn}"
+}
+
+
+main() {
+  if git -C "${MYDIR}" rev-parse; then
+    cat >&2 <<EOF
+Current directory is a git repository, downloading dependencies via git:
+
+  git submodule update --init --recursive
+
+EOF
+    git -C "${MYDIR}" submodule update --init --recursive --depth 1 --recommend-shallow
+    return 0
+  fi
+
+  # Sources downloaded from a tarball.
+  download_github third_party/brotli google/brotli
+  download_github third_party/highway google/highway
+  download_github third_party/sjpeg webmproject/sjpeg
+  download_github third_party/skcms \
+    "https://skia.googlesource.com/skcms/+archive/"
+  download_github third_party/zlib madler/zlib
+  download_github third_party/libpng glennrp/libpng
+  echo "Done."
+}
+
+main "$@"
diff --git a/third_party/jpeg-xl/docker/Dockerfile.jpegxl-builder b/third_party/jpeg-xl/docker/Dockerfile.jpegxl-builder
new file mode 100644
index 0000000000..16e0077eea
--- /dev/null
+++ b/third_party/jpeg-xl/docker/Dockerfile.jpegxl-builder
@@ -0,0 +1,21 @@
+# Copyright (c) the JPEG XL Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+# Build an Ubuntu-based docker image with the installed software needed to
+# develop and test JPEG XL.
+
+FROM ubuntu:bionic
+
+# Set a prompt for when using it locally.
+ENV PS1="\[\033[01;33m\]\h\[\033[00m\]:\[\033[01;34m\]\w\[\033[00m\]\$ "
+
+COPY scripts/99_norecommends /etc/apt/apt.conf.d/99_norecommends
+
+COPY scripts /jpegxl_scripts
+
+ARG DEBIAN_FRONTEND=noninteractive
+
+RUN /jpegxl_scripts/jpegxl_builder.sh && \
+  rm -rf /jpegxl_scripts
diff --git a/third_party/jpeg-xl/docker/Dockerfile.jpegxl-builder-run-aarch64 b/third_party/jpeg-xl/docker/Dockerfile.jpegxl-builder-run-aarch64
new file mode 100644
index 0000000000..b4f2918375
--- /dev/null
+++ b/third_party/jpeg-xl/docker/Dockerfile.jpegxl-builder-run-aarch64
@@ -0,0 +1,36 @@
+# Copyright (c) the JPEG XL Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+# Build an Ubuntu-based docker image for aarch64 with the installed software
+# needed to run JPEG XL. This is only useful when running on actual aarch64
+# hardware.
+
+FROM arm64v8/ubuntu:bionic
+
+COPY scripts/99_norecommends /etc/apt/apt.conf.d/99_norecommends
+
+# Set a prompt for when using it locally.
+ENV PS1="\[\033[01;33m\]\h\[\033[00m\]:\[\033[01;34m\]\w\[\033[00m\]\$ "
+
+ARG DEBIAN_FRONTEND=noninteractive
+
+RUN set -ex; \
+  apt-get update -y; \
+  apt-get install -y \
+    bsdmainutils \
+    cmake \
+    curl \
+    ca-certificates \
+    extra-cmake-modules \
+    git \
+    imagemagick \
+    libjpeg8 \
+    libgif7 \
+    libgoogle-perftools4 \
+    libopenexr22 \
+    libpng16-16 \
+    libsdl2-2.0-0 \
+    parallel; \
+  rm -rf /var/lib/apt/lists/*;
diff --git a/third_party/jpeg-xl/docker/README.md b/third_party/jpeg-xl/docker/README.md
new file mode 100644
index 0000000000..874df1cb80
--- /dev/null
+++ b/third_party/jpeg-xl/docker/README.md
@@ -0,0 +1,7 @@
+### Docker container infrastructure for JPEG XL
+
+This directory contains the requirements to build a docker image for the
+JPEG XL project builder.
+
+Docker images need to be created and upload manually. See ./build.sh for
+details.
diff --git a/third_party/jpeg-xl/docker/build.sh b/third_party/jpeg-xl/docker/build.sh
new file mode 100755
index 0000000000..3d4727f6a4
--- /dev/null
+++ b/third_party/jpeg-xl/docker/build.sh
@@ -0,0 +1,83 @@
+#!/usr/bin/env bash
+# Copyright (c) the JPEG XL Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+set -eu
+
+MYDIR=$(dirname $(realpath "$0"))
+
+declare -a TARGETS
+
+load_targets() {
+  # Built-in OSX "find" does not support "-m".
+  FIND=$(which "gfind" || which "find")
+  for f in $(${FIND} -maxdepth 1 -name 'Dockerfile.*' | sort); do
+    local target="${f#*Dockerfile.}"
+    TARGETS+=("${target}")
+  done
+}
+
+usage() {
+    cat >&2 <<EOF
+Use: $1 [targets]
+
+Available targets:
+  * all
+EOF
+  for target in "${TARGETS[@]}"; do
+    echo "  * ${target}" >&2
+  done
+}
+
+build_target() {
+  local target="$1"
+
+  local dockerfile="${MYDIR}/Dockerfile.${target}"
+  # JPEG XL builder images are stored in the gcr.io/jpegxl project.
+  local tag="gcr.io/jpegxl/${target}"
+
+  echo "Building ${target}"
+  if ! sudo docker build --no-cache -t "${tag}" -f "${dockerfile}" "${MYDIR}" \
+      >"${target}.log" 2>&1; then
+    echo "${target} failed. See ${target}.log" >&2
+  else
+    echo "Done, to upload image run:" >&2
+    echo "  sudo docker push ${tag}"
+    if [[ "${JPEGXL_PUSH:-}" == "1" ]]; then
+      echo "sudo docker push ${tag}" >&2
+      sudo docker push "${tag}"
+      # The RepoDigest is only created after it is pushed.
+      local fulltag=$(sudo docker inspect --format="{{.RepoDigests}}" "${tag}")
+      fulltag="${fulltag#[}"
+      fulltag="${fulltag%]}"
+      echo "Updating .gitlab-ci.yml to ${fulltag}" >&2
+      sed -E "s;${tag}@sha256:[0-9a-f]+;${fulltag};" \
+        -i "${MYDIR}/../.gitlab-ci.yml"
+    fi
+  fi
+}
+
+main() {
+  cd "${MYDIR}"
+  local target="${1:-}"
+
+  load_targets
+  if [[ -z "${target}" ]]; then
+    usage $0
+    exit 1
+  fi
+
+  if [[ "${target}" == "all" ]]; then
+    for target in "${TARGETS[@]}"; do
+      build_target "${target}"
+    done
+  else
+    for target in "$@"; do
+      build_target "${target}"
+    done
+  fi
+}
+
+main "$@"
diff --git a/third_party/jpeg-xl/docker/scripts/99_norecommends b/third_party/jpeg-xl/docker/scripts/99_norecommends
new file mode 100644
index 0000000000..96d672811d
--- /dev/null
+++ b/third_party/jpeg-xl/docker/scripts/99_norecommends
@@ -0,0 +1 @@
+APT::Install-Recommends "false";
diff --git a/third_party/jpeg-xl/docker/scripts/binutils_align_fix.patch b/third_party/jpeg-xl/docker/scripts/binutils_align_fix.patch
new file mode 100644
index 0000000000..6066252db8
--- /dev/null
+++ b/third_party/jpeg-xl/docker/scripts/binutils_align_fix.patch
@@ -0,0 +1,28 @@
+Description: fix lack of alignment in relocations (crashes on mingw)
+See https://sourceware.org/git/?p=binutils-gdb.git;a=patch;h=73af69e74974eaa155eec89867e3ccc77ab39f6d
+From: Marc <marc@groundctl.com>
+Date: Fri, 9 Nov 2018 11:13:50 +0000
+Subject: [PATCH] Allow for compilers that do not produce aligned .rdat
+ sections in PE format files.
+
+--- a/upstream/ld/scripttempl/pe.sc	2020-05-12 18:45:12.000000000 +0200
++++ b/upstream/ld/scripttempl/pe.sc	2020-05-12 18:47:12.000000000 +0200
+@@ -143,6 +143,7 @@
+   .rdata ${RELOCATING+BLOCK(__section_alignment__)} :
+   {
+     ${R_RDATA}
++    . = ALIGN(4);
+     ${RELOCATING+__rt_psrelocs_start = .;}
+     ${RELOCATING+KEEP(*(.rdata_runtime_pseudo_reloc))}
+     ${RELOCATING+__rt_psrelocs_end = .;}
+--- a/upstream/ld/scripttempl/pep.sc	2020-05-12 18:45:19.000000000 +0200
++++ b/upstream/ld/scripttempl/pep.sc	2020-05-12 18:47:18.000000000 +0200
+@@ -143,6 +143,7 @@
+   .rdata ${RELOCATING+BLOCK(__section_alignment__)} :
+   {
+     ${R_RDATA}
++    . = ALIGN(4);
+     ${RELOCATING+__rt_psrelocs_start = .;}
+     ${RELOCATING+KEEP(*(.rdata_runtime_pseudo_reloc))}
+     ${RELOCATING+__rt_psrelocs_end = .;}
+
diff --git a/third_party/jpeg-xl/docker/scripts/emsdk_install.sh b/third_party/jpeg-xl/docker/scripts/emsdk_install.sh
new file mode 100755
index 0000000000..6cf225a9d9
--- /dev/null
+++ b/third_party/jpeg-xl/docker/scripts/emsdk_install.sh
@@ -0,0 +1,37 @@
+#!/usr/bin/env bash
+# Copyright (c) the JPEG XL Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+EMSDK_URL="https://github.com/emscripten-core/emsdk/archive/main.tar.gz"
+EMSDK_DIR="/opt/emsdk"
+
+EMSDK_RELEASE="2.0.23"
+
+set -eu -x
+
+# Temporary files cleanup hooks.
+CLEANUP_FILES=()
+cleanup() {
+  if [[ ${#CLEANUP_FILES[@]} -ne 0 ]]; then
+    rm -fr "${CLEANUP_FILES[@]}"
+  fi
+}
+trap "{ set +x; } 2>/dev/null; cleanup" INT TERM EXIT
+
+main() {
+  local workdir=$(mktemp -d --suffix=emsdk)
+  CLEANUP_FILES+=("${workdir}")
+
+  local emsdktar="${workdir}/emsdk.tar.gz"
+  curl --output "${emsdktar}" "${EMSDK_URL}" --location
+  mkdir -p "${EMSDK_DIR}"
+  tar -zxf "${emsdktar}" -C "${EMSDK_DIR}" --strip-components=1
+
+  cd "${EMSDK_DIR}"
+  ./emsdk install --shallow "${EMSDK_RELEASE}"
+  ./emsdk activate --embedded "${EMSDK_RELEASE}"
+}
+
+main "$@"
diff --git a/third_party/jpeg-xl/docker/scripts/jpegxl_builder.sh b/third_party/jpeg-xl/docker/scripts/jpegxl_builder.sh
new file mode 100755
index 0000000000..949c811eae
--- /dev/null
+++ b/third_party/jpeg-xl/docker/scripts/jpegxl_builder.sh
@@ -0,0 +1,516 @@
+#!/usr/bin/env bash
+# Copyright (c) the JPEG XL Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+# Main entry point for all the Dockerfile for jpegxl-builder. This centralized
+# file helps sharing code and configuration between Dockerfiles.
+
+set -eux
+
+MYDIR=$(dirname $(realpath "$0"))
+
+# libjpeg-turbo.
+JPEG_TURBO_RELEASE="2.0.4"
+JPEG_TURBO_URL="https://github.com/libjpeg-turbo/libjpeg-turbo/archive/${JPEG_TURBO_RELEASE}.tar.gz"
+JPEG_TURBO_SHA256="7777c3c19762940cff42b3ba4d7cd5c52d1671b39a79532050c85efb99079064"
+
+# zlib (dependency of libpng)
+ZLIB_RELEASE="1.2.11"
+ZLIB_URL="https://www.zlib.net/zlib-${ZLIB_RELEASE}.tar.gz"
+ZLIB_SHA256="c3e5e9fdd5004dcb542feda5ee4f0ff0744628baf8ed2dd5d66f8ca1197cb1a1"
+# The name in the .pc and the .dll generated don't match in zlib for Windows
+# because they use different .dll names in Windows. We avoid that by defining
+# UNIX=1. We also install all the .dll files to ${prefix}/lib instead of the
+# default ${prefix}/bin.
+ZLIB_FLAGS='-DUNIX=1 -DINSTALL_PKGCONFIG_DIR=/${CMAKE_INSTALL_PREFIX}/lib/pkgconfig -DINSTALL_BIN_DIR=/${CMAKE_INSTALL_PREFIX}/lib'
+
+# libpng
+LIBPNG_RELEASE="1.6.37"
+LIBPNG_URL="https://github.com/glennrp/libpng/archive/v${LIBPNG_RELEASE}.tar.gz"
+LIBPNG_SHA256="ca74a0dace179a8422187671aee97dd3892b53e168627145271cad5b5ac81307"
+
+# giflib
+GIFLIB_RELEASE="5.2.1"
+GIFLIB_URL="https://netcologne.dl.sourceforge.net/project/giflib/giflib-${GIFLIB_RELEASE}.tar.gz"
+GIFLIB_SHA256="31da5562f44c5f15d63340a09a4fd62b48c45620cd302f77a6d9acf0077879bd"
+
+# A patch needed to compile GIFLIB in mingw.
+GIFLIB_PATCH_URL="https://github.com/msys2/MINGW-packages/raw/3afde38fcee7b3ba2cafd97d76cca8f06934504f/mingw-w64-giflib/001-mingw-build.patch"
+GIFLIB_PATCH_SHA256="2b2262ddea87fc07be82e10aeb39eb699239f883c899aa18a16e4d4e40af8ec8"
+
+# webp
+WEBP_RELEASE="1.0.2"
+WEBP_URL="https://codeload.github.com/webmproject/libwebp/tar.gz/v${WEBP_RELEASE}"
+WEBP_SHA256="347cf85ddc3497832b5fa9eee62164a37b249c83adae0ba583093e039bf4881f"
+
+# Google benchmark
+BENCHMARK_RELEASE="1.5.2"
+BENCHMARK_URL="https://github.com/google/benchmark/archive/v${BENCHMARK_RELEASE}.tar.gz"
+BENCHMARK_SHA256="dccbdab796baa1043f04982147e67bb6e118fe610da2c65f88912d73987e700c"
+BENCHMARK_FLAGS="-DGOOGLETEST_PATH=${MYDIR}/../../third_party/googletest"
+# attribute(format(__MINGW_PRINTF_FORMAT, ...)) doesn't work in our
+# environment, so we disable the warning.
+BENCHMARK_FLAGS="-DCMAKE_BUILD_TYPE=Release -DBENCHMARK_ENABLE_TESTING=OFF \
+  -DCMAKE_CXX_FLAGS=-Wno-ignored-attributes \
+  -DCMAKE_POSITION_INDEPENDENT_CODE=ON"
+
+# V8
+V8_VERSION="9.3.22"
+
+# Temporary files cleanup hooks.
+CLEANUP_FILES=()
+cleanup() {
+  if [[ ${#CLEANUP_FILES[@]} -ne 0 ]]; then
+    rm -fr "${CLEANUP_FILES[@]}"
+  fi
+}
+trap "{ set +x; } 2>/dev/null; cleanup" INT TERM EXIT
+
+# List of Ubuntu arch names supported by the builder (such as "i386").
+LIST_ARCHS=(
+  amd64
+  i386
+  arm64
+  armhf
+)
+
+# List of target triplets supported by the builder.
+LIST_TARGETS=(
+  x86_64-linux-gnu
+  i686-linux-gnu
+  arm-linux-gnueabihf
+  aarch64-linux-gnu
+)
+LIST_MINGW_TARGETS=(
+  i686-w64-mingw32
+  x86_64-w64-mingw32
+)
+LIST_WASM_TARGETS=(
+  wasm32
+)
+
+# Setup the apt repositories and supported architectures.
+setup_apt() {
+  apt-get update -y
+  apt-get install -y curl gnupg ca-certificates
+
+  apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 1E9377A2BA9EF27F
+
+  # node sources.
+  cat >/etc/apt/sources.list.d/nodesource.list <<EOF
+  deb https://deb.nodesource.com/node_14.x bionic main
+  deb-src https://deb.nodesource.com/node_14.x bionic main
+EOF
+  curl -s https://deb.nodesource.com/gpgkey/nodesource.gpg.key | apt-key add -
+
+  local port_list=()
+  local main_list=()
+  local ubarch
+  for ubarch in "${LIST_ARCHS[@]}"; do
+    if [[ "${ubarch}" != "amd64" && "${ubarch}" != "i386" ]]; then
+      # other archs are not part of the main mirrors, but available in
+      # ports.ubuntu.com.
+      port_list+=("${ubarch}")
+    else
+      main_list+=("${ubarch}")
+    fi
+    # Add the arch to the system.
+    if [[ "${ubarch}" != "amd64" ]]; then
+      dpkg --add-architecture "${ubarch}"
+    fi
+  done
+
+  # Update the sources.list with the split of supported architectures.
+  local bkplist="/etc/apt/sources.list.bkp"
+  [[ -e "${bkplist}" ]] || \
+    mv /etc/apt/sources.list "${bkplist}"
+
+  local newlist="/etc/apt/sources.list.tmp"
+  rm -f "${newlist}"
+  port_list=$(echo "${port_list[@]}" | tr ' ' ,)
+  if [[ -n "${port_list}" ]]; then
+    local port_url="http://ports.ubuntu.com/ubuntu-ports/"
+    grep -v -E '^#' "${bkplist}" |
+      sed -E "s;^deb (http[^ ]+) (.*)\$;deb [arch=${port_list}] ${port_url} \\2;" \
+      >>"${newlist}"
+  fi
+
+  main_list=$(echo "${main_list[@]}" | tr ' ' ,)
+  grep -v -E '^#' "${bkplist}" |
+    sed -E "s;^deb (http[^ ]+) (.*)\$;deb [arch=${main_list}] \\1 \\2\ndeb-src [arch=${main_list}] \\1 \\2;" \
+    >>"${newlist}"
+  mv "${newlist}" /etc/apt/sources.list
+}
+
+install_pkgs() {
+  packages=(
+    # Native compilers (minimum for SIMD is clang-7)
+    clang-7 clang-format-7 clang-tidy-7
+
+    # TODO: Consider adding clang-8 to every builder:
+    #   clang-8 clang-format-8 clang-tidy-8
+
+    # For cross-compiling to Windows with mingw.
+    mingw-w64
+    wine64
+    wine-binfmt
+
+    # Native tools.
+    bsdmainutils
+    cmake
+    extra-cmake-modules
+    git
+    llvm
+    nasm
+    ninja-build
+    parallel
+    pkg-config
+
+    # For compiling / testing JNI wrapper. JDK8 is almost 2x smaller than JDK11
+    # openjdk-8-jdk-headless would be 50MB smaller, unfortunately, CMake
+    # does mistakenly thinks it does not contain JNI feature.
+    openjdk-8-jdk
+
+    # These are used by the ./ci.sh lint in the native builder.
+    clang-format-7
+    clang-format-8
+
+    # For coverage builds
+    gcovr
+
+    # For compiling giflib documentation.
+    xmlto
+
+    # Common libraries.
+    libstdc++-8-dev
+
+    # We don't use tcmalloc on archs other than amd64. This installs
+    # libgoogle-perftools4:amd64.
+    google-perftools
+
+    # NodeJS for running WASM tests
+    nodejs
+
+    # To generate API documentation.
+    doxygen
+
+    # Freezes version that builds (passes tests). Newer version
+    # (2.30-21ubuntu1~18.04.4) claims to fix "On Intel Skylake
+    # (-march=native) generated avx512 instruction can be wrong",
+    # but newly added tests does not pass. Perhaps the problem is
+    # that mingw package is not updated.
+    binutils-source=2.30-15ubuntu1
+  )
+
+  # Install packages that are arch-dependent.
+  local ubarch
+  for ubarch in "${LIST_ARCHS[@]}"; do
+    packages+=(
+      # Library dependencies. These normally depend on the target architecture
+      # we are compiling for and can't usually be installed for multiple
+      # architectures at the same time.
+      libgif7:"${ubarch}"
+      libjpeg-dev:"${ubarch}"
+      libpng-dev:"${ubarch}"
+
+      libstdc++-8-dev:"${ubarch}"
+
+      # For OpenEXR:
+      libilmbase12:"${ubarch}"
+      libopenexr22:"${ubarch}"
+
+      # TCMalloc dependency
+      libunwind-dev:"${ubarch}"
+
+      # Cross-compiling tools per arch.
+      libc6-dev-"${ubarch}"-cross
+      libstdc++-8-dev-"${ubarch}"-cross
+    )
+  done
+
+  local target
+  for target in "${LIST_TARGETS[@]}"; do
+    # Per target cross-compiling tools.
+    if [[ "${target}" != "x86_64-linux-gnu" ]]; then
+      packages+=(
+        binutils-"${target}"
+        gcc-"${target}"
+      )
+    fi
+  done
+
+  # Install all the manual packages via "apt install" for the main arch. These
+  # will be installed for other archs via manual download and unpack.
+  apt install -y "${packages[@]}" "${UNPACK_PKGS[@]}"
+}
+
+# binutils <2.32 need a patch.
+install_binutils() {
+  local workdir=$(mktemp -d --suffix=_install)
+  CLEANUP_FILES+=("${workdir}")
+  pushd "${workdir}"
+  apt source binutils-mingw-w64
+  apt -y build-dep binutils-mingw-w64
+  cd binutils-mingw-w64-8ubuntu1
+  cp "${MYDIR}/binutils_align_fix.patch" debian/patches
+  echo binutils_align_fix.patch >> debian/patches/series
+  dpkg-buildpackage -b
+  cd ..
+  dpkg -i *deb
+  popd
+}
+
+# Install a library from the source code for multiple targets.
+# Usage: install_from_source <tar_url> <sha256> <target> [<target...>]
+install_from_source() {
+  local package="$1"
+  shift
+
+  local url
+  eval "url=\${${package}_URL}"
+  local sha256
+  eval "sha256=\${${package}_SHA256}"
+  # Optional package flags
+  local pkgflags
+  eval "pkgflags=\${${package}_FLAGS:-}"
+
+  local workdir=$(mktemp -d --suffix=_install)
+  CLEANUP_FILES+=("${workdir}")
+
+  local tarfile="${workdir}"/$(basename "${url}")
+  curl -L --output "${tarfile}" "${url}"
+  if ! echo "${sha256} ${tarfile}" | sha256sum -c --status -; then
+    echo "SHA256 mismatch for ${url}: expected ${sha256} but found:"
+    sha256sum "${tarfile}"
+    exit 1
+  fi
+
+  local target
+  for target in "$@"; do
+    echo "Installing ${package} for target ${target} from ${url}"
+
+    local srcdir="${workdir}/source-${target}"
+    mkdir -p "${srcdir}"
+    tar -zxf "${tarfile}" -C "${srcdir}" --strip-components=1
+
+    local prefix="/usr"
+    if [[ "${target}" != "x86_64-linux-gnu" ]]; then
+      prefix="/usr/${target}"
+    fi
+
+    # Apply patches to buildfiles.
+    if [[ "${package}" == "GIFLIB" && "${target}" == *mingw32 ]]; then
+      # GIFLIB Makefile has several problems so we need to fix them here. We are
+      # using a patch from MSYS2 that already fixes the compilation for mingw.
+      local make_patch="${srcdir}/libgif.patch"
+      curl -L "${GIFLIB_PATCH_URL}" -o "${make_patch}"
+      echo "${GIFLIB_PATCH_SHA256} ${make_patch}" | sha256sum -c --status -
+      patch "${srcdir}/Makefile" < "${make_patch}"
+    elif [[ "${package}" == "LIBPNG" && "${target}" == wasm* ]]; then
+      # Cut the dependency to libm; there is pull request to fix it, so this
+      # might not be needed in the future.
+      sed -i 's/APPLE/EMSCRIPTEN/g' "${srcdir}/CMakeLists.txt"
+    fi
+
+    local cmake_args=()
+    local export_args=("CC=clang-7" "CXX=clang++-7")
+    local cmake="cmake"
+    local make="make"
+    local system_name="Linux"
+    if [[ "${target}" == *mingw32 ]]; then
+      system_name="Windows"
+      # When compiling with clang, CMake doesn't detect that we are using mingw.
+      cmake_args+=(
+        -DMINGW=1
+        # Googletest needs this when cross-compiling to windows
+        -DCMAKE_CROSSCOMPILING=1
+        -DHAVE_STD_REGEX=0
+        -DHAVE_POSIX_REGEX=0
+        -DHAVE_GNU_POSIX_REGEX=0
+      )
+      local windres=$(which ${target}-windres || true)
+      if [[ -n "${windres}" ]]; then
+        cmake_args+=(-DCMAKE_RC_COMPILER="${windres}")
+      fi
+    fi
+    if [[ "${target}" == wasm* ]]; then
+      system_name="WASM"
+      cmake="emcmake cmake"
+      make="emmake make"
+      export_args=()
+      cmake_args+=(
+        -DCMAKE_FIND_ROOT_PATH="${prefix}"
+        -DCMAKE_PREFIX_PATH="${prefix}"
+      )
+      # Static and shared library link to the same file -> race condition.
+      nproc=1
+    else
+      nproc=`nproc --all`
+    fi
+    cmake_args+=(-DCMAKE_SYSTEM_NAME="${system_name}")
+
+    if [[ "${target}" != "x86_64-linux-gnu" ]]; then
+      # Cross-compiling.
+      cmake_args+=(
+        -DCMAKE_C_COMPILER_TARGET="${target}"
+        -DCMAKE_CXX_COMPILER_TARGET="${target}"
+        -DCMAKE_SYSTEM_PROCESSOR="${target%%-*}"
+      )
+    fi
+
+    if [[ -e "${srcdir}/CMakeLists.txt" ]]; then
+      # Most packages use cmake for building which is easier to configure for
+      # cross-compiling.
+      if [[ "${package}" == "JPEG_TURBO" && "${target}" == wasm* ]]; then
+        # JT erroneously detects WASM CPU as i386 and tries to use asm.
+        # Wasm/Emscripten support for dynamic linking is incomplete; disable
+        # to avoid CMake warning.
+        cmake_args+=(-DWITH_SIMD=0 -DENABLE_SHARED=OFF)
+      fi
+      (
+        cd "${srcdir}"
+        export ${export_args[@]}
+        ${cmake} \
+          -DCMAKE_INSTALL_PREFIX="${prefix}" \
+          "${cmake_args[@]}" ${pkgflags}
+        ${make} -j${nproc}
+        ${make} install
+      )
+    elif [[ "${package}" == "GIFLIB" ]]; then
+      # GIFLIB doesn't yet have a cmake build system. There is a pull
+      # request in giflib for adding CMakeLists.txt so this might not be
+      # needed in the future.
+      (
+        cd "${srcdir}"
+        local giflib_make_flags=(
+          CFLAGS="-O2 --target=${target} -std=gnu99"
+          PREFIX="${prefix}"
+        )
+        if [[ "${target}" != wasm* ]]; then
+          giflib_make_flags+=(CC=clang-7)
+        fi
+        # giflib make dependencies are not properly set up so parallel building
+        # doesn't work for everything.
+        ${make} -j${nproc} libgif.a "${giflib_make_flags[@]}"
+        ${make} -j${nproc} all "${giflib_make_flags[@]}"
+        ${make} install "${giflib_make_flags[@]}"
+      )
+    else
+      echo "Don't know how to install ${package}"
+      exit 1
+    fi
+
+    # CMake mistakenly uses ".so" libraries and EMCC fails to link properly.
+    if [[ "${target}" == wasm* ]]; then
+      rm -f "${prefix}/lib"/*.so*
+    fi
+  done
+}
+
+# Packages that are manually unpacked for each architecture.
+UNPACK_PKGS=(
+  libgif-dev
+  libclang-common-7-dev
+
+  # For OpenEXR:
+  libilmbase-dev
+  libopenexr-dev
+
+  # TCMalloc
+  libgoogle-perftools-dev
+  libtcmalloc-minimal4
+  libgoogle-perftools4
+)
+
+# Main script entry point.
+main() {
+  cd "${MYDIR}"
+
+  # Configure the repositories with the sources for multi-arch cross
+  # compilation.
+  setup_apt
+  apt-get update -y
+  apt-get dist-upgrade -y
+
+  install_pkgs
+  install_binutils
+  apt clean
+
+  # Remove prebuilt Java classes cache.
+  rm /usr/lib/jvm/java-8-openjdk-amd64/jre/lib/amd64/server/classes.jsa
+
+  # Manually extract packages for the target arch that can't install it directly
+  # at the same time as the native ones.
+  local ubarch
+  for ubarch in "${LIST_ARCHS[@]}"; do
+    if [[ "${ubarch}" != "amd64" ]]; then
+      local pkg
+      for pkg in "${UNPACK_PKGS[@]}"; do
+        apt download "${pkg}":"${ubarch}"
+        dpkg -x "${pkg}"_*_"${ubarch}".deb /
+      done
+    fi
+  done
+  # TODO: Add clang from the llvm repos. This is problematic since we are
+  # installing libclang-common-7-dev:"${ubarch}" from the ubuntu ports repos
+  # which is not available in the llvm repos so it might have a different
+  # version than the ubuntu ones.
+
+  # Remove the win32 libgcc version. The gcc-mingw-w64-x86-64 (and i686)
+  # packages install two libgcc versions:
+  #   /usr/lib/gcc/x86_64-w64-mingw32/7.3-posix
+  #   /usr/lib/gcc/x86_64-w64-mingw32/7.3-win32
+  # (exact libgcc version number depends on the package version).
+  #
+  # Clang will pick the best libgcc, sorting by version, but it doesn't
+  # seem to be a way to specify one or the other one, except by passing
+  # -nostdlib and setting all the include paths from the command line.
+  # To check which one is being used you can run:
+  #   clang++-7 --target=x86_64-w64-mingw32 -v -print-libgcc-file-name
+  # We need to use the "posix" versions for thread support, so here we
+  # just remove the other one.
+  local target
+  for target in "${LIST_MINGW_TARGETS[@]}"; do
+    update-alternatives --set "${target}-gcc" $(which "${target}-gcc-posix")
+    local gcc_win32_path=$("${target}-cpp-win32" -print-libgcc-file-name)
+    rm -rf $(dirname "${gcc_win32_path}")
+  done
+
+  # TODO: Add msan for the target when cross-compiling. This only installs it
+  # for amd64.
+  ./msan_install.sh
+
+  # Build and install qemu user-linux targets.
+  ./qemu_install.sh
+
+  # Install emscripten SDK.
+  ./emsdk_install.sh
+
+  # Setup environment for building WASM libraries from sources.
+  source /opt/emsdk/emsdk_env.sh
+
+  # Install some dependency libraries manually for the different targets.
+
+  install_from_source JPEG_TURBO "${LIST_MINGW_TARGETS[@]}" "${LIST_WASM_TARGETS[@]}"
+  install_from_source ZLIB "${LIST_MINGW_TARGETS[@]}" "${LIST_WASM_TARGETS[@]}"
+  install_from_source LIBPNG "${LIST_MINGW_TARGETS[@]}" "${LIST_WASM_TARGETS[@]}"
+  install_from_source GIFLIB "${LIST_MINGW_TARGETS[@]}" "${LIST_WASM_TARGETS[@]}"
+  # webp in Ubuntu is relatively old so we install it from source for everybody.
+  install_from_source WEBP "${LIST_TARGETS[@]}" "${LIST_MINGW_TARGETS[@]}"
+
+  install_from_source BENCHMARK "${LIST_TARGETS[@]}" "${LIST_MINGW_TARGETS[@]}"
+
+  # Install v8. v8 has better WASM SIMD support than NodeJS 14 (LTS).
+  # First we need the installer to install v8.
+  npm install jsvu -g
+  # install specific version;
+  HOME=/opt jsvu --os=linux64 "v8@${V8_VERSION}"
+  ln -s "/opt/.jsvu/v8-${V8_VERSION}" "/opt/.jsvu/v8"
+
+  # Cleanup.
+  find /var/lib/apt/lists/ -mindepth 1 -delete
+}
+
+main "$@"
diff --git a/third_party/jpeg-xl/docker/scripts/msan_install.sh b/third_party/jpeg-xl/docker/scripts/msan_install.sh
new file mode 100755
index 0000000000..0216f62b04
--- /dev/null
+++ b/third_party/jpeg-xl/docker/scripts/msan_install.sh
@@ -0,0 +1,131 @@
+#!/usr/bin/env bash
+# Copyright (c) the JPEG XL Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+set -eu
+
+MYDIR=$(dirname $(realpath "$0"))
+
+# Convenience flag to pass both CMAKE_C_FLAGS and CMAKE_CXX_FLAGS
+CMAKE_FLAGS=${CMAKE_FLAGS:-}
+CMAKE_C_FLAGS=${CMAKE_C_FLAGS:-${CMAKE_FLAGS}}
+CMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS:-${CMAKE_FLAGS}}
+CMAKE_EXE_LINKER_FLAGS=${CMAKE_EXE_LINKER_FLAGS:-}
+
+CLANG_VERSION="${CLANG_VERSION:-}"
+# Detect the clang version suffix and store it in CLANG_VERSION. For example,
+# "6.0" for clang 6 or "7" for clang 7.
+detect_clang_version() {
+  if [[ -n "${CLANG_VERSION}" ]]; then
+    return 0
+  fi
+  local clang_version=$("${CC:-clang}" --version | head -n1)
+  local llvm_tag
+  case "${clang_version}" in
+    "clang version 6."*)
+      CLANG_VERSION="6.0"
+      ;;
+    "clang version 7."*)
+      CLANG_VERSION="7"
+      ;;
+    "clang version 8."*)
+      CLANG_VERSION="8"
+      ;;
+    "clang version 9."*)
+      CLANG_VERSION="9"
+      ;;
+    *)
+      echo "Unknown clang version: ${clang_version}" >&2
+      return 1
+  esac
+}
+
+# Temporary files cleanup hooks.
+CLEANUP_FILES=()
+cleanup() {
+  if [[ ${#CLEANUP_FILES[@]} -ne 0 ]]; then
+    rm -fr "${CLEANUP_FILES[@]}"
+  fi
+}
+trap "{ set +x; } 2>/dev/null; cleanup" INT TERM EXIT
+
+# Install libc++ libraries compiled with msan in the msan_prefix for the current
+# compiler version.
+cmd_msan_install() {
+  local tmpdir=$(mktemp -d)
+  CLEANUP_FILES+=("${tmpdir}")
+  # Detect the llvm to install:
+  export CC="${CC:-clang}"
+  export CXX="${CXX:-clang++}"
+  detect_clang_version
+  local llvm_tag
+  case "${CLANG_VERSION}" in
+    "6.0")
+      llvm_tag="llvmorg-6.0.1"
+      ;;
+    "7")
+      llvm_tag="llvmorg-7.0.1"
+      ;;
+    "8")
+      llvm_tag="llvmorg-8.0.0"
+      ;;
+    *)
+      echo "Unknown clang version: ${clang_version}" >&2
+      return 1
+  esac
+  local llvm_targz="${tmpdir}/${llvm_tag}.tar.gz"
+  curl -L --show-error -o "${llvm_targz}" \
+    "https://github.com/llvm/llvm-project/archive/${llvm_tag}.tar.gz"
+  tar -C "${tmpdir}" -zxf "${llvm_targz}"
+  local llvm_root="${tmpdir}/llvm-project-${llvm_tag}"
+
+  local msan_prefix="${HOME}/.msan/${CLANG_VERSION}"
+  rm -rf "${msan_prefix}"
+
+  declare -A CMAKE_EXTRAS
+  CMAKE_EXTRAS[libcxx]="\
+    -DLIBCXX_CXX_ABI=libstdc++ \
+    -DLIBCXX_INSTALL_EXPERIMENTAL_LIBRARY=ON"
+
+  for project in libcxx; do
+    local proj_build="${tmpdir}/build-${project}"
+    local proj_dir="${llvm_root}/${project}"
+    mkdir -p "${proj_build}"
+    cmake -B"${proj_build}" -H"${proj_dir}" \
+      -G Ninja \
+      -DCMAKE_BUILD_TYPE=Release \
+      -DLLVM_USE_SANITIZER=Memory \
+      -DLLVM_PATH="${llvm_root}/llvm" \
+      -DLLVM_CONFIG_PATH="$(which llvm-config llvm-config-7 llvm-config-6.0 | \
+                            head -n1)" \
+      -DCMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS}" \
+      -DCMAKE_C_FLAGS="${CMAKE_C_FLAGS}" \
+      -DCMAKE_EXE_LINKER_FLAGS="${CMAKE_EXE_LINKER_FLAGS}" \
+      -DCMAKE_INSTALL_PREFIX="${msan_prefix}" \
+      ${CMAKE_EXTRAS[${project}]}
+    cmake --build "${proj_build}"
+    ninja -C "${proj_build}" install
+  done
+}
+
+main() {
+  set -x
+  for version in 6.0 7 8; do
+    if ! which "clang-${version}" >/dev/null; then
+      echo "Skipping msan install for clang version ${version}"
+      continue
+    fi
+    (
+     trap "{ set +x; } 2>/dev/null; cleanup" INT TERM EXIT
+     export CLANG_VERSION=${version}
+     export CC=clang-${version}
+     export CXX=clang++-${version}
+     cmd_msan_install
+    ) &
+  done
+  wait
+}
+
+main "$@"
diff --git a/third_party/jpeg-xl/docker/scripts/qemu_install.sh b/third_party/jpeg-xl/docker/scripts/qemu_install.sh
new file mode 100755
index 0000000000..8106c4471d
--- /dev/null
+++ b/third_party/jpeg-xl/docker/scripts/qemu_install.sh
@@ -0,0 +1,83 @@
+#!/usr/bin/env bash
+# Copyright (c) the JPEG XL Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+QEMU_RELEASE="4.1.0"
+QEMU_URL="https://download.qemu.org/qemu-${QEMU_RELEASE}.tar.xz"
+QEMU_ARCHS=(
+  aarch64
+  arm
+  i386
+  # TODO: Consider adding these:
+  # aarch64_be
+  # mips64el
+  # mips64
+  # mips
+  # ppc64
+  # ppc
+)
+
+# Ubuntu packages not installed that are needed to build qemu.
+QEMU_BUILD_DEPS=(
+  libglib2.0-dev
+  libpixman-1-dev
+  flex
+  bison
+)
+
+set -eu -x
+
+# Temporary files cleanup hooks.
+CLEANUP_FILES=()
+cleanup() {
+  if [[ ${#CLEANUP_FILES[@]} -ne 0 ]]; then
+    rm -fr "${CLEANUP_FILES[@]}"
+  fi
+}
+trap "{ set +x; } 2>/dev/null; cleanup" INT TERM EXIT
+
+main() {
+  local workdir=$(mktemp -d --suffix=qemu)
+  CLEANUP_FILES+=("${workdir}")
+
+  apt install -y "${QEMU_BUILD_DEPS[@]}"
+
+  local qemutar="${workdir}/qemu.tar.gz"
+  curl --output "${qemutar}" "${QEMU_URL}"
+  tar -Jxf "${qemutar}" -C "${workdir}"
+  local srcdir="${workdir}/qemu-${QEMU_RELEASE}"
+
+  local builddir="${workdir}/build"
+  local prefixdir="${workdir}/prefix"
+  mkdir -p "${builddir}"
+
+  # List of targets to build.
+  local targets=""
+  local make_targets=()
+  local target
+  for target in "${QEMU_ARCHS[@]}"; do
+    targets="${targets} ${target}-linux-user"
+    # Build just the linux-user targets.
+    make_targets+=("${target}-linux-user/all")
+  done
+
+  cd "${builddir}"
+  "${srcdir}/configure" \
+    --prefix="${prefixdir}" \
+    --static --disable-system --enable-linux-user \
+    --target-list="${targets}"
+
+  make -j $(nproc --all || echo 1) "${make_targets[@]}"
+
+  # Manually install these into the non-standard location. This script runs as
+  # root anyway.
+  for target in "${QEMU_ARCHS[@]}"; do
+    cp "${target}-linux-user/qemu-${target}" "/usr/bin/qemu-${target}-static"
+  done
+
+  apt autoremove -y --purge "${QEMU_BUILD_DEPS[@]}"
+}
+
+main "$@"
diff --git a/third_party/jpeg-xl/examples/CMakeLists.txt b/third_party/jpeg-xl/examples/CMakeLists.txt
new file mode 100644
index 0000000000..88dc27c49f
--- /dev/null
+++ b/third_party/jpeg-xl/examples/CMakeLists.txt
@@ -0,0 +1,56 @@
+# Copyright (c) the JPEG XL Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+# Example project using libjxl.
+
+cmake_minimum_required(VERSION 3.10)
+
+project(SAMPLE_LIBJXL LANGUAGES C CXX)
+
+# Use pkg-config to find libjxl.
+find_package(PkgConfig)
+pkg_check_modules(Jxl REQUIRED IMPORTED_TARGET libjxl)
+pkg_check_modules(JxlThreads REQUIRED IMPORTED_TARGET libjxl_threads)
+
+# Build the example encoder/decoder binaries using the default shared libraries
+# installed.
+add_executable(decode_oneshot decode_oneshot.cc)
+target_link_libraries(decode_oneshot PkgConfig::Jxl PkgConfig::JxlThreads)
+
+add_executable(decode_progressive decode_progressive.cc)
+target_link_libraries(decode_progressive PkgConfig::Jxl PkgConfig::JxlThreads)
+
+add_executable(encode_oneshot encode_oneshot.cc)
+target_link_libraries(encode_oneshot PkgConfig::Jxl PkgConfig::JxlThreads)
+
+
+# Building a static binary with the static libjxl dependencies. How to load
+# static library configs from pkg-config and how to build static binaries
+# depends on the platform, and building static binaries in general has problems.
+# If you don't need static binaries you can remove this section.
+add_library(StaticJxl INTERFACE IMPORTED GLOBAL)
+set_target_properties(StaticJxl PROPERTIES
+    INTERFACE_INCLUDE_DIRECTORIES "${Jxl_STATIC_INCLUDE_DIR}"
+    INTERFACE_COMPILE_OPTIONS "${Jxl_STATIC_CFLAGS_OTHER}"
+    INTERFACE_LINK_LIBRARIES "${Jxl_STATIC_LDFLAGS}"
+)
+add_library(StaticJxlThreads INTERFACE IMPORTED GLOBAL)
+set_target_properties(StaticJxlThreads PROPERTIES
+    INTERFACE_INCLUDE_DIRECTORIES "${JxlThreads_STATIC_INCLUDE_DIR}"
+    INTERFACE_COMPILE_OPTIONS "${JxlThreads_STATIC_CFLAGS_OTHER}"
+    # libgcc uses weak symbols for pthread which means that -lpthread is not
+    # linked when compiling a static binary. This is a platform-specific fix for
+    # that.
+    INTERFACE_LINK_LIBRARIES
+      "${JxlThreads_STATIC_LDFLAGS} -Wl,--whole-archive -lpthread -Wl,--no-whole-archive"
+)
+
+add_executable(decode_oneshot_static decode_oneshot.cc)
+target_link_libraries(decode_oneshot_static
+  -static StaticJxl StaticJxlThreads)
+
+add_executable(encode_oneshot_static encode_oneshot.cc)
+target_link_libraries(encode_oneshot_static
+  -static StaticJxl StaticJxlThreads)
diff --git a/third_party/jpeg-xl/examples/decode_exif_metadata.cc b/third_party/jpeg-xl/examples/decode_exif_metadata.cc
new file mode 100644
index 0000000000..97b0e52703
--- /dev/null
+++ b/third_party/jpeg-xl/examples/decode_exif_metadata.cc
@@ -0,0 +1,172 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// This C++ example decodes a JPEG XL image in one shot (all input bytes
+// available at once). The example outputs the pixels and color information to a
+// floating point image and an ICC profile on disk.
+
+#include <jxl/decode.h>
+#include <jxl/decode_cxx.h>
+#include <limits.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <vector>
+
+bool DecodeJpegXlExif(const uint8_t* jxl, size_t size,
+                      std::vector<uint8_t>* exif) {
+  auto dec = JxlDecoderMake(nullptr);
+
+  // We're only interested in the Exif boxes in this example, so don't
+  // subscribe to events related to pixel data.
+  if (JXL_DEC_SUCCESS != JxlDecoderSubscribeEvents(dec.get(), JXL_DEC_BOX)) {
+    fprintf(stderr, "JxlDecoderSubscribeEvents failed\n");
+    return false;
+  }
+  bool support_decompression = true;
+  if (JXL_DEC_SUCCESS != JxlDecoderSetDecompressBoxes(dec.get(), JXL_TRUE)) {
+    fprintf(stderr,
+            "NOTE: decompressing brob boxes not supported with the currently "
+            "used jxl library.\n");
+    support_decompression = false;
+  }
+
+  JxlDecoderSetInput(dec.get(), jxl, size);
+  JxlDecoderCloseInput(dec.get());
+
+  const constexpr size_t kChunkSize = 65536;
+  size_t output_pos = 0;
+
+  for (;;) {
+    JxlDecoderStatus status = JxlDecoderProcessInput(dec.get());
+    if (status == JXL_DEC_ERROR) {
+      fprintf(stderr, "Decoder error\n");
+      return false;
+    } else if (status == JXL_DEC_NEED_MORE_INPUT) {
+      fprintf(stderr, "Error, already provided all input\n");
+      return false;
+    } else if (status == JXL_DEC_BOX) {
+      if (!exif->empty()) {
+        size_t remaining = JxlDecoderReleaseBoxBuffer(dec.get());
+        exif->resize(exif->size() - remaining);
+        // No need to wait for JXL_DEC_SUCCESS or decode other boxes.
+        return true;
+      }
+      JxlBoxType type;
+      if (JXL_DEC_SUCCESS !=
+          JxlDecoderGetBoxType(dec.get(), type, support_decompression)) {
+        fprintf(stderr, "Error, failed to get box type\n");
+        return false;
+      }
+      if (!memcmp(type, "Exif", 4)) {
+        exif->resize(kChunkSize);
+        JxlDecoderSetBoxBuffer(dec.get(), exif->data(), exif->size());
+      }
+    } else if (status == JXL_DEC_BOX_NEED_MORE_OUTPUT) {
+      size_t remaining = JxlDecoderReleaseBoxBuffer(dec.get());
+      output_pos += kChunkSize - remaining;
+      exif->resize(exif->size() + kChunkSize);
+      JxlDecoderSetBoxBuffer(dec.get(), exif->data() + output_pos,
+                             exif->size() - output_pos);
+    } else if (status == JXL_DEC_SUCCESS) {
+      if (!exif->empty()) {
+        size_t remaining = JxlDecoderReleaseBoxBuffer(dec.get());
+        exif->resize(exif->size() - remaining);
+        return true;
+      }
+      return true;
+    } else {
+      fprintf(stderr, "Unknown decoder status\n");
+      return false;
+    }
+  }
+}
+
+bool LoadFile(const char* filename, std::vector<uint8_t>* out) {
+  FILE* file = fopen(filename, "rb");
+  if (!file) {
+    return false;
+  }
+
+  if (fseek(file, 0, SEEK_END) != 0) {
+    fclose(file);
+    return false;
+  }
+
+  long size = ftell(file);
+  // Avoid invalid file or directory.
+  if (size >= LONG_MAX || size < 0) {
+    fclose(file);
+    return false;
+  }
+
+  if (fseek(file, 0, SEEK_SET) != 0) {
+    fclose(file);
+    return false;
+  }
+
+  out->resize(size);
+  size_t readsize = fread(out->data(), 1, size, file);
+  if (fclose(file) != 0) {
+    return false;
+  }
+
+  return readsize == static_cast<size_t>(size);
+}
+
+bool WriteFile(const char* filename, const uint8_t* data, size_t size) {
+  FILE* file = fopen(filename, "wb");
+  if (!file) {
+    fprintf(stderr, "Could not open %s for writing", filename);
+    return false;
+  }
+  fwrite(data, 1, size, file);
+  if (fclose(file) != 0) {
+    return false;
+  }
+  return true;
+}
+
+int main(int argc, char* argv[]) {
+  if (argc != 3) {
+    fprintf(stderr,
+            "Usage: %s <jxl> <exif>\n"
+            "Where:\n"
+            "  jxl = input JPEG XL image filename\n"
+            "  exif = output exif filename\n"
+            "Output files will be overwritten.\n",
+            argv[0]);
+    return 1;
+  }
+
+  const char* jxl_filename = argv[1];
+  const char* exif_filename = argv[2];
+
+  std::vector<uint8_t> jxl;
+  if (!LoadFile(jxl_filename, &jxl)) {
+    fprintf(stderr, "couldn't load %s\n", jxl_filename);
+    return 1;
+  }
+
+  std::vector<uint8_t> exif;
+  if (!DecodeJpegXlExif(jxl.data(), jxl.size(), &exif)) {
+    fprintf(stderr, "Error while decoding the jxl file\n");
+    return 1;
+  }
+  if (exif.empty()) {
+    printf("No exif data present in this image\n");
+  } else {
+    // TODO(lode): the exif box data contains the 4-byte TIFF header at the
+    // beginning, check whether this is desired to be part of the output, or
+    // should be removed.
+    if (!WriteFile(exif_filename, exif.data(), exif.size())) {
+      fprintf(stderr, "Error while writing the exif file\n");
+      return 1;
+    }
+    printf("Successfully wrote %s\n", exif_filename);
+  }
+  return 0;
+}
diff --git a/third_party/jpeg-xl/examples/decode_oneshot.cc b/third_party/jpeg-xl/examples/decode_oneshot.cc
new file mode 100644
index 0000000000..8cf9d4f3a6
--- /dev/null
+++ b/third_party/jpeg-xl/examples/decode_oneshot.cc
@@ -0,0 +1,251 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// This C++ example decodes a JPEG XL image in one shot (all input bytes
+// available at once). The example outputs the pixels and color information to a
+// floating point image and an ICC profile on disk.
+
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS
+#endif
+
+#include <inttypes.h>
+#include <jxl/decode.h>
+#include <jxl/decode_cxx.h>
+#include <jxl/resizable_parallel_runner.h>
+#include <jxl/resizable_parallel_runner_cxx.h>
+#include <limits.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <vector>
+
+/** Decodes JPEG XL image to floating point pixels and ICC Profile. Pixel are
+ * stored as floating point, as interleaved RGBA (4 floating point values per
+ * pixel), line per line from top to bottom.  Pixel values have nominal range
+ * 0..1 but may go beyond this range for HDR or wide gamut. The ICC profile
+ * describes the color format of the pixel data.
+ */
+bool DecodeJpegXlOneShot(const uint8_t* jxl, size_t size,
+                         std::vector<float>* pixels, size_t* xsize,
+                         size_t* ysize, std::vector<uint8_t>* icc_profile) {
+  // Multi-threaded parallel runner.
+  auto runner = JxlResizableParallelRunnerMake(nullptr);
+
+  auto dec = JxlDecoderMake(nullptr);
+  if (JXL_DEC_SUCCESS !=
+      JxlDecoderSubscribeEvents(dec.get(), JXL_DEC_BASIC_INFO |
+                                               JXL_DEC_COLOR_ENCODING |
+                                               JXL_DEC_FULL_IMAGE)) {
+    fprintf(stderr, "JxlDecoderSubscribeEvents failed\n");
+    return false;
+  }
+
+  if (JXL_DEC_SUCCESS != JxlDecoderSetParallelRunner(dec.get(),
+                                                     JxlResizableParallelRunner,
+                                                     runner.get())) {
+    fprintf(stderr, "JxlDecoderSetParallelRunner failed\n");
+    return false;
+  }
+
+  JxlBasicInfo info;
+  JxlPixelFormat format = {4, JXL_TYPE_FLOAT, JXL_NATIVE_ENDIAN, 0};
+
+  JxlDecoderSetInput(dec.get(), jxl, size);
+  JxlDecoderCloseInput(dec.get());
+
+  for (;;) {
+    JxlDecoderStatus status = JxlDecoderProcessInput(dec.get());
+
+    if (status == JXL_DEC_ERROR) {
+      fprintf(stderr, "Decoder error\n");
+      return false;
+    } else if (status == JXL_DEC_NEED_MORE_INPUT) {
+      fprintf(stderr, "Error, already provided all input\n");
+      return false;
+    } else if (status == JXL_DEC_BASIC_INFO) {
+      if (JXL_DEC_SUCCESS != JxlDecoderGetBasicInfo(dec.get(), &info)) {
+        fprintf(stderr, "JxlDecoderGetBasicInfo failed\n");
+        return false;
+      }
+      *xsize = info.xsize;
+      *ysize = info.ysize;
+      JxlResizableParallelRunnerSetThreads(
+          runner.get(),
+          JxlResizableParallelRunnerSuggestThreads(info.xsize, info.ysize));
+    } else if (status == JXL_DEC_COLOR_ENCODING) {
+      // Get the ICC color profile of the pixel data
+      size_t icc_size;
+      if (JXL_DEC_SUCCESS !=
+          JxlDecoderGetICCProfileSize(
+              dec.get(), &format, JXL_COLOR_PROFILE_TARGET_DATA, &icc_size)) {
+        fprintf(stderr, "JxlDecoderGetICCProfileSize failed\n");
+        return false;
+      }
+      icc_profile->resize(icc_size);
+      if (JXL_DEC_SUCCESS != JxlDecoderGetColorAsICCProfile(
+                                 dec.get(), &format,
+                                 JXL_COLOR_PROFILE_TARGET_DATA,
+                                 icc_profile->data(), icc_profile->size())) {
+        fprintf(stderr, "JxlDecoderGetColorAsICCProfile failed\n");
+        return false;
+      }
+    } else if (status == JXL_DEC_NEED_IMAGE_OUT_BUFFER) {
+      size_t buffer_size;
+      if (JXL_DEC_SUCCESS !=
+          JxlDecoderImageOutBufferSize(dec.get(), &format, &buffer_size)) {
+        fprintf(stderr, "JxlDecoderImageOutBufferSize failed\n");
+        return false;
+      }
+      if (buffer_size != *xsize * *ysize * 16) {
+        fprintf(stderr, "Invalid out buffer size %" PRIu64 " %" PRIu64 "\n",
+                static_cast<uint64_t>(buffer_size),
+                static_cast<uint64_t>(*xsize * *ysize * 16));
+        return false;
+      }
+      pixels->resize(*xsize * *ysize * 4);
+      void* pixels_buffer = (void*)pixels->data();
+      size_t pixels_buffer_size = pixels->size() * sizeof(float);
+      if (JXL_DEC_SUCCESS != JxlDecoderSetImageOutBuffer(dec.get(), &format,
+                                                         pixels_buffer,
+                                                         pixels_buffer_size)) {
+        fprintf(stderr, "JxlDecoderSetImageOutBuffer failed\n");
+        return false;
+      }
+    } else if (status == JXL_DEC_FULL_IMAGE) {
+      // Nothing to do. Do not yet return. If the image is an animation, more
+      // full frames may be decoded. This example only keeps the last one.
+    } else if (status == JXL_DEC_SUCCESS) {
+      // All decoding successfully finished.
+      // It's not required to call JxlDecoderReleaseInput(dec.get()) here since
+      // the decoder will be destroyed.
+      return true;
+    } else {
+      fprintf(stderr, "Unknown decoder status\n");
+      return false;
+    }
+  }
+}
+
+/** Writes to .pfm file (Portable FloatMap). Gimp, tev viewer and ImageMagick
+ * support viewing this format.
+ * The input pixels are given as 32-bit floating point with 4-channel RGBA.
+ * The alpha channel will not be written since .pfm does not support it.
+ */
+bool WritePFM(const char* filename, const float* pixels, size_t xsize,
+              size_t ysize) {
+  FILE* file = fopen(filename, "wb");
+  if (!file) {
+    fprintf(stderr, "Could not open %s for writing", filename);
+    return false;
+  }
+  uint32_t endian_test = 1;
+  uint8_t little_endian[4];
+  memcpy(little_endian, &endian_test, 4);
+
+  fprintf(file, "PF\n%d %d\n%s\n", (int)xsize, (int)ysize,
+          little_endian[0] ? "-1.0" : "1.0");
+  for (int y = ysize - 1; y >= 0; y--) {
+    for (size_t x = 0; x < xsize; x++) {
+      for (size_t c = 0; c < 3; c++) {
+        const float* f = &pixels[(y * xsize + x) * 4 + c];
+        fwrite(f, 4, 1, file);
+      }
+    }
+  }
+  if (fclose(file) != 0) {
+    return false;
+  }
+  return true;
+}
+
+bool LoadFile(const char* filename, std::vector<uint8_t>* out) {
+  FILE* file = fopen(filename, "rb");
+  if (!file) {
+    return false;
+  }
+
+  if (fseek(file, 0, SEEK_END) != 0) {
+    fclose(file);
+    return false;
+  }
+
+  long size = ftell(file);
+  // Avoid invalid file or directory.
+  if (size >= LONG_MAX || size < 0) {
+    fclose(file);
+    return false;
+  }
+
+  if (fseek(file, 0, SEEK_SET) != 0) {
+    fclose(file);
+    return false;
+  }
+
+  out->resize(size);
+  size_t readsize = fread(out->data(), 1, size, file);
+  if (fclose(file) != 0) {
+    return false;
+  }
+
+  return readsize == static_cast<size_t>(size);
+}
+
+bool WriteFile(const char* filename, const uint8_t* data, size_t size) {
+  FILE* file = fopen(filename, "wb");
+  if (!file) {
+    fprintf(stderr, "Could not open %s for writing", filename);
+    return false;
+  }
+  fwrite(data, 1, size, file);
+  if (fclose(file) != 0) {
+    return false;
+  }
+  return true;
+}
+
+int main(int argc, char* argv[]) {
+  if (argc != 4) {
+    fprintf(stderr,
+            "Usage: %s <jxl> <pfm> <icc>\n"
+            "Where:\n"
+            "  jxl = input JPEG XL image filename\n"
+            "  pfm = output Portable FloatMap image filename\n"
+            "  icc = output ICC color profile filename\n"
+            "Output files will be overwritten.\n",
+            argv[0]);
+    return 1;
+  }
+
+  const char* jxl_filename = argv[1];
+  const char* pfm_filename = argv[2];
+  const char* icc_filename = argv[3];
+
+  std::vector<uint8_t> jxl;
+  if (!LoadFile(jxl_filename, &jxl)) {
+    fprintf(stderr, "couldn't load %s\n", jxl_filename);
+    return 1;
+  }
+
+  std::vector<float> pixels;
+  std::vector<uint8_t> icc_profile;
+  size_t xsize = 0, ysize = 0;
+  if (!DecodeJpegXlOneShot(jxl.data(), jxl.size(), &pixels, &xsize, &ysize,
+                           &icc_profile)) {
+    fprintf(stderr, "Error while decoding the jxl file\n");
+    return 1;
+  }
+  if (!WritePFM(pfm_filename, pixels.data(), xsize, ysize)) {
+    fprintf(stderr, "Error while writing the PFM image file\n");
+    return 1;
+  }
+  if (!WriteFile(icc_filename, icc_profile.data(), icc_profile.size())) {
+    fprintf(stderr, "Error while writing the ICC profile file\n");
+    return 1;
+  }
+  printf("Successfully wrote %s and %s\n", pfm_filename, icc_filename);
+  return 0;
+}
diff --git a/third_party/jpeg-xl/examples/decode_progressive.cc b/third_party/jpeg-xl/examples/decode_progressive.cc
new file mode 100644
index 0000000000..0d035121e4
--- /dev/null
+++ b/third_party/jpeg-xl/examples/decode_progressive.cc
@@ -0,0 +1,245 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// This C++ example decodes a JPEG XL image progressively (input bytes are
+// passed in chunks). The example outputs the intermediate steps to PAM files.
+
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS
+#endif
+
+#include <inttypes.h>
+#include <jxl/decode.h>
+#include <jxl/decode_cxx.h>
+#include <jxl/resizable_parallel_runner.h>
+#include <jxl/resizable_parallel_runner_cxx.h>
+#include <limits.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <vector>
+
+bool WritePAM(const char* filename, const uint8_t* buffer, size_t w, size_t h) {
+  FILE* fp = fopen(filename, "wb");
+  if (!fp) {
+    fprintf(stderr, "Could not open %s for writing", filename);
+    return false;
+  }
+  fprintf(fp,
+          "P7\nWIDTH %" PRIu64 "\nHEIGHT %" PRIu64
+          "\nDEPTH 4\nMAXVAL 255\nTUPLTYPE "
+          "RGB_ALPHA\nENDHDR\n",
+          static_cast<uint64_t>(w), static_cast<uint64_t>(h));
+  size_t num_bytes = w * h * 4;
+  if (fwrite(buffer, 1, num_bytes, fp) != num_bytes) {
+    fclose(fp);
+    return false;
+  };
+  if (fclose(fp) != 0) {
+    return false;
+  }
+  return true;
+}
+
+/** Decodes JPEG XL image to 8-bit integer RGBA pixels and an ICC Profile, in a
+ * progressive way, saving the intermediate steps.
+ */
+bool DecodeJpegXlProgressive(const uint8_t* jxl, size_t size,
+                             const char* filename, size_t chunksize) {
+  std::vector<uint8_t> pixels;
+  std::vector<uint8_t> icc_profile;
+  size_t xsize = 0, ysize = 0;
+
+  // Multi-threaded parallel runner.
+  auto runner = JxlResizableParallelRunnerMake(nullptr);
+
+  auto dec = JxlDecoderMake(nullptr);
+  if (JXL_DEC_SUCCESS !=
+      JxlDecoderSubscribeEvents(dec.get(), JXL_DEC_BASIC_INFO |
+                                               JXL_DEC_COLOR_ENCODING |
+                                               JXL_DEC_FULL_IMAGE)) {
+    fprintf(stderr, "JxlDecoderSubscribeEvents failed\n");
+    return false;
+  }
+
+  if (JXL_DEC_SUCCESS != JxlDecoderSetParallelRunner(dec.get(),
+                                                     JxlResizableParallelRunner,
+                                                     runner.get())) {
+    fprintf(stderr, "JxlDecoderSetParallelRunner failed\n");
+    return false;
+  }
+
+  JxlBasicInfo info;
+  JxlPixelFormat format = {4, JXL_TYPE_UINT8, JXL_NATIVE_ENDIAN, 0};
+
+  size_t seen = 0;
+  JxlDecoderSetInput(dec.get(), jxl, chunksize);
+  size_t remaining = chunksize;
+
+  for (;;) {
+    JxlDecoderStatus status = JxlDecoderProcessInput(dec.get());
+
+    if (status == JXL_DEC_ERROR) {
+      fprintf(stderr, "Decoder error\n");
+      return false;
+    } else if (status == JXL_DEC_NEED_MORE_INPUT || status == JXL_DEC_SUCCESS ||
+               status == JXL_DEC_FULL_IMAGE) {
+      seen += remaining - JxlDecoderReleaseInput(dec.get());
+      printf("Flushing after %" PRIu64 " bytes\n", static_cast<uint64_t>(seen));
+      if (status == JXL_DEC_NEED_MORE_INPUT &&
+          JXL_DEC_SUCCESS != JxlDecoderFlushImage(dec.get())) {
+        printf("flush error (no preview yet)\n");
+      } else {
+        char fname[1024];
+        if (snprintf(fname, 1024, "%s-%" PRIu64 ".pam", filename,
+                     static_cast<uint64_t>(seen)) >= 1024) {
+          fprintf(stderr, "Filename too long\n");
+          return false;
+        };
+        if (!WritePAM(fname, pixels.data(), xsize, ysize)) {
+          fprintf(stderr, "Error writing progressive output\n");
+        }
+      }
+      remaining = size - seen;
+      if (remaining > chunksize) remaining = chunksize;
+      if (remaining == 0) {
+        if (status == JXL_DEC_NEED_MORE_INPUT) {
+          fprintf(stderr, "Error, already provided all input\n");
+          return false;
+        } else {
+          return true;
+        }
+      }
+      JxlDecoderSetInput(dec.get(), jxl + seen, remaining);
+    } else if (status == JXL_DEC_BASIC_INFO) {
+      if (JXL_DEC_SUCCESS != JxlDecoderGetBasicInfo(dec.get(), &info)) {
+        fprintf(stderr, "JxlDecoderGetBasicInfo failed\n");
+        return false;
+      }
+      xsize = info.xsize;
+      ysize = info.ysize;
+      JxlResizableParallelRunnerSetThreads(
+          runner.get(),
+          JxlResizableParallelRunnerSuggestThreads(info.xsize, info.ysize));
+    } else if (status == JXL_DEC_COLOR_ENCODING) {
+      // Get the ICC color profile of the pixel data
+      size_t icc_size;
+      if (JXL_DEC_SUCCESS !=
+          JxlDecoderGetICCProfileSize(dec.get(), &format,
+                                      JXL_COLOR_PROFILE_TARGET_ORIGINAL,
+                                      &icc_size)) {
+        fprintf(stderr, "JxlDecoderGetICCProfileSize failed\n");
+        return false;
+      }
+      icc_profile.resize(icc_size);
+      if (JXL_DEC_SUCCESS != JxlDecoderGetColorAsICCProfile(
+                                 dec.get(), &format,
+                                 JXL_COLOR_PROFILE_TARGET_ORIGINAL,
+                                 icc_profile.data(), icc_profile.size())) {
+        fprintf(stderr, "JxlDecoderGetColorAsICCProfile failed\n");
+        return false;
+      }
+    } else if (status == JXL_DEC_NEED_IMAGE_OUT_BUFFER) {
+      size_t buffer_size;
+      if (JXL_DEC_SUCCESS !=
+          JxlDecoderImageOutBufferSize(dec.get(), &format, &buffer_size)) {
+        fprintf(stderr, "JxlDecoderImageOutBufferSize failed\n");
+        return false;
+      }
+      if (buffer_size != xsize * ysize * 4) {
+        fprintf(stderr, "Invalid out buffer size %" PRIu64 " != %" PRIu64 "\n",
+                static_cast<uint64_t>(buffer_size),
+                static_cast<uint64_t>(xsize * ysize * 4));
+        return false;
+      }
+      pixels.resize(xsize * ysize * 4);
+      void* pixels_buffer = (void*)pixels.data();
+      size_t pixels_buffer_size = pixels.size() * sizeof(float);
+      if (JXL_DEC_SUCCESS != JxlDecoderSetImageOutBuffer(dec.get(), &format,
+                                                         pixels_buffer,
+                                                         pixels_buffer_size)) {
+        fprintf(stderr, "JxlDecoderSetImageOutBuffer failed\n");
+        return false;
+      }
+    } else {
+      fprintf(stderr, "Unknown decoder status\n");
+      return false;
+    }
+  }
+}
+
+bool LoadFile(const char* filename, std::vector<uint8_t>* out) {
+  FILE* file = fopen(filename, "rb");
+  if (!file) {
+    return false;
+  }
+
+  if (fseek(file, 0, SEEK_END) != 0) {
+    fclose(file);
+    return false;
+  }
+
+  long size = ftell(file);
+  // Avoid invalid file or directory.
+  if (size >= LONG_MAX || size < 0) {
+    fclose(file);
+    return false;
+  }
+
+  if (fseek(file, 0, SEEK_SET) != 0) {
+    fclose(file);
+    return false;
+  }
+
+  out->resize(size);
+  size_t readsize = fread(out->data(), 1, size, file);
+  if (fclose(file) != 0) {
+    return false;
+  }
+
+  return readsize == static_cast<size_t>(size);
+}
+
+int main(int argc, char* argv[]) {
+  if (argc < 3) {
+    fprintf(
+        stderr,
+        "Usage: %s <jxl> <basename> [chunksize]\n"
+        "Where:\n"
+        "  jxl = input JPEG XL image filename\n"
+        "  basename = prefix of output filenames\n"
+        "  chunksize = loads chunksize bytes at a time and writes\n"
+        "              intermediate results to basename-[bytes loaded].pam\n"
+        "Output files will be overwritten.\n",
+        argv[0]);
+    return 1;
+  }
+
+  const char* jxl_filename = argv[1];
+  const char* png_filename = argv[2];
+
+  std::vector<uint8_t> jxl;
+  if (!LoadFile(jxl_filename, &jxl)) {
+    fprintf(stderr, "couldn't load %s\n", jxl_filename);
+    return 1;
+  }
+  size_t chunksize = jxl.size();
+  if (argc > 3) {
+    long cs = atol(argv[3]);
+    if (cs < 100) {
+      fprintf(stderr, "Chunk size is too low, try at least 100 bytes\n");
+      return 1;
+    }
+    chunksize = cs;
+  }
+
+  if (!DecodeJpegXlProgressive(jxl.data(), jxl.size(), png_filename,
+                               chunksize)) {
+    fprintf(stderr, "Error while decoding the jxl file\n");
+    return 1;
+  }
+  return 0;
+}
diff --git a/third_party/jpeg-xl/examples/encode_oneshot.cc b/third_party/jpeg-xl/examples/encode_oneshot.cc
new file mode 100644
index 0000000000..49b360ce3b
--- /dev/null
+++ b/third_party/jpeg-xl/examples/encode_oneshot.cc
@@ -0,0 +1,276 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// This example encodes a file containing a floating point image to another
+// file containing JPEG XL image with a single frame.
+
+#include <jxl/encode.h>
+#include <jxl/encode_cxx.h>
+#include <jxl/thread_parallel_runner.h>
+#include <jxl/thread_parallel_runner_cxx.h>
+#include <limits.h>
+#include <string.h>
+
+#include <sstream>
+#include <string>
+#include <vector>
+
+/**
+ * Reads from .pfm file (Portable FloatMap)
+ *
+ * @param filename name of the file to read
+ * @param pixels vector to fill with loaded pixels as 32-bit floating point with
+ * 3-channel RGB
+ * @param xsize set to width of loaded image
+ * @param ysize set to height of loaded image
+ */
+bool ReadPFM(const char* filename, std::vector<float>* pixels, uint32_t* xsize,
+             uint32_t* ysize) {
+  FILE* file = fopen(filename, "rb");
+  if (!file) {
+    fprintf(stderr, "Could not open %s for reading.\n", filename);
+    return false;
+  }
+  uint32_t endian_test = 1;
+  uint8_t little_endian[4];
+  memcpy(little_endian, &endian_test, 4);
+
+  if (fseek(file, 0, SEEK_END) != 0) {
+    fclose(file);
+    return false;
+  }
+
+  long size = ftell(file);
+  // Avoid invalid file or directory.
+  if (size >= LONG_MAX || size < 0) {
+    fclose(file);
+    return false;
+  }
+
+  if (fseek(file, 0, SEEK_SET) != 0) {
+    fclose(file);
+    return false;
+  }
+
+  std::vector<char> data;
+  data.resize(size);
+
+  size_t readsize = fread(data.data(), 1, size, file);
+  if ((long)readsize != size) {
+    return false;
+  }
+  if (fclose(file) != 0) {
+    return false;
+  }
+
+  std::stringstream datastream;
+  std::string datastream_content(data.data(), data.size());
+  datastream.str(datastream_content);
+
+  std::string pf_token;
+  getline(datastream, pf_token, '\n');
+  if (pf_token != "PF") {
+    fprintf(stderr,
+            "%s doesn't seem to be a 3 channel Portable FloatMap file (missing "
+            "'PF\\n' "
+            "bytes).\n",
+            filename);
+    return false;
+  }
+
+  std::string xsize_token;
+  getline(datastream, xsize_token, ' ');
+  *xsize = std::stoi(xsize_token);
+
+  std::string ysize_token;
+  getline(datastream, ysize_token, '\n');
+  *ysize = std::stoi(ysize_token);
+
+  std::string endianness_token;
+  getline(datastream, endianness_token, '\n');
+  bool input_little_endian;
+  if (endianness_token == "1.0") {
+    input_little_endian = false;
+  } else if (endianness_token == "-1.0") {
+    input_little_endian = true;
+  } else {
+    fprintf(stderr,
+            "%s doesn't seem to be a Portable FloatMap file (endianness token "
+            "isn't '1.0' or '-1.0').\n",
+            filename);
+    return false;
+  }
+
+  size_t offset = pf_token.size() + 1 + xsize_token.size() + 1 +
+                  ysize_token.size() + 1 + endianness_token.size() + 1;
+
+  if (data.size() != *ysize * *xsize * 3 * 4 + offset) {
+    fprintf(stderr,
+            "%s doesn't seem to be a Portable FloatMap file (pixel data bytes "
+            "are %d, but expected %d * %d * 3 * 4 + %d (%d).\n",
+            filename, (int)data.size(), (int)*ysize, (int)*xsize, (int)offset,
+            (int)(*ysize * *xsize * 3 * 4 + offset));
+    return false;
+  }
+
+  if (!!little_endian[0] != input_little_endian) {
+    fprintf(stderr,
+            "%s has a different endianness than we do, conversion is not "
+            "supported.\n",
+            filename);
+    return false;
+  }
+
+  pixels->resize(*ysize * *xsize * 3);
+
+  for (int y = *ysize - 1; y >= 0; y--) {
+    for (int x = 0; x < (int)*xsize; x++) {
+      for (int c = 0; c < 3; c++) {
+        memcpy(pixels->data() + (y * *xsize + x) * 3 + c, data.data() + offset,
+               sizeof(float));
+        offset += sizeof(float);
+      }
+    }
+  }
+
+  return true;
+}
+
+/**
+ * Compresses the provided pixels.
+ *
+ * @param pixels input pixels
+ * @param xsize width of the input image
+ * @param ysize height of the input image
+ * @param compressed will be populated with the compressed bytes
+ */
+bool EncodeJxlOneshot(const std::vector<float>& pixels, const uint32_t xsize,
+                      const uint32_t ysize, std::vector<uint8_t>* compressed) {
+  auto enc = JxlEncoderMake(/*memory_manager=*/nullptr);
+  auto runner = JxlThreadParallelRunnerMake(
+      /*memory_manager=*/nullptr,
+      JxlThreadParallelRunnerDefaultNumWorkerThreads());
+  if (JXL_ENC_SUCCESS != JxlEncoderSetParallelRunner(enc.get(),
+                                                     JxlThreadParallelRunner,
+                                                     runner.get())) {
+    fprintf(stderr, "JxlEncoderSetParallelRunner failed\n");
+    return false;
+  }
+
+  JxlPixelFormat pixel_format = {3, JXL_TYPE_FLOAT, JXL_NATIVE_ENDIAN, 0};
+
+  JxlBasicInfo basic_info;
+  JxlEncoderInitBasicInfo(&basic_info);
+  basic_info.xsize = xsize;
+  basic_info.ysize = ysize;
+  basic_info.bits_per_sample = 32;
+  basic_info.exponent_bits_per_sample = 8;
+  basic_info.uses_original_profile = JXL_FALSE;
+  if (JXL_ENC_SUCCESS != JxlEncoderSetBasicInfo(enc.get(), &basic_info)) {
+    fprintf(stderr, "JxlEncoderSetBasicInfo failed\n");
+    return false;
+  }
+
+  JxlColorEncoding color_encoding = {};
+  JxlColorEncodingSetToSRGB(&color_encoding,
+                            /*is_gray=*/pixel_format.num_channels < 3);
+  if (JXL_ENC_SUCCESS !=
+      JxlEncoderSetColorEncoding(enc.get(), &color_encoding)) {
+    fprintf(stderr, "JxlEncoderSetColorEncoding failed\n");
+    return false;
+  }
+
+  JxlEncoderFrameSettings* frame_settings =
+      JxlEncoderFrameSettingsCreate(enc.get(), nullptr);
+
+  if (JXL_ENC_SUCCESS !=
+      JxlEncoderAddImageFrame(frame_settings, &pixel_format,
+                              (void*)pixels.data(),
+                              sizeof(float) * pixels.size())) {
+    fprintf(stderr, "JxlEncoderAddImageFrame failed\n");
+    return false;
+  }
+  JxlEncoderCloseInput(enc.get());
+
+  compressed->resize(64);
+  uint8_t* next_out = compressed->data();
+  size_t avail_out = compressed->size() - (next_out - compressed->data());
+  JxlEncoderStatus process_result = JXL_ENC_NEED_MORE_OUTPUT;
+  while (process_result == JXL_ENC_NEED_MORE_OUTPUT) {
+    process_result = JxlEncoderProcessOutput(enc.get(), &next_out, &avail_out);
+    if (process_result == JXL_ENC_NEED_MORE_OUTPUT) {
+      size_t offset = next_out - compressed->data();
+      compressed->resize(compressed->size() * 2);
+      next_out = compressed->data() + offset;
+      avail_out = compressed->size() - offset;
+    }
+  }
+  compressed->resize(next_out - compressed->data());
+  if (JXL_ENC_SUCCESS != process_result) {
+    fprintf(stderr, "JxlEncoderProcessOutput failed\n");
+    return false;
+  }
+
+  return true;
+}
+
+/**
+ * Writes bytes to file.
+ */
+bool WriteFile(const std::vector<uint8_t>& bytes, const char* filename) {
+  FILE* file = fopen(filename, "wb");
+  if (!file) {
+    fprintf(stderr, "Could not open %s for writing\n", filename);
+    return false;
+  }
+  if (fwrite(bytes.data(), sizeof(uint8_t), bytes.size(), file) !=
+      bytes.size()) {
+    fprintf(stderr, "Could not write bytes to %s\n", filename);
+    fclose(file);
+    return false;
+  }
+  if (fclose(file) != 0) {
+    fprintf(stderr, "Could not close %s\n", filename);
+    return false;
+  }
+  return true;
+}
+
+int main(int argc, char* argv[]) {
+  if (argc != 3) {
+    fprintf(stderr,
+            "Usage: %s <pfm> <jxl>\n"
+            "Where:\n"
+            "  pfm = input Portable FloatMap image filename\n"
+            "  jxl = output JPEG XL image filename\n"
+            "Output files will be overwritten.\n",
+            argv[0]);
+    return 1;
+  }
+
+  const char* pfm_filename = argv[1];
+  const char* jxl_filename = argv[2];
+
+  std::vector<float> pixels;
+  uint32_t xsize;
+  uint32_t ysize;
+  if (!ReadPFM(pfm_filename, &pixels, &xsize, &ysize)) {
+    fprintf(stderr, "Couldn't load %s\n", pfm_filename);
+    return 2;
+  }
+
+  std::vector<uint8_t> compressed;
+  if (!EncodeJxlOneshot(pixels, xsize, ysize, &compressed)) {
+    fprintf(stderr, "Couldn't encode jxl\n");
+    return 3;
+  }
+
+  if (!WriteFile(compressed, jxl_filename)) {
+    fprintf(stderr, "Couldn't write jxl file\n");
+    return 4;
+  }
+
+  return 0;
+}
diff --git a/third_party/jpeg-xl/examples/examples.cmake b/third_party/jpeg-xl/examples/examples.cmake
new file mode 100644
index 0000000000..fd159578bc
--- /dev/null
+++ b/third_party/jpeg-xl/examples/examples.cmake
@@ -0,0 +1,11 @@
+# Copyright (c) the JPEG XL Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+add_executable(decode_oneshot ${CMAKE_CURRENT_LIST_DIR}/decode_oneshot.cc)
+target_link_libraries(decode_oneshot jxl_dec jxl_threads)
+add_executable(decode_progressive ${CMAKE_CURRENT_LIST_DIR}/decode_progressive.cc)
+target_link_libraries(decode_progressive jxl_dec jxl_threads)
+add_executable(encode_oneshot ${CMAKE_CURRENT_LIST_DIR}/encode_oneshot.cc)
+target_link_libraries(encode_oneshot jxl jxl_threads)
diff --git a/third_party/jpeg-xl/experimental/fast_lossless/.gitignore b/third_party/jpeg-xl/experimental/fast_lossless/.gitignore
new file mode 100644
index 0000000000..567609b123
--- /dev/null
+++ b/third_party/jpeg-xl/experimental/fast_lossless/.gitignore
@@ -0,0 +1 @@
+build/
diff --git a/third_party/jpeg-xl/experimental/fast_lossless/README.md b/third_party/jpeg-xl/experimental/fast_lossless/README.md
new file mode 100644
index 0000000000..5f99c133d8
--- /dev/null
+++ b/third_party/jpeg-xl/experimental/fast_lossless/README.md
@@ -0,0 +1,10 @@
+# Fast-lossless
+This is a script to compile a standalone version of a JXL encoder that supports
+lossless compression, up to 16 bits, of 1- to 4-channel images and animations; it is
+very fast and compression is slightly worse than PNG for 8-bit nonphoto content
+and better or much better than PNG for all other situations.
+
+The main encoder is made out of two files, `lib/jxl/enc_fast_lossless.{cc,h}`;
+it automatically selects and runs a SIMD implementation supported by your CPU.
+
+This folder contains an example build script and `main` file.
diff --git a/third_party/jpeg-xl/experimental/fast_lossless/build-android.sh b/third_party/jpeg-xl/experimental/fast_lossless/build-android.sh
new file mode 100755
index 0000000000..c155b2169a
--- /dev/null
+++ b/third_party/jpeg-xl/experimental/fast_lossless/build-android.sh
@@ -0,0 +1,27 @@
+#!/usr/bin/env bash
+# Copyright (c) the JPEG XL Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+set -e
+
+DIR=$(realpath "$(dirname "$0")")
+
+mkdir -p /tmp/build-android
+cd /tmp/build-android
+
+CXX="$ANDROID_NDK"/toolchains/llvm/prebuilt/linux-x86_64/bin/aarch64-linux-android30-clang++
+if ! command -v "$CXX" >/dev/null ; then
+  printf >&2 '%s: Android C++ compiler not found, is ANDROID_NDK set properly?\n' "${0##*/}"
+  exit 1
+fi
+
+[ -f lodepng.cpp ] || curl -o lodepng.cpp --url 'https://raw.githubusercontent.com/lvandeve/lodepng/8c6a9e30576f07bf470ad6f09458a2dcd7a6a84a/lodepng.cpp'
+[ -f lodepng.h ] || curl -o lodepng.h --url 'https://raw.githubusercontent.com/lvandeve/lodepng/8c6a9e30576f07bf470ad6f09458a2dcd7a6a84a/lodepng.h'
+[ -f lodepng.o ] || "$CXX" lodepng.cpp -O3 -o lodepng.o -c
+
+"$CXX" -O3 \
+  -I. lodepng.o \
+  -I"${DIR}"/../../ \
+  "${DIR}"/../../lib/jxl/enc_fast_lossless.cc "${DIR}"/fast_lossless_main.cc \
+  -o fast_lossless
diff --git a/third_party/jpeg-xl/experimental/fast_lossless/build.sh b/third_party/jpeg-xl/experimental/fast_lossless/build.sh
new file mode 100755
index 0000000000..e2c0aa3fd0
--- /dev/null
+++ b/third_party/jpeg-xl/experimental/fast_lossless/build.sh
@@ -0,0 +1,27 @@
+#!/usr/bin/env bash
+# Copyright (c) the JPEG XL Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+set -e
+
+DIR=$(realpath "$(dirname "$0")")
+mkdir -p "$DIR"/build
+cd "$DIR"/build
+
+# set CXX to clang++ if not set in the environment
+CXX="${CXX-clang++}"
+if ! command -v "$CXX" >/dev/null ; then
+  printf >&2 '%s: C++ compiler not found\n' "${0##*/}"
+  exit 1
+fi
+
+[ -f lodepng.cpp ] || curl -o lodepng.cpp --url 'https://raw.githubusercontent.com/lvandeve/lodepng/8c6a9e30576f07bf470ad6f09458a2dcd7a6a84a/lodepng.cpp'
+[ -f lodepng.h ] || curl -o lodepng.h --url 'https://raw.githubusercontent.com/lvandeve/lodepng/8c6a9e30576f07bf470ad6f09458a2dcd7a6a84a/lodepng.h'
+[ -f lodepng.o ] || "$CXX" lodepng.cpp -O3 -o lodepng.o -c
+
+"$CXX" -O3 \
+  -I. -g lodepng.o \
+  -I"$DIR"/../../ \
+  "$DIR"/../../lib/jxl/enc_fast_lossless.cc "$DIR"/fast_lossless_main.cc \
+  -o fast_lossless
diff --git a/third_party/jpeg-xl/experimental/fast_lossless/cross_compile_aarch64.sh b/third_party/jpeg-xl/experimental/fast_lossless/cross_compile_aarch64.sh
new file mode 100755
index 0000000000..a5e6aa2a52
--- /dev/null
+++ b/third_party/jpeg-xl/experimental/fast_lossless/cross_compile_aarch64.sh
@@ -0,0 +1,26 @@
+#!/usr/bin/env bash
+# Copyright (c) the JPEG XL Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+set -e
+
+DIR=$(realpath "$(dirname "$0")")
+mkdir -p "$DIR"/build-aarch64
+cd "$DIR"/build-aarch64
+
+CXX="${CXX-aarch64-linux-gnu-c++}"
+if ! command -v "$CXX" >/dev/null ; then
+  printf >&2 '%s: C++ compiler not found\n' "${0##*/}"
+  exit 1
+fi
+
+[ -f lodepng.cpp ] || curl -o lodepng.cpp --url 'https://raw.githubusercontent.com/lvandeve/lodepng/8c6a9e30576f07bf470ad6f09458a2dcd7a6a84a/lodepng.cpp'
+[ -f lodepng.h ] || curl -o lodepng.h --url 'https://raw.githubusercontent.com/lvandeve/lodepng/8c6a9e30576f07bf470ad6f09458a2dcd7a6a84a/lodepng.h'
+[ -f lodepng.o ] || "$CXX" lodepng.cpp -O3 -o lodepng.o -c
+
+"$CXX" -O3 -static \
+  -I. lodepng.o \
+  -I"$DIR"/../../ \
+  "$DIR"/../../lib/jxl/enc_fast_lossless.cc "$DIR"/fast_lossless_main.cc \
+  -o fast_lossless
diff --git a/third_party/jpeg-xl/experimental/fast_lossless/fast_lossless_main.cc b/third_party/jpeg-xl/experimental/fast_lossless/fast_lossless_main.cc
new file mode 100644
index 0000000000..b59051d4e2
--- /dev/null
+++ b/third_party/jpeg-xl/experimental/fast_lossless/fast_lossless_main.cc
@@ -0,0 +1,113 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <atomic>
+#include <chrono>
+#include <thread>
+#include <vector>
+
+#include "lib/jxl/enc_fast_lossless.h"
+#include "lodepng.h"
+#include "pam-input.h"
+
+int main(int argc, char** argv) {
+  if (argc < 3) {
+    fprintf(stderr,
+            "Usage: %s in.png out.jxl [effort] [num_reps] [num_threads]\n",
+            argv[0]);
+    return 1;
+  }
+
+  const char* in = argv[1];
+  const char* out = argv[2];
+  int effort = argc >= 4 ? atoi(argv[3]) : 2;
+  size_t num_reps = argc >= 5 ? atoi(argv[4]) : 1;
+  size_t num_threads = argc >= 6 ? atoi(argv[5]) : 0;
+
+  if (effort < 0 || effort > 127) {
+    fprintf(
+        stderr,
+        "Effort should be between 0 and 127 (default is 2, more is slower)\n");
+    return 1;
+  }
+
+  unsigned char* png;
+  unsigned w, h;
+  size_t nb_chans = 4, bitdepth = 8;
+
+  unsigned error = lodepng_decode32_file(&png, &w, &h, in);
+
+  size_t width = w, height = h;
+  if (error && !DecodePAM(in, &png, &width, &height, &nb_chans, &bitdepth)) {
+    fprintf(stderr, "lodepng error %u: %s\n", error, lodepng_error_text(error));
+    return 1;
+  }
+
+  auto parallel_runner = [](void* num_threads_ptr, void* opaque,
+                            void fun(void*, size_t), size_t count) {
+    size_t num_threads = *(size_t*)num_threads_ptr;
+    if (num_threads == 0) {
+      num_threads = std::thread::hardware_concurrency();
+    }
+    if (num_threads > count) {
+      num_threads = count;
+    }
+    if (num_threads == 1) {
+      for (size_t i = 0; i < count; i++) {
+        fun(opaque, i);
+      }
+    } else {
+      std::atomic<int> task{0};
+      std::vector<std::thread> threads;
+      for (size_t i = 0; i < num_threads; i++) {
+        threads.push_back(std::thread([count, opaque, fun, &task]() {
+          while (true) {
+            int t = task++;
+            if (t >= count) break;
+            fun(opaque, t);
+          }
+        }));
+      }
+      for (auto& t : threads) t.join();
+    }
+  };
+
+  size_t encoded_size = 0;
+  unsigned char* encoded = nullptr;
+  size_t stride = width * nb_chans * (bitdepth > 8 ? 2 : 1);
+
+  auto start = std::chrono::high_resolution_clock::now();
+  for (size_t _ = 0; _ < num_reps; _++) {
+    free(encoded);
+    encoded_size = JxlFastLosslessEncode(
+        png, width, stride, height, nb_chans, bitdepth,
+        /*big_endian=*/true, effort, &encoded, &num_threads, +parallel_runner);
+  }
+  auto stop = std::chrono::high_resolution_clock::now();
+  if (num_reps > 1) {
+    float us =
+        std::chrono::duration_cast<std::chrono::microseconds>(stop - start)
+            .count();
+    size_t pixels = size_t{width} * size_t{height} * num_reps;
+    float mps = pixels / us;
+    fprintf(stderr, "%10.3f MP/s\n", mps);
+    fprintf(stderr, "%10.3f bits/pixel\n",
+            encoded_size * 8.0 / float(width) / float(height));
+  }
+
+  FILE* o = fopen(out, "wb");
+  if (!o) {
+    fprintf(stderr, "error opening %s: %s\n", out, strerror(errno));
+    return 1;
+  }
+  if (fwrite(encoded, 1, encoded_size, o) != encoded_size) {
+    fprintf(stderr, "error writing to %s: %s\n", out, strerror(errno));
+  }
+  fclose(o);
+}
diff --git a/third_party/jpeg-xl/experimental/fast_lossless/pam-input.h b/third_party/jpeg-xl/experimental/fast_lossless/pam-input.h
new file mode 100644
index 0000000000..4ecbe6b72d
--- /dev/null
+++ b/third_party/jpeg-xl/experimental/fast_lossless/pam-input.h
@@ -0,0 +1,289 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <limits.h>
+#include <stdlib.h>
+#include <string.h>
+
+bool error_msg(const char* message) {
+  fprintf(stderr, "%s\n", message);
+  return false;
+}
+#define return_on_error(X) \
+  if (!X) return false;
+
+size_t Log2(uint32_t value) { return 31 - __builtin_clz(value); }
+
+struct HeaderPNM {
+  size_t xsize;
+  size_t ysize;
+  bool is_gray;    // PGM
+  bool has_alpha;  // PAM
+  size_t bits_per_sample;
+};
+
+class Parser {
+ public:
+  explicit Parser(uint8_t* data, size_t length)
+      : pos_(data), end_(data + length) {}
+
+  // Sets "pos" to the first non-header byte/pixel on success.
+  bool ParseHeader(HeaderPNM* header, const uint8_t** pos) {
+    // codec.cc ensures we have at least two bytes => no range check here.
+    if (pos_[0] != 'P') return false;
+    const uint8_t type = pos_[1];
+    pos_ += 2;
+
+    switch (type) {
+      case '5':
+        header->is_gray = true;
+        return ParseHeaderPNM(header, pos);
+
+      case '6':
+        header->is_gray = false;
+        return ParseHeaderPNM(header, pos);
+
+      case '7':
+        return ParseHeaderPAM(header, pos);
+    }
+    return false;
+  }
+
+  // Exposed for testing
+  bool ParseUnsigned(size_t* number) {
+    if (pos_ == end_) return error_msg("PNM: reached end before number");
+    if (!IsDigit(*pos_)) return error_msg("PNM: expected unsigned number");
+
+    *number = 0;
+    while (pos_ < end_ && *pos_ >= '0' && *pos_ <= '9') {
+      *number *= 10;
+      *number += *pos_ - '0';
+      ++pos_;
+    }
+
+    return true;
+  }
+
+  bool ParseSigned(double* number) {
+    if (pos_ == end_) return error_msg("PNM: reached end before signed");
+
+    if (*pos_ != '-' && *pos_ != '+' && !IsDigit(*pos_)) {
+      return error_msg("PNM: expected signed number");
+    }
+
+    // Skip sign
+    const bool is_neg = *pos_ == '-';
+    if (is_neg || *pos_ == '+') {
+      ++pos_;
+      if (pos_ == end_) return error_msg("PNM: reached end before digits");
+    }
+
+    // Leading digits
+    *number = 0.0;
+    while (pos_ < end_ && *pos_ >= '0' && *pos_ <= '9') {
+      *number *= 10;
+      *number += *pos_ - '0';
+      ++pos_;
+    }
+
+    // Decimal places?
+    if (pos_ < end_ && *pos_ == '.') {
+      ++pos_;
+      double place = 0.1;
+      while (pos_ < end_ && *pos_ >= '0' && *pos_ <= '9') {
+        *number += (*pos_ - '0') * place;
+        place *= 0.1;
+        ++pos_;
+      }
+    }
+
+    if (is_neg) *number = -*number;
+    return true;
+  }
+
+ private:
+  static bool IsDigit(const uint8_t c) { return '0' <= c && c <= '9'; }
+  static bool IsLineBreak(const uint8_t c) { return c == '\r' || c == '\n'; }
+  static bool IsWhitespace(const uint8_t c) {
+    return IsLineBreak(c) || c == '\t' || c == ' ';
+  }
+
+  bool SkipBlank() {
+    if (pos_ == end_) return error_msg("PNM: reached end before blank");
+    const uint8_t c = *pos_;
+    if (c != ' ' && c != '\n') return error_msg("PNM: expected blank");
+    ++pos_;
+    return true;
+  }
+
+  bool SkipSingleWhitespace() {
+    if (pos_ == end_) return error_msg("PNM: reached end before whitespace");
+    if (!IsWhitespace(*pos_)) return error_msg("PNM: expected whitespace");
+    ++pos_;
+    return true;
+  }
+
+  bool SkipWhitespace() {
+    if (pos_ == end_) return error_msg("PNM: reached end before whitespace");
+    if (!IsWhitespace(*pos_) && *pos_ != '#') {
+      return error_msg("PNM: expected whitespace/comment");
+    }
+
+    while (pos_ < end_ && IsWhitespace(*pos_)) {
+      ++pos_;
+    }
+
+    // Comment(s)
+    while (pos_ != end_ && *pos_ == '#') {
+      while (pos_ != end_ && !IsLineBreak(*pos_)) {
+        ++pos_;
+      }
+      // Newline(s)
+      while (pos_ != end_ && IsLineBreak(*pos_)) pos_++;
+    }
+
+    while (pos_ < end_ && IsWhitespace(*pos_)) {
+      ++pos_;
+    }
+    return true;
+  }
+
+  bool MatchString(const char* keyword) {
+    const uint8_t* ppos = pos_;
+    while (*keyword) {
+      if (ppos >= end_) return error_msg("PAM: unexpected end of input");
+      if (*keyword != *ppos) return false;
+      ppos++;
+      keyword++;
+    }
+    pos_ = ppos;
+    return_on_error(SkipWhitespace());
+    return true;
+  }
+
+  bool ParseHeaderPAM(HeaderPNM* header, const uint8_t** pos) {
+    size_t num_channels = 3;
+    size_t max_val = 255;
+    while (!MatchString("ENDHDR")) {
+      return_on_error(SkipWhitespace());
+      if (MatchString("WIDTH")) {
+        return_on_error(ParseUnsigned(&header->xsize));
+      } else if (MatchString("HEIGHT")) {
+        return_on_error(ParseUnsigned(&header->ysize));
+      } else if (MatchString("DEPTH")) {
+        return_on_error(ParseUnsigned(&num_channels));
+      } else if (MatchString("MAXVAL")) {
+        return_on_error(ParseUnsigned(&max_val));
+      } else if (MatchString("TUPLTYPE")) {
+        if (MatchString("RGB_ALPHA")) {
+          header->has_alpha = true;
+        } else if (MatchString("RGB")) {
+        } else if (MatchString("GRAYSCALE_ALPHA")) {
+          header->has_alpha = true;
+          header->is_gray = true;
+        } else if (MatchString("GRAYSCALE")) {
+          header->is_gray = true;
+        } else if (MatchString("BLACKANDWHITE_ALPHA")) {
+          header->has_alpha = true;
+          header->is_gray = true;
+          max_val = 1;
+        } else if (MatchString("BLACKANDWHITE")) {
+          header->is_gray = true;
+          max_val = 1;
+        } else {
+          return error_msg("PAM: unknown TUPLTYPE");
+        }
+      } else {
+        return error_msg("PAM: unknown header keyword");
+      }
+    }
+    if (num_channels !=
+        (header->has_alpha ? 1 : 0) + (header->is_gray ? 1 : 3)) {
+      return error_msg("PAM: bad DEPTH");
+    }
+    if (max_val == 0 || max_val >= 65536) {
+      return error_msg("PAM: bad MAXVAL");
+    }
+    header->bits_per_sample = Log2(max_val + 1);
+
+    *pos = pos_;
+    return true;
+  }
+
+  bool ParseHeaderPNM(HeaderPNM* header, const uint8_t** pos) {
+    return_on_error(SkipWhitespace());
+    return_on_error(ParseUnsigned(&header->xsize));
+
+    return_on_error(SkipWhitespace());
+    return_on_error(ParseUnsigned(&header->ysize));
+
+    return_on_error(SkipWhitespace());
+    size_t max_val;
+    return_on_error(ParseUnsigned(&max_val));
+    if (max_val == 0 || max_val >= 65536) {
+      return error_msg("PNM: bad MaxVal");
+    }
+    header->bits_per_sample = Log2(max_val + 1);
+
+    return_on_error(SkipSingleWhitespace());
+
+    *pos = pos_;
+    return true;
+  }
+
+  const uint8_t* pos_;
+  const uint8_t* const end_;
+};
+
+bool load_file(unsigned char** out, size_t* outsize, const char* filename) {
+  FILE* file;
+  file = fopen(filename, "rb");
+  if (!file) return false;
+  if (fseek(file, 0, SEEK_END) != 0) {
+    fclose(file);
+    return false;
+  }
+  *outsize = ftell(file);
+  if (*outsize == LONG_MAX || *outsize < 9 || fseek(file, 0, SEEK_SET)) {
+    fclose(file);
+    return false;
+  }
+  *out = (unsigned char*)malloc(*outsize);
+  if (!(*out)) return false;
+  size_t readsize;
+  readsize = fread(*out, 1, *outsize, file);
+  fclose(file);
+  if (readsize != *outsize) return false;
+  return true;
+}
+
+bool DecodePAM(const char* filename, uint8_t** buffer, size_t* w, size_t* h,
+               size_t* nb_chans, size_t* bitdepth) {
+  unsigned char* in_file;
+  size_t in_size;
+  if (!load_file(&in_file, &in_size, filename))
+    return error_msg("Could not read input file");
+  Parser parser(in_file, in_size);
+  HeaderPNM header = {};
+  const uint8_t* pos = nullptr;
+  if (!parser.ParseHeader(&header, &pos)) return false;
+
+  if (header.bits_per_sample == 0 || header.bits_per_sample > 16) {
+    return error_msg("PNM: bits_per_sample invalid (can do at most 16-bit)");
+  }
+  *w = header.xsize;
+  *h = header.ysize;
+  *bitdepth = header.bits_per_sample;
+  *nb_chans = (header.is_gray ? 1 : 3) + (header.has_alpha ? 1 : 0);
+
+  size_t pnm_remaining_size = in_file + in_size - pos;
+  size_t buffer_size = *w * *h * *nb_chans * (*bitdepth > 8 ? 2 : 1);
+  if (pnm_remaining_size < buffer_size) {
+    return error_msg("PNM file too small");
+  }
+  *buffer = (uint8_t*)malloc(buffer_size);
+  memcpy(*buffer, pos, buffer_size);
+  return true;
+}
diff --git a/third_party/jpeg-xl/lib/BUILD b/third_party/jpeg-xl/lib/BUILD
new file mode 100644
index 0000000000..1707a36f77
--- /dev/null
+++ b/third_party/jpeg-xl/lib/BUILD
@@ -0,0 +1,256 @@
+# Copyright (c) the JPEG XL Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+# Load sources/headers/tests lists.
+load(
+    "jxl_lists.bzl",
+    "libjxl_base_sources",
+    "libjxl_codec_apng_sources",
+    "libjxl_codec_exr_sources",
+    "libjxl_codec_gif_sources",
+    "libjxl_codec_jpegli_sources",
+    "libjxl_codec_jpg_sources",
+    "libjxl_codec_jxl_sources",
+    "libjxl_codec_npy_sources",
+    "libjxl_codec_pgx_sources",
+    "libjxl_codec_pnm_sources",
+    "libjxl_dec_box_sources",
+    "libjxl_dec_jpeg_sources",
+    "libjxl_dec_sources",
+    "libjxl_enc_sources",
+    "libjxl_extras_for_tools_sources",
+    "libjxl_extras_sources",
+    #'libjxl_gbench_sources',
+    "libjxl_jpegli_sources",
+    "libjxl_jpegli_testlib_files",
+    "libjxl_jpegli_tests",
+    "libjxl_major_version",
+    "libjxl_minor_version",
+    "libjxl_patch_version",
+    "libjxl_public_headers",
+    "libjxl_testlib_files",
+    "libjxl_tests",
+    "libjxl_threads_public_headers",
+    "libjxl_threads_sources",
+)
+load(
+    "jxl_vars.bzl",
+    "libjxl_deps_brotli",
+    "libjxl_deps_exr",
+    "libjxl_deps_gif",
+    "libjxl_deps_gtest",
+    "libjxl_deps_hwy",
+    "libjxl_deps_hwy_nanobenchmark",
+    "libjxl_deps_hwy_test_util",
+    "libjxl_deps_jpeg",
+    "libjxl_deps_jxl_box",
+    "libjxl_deps_png",
+    "libjxl_deps_runfiles",
+    "libjxl_deps_skcms",
+    "libjxl_deps_testdata",
+    "libjxl_root_package",
+    "libjxl_test_shards",
+    "libjxl_test_timeouts",
+)
+load("@bazel_skylib//rules:expand_template.bzl", "expand_template")
+
+DEFAULT_VISIBILITY = ["//:__subpackages__"]
+
+DEFAULT_COMPATIBILITY = []
+
+INCLUDES_DIR = "include"
+
+package(
+    default_visibility = ["//:__subpackages__"],
+)
+
+licenses(["notice"])
+
+exports_files(["LICENSE"])
+
+EXPORT_TEMPLATE = """
+#ifndef @_EXPORT_H
+#define @_EXPORT_H
+
+#define @_EXPORT
+#define @_NO_EXPORT
+
+#ifndef @_DEPRECATED
+#  define @_DEPRECATED __attribute__ ((__deprecated__))
+#endif
+
+#endif
+"""
+
+JXL_EXPORT_H = INCLUDES_DIR + "/jxl/jxl_export.h"
+
+genrule(
+    name = "create_jxl_export",
+    outs = [JXL_EXPORT_H],
+    cmd = "echo '" + EXPORT_TEMPLATE.replace("@", "JXL") + "' > $@",
+    compatible_with = DEFAULT_COMPATIBILITY,
+)
+
+JXL_THREADS_EXPORT_H = INCLUDES_DIR + "/jxl/jxl_threads_export.h"
+
+genrule(
+    name = "create_jxl_threads_export",
+    outs = [JXL_THREADS_EXPORT_H],
+    cmd = "echo '" + EXPORT_TEMPLATE.replace("@", "JXL_THREADS") + "' > $@",
+    compatible_with = DEFAULT_COMPATIBILITY,
+)
+
+JXL_VERSION_H = INCLUDES_DIR + "/jxl/version.h"
+
+expand_template(
+    name = "expand_jxl_version",
+    out = JXL_VERSION_H,
+    compatible_with = DEFAULT_COMPATIBILITY,
+    substitutions = {
+        "@JPEGXL_MAJOR_VERSION@": str(libjxl_major_version),
+        "@JPEGXL_MINOR_VERSION@": str(libjxl_minor_version),
+        "@JPEGXL_PATCH_VERSION@": str(libjxl_patch_version),
+    },
+    template = "jxl/version.h.in",
+)
+
+cc_library(
+    name = "jxl_version",
+    hdrs = [JXL_VERSION_H],
+    compatible_with = DEFAULT_COMPATIBILITY,
+    strip_include_prefix = INCLUDES_DIR,
+)
+
+cc_library(
+    name = "includes",
+    hdrs = libjxl_public_headers + [JXL_EXPORT_H],
+    compatible_with = DEFAULT_COMPATIBILITY,
+    strip_include_prefix = INCLUDES_DIR,
+    deps = [":jxl_version"],
+)
+
+cc_library(
+    name = "base",
+    srcs = [path for path in libjxl_base_sources if path.endswith(".cc")],
+    hdrs = [path for path in libjxl_base_sources if path.endswith(".h")],
+    compatible_with = DEFAULT_COMPATIBILITY,
+    deps = [
+        ":includes",
+    ] + libjxl_deps_hwy,
+)
+
+cc_library(
+    name = "jpegxl",
+    srcs = libjxl_dec_sources + libjxl_dec_box_sources + libjxl_dec_jpeg_sources + libjxl_enc_sources,
+    compatible_with = DEFAULT_COMPATIBILITY,
+    defines = ["JPEGXL_ENABLE_SKCMS=1"],
+    deps = [
+        ":base",
+        ":includes",
+    ] + libjxl_deps_brotli + libjxl_deps_hwy + libjxl_deps_skcms,
+)
+
+cc_library(
+    name = "jpegxl_private",
+    hdrs = [
+        path
+        for path in libjxl_dec_sources + libjxl_dec_box_sources + libjxl_dec_jpeg_sources + libjxl_enc_sources
+        if path.endswith(".h") and not path.endswith("-inl.h")
+    ],
+    compatible_with = DEFAULT_COMPATIBILITY,
+    deps = [":jpegxl"],
+)
+
+cc_library(
+    name = "jpegxl_threads",
+    srcs = libjxl_threads_sources,
+    hdrs = libjxl_threads_public_headers + [JXL_THREADS_EXPORT_H],
+    compatible_with = DEFAULT_COMPATIBILITY,
+    strip_include_prefix = INCLUDES_DIR,
+    deps = [
+        ":base",
+        ":includes",
+    ],
+)
+
+CODEC_FILES = libjxl_codec_apng_sources + libjxl_codec_exr_sources + libjxl_codec_gif_sources + libjxl_codec_jpegli_sources + libjxl_codec_jpg_sources + libjxl_codec_jxl_sources + libjxl_codec_npy_sources + libjxl_codec_pgx_sources + libjxl_codec_pnm_sources
+
+CODEC_SRCS = [path for path in CODEC_FILES if path.endswith(".cc")]
+
+CODEC_HDRS = [path for path in CODEC_FILES if path.endswith(".h")]
+
+cc_library(
+    name = "jpegli",
+    srcs = libjxl_jpegli_sources,
+    hdrs = [
+        "jpegli/common_internal.h",  # TODO(eustas): should not be here
+    ],
+    compatible_with = DEFAULT_COMPATIBILITY,
+    deps = [
+        ":jpegxl_private",
+    ] + libjxl_deps_hwy + libjxl_deps_jpeg,
+)
+
+# TODO(eustas): build codecs separately?
+cc_library(
+    name = "jpegxl_extras",
+    srcs = libjxl_extras_sources + libjxl_extras_for_tools_sources + CODEC_SRCS,
+    hdrs = CODEC_HDRS,
+    compatible_with = DEFAULT_COMPATIBILITY,
+    defines = [
+        "JPEGXL_ENABLE_APNG=1",
+        "JPEGXL_ENABLE_EXR=1",
+        "JPEGXL_ENABLE_GIF=1",
+        "JPEGXL_ENABLE_JPEG=1",
+        "JPEGXL_ENABLE_JPEGLI=1",
+    ],
+    deps = [
+        ":jpegli",
+        ":jpegxl_private",
+        ":jpegxl_threads",
+        ":jxl_version",
+    ] + libjxl_deps_exr + libjxl_deps_gif + libjxl_deps_jpeg + libjxl_deps_png,
+)
+
+TESTLIB_FILES = libjxl_testlib_files + libjxl_jpegli_testlib_files
+
+cc_library(
+    name = "test_utils",
+    testonly = 1,
+    srcs = [path for path in TESTLIB_FILES if not path.endswith(".h")],
+    hdrs = [path for path in TESTLIB_FILES if path.endswith(".h")],
+    compatible_with = DEFAULT_COMPATIBILITY,
+    defines = [
+        'JPEGXL_ROOT_PACKAGE=\'"' + libjxl_root_package + '"\'',
+    ],
+    deps = [
+        ":jpegli",
+        ":jpegxl_extras",
+        ":jpegxl_private",
+    ] + libjxl_deps_runfiles,
+)
+
+TESTS = [path.partition(".")[0] for path in libjxl_tests + libjxl_jpegli_tests]
+
+[
+    cc_test(
+        name = test,
+        timeout = libjxl_test_timeouts.get(test, "moderate"),
+        srcs = [
+            test + ".cc",
+            "jpegli/testing.h",
+            "jxl/testing.h",
+        ],
+        data = ["//:testdata"],
+        shard_count = libjxl_test_shards.get(test, 1),
+        deps = [
+            ":jpegxl_extras",
+            ":jpegxl_private",
+            ":jpegxl_threads",
+            ":test_utils",
+        ] + libjxl_deps_gtest + libjxl_deps_hwy_test_util + libjxl_deps_hwy_nanobenchmark + libjxl_deps_jxl_box,
+    )
+    for test in TESTS
+]
diff --git a/third_party/jpeg-xl/lib/CMakeLists.txt b/third_party/jpeg-xl/lib/CMakeLists.txt
new file mode 100644
index 0000000000..a340288726
--- /dev/null
+++ b/third_party/jpeg-xl/lib/CMakeLists.txt
@@ -0,0 +1,168 @@
+# Copyright (c) the JPEG XL Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+set(JPEGXL_MAJOR_VERSION 0)
+set(JPEGXL_MINOR_VERSION 9)
+set(JPEGXL_PATCH_VERSION 0)
+set(JPEGXL_LIBRARY_VERSION
+    "${JPEGXL_MAJOR_VERSION}.${JPEGXL_MINOR_VERSION}.${JPEGXL_PATCH_VERSION}")
+
+# This is the library API/ABI compatibility version. Changing this value makes
+# the shared library incompatible with previous version. A program linked
+# against this shared library SOVERSION will not run with an older SOVERSION.
+# It is important to update this value when making incompatible API/ABI changes
+# so that programs that depend on libjxl can update their dependencies. Semantic
+# versioning allows 0.y.z to have incompatible changes in minor versions.
+set(JPEGXL_SO_MINOR_VERSION 9)
+if (JPEGXL_MAJOR_VERSION EQUAL 0)
+  set(JPEGXL_LIBRARY_SOVERSION
+      "${JPEGXL_MAJOR_VERSION}.${JPEGXL_SO_MINOR_VERSION}")
+else()
+  set(JPEGXL_LIBRARY_SOVERSION "${JPEGXL_MAJOR_VERSION}")
+endif()
+
+
+# List of warning and feature flags for our library and tests.
+if (MSVC)
+  set(JPEGXL_INTERNAL_FLAGS
+    # TODO(janwas): add flags
+  )
+else ()
+  set(JPEGXL_INTERNAL_FLAGS
+    # F_FLAGS
+    -fmerge-all-constants
+    -fno-builtin-fwrite
+    -fno-builtin-fread
+
+    # WARN_FLAGS
+    -Wall
+    -Wextra
+    -Wc++11-compat
+    -Warray-bounds
+    -Wformat-security
+    -Wimplicit-fallthrough
+    -Wno-register  # Needed by public headers in lcms
+    -Wno-unused-function
+    -Wno-unused-parameter
+    -Wnon-virtual-dtor
+    -Woverloaded-virtual
+    -Wvla
+  )
+
+  # Warning flags supported by clang.
+  if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+    list(APPEND JPEGXL_INTERNAL_FLAGS
+      -Wdeprecated-increment-bool
+      # TODO(deymo): Add -Wextra-semi once we update third_party/highway.
+      # -Wextra-semi
+      -Wfloat-overflow-conversion
+      -Wfloat-zero-conversion
+      -Wfor-loop-analysis
+      -Wgnu-redeclared-enum
+      -Winfinite-recursion
+      -Wliteral-conversion
+      -Wno-c++98-compat
+      -Wno-unused-command-line-argument
+      -Wprivate-header
+      -Wself-assign
+      -Wstring-conversion
+      -Wtautological-overlap-compare
+      -Wthread-safety-analysis
+      -Wundefined-func-template
+      -Wunreachable-code
+      -Wunused-comparison
+    )
+    if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 5.0)
+      list(APPEND HWY_FLAGS -Wc++2a-extensions)
+    endif()
+  endif()  # Clang
+
+  if (WIN32)
+    list(APPEND JPEGXL_INTERNAL_FLAGS
+      -Wno-cast-align
+      -Wno-double-promotion
+      -Wno-float-equal
+      -Wno-format-nonliteral
+      -Wno-shadow
+      -Wno-sign-conversion
+      -Wno-zero-as-null-pointer-constant
+    )
+
+    if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+      list(APPEND JPEGXL_INTERNAL_FLAGS
+        -Wno-used-but-marked-unused
+        -Wno-unused-template
+        -Wno-unused-member-function
+        -Wno-shadow-field-in-constructor
+        -Wno-language-extension-token
+        -Wno-global-constructors
+        -Wno-c++98-compat-pedantic
+      )
+    endif()  # Clang
+  else()  # WIN32
+    list(APPEND JPEGXL_INTERNAL_FLAGS
+      -fsized-deallocation
+      -fno-exceptions
+
+      # Language flags
+      -fmath-errno
+    )
+
+    if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+      list(APPEND JPEGXL_INTERNAL_FLAGS
+        -fnew-alignment=8
+        -fno-cxx-exceptions
+        -fno-slp-vectorize
+        -fno-vectorize
+
+        -disable-free
+        -disable-llvm-verifier
+      )
+    endif()  # Clang
+  endif()  # WIN32
+endif()  #!MSVC
+
+# strips the -static suffix from all the elements in LIST
+function(strip_static OUTPUT_VAR LIB_LIST)
+  foreach(lib IN LISTS ${LIB_LIST})
+    string(REGEX REPLACE "-static$" "" lib "${lib}")
+    list(APPEND out_list "${lib}")
+  endforeach()
+  set(${OUTPUT_VAR} ${out_list} PARENT_SCOPE)
+endfunction()
+
+# The jxl library definition.
+include(jxl.cmake)
+
+# Other libraries outside the core jxl library.
+if(JPEGXL_ENABLE_TOOLS)
+  include(jxl_extras.cmake)
+endif()
+include(jxl_threads.cmake)
+find_package(JPEG)
+if (JPEG_FOUND AND JPEGXL_ENABLE_JPEGLI)
+  include(jpegli.cmake)
+endif()
+
+# Install all the library headers from the source and the generated ones. There
+# is no distinction on which libraries use which header since it is expected
+# that all developer libraries are available together at build time.
+install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include/jxl
+  DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}")
+install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/include/jxl
+  DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}")
+
+if(BUILD_TESTING)
+  cmake_policy(SET CMP0057 NEW)  # https://gitlab.kitware.com/cmake/cmake/issues/18198
+  include(GoogleTest)
+endif()
+
+# Tests for the jxl library.
+include(jxl_tests.cmake)
+
+if(BUILD_TESTING)
+  # Google benchmark for the jxl library
+  include(jxl_benchmark.cmake)
+endif()
diff --git a/third_party/jpeg-xl/lib/compatibility.cmake b/third_party/jpeg-xl/lib/compatibility.cmake
new file mode 100644
index 0000000000..9d99d29482
--- /dev/null
+++ b/third_party/jpeg-xl/lib/compatibility.cmake
@@ -0,0 +1,30 @@
+# Copyright (c) the JPEG XL Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+function(jxl_discover_tests TESTNAME)
+  if (CMAKE_VERSION VERSION_LESS "3.10.3")
+    gtest_discover_tests(${TESTNAME} TIMEOUT 240)
+  else ()
+    gtest_discover_tests(${TESTNAME} DISCOVERY_TIMEOUT 240)
+  endif ()
+endfunction()
+
+function(jxl_link_libraries DST SRC)
+  if (CMAKE_VERSION VERSION_LESS "3.13.5")
+    target_include_directories(${DST} SYSTEM PUBLIC
+       $<BUILD_INTERFACE:$<TARGET_PROPERTY:${SRC},INTERFACE_SYSTEM_INCLUDE_DIRECTORIES>>
+    )
+    add_dependencies(${DST} ${SRC})
+  else()
+    target_link_libraries(${DST} PUBLIC ${SRC})
+  endif()
+endfunction()
+
+
+if (CMAKE_VERSION VERSION_LESS "3.12.4")
+  set(JXL_HWY_INCLUDE_DIRS "$<BUILD_INTERFACE:$<TARGET_PROPERTY:hwy,INTERFACE_INCLUDE_DIRECTORIES>>")
+else()
+  set(JXL_HWY_INCLUDE_DIRS "$<BUILD_INTERFACE:$<TARGET_PROPERTY:$<IF:$<TARGET_EXISTS:hwy::hwy>,hwy::hwy,hwy>,INTERFACE_INCLUDE_DIRECTORIES>>")
+endif()
diff --git a/third_party/jpeg-xl/lib/extras/LICENSE.apngdis b/third_party/jpeg-xl/lib/extras/LICENSE.apngdis
new file mode 100644
index 0000000000..eb0ba7c07b
--- /dev/null
+++ b/third_party/jpeg-xl/lib/extras/LICENSE.apngdis
@@ -0,0 +1,27 @@
+APNG Disassembler 2.8
+
+Deconstructs APNG files into individual frames.
+
+http://apngdis.sourceforge.net
+
+Copyright (c) 2010-2015 Max Stepin
+maxst at users.sourceforge.net
+
+zlib license
+------------
+
+This software is provided 'as-is', without any express or implied
+warranty.  In no event will the authors be held liable for any damages
+arising from the use of this software.
+
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it
+freely, subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not
+   claim that you wrote the original software. If you use this software
+   in a product, an acknowledgment in the product documentation would be
+   appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be
+   misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
diff --git a/third_party/jpeg-xl/lib/extras/README.md b/third_party/jpeg-xl/lib/extras/README.md
new file mode 100644
index 0000000000..06a9b5ea07
--- /dev/null
+++ b/third_party/jpeg-xl/lib/extras/README.md
@@ -0,0 +1,5 @@
+## JPEG XL "extras"
+
+The files in this directory do not form part of the library or codec and are
+only used by tests or specific internal tools that have access to the internals
+of the library.
diff --git a/third_party/jpeg-xl/lib/extras/codec.cc b/third_party/jpeg-xl/lib/extras/codec.cc
new file mode 100644
index 0000000000..5d3f00706e
--- /dev/null
+++ b/third_party/jpeg-xl/lib/extras/codec.cc
@@ -0,0 +1,191 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/extras/codec.h"
+
+#include <jxl/decode.h>
+#include <jxl/types.h>
+
+#include "lib/extras/packed_image.h"
+#include "lib/jxl/base/padded_bytes.h"
+#include "lib/jxl/base/status.h"
+
+#if JPEGXL_ENABLE_APNG
+#include "lib/extras/enc/apng.h"
+#endif
+#if JPEGXL_ENABLE_JPEG
+#include "lib/extras/enc/jpg.h"
+#endif
+#if JPEGXL_ENABLE_EXR
+#include "lib/extras/enc/exr.h"
+#endif
+
+#include "lib/extras/dec/decode.h"
+#include "lib/extras/enc/pgx.h"
+#include "lib/extras/enc/pnm.h"
+#include "lib/extras/packed_image_convert.h"
+#include "lib/jxl/base/file_io.h"
+#include "lib/jxl/image_bundle.h"
+
+namespace jxl {
+namespace {
+
+// Any valid encoding is larger (ensures codecs can read the first few bytes)
+constexpr size_t kMinBytes = 9;
+
+}  // namespace
+
+Status SetFromBytes(const Span<const uint8_t> bytes,
+                    const extras::ColorHints& color_hints, CodecInOut* io,
+                    ThreadPool* pool, const SizeConstraints* constraints,
+                    extras::Codec* orig_codec) {
+  if (bytes.size() < kMinBytes) return JXL_FAILURE("Too few bytes");
+
+  extras::PackedPixelFile ppf;
+  if (extras::DecodeBytes(bytes, color_hints, &ppf, constraints, orig_codec)) {
+    return ConvertPackedPixelFileToCodecInOut(ppf, pool, io);
+  }
+  return JXL_FAILURE("Codecs failed to decode");
+}
+
+Status SetFromFile(const std::string& pathname,
+                   const extras::ColorHints& color_hints, CodecInOut* io,
+                   ThreadPool* pool, const SizeConstraints* constraints,
+                   extras::Codec* orig_codec) {
+  std::vector<uint8_t> encoded;
+  JXL_RETURN_IF_ERROR(ReadFile(pathname, &encoded));
+  JXL_RETURN_IF_ERROR(SetFromBytes(Span<const uint8_t>(encoded), color_hints,
+                                   io, pool, constraints, orig_codec));
+  return true;
+}
+
+Status Encode(const CodecInOut& io, const extras::Codec codec,
+              const ColorEncoding& c_desired, size_t bits_per_sample,
+              std::vector<uint8_t>* bytes, ThreadPool* pool) {
+  JXL_CHECK(!io.Main().c_current().ICC().empty());
+  JXL_CHECK(!c_desired.ICC().empty());
+  io.CheckMetadata();
+  if (io.Main().IsJPEG()) {
+    JXL_WARNING("Writing JPEG data as pixels");
+  }
+  JxlPixelFormat format = {
+      0,  // num_channels is ignored by the converter
+      bits_per_sample <= 8 ? JXL_TYPE_UINT8 : JXL_TYPE_UINT16, JXL_BIG_ENDIAN,
+      0};
+  const bool floating_point = bits_per_sample > 16;
+  std::unique_ptr<extras::Encoder> encoder;
+  std::ostringstream os;
+  switch (codec) {
+    case extras::Codec::kPNG:
+#if JPEGXL_ENABLE_APNG
+      encoder = extras::GetAPNGEncoder();
+      break;
+#else
+      return JXL_FAILURE("JPEG XL was built without (A)PNG support");
+#endif
+    case extras::Codec::kJPG:
+#if JPEGXL_ENABLE_JPEG
+      format.data_type = JXL_TYPE_UINT8;
+      encoder = extras::GetJPEGEncoder();
+      os << io.jpeg_quality;
+      encoder->SetOption("q", os.str());
+      break;
+#else
+      return JXL_FAILURE("JPEG XL was built without JPEG support");
+#endif
+    case extras::Codec::kPNM:
+      if (io.Main().HasAlpha()) {
+        encoder = extras::GetPAMEncoder();
+      } else if (io.Main().IsGray()) {
+        encoder = extras::GetPGMEncoder();
+      } else if (!floating_point) {
+        encoder = extras::GetPPMEncoder();
+      } else {
+        format.data_type = JXL_TYPE_FLOAT;
+        format.endianness = JXL_LITTLE_ENDIAN;
+        encoder = extras::GetPFMEncoder();
+      }
+      break;
+    case extras::Codec::kPGX:
+      encoder = extras::GetPGXEncoder();
+      break;
+    case extras::Codec::kGIF:
+      return JXL_FAILURE("Encoding to GIF is not implemented");
+    case extras::Codec::kEXR:
+#if JPEGXL_ENABLE_EXR
+      format.data_type = JXL_TYPE_FLOAT;
+      encoder = extras::GetEXREncoder();
+      break;
+#else
+      return JXL_FAILURE("JPEG XL was built without OpenEXR support");
+#endif
+    case extras::Codec::kUnknown:
+      return JXL_FAILURE("Cannot encode using Codec::kUnknown");
+  }
+
+  if (!encoder) {
+    return JXL_FAILURE("Invalid codec.");
+  }
+
+  extras::PackedPixelFile ppf;
+  JXL_RETURN_IF_ERROR(
+      ConvertCodecInOutToPackedPixelFile(io, format, c_desired, pool, &ppf));
+  ppf.info.bits_per_sample = bits_per_sample;
+  if (format.data_type == JXL_TYPE_FLOAT) {
+    ppf.info.bits_per_sample = 32;
+    ppf.info.exponent_bits_per_sample = 8;
+  }
+  extras::EncodedImage encoded_image;
+  JXL_RETURN_IF_ERROR(encoder->Encode(ppf, &encoded_image, pool));
+  JXL_ASSERT(encoded_image.bitstreams.size() == 1);
+  *bytes = encoded_image.bitstreams[0];
+
+  return true;
+}
+
+Status EncodeToFile(const CodecInOut& io, const ColorEncoding& c_desired,
+                    size_t bits_per_sample, const std::string& pathname,
+                    ThreadPool* pool) {
+  const std::string extension = Extension(pathname);
+  const extras::Codec codec =
+      extras::CodecFromExtension(extension, &bits_per_sample);
+
+  // Warn about incorrect usage of PGM/PGX/PPM - only the latter supports
+  // color, but CodecFromExtension lumps them all together.
+  if (codec == extras::Codec::kPNM && extension != ".pfm") {
+    if (io.Main().HasAlpha() && extension != ".pam") {
+      JXL_WARNING(
+          "For images with alpha, the filename should end with .pam.\n");
+    } else if (!io.Main().IsGray() && extension == ".pgm") {
+      JXL_WARNING("For color images, the filename should end with .ppm.\n");
+    } else if (io.Main().IsGray() && extension == ".ppm") {
+      JXL_WARNING(
+          "For grayscale images, the filename should not end with .ppm.\n");
+    }
+    if (bits_per_sample > 16) {
+      JXL_WARNING("PPM only supports up to 16 bits per sample");
+      bits_per_sample = 16;
+    }
+  } else if (codec == extras::Codec::kPGX && !io.Main().IsGray()) {
+    JXL_WARNING("Storing color image to PGX - use .ppm extension instead.\n");
+  }
+  if (bits_per_sample > 16 && codec == extras::Codec::kPNG) {
+    JXL_WARNING("PNG only supports up to 16 bits per sample");
+    bits_per_sample = 16;
+  }
+
+  std::vector<uint8_t> encoded;
+  return Encode(io, codec, c_desired, bits_per_sample, &encoded, pool) &&
+         WriteFile(encoded, pathname);
+}
+
+Status EncodeToFile(const CodecInOut& io, const std::string& pathname,
+                    ThreadPool* pool) {
+  // TODO(lode): need to take the floating_point_sample field into account
+  return EncodeToFile(io, io.metadata.m.color_encoding,
+                      io.metadata.m.bit_depth.bits_per_sample, pathname, pool);
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/extras/codec.h b/third_party/jpeg-xl/lib/extras/codec.h
new file mode 100644
index 0000000000..80a42f926c
--- /dev/null
+++ b/third_party/jpeg-xl/lib/extras/codec.h
@@ -0,0 +1,70 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_EXTRAS_CODEC_H_
+#define LIB_EXTRAS_CODEC_H_
+
+// Facade for image encoders/decoders (PNG, PNM, ...).
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <string>
+
+#include "lib/extras/dec/color_hints.h"
+#include "lib/extras/dec/decode.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/padded_bytes.h"
+#include "lib/jxl/base/span.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/codec_in_out.h"
+#include "lib/jxl/color_encoding_internal.h"
+#include "lib/jxl/field_encodings.h"  // MakeBit
+
+namespace jxl {
+
+struct SizeConstraints;
+
+// Decodes "bytes" and sets io->metadata.m.
+// color_space_hint may specify the color space, otherwise, defaults to sRGB.
+Status SetFromBytes(Span<const uint8_t> bytes,
+                    const extras::ColorHints& color_hints, CodecInOut* io,
+                    ThreadPool* pool = nullptr,
+                    const SizeConstraints* constraints = nullptr,
+                    extras::Codec* orig_codec = nullptr);
+// Helper function to use no color_space_hint.
+JXL_INLINE Status SetFromBytes(const Span<const uint8_t> bytes, CodecInOut* io,
+                               ThreadPool* pool = nullptr,
+                               const SizeConstraints* constraints = nullptr,
+                               extras::Codec* orig_codec = nullptr) {
+  return SetFromBytes(bytes, extras::ColorHints(), io, pool, constraints,
+                      orig_codec);
+}
+
+// Reads from file and calls SetFromBytes.
+Status SetFromFile(const std::string& pathname,
+                   const extras::ColorHints& color_hints, CodecInOut* io,
+                   ThreadPool* pool = nullptr,
+                   const SizeConstraints* constraints = nullptr,
+                   extras::Codec* orig_codec = nullptr);
+
+// Replaces "bytes" with an encoding of pixels transformed from c_current
+// color space to c_desired.
+Status Encode(const CodecInOut& io, extras::Codec codec,
+              const ColorEncoding& c_desired, size_t bits_per_sample,
+              std::vector<uint8_t>* bytes, ThreadPool* pool = nullptr);
+
+// Deduces codec, calls Encode and writes to file.
+Status EncodeToFile(const CodecInOut& io, const ColorEncoding& c_desired,
+                    size_t bits_per_sample, const std::string& pathname,
+                    ThreadPool* pool = nullptr);
+// Same, but defaults to metadata.original color_encoding and bits_per_sample.
+Status EncodeToFile(const CodecInOut& io, const std::string& pathname,
+                    ThreadPool* pool = nullptr);
+
+}  // namespace jxl
+
+#endif  // LIB_EXTRAS_CODEC_H_
diff --git a/third_party/jpeg-xl/lib/extras/codec_test.cc b/third_party/jpeg-xl/lib/extras/codec_test.cc
new file mode 100644
index 0000000000..66a8563639
--- /dev/null
+++ b/third_party/jpeg-xl/lib/extras/codec_test.cc
@@ -0,0 +1,645 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/extras/codec.h"
+
+#include <stddef.h>
+#include <stdio.h>
+
+#include <algorithm>
+#include <sstream>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "lib/extras/dec/decode.h"
+#include "lib/extras/dec/pgx.h"
+#include "lib/extras/dec/pnm.h"
+#include "lib/extras/enc/encode.h"
+#include "lib/extras/packed_image_convert.h"
+#include "lib/jxl/base/random.h"
+#include "lib/jxl/color_management.h"
+#include "lib/jxl/enc_butteraugli_comparator.h"
+#include "lib/jxl/enc_color_management.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/image_bundle.h"
+#include "lib/jxl/image_test_utils.h"
+#include "lib/jxl/test_utils.h"
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+
+using test::ThreadPoolForTests;
+
+namespace extras {
+namespace {
+
+using ::testing::AllOf;
+using ::testing::Contains;
+using ::testing::Field;
+using ::testing::IsEmpty;
+using ::testing::NotNull;
+using ::testing::SizeIs;
+
+std::string ExtensionFromCodec(Codec codec, const bool is_gray,
+                               const bool has_alpha,
+                               const size_t bits_per_sample) {
+  switch (codec) {
+    case Codec::kJPG:
+      return ".jpg";
+    case Codec::kPGX:
+      return ".pgx";
+    case Codec::kPNG:
+      return ".png";
+    case Codec::kPNM:
+      if (bits_per_sample == 32) return ".pfm";
+      if (has_alpha) return ".pam";
+      return is_gray ? ".pgm" : ".ppm";
+    case Codec::kGIF:
+      return ".gif";
+    case Codec::kEXR:
+      return ".exr";
+    case Codec::kUnknown:
+      return std::string();
+  }
+  JXL_UNREACHABLE;
+  return std::string();
+}
+
+void VerifySameImage(const PackedImage& im0, size_t bits_per_sample0,
+                     const PackedImage& im1, size_t bits_per_sample1,
+                     bool lossless = true) {
+  ASSERT_EQ(im0.xsize, im1.xsize);
+  ASSERT_EQ(im0.ysize, im1.ysize);
+  ASSERT_EQ(im0.format.num_channels, im1.format.num_channels);
+  auto get_factor = [](JxlPixelFormat f, size_t bits) -> double {
+    return 1.0 / ((1u << std::min(test::GetPrecision(f.data_type), bits)) - 1);
+  };
+  double factor0 = get_factor(im0.format, bits_per_sample0);
+  double factor1 = get_factor(im1.format, bits_per_sample1);
+  auto pixels0 = static_cast<const uint8_t*>(im0.pixels());
+  auto pixels1 = static_cast<const uint8_t*>(im1.pixels());
+  auto rgba0 =
+      test::ConvertToRGBA32(pixels0, im0.xsize, im0.ysize, im0.format, factor0);
+  auto rgba1 =
+      test::ConvertToRGBA32(pixels1, im1.xsize, im1.ysize, im1.format, factor1);
+  double tolerance =
+      lossless ? 0.5 * std::min(factor0, factor1) : 3.0f / 255.0f;
+  if (bits_per_sample0 == 32 || bits_per_sample1 == 32) {
+    tolerance = 0.5 * std::max(factor0, factor1);
+  }
+  for (size_t y = 0; y < im0.ysize; ++y) {
+    for (size_t x = 0; x < im0.xsize; ++x) {
+      for (size_t c = 0; c < im0.format.num_channels; ++c) {
+        size_t ix = (y * im0.xsize + x) * 4 + c;
+        double val0 = rgba0[ix];
+        double val1 = rgba1[ix];
+        ASSERT_NEAR(val1, val0, tolerance)
+            << "y = " << y << " x = " << x << " c = " << c;
+      }
+    }
+  }
+}
+
+JxlColorEncoding CreateTestColorEncoding(bool is_gray) {
+  JxlColorEncoding c;
+  c.color_space = is_gray ? JXL_COLOR_SPACE_GRAY : JXL_COLOR_SPACE_RGB;
+  c.white_point = JXL_WHITE_POINT_D65;
+  c.primaries = JXL_PRIMARIES_P3;
+  c.rendering_intent = JXL_RENDERING_INTENT_RELATIVE;
+  c.transfer_function = JXL_TRANSFER_FUNCTION_LINEAR;
+  // Roundtrip through internal color encoding to fill in primaries and white
+  // point CIE xy coordinates.
+  ColorEncoding c_internal;
+  JXL_CHECK(ConvertExternalToInternalColorEncoding(c, &c_internal));
+  ConvertInternalToExternalColorEncoding(c_internal, &c);
+  return c;
+}
+
+std::vector<uint8_t> GenerateICC(JxlColorEncoding color_encoding) {
+  ColorEncoding c;
+  JXL_CHECK(ConvertExternalToInternalColorEncoding(color_encoding, &c));
+  JXL_CHECK(c.CreateICC());
+  PaddedBytes icc = c.ICC();
+  return std::vector<uint8_t>(icc.begin(), icc.end());
+}
+
+void StoreRandomValue(uint8_t* out, Rng* rng, JxlPixelFormat format,
+                      size_t bits_per_sample) {
+  uint64_t max_val = (1ull << bits_per_sample) - 1;
+  if (format.data_type == JXL_TYPE_UINT8) {
+    *out = rng->UniformU(0, max_val);
+  } else if (format.data_type == JXL_TYPE_UINT16) {
+    uint32_t val = rng->UniformU(0, max_val);
+    if (format.endianness == JXL_BIG_ENDIAN) {
+      StoreBE16(val, out);
+    } else {
+      StoreLE16(val, out);
+    }
+  } else {
+    ASSERT_EQ(format.data_type, JXL_TYPE_FLOAT);
+    float val = rng->UniformF(0.0, 1.0);
+    uint32_t uval;
+    memcpy(&uval, &val, 4);
+    if (format.endianness == JXL_BIG_ENDIAN) {
+      StoreBE32(uval, out);
+    } else {
+      StoreLE32(uval, out);
+    }
+  }
+}
+
+void FillPackedImage(size_t bits_per_sample, PackedImage* image) {
+  JxlPixelFormat format = image->format;
+  size_t bytes_per_channel = PackedImage::BitsPerChannel(format.data_type) / 8;
+  uint8_t* out = static_cast<uint8_t*>(image->pixels());
+  size_t stride = image->xsize * format.num_channels * bytes_per_channel;
+  ASSERT_EQ(image->pixels_size, image->ysize * stride);
+  Rng rng(129);
+  for (size_t y = 0; y < image->ysize; ++y) {
+    for (size_t x = 0; x < image->xsize; ++x) {
+      for (size_t c = 0; c < format.num_channels; ++c) {
+        StoreRandomValue(out, &rng, format, bits_per_sample);
+        out += bytes_per_channel;
+      }
+    }
+  }
+}
+
+struct TestImageParams {
+  Codec codec;
+  size_t xsize;
+  size_t ysize;
+  size_t bits_per_sample;
+  bool is_gray;
+  bool add_alpha;
+  bool big_endian;
+  bool add_extra_channels;
+
+  bool ShouldTestRoundtrip() const {
+    if (codec == Codec::kPNG) {
+      return bits_per_sample <= 16;
+    } else if (codec == Codec::kPNM) {
+      // TODO(szabadka) Make PNM encoder endianness-aware.
+      return ((bits_per_sample <= 16 && big_endian) ||
+              (bits_per_sample == 32 && !add_alpha && !big_endian));
+    } else if (codec == Codec::kPGX) {
+      return ((bits_per_sample == 8 || bits_per_sample == 16) && is_gray &&
+              !add_alpha);
+    } else if (codec == Codec::kEXR) {
+#if defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER) || \
+    defined(THREAD_SANITIZER)
+      // OpenEXR 2.3 has a memory leak in IlmThread_2_3::ThreadPool
+      return false;
+#else
+      return bits_per_sample == 32 && !is_gray;
+#endif
+    } else if (codec == Codec::kJPG) {
+      return bits_per_sample == 8 && !add_alpha;
+    } else {
+      return false;
+    }
+  }
+
+  JxlPixelFormat PixelFormat() const {
+    JxlPixelFormat format;
+    format.num_channels = (is_gray ? 1 : 3) + (add_alpha ? 1 : 0);
+    format.data_type = (bits_per_sample == 32 ? JXL_TYPE_FLOAT
+                        : bits_per_sample > 8 ? JXL_TYPE_UINT16
+                                              : JXL_TYPE_UINT8);
+    format.endianness = big_endian ? JXL_BIG_ENDIAN : JXL_LITTLE_ENDIAN;
+    format.align = 0;
+    return format;
+  }
+
+  std::string DebugString() const {
+    std::ostringstream os;
+    os << "bps:" << bits_per_sample << " gr:" << is_gray << " al:" << add_alpha
+       << " be: " << big_endian << " ec: " << add_extra_channels;
+    return os.str();
+  }
+};
+
+void CreateTestImage(const TestImageParams& params, PackedPixelFile* ppf) {
+  ppf->info.xsize = params.xsize;
+  ppf->info.ysize = params.ysize;
+  ppf->info.bits_per_sample = params.bits_per_sample;
+  ppf->info.exponent_bits_per_sample = params.bits_per_sample == 32 ? 8 : 0;
+  ppf->info.num_color_channels = params.is_gray ? 1 : 3;
+  ppf->info.alpha_bits = params.add_alpha ? params.bits_per_sample : 0;
+  ppf->info.alpha_premultiplied = (params.codec == Codec::kEXR);
+
+  JxlColorEncoding color_encoding = CreateTestColorEncoding(params.is_gray);
+  ppf->icc = GenerateICC(color_encoding);
+  ppf->color_encoding = color_encoding;
+
+  PackedFrame frame(params.xsize, params.ysize, params.PixelFormat());
+  FillPackedImage(params.bits_per_sample, &frame.color);
+  if (params.add_extra_channels) {
+    for (size_t i = 0; i < 7; ++i) {
+      JxlPixelFormat ec_format = params.PixelFormat();
+      ec_format.num_channels = 1;
+      PackedImage ec(params.xsize, params.ysize, ec_format);
+      FillPackedImage(params.bits_per_sample, &ec);
+      frame.extra_channels.emplace_back(std::move(ec));
+      PackedExtraChannel pec;
+      pec.ec_info.bits_per_sample = params.bits_per_sample;
+      pec.ec_info.type = static_cast<JxlExtraChannelType>(i);
+      ppf->extra_channels_info.emplace_back(std::move(pec));
+    }
+  }
+  ppf->frames.emplace_back(std::move(frame));
+}
+
+// Ensures reading a newly written file leads to the same image pixels.
+void TestRoundTrip(const TestImageParams& params, ThreadPool* pool) {
+  if (!params.ShouldTestRoundtrip()) return;
+
+  std::string extension = ExtensionFromCodec(
+      params.codec, params.is_gray, params.add_alpha, params.bits_per_sample);
+  printf("Codec %s %s\n", extension.c_str(), params.DebugString().c_str());
+
+  PackedPixelFile ppf_in;
+  CreateTestImage(params, &ppf_in);
+
+  EncodedImage encoded;
+  auto encoder = Encoder::FromExtension(extension);
+  ASSERT_TRUE(encoder.get());
+  ASSERT_TRUE(encoder->Encode(ppf_in, &encoded, pool));
+  ASSERT_EQ(encoded.bitstreams.size(), 1);
+
+  PackedPixelFile ppf_out;
+  ColorHints color_hints;
+  if (params.codec == Codec::kPNM || params.codec == Codec::kPGX) {
+    color_hints.Add("color_space",
+                    params.is_gray ? "Gra_D65_Rel_SRG" : "RGB_D65_SRG_Rel_SRG");
+  }
+  ASSERT_TRUE(DecodeBytes(Span<const uint8_t>(encoded.bitstreams[0]),
+                          color_hints, &ppf_out));
+  if (params.codec == Codec::kPNG && ppf_out.icc.empty()) {
+    // Decoding a PNG may drop the ICC profile if there's a valid cICP chunk.
+    // Rendering intent is not preserved in this case.
+    EXPECT_EQ(ppf_in.color_encoding.color_space,
+              ppf_out.color_encoding.color_space);
+    EXPECT_EQ(ppf_in.color_encoding.white_point,
+              ppf_out.color_encoding.white_point);
+    if (ppf_in.color_encoding.color_space != JXL_COLOR_SPACE_GRAY) {
+      EXPECT_EQ(ppf_in.color_encoding.primaries,
+                ppf_out.color_encoding.primaries);
+    }
+    EXPECT_EQ(ppf_in.color_encoding.transfer_function,
+              ppf_out.color_encoding.transfer_function);
+    EXPECT_EQ(ppf_out.color_encoding.rendering_intent,
+              JXL_RENDERING_INTENT_RELATIVE);
+  } else if (params.codec != Codec::kPNM && params.codec != Codec::kPGX &&
+             params.codec != Codec::kEXR) {
+    EXPECT_EQ(ppf_in.icc, ppf_out.icc);
+  }
+
+  ASSERT_EQ(ppf_out.frames.size(), 1);
+  const auto& frame_in = ppf_in.frames[0];
+  const auto& frame_out = ppf_out.frames[0];
+  VerifySameImage(frame_in.color, ppf_in.info.bits_per_sample, frame_out.color,
+                  ppf_out.info.bits_per_sample,
+                  /*lossless=*/params.codec != Codec::kJPG);
+  ASSERT_EQ(frame_in.extra_channels.size(), frame_out.extra_channels.size());
+  ASSERT_EQ(ppf_out.extra_channels_info.size(),
+            frame_out.extra_channels.size());
+  for (size_t i = 0; i < frame_in.extra_channels.size(); ++i) {
+    VerifySameImage(frame_in.extra_channels[i], ppf_in.info.bits_per_sample,
+                    frame_out.extra_channels[i], ppf_out.info.bits_per_sample,
+                    /*lossless=*/true);
+    EXPECT_EQ(ppf_out.extra_channels_info[i].ec_info.type,
+              ppf_in.extra_channels_info[i].ec_info.type);
+  }
+}
+
+TEST(CodecTest, TestRoundTrip) {
+  ThreadPoolForTests pool(12);
+
+  TestImageParams params;
+  params.xsize = 7;
+  params.ysize = 4;
+
+  for (Codec codec : AvailableCodecs()) {
+    for (int bits_per_sample : {4, 8, 10, 12, 16, 32}) {
+      for (bool is_gray : {false, true}) {
+        for (bool add_alpha : {false, true}) {
+          for (bool big_endian : {false, true}) {
+            params.codec = codec;
+            params.bits_per_sample = static_cast<size_t>(bits_per_sample);
+            params.is_gray = is_gray;
+            params.add_alpha = add_alpha;
+            params.big_endian = big_endian;
+            params.add_extra_channels = false;
+            TestRoundTrip(params, &pool);
+            if (codec == Codec::kPNM && add_alpha) {
+              params.add_extra_channels = true;
+              TestRoundTrip(params, &pool);
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+TEST(CodecTest, LosslessPNMRoundtrip) {
+  ThreadPoolForTests pool(12);
+
+  static const char* kChannels[] = {"", "g", "ga", "rgb", "rgba"};
+  static const char* kExtension[] = {"", ".pgm", ".pam", ".ppm", ".pam"};
+  for (size_t bit_depth = 1; bit_depth <= 16; ++bit_depth) {
+    for (size_t channels = 1; channels <= 4; ++channels) {
+      if (bit_depth == 1 && (channels == 2 || channels == 4)) continue;
+      std::string extension(kExtension[channels]);
+      std::string filename = "jxl/flower/flower_small." +
+                             std::string(kChannels[channels]) + ".depth" +
+                             std::to_string(bit_depth) + extension;
+      const PaddedBytes orig = jxl::test::ReadTestData(filename);
+
+      PackedPixelFile ppf;
+      ColorHints color_hints;
+      color_hints.Add("color_space",
+                      channels < 3 ? "Gra_D65_Rel_SRG" : "RGB_D65_SRG_Rel_SRG");
+      ASSERT_TRUE(DecodeBytes(Span<const uint8_t>(orig.data(), orig.size()),
+                              color_hints, &ppf));
+
+      EncodedImage encoded;
+      auto encoder = Encoder::FromExtension(extension);
+      ASSERT_TRUE(encoder.get());
+      ASSERT_TRUE(encoder->Encode(ppf, &encoded, &pool));
+      ASSERT_EQ(encoded.bitstreams.size(), 1);
+      ASSERT_EQ(orig.size(), encoded.bitstreams[0].size());
+      EXPECT_EQ(0,
+                memcmp(orig.data(), encoded.bitstreams[0].data(), orig.size()));
+    }
+  }
+}
+
+void DecodeRoundtrip(const std::string& pathname, ThreadPool* pool,
+                     CodecInOut& io,
+                     const ColorHints& color_hints = ColorHints()) {
+  const PaddedBytes orig = jxl::test::ReadTestData(pathname);
+  JXL_CHECK(SetFromBytes(Span<const uint8_t>(orig), color_hints, &io, pool));
+  const ImageBundle& ib1 = io.Main();
+
+  // Encode/Decode again to make sure Encode carries through all metadata.
+  std::vector<uint8_t> encoded;
+  JXL_CHECK(Encode(io, Codec::kPNG, io.metadata.m.color_encoding,
+                   io.metadata.m.bit_depth.bits_per_sample, &encoded, pool));
+
+  CodecInOut io2;
+  JXL_CHECK(
+      SetFromBytes(Span<const uint8_t>(encoded), color_hints, &io2, pool));
+  const ImageBundle& ib2 = io2.Main();
+  EXPECT_EQ(Description(ib1.metadata()->color_encoding),
+            Description(ib2.metadata()->color_encoding));
+  EXPECT_EQ(Description(ib1.c_current()), Description(ib2.c_current()));
+
+  size_t bits_per_sample = io2.metadata.m.bit_depth.bits_per_sample;
+
+  // "Same" pixels?
+  double max_l1 = bits_per_sample <= 12 ? 1.3 : 2E-3;
+  double max_rel = bits_per_sample <= 12 ? 6E-3 : 1E-4;
+  if (ib1.metadata()->color_encoding.IsGray()) {
+    max_rel *= 2.0;
+  } else if (ib1.metadata()->color_encoding.primaries != Primaries::kSRGB) {
+    // Need more tolerance for large gamuts (anything but sRGB)
+    max_l1 *= 1.5;
+    max_rel *= 3.0;
+  }
+  JXL_ASSERT_OK(
+      VerifyRelativeError(ib1.color(), ib2.color(), max_l1, max_rel, _));
+
+  // Simulate the encoder removing profile and decoder restoring it.
+  if (!ib2.metadata()->color_encoding.WantICC()) {
+    io2.metadata.m.color_encoding.InternalRemoveICC();
+    EXPECT_TRUE(io2.metadata.m.color_encoding.CreateICC());
+  }
+}
+
+#if 0
+TEST(CodecTest, TestMetadataSRGB) {
+  ThreadPoolForTests pool(12);
+
+  const char* paths[] = {"external/raw.pixls/DJI-FC6310-16bit_srgb8_v4_krita.png",
+                         "external/raw.pixls/Google-Pixel2XL-16bit_srgb8_v4_krita.png",
+                         "external/raw.pixls/HUAWEI-EVA-L09-16bit_srgb8_dt.png",
+                         "external/raw.pixls/Nikon-D300-12bit_srgb8_dt.png",
+                         "external/raw.pixls/Sony-DSC-RX1RM2-14bit_srgb8_v4_krita.png"};
+  for (const char* relative_pathname : paths) {
+    CodecInOut io;
+    DecodeRoundtrip(relative_pathname, Codec::kPNG, &pool, io);
+    EXPECT_EQ(8, io.metadata.m.bit_depth.bits_per_sample);
+    EXPECT_FALSE(io.metadata.m.bit_depth.floating_point_sample);
+    EXPECT_EQ(0, io.metadata.m.bit_depth.exponent_bits_per_sample);
+
+    EXPECT_EQ(64, io.xsize());
+    EXPECT_EQ(64, io.ysize());
+    EXPECT_FALSE(io.metadata.m.HasAlpha());
+
+    const ColorEncoding& c_original = io.metadata.m.color_encoding;
+    EXPECT_FALSE(c_original.ICC().empty());
+    EXPECT_EQ(ColorSpace::kRGB, c_original.GetColorSpace());
+    EXPECT_EQ(WhitePoint::kD65, c_original.white_point);
+    EXPECT_EQ(Primaries::kSRGB, c_original.primaries);
+    EXPECT_TRUE(c_original.tf.IsSRGB());
+  }
+}
+
+TEST(CodecTest, TestMetadataLinear) {
+  ThreadPoolForTests pool(12);
+
+  const char* paths[3] = {
+      "external/raw.pixls/Google-Pixel2XL-16bit_acescg_g1_v4_krita.png",
+      "external/raw.pixls/HUAWEI-EVA-L09-16bit_709_g1_dt.png",
+      "external/raw.pixls/Nikon-D300-12bit_2020_g1_dt.png",
+  };
+  const WhitePoint white_points[3] = {WhitePoint::kCustom, WhitePoint::kD65,
+                                      WhitePoint::kD65};
+  const Primaries primaries[3] = {Primaries::kCustom, Primaries::kSRGB,
+                                  Primaries::k2100};
+
+  for (size_t i = 0; i < 3; ++i) {
+    CodecInOut io;
+    DecodeRoundtrip(paths[i], Codec::kPNG, &pool, io);
+    EXPECT_EQ(16, io.metadata.m.bit_depth.bits_per_sample);
+    EXPECT_FALSE(io.metadata.m.bit_depth.floating_point_sample);
+    EXPECT_EQ(0, io.metadata.m.bit_depth.exponent_bits_per_sample);
+
+    EXPECT_EQ(64, io.xsize());
+    EXPECT_EQ(64, io.ysize());
+    EXPECT_FALSE(io.metadata.m.HasAlpha());
+
+    const ColorEncoding& c_original = io.metadata.m.color_encoding;
+    EXPECT_FALSE(c_original.ICC().empty());
+    EXPECT_EQ(ColorSpace::kRGB, c_original.GetColorSpace());
+    EXPECT_EQ(white_points[i], c_original.white_point);
+    EXPECT_EQ(primaries[i], c_original.primaries);
+    EXPECT_TRUE(c_original.tf.IsLinear());
+  }
+}
+
+TEST(CodecTest, TestMetadataICC) {
+  ThreadPoolForTests pool(12);
+
+  const char* paths[] = {
+      "external/raw.pixls/DJI-FC6310-16bit_709_v4_krita.png",
+      "external/raw.pixls/Sony-DSC-RX1RM2-14bit_709_v4_krita.png",
+  };
+  for (const char* relative_pathname : paths) {
+    CodecInOut io;
+    DecodeRoundtrip(relative_pathname, Codec::kPNG, &pool, io);
+    EXPECT_GE(16, io.metadata.m.bit_depth.bits_per_sample);
+    EXPECT_LE(14, io.metadata.m.bit_depth.bits_per_sample);
+
+    EXPECT_EQ(64, io.xsize());
+    EXPECT_EQ(64, io.ysize());
+    EXPECT_FALSE(io.metadata.m.HasAlpha());
+
+    const ColorEncoding& c_original = io.metadata.m.color_encoding;
+    EXPECT_FALSE(c_original.ICC().empty());
+    EXPECT_EQ(RenderingIntent::kPerceptual, c_original.rendering_intent);
+    EXPECT_EQ(ColorSpace::kRGB, c_original.GetColorSpace());
+    EXPECT_EQ(WhitePoint::kD65, c_original.white_point);
+    EXPECT_EQ(Primaries::kSRGB, c_original.primaries);
+    EXPECT_EQ(TransferFunction::k709, c_original.tf.GetTransferFunction());
+  }
+}
+
+TEST(CodecTest, Testexternal/pngsuite) {
+  ThreadPoolForTests pool(12);
+
+  // Ensure we can load PNG with text, japanese UTF-8, compressed text.
+  CodecInOut tmp1;
+  DecodeRoundtrip("external/pngsuite/ct1n0g04.png", Codec::kPNG, &pool, tmp1);
+  CodecInOut tmp2;
+  DecodeRoundtrip("external/pngsuite/ctjn0g04.png", Codec::kPNG, &pool, tmp2);
+  CodecInOut tmp3;
+  DecodeRoundtrip("external/pngsuite/ctzn0g04.png", Codec::kPNG, &pool, tmp3);
+
+  // Extract gAMA
+  CodecInOut b1;
+  DecodeRoundtrip("external/pngsuite/g10n3p04.png", Codec::kPNG, &pool, b1);
+  EXPECT_TRUE(b1.metadata.color_encoding.tf.IsLinear());
+
+  // Extract cHRM
+  CodecInOut b_p;
+  DecodeRoundtrip("external/pngsuite/ccwn2c08.png", Codec::kPNG, &pool, b_p);
+  EXPECT_EQ(Primaries::kSRGB, b_p.metadata.color_encoding.primaries);
+  EXPECT_EQ(WhitePoint::kD65, b_p.metadata.color_encoding.white_point);
+
+  // Extract EXIF from (new-style) dedicated chunk
+  CodecInOut b_exif;
+  DecodeRoundtrip("external/pngsuite/exif2c08.png", Codec::kPNG, &pool, b_exif);
+  EXPECT_EQ(978, b_exif.blobs.exif.size());
+}
+#endif
+
+void VerifyWideGamutMetadata(const std::string& relative_pathname,
+                             const Primaries primaries, ThreadPool* pool) {
+  CodecInOut io;
+  DecodeRoundtrip(relative_pathname, pool, io);
+
+  EXPECT_EQ(8u, io.metadata.m.bit_depth.bits_per_sample);
+  EXPECT_FALSE(io.metadata.m.bit_depth.floating_point_sample);
+  EXPECT_EQ(0u, io.metadata.m.bit_depth.exponent_bits_per_sample);
+
+  const ColorEncoding& c_original = io.metadata.m.color_encoding;
+  EXPECT_FALSE(c_original.ICC().empty());
+  EXPECT_EQ(RenderingIntent::kAbsolute, c_original.rendering_intent);
+  EXPECT_EQ(ColorSpace::kRGB, c_original.GetColorSpace());
+  EXPECT_EQ(WhitePoint::kD65, c_original.white_point);
+  EXPECT_EQ(primaries, c_original.primaries);
+}
+
+TEST(CodecTest, TestWideGamut) {
+  ThreadPoolForTests pool(12);
+  // VerifyWideGamutMetadata("external/wide-gamut-tests/P3-sRGB-color-bars.png",
+  //                        Primaries::kP3, &pool);
+  VerifyWideGamutMetadata("external/wide-gamut-tests/P3-sRGB-color-ring.png",
+                          Primaries::kP3, &pool);
+  // VerifyWideGamutMetadata("external/wide-gamut-tests/R2020-sRGB-color-bars.png",
+  //                        Primaries::k2100, &pool);
+  // VerifyWideGamutMetadata("external/wide-gamut-tests/R2020-sRGB-color-ring.png",
+  //                        Primaries::k2100, &pool);
+}
+
+TEST(CodecTest, TestPNM) { TestCodecPNM(); }
+
+TEST(CodecTest, FormatNegotiation) {
+  const std::vector<JxlPixelFormat> accepted_formats = {
+      {/*num_channels=*/4,
+       /*data_type=*/JXL_TYPE_UINT16,
+       /*endianness=*/JXL_NATIVE_ENDIAN,
+       /*align=*/0},
+      {/*num_channels=*/3,
+       /*data_type=*/JXL_TYPE_UINT8,
+       /*endianness=*/JXL_NATIVE_ENDIAN,
+       /*align=*/0},
+      {/*num_channels=*/3,
+       /*data_type=*/JXL_TYPE_UINT16,
+       /*endianness=*/JXL_NATIVE_ENDIAN,
+       /*align=*/0},
+      {/*num_channels=*/1,
+       /*data_type=*/JXL_TYPE_UINT8,
+       /*endianness=*/JXL_NATIVE_ENDIAN,
+       /*align=*/0},
+  };
+
+  JxlBasicInfo info;
+  JxlEncoderInitBasicInfo(&info);
+  info.bits_per_sample = 12;
+  info.num_color_channels = 2;
+
+  JxlPixelFormat format;
+  EXPECT_FALSE(SelectFormat(accepted_formats, info, &format));
+
+  info.num_color_channels = 3;
+  ASSERT_TRUE(SelectFormat(accepted_formats, info, &format));
+  EXPECT_EQ(format.num_channels, info.num_color_channels);
+  // 16 is the smallest accepted format that can accommodate the 12-bit data.
+  EXPECT_EQ(format.data_type, JXL_TYPE_UINT16);
+}
+
+TEST(CodecTest, EncodeToPNG) {
+  ThreadPool* const pool = nullptr;
+
+  std::unique_ptr<Encoder> png_encoder = Encoder::FromExtension(".png");
+  ASSERT_THAT(png_encoder, NotNull());
+
+  const PaddedBytes original_png = jxl::test::ReadTestData(
+      "external/wesaturate/500px/tmshre_riaphotographs_srgb8.png");
+  PackedPixelFile ppf;
+  ASSERT_TRUE(extras::DecodeBytes(Span<const uint8_t>(original_png),
+                                  ColorHints(), &ppf));
+
+  const JxlPixelFormat& format = ppf.frames.front().color.format;
+  ASSERT_THAT(
+      png_encoder->AcceptedFormats(),
+      Contains(AllOf(Field(&JxlPixelFormat::num_channels, format.num_channels),
+                     Field(&JxlPixelFormat::data_type, format.data_type),
+                     Field(&JxlPixelFormat::endianness, format.endianness))));
+  EncodedImage encoded_png;
+  ASSERT_TRUE(png_encoder->Encode(ppf, &encoded_png, pool));
+  EXPECT_THAT(encoded_png.icc, IsEmpty());
+  ASSERT_THAT(encoded_png.bitstreams, SizeIs(1));
+
+  PackedPixelFile decoded_ppf;
+  ASSERT_TRUE(
+      extras::DecodeBytes(Span<const uint8_t>(encoded_png.bitstreams.front()),
+                          ColorHints(), &decoded_ppf));
+
+  ASSERT_EQ(decoded_ppf.info.bits_per_sample, ppf.info.bits_per_sample);
+  ASSERT_EQ(decoded_ppf.frames.size(), 1);
+  VerifySameImage(ppf.frames[0].color, ppf.info.bits_per_sample,
+                  decoded_ppf.frames[0].color,
+                  decoded_ppf.info.bits_per_sample);
+}
+
+}  // namespace
+}  // namespace extras
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/extras/dec/apng.cc b/third_party/jpeg-xl/lib/extras/dec/apng.cc
new file mode 100644
index 0000000000..c16fb5c81f
--- /dev/null
+++ b/third_party/jpeg-xl/lib/extras/dec/apng.cc
@@ -0,0 +1,962 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/extras/dec/apng.h"
+
+// Parts of this code are taken from apngdis, which has the following license:
+/* APNG Disassembler 2.8
+ *
+ * Deconstructs APNG files into individual frames.
+ *
+ * http://apngdis.sourceforge.net
+ *
+ * Copyright (c) 2010-2015 Max Stepin
+ * maxst at users.sourceforge.net
+ *
+ * zlib license
+ * ------------
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ *
+ */
+
+#include <jxl/codestream_header.h>
+#include <jxl/encode.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "lib/extras/size_constraints.h"
+#include "lib/jxl/base/byte_order.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/base/scope_guard.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/sanitizers.h"
+#include "png.h" /* original (unpatched) libpng is ok */
+
+namespace jxl {
+namespace extras {
+
+namespace {
+
+constexpr unsigned char kExifSignature[6] = {0x45, 0x78, 0x69,
+                                             0x66, 0x00, 0x00};
+
+/* hIST chunk tail is not proccesed properly; skip this chunk completely;
+   see https://github.com/glennrp/libpng/pull/413 */
+const png_byte kIgnoredPngChunks[] = {
+    104, 73, 83, 84, '\0' /* hIST */
+};
+
+// Returns floating-point value from the PNG encoding (times 10^5).
+static double F64FromU32(const uint32_t x) {
+  return static_cast<int32_t>(x) * 1E-5;
+}
+
+Status DecodeSRGB(const unsigned char* payload, const size_t payload_size,
+                  JxlColorEncoding* color_encoding) {
+  if (payload_size != 1) return JXL_FAILURE("Wrong sRGB size");
+  // (PNG uses the same values as ICC.)
+  if (payload[0] >= 4) return JXL_FAILURE("Invalid Rendering Intent");
+  color_encoding->white_point = JXL_WHITE_POINT_D65;
+  color_encoding->primaries = JXL_PRIMARIES_SRGB;
+  color_encoding->transfer_function = JXL_TRANSFER_FUNCTION_SRGB;
+  color_encoding->rendering_intent =
+      static_cast<JxlRenderingIntent>(payload[0]);
+  return true;
+}
+
+// If the cICP profile is not fully supported, return false and leave
+// color_encoding unmodified.
+Status DecodeCICP(const unsigned char* payload, const size_t payload_size,
+                  JxlColorEncoding* color_encoding) {
+  if (payload_size != 4) return JXL_FAILURE("Wrong cICP size");
+  JxlColorEncoding color_enc = *color_encoding;
+
+  // From https://www.itu.int/rec/T-REC-H.273-202107-I/en
+  if (payload[0] == 1) {
+    // IEC 61966-2-1 sRGB
+    color_enc.primaries = JXL_PRIMARIES_SRGB;
+    color_enc.white_point = JXL_WHITE_POINT_D65;
+  } else if (payload[0] == 4) {
+    // Rec. ITU-R BT.470-6 System M
+    color_enc.primaries = JXL_PRIMARIES_CUSTOM;
+    color_enc.primaries_red_xy[0] = 0.67;
+    color_enc.primaries_red_xy[1] = 0.33;
+    color_enc.primaries_green_xy[0] = 0.21;
+    color_enc.primaries_green_xy[1] = 0.71;
+    color_enc.primaries_blue_xy[0] = 0.14;
+    color_enc.primaries_blue_xy[1] = 0.08;
+    color_enc.white_point = JXL_WHITE_POINT_CUSTOM;
+    color_enc.white_point_xy[0] = 0.310;
+    color_enc.white_point_xy[1] = 0.316;
+  } else if (payload[0] == 5) {
+    // Rec. ITU-R BT.1700-0 625 PAL and 625 SECAM
+    color_enc.primaries = JXL_PRIMARIES_CUSTOM;
+    color_enc.primaries_red_xy[0] = 0.64;
+    color_enc.primaries_red_xy[1] = 0.33;
+    color_enc.primaries_green_xy[0] = 0.29;
+    color_enc.primaries_green_xy[1] = 0.60;
+    color_enc.primaries_blue_xy[0] = 0.15;
+    color_enc.primaries_blue_xy[1] = 0.06;
+    color_enc.white_point = JXL_WHITE_POINT_D65;
+  } else if (payload[0] == 6 || payload[0] == 7) {
+    // SMPTE ST 170 (2004) / SMPTE ST 240 (1999)
+    color_enc.primaries = JXL_PRIMARIES_CUSTOM;
+    color_enc.primaries_red_xy[0] = 0.630;
+    color_enc.primaries_red_xy[1] = 0.340;
+    color_enc.primaries_green_xy[0] = 0.310;
+    color_enc.primaries_green_xy[1] = 0.595;
+    color_enc.primaries_blue_xy[0] = 0.155;
+    color_enc.primaries_blue_xy[1] = 0.070;
+    color_enc.white_point = JXL_WHITE_POINT_D65;
+  } else if (payload[0] == 8) {
+    // Generic film (colour filters using Illuminant C)
+    color_enc.primaries = JXL_PRIMARIES_CUSTOM;
+    color_enc.primaries_red_xy[0] = 0.681;
+    color_enc.primaries_red_xy[1] = 0.319;
+    color_enc.primaries_green_xy[0] = 0.243;
+    color_enc.primaries_green_xy[1] = 0.692;
+    color_enc.primaries_blue_xy[0] = 0.145;
+    color_enc.primaries_blue_xy[1] = 0.049;
+    color_enc.white_point = JXL_WHITE_POINT_CUSTOM;
+    color_enc.white_point_xy[0] = 0.310;
+    color_enc.white_point_xy[1] = 0.316;
+  } else if (payload[0] == 9) {
+    // Rec. ITU-R BT.2100-2
+    color_enc.primaries = JXL_PRIMARIES_2100;
+    color_enc.white_point = JXL_WHITE_POINT_D65;
+  } else if (payload[0] == 10) {
+    // CIE 1931 XYZ
+    color_enc.primaries = JXL_PRIMARIES_CUSTOM;
+    color_enc.primaries_red_xy[0] = 1;
+    color_enc.primaries_red_xy[1] = 0;
+    color_enc.primaries_green_xy[0] = 0;
+    color_enc.primaries_green_xy[1] = 1;
+    color_enc.primaries_blue_xy[0] = 0;
+    color_enc.primaries_blue_xy[1] = 0;
+    color_enc.white_point = JXL_WHITE_POINT_E;
+  } else if (payload[0] == 11) {
+    // SMPTE RP 431-2 (2011)
+    color_enc.primaries = JXL_PRIMARIES_P3;
+    color_enc.white_point = JXL_WHITE_POINT_DCI;
+  } else if (payload[0] == 12) {
+    // SMPTE EG 432-1 (2010)
+    color_enc.primaries = JXL_PRIMARIES_P3;
+    color_enc.white_point = JXL_WHITE_POINT_D65;
+  } else if (payload[0] == 22) {
+    color_enc.primaries = JXL_PRIMARIES_CUSTOM;
+    color_enc.primaries_red_xy[0] = 0.630;
+    color_enc.primaries_red_xy[1] = 0.340;
+    color_enc.primaries_green_xy[0] = 0.295;
+    color_enc.primaries_green_xy[1] = 0.605;
+    color_enc.primaries_blue_xy[0] = 0.155;
+    color_enc.primaries_blue_xy[1] = 0.077;
+    color_enc.white_point = JXL_WHITE_POINT_D65;
+  } else {
+    JXL_WARNING("Unsupported primaries specified in cICP chunk: %d",
+                static_cast<int>(payload[0]));
+    return false;
+  }
+
+  if (payload[1] == 1 || payload[1] == 6 || payload[1] == 14 ||
+      payload[1] == 15) {
+    // Rec. ITU-R BT.709-6
+    color_enc.transfer_function = JXL_TRANSFER_FUNCTION_709;
+  } else if (payload[1] == 4) {
+    // Rec. ITU-R BT.1700-0 625 PAL and 625 SECAM
+    color_enc.transfer_function = JXL_TRANSFER_FUNCTION_GAMMA;
+    color_enc.gamma = 1 / 2.2;
+  } else if (payload[1] == 5) {
+    // Rec. ITU-R BT.470-6 System B, G
+    color_enc.transfer_function = JXL_TRANSFER_FUNCTION_GAMMA;
+    color_enc.gamma = 1 / 2.8;
+  } else if (payload[1] == 8 || payload[1] == 13 || payload[1] == 16 ||
+             payload[1] == 17 || payload[1] == 18) {
+    // These codes all match the corresponding JXL enum values
+    color_enc.transfer_function = static_cast<JxlTransferFunction>(payload[1]);
+  } else {
+    JXL_WARNING("Unsupported transfer function specified in cICP chunk: %d",
+                static_cast<int>(payload[1]));
+    return false;
+  }
+
+  if (payload[2] != 0) {
+    JXL_WARNING("Unsupported color space specified in cICP chunk: %d",
+                static_cast<int>(payload[2]));
+    return false;
+  }
+  if (payload[3] != 1) {
+    JXL_WARNING("Unsupported full-range flag specified in cICP chunk: %d",
+                static_cast<int>(payload[3]));
+    return false;
+  }
+  // cICP has no rendering intent, so use the default
+  color_enc.rendering_intent = JXL_RENDERING_INTENT_RELATIVE;
+  *color_encoding = color_enc;
+  return true;
+}
+
+Status DecodeGAMA(const unsigned char* payload, const size_t payload_size,
+                  JxlColorEncoding* color_encoding) {
+  if (payload_size != 4) return JXL_FAILURE("Wrong gAMA size");
+  color_encoding->transfer_function = JXL_TRANSFER_FUNCTION_GAMMA;
+  color_encoding->gamma = F64FromU32(LoadBE32(payload));
+  return true;
+}
+
+Status DecodeCHRM(const unsigned char* payload, const size_t payload_size,
+                  JxlColorEncoding* color_encoding) {
+  if (payload_size != 32) return JXL_FAILURE("Wrong cHRM size");
+
+  color_encoding->white_point = JXL_WHITE_POINT_CUSTOM;
+  color_encoding->white_point_xy[0] = F64FromU32(LoadBE32(payload + 0));
+  color_encoding->white_point_xy[1] = F64FromU32(LoadBE32(payload + 4));
+
+  color_encoding->primaries = JXL_PRIMARIES_CUSTOM;
+  color_encoding->primaries_red_xy[0] = F64FromU32(LoadBE32(payload + 8));
+  color_encoding->primaries_red_xy[1] = F64FromU32(LoadBE32(payload + 12));
+  color_encoding->primaries_green_xy[0] = F64FromU32(LoadBE32(payload + 16));
+  color_encoding->primaries_green_xy[1] = F64FromU32(LoadBE32(payload + 20));
+  color_encoding->primaries_blue_xy[0] = F64FromU32(LoadBE32(payload + 24));
+  color_encoding->primaries_blue_xy[1] = F64FromU32(LoadBE32(payload + 28));
+  return true;
+}
+
+// Retrieves XMP and EXIF/IPTC from itext and text.
+class BlobsReaderPNG {
+ public:
+  static Status Decode(const png_text_struct& info, PackedMetadata* metadata) {
+    // We trust these are properly null-terminated by libpng.
+    const char* key = info.key;
+    const char* value = info.text;
+    if (strstr(key, "XML:com.adobe.xmp")) {
+      metadata->xmp.resize(strlen(value));  // safe, see above
+      memcpy(metadata->xmp.data(), value, metadata->xmp.size());
+    }
+
+    std::string type;
+    std::vector<uint8_t> bytes;
+
+    // Handle text chunks annotated with key "Raw profile type ####", with
+    // #### a type, which may contain metadata.
+    const char* kKey = "Raw profile type ";
+    if (strncmp(key, kKey, strlen(kKey)) != 0) return false;
+
+    if (!MaybeDecodeBase16(key, value, &type, &bytes)) {
+      JXL_WARNING("Couldn't parse 'Raw format type' text chunk");
+      return false;
+    }
+    if (type == "exif") {
+      // Remove "Exif\0\0" prefix if present
+      if (bytes.size() >= sizeof kExifSignature &&
+          memcmp(bytes.data(), kExifSignature, sizeof kExifSignature) == 0) {
+        bytes.erase(bytes.begin(), bytes.begin() + sizeof kExifSignature);
+      }
+      if (!metadata->exif.empty()) {
+        JXL_WARNING("overwriting EXIF (%" PRIuS " bytes) with base16 (%" PRIuS
+                    " bytes)",
+                    metadata->exif.size(), bytes.size());
+      }
+      metadata->exif = std::move(bytes);
+    } else if (type == "iptc") {
+      // TODO (jon): Deal with IPTC in some way
+    } else if (type == "8bim") {
+      // TODO (jon): Deal with 8bim in some way
+    } else if (type == "xmp") {
+      if (!metadata->xmp.empty()) {
+        JXL_WARNING("overwriting XMP (%" PRIuS " bytes) with base16 (%" PRIuS
+                    " bytes)",
+                    metadata->xmp.size(), bytes.size());
+      }
+      metadata->xmp = std::move(bytes);
+    } else {
+      JXL_WARNING("Unknown type in 'Raw format type' text chunk: %s: %" PRIuS
+                  " bytes",
+                  type.c_str(), bytes.size());
+    }
+    return true;
+  }
+
+ private:
+  // Returns false if invalid.
+  static JXL_INLINE Status DecodeNibble(const char c,
+                                        uint32_t* JXL_RESTRICT nibble) {
+    if ('a' <= c && c <= 'f') {
+      *nibble = 10 + c - 'a';
+    } else if ('0' <= c && c <= '9') {
+      *nibble = c - '0';
+    } else {
+      *nibble = 0;
+      return JXL_FAILURE("Invalid metadata nibble");
+    }
+    JXL_ASSERT(*nibble < 16);
+    return true;
+  }
+
+  // Returns false if invalid.
+  static JXL_INLINE Status DecodeDecimal(const char** pos, const char* end,
+                                         uint32_t* JXL_RESTRICT value) {
+    size_t len = 0;
+    *value = 0;
+    while (*pos < end) {
+      char next = **pos;
+      if (next >= '0' && next <= '9') {
+        *value = (*value * 10) + static_cast<uint32_t>(next - '0');
+        len++;
+        if (len > 8) {
+          break;
+        }
+      } else {
+        // Do not consume terminator (non-decimal digit).
+        break;
+      }
+      (*pos)++;
+    }
+    if (len == 0 || len > 8) {
+      return JXL_FAILURE("Failed to parse decimal");
+    }
+    return true;
+  }
+
+  // Parses a PNG text chunk with key of the form "Raw profile type ####", with
+  // #### a type.
+  // Returns whether it could successfully parse the content.
+  // We trust key and encoded are null-terminated because they come from
+  // libpng.
+  static Status MaybeDecodeBase16(const char* key, const char* encoded,
+                                  std::string* type,
+                                  std::vector<uint8_t>* bytes) {
+    const char* encoded_end = encoded + strlen(encoded);
+
+    const char* kKey = "Raw profile type ";
+    if (strncmp(key, kKey, strlen(kKey)) != 0) return false;
+    *type = key + strlen(kKey);
+    const size_t kMaxTypeLen = 20;
+    if (type->length() > kMaxTypeLen) return false;  // Type too long
+
+    // Header: freeform string and number of bytes
+    // Expected format is:
+    // \n
+    // profile name/description\n
+    //       40\n               (the number of bytes after hex-decoding)
+    // 01234566789abcdef....\n  (72 bytes per line max).
+    // 012345667\n              (last line)
+    const char* pos = encoded;
+
+    if (*(pos++) != '\n') return false;
+    while (pos < encoded_end && *pos != '\n') {
+      pos++;
+    }
+    if (pos == encoded_end) return false;
+    // We parsed so far a \n, some number of non \n characters and are now
+    // pointing at a \n.
+    if (*(pos++) != '\n') return false;
+    // Skip leading spaces
+    while (pos < encoded_end && *pos == ' ') {
+      pos++;
+    }
+    uint32_t bytes_to_decode = 0;
+    JXL_RETURN_IF_ERROR(DecodeDecimal(&pos, encoded_end, &bytes_to_decode));
+
+    // We need 2*bytes for the hex values plus 1 byte every 36 values,
+    // plus terminal \n for length.
+    const unsigned long needed_bytes =
+        bytes_to_decode * 2 + 1 + DivCeil(bytes_to_decode, 36);
+    if (needed_bytes != static_cast<size_t>(encoded_end - pos)) {
+      return JXL_FAILURE("Not enough bytes to parse %d bytes in hex",
+                         bytes_to_decode);
+    }
+    JXL_ASSERT(bytes->empty());
+    bytes->reserve(bytes_to_decode);
+
+    // Encoding: base16 with newline after 72 chars.
+    // pos points to the \n before the first line of hex values.
+    for (size_t i = 0; i < bytes_to_decode; ++i) {
+      if (i % 36 == 0) {
+        if (pos + 1 >= encoded_end) return false;  // Truncated base16 1
+        if (*pos != '\n') return false;            // Expected newline
+        ++pos;
+      }
+
+      if (pos + 2 >= encoded_end) return false;  // Truncated base16 2;
+      uint32_t nibble0, nibble1;
+      JXL_RETURN_IF_ERROR(DecodeNibble(pos[0], &nibble0));
+      JXL_RETURN_IF_ERROR(DecodeNibble(pos[1], &nibble1));
+      bytes->push_back(static_cast<uint8_t>((nibble0 << 4) + nibble1));
+      pos += 2;
+    }
+    if (pos + 1 != encoded_end) return false;  // Too many encoded bytes
+    if (pos[0] != '\n') return false;          // Incorrect metadata terminator
+    return true;
+  }
+};
+
+constexpr bool isAbc(char c) {
+  return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
+}
+
+constexpr uint32_t kId_IHDR = 0x52444849;
+constexpr uint32_t kId_acTL = 0x4C546361;
+constexpr uint32_t kId_fcTL = 0x4C546366;
+constexpr uint32_t kId_IDAT = 0x54414449;
+constexpr uint32_t kId_fdAT = 0x54416466;
+constexpr uint32_t kId_IEND = 0x444E4549;
+constexpr uint32_t kId_cICP = 0x50434963;
+constexpr uint32_t kId_iCCP = 0x50434369;
+constexpr uint32_t kId_sRGB = 0x42475273;
+constexpr uint32_t kId_gAMA = 0x414D4167;
+constexpr uint32_t kId_cHRM = 0x4D524863;
+constexpr uint32_t kId_eXIf = 0x66495865;
+
+struct APNGFrame {
+  std::vector<uint8_t> pixels;
+  std::vector<uint8_t*> rows;
+  unsigned int w, h, delay_num, delay_den;
+};
+
+struct Reader {
+  const uint8_t* next;
+  const uint8_t* last;
+  bool Read(void* data, size_t len) {
+    size_t cap = last - next;
+    size_t to_copy = std::min(cap, len);
+    memcpy(data, next, to_copy);
+    next += to_copy;
+    return (len == to_copy);
+  }
+  bool Eof() { return next == last; }
+};
+
+const unsigned long cMaxPNGSize = 1000000UL;
+const size_t kMaxPNGChunkSize = 1lu << 30;  // 1 GB
+
+void info_fn(png_structp png_ptr, png_infop info_ptr) {
+  png_set_expand(png_ptr);
+  png_set_palette_to_rgb(png_ptr);
+  png_set_tRNS_to_alpha(png_ptr);
+  (void)png_set_interlace_handling(png_ptr);
+  png_read_update_info(png_ptr, info_ptr);
+}
+
+void row_fn(png_structp png_ptr, png_bytep new_row, png_uint_32 row_num,
+            int pass) {
+  APNGFrame* frame = (APNGFrame*)png_get_progressive_ptr(png_ptr);
+  JXL_CHECK(frame);
+  JXL_CHECK(row_num < frame->rows.size());
+  JXL_CHECK(frame->rows[row_num] < frame->pixels.data() + frame->pixels.size());
+  png_progressive_combine_row(png_ptr, frame->rows[row_num], new_row);
+}
+
+inline unsigned int read_chunk(Reader* r, std::vector<uint8_t>* pChunk) {
+  unsigned char len[4];
+  if (r->Read(&len, 4)) {
+    const auto size = png_get_uint_32(len);
+    // Check first, to avoid overflow.
+    if (size > kMaxPNGChunkSize) {
+      JXL_WARNING("APNG chunk size is too big");
+      return 0;
+    }
+    pChunk->resize(size + 12);
+    memcpy(pChunk->data(), len, 4);
+    if (r->Read(pChunk->data() + 4, pChunk->size() - 4)) {
+      return LoadLE32(pChunk->data() + 4);
+    }
+  }
+  return 0;
+}
+
+int processing_start(png_structp& png_ptr, png_infop& info_ptr, void* frame_ptr,
+                     bool hasInfo, std::vector<uint8_t>& chunkIHDR,
+                     std::vector<std::vector<uint8_t>>& chunksInfo) {
+  unsigned char header[8] = {137, 80, 78, 71, 13, 10, 26, 10};
+
+  // Cleanup prior decoder, if any.
+  png_destroy_read_struct(&png_ptr, &info_ptr, 0);
+  // Just in case. Not all versions on libpng wipe-out the pointers.
+  png_ptr = nullptr;
+  info_ptr = nullptr;
+
+  png_ptr = png_create_read_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL);
+  info_ptr = png_create_info_struct(png_ptr);
+  if (!png_ptr || !info_ptr) return 1;
+
+  if (setjmp(png_jmpbuf(png_ptr))) {
+    return 1;
+  }
+
+  png_set_keep_unknown_chunks(png_ptr, 1, kIgnoredPngChunks,
+                              (int)sizeof(kIgnoredPngChunks) / 5);
+
+  png_set_crc_action(png_ptr, PNG_CRC_QUIET_USE, PNG_CRC_QUIET_USE);
+  png_set_progressive_read_fn(png_ptr, frame_ptr, info_fn, row_fn, NULL);
+
+  png_process_data(png_ptr, info_ptr, header, 8);
+  png_process_data(png_ptr, info_ptr, chunkIHDR.data(), chunkIHDR.size());
+
+  if (hasInfo) {
+    for (unsigned int i = 0; i < chunksInfo.size(); i++) {
+      png_process_data(png_ptr, info_ptr, chunksInfo[i].data(),
+                       chunksInfo[i].size());
+    }
+  }
+  return 0;
+}
+
+int processing_data(png_structp png_ptr, png_infop info_ptr, unsigned char* p,
+                    unsigned int size) {
+  if (!png_ptr || !info_ptr) return 1;
+
+  if (setjmp(png_jmpbuf(png_ptr))) {
+    return 1;
+  }
+
+  png_process_data(png_ptr, info_ptr, p, size);
+  return 0;
+}
+
+int processing_finish(png_structp png_ptr, png_infop info_ptr,
+                      PackedMetadata* metadata) {
+  unsigned char footer[12] = {0, 0, 0, 0, 73, 69, 78, 68, 174, 66, 96, 130};
+
+  if (!png_ptr || !info_ptr) return 1;
+
+  if (setjmp(png_jmpbuf(png_ptr))) {
+    return 1;
+  }
+
+  png_process_data(png_ptr, info_ptr, footer, 12);
+  // before destroying: check if we encountered any metadata chunks
+  png_textp text_ptr;
+  int num_text;
+  png_get_text(png_ptr, info_ptr, &text_ptr, &num_text);
+  for (int i = 0; i < num_text; i++) {
+    (void)BlobsReaderPNG::Decode(text_ptr[i], metadata);
+  }
+
+  return 0;
+}
+
+}  // namespace
+
+Status DecodeImageAPNG(const Span<const uint8_t> bytes,
+                       const ColorHints& color_hints, PackedPixelFile* ppf,
+                       const SizeConstraints* constraints) {
+  Reader r;
+  unsigned int id, j, w, h, w0, h0, x0, y0;
+  unsigned int delay_num, delay_den, dop, bop, rowbytes, imagesize;
+  unsigned char sig[8];
+  png_structp png_ptr = nullptr;
+  png_infop info_ptr = nullptr;
+  std::vector<uint8_t> chunk;
+  std::vector<uint8_t> chunkIHDR;
+  std::vector<std::vector<uint8_t>> chunksInfo;
+  bool isAnimated = false;
+  bool hasInfo = false;
+  APNGFrame frameRaw = {};
+  uint32_t num_channels;
+  JxlPixelFormat format;
+  unsigned int bytes_per_pixel = 0;
+
+  struct FrameInfo {
+    PackedImage data;
+    uint32_t duration;
+    size_t x0, xsize;
+    size_t y0, ysize;
+    uint32_t dispose_op;
+    uint32_t blend_op;
+  };
+
+  std::vector<FrameInfo> frames;
+
+  // Make sure png memory is released in any case.
+  auto scope_guard = MakeScopeGuard([&]() {
+    png_destroy_read_struct(&png_ptr, &info_ptr, 0);
+    // Just in case. Not all versions on libpng wipe-out the pointers.
+    png_ptr = nullptr;
+    info_ptr = nullptr;
+  });
+
+  r = {bytes.data(), bytes.data() + bytes.size()};
+  // Not a PNG => not an error
+  unsigned char png_signature[8] = {137, 80, 78, 71, 13, 10, 26, 10};
+  if (!r.Read(sig, 8) || memcmp(sig, png_signature, 8) != 0) {
+    return false;
+  }
+  id = read_chunk(&r, &chunkIHDR);
+
+  ppf->info.exponent_bits_per_sample = 0;
+  ppf->info.alpha_exponent_bits = 0;
+  ppf->info.orientation = JXL_ORIENT_IDENTITY;
+
+  ppf->frames.clear();
+
+  bool have_color = false;
+  bool have_cicp = false, have_iccp = false, have_srgb = false;
+  bool errorstate = true;
+  if (id == kId_IHDR && chunkIHDR.size() == 25) {
+    x0 = 0;
+    y0 = 0;
+    delay_num = 1;
+    delay_den = 10;
+    dop = 0;
+    bop = 0;
+
+    w0 = w = png_get_uint_32(chunkIHDR.data() + 8);
+    h0 = h = png_get_uint_32(chunkIHDR.data() + 12);
+    if (w > cMaxPNGSize || h > cMaxPNGSize) {
+      return false;
+    }
+
+    // default settings in case e.g. only gAMA is given
+    ppf->color_encoding.color_space = JXL_COLOR_SPACE_RGB;
+    ppf->color_encoding.white_point = JXL_WHITE_POINT_D65;
+    ppf->color_encoding.primaries = JXL_PRIMARIES_SRGB;
+    ppf->color_encoding.transfer_function = JXL_TRANSFER_FUNCTION_SRGB;
+    ppf->color_encoding.rendering_intent = JXL_RENDERING_INTENT_RELATIVE;
+
+    if (!processing_start(png_ptr, info_ptr, (void*)&frameRaw, hasInfo,
+                          chunkIHDR, chunksInfo)) {
+      while (!r.Eof()) {
+        id = read_chunk(&r, &chunk);
+        if (!id) break;
+
+        if (id == kId_acTL && !hasInfo && !isAnimated) {
+          isAnimated = true;
+          ppf->info.have_animation = true;
+          ppf->info.animation.tps_numerator = 1000;
+          ppf->info.animation.tps_denominator = 1;
+        } else if (id == kId_IEND ||
+                   (id == kId_fcTL && (!hasInfo || isAnimated))) {
+          if (hasInfo) {
+            if (!processing_finish(png_ptr, info_ptr, &ppf->metadata)) {
+              // Allocates the frame buffer.
+              uint32_t duration = delay_num * 1000 / delay_den;
+              frames.push_back(FrameInfo{PackedImage(w0, h0, format), duration,
+                                         x0, w0, y0, h0, dop, bop});
+              auto& frame = frames.back().data;
+              for (size_t y = 0; y < h0; ++y) {
+                memcpy(static_cast<uint8_t*>(frame.pixels()) + frame.stride * y,
+                       frameRaw.rows[y], bytes_per_pixel * w0);
+              }
+            } else {
+              break;
+            }
+          }
+
+          if (id == kId_IEND) {
+            errorstate = false;
+            break;
+          }
+          if (chunk.size() < 34) {
+            return JXL_FAILURE("Received a chunk that is too small (%" PRIuS
+                               "B)",
+                               chunk.size());
+          }
+          // At this point the old frame is done. Let's start a new one.
+          w0 = png_get_uint_32(chunk.data() + 12);
+          h0 = png_get_uint_32(chunk.data() + 16);
+          x0 = png_get_uint_32(chunk.data() + 20);
+          y0 = png_get_uint_32(chunk.data() + 24);
+          delay_num = png_get_uint_16(chunk.data() + 28);
+          delay_den = png_get_uint_16(chunk.data() + 30);
+          dop = chunk[32];
+          bop = chunk[33];
+
+          if (!delay_den) delay_den = 100;
+
+          if (w0 > cMaxPNGSize || h0 > cMaxPNGSize || x0 > cMaxPNGSize ||
+              y0 > cMaxPNGSize || x0 + w0 > w || y0 + h0 > h || dop > 2 ||
+              bop > 1) {
+            break;
+          }
+
+          if (hasInfo) {
+            memcpy(chunkIHDR.data() + 8, chunk.data() + 12, 8);
+            if (processing_start(png_ptr, info_ptr, (void*)&frameRaw, hasInfo,
+                                 chunkIHDR, chunksInfo)) {
+              break;
+            }
+          }
+        } else if (id == kId_IDAT) {
+          // First IDAT chunk means we now have all header info
+          hasInfo = true;
+          JXL_CHECK(w == png_get_image_width(png_ptr, info_ptr));
+          JXL_CHECK(h == png_get_image_height(png_ptr, info_ptr));
+          int colortype = png_get_color_type(png_ptr, info_ptr);
+          int png_bit_depth = png_get_bit_depth(png_ptr, info_ptr);
+          ppf->info.bits_per_sample = png_bit_depth;
+          png_color_8p sigbits = NULL;
+          png_get_sBIT(png_ptr, info_ptr, &sigbits);
+          if (colortype & 1) {
+            // palette will actually be 8-bit regardless of the index bitdepth
+            ppf->info.bits_per_sample = 8;
+          }
+          if (colortype & 2) {
+            ppf->info.num_color_channels = 3;
+            ppf->color_encoding.color_space = JXL_COLOR_SPACE_RGB;
+            if (sigbits && sigbits->red == sigbits->green &&
+                sigbits->green == sigbits->blue)
+              ppf->info.bits_per_sample = sigbits->red;
+          } else {
+            ppf->info.num_color_channels = 1;
+            ppf->color_encoding.color_space = JXL_COLOR_SPACE_GRAY;
+            if (sigbits) ppf->info.bits_per_sample = sigbits->gray;
+          }
+          if (colortype & 4 ||
+              png_get_valid(png_ptr, info_ptr, PNG_INFO_tRNS)) {
+            ppf->info.alpha_bits = ppf->info.bits_per_sample;
+            if (sigbits) {
+              if (sigbits->alpha &&
+                  sigbits->alpha != ppf->info.bits_per_sample) {
+                return JXL_FAILURE("Unsupported alpha bit-depth");
+              }
+              ppf->info.alpha_bits = sigbits->alpha;
+            }
+          } else {
+            ppf->info.alpha_bits = 0;
+          }
+          ppf->color_encoding.color_space =
+              (ppf->info.num_color_channels == 1 ? JXL_COLOR_SPACE_GRAY
+                                                 : JXL_COLOR_SPACE_RGB);
+          ppf->info.xsize = w;
+          ppf->info.ysize = h;
+          JXL_RETURN_IF_ERROR(VerifyDimensions(constraints, w, h));
+          num_channels =
+              ppf->info.num_color_channels + (ppf->info.alpha_bits ? 1 : 0);
+          format = {
+              /*num_channels=*/num_channels,
+              /*data_type=*/ppf->info.bits_per_sample > 8 ? JXL_TYPE_UINT16
+                                                          : JXL_TYPE_UINT8,
+              /*endianness=*/JXL_BIG_ENDIAN,
+              /*align=*/0,
+          };
+          if (png_bit_depth > 8 && format.data_type == JXL_TYPE_UINT8) {
+            png_set_strip_16(png_ptr);
+          }
+          bytes_per_pixel =
+              num_channels * (format.data_type == JXL_TYPE_UINT16 ? 2 : 1);
+          rowbytes = w * bytes_per_pixel;
+          imagesize = h * rowbytes;
+          frameRaw.pixels.resize(imagesize);
+          frameRaw.rows.resize(h);
+          for (j = 0; j < h; j++)
+            frameRaw.rows[j] = frameRaw.pixels.data() + j * rowbytes;
+
+          if (processing_data(png_ptr, info_ptr, chunk.data(), chunk.size())) {
+            break;
+          }
+        } else if (id == kId_fdAT && isAnimated) {
+          png_save_uint_32(chunk.data() + 4, chunk.size() - 16);
+          memcpy(chunk.data() + 8, "IDAT", 4);
+          if (processing_data(png_ptr, info_ptr, chunk.data() + 4,
+                              chunk.size() - 4)) {
+            break;
+          }
+        } else if (id == kId_cICP) {
+          // Color profile chunks: cICP has the highest priority, followed by
+          // iCCP and sRGB (which shouldn't co-exist, but if they do, we use
+          // iCCP), followed finally by gAMA and cHRM.
+          if (DecodeCICP(chunk.data() + 8, chunk.size() - 12,
+                         &ppf->color_encoding)) {
+            have_cicp = true;
+            have_color = true;
+            ppf->icc.clear();
+          }
+        } else if (!have_cicp && id == kId_iCCP) {
+          if (processing_data(png_ptr, info_ptr, chunk.data(), chunk.size())) {
+            JXL_WARNING("Corrupt iCCP chunk");
+            break;
+          }
+
+          // TODO(jon): catch special case of PQ and synthesize color encoding
+          // in that case
+          int compression_type;
+          png_bytep profile;
+          png_charp name;
+          png_uint_32 proflen = 0;
+          auto ok = png_get_iCCP(png_ptr, info_ptr, &name, &compression_type,
+                                 &profile, &proflen);
+          if (ok && proflen) {
+            ppf->icc.assign(profile, profile + proflen);
+            have_color = true;
+            have_iccp = true;
+          } else {
+            // TODO(eustas): JXL_WARNING?
+          }
+        } else if (!have_cicp && !have_iccp && id == kId_sRGB) {
+          JXL_RETURN_IF_ERROR(DecodeSRGB(chunk.data() + 8, chunk.size() - 12,
+                                         &ppf->color_encoding));
+          have_srgb = true;
+          have_color = true;
+        } else if (!have_cicp && !have_srgb && !have_iccp && id == kId_gAMA) {
+          JXL_RETURN_IF_ERROR(DecodeGAMA(chunk.data() + 8, chunk.size() - 12,
+                                         &ppf->color_encoding));
+          have_color = true;
+        } else if (!have_cicp && !have_srgb && !have_iccp && id == kId_cHRM) {
+          JXL_RETURN_IF_ERROR(DecodeCHRM(chunk.data() + 8, chunk.size() - 12,
+                                         &ppf->color_encoding));
+          have_color = true;
+        } else if (id == kId_eXIf) {
+          ppf->metadata.exif.resize(chunk.size() - 12);
+          memcpy(ppf->metadata.exif.data(), chunk.data() + 8,
+                 chunk.size() - 12);
+        } else if (!isAbc(chunk[4]) || !isAbc(chunk[5]) || !isAbc(chunk[6]) ||
+                   !isAbc(chunk[7])) {
+          break;
+        } else {
+          if (processing_data(png_ptr, info_ptr, chunk.data(), chunk.size())) {
+            break;
+          }
+          if (!hasInfo) {
+            chunksInfo.push_back(chunk);
+            continue;
+          }
+        }
+      }
+    }
+
+    JXL_RETURN_IF_ERROR(ApplyColorHints(
+        color_hints, have_color, ppf->info.num_color_channels == 1, ppf));
+  }
+
+  if (errorstate) return false;
+
+  bool has_nontrivial_background = false;
+  bool previous_frame_should_be_cleared = false;
+  enum {
+    DISPOSE_OP_NONE = 0,
+    DISPOSE_OP_BACKGROUND = 1,
+    DISPOSE_OP_PREVIOUS = 2,
+  };
+  enum {
+    BLEND_OP_SOURCE = 0,
+    BLEND_OP_OVER = 1,
+  };
+  for (size_t i = 0; i < frames.size(); i++) {
+    auto& frame = frames[i];
+    JXL_ASSERT(frame.data.xsize == frame.xsize);
+    JXL_ASSERT(frame.data.ysize == frame.ysize);
+
+    // Before encountering a DISPOSE_OP_NONE frame, the canvas is filled with 0,
+    // so DISPOSE_OP_BACKGROUND and DISPOSE_OP_PREVIOUS are equivalent.
+    if (frame.dispose_op == DISPOSE_OP_NONE) {
+      has_nontrivial_background = true;
+    }
+    bool should_blend = frame.blend_op == BLEND_OP_OVER;
+    bool use_for_next_frame =
+        has_nontrivial_background && frame.dispose_op != DISPOSE_OP_PREVIOUS;
+    size_t x0 = frame.x0;
+    size_t y0 = frame.y0;
+    size_t xsize = frame.data.xsize;
+    size_t ysize = frame.data.ysize;
+    if (previous_frame_should_be_cleared) {
+      size_t px0 = frames[i - 1].x0;
+      size_t py0 = frames[i - 1].y0;
+      size_t pxs = frames[i - 1].xsize;
+      size_t pys = frames[i - 1].ysize;
+      if (px0 >= x0 && py0 >= y0 && px0 + pxs <= x0 + xsize &&
+          py0 + pys <= y0 + ysize && frame.blend_op == BLEND_OP_SOURCE &&
+          use_for_next_frame) {
+        // If the previous frame is entirely contained in the current frame and
+        // we are using BLEND_OP_SOURCE, nothing special needs to be done.
+        ppf->frames.emplace_back(std::move(frame.data));
+      } else if (px0 == x0 && py0 == y0 && px0 + pxs == x0 + xsize &&
+                 py0 + pys == y0 + ysize && use_for_next_frame) {
+        // If the new frame has the same size as the old one, but we are
+        // blending, we can instead just not blend.
+        should_blend = false;
+        ppf->frames.emplace_back(std::move(frame.data));
+      } else if (px0 <= x0 && py0 <= y0 && px0 + pxs >= x0 + xsize &&
+                 py0 + pys >= y0 + ysize && use_for_next_frame) {
+        // If the new frame is contained within the old frame, we can pad the
+        // new frame with zeros and not blend.
+        PackedImage new_data(pxs, pys, frame.data.format);
+        memset(new_data.pixels(), 0, new_data.pixels_size);
+        for (size_t y = 0; y < ysize; y++) {
+          size_t bytes_per_pixel =
+              PackedImage::BitsPerChannel(new_data.format.data_type) *
+              new_data.format.num_channels / 8;
+          memcpy(static_cast<uint8_t*>(new_data.pixels()) +
+                     new_data.stride * (y + y0 - py0) +
+                     bytes_per_pixel * (x0 - px0),
+                 static_cast<const uint8_t*>(frame.data.pixels()) +
+                     frame.data.stride * y,
+                 xsize * bytes_per_pixel);
+        }
+
+        x0 = px0;
+        y0 = py0;
+        xsize = pxs;
+        ysize = pys;
+        should_blend = false;
+        ppf->frames.emplace_back(std::move(new_data));
+      } else {
+        // If all else fails, insert a dummy blank frame with kReplace.
+        PackedImage blank(pxs, pys, frame.data.format);
+        memset(blank.pixels(), 0, blank.pixels_size);
+        ppf->frames.emplace_back(std::move(blank));
+        auto& pframe = ppf->frames.back();
+        pframe.frame_info.layer_info.crop_x0 = px0;
+        pframe.frame_info.layer_info.crop_y0 = py0;
+        pframe.frame_info.layer_info.xsize = pxs;
+        pframe.frame_info.layer_info.ysize = pys;
+        pframe.frame_info.duration = 0;
+        bool is_full_size = px0 == 0 && py0 == 0 && pxs == ppf->info.xsize &&
+                            pys == ppf->info.ysize;
+        pframe.frame_info.layer_info.have_crop = is_full_size ? 0 : 1;
+        pframe.frame_info.layer_info.blend_info.blendmode = JXL_BLEND_REPLACE;
+        pframe.frame_info.layer_info.blend_info.source = 1;
+        pframe.frame_info.layer_info.save_as_reference = 1;
+        ppf->frames.emplace_back(std::move(frame.data));
+      }
+    } else {
+      ppf->frames.emplace_back(std::move(frame.data));
+    }
+
+    auto& pframe = ppf->frames.back();
+    pframe.frame_info.layer_info.crop_x0 = x0;
+    pframe.frame_info.layer_info.crop_y0 = y0;
+    pframe.frame_info.layer_info.xsize = xsize;
+    pframe.frame_info.layer_info.ysize = ysize;
+    pframe.frame_info.duration = frame.duration;
+    pframe.frame_info.layer_info.blend_info.blendmode =
+        should_blend ? JXL_BLEND_BLEND : JXL_BLEND_REPLACE;
+    bool is_full_size = x0 == 0 && y0 == 0 && xsize == ppf->info.xsize &&
+                        ysize == ppf->info.ysize;
+    pframe.frame_info.layer_info.have_crop = is_full_size ? 0 : 1;
+    pframe.frame_info.layer_info.blend_info.source = 1;
+    pframe.frame_info.layer_info.blend_info.alpha = 0;
+    pframe.frame_info.layer_info.save_as_reference = use_for_next_frame ? 1 : 0;
+
+    previous_frame_should_be_cleared =
+        has_nontrivial_background && frame.dispose_op == DISPOSE_OP_BACKGROUND;
+  }
+  if (ppf->frames.empty()) return JXL_FAILURE("No frames decoded");
+  ppf->frames.back().frame_info.is_last = true;
+
+  return true;
+}
+
+}  // namespace extras
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/extras/dec/apng.h b/third_party/jpeg-xl/lib/extras/dec/apng.h
new file mode 100644
index 0000000000..6502ac80c0
--- /dev/null
+++ b/third_party/jpeg-xl/lib/extras/dec/apng.h
@@ -0,0 +1,34 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_EXTRAS_DEC_APNG_H_
+#define LIB_EXTRAS_DEC_APNG_H_
+
+// Decodes APNG images in memory.
+
+#include <stdint.h>
+
+#include "lib/extras/dec/color_hints.h"
+#include "lib/extras/packed_image.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/padded_bytes.h"
+#include "lib/jxl/base/span.h"
+#include "lib/jxl/base/status.h"
+
+namespace jxl {
+
+struct SizeConstraints;
+
+namespace extras {
+
+// Decodes `bytes` into `ppf`.
+Status DecodeImageAPNG(Span<const uint8_t> bytes, const ColorHints& color_hints,
+                       PackedPixelFile* ppf,
+                       const SizeConstraints* constraints = nullptr);
+
+}  // namespace extras
+}  // namespace jxl
+
+#endif  // LIB_EXTRAS_DEC_APNG_H_
diff --git a/third_party/jpeg-xl/lib/extras/dec/color_description.cc b/third_party/jpeg-xl/lib/extras/dec/color_description.cc
new file mode 100644
index 0000000000..54f6aa4206
--- /dev/null
+++ b/third_party/jpeg-xl/lib/extras/dec/color_description.cc
@@ -0,0 +1,218 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/extras/dec/color_description.h"
+
+#include <errno.h>
+
+#include <cmath>
+
+namespace jxl {
+
+namespace {
+
+template <typename T>
+struct EnumName {
+  const char* name;
+  T value;
+};
+
+const EnumName<JxlColorSpace> kJxlColorSpaceNames[] = {
+    {"RGB", JXL_COLOR_SPACE_RGB},
+    {"Gra", JXL_COLOR_SPACE_GRAY},
+    {"XYB", JXL_COLOR_SPACE_XYB},
+    {"CS?", JXL_COLOR_SPACE_UNKNOWN},
+};
+
+const EnumName<JxlWhitePoint> kJxlWhitePointNames[] = {
+    {"D65", JXL_WHITE_POINT_D65},
+    {"Cst", JXL_WHITE_POINT_CUSTOM},
+    {"EER", JXL_WHITE_POINT_E},
+    {"DCI", JXL_WHITE_POINT_DCI},
+};
+
+const EnumName<JxlPrimaries> kJxlPrimariesNames[] = {
+    {"SRG", JXL_PRIMARIES_SRGB},
+    {"Cst", JXL_PRIMARIES_CUSTOM},
+    {"202", JXL_PRIMARIES_2100},
+    {"DCI", JXL_PRIMARIES_P3},
+};
+
+const EnumName<JxlTransferFunction> kJxlTransferFunctionNames[] = {
+    {"709", JXL_TRANSFER_FUNCTION_709},
+    {"TF?", JXL_TRANSFER_FUNCTION_UNKNOWN},
+    {"Lin", JXL_TRANSFER_FUNCTION_LINEAR},
+    {"SRG", JXL_TRANSFER_FUNCTION_SRGB},
+    {"PeQ", JXL_TRANSFER_FUNCTION_PQ},
+    {"DCI", JXL_TRANSFER_FUNCTION_DCI},
+    {"HLG", JXL_TRANSFER_FUNCTION_HLG},
+    {"", JXL_TRANSFER_FUNCTION_GAMMA},
+};
+
+const EnumName<JxlRenderingIntent> kJxlRenderingIntentNames[] = {
+    {"Per", JXL_RENDERING_INTENT_PERCEPTUAL},
+    {"Rel", JXL_RENDERING_INTENT_RELATIVE},
+    {"Sat", JXL_RENDERING_INTENT_SATURATION},
+    {"Abs", JXL_RENDERING_INTENT_ABSOLUTE},
+};
+
+template <typename T>
+Status ParseEnum(const std::string& token, const EnumName<T>* enum_values,
+                 size_t enum_len, T* value) {
+  for (size_t i = 0; i < enum_len; i++) {
+    if (enum_values[i].name == token) {
+      *value = enum_values[i].value;
+      return true;
+    }
+  }
+  return false;
+}
+#define ARRAY_SIZE(X) (sizeof(X) / sizeof((X)[0]))
+#define PARSE_ENUM(type, token, value) \
+  ParseEnum<type>(token, k##type##Names, ARRAY_SIZE(k##type##Names), value)
+
+class Tokenizer {
+ public:
+  Tokenizer(const std::string* input, char separator)
+      : input_(input), separator_(separator) {}
+
+  Status Next(std::string* next) {
+    const size_t end = input_->find(separator_, start_);
+    if (end == std::string::npos) {
+      *next = input_->substr(start_);  // rest of string
+    } else {
+      *next = input_->substr(start_, end - start_);
+    }
+    if (next->empty()) return JXL_FAILURE("Missing token");
+    start_ = end + 1;
+    return true;
+  }
+
+ private:
+  const std::string* const input_;  // not owned
+  const char separator_;
+  size_t start_ = 0;  // of next token
+};
+
+Status ParseDouble(const std::string& num, double* d) {
+  char* end;
+  errno = 0;
+  *d = strtod(num.c_str(), &end);
+  if (*d == 0.0 && end == num.c_str()) {
+    return JXL_FAILURE("Invalid double: %s", num.c_str());
+  }
+  if (std::isnan(*d)) {
+    return JXL_FAILURE("Invalid double: %s", num.c_str());
+  }
+  if (errno == ERANGE) {
+    return JXL_FAILURE("Double out of range: %s", num.c_str());
+  }
+  return true;
+}
+
+Status ParseDouble(Tokenizer* tokenizer, double* d) {
+  std::string num;
+  JXL_RETURN_IF_ERROR(tokenizer->Next(&num));
+  return ParseDouble(num, d);
+}
+
+Status ParseColorSpace(Tokenizer* tokenizer, JxlColorEncoding* c) {
+  std::string str;
+  JXL_RETURN_IF_ERROR(tokenizer->Next(&str));
+  JxlColorSpace cs;
+  if (PARSE_ENUM(JxlColorSpace, str, &cs)) {
+    c->color_space = cs;
+    return true;
+  }
+
+  return JXL_FAILURE("Unknown ColorSpace %s", str.c_str());
+}
+
+Status ParseWhitePoint(Tokenizer* tokenizer, JxlColorEncoding* c) {
+  if (c->color_space == JXL_COLOR_SPACE_XYB) {
+    // Implicit white point.
+    c->white_point = JXL_WHITE_POINT_D65;
+    return true;
+  }
+
+  std::string str;
+  JXL_RETURN_IF_ERROR(tokenizer->Next(&str));
+  if (PARSE_ENUM(JxlWhitePoint, str, &c->white_point)) return true;
+
+  Tokenizer xy_tokenizer(&str, ';');
+  c->white_point = JXL_WHITE_POINT_CUSTOM;
+  JXL_RETURN_IF_ERROR(ParseDouble(&xy_tokenizer, c->white_point_xy + 0));
+  JXL_RETURN_IF_ERROR(ParseDouble(&xy_tokenizer, c->white_point_xy + 1));
+  return true;
+}
+
+Status ParsePrimaries(Tokenizer* tokenizer, JxlColorEncoding* c) {
+  if (c->color_space == JXL_COLOR_SPACE_GRAY ||
+      c->color_space == JXL_COLOR_SPACE_XYB) {
+    // No primaries case.
+    return true;
+  }
+
+  std::string str;
+  JXL_RETURN_IF_ERROR(tokenizer->Next(&str));
+  if (PARSE_ENUM(JxlPrimaries, str, &c->primaries)) return true;
+
+  Tokenizer xy_tokenizer(&str, ';');
+  JXL_RETURN_IF_ERROR(ParseDouble(&xy_tokenizer, c->primaries_red_xy + 0));
+  JXL_RETURN_IF_ERROR(ParseDouble(&xy_tokenizer, c->primaries_red_xy + 1));
+  JXL_RETURN_IF_ERROR(ParseDouble(&xy_tokenizer, c->primaries_green_xy + 0));
+  JXL_RETURN_IF_ERROR(ParseDouble(&xy_tokenizer, c->primaries_green_xy + 1));
+  JXL_RETURN_IF_ERROR(ParseDouble(&xy_tokenizer, c->primaries_blue_xy + 0));
+  JXL_RETURN_IF_ERROR(ParseDouble(&xy_tokenizer, c->primaries_blue_xy + 1));
+  c->primaries = JXL_PRIMARIES_CUSTOM;
+
+  return JXL_FAILURE("Invalid primaries %s", str.c_str());
+}
+
+Status ParseRenderingIntent(Tokenizer* tokenizer, JxlColorEncoding* c) {
+  std::string str;
+  JXL_RETURN_IF_ERROR(tokenizer->Next(&str));
+  if (PARSE_ENUM(JxlRenderingIntent, str, &c->rendering_intent)) return true;
+
+  return JXL_FAILURE("Invalid RenderingIntent %s\n", str.c_str());
+}
+
+Status ParseTransferFunction(Tokenizer* tokenizer, JxlColorEncoding* c) {
+  if (c->color_space == JXL_COLOR_SPACE_XYB) {
+    // Implicit TF.
+    c->transfer_function = JXL_TRANSFER_FUNCTION_GAMMA;
+    c->gamma = 1 / 3.;
+    return true;
+  }
+
+  std::string str;
+  JXL_RETURN_IF_ERROR(tokenizer->Next(&str));
+  if (PARSE_ENUM(JxlTransferFunction, str, &c->transfer_function)) {
+    return true;
+  }
+
+  if (str[0] == 'g') {
+    JXL_RETURN_IF_ERROR(ParseDouble(str.substr(1), &c->gamma));
+    c->transfer_function = JXL_TRANSFER_FUNCTION_GAMMA;
+    return true;
+  }
+
+  return JXL_FAILURE("Invalid gamma %s", str.c_str());
+}
+
+}  // namespace
+
+Status ParseDescription(const std::string& description, JxlColorEncoding* c) {
+  *c = {};
+  Tokenizer tokenizer(&description, '_');
+  JXL_RETURN_IF_ERROR(ParseColorSpace(&tokenizer, c));
+  JXL_RETURN_IF_ERROR(ParseWhitePoint(&tokenizer, c));
+  JXL_RETURN_IF_ERROR(ParsePrimaries(&tokenizer, c));
+  JXL_RETURN_IF_ERROR(ParseRenderingIntent(&tokenizer, c));
+  JXL_RETURN_IF_ERROR(ParseTransferFunction(&tokenizer, c));
+  return true;
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/extras/dec/color_description.h b/third_party/jpeg-xl/lib/extras/dec/color_description.h
new file mode 100644
index 0000000000..23680ff7c6
--- /dev/null
+++ b/third_party/jpeg-xl/lib/extras/dec/color_description.h
@@ -0,0 +1,23 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_EXTRAS_COLOR_DESCRIPTION_H_
+#define LIB_EXTRAS_COLOR_DESCRIPTION_H_
+
+#include <jxl/color_encoding.h>
+
+#include <string>
+
+#include "lib/jxl/base/status.h"
+
+namespace jxl {
+
+// Parse the color description into a JxlColorEncoding "RGB_D65_SRG_Rel_Lin".
+Status ParseDescription(const std::string& description,
+                        JxlColorEncoding* JXL_RESTRICT c);
+
+}  // namespace jxl
+
+#endif  // LIB_EXTRAS_COLOR_DESCRIPTION_H_
diff --git a/third_party/jpeg-xl/lib/extras/dec/color_description_test.cc b/third_party/jpeg-xl/lib/extras/dec/color_description_test.cc
new file mode 100644
index 0000000000..a1c04a94e4
--- /dev/null
+++ b/third_party/jpeg-xl/lib/extras/dec/color_description_test.cc
@@ -0,0 +1,38 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/extras/dec/color_description.h"
+
+#include "lib/jxl/color_encoding_internal.h"
+#include "lib/jxl/test_utils.h"
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+
+// Verify ParseDescription(Description) yields the same ColorEncoding
+TEST(ColorDescriptionTest, RoundTripAll) {
+  for (const auto& cdesc : test::AllEncodings()) {
+    const ColorEncoding c_original = test::ColorEncodingFromDescriptor(cdesc);
+    const std::string description = Description(c_original);
+    printf("%s\n", description.c_str());
+
+    JxlColorEncoding c_external = {};
+    EXPECT_TRUE(ParseDescription(description, &c_external));
+    ColorEncoding c_internal;
+    EXPECT_TRUE(
+        ConvertExternalToInternalColorEncoding(c_external, &c_internal));
+    EXPECT_TRUE(c_original.SameColorEncoding(c_internal))
+        << "Where c_original=" << c_original
+        << " and c_internal=" << c_internal;
+  }
+}
+
+TEST(ColorDescriptionTest, NanGamma) {
+  const std::string description = "Gra_2_Per_gnan";
+  JxlColorEncoding c;
+  EXPECT_FALSE(ParseDescription(description, &c));
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/extras/dec/color_hints.cc b/third_party/jpeg-xl/lib/extras/dec/color_hints.cc
new file mode 100644
index 0000000000..53f7cd0543
--- /dev/null
+++ b/third_party/jpeg-xl/lib/extras/dec/color_hints.cc
@@ -0,0 +1,66 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/extras/dec/color_hints.h"
+
+#include <jxl/encode.h>
+
+#include "lib/extras/dec/color_description.h"
+#include "lib/jxl/base/file_io.h"
+
+namespace jxl {
+namespace extras {
+
+Status ApplyColorHints(const ColorHints& color_hints,
+                       const bool color_already_set, const bool is_gray,
+                       PackedPixelFile* ppf) {
+  if (color_already_set) {
+    return color_hints.Foreach(
+        [](const std::string& key, const std::string& /*value*/) {
+          JXL_WARNING("Decoder ignoring %s hint", key.c_str());
+          return true;
+        });
+  }
+
+  bool got_color_space = false;
+
+  JXL_RETURN_IF_ERROR(color_hints.Foreach(
+      [is_gray, ppf, &got_color_space](const std::string& key,
+                                       const std::string& value) -> Status {
+        if (key == "color_space") {
+          JxlColorEncoding c_original_external;
+          if (!ParseDescription(value, &c_original_external)) {
+            return JXL_FAILURE("Failed to apply color_space");
+          }
+          ppf->color_encoding = c_original_external;
+
+          if (is_gray !=
+              (ppf->color_encoding.color_space == JXL_COLOR_SPACE_GRAY)) {
+            return JXL_FAILURE("mismatch between file and color_space hint");
+          }
+
+          got_color_space = true;
+        } else if (key == "icc_pathname") {
+          JXL_RETURN_IF_ERROR(ReadFile(value, &ppf->icc));
+          got_color_space = true;
+        } else {
+          JXL_WARNING("Ignoring %s hint", key.c_str());
+        }
+        return true;
+      }));
+
+  if (!got_color_space) {
+    ppf->color_encoding.color_space =
+        is_gray ? JXL_COLOR_SPACE_GRAY : JXL_COLOR_SPACE_RGB;
+    ppf->color_encoding.white_point = JXL_WHITE_POINT_D65;
+    ppf->color_encoding.primaries = JXL_PRIMARIES_SRGB;
+    ppf->color_encoding.transfer_function = JXL_TRANSFER_FUNCTION_SRGB;
+  }
+
+  return true;
+}
+
+}  // namespace extras
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/extras/dec/color_hints.h b/third_party/jpeg-xl/lib/extras/dec/color_hints.h
new file mode 100644
index 0000000000..9c7de884f9
--- /dev/null
+++ b/third_party/jpeg-xl/lib/extras/dec/color_hints.h
@@ -0,0 +1,72 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_EXTRAS_COLOR_HINTS_H_
+#define LIB_EXTRAS_COLOR_HINTS_H_
+
+// Not all the formats implemented in the extras lib support bundling color
+// information into the file, and those that support it may not have it.
+// To allow attaching color information to those file formats the caller can
+// define these color hints.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <string>
+#include <vector>
+
+#include "lib/extras/packed_image.h"
+#include "lib/jxl/base/status.h"
+
+namespace jxl {
+namespace extras {
+
+class ColorHints {
+ public:
+  // key=color_space, value=Description(c/pp): specify the ColorEncoding of
+  //   the pixels for decoding. Otherwise, if the codec did not obtain an ICC
+  //   profile from the image, assume sRGB.
+  //
+  // Strings are taken from the command line, so avoid spaces for convenience.
+  void Add(const std::string& key, const std::string& value) {
+    kv_.emplace_back(key, value);
+  }
+
+  // Calls `func(key, value)` for each key/value in the order they were added,
+  // returning false immediately if `func` returns false.
+  template <class Func>
+  Status Foreach(const Func& func) const {
+    for (const KeyValue& kv : kv_) {
+      Status ok = func(kv.key, kv.value);
+      if (!ok) {
+        return JXL_FAILURE("ColorHints::Foreach returned false");
+      }
+    }
+    return true;
+  }
+
+ private:
+  // Splitting into key/value avoids parsing in each codec.
+  struct KeyValue {
+    KeyValue(std::string key, std::string value)
+        : key(std::move(key)), value(std::move(value)) {}
+
+    std::string key;
+    std::string value;
+  };
+
+  std::vector<KeyValue> kv_;
+};
+
+// Apply the color hints to the decoded image in PackedPixelFile if any.
+// color_already_set tells whether the color encoding was already set, in which
+// case the hints are ignored if any hint is passed.
+Status ApplyColorHints(const ColorHints& color_hints, bool color_already_set,
+                       bool is_gray, PackedPixelFile* ppf);
+
+}  // namespace extras
+}  // namespace jxl
+
+#endif  // LIB_EXTRAS_COLOR_HINTS_H_
diff --git a/third_party/jpeg-xl/lib/extras/dec/decode.cc b/third_party/jpeg-xl/lib/extras/dec/decode.cc
new file mode 100644
index 0000000000..e1b0365274
--- /dev/null
+++ b/third_party/jpeg-xl/lib/extras/dec/decode.cc
@@ -0,0 +1,132 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/extras/dec/decode.h"
+
+#include <locale>
+
+#if JPEGXL_ENABLE_APNG
+#include "lib/extras/dec/apng.h"
+#endif
+#if JPEGXL_ENABLE_EXR
+#include "lib/extras/dec/exr.h"
+#endif
+#if JPEGXL_ENABLE_GIF
+#include "lib/extras/dec/gif.h"
+#endif
+#if JPEGXL_ENABLE_JPEG
+#include "lib/extras/dec/jpg.h"
+#endif
+#include "lib/extras/dec/pgx.h"
+#include "lib/extras/dec/pnm.h"
+
+namespace jxl {
+namespace extras {
+namespace {
+
+// Any valid encoding is larger (ensures codecs can read the first few bytes)
+constexpr size_t kMinBytes = 9;
+
+}  // namespace
+
+std::vector<Codec> AvailableCodecs() {
+  std::vector<Codec> out;
+#if JPEGXL_ENABLE_APNG
+  out.push_back(Codec::kPNG);
+#endif
+#if JPEGXL_ENABLE_EXR
+  out.push_back(Codec::kEXR);
+#endif
+#if JPEGXL_ENABLE_GIF
+  out.push_back(Codec::kGIF);
+#endif
+#if JPEGXL_ENABLE_JPEG
+  out.push_back(Codec::kJPG);
+#endif
+  out.push_back(Codec::kPGX);
+  out.push_back(Codec::kPNM);
+  return out;
+}
+
+Codec CodecFromExtension(std::string extension,
+                         size_t* JXL_RESTRICT bits_per_sample) {
+  std::transform(
+      extension.begin(), extension.end(), extension.begin(),
+      [](char c) { return std::tolower(c, std::locale::classic()); });
+  if (extension == ".png") return Codec::kPNG;
+
+  if (extension == ".jpg") return Codec::kJPG;
+  if (extension == ".jpeg") return Codec::kJPG;
+
+  if (extension == ".pgx") return Codec::kPGX;
+
+  if (extension == ".pam") return Codec::kPNM;
+  if (extension == ".pnm") return Codec::kPNM;
+  if (extension == ".pgm") return Codec::kPNM;
+  if (extension == ".ppm") return Codec::kPNM;
+  if (extension == ".pfm") {
+    if (bits_per_sample != nullptr) *bits_per_sample = 32;
+    return Codec::kPNM;
+  }
+
+  if (extension == ".gif") return Codec::kGIF;
+
+  if (extension == ".exr") return Codec::kEXR;
+
+  return Codec::kUnknown;
+}
+
+Status DecodeBytes(const Span<const uint8_t> bytes,
+                   const ColorHints& color_hints, extras::PackedPixelFile* ppf,
+                   const SizeConstraints* constraints, Codec* orig_codec) {
+  if (bytes.size() < kMinBytes) return JXL_FAILURE("Too few bytes");
+
+  *ppf = extras::PackedPixelFile();
+
+  // Default values when not set by decoders.
+  ppf->info.uses_original_profile = true;
+  ppf->info.orientation = JXL_ORIENT_IDENTITY;
+
+  const auto choose_codec = [&]() -> Codec {
+#if JPEGXL_ENABLE_APNG
+    if (DecodeImageAPNG(bytes, color_hints, ppf, constraints)) {
+      return Codec::kPNG;
+    }
+#endif
+    if (DecodeImagePGX(bytes, color_hints, ppf, constraints)) {
+      return Codec::kPGX;
+    }
+    if (DecodeImagePNM(bytes, color_hints, ppf, constraints)) {
+      return Codec::kPNM;
+    }
+#if JPEGXL_ENABLE_GIF
+    if (DecodeImageGIF(bytes, color_hints, ppf, constraints)) {
+      return Codec::kGIF;
+    }
+#endif
+#if JPEGXL_ENABLE_JPEG
+    if (DecodeImageJPG(bytes, color_hints, ppf, constraints)) {
+      return Codec::kJPG;
+    }
+#endif
+#if JPEGXL_ENABLE_EXR
+    if (DecodeImageEXR(bytes, color_hints, ppf, constraints)) {
+      return Codec::kEXR;
+    }
+#endif
+    return Codec::kUnknown;
+  };
+
+  Codec codec = choose_codec();
+  if (codec == Codec::kUnknown) {
+    return JXL_FAILURE("Codecs failed to decode");
+  }
+  if (orig_codec) *orig_codec = codec;
+
+  return true;
+}
+
+}  // namespace extras
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/extras/dec/decode.h b/third_party/jpeg-xl/lib/extras/dec/decode.h
new file mode 100644
index 0000000000..f802041026
--- /dev/null
+++ b/third_party/jpeg-xl/lib/extras/dec/decode.h
@@ -0,0 +1,55 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_EXTRAS_DEC_DECODE_H_
+#define LIB_EXTRAS_DEC_DECODE_H_
+
+// Facade for image decoders (PNG, PNM, ...).
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <string>
+#include <vector>
+
+#include "lib/extras/dec/color_hints.h"
+#include "lib/jxl/base/span.h"
+#include "lib/jxl/base/status.h"
+
+namespace jxl {
+
+struct SizeConstraints;
+
+namespace extras {
+
+// Codecs supported by CodecInOut::Encode.
+enum class Codec : uint32_t {
+  kUnknown,  // for CodecFromExtension
+  kPNG,
+  kPNM,
+  kPGX,
+  kJPG,
+  kGIF,
+  kEXR
+};
+
+std::vector<Codec> AvailableCodecs();
+
+// If and only if extension is ".pfm", *bits_per_sample is updated to 32 so
+// that Encode() would encode to PFM instead of PPM.
+Codec CodecFromExtension(std::string extension,
+                         size_t* JXL_RESTRICT bits_per_sample = nullptr);
+
+// Decodes "bytes" info *ppf.
+// color_space_hint may specify the color space, otherwise, defaults to sRGB.
+Status DecodeBytes(Span<const uint8_t> bytes, const ColorHints& color_hints,
+                   extras::PackedPixelFile* ppf,
+                   const SizeConstraints* constraints = nullptr,
+                   Codec* orig_codec = nullptr);
+
+}  // namespace extras
+}  // namespace jxl
+
+#endif  // LIB_EXTRAS_DEC_DECODE_H_
diff --git a/third_party/jpeg-xl/lib/extras/dec/exr.cc b/third_party/jpeg-xl/lib/extras/dec/exr.cc
new file mode 100644
index 0000000000..f174ccd0c9
--- /dev/null
+++ b/third_party/jpeg-xl/lib/extras/dec/exr.cc
@@ -0,0 +1,184 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/extras/dec/exr.h"
+
+#include <ImfChromaticitiesAttribute.h>
+#include <ImfIO.h>
+#include <ImfRgbaFile.h>
+#include <ImfStandardAttributes.h>
+
+#include <vector>
+
+namespace jxl {
+namespace extras {
+
+namespace {
+
+namespace OpenEXR = OPENEXR_IMF_NAMESPACE;
+namespace Imath = IMATH_NAMESPACE;
+
+// OpenEXR::Int64 is deprecated in favor of using uint64_t directly, but using
+// uint64_t as recommended causes build failures with previous OpenEXR versions
+// on macOS, where the definition for OpenEXR::Int64 was actually not equivalent
+// to uint64_t. This alternative should work in all cases.
+using ExrInt64 = decltype(std::declval<OpenEXR::IStream>().tellg());
+
+constexpr int kExrBitsPerSample = 16;
+constexpr int kExrAlphaBits = 16;
+
+class InMemoryIStream : public OpenEXR::IStream {
+ public:
+  // The data pointed to by `bytes` must outlive the InMemoryIStream.
+  explicit InMemoryIStream(const Span<const uint8_t> bytes)
+      : IStream(/*fileName=*/""), bytes_(bytes) {}
+
+  bool isMemoryMapped() const override { return true; }
+  char* readMemoryMapped(const int n) override {
+    JXL_ASSERT(pos_ + n <= bytes_.size());
+    char* const result =
+        const_cast<char*>(reinterpret_cast<const char*>(bytes_.data() + pos_));
+    pos_ += n;
+    return result;
+  }
+  bool read(char c[], const int n) override {
+    std::copy_n(readMemoryMapped(n), n, c);
+    return pos_ < bytes_.size();
+  }
+
+  ExrInt64 tellg() override { return pos_; }
+  void seekg(const ExrInt64 pos) override {
+    JXL_ASSERT(pos + 1 <= bytes_.size());
+    pos_ = pos;
+  }
+
+ private:
+  const Span<const uint8_t> bytes_;
+  size_t pos_ = 0;
+};
+
+}  // namespace
+
+Status DecodeImageEXR(Span<const uint8_t> bytes, const ColorHints& color_hints,
+                      PackedPixelFile* ppf,
+                      const SizeConstraints* constraints) {
+  InMemoryIStream is(bytes);
+
+#ifdef __EXCEPTIONS
+  std::unique_ptr<OpenEXR::RgbaInputFile> input_ptr;
+  try {
+    input_ptr.reset(new OpenEXR::RgbaInputFile(is));
+  } catch (...) {
+    return JXL_FAILURE("OpenEXR failed to parse input");
+  }
+  OpenEXR::RgbaInputFile& input = *input_ptr;
+#else
+  OpenEXR::RgbaInputFile input(is);
+#endif
+
+  if ((input.channels() & OpenEXR::RgbaChannels::WRITE_RGB) !=
+      OpenEXR::RgbaChannels::WRITE_RGB) {
+    return JXL_FAILURE("only RGB OpenEXR files are supported");
+  }
+  const bool has_alpha = (input.channels() & OpenEXR::RgbaChannels::WRITE_A) ==
+                         OpenEXR::RgbaChannels::WRITE_A;
+
+  const float intensity_target = OpenEXR::hasWhiteLuminance(input.header())
+                                     ? OpenEXR::whiteLuminance(input.header())
+                                     : 0;
+
+  auto image_size = input.displayWindow().size();
+  // Size is computed as max - min, but both bounds are inclusive.
+  ++image_size.x;
+  ++image_size.y;
+
+  ppf->info.xsize = image_size.x;
+  ppf->info.ysize = image_size.y;
+  ppf->info.num_color_channels = 3;
+
+  const JxlDataType data_type =
+      kExrBitsPerSample == 16 ? JXL_TYPE_FLOAT16 : JXL_TYPE_FLOAT;
+  const JxlPixelFormat format{
+      /*num_channels=*/3u + (has_alpha ? 1u : 0u),
+      /*data_type=*/data_type,
+      /*endianness=*/JXL_NATIVE_ENDIAN,
+      /*align=*/0,
+  };
+  ppf->frames.clear();
+  // Allocates the frame buffer.
+  ppf->frames.emplace_back(image_size.x, image_size.y, format);
+  const auto& frame = ppf->frames.back();
+
+  const int row_size = input.dataWindow().size().x + 1;
+  // Number of rows to read at a time.
+  // https://www.openexr.com/documentation/ReadingAndWritingImageFiles.pdf
+  // recommends reading the whole file at once.
+  const int y_chunk_size = input.displayWindow().size().y + 1;
+  std::vector<OpenEXR::Rgba> input_rows(row_size * y_chunk_size);
+  for (int start_y =
+           std::max(input.dataWindow().min.y, input.displayWindow().min.y);
+       start_y <=
+       std::min(input.dataWindow().max.y, input.displayWindow().max.y);
+       start_y += y_chunk_size) {
+    // Inclusive.
+    const int end_y = std::min(
+        start_y + y_chunk_size - 1,
+        std::min(input.dataWindow().max.y, input.displayWindow().max.y));
+    input.setFrameBuffer(
+        input_rows.data() - input.dataWindow().min.x - start_y * row_size,
+        /*xStride=*/1, /*yStride=*/row_size);
+    input.readPixels(start_y, end_y);
+    for (int exr_y = start_y; exr_y <= end_y; ++exr_y) {
+      const int image_y = exr_y - input.displayWindow().min.y;
+      const OpenEXR::Rgba* const JXL_RESTRICT input_row =
+          &input_rows[(exr_y - start_y) * row_size];
+      uint8_t* row = static_cast<uint8_t*>(frame.color.pixels()) +
+                     frame.color.stride * image_y;
+      const uint32_t pixel_size =
+          (3 + (has_alpha ? 1 : 0)) * kExrBitsPerSample / 8;
+      for (int exr_x =
+               std::max(input.dataWindow().min.x, input.displayWindow().min.x);
+           exr_x <=
+           std::min(input.dataWindow().max.x, input.displayWindow().max.x);
+           ++exr_x) {
+        const int image_x = exr_x - input.displayWindow().min.x;
+        memcpy(row + image_x * pixel_size,
+               input_row + (exr_x - input.dataWindow().min.x), pixel_size);
+      }
+    }
+  }
+
+  ppf->color_encoding.transfer_function = JXL_TRANSFER_FUNCTION_LINEAR;
+  ppf->color_encoding.color_space = JXL_COLOR_SPACE_RGB;
+  ppf->color_encoding.primaries = JXL_PRIMARIES_SRGB;
+  ppf->color_encoding.white_point = JXL_WHITE_POINT_D65;
+  if (OpenEXR::hasChromaticities(input.header())) {
+    ppf->color_encoding.primaries = JXL_PRIMARIES_CUSTOM;
+    ppf->color_encoding.white_point = JXL_WHITE_POINT_CUSTOM;
+    const auto& chromaticities = OpenEXR::chromaticities(input.header());
+    ppf->color_encoding.primaries_red_xy[0] = chromaticities.red.x;
+    ppf->color_encoding.primaries_red_xy[1] = chromaticities.red.y;
+    ppf->color_encoding.primaries_green_xy[0] = chromaticities.green.x;
+    ppf->color_encoding.primaries_green_xy[1] = chromaticities.green.y;
+    ppf->color_encoding.primaries_blue_xy[0] = chromaticities.blue.x;
+    ppf->color_encoding.primaries_blue_xy[1] = chromaticities.blue.y;
+    ppf->color_encoding.white_point_xy[0] = chromaticities.white.x;
+    ppf->color_encoding.white_point_xy[1] = chromaticities.white.y;
+  }
+
+  // EXR uses binary16 or binary32 floating point format.
+  ppf->info.bits_per_sample = kExrBitsPerSample;
+  ppf->info.exponent_bits_per_sample = kExrBitsPerSample == 16 ? 5 : 8;
+  if (has_alpha) {
+    ppf->info.alpha_bits = kExrAlphaBits;
+    ppf->info.alpha_exponent_bits = ppf->info.exponent_bits_per_sample;
+    ppf->info.alpha_premultiplied = true;
+  }
+  ppf->info.intensity_target = intensity_target;
+  return true;
+}
+
+}  // namespace extras
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/extras/dec/exr.h b/third_party/jpeg-xl/lib/extras/dec/exr.h
new file mode 100644
index 0000000000..6b7c5b714d
--- /dev/null
+++ b/third_party/jpeg-xl/lib/extras/dec/exr.h
@@ -0,0 +1,32 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_EXTRAS_DEC_EXR_H_
+#define LIB_EXTRAS_DEC_EXR_H_
+
+// Decodes OpenEXR images in memory.
+
+#include "lib/extras/dec/color_hints.h"
+#include "lib/extras/packed_image.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/padded_bytes.h"
+#include "lib/jxl/base/span.h"
+#include "lib/jxl/base/status.h"
+
+namespace jxl {
+
+struct SizeConstraints;
+
+namespace extras {
+
+// Decodes `bytes` into `ppf`. color_hints are ignored.
+Status DecodeImageEXR(Span<const uint8_t> bytes, const ColorHints& color_hints,
+                      PackedPixelFile* ppf,
+                      const SizeConstraints* constraints = nullptr);
+
+}  // namespace extras
+}  // namespace jxl
+
+#endif  // LIB_EXTRAS_DEC_EXR_H_
diff --git a/third_party/jpeg-xl/lib/extras/dec/gif.cc b/third_party/jpeg-xl/lib/extras/dec/gif.cc
new file mode 100644
index 0000000000..4593382b92
--- /dev/null
+++ b/third_party/jpeg-xl/lib/extras/dec/gif.cc
@@ -0,0 +1,400 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/extras/dec/gif.h"
+
+#include <gif_lib.h>
+#include <jxl/codestream_header.h>
+#include <string.h>
+
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include "lib/extras/size_constraints.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/sanitizers.h"
+
+namespace jxl {
+namespace extras {
+
+namespace {
+
+struct ReadState {
+  Span<const uint8_t> bytes;
+};
+
+struct DGifCloser {
+  void operator()(GifFileType* const ptr) const { DGifCloseFile(ptr, nullptr); }
+};
+using GifUniquePtr = std::unique_ptr<GifFileType, DGifCloser>;
+
+struct PackedRgba {
+  uint8_t r, g, b, a;
+};
+
+struct PackedRgb {
+  uint8_t r, g, b;
+};
+
+void ensure_have_alpha(PackedFrame* frame) {
+  if (!frame->extra_channels.empty()) return;
+  const JxlPixelFormat alpha_format{
+      /*num_channels=*/1u,
+      /*data_type=*/JXL_TYPE_UINT8,
+      /*endianness=*/JXL_NATIVE_ENDIAN,
+      /*align=*/0,
+  };
+  frame->extra_channels.emplace_back(frame->color.xsize, frame->color.ysize,
+                                     alpha_format);
+  // We need to set opaque-by-default.
+  std::fill_n(static_cast<uint8_t*>(frame->extra_channels[0].pixels()),
+              frame->color.xsize * frame->color.ysize, 255u);
+}
+
+}  // namespace
+
+Status DecodeImageGIF(Span<const uint8_t> bytes, const ColorHints& color_hints,
+                      PackedPixelFile* ppf,
+                      const SizeConstraints* constraints) {
+  int error = GIF_OK;
+  ReadState state = {bytes};
+  const auto ReadFromSpan = [](GifFileType* const gif, GifByteType* const bytes,
+                               int n) {
+    ReadState* const state = reinterpret_cast<ReadState*>(gif->UserData);
+    // giflib API requires the input size `n` to be signed int.
+    if (static_cast<size_t>(n) > state->bytes.size()) {
+      n = state->bytes.size();
+    }
+    memcpy(bytes, state->bytes.data(), n);
+    state->bytes.remove_prefix(n);
+    return n;
+  };
+  GifUniquePtr gif(DGifOpen(&state, ReadFromSpan, &error));
+  if (gif == nullptr) {
+    if (error == D_GIF_ERR_NOT_GIF_FILE) {
+      // Not an error.
+      return false;
+    } else {
+      return JXL_FAILURE("Failed to read GIF: %s", GifErrorString(error));
+    }
+  }
+  error = DGifSlurp(gif.get());
+  if (error != GIF_OK) {
+    return JXL_FAILURE("Failed to read GIF: %s", GifErrorString(gif->Error));
+  }
+
+  msan::UnpoisonMemory(gif.get(), sizeof(*gif));
+  if (gif->SColorMap) {
+    msan::UnpoisonMemory(gif->SColorMap, sizeof(*gif->SColorMap));
+    msan::UnpoisonMemory(
+        gif->SColorMap->Colors,
+        sizeof(*gif->SColorMap->Colors) * gif->SColorMap->ColorCount);
+  }
+  msan::UnpoisonMemory(gif->SavedImages,
+                       sizeof(*gif->SavedImages) * gif->ImageCount);
+
+  JXL_RETURN_IF_ERROR(
+      VerifyDimensions<uint32_t>(constraints, gif->SWidth, gif->SHeight));
+  uint64_t total_pixel_count =
+      static_cast<uint64_t>(gif->SWidth) * gif->SHeight;
+  for (int i = 0; i < gif->ImageCount; ++i) {
+    const SavedImage& image = gif->SavedImages[i];
+    uint32_t w = image.ImageDesc.Width;
+    uint32_t h = image.ImageDesc.Height;
+    JXL_RETURN_IF_ERROR(VerifyDimensions<uint32_t>(constraints, w, h));
+    uint64_t pixel_count = static_cast<uint64_t>(w) * h;
+    if (total_pixel_count + pixel_count < total_pixel_count) {
+      return JXL_FAILURE("Image too big");
+    }
+    total_pixel_count += pixel_count;
+    if (constraints && (total_pixel_count > constraints->dec_max_pixels)) {
+      return JXL_FAILURE("Image too big");
+    }
+  }
+
+  if (!gif->SColorMap) {
+    for (int i = 0; i < gif->ImageCount; ++i) {
+      if (!gif->SavedImages[i].ImageDesc.ColorMap) {
+        return JXL_FAILURE("Missing GIF color map");
+      }
+    }
+  }
+
+  if (gif->ImageCount > 1) {
+    ppf->info.have_animation = true;
+    // Delays in GIF are specified in 100ths of a second.
+    ppf->info.animation.tps_numerator = 100;
+    ppf->info.animation.tps_denominator = 1;
+  }
+
+  ppf->frames.clear();
+  ppf->frames.reserve(gif->ImageCount);
+
+  ppf->info.xsize = gif->SWidth;
+  ppf->info.ysize = gif->SHeight;
+  ppf->info.bits_per_sample = 8;
+  ppf->info.exponent_bits_per_sample = 0;
+  // alpha_bits is later set to 8 if we find a frame with transparent pixels.
+  ppf->info.alpha_bits = 0;
+  ppf->info.alpha_exponent_bits = 0;
+  JXL_RETURN_IF_ERROR(ApplyColorHints(color_hints, /*color_already_set=*/false,
+                                      /*is_gray=*/false, ppf));
+
+  ppf->info.num_color_channels = 3;
+
+  // Pixel format for the 'canvas' onto which we paint
+  // the (potentially individually cropped) GIF frames
+  // of an animation.
+  const JxlPixelFormat canvas_format{
+      /*num_channels=*/4u,
+      /*data_type=*/JXL_TYPE_UINT8,
+      /*endianness=*/JXL_NATIVE_ENDIAN,
+      /*align=*/0,
+  };
+
+  // Pixel format for the JXL PackedFrame that goes into the
+  // PackedPixelFile. Here, we use 3 color channels, and provide
+  // the alpha channel as an extra_channel wherever it is used.
+  const JxlPixelFormat packed_frame_format{
+      /*num_channels=*/3u,
+      /*data_type=*/JXL_TYPE_UINT8,
+      /*endianness=*/JXL_NATIVE_ENDIAN,
+      /*align=*/0,
+  };
+
+  GifColorType background_color;
+  if (gif->SColorMap == nullptr ||
+      gif->SBackGroundColor >= gif->SColorMap->ColorCount) {
+    background_color = {0, 0, 0};
+  } else {
+    background_color = gif->SColorMap->Colors[gif->SBackGroundColor];
+  }
+  const PackedRgba background_rgba{background_color.Red, background_color.Green,
+                                   background_color.Blue, 0};
+  PackedFrame canvas(gif->SWidth, gif->SHeight, canvas_format);
+  std::fill_n(static_cast<PackedRgba*>(canvas.color.pixels()),
+              canvas.color.xsize * canvas.color.ysize, background_rgba);
+  Rect canvas_rect{0, 0, canvas.color.xsize, canvas.color.ysize};
+
+  Rect previous_rect_if_restore_to_background;
+
+  bool replace = true;
+  bool last_base_was_none = true;
+  for (int i = 0; i < gif->ImageCount; ++i) {
+    const SavedImage& image = gif->SavedImages[i];
+    msan::UnpoisonMemory(image.RasterBits, sizeof(*image.RasterBits) *
+                                               image.ImageDesc.Width *
+                                               image.ImageDesc.Height);
+    const Rect image_rect(image.ImageDesc.Left, image.ImageDesc.Top,
+                          image.ImageDesc.Width, image.ImageDesc.Height);
+
+    Rect total_rect;
+    if (previous_rect_if_restore_to_background.xsize() != 0 ||
+        previous_rect_if_restore_to_background.ysize() != 0) {
+      const size_t xbegin = std::min(
+          image_rect.x0(), previous_rect_if_restore_to_background.x0());
+      const size_t ybegin = std::min(
+          image_rect.y0(), previous_rect_if_restore_to_background.y0());
+      const size_t xend =
+          std::max(image_rect.x0() + image_rect.xsize(),
+                   previous_rect_if_restore_to_background.x0() +
+                       previous_rect_if_restore_to_background.xsize());
+      const size_t yend =
+          std::max(image_rect.y0() + image_rect.ysize(),
+                   previous_rect_if_restore_to_background.y0() +
+                       previous_rect_if_restore_to_background.ysize());
+      total_rect = Rect(xbegin, ybegin, xend - xbegin, yend - ybegin);
+      previous_rect_if_restore_to_background = Rect();
+      replace = true;
+    } else {
+      total_rect = image_rect;
+      replace = false;
+    }
+    if (!image_rect.IsInside(canvas_rect)) {
+      return JXL_FAILURE("GIF frame extends outside of the canvas");
+    }
+
+    // Allocates the frame buffer.
+    ppf->frames.emplace_back(total_rect.xsize(), total_rect.ysize(),
+                             packed_frame_format);
+    PackedFrame* frame = &ppf->frames.back();
+
+    // We cannot tell right from the start whether there will be a
+    // need for an alpha channel. This is discovered only as soon as
+    // we see a transparent pixel. We hence initialize alpha lazily.
+    auto set_pixel_alpha = [&frame](size_t x, size_t y, uint8_t a) {
+      // If we do not have an alpha-channel and a==255 (fully opaque),
+      // we can skip setting this pixel-value and rely on
+      // "no alpha channel = no transparency".
+      if (a == 255 && !frame->extra_channels.empty()) return;
+      ensure_have_alpha(frame);
+      static_cast<uint8_t*>(
+          frame->extra_channels[0].pixels())[y * frame->color.xsize + x] = a;
+    };
+
+    const ColorMapObject* const color_map =
+        image.ImageDesc.ColorMap ? image.ImageDesc.ColorMap : gif->SColorMap;
+    JXL_CHECK(color_map);
+    msan::UnpoisonMemory(color_map, sizeof(*color_map));
+    msan::UnpoisonMemory(color_map->Colors,
+                         sizeof(*color_map->Colors) * color_map->ColorCount);
+    GraphicsControlBlock gcb;
+    DGifSavedExtensionToGCB(gif.get(), i, &gcb);
+    msan::UnpoisonMemory(&gcb, sizeof(gcb));
+    bool is_full_size = total_rect.x0() == 0 && total_rect.y0() == 0 &&
+                        total_rect.xsize() == canvas.color.xsize &&
+                        total_rect.ysize() == canvas.color.ysize;
+    if (ppf->info.have_animation) {
+      frame->frame_info.duration = gcb.DelayTime;
+      frame->frame_info.layer_info.have_crop = static_cast<int>(!is_full_size);
+      frame->frame_info.layer_info.crop_x0 = total_rect.x0();
+      frame->frame_info.layer_info.crop_y0 = total_rect.y0();
+      frame->frame_info.layer_info.xsize = frame->color.xsize;
+      frame->frame_info.layer_info.ysize = frame->color.ysize;
+      if (last_base_was_none) {
+        replace = true;
+      }
+      frame->frame_info.layer_info.blend_info.blendmode =
+          replace ? JXL_BLEND_REPLACE : JXL_BLEND_BLEND;
+      // We always only reference at most the last frame
+      frame->frame_info.layer_info.blend_info.source =
+          last_base_was_none ? 0u : 1u;
+      frame->frame_info.layer_info.blend_info.clamp = 1;
+      frame->frame_info.layer_info.blend_info.alpha = 0;
+      // TODO(veluca): this could in principle be implemented.
+      if (last_base_was_none &&
+          (total_rect.x0() != 0 || total_rect.y0() != 0 ||
+           total_rect.xsize() != canvas.color.xsize ||
+           total_rect.ysize() != canvas.color.ysize || !replace)) {
+        return JXL_FAILURE(
+            "GIF with dispose-to-0 is not supported for non-full or "
+            "blended frames");
+      }
+      switch (gcb.DisposalMode) {
+        case DISPOSE_DO_NOT:
+        case DISPOSE_BACKGROUND:
+          frame->frame_info.layer_info.save_as_reference = 1u;
+          last_base_was_none = false;
+          break;
+        case DISPOSE_PREVIOUS:
+          frame->frame_info.layer_info.save_as_reference = 0u;
+          break;
+        default:
+          frame->frame_info.layer_info.save_as_reference = 0u;
+          last_base_was_none = true;
+      }
+    }
+
+    // Update the canvas by creating a copy first.
+    PackedImage new_canvas_image(canvas.color.xsize, canvas.color.ysize,
+                                 canvas.color.format);
+    memcpy(new_canvas_image.pixels(), canvas.color.pixels(),
+           new_canvas_image.pixels_size);
+    for (size_t y = 0, byte_index = 0; y < image_rect.ysize(); ++y) {
+      // Assumes format.align == 0. row points to the beginning of the y row in
+      // the image_rect.
+      PackedRgba* row = static_cast<PackedRgba*>(new_canvas_image.pixels()) +
+                        (y + image_rect.y0()) * new_canvas_image.xsize +
+                        image_rect.x0();
+      for (size_t x = 0; x < image_rect.xsize(); ++x, ++byte_index) {
+        const GifByteType byte = image.RasterBits[byte_index];
+        if (byte >= color_map->ColorCount) {
+          return JXL_FAILURE("GIF color is out of bounds");
+        }
+
+        if (byte == gcb.TransparentColor) continue;
+        GifColorType color = color_map->Colors[byte];
+        row[x].r = color.Red;
+        row[x].g = color.Green;
+        row[x].b = color.Blue;
+        row[x].a = 255;
+      }
+    }
+    const PackedImage& sub_frame_image = frame->color;
+    if (replace) {
+      // Copy from the new canvas image to the subframe
+      for (size_t y = 0; y < total_rect.ysize(); ++y) {
+        const PackedRgba* row_in =
+            static_cast<const PackedRgba*>(new_canvas_image.pixels()) +
+            (y + total_rect.y0()) * new_canvas_image.xsize + total_rect.x0();
+        PackedRgb* row_out = static_cast<PackedRgb*>(sub_frame_image.pixels()) +
+                             y * sub_frame_image.xsize;
+        for (size_t x = 0; x < sub_frame_image.xsize; ++x) {
+          row_out[x].r = row_in[x].r;
+          row_out[x].g = row_in[x].g;
+          row_out[x].b = row_in[x].b;
+          set_pixel_alpha(x, y, row_in[x].a);
+        }
+      }
+    } else {
+      for (size_t y = 0, byte_index = 0; y < image_rect.ysize(); ++y) {
+        // Assumes format.align == 0
+        PackedRgb* row = static_cast<PackedRgb*>(sub_frame_image.pixels()) +
+                         y * sub_frame_image.xsize;
+        for (size_t x = 0; x < image_rect.xsize(); ++x, ++byte_index) {
+          const GifByteType byte = image.RasterBits[byte_index];
+          if (byte > color_map->ColorCount) {
+            return JXL_FAILURE("GIF color is out of bounds");
+          }
+          if (byte == gcb.TransparentColor) {
+            row[x].r = 0;
+            row[x].g = 0;
+            row[x].b = 0;
+            set_pixel_alpha(x, y, 0);
+            continue;
+          }
+          GifColorType color = color_map->Colors[byte];
+          row[x].r = color.Red;
+          row[x].g = color.Green;
+          row[x].b = color.Blue;
+          set_pixel_alpha(x, y, 255);
+        }
+      }
+    }
+
+    if (!frame->extra_channels.empty()) {
+      ppf->info.alpha_bits = 8;
+    }
+
+    switch (gcb.DisposalMode) {
+      case DISPOSE_DO_NOT:
+        canvas.color = std::move(new_canvas_image);
+        break;
+
+      case DISPOSE_BACKGROUND:
+        std::fill_n(static_cast<PackedRgba*>(canvas.color.pixels()),
+                    canvas.color.xsize * canvas.color.ysize, background_rgba);
+        previous_rect_if_restore_to_background = image_rect;
+        break;
+
+      case DISPOSE_PREVIOUS:
+        break;
+
+      case DISPOSAL_UNSPECIFIED:
+      default:
+        std::fill_n(static_cast<PackedRgba*>(canvas.color.pixels()),
+                    canvas.color.xsize * canvas.color.ysize, background_rgba);
+    }
+  }
+  // Finally, if any frame has an alpha-channel, every frame will need
+  // to have an alpha-channel.
+  bool seen_alpha = false;
+  for (const PackedFrame& frame : ppf->frames) {
+    if (!frame.extra_channels.empty()) {
+      seen_alpha = true;
+      break;
+    }
+  }
+  if (seen_alpha) {
+    for (PackedFrame& frame : ppf->frames) {
+      ensure_have_alpha(&frame);
+    }
+  }
+  return true;
+}
+
+}  // namespace extras
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/extras/dec/gif.h b/third_party/jpeg-xl/lib/extras/dec/gif.h
new file mode 100644
index 0000000000..e217d617a7
--- /dev/null
+++ b/third_party/jpeg-xl/lib/extras/dec/gif.h
@@ -0,0 +1,33 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_EXTRAS_DEC_GIF_H_
+#define LIB_EXTRAS_DEC_GIF_H_
+
+// Decodes GIF images in memory.
+
+#include <stdint.h>
+
+#include "lib/extras/dec/color_hints.h"
+#include "lib/extras/packed_image.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/span.h"
+#include "lib/jxl/base/status.h"
+
+namespace jxl {
+
+struct SizeConstraints;
+
+namespace extras {
+
+// Decodes `bytes` into `ppf`. color_hints are ignored.
+Status DecodeImageGIF(Span<const uint8_t> bytes, const ColorHints& color_hints,
+                      PackedPixelFile* ppf,
+                      const SizeConstraints* constraints = nullptr);
+
+}  // namespace extras
+}  // namespace jxl
+
+#endif  // LIB_EXTRAS_DEC_GIF_H_
diff --git a/third_party/jpeg-xl/lib/extras/dec/jpegli.cc b/third_party/jpeg-xl/lib/extras/dec/jpegli.cc
new file mode 100644
index 0000000000..ffa1b79c25
--- /dev/null
+++ b/third_party/jpeg-xl/lib/extras/dec/jpegli.cc
@@ -0,0 +1,271 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/extras/dec/jpegli.h"
+
+#include <setjmp.h>
+#include <stdint.h>
+
+#include <algorithm>
+#include <numeric>
+#include <utility>
+#include <vector>
+
+#include "lib/jpegli/decode.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/sanitizers.h"
+
+namespace jxl {
+namespace extras {
+
+namespace {
+
+constexpr unsigned char kExifSignature[6] = {0x45, 0x78, 0x69,
+                                             0x66, 0x00, 0x00};
+constexpr int kExifMarker = JPEG_APP0 + 1;
+constexpr int kICCMarker = JPEG_APP0 + 2;
+
+static inline bool IsJPG(const std::vector<uint8_t>& bytes) {
+  if (bytes.size() < 2) return false;
+  if (bytes[0] != 0xFF || bytes[1] != 0xD8) return false;
+  return true;
+}
+
+bool MarkerIsExif(const jpeg_saved_marker_ptr marker) {
+  return marker->marker == kExifMarker &&
+         marker->data_length >= sizeof kExifSignature + 2 &&
+         std::equal(std::begin(kExifSignature), std::end(kExifSignature),
+                    marker->data);
+}
+
+Status ReadICCProfile(jpeg_decompress_struct* const cinfo,
+                      std::vector<uint8_t>* const icc) {
+  uint8_t* icc_data_ptr;
+  unsigned int icc_data_len;
+  if (jpegli_read_icc_profile(cinfo, &icc_data_ptr, &icc_data_len)) {
+    icc->assign(icc_data_ptr, icc_data_ptr + icc_data_len);
+    free(icc_data_ptr);
+    return true;
+  }
+  return false;
+}
+
+void ReadExif(jpeg_decompress_struct* const cinfo,
+              std::vector<uint8_t>* const exif) {
+  constexpr size_t kExifSignatureSize = sizeof kExifSignature;
+  for (jpeg_saved_marker_ptr marker = cinfo->marker_list; marker != nullptr;
+       marker = marker->next) {
+    // marker is initialized by libjpeg, which we are not instrumenting with
+    // msan.
+    msan::UnpoisonMemory(marker, sizeof(*marker));
+    msan::UnpoisonMemory(marker->data, marker->data_length);
+    if (!MarkerIsExif(marker)) continue;
+    size_t marker_length = marker->data_length - kExifSignatureSize;
+    exif->resize(marker_length);
+    std::copy_n(marker->data + kExifSignatureSize, marker_length, exif->data());
+    return;
+  }
+}
+
+JpegliDataType ConvertDataType(JxlDataType type) {
+  switch (type) {
+    case JXL_TYPE_UINT8:
+      return JPEGLI_TYPE_UINT8;
+    case JXL_TYPE_UINT16:
+      return JPEGLI_TYPE_UINT16;
+    case JXL_TYPE_FLOAT:
+      return JPEGLI_TYPE_FLOAT;
+    default:
+      return JPEGLI_TYPE_UINT8;
+  }
+}
+
+JpegliEndianness ConvertEndianness(JxlEndianness type) {
+  switch (type) {
+    case JXL_NATIVE_ENDIAN:
+      return JPEGLI_NATIVE_ENDIAN;
+    case JXL_BIG_ENDIAN:
+      return JPEGLI_BIG_ENDIAN;
+    case JXL_LITTLE_ENDIAN:
+      return JPEGLI_LITTLE_ENDIAN;
+    default:
+      return JPEGLI_NATIVE_ENDIAN;
+  }
+}
+
+JxlColorSpace ConvertColorSpace(J_COLOR_SPACE colorspace) {
+  switch (colorspace) {
+    case JCS_GRAYSCALE:
+      return JXL_COLOR_SPACE_GRAY;
+    case JCS_RGB:
+      return JXL_COLOR_SPACE_RGB;
+    default:
+      return JXL_COLOR_SPACE_UNKNOWN;
+  }
+}
+
+void MyErrorExit(j_common_ptr cinfo) {
+  jmp_buf* env = static_cast<jmp_buf*>(cinfo->client_data);
+  (*cinfo->err->output_message)(cinfo);
+  jpegli_destroy_decompress(reinterpret_cast<j_decompress_ptr>(cinfo));
+  longjmp(*env, 1);
+}
+
+void MyOutputMessage(j_common_ptr cinfo) {
+#if JXL_DEBUG_WARNING == 1
+  char buf[JMSG_LENGTH_MAX + 1];
+  (*cinfo->err->format_message)(cinfo, buf);
+  buf[JMSG_LENGTH_MAX] = 0;
+  JXL_WARNING("%s", buf);
+#endif
+}
+
+void UnmapColors(uint8_t* row, size_t xsize, int components,
+                 JSAMPARRAY colormap, size_t num_colors) {
+  JXL_CHECK(colormap != nullptr);
+  std::vector<uint8_t> tmp(xsize * components);
+  for (size_t x = 0; x < xsize; ++x) {
+    JXL_CHECK(row[x] < num_colors);
+    for (int c = 0; c < components; ++c) {
+      tmp[x * components + c] = colormap[c][row[x]];
+    }
+  }
+  memcpy(row, tmp.data(), tmp.size());
+}
+
+}  // namespace
+
+Status DecodeJpeg(const std::vector<uint8_t>& compressed,
+                  const JpegDecompressParams& dparams, ThreadPool* pool,
+                  PackedPixelFile* ppf) {
+  // Don't do anything for non-JPEG files (no need to report an error)
+  if (!IsJPG(compressed)) return false;
+
+  // TODO(veluca): use JPEGData also for pixels?
+
+  // We need to declare all the non-trivial destructor local variables before
+  // the call to setjmp().
+  std::unique_ptr<JSAMPLE[]> row;
+
+  jpeg_decompress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    // Setup error handling in jpeg library so we can deal with broken jpegs in
+    // the fuzzer.
+    jpeg_error_mgr jerr;
+    jmp_buf env;
+    cinfo.err = jpegli_std_error(&jerr);
+    jerr.error_exit = &MyErrorExit;
+    jerr.output_message = &MyOutputMessage;
+    if (setjmp(env)) {
+      return false;
+    }
+    cinfo.client_data = static_cast<void*>(&env);
+
+    jpegli_create_decompress(&cinfo);
+    jpegli_mem_src(&cinfo,
+                   reinterpret_cast<const unsigned char*>(compressed.data()),
+                   compressed.size());
+    jpegli_save_markers(&cinfo, kICCMarker, 0xFFFF);
+    jpegli_save_markers(&cinfo, kExifMarker, 0xFFFF);
+    const auto failure = [&cinfo](const char* str) -> Status {
+      jpegli_abort_decompress(&cinfo);
+      jpegli_destroy_decompress(&cinfo);
+      return JXL_FAILURE("%s", str);
+    };
+    jpegli_read_header(&cinfo, TRUE);
+    // Might cause CPU-zip bomb.
+    if (cinfo.arith_code) {
+      return failure("arithmetic code JPEGs are not supported");
+    }
+    int nbcomp = cinfo.num_components;
+    if (nbcomp != 1 && nbcomp != 3) {
+      return failure("unsupported number of components in JPEG");
+    }
+    if (dparams.force_rgb) {
+      cinfo.out_color_space = JCS_RGB;
+    } else if (dparams.force_grayscale) {
+      cinfo.out_color_space = JCS_GRAYSCALE;
+    }
+    if (!ReadICCProfile(&cinfo, &ppf->icc)) {
+      ppf->icc.clear();
+      // Default to SRGB
+      ppf->color_encoding.color_space =
+          ConvertColorSpace(cinfo.out_color_space);
+      ppf->color_encoding.white_point = JXL_WHITE_POINT_D65;
+      ppf->color_encoding.primaries = JXL_PRIMARIES_SRGB;
+      ppf->color_encoding.transfer_function = JXL_TRANSFER_FUNCTION_SRGB;
+      ppf->color_encoding.rendering_intent = JXL_RENDERING_INTENT_PERCEPTUAL;
+    }
+    ReadExif(&cinfo, &ppf->metadata.exif);
+
+    ppf->info.xsize = cinfo.image_width;
+    ppf->info.ysize = cinfo.image_height;
+    if (dparams.output_data_type == JXL_TYPE_UINT8) {
+      ppf->info.bits_per_sample = 8;
+      ppf->info.exponent_bits_per_sample = 0;
+    } else if (dparams.output_data_type == JXL_TYPE_UINT16) {
+      ppf->info.bits_per_sample = 16;
+      ppf->info.exponent_bits_per_sample = 0;
+    } else if (dparams.output_data_type == JXL_TYPE_FLOAT) {
+      ppf->info.bits_per_sample = 32;
+      ppf->info.exponent_bits_per_sample = 8;
+    } else {
+      return failure("unsupported data type");
+    }
+    ppf->info.uses_original_profile = true;
+
+    // No alpha in JPG
+    ppf->info.alpha_bits = 0;
+    ppf->info.alpha_exponent_bits = 0;
+    ppf->info.orientation = JXL_ORIENT_IDENTITY;
+
+    jpegli_set_output_format(&cinfo, ConvertDataType(dparams.output_data_type),
+                             ConvertEndianness(dparams.output_endianness));
+
+    if (dparams.num_colors > 0) {
+      cinfo.quantize_colors = TRUE;
+      cinfo.desired_number_of_colors = dparams.num_colors;
+      cinfo.two_pass_quantize = dparams.two_pass_quant;
+      cinfo.dither_mode = (J_DITHER_MODE)dparams.dither_mode;
+    }
+
+    jpegli_start_decompress(&cinfo);
+
+    ppf->info.num_color_channels = cinfo.out_color_components;
+    const JxlPixelFormat format{
+        /*num_channels=*/static_cast<uint32_t>(cinfo.out_color_components),
+        dparams.output_data_type,
+        dparams.output_endianness,
+        /*align=*/0,
+    };
+    ppf->frames.clear();
+    // Allocates the frame buffer.
+    ppf->frames.emplace_back(cinfo.image_width, cinfo.image_height, format);
+    const auto& frame = ppf->frames.back();
+    JXL_ASSERT(sizeof(JSAMPLE) * cinfo.out_color_components *
+                   cinfo.image_width <=
+               frame.color.stride);
+
+    for (size_t y = 0; y < cinfo.image_height; ++y) {
+      JSAMPROW rows[] = {reinterpret_cast<JSAMPLE*>(
+          static_cast<uint8_t*>(frame.color.pixels()) +
+          frame.color.stride * y)};
+      jpegli_read_scanlines(&cinfo, rows, 1);
+      if (dparams.num_colors > 0) {
+        UnmapColors(rows[0], cinfo.output_width, cinfo.out_color_components,
+                    cinfo.colormap, cinfo.actual_number_of_colors);
+      }
+    }
+
+    jpegli_finish_decompress(&cinfo);
+    return true;
+  };
+  bool success = try_catch_block();
+  jpegli_destroy_decompress(&cinfo);
+  return success;
+}
+
+}  // namespace extras
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/extras/dec/jpegli.h b/third_party/jpeg-xl/lib/extras/dec/jpegli.h
new file mode 100644
index 0000000000..574df54c8e
--- /dev/null
+++ b/third_party/jpeg-xl/lib/extras/dec/jpegli.h
@@ -0,0 +1,41 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_EXTRAS_DEC_JPEGLI_H_
+#define LIB_EXTRAS_DEC_JPEGLI_H_
+
+// Decodes JPG pixels and metadata in memory using the libjpegli library.
+
+#include <jxl/types.h>
+#include <stdint.h>
+
+#include <vector>
+
+#include "lib/extras/packed_image.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/status.h"
+
+namespace jxl {
+namespace extras {
+
+struct JpegDecompressParams {
+  JxlDataType output_data_type = JXL_TYPE_UINT8;
+  JxlEndianness output_endianness = JXL_NATIVE_ENDIAN;
+  bool force_rgb = false;
+  bool force_grayscale = false;
+  int num_colors = 0;
+  bool two_pass_quant = true;
+  // 0 = none, 1 = ordered, 2 = Floyd-Steinberg
+  int dither_mode = 2;
+};
+
+Status DecodeJpeg(const std::vector<uint8_t>& compressed,
+                  const JpegDecompressParams& dparams, ThreadPool* pool,
+                  PackedPixelFile* ppf);
+
+}  // namespace extras
+}  // namespace jxl
+
+#endif  // LIB_EXTRAS_DEC_JPEGLI_H_
diff --git a/third_party/jpeg-xl/lib/extras/dec/jpg.cc b/third_party/jpeg-xl/lib/extras/dec/jpg.cc
new file mode 100644
index 0000000000..b3c568b87b
--- /dev/null
+++ b/third_party/jpeg-xl/lib/extras/dec/jpg.cc
@@ -0,0 +1,322 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/extras/dec/jpg.h"
+
+#include <jpeglib.h>
+#include <setjmp.h>
+#include <stdint.h>
+
+#include <algorithm>
+#include <numeric>
+#include <utility>
+#include <vector>
+
+#include "lib/extras/size_constraints.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/sanitizers.h"
+
+namespace jxl {
+namespace extras {
+
+namespace {
+
+constexpr unsigned char kICCSignature[12] = {
+    0x49, 0x43, 0x43, 0x5F, 0x50, 0x52, 0x4F, 0x46, 0x49, 0x4C, 0x45, 0x00};
+constexpr int kICCMarker = JPEG_APP0 + 2;
+
+constexpr unsigned char kExifSignature[6] = {0x45, 0x78, 0x69,
+                                             0x66, 0x00, 0x00};
+constexpr int kExifMarker = JPEG_APP0 + 1;
+
+static inline bool IsJPG(const Span<const uint8_t> bytes) {
+  if (bytes.size() < 2) return false;
+  if (bytes[0] != 0xFF || bytes[1] != 0xD8) return false;
+  return true;
+}
+
+bool MarkerIsICC(const jpeg_saved_marker_ptr marker) {
+  return marker->marker == kICCMarker &&
+         marker->data_length >= sizeof kICCSignature + 2 &&
+         std::equal(std::begin(kICCSignature), std::end(kICCSignature),
+                    marker->data);
+}
+bool MarkerIsExif(const jpeg_saved_marker_ptr marker) {
+  return marker->marker == kExifMarker &&
+         marker->data_length >= sizeof kExifSignature + 2 &&
+         std::equal(std::begin(kExifSignature), std::end(kExifSignature),
+                    marker->data);
+}
+
+Status ReadICCProfile(jpeg_decompress_struct* const cinfo,
+                      std::vector<uint8_t>* const icc) {
+  constexpr size_t kICCSignatureSize = sizeof kICCSignature;
+  // ICC signature + uint8_t index + uint8_t max_index.
+  constexpr size_t kICCHeadSize = kICCSignatureSize + 2;
+  // Markers are 1-indexed, and we keep them that way in this vector to get a
+  // convenient 0 at the front for when we compute the offsets later.
+  std::vector<size_t> marker_lengths;
+  int num_markers = 0;
+  int seen_markers_count = 0;
+  bool has_num_markers = false;
+  for (jpeg_saved_marker_ptr marker = cinfo->marker_list; marker != nullptr;
+       marker = marker->next) {
+    // marker is initialized by libjpeg, which we are not instrumenting with
+    // msan.
+    msan::UnpoisonMemory(marker, sizeof(*marker));
+    msan::UnpoisonMemory(marker->data, marker->data_length);
+    if (!MarkerIsICC(marker)) continue;
+
+    const int current_marker = marker->data[kICCSignatureSize];
+    if (current_marker == 0) {
+      return JXL_FAILURE("inconsistent JPEG ICC marker numbering");
+    }
+    const int current_num_markers = marker->data[kICCSignatureSize + 1];
+    if (current_marker > current_num_markers) {
+      return JXL_FAILURE("inconsistent JPEG ICC marker numbering");
+    }
+    if (has_num_markers) {
+      if (current_num_markers != num_markers) {
+        return JXL_FAILURE("inconsistent numbers of JPEG ICC markers");
+      }
+    } else {
+      num_markers = current_num_markers;
+      has_num_markers = true;
+      marker_lengths.resize(num_markers + 1);
+    }
+
+    size_t marker_length = marker->data_length - kICCHeadSize;
+
+    if (marker_length == 0) {
+      // NB: if we allow empty chunks, then the next check is incorrect.
+      return JXL_FAILURE("Empty ICC chunk");
+    }
+
+    if (marker_lengths[current_marker] != 0) {
+      return JXL_FAILURE("duplicate JPEG ICC marker number");
+    }
+    marker_lengths[current_marker] = marker_length;
+    seen_markers_count++;
+  }
+
+  if (marker_lengths.empty()) {
+    // Not an error.
+    return false;
+  }
+
+  if (seen_markers_count != num_markers) {
+    JXL_DASSERT(has_num_markers);
+    return JXL_FAILURE("Incomplete set of ICC chunks");
+  }
+
+  std::vector<size_t> offsets = std::move(marker_lengths);
+  std::partial_sum(offsets.begin(), offsets.end(), offsets.begin());
+  icc->resize(offsets.back());
+
+  for (jpeg_saved_marker_ptr marker = cinfo->marker_list; marker != nullptr;
+       marker = marker->next) {
+    if (!MarkerIsICC(marker)) continue;
+    const uint8_t* first = marker->data + kICCHeadSize;
+    uint8_t current_marker = marker->data[kICCSignatureSize];
+    size_t offset = offsets[current_marker - 1];
+    size_t marker_length = offsets[current_marker] - offset;
+    std::copy_n(first, marker_length, icc->data() + offset);
+  }
+
+  return true;
+}
+
+void ReadExif(jpeg_decompress_struct* const cinfo,
+              std::vector<uint8_t>* const exif) {
+  constexpr size_t kExifSignatureSize = sizeof kExifSignature;
+  for (jpeg_saved_marker_ptr marker = cinfo->marker_list; marker != nullptr;
+       marker = marker->next) {
+    // marker is initialized by libjpeg, which we are not instrumenting with
+    // msan.
+    msan::UnpoisonMemory(marker, sizeof(*marker));
+    msan::UnpoisonMemory(marker->data, marker->data_length);
+    if (!MarkerIsExif(marker)) continue;
+    size_t marker_length = marker->data_length - kExifSignatureSize;
+    exif->resize(marker_length);
+    std::copy_n(marker->data + kExifSignatureSize, marker_length, exif->data());
+    return;
+  }
+}
+
+void MyErrorExit(j_common_ptr cinfo) {
+  jmp_buf* env = static_cast<jmp_buf*>(cinfo->client_data);
+  (*cinfo->err->output_message)(cinfo);
+  jpeg_destroy_decompress(reinterpret_cast<j_decompress_ptr>(cinfo));
+  longjmp(*env, 1);
+}
+
+void MyOutputMessage(j_common_ptr cinfo) {
+#if JXL_DEBUG_WARNING == 1
+  char buf[JMSG_LENGTH_MAX + 1];
+  (*cinfo->err->format_message)(cinfo, buf);
+  buf[JMSG_LENGTH_MAX] = 0;
+  JXL_WARNING("%s", buf);
+#endif
+}
+
+void UnmapColors(uint8_t* row, size_t xsize, int components,
+                 JSAMPARRAY colormap, size_t num_colors) {
+  JXL_CHECK(colormap != nullptr);
+  std::vector<uint8_t> tmp(xsize * components);
+  for (size_t x = 0; x < xsize; ++x) {
+    JXL_CHECK(row[x] < num_colors);
+    for (int c = 0; c < components; ++c) {
+      tmp[x * components + c] = colormap[c][row[x]];
+    }
+  }
+  memcpy(row, tmp.data(), tmp.size());
+}
+
+}  // namespace
+
+Status DecodeImageJPG(const Span<const uint8_t> bytes,
+                      const ColorHints& color_hints, PackedPixelFile* ppf,
+                      const SizeConstraints* constraints,
+                      const JPGDecompressParams* dparams) {
+  // Don't do anything for non-JPEG files (no need to report an error)
+  if (!IsJPG(bytes)) return false;
+
+  // TODO(veluca): use JPEGData also for pixels?
+
+  // We need to declare all the non-trivial destructor local variables before
+  // the call to setjmp().
+  std::unique_ptr<JSAMPLE[]> row;
+
+  const auto try_catch_block = [&]() -> bool {
+    jpeg_decompress_struct cinfo = {};
+    // Setup error handling in jpeg library so we can deal with broken jpegs in
+    // the fuzzer.
+    jpeg_error_mgr jerr;
+    jmp_buf env;
+    cinfo.err = jpeg_std_error(&jerr);
+    jerr.error_exit = &MyErrorExit;
+    jerr.output_message = &MyOutputMessage;
+    if (setjmp(env)) {
+      return false;
+    }
+    cinfo.client_data = static_cast<void*>(&env);
+
+    jpeg_create_decompress(&cinfo);
+    jpeg_mem_src(&cinfo, reinterpret_cast<const unsigned char*>(bytes.data()),
+                 bytes.size());
+    jpeg_save_markers(&cinfo, kICCMarker, 0xFFFF);
+    jpeg_save_markers(&cinfo, kExifMarker, 0xFFFF);
+    const auto failure = [&cinfo](const char* str) -> Status {
+      jpeg_abort_decompress(&cinfo);
+      jpeg_destroy_decompress(&cinfo);
+      return JXL_FAILURE("%s", str);
+    };
+    int read_header_result = jpeg_read_header(&cinfo, TRUE);
+    // TODO(eustas): what about JPEG_HEADER_TABLES_ONLY?
+    if (read_header_result == JPEG_SUSPENDED) {
+      return failure("truncated JPEG input");
+    }
+    if (!VerifyDimensions(constraints, cinfo.image_width, cinfo.image_height)) {
+      return failure("image too big");
+    }
+    // Might cause CPU-zip bomb.
+    if (cinfo.arith_code) {
+      return failure("arithmetic code JPEGs are not supported");
+    }
+    int nbcomp = cinfo.num_components;
+    if (nbcomp != 1 && nbcomp != 3) {
+      return failure("unsupported number of components in JPEG");
+    }
+    if (!ReadICCProfile(&cinfo, &ppf->icc)) {
+      ppf->icc.clear();
+      // Default to SRGB
+      // Actually, (cinfo.output_components == nbcomp) will be checked after
+      // `jpeg_start_decompress`.
+      ppf->color_encoding.color_space =
+          (nbcomp == 1) ? JXL_COLOR_SPACE_GRAY : JXL_COLOR_SPACE_RGB;
+      ppf->color_encoding.white_point = JXL_WHITE_POINT_D65;
+      ppf->color_encoding.primaries = JXL_PRIMARIES_SRGB;
+      ppf->color_encoding.transfer_function = JXL_TRANSFER_FUNCTION_SRGB;
+      ppf->color_encoding.rendering_intent = JXL_RENDERING_INTENT_PERCEPTUAL;
+    }
+    ReadExif(&cinfo, &ppf->metadata.exif);
+    if (!ApplyColorHints(color_hints, /*color_already_set=*/true,
+                         /*is_gray=*/false, ppf)) {
+      return failure("ApplyColorHints failed");
+    }
+
+    ppf->info.xsize = cinfo.image_width;
+    ppf->info.ysize = cinfo.image_height;
+    // Original data is uint, so exponent_bits_per_sample = 0.
+    ppf->info.bits_per_sample = BITS_IN_JSAMPLE;
+    JXL_ASSERT(BITS_IN_JSAMPLE == 8 || BITS_IN_JSAMPLE == 16);
+    ppf->info.exponent_bits_per_sample = 0;
+    ppf->info.uses_original_profile = true;
+
+    // No alpha in JPG
+    ppf->info.alpha_bits = 0;
+    ppf->info.alpha_exponent_bits = 0;
+
+    ppf->info.num_color_channels = nbcomp;
+    ppf->info.orientation = JXL_ORIENT_IDENTITY;
+
+    if (dparams && dparams->num_colors > 0) {
+      cinfo.quantize_colors = TRUE;
+      cinfo.desired_number_of_colors = dparams->num_colors;
+      cinfo.two_pass_quantize = dparams->two_pass_quant;
+      cinfo.dither_mode = (J_DITHER_MODE)dparams->dither_mode;
+    }
+
+    jpeg_start_decompress(&cinfo);
+    JXL_ASSERT(cinfo.out_color_components == nbcomp);
+    JxlDataType data_type =
+        ppf->info.bits_per_sample <= 8 ? JXL_TYPE_UINT8 : JXL_TYPE_UINT16;
+
+    const JxlPixelFormat format{
+        /*num_channels=*/static_cast<uint32_t>(nbcomp),
+        data_type,
+        /*endianness=*/JXL_NATIVE_ENDIAN,
+        /*align=*/0,
+    };
+    ppf->frames.clear();
+    // Allocates the frame buffer.
+    ppf->frames.emplace_back(cinfo.image_width, cinfo.image_height, format);
+    const auto& frame = ppf->frames.back();
+    JXL_ASSERT(sizeof(JSAMPLE) * cinfo.out_color_components *
+                   cinfo.image_width <=
+               frame.color.stride);
+
+    if (cinfo.quantize_colors) {
+      jxl::msan::UnpoisonMemory(cinfo.colormap, cinfo.out_color_components *
+                                                    sizeof(cinfo.colormap[0]));
+      for (int c = 0; c < cinfo.out_color_components; ++c) {
+        jxl::msan::UnpoisonMemory(
+            cinfo.colormap[c],
+            cinfo.actual_number_of_colors * sizeof(cinfo.colormap[c][0]));
+      }
+    }
+    for (size_t y = 0; y < cinfo.image_height; ++y) {
+      JSAMPROW rows[] = {reinterpret_cast<JSAMPLE*>(
+          static_cast<uint8_t*>(frame.color.pixels()) +
+          frame.color.stride * y)};
+      jpeg_read_scanlines(&cinfo, rows, 1);
+      msan::UnpoisonMemory(rows[0], sizeof(JSAMPLE) * cinfo.output_components *
+                                        cinfo.image_width);
+      if (dparams && dparams->num_colors > 0) {
+        UnmapColors(rows[0], cinfo.output_width, cinfo.out_color_components,
+                    cinfo.colormap, cinfo.actual_number_of_colors);
+      }
+    }
+
+    jpeg_finish_decompress(&cinfo);
+    jpeg_destroy_decompress(&cinfo);
+    return true;
+  };
+
+  return try_catch_block();
+}
+
+}  // namespace extras
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/extras/dec/jpg.h b/third_party/jpeg-xl/lib/extras/dec/jpg.h
new file mode 100644
index 0000000000..e3de2536ac
--- /dev/null
+++ b/third_party/jpeg-xl/lib/extras/dec/jpg.h
@@ -0,0 +1,44 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_EXTRAS_DEC_JPG_H_
+#define LIB_EXTRAS_DEC_JPG_H_
+
+// Decodes JPG pixels and metadata in memory.
+
+#include <stdint.h>
+
+#include "lib/extras/codec.h"
+#include "lib/extras/dec/color_hints.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/padded_bytes.h"
+#include "lib/jxl/base/span.h"
+#include "lib/jxl/base/status.h"
+
+namespace jxl {
+
+struct SizeConstraints;
+
+namespace extras {
+
+struct JPGDecompressParams {
+  int num_colors = 0;
+  bool two_pass_quant = false;
+  // 0 = none, 1 = ordered, 2 = Floyd-Steinberg
+  int dither_mode = 0;
+};
+
+// Decodes `bytes` into `ppf`. color_hints are ignored.
+// `elapsed_deinterleave`, if non-null, will be set to the time (in seconds)
+// that it took to deinterleave the raw JSAMPLEs to planar floats.
+Status DecodeImageJPG(Span<const uint8_t> bytes, const ColorHints& color_hints,
+                      PackedPixelFile* ppf,
+                      const SizeConstraints* constraints = nullptr,
+                      const JPGDecompressParams* dparams = nullptr);
+
+}  // namespace extras
+}  // namespace jxl
+
+#endif  // LIB_EXTRAS_DEC_JPG_H_
diff --git a/third_party/jpeg-xl/lib/extras/dec/jxl.cc b/third_party/jpeg-xl/lib/extras/dec/jxl.cc
new file mode 100644
index 0000000000..224f7c7bf9
--- /dev/null
+++ b/third_party/jpeg-xl/lib/extras/dec/jxl.cc
@@ -0,0 +1,561 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/extras/dec/jxl.h"
+
+#include <jxl/decode.h>
+#include <jxl/decode_cxx.h>
+#include <jxl/types.h>
+
+#include "lib/extras/dec/color_description.h"
+#include "lib/extras/enc/encode.h"
+#include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/exif.h"
+
+namespace jxl {
+namespace extras {
+namespace {
+
+struct BoxProcessor {
+  BoxProcessor(JxlDecoder* dec) : dec_(dec) { Reset(); }
+
+  void InitializeOutput(std::vector<uint8_t>* out) {
+    box_data_ = out;
+    AddMoreOutput();
+  }
+
+  bool AddMoreOutput() {
+    Flush();
+    static const size_t kBoxOutputChunkSize = 1 << 16;
+    box_data_->resize(box_data_->size() + kBoxOutputChunkSize);
+    next_out_ = box_data_->data() + total_size_;
+    avail_out_ = box_data_->size() - total_size_;
+    if (JXL_DEC_SUCCESS !=
+        JxlDecoderSetBoxBuffer(dec_, next_out_, avail_out_)) {
+      fprintf(stderr, "JxlDecoderSetBoxBuffer failed\n");
+      return false;
+    }
+    return true;
+  }
+
+  void FinalizeOutput() {
+    if (box_data_ == nullptr) return;
+    Flush();
+    box_data_->resize(total_size_);
+    Reset();
+  }
+
+ private:
+  JxlDecoder* dec_;
+  std::vector<uint8_t>* box_data_;
+  uint8_t* next_out_;
+  size_t avail_out_;
+  size_t total_size_;
+
+  void Reset() {
+    box_data_ = nullptr;
+    next_out_ = nullptr;
+    avail_out_ = 0;
+    total_size_ = 0;
+  }
+  void Flush() {
+    if (box_data_ == nullptr) return;
+    size_t remaining = JxlDecoderReleaseBoxBuffer(dec_);
+    size_t bytes_written = avail_out_ - remaining;
+    next_out_ += bytes_written;
+    avail_out_ -= bytes_written;
+    total_size_ += bytes_written;
+  }
+};
+
+void SetBitDepthFromDataType(JxlDataType data_type, uint32_t* bits_per_sample,
+                             uint32_t* exponent_bits_per_sample) {
+  switch (data_type) {
+    case JXL_TYPE_UINT8:
+      *bits_per_sample = 8;
+      *exponent_bits_per_sample = 0;
+      break;
+    case JXL_TYPE_UINT16:
+      *bits_per_sample = 16;
+      *exponent_bits_per_sample = 0;
+      break;
+    case JXL_TYPE_FLOAT16:
+      *bits_per_sample = 16;
+      *exponent_bits_per_sample = 5;
+      break;
+    case JXL_TYPE_FLOAT:
+      *bits_per_sample = 32;
+      *exponent_bits_per_sample = 8;
+      break;
+  }
+}
+
+template <typename T>
+void UpdateBitDepth(JxlBitDepth bit_depth, JxlDataType data_type, T* info) {
+  if (bit_depth.type == JXL_BIT_DEPTH_FROM_PIXEL_FORMAT) {
+    SetBitDepthFromDataType(data_type, &info->bits_per_sample,
+                            &info->exponent_bits_per_sample);
+  } else if (bit_depth.type == JXL_BIT_DEPTH_CUSTOM) {
+    info->bits_per_sample = bit_depth.bits_per_sample;
+    info->exponent_bits_per_sample = bit_depth.exponent_bits_per_sample;
+  }
+}
+
+}  // namespace
+
+bool DecodeImageJXL(const uint8_t* bytes, size_t bytes_size,
+                    const JXLDecompressParams& dparams, size_t* decoded_bytes,
+                    PackedPixelFile* ppf, std::vector<uint8_t>* jpeg_bytes) {
+  auto decoder = JxlDecoderMake(/*memory_manager=*/nullptr);
+  JxlDecoder* dec = decoder.get();
+  ppf->frames.clear();
+
+  if (dparams.runner_opaque != nullptr &&
+      JXL_DEC_SUCCESS != JxlDecoderSetParallelRunner(dec, dparams.runner,
+                                                     dparams.runner_opaque)) {
+    fprintf(stderr, "JxlEncoderSetParallelRunner failed\n");
+    return false;
+  }
+
+  JxlPixelFormat format;
+  std::vector<JxlPixelFormat> accepted_formats = dparams.accepted_formats;
+  if (accepted_formats.empty()) {
+    for (const uint32_t num_channels : {1, 2, 3, 4}) {
+      accepted_formats.push_back(
+          {num_channels, JXL_TYPE_FLOAT, JXL_LITTLE_ENDIAN, /*align=*/0});
+    }
+  }
+  JxlColorEncoding color_encoding;
+  size_t num_color_channels = 0;
+  if (!dparams.color_space.empty()) {
+    if (!jxl::ParseDescription(dparams.color_space, &color_encoding)) {
+      fprintf(stderr, "Failed to parse color space %s.\n",
+              dparams.color_space.c_str());
+      return false;
+    }
+    num_color_channels =
+        color_encoding.color_space == JXL_COLOR_SPACE_GRAY ? 1 : 3;
+  }
+
+  bool can_reconstruct_jpeg = false;
+  std::vector<uint8_t> jpeg_data_chunk;
+  if (jpeg_bytes != nullptr) {
+    jpeg_data_chunk.resize(16384);
+    jpeg_bytes->resize(0);
+  }
+
+  int events = (JXL_DEC_BASIC_INFO | JXL_DEC_FULL_IMAGE);
+
+  bool max_passes_defined =
+      (dparams.max_passes < std::numeric_limits<uint32_t>::max());
+  if (max_passes_defined || dparams.max_downsampling > 1) {
+    events |= JXL_DEC_FRAME_PROGRESSION;
+    if (max_passes_defined) {
+      JxlDecoderSetProgressiveDetail(dec, JxlProgressiveDetail::kPasses);
+    } else {
+      JxlDecoderSetProgressiveDetail(dec, JxlProgressiveDetail::kLastPasses);
+    }
+  }
+  if (jpeg_bytes != nullptr) {
+    events |= JXL_DEC_JPEG_RECONSTRUCTION;
+  } else {
+    events |= (JXL_DEC_COLOR_ENCODING | JXL_DEC_FRAME | JXL_DEC_PREVIEW_IMAGE |
+               JXL_DEC_BOX);
+  }
+  if (JXL_DEC_SUCCESS != JxlDecoderSubscribeEvents(dec, events)) {
+    fprintf(stderr, "JxlDecoderSubscribeEvents failed\n");
+    return false;
+  }
+  if (jpeg_bytes == nullptr) {
+    if (JXL_DEC_SUCCESS !=
+        JxlDecoderSetRenderSpotcolors(dec, dparams.render_spotcolors)) {
+      fprintf(stderr, "JxlDecoderSetRenderSpotColors failed\n");
+      return false;
+    }
+    if (JXL_DEC_SUCCESS !=
+        JxlDecoderSetKeepOrientation(dec, dparams.keep_orientation)) {
+      fprintf(stderr, "JxlDecoderSetKeepOrientation failed\n");
+      return false;
+    }
+    if (JXL_DEC_SUCCESS !=
+        JxlDecoderSetUnpremultiplyAlpha(dec, dparams.unpremultiply_alpha)) {
+      fprintf(stderr, "JxlDecoderSetUnpremultiplyAlpha failed\n");
+      return false;
+    }
+    if (dparams.display_nits > 0 &&
+        JXL_DEC_SUCCESS !=
+            JxlDecoderSetDesiredIntensityTarget(dec, dparams.display_nits)) {
+      fprintf(stderr, "Decoder failed to set desired intensity target\n");
+      return false;
+    }
+    if (JXL_DEC_SUCCESS != JxlDecoderSetDecompressBoxes(dec, JXL_TRUE)) {
+      fprintf(stderr, "JxlDecoderSetDecompressBoxes failed\n");
+      return false;
+    }
+  }
+  if (JXL_DEC_SUCCESS != JxlDecoderSetInput(dec, bytes, bytes_size)) {
+    fprintf(stderr, "Decoder failed to set input\n");
+    return false;
+  }
+  uint32_t progression_index = 0;
+  bool codestream_done = false;
+  BoxProcessor boxes(dec);
+  for (;;) {
+    JxlDecoderStatus status = JxlDecoderProcessInput(dec);
+    if (status == JXL_DEC_ERROR) {
+      fprintf(stderr, "Failed to decode image\n");
+      return false;
+    } else if (status == JXL_DEC_NEED_MORE_INPUT) {
+      if (codestream_done) {
+        break;
+      }
+      if (dparams.allow_partial_input) {
+        if (JXL_DEC_SUCCESS != JxlDecoderFlushImage(dec)) {
+          fprintf(stderr,
+                  "Input file is truncated and there is no preview "
+                  "available yet.\n");
+          return false;
+        }
+        break;
+      }
+      size_t released_size = JxlDecoderReleaseInput(dec);
+      fprintf(stderr,
+              "Input file is truncated (total bytes: %" PRIuS
+              ", processed bytes: %" PRIuS
+              ") and --allow_partial_files is not present.\n",
+              bytes_size, bytes_size - released_size);
+      return false;
+    } else if (status == JXL_DEC_BOX) {
+      boxes.FinalizeOutput();
+      JxlBoxType box_type;
+      if (JXL_DEC_SUCCESS != JxlDecoderGetBoxType(dec, box_type, JXL_TRUE)) {
+        fprintf(stderr, "JxlDecoderGetBoxType failed\n");
+        return false;
+      }
+      std::vector<uint8_t>* box_data = nullptr;
+      if (memcmp(box_type, "Exif", 4) == 0) {
+        box_data = &ppf->metadata.exif;
+      } else if (memcmp(box_type, "iptc", 4) == 0) {
+        box_data = &ppf->metadata.iptc;
+      } else if (memcmp(box_type, "jumb", 4) == 0) {
+        box_data = &ppf->metadata.jumbf;
+      } else if (memcmp(box_type, "xml ", 4) == 0) {
+        box_data = &ppf->metadata.xmp;
+      }
+      if (box_data) {
+        boxes.InitializeOutput(box_data);
+      }
+    } else if (status == JXL_DEC_BOX_NEED_MORE_OUTPUT) {
+      boxes.AddMoreOutput();
+    } else if (status == JXL_DEC_JPEG_RECONSTRUCTION) {
+      can_reconstruct_jpeg = true;
+      // Decoding to JPEG.
+      if (JXL_DEC_SUCCESS != JxlDecoderSetJPEGBuffer(dec,
+                                                     jpeg_data_chunk.data(),
+                                                     jpeg_data_chunk.size())) {
+        fprintf(stderr, "Decoder failed to set JPEG Buffer\n");
+        return false;
+      }
+    } else if (status == JXL_DEC_JPEG_NEED_MORE_OUTPUT) {
+      // Decoded a chunk to JPEG.
+      size_t used_jpeg_output =
+          jpeg_data_chunk.size() - JxlDecoderReleaseJPEGBuffer(dec);
+      jpeg_bytes->insert(jpeg_bytes->end(), jpeg_data_chunk.data(),
+                         jpeg_data_chunk.data() + used_jpeg_output);
+      if (used_jpeg_output == 0) {
+        // Chunk is too small.
+        jpeg_data_chunk.resize(jpeg_data_chunk.size() * 2);
+      }
+      if (JXL_DEC_SUCCESS != JxlDecoderSetJPEGBuffer(dec,
+                                                     jpeg_data_chunk.data(),
+                                                     jpeg_data_chunk.size())) {
+        fprintf(stderr, "Decoder failed to set JPEG Buffer\n");
+        return false;
+      }
+    } else if (status == JXL_DEC_BASIC_INFO) {
+      if (JXL_DEC_SUCCESS != JxlDecoderGetBasicInfo(dec, &ppf->info)) {
+        fprintf(stderr, "JxlDecoderGetBasicInfo failed\n");
+        return false;
+      }
+      if (num_color_channels != 0) {
+        // Mark the change in number of color channels due to the requested
+        // color space.
+        ppf->info.num_color_channels = num_color_channels;
+      }
+      if (dparams.output_bitdepth.type == JXL_BIT_DEPTH_CUSTOM) {
+        // Select format based on custom bits per sample.
+        ppf->info.bits_per_sample = dparams.output_bitdepth.bits_per_sample;
+      }
+      // Select format according to accepted formats.
+      if (!jxl::extras::SelectFormat(accepted_formats, ppf->info, &format)) {
+        fprintf(stderr, "SelectFormat failed\n");
+        return false;
+      }
+      bool have_alpha = (format.num_channels == 2 || format.num_channels == 4);
+      if (!have_alpha) {
+        // Mark in the basic info that alpha channel was dropped.
+        ppf->info.alpha_bits = 0;
+      } else {
+        if (dparams.unpremultiply_alpha) {
+          // Mark in the basic info that alpha was unpremultiplied.
+          ppf->info.alpha_premultiplied = false;
+        }
+      }
+      bool alpha_found = false;
+      for (uint32_t i = 0; i < ppf->info.num_extra_channels; ++i) {
+        JxlExtraChannelInfo eci;
+        if (JXL_DEC_SUCCESS != JxlDecoderGetExtraChannelInfo(dec, i, &eci)) {
+          fprintf(stderr, "JxlDecoderGetExtraChannelInfo failed\n");
+          return false;
+        }
+        if (eci.type == JXL_CHANNEL_ALPHA && have_alpha && !alpha_found) {
+          // Skip the first alpha channels because it is already present in the
+          // interleaved image.
+          alpha_found = true;
+          continue;
+        }
+        std::string name(eci.name_length + 1, 0);
+        if (JXL_DEC_SUCCESS !=
+            JxlDecoderGetExtraChannelName(dec, i, &name[0], name.size())) {
+          fprintf(stderr, "JxlDecoderGetExtraChannelName failed\n");
+          return false;
+        }
+        name.resize(eci.name_length);
+        ppf->extra_channels_info.push_back({eci, i, name});
+      }
+    } else if (status == JXL_DEC_COLOR_ENCODING) {
+      if (!dparams.color_space.empty()) {
+        if (ppf->info.uses_original_profile) {
+          fprintf(stderr,
+                  "Warning: --color_space ignored because the image is "
+                  "not XYB encoded.\n");
+        } else {
+          if (JXL_DEC_SUCCESS !=
+              JxlDecoderSetPreferredColorProfile(dec, &color_encoding)) {
+            fprintf(stderr, "Failed to set color space.\n");
+            return false;
+          }
+        }
+      }
+      size_t icc_size = 0;
+      JxlColorProfileTarget target = JXL_COLOR_PROFILE_TARGET_DATA;
+      if (JXL_DEC_SUCCESS !=
+          JxlDecoderGetICCProfileSize(dec, nullptr, target, &icc_size)) {
+        fprintf(stderr, "JxlDecoderGetICCProfileSize failed\n");
+      }
+      if (icc_size != 0) {
+        ppf->icc.resize(icc_size);
+        if (JXL_DEC_SUCCESS !=
+            JxlDecoderGetColorAsICCProfile(dec, nullptr, target,
+                                           ppf->icc.data(), icc_size)) {
+          fprintf(stderr, "JxlDecoderGetColorAsICCProfile failed\n");
+          return false;
+        }
+      }
+      if (JXL_DEC_SUCCESS != JxlDecoderGetColorAsEncodedProfile(
+                                 dec, nullptr, target, &ppf->color_encoding)) {
+        ppf->color_encoding.color_space = JXL_COLOR_SPACE_UNKNOWN;
+      }
+      icc_size = 0;
+      target = JXL_COLOR_PROFILE_TARGET_ORIGINAL;
+      if (JXL_DEC_SUCCESS !=
+          JxlDecoderGetICCProfileSize(dec, nullptr, target, &icc_size)) {
+        fprintf(stderr, "JxlDecoderGetICCProfileSize failed\n");
+      }
+      if (icc_size != 0) {
+        ppf->orig_icc.resize(icc_size);
+        if (JXL_DEC_SUCCESS !=
+            JxlDecoderGetColorAsICCProfile(dec, nullptr, target,
+                                           ppf->orig_icc.data(), icc_size)) {
+          fprintf(stderr, "JxlDecoderGetColorAsICCProfile failed\n");
+          return false;
+        }
+      }
+    } else if (status == JXL_DEC_FRAME) {
+      jxl::extras::PackedFrame frame(ppf->info.xsize, ppf->info.ysize, format);
+      if (JXL_DEC_SUCCESS != JxlDecoderGetFrameHeader(dec, &frame.frame_info)) {
+        fprintf(stderr, "JxlDecoderGetFrameHeader failed\n");
+        return false;
+      }
+      frame.name.resize(frame.frame_info.name_length + 1, 0);
+      if (JXL_DEC_SUCCESS !=
+          JxlDecoderGetFrameName(dec, &frame.name[0], frame.name.size())) {
+        fprintf(stderr, "JxlDecoderGetFrameName failed\n");
+        return false;
+      }
+      frame.name.resize(frame.frame_info.name_length);
+      ppf->frames.emplace_back(std::move(frame));
+      progression_index = 0;
+    } else if (status == JXL_DEC_FRAME_PROGRESSION) {
+      size_t downsampling = JxlDecoderGetIntendedDownsamplingRatio(dec);
+      if ((max_passes_defined && progression_index >= dparams.max_passes) ||
+          (!max_passes_defined && downsampling <= dparams.max_downsampling)) {
+        if (JXL_DEC_SUCCESS != JxlDecoderFlushImage(dec)) {
+          fprintf(stderr, "JxlDecoderFlushImage failed\n");
+          return false;
+        }
+        if (ppf->frames.back().frame_info.is_last) {
+          break;
+        }
+        if (JXL_DEC_SUCCESS != JxlDecoderSkipCurrentFrame(dec)) {
+          fprintf(stderr, "JxlDecoderSkipCurrentFrame failed\n");
+          return false;
+        }
+      }
+      ++progression_index;
+    } else if (status == JXL_DEC_NEED_PREVIEW_OUT_BUFFER) {
+      size_t buffer_size;
+      if (JXL_DEC_SUCCESS !=
+          JxlDecoderPreviewOutBufferSize(dec, &format, &buffer_size)) {
+        fprintf(stderr, "JxlDecoderPreviewOutBufferSize failed\n");
+        return false;
+      }
+      ppf->preview_frame = std::unique_ptr<jxl::extras::PackedFrame>(
+          new jxl::extras::PackedFrame(ppf->info.preview.xsize,
+                                       ppf->info.preview.ysize, format));
+      if (buffer_size != ppf->preview_frame->color.pixels_size) {
+        fprintf(stderr, "Invalid out buffer size %" PRIuS " %" PRIuS "\n",
+                buffer_size, ppf->preview_frame->color.pixels_size);
+        return false;
+      }
+      if (JXL_DEC_SUCCESS !=
+          JxlDecoderSetPreviewOutBuffer(
+              dec, &format, ppf->preview_frame->color.pixels(), buffer_size)) {
+        fprintf(stderr, "JxlDecoderSetPreviewOutBuffer failed\n");
+        return false;
+      }
+    } else if (status == JXL_DEC_NEED_IMAGE_OUT_BUFFER) {
+      if (jpeg_bytes != nullptr) {
+        break;
+      }
+      size_t buffer_size;
+      if (JXL_DEC_SUCCESS !=
+          JxlDecoderImageOutBufferSize(dec, &format, &buffer_size)) {
+        fprintf(stderr, "JxlDecoderImageOutBufferSize failed\n");
+        return false;
+      }
+      jxl::extras::PackedFrame& frame = ppf->frames.back();
+      if (buffer_size != frame.color.pixels_size) {
+        fprintf(stderr, "Invalid out buffer size %" PRIuS " %" PRIuS "\n",
+                buffer_size, frame.color.pixels_size);
+        return false;
+      }
+
+      if (dparams.use_image_callback) {
+        auto callback = [](void* opaque, size_t x, size_t y, size_t num_pixels,
+                           const void* pixels) {
+          auto* ppf = reinterpret_cast<jxl::extras::PackedPixelFile*>(opaque);
+          jxl::extras::PackedImage& color = ppf->frames.back().color;
+          uint8_t* pixels_buffer = reinterpret_cast<uint8_t*>(color.pixels());
+          size_t sample_size = color.pixel_stride();
+          memcpy(pixels_buffer + (color.stride * y + sample_size * x), pixels,
+                 num_pixels * sample_size);
+        };
+        if (JXL_DEC_SUCCESS !=
+            JxlDecoderSetImageOutCallback(dec, &format, callback, ppf)) {
+          fprintf(stderr, "JxlDecoderSetImageOutCallback failed\n");
+          return false;
+        }
+      } else {
+        if (JXL_DEC_SUCCESS != JxlDecoderSetImageOutBuffer(dec, &format,
+                                                           frame.color.pixels(),
+                                                           buffer_size)) {
+          fprintf(stderr, "JxlDecoderSetImageOutBuffer failed\n");
+          return false;
+        }
+      }
+      if (JXL_DEC_SUCCESS !=
+          JxlDecoderSetImageOutBitDepth(dec, &dparams.output_bitdepth)) {
+        fprintf(stderr, "JxlDecoderSetImageOutBitDepth failed\n");
+        return false;
+      }
+      UpdateBitDepth(dparams.output_bitdepth, format.data_type, &ppf->info);
+      bool have_alpha = (format.num_channels == 2 || format.num_channels == 4);
+      if (have_alpha) {
+        // Interleaved alpha channels has the same bit depth as color channels.
+        ppf->info.alpha_bits = ppf->info.bits_per_sample;
+        ppf->info.alpha_exponent_bits = ppf->info.exponent_bits_per_sample;
+      }
+      JxlPixelFormat ec_format = format;
+      ec_format.num_channels = 1;
+      for (auto& eci : ppf->extra_channels_info) {
+        frame.extra_channels.emplace_back(jxl::extras::PackedImage(
+            ppf->info.xsize, ppf->info.ysize, ec_format));
+        auto& ec = frame.extra_channels.back();
+        size_t buffer_size;
+        if (JXL_DEC_SUCCESS != JxlDecoderExtraChannelBufferSize(
+                                   dec, &ec_format, &buffer_size, eci.index)) {
+          fprintf(stderr, "JxlDecoderExtraChannelBufferSize failed\n");
+          return false;
+        }
+        if (buffer_size != ec.pixels_size) {
+          fprintf(stderr,
+                  "Invalid extra channel buffer size"
+                  " %" PRIuS " %" PRIuS "\n",
+                  buffer_size, ec.pixels_size);
+          return false;
+        }
+        if (JXL_DEC_SUCCESS !=
+            JxlDecoderSetExtraChannelBuffer(dec, &ec_format, ec.pixels(),
+                                            buffer_size, eci.index)) {
+          fprintf(stderr, "JxlDecoderSetExtraChannelBuffer failed\n");
+          return false;
+        }
+        UpdateBitDepth(dparams.output_bitdepth, ec_format.data_type,
+                       &eci.ec_info);
+      }
+    } else if (status == JXL_DEC_SUCCESS) {
+      // Decoding finished successfully.
+      break;
+    } else if (status == JXL_DEC_PREVIEW_IMAGE) {
+      // Nothing to do.
+    } else if (status == JXL_DEC_FULL_IMAGE) {
+      if (jpeg_bytes != nullptr || ppf->frames.back().frame_info.is_last) {
+        codestream_done = true;
+      }
+    } else {
+      fprintf(stderr, "Error: unexpected status: %d\n",
+              static_cast<int>(status));
+      return false;
+    }
+  }
+  boxes.FinalizeOutput();
+  if (!ppf->metadata.exif.empty()) {
+    // Verify that Exif box has a valid TIFF header at the specified offset.
+    // Discard bytes preceding the header.
+    if (ppf->metadata.exif.size() >= 4) {
+      uint32_t offset = LoadBE32(ppf->metadata.exif.data());
+      if (offset <= ppf->metadata.exif.size() - 8) {
+        std::vector<uint8_t> exif(ppf->metadata.exif.begin() + 4 + offset,
+                                  ppf->metadata.exif.end());
+        bool bigendian;
+        if (IsExif(exif, &bigendian)) {
+          ppf->metadata.exif = std::move(exif);
+        } else {
+          fprintf(stderr, "Warning: invalid TIFF header in Exif\n");
+        }
+      } else {
+        fprintf(stderr, "Warning: invalid Exif offset: %" PRIu32 "\n", offset);
+      }
+    } else {
+      fprintf(stderr, "Warning: invalid Exif length: %" PRIuS "\n",
+              ppf->metadata.exif.size());
+    }
+  }
+  if (jpeg_bytes != nullptr) {
+    if (!can_reconstruct_jpeg) return false;
+    size_t used_jpeg_output =
+        jpeg_data_chunk.size() - JxlDecoderReleaseJPEGBuffer(dec);
+    jpeg_bytes->insert(jpeg_bytes->end(), jpeg_data_chunk.data(),
+                       jpeg_data_chunk.data() + used_jpeg_output);
+  }
+  if (decoded_bytes) {
+    *decoded_bytes = bytes_size - JxlDecoderReleaseInput(dec);
+  }
+  return true;
+}
+
+}  // namespace extras
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/extras/dec/jxl.h b/third_party/jpeg-xl/lib/extras/dec/jxl.h
new file mode 100644
index 0000000000..5f4ed7f683
--- /dev/null
+++ b/third_party/jpeg-xl/lib/extras/dec/jxl.h
@@ -0,0 +1,69 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_EXTRAS_DEC_JXL_H_
+#define LIB_EXTRAS_DEC_JXL_H_
+
+// Decodes JPEG XL images in memory.
+
+#include <jxl/parallel_runner.h>
+#include <jxl/types.h>
+#include <stdint.h>
+
+#include <limits>
+#include <string>
+#include <vector>
+
+#include "lib/extras/packed_image.h"
+
+namespace jxl {
+namespace extras {
+
+struct JXLDecompressParams {
+  // If empty, little endian float formats will be accepted.
+  std::vector<JxlPixelFormat> accepted_formats;
+
+  // Requested output color space description.
+  std::string color_space;
+  // If set, performs tone mapping to this intensity target luminance.
+  float display_nits = 0.0;
+  // Whether spot colors are rendered on the image.
+  bool render_spotcolors = true;
+  // Whether to keep or undo the orientation given in the header.
+  bool keep_orientation = false;
+
+  // If runner_opaque is set, the decoder uses this parallel runner.
+  JxlParallelRunner runner;
+  void* runner_opaque = nullptr;
+
+  // Whether truncated input should be treated as an error.
+  bool allow_partial_input = false;
+
+  // How many passes to decode at most. By default, decode everything.
+  uint32_t max_passes = std::numeric_limits<uint32_t>::max();
+
+  // Alternatively, one can specify the maximum tolerable downscaling factor
+  // with respect to the full size of the image. By default, nothing less than
+  // the full size is requested.
+  size_t max_downsampling = 1;
+
+  // Whether to use the image callback or the image buffer to get the output.
+  bool use_image_callback = true;
+  // Whether to unpremultiply colors for associated alpha channels.
+  bool unpremultiply_alpha = false;
+
+  // Controls the effective bit depth of the output pixels.
+  JxlBitDepth output_bitdepth = {JXL_BIT_DEPTH_FROM_PIXEL_FORMAT, 0, 0};
+};
+
+bool DecodeImageJXL(const uint8_t* bytes, size_t bytes_size,
+                    const JXLDecompressParams& dparams, size_t* decoded_bytes,
+                    PackedPixelFile* ppf,
+                    std::vector<uint8_t>* jpeg_bytes = nullptr);
+
+}  // namespace extras
+}  // namespace jxl
+
+#endif  // LIB_EXTRAS_DEC_JXL_H_
diff --git a/third_party/jpeg-xl/lib/extras/dec/pgx.cc b/third_party/jpeg-xl/lib/extras/dec/pgx.cc
new file mode 100644
index 0000000000..a99eb0f4ee
--- /dev/null
+++ b/third_party/jpeg-xl/lib/extras/dec/pgx.cc
@@ -0,0 +1,202 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/extras/dec/pgx.h"
+
+#include <string.h>
+
+#include "lib/extras/size_constraints.h"
+#include "lib/jxl/base/bits.h"
+#include "lib/jxl/base/compiler_specific.h"
+
+namespace jxl {
+namespace extras {
+namespace {
+
+struct HeaderPGX {
+  // NOTE: PGX is always grayscale
+  size_t xsize;
+  size_t ysize;
+  size_t bits_per_sample;
+  bool big_endian;
+  bool is_signed;
+};
+
+class Parser {
+ public:
+  explicit Parser(const Span<const uint8_t> bytes)
+      : pos_(bytes.data()), end_(pos_ + bytes.size()) {}
+
+  // Sets "pos" to the first non-header byte/pixel on success.
+  Status ParseHeader(HeaderPGX* header, const uint8_t** pos) {
+    // codec.cc ensures we have at least two bytes => no range check here.
+    if (pos_[0] != 'P' || pos_[1] != 'G') return false;
+    pos_ += 2;
+    return ParseHeaderPGX(header, pos);
+  }
+
+  // Exposed for testing
+  Status ParseUnsigned(size_t* number) {
+    if (pos_ == end_) return JXL_FAILURE("PGX: reached end before number");
+    if (!IsDigit(*pos_)) return JXL_FAILURE("PGX: expected unsigned number");
+
+    *number = 0;
+    while (pos_ < end_ && *pos_ >= '0' && *pos_ <= '9') {
+      *number *= 10;
+      *number += *pos_ - '0';
+      ++pos_;
+    }
+
+    return true;
+  }
+
+ private:
+  static bool IsDigit(const uint8_t c) { return '0' <= c && c <= '9'; }
+  static bool IsLineBreak(const uint8_t c) { return c == '\r' || c == '\n'; }
+  static bool IsWhitespace(const uint8_t c) {
+    return IsLineBreak(c) || c == '\t' || c == ' ';
+  }
+
+  Status SkipSpace() {
+    if (pos_ == end_) return JXL_FAILURE("PGX: reached end before space");
+    const uint8_t c = *pos_;
+    if (c != ' ') return JXL_FAILURE("PGX: expected space");
+    ++pos_;
+    return true;
+  }
+
+  Status SkipLineBreak() {
+    if (pos_ == end_) return JXL_FAILURE("PGX: reached end before line break");
+    // Line break can be either "\n" (0a) or "\r\n" (0d 0a).
+    if (*pos_ == '\n') {
+      pos_++;
+      return true;
+    } else if (*pos_ == '\r' && pos_ + 1 != end_ && *(pos_ + 1) == '\n') {
+      pos_ += 2;
+      return true;
+    }
+    return JXL_FAILURE("PGX: expected line break");
+  }
+
+  Status SkipSingleWhitespace() {
+    if (pos_ == end_) return JXL_FAILURE("PGX: reached end before whitespace");
+    if (!IsWhitespace(*pos_)) return JXL_FAILURE("PGX: expected whitespace");
+    ++pos_;
+    return true;
+  }
+
+  Status ParseHeaderPGX(HeaderPGX* header, const uint8_t** pos) {
+    JXL_RETURN_IF_ERROR(SkipSpace());
+    if (pos_ + 2 > end_) return JXL_FAILURE("PGX: header too small");
+    if (*pos_ == 'M' && *(pos_ + 1) == 'L') {
+      header->big_endian = true;
+    } else if (*pos_ == 'L' && *(pos_ + 1) == 'M') {
+      header->big_endian = false;
+    } else {
+      return JXL_FAILURE("PGX: invalid endianness");
+    }
+    pos_ += 2;
+    JXL_RETURN_IF_ERROR(SkipSpace());
+    if (pos_ == end_) return JXL_FAILURE("PGX: header too small");
+    if (*pos_ == '+') {
+      header->is_signed = false;
+    } else if (*pos_ == '-') {
+      header->is_signed = true;
+    } else {
+      return JXL_FAILURE("PGX: invalid signedness");
+    }
+    pos_++;
+    // Skip optional space
+    if (pos_ < end_ && *pos_ == ' ') pos_++;
+    JXL_RETURN_IF_ERROR(ParseUnsigned(&header->bits_per_sample));
+    JXL_RETURN_IF_ERROR(SkipSingleWhitespace());
+    JXL_RETURN_IF_ERROR(ParseUnsigned(&header->xsize));
+    JXL_RETURN_IF_ERROR(SkipSingleWhitespace());
+    JXL_RETURN_IF_ERROR(ParseUnsigned(&header->ysize));
+    // 0xa, or 0xd 0xa.
+    JXL_RETURN_IF_ERROR(SkipLineBreak());
+
+    // TODO(jon): could do up to 24-bit by converting the values to
+    // JXL_TYPE_FLOAT.
+    if (header->bits_per_sample > 16) {
+      return JXL_FAILURE("PGX: >16 bits not yet supported");
+    }
+    // TODO(lode): support signed integers. This may require changing the way
+    // external_image works.
+    if (header->is_signed) {
+      return JXL_FAILURE("PGX: signed not yet supported");
+    }
+
+    size_t numpixels = header->xsize * header->ysize;
+    size_t bytes_per_pixel = header->bits_per_sample <= 8 ? 1 : 2;
+    if (pos_ + numpixels * bytes_per_pixel > end_) {
+      return JXL_FAILURE("PGX: data too small");
+    }
+
+    *pos = pos_;
+    return true;
+  }
+
+  const uint8_t* pos_;
+  const uint8_t* const end_;
+};
+
+}  // namespace
+
+Status DecodeImagePGX(const Span<const uint8_t> bytes,
+                      const ColorHints& color_hints, PackedPixelFile* ppf,
+                      const SizeConstraints* constraints) {
+  Parser parser(bytes);
+  HeaderPGX header = {};
+  const uint8_t* pos;
+  if (!parser.ParseHeader(&header, &pos)) return false;
+  JXL_RETURN_IF_ERROR(
+      VerifyDimensions(constraints, header.xsize, header.ysize));
+  if (header.bits_per_sample == 0 || header.bits_per_sample > 32) {
+    return JXL_FAILURE("PGX: bits_per_sample invalid");
+  }
+
+  JXL_RETURN_IF_ERROR(ApplyColorHints(color_hints, /*color_already_set=*/false,
+                                      /*is_gray=*/true, ppf));
+  ppf->info.xsize = header.xsize;
+  ppf->info.ysize = header.ysize;
+  // Original data is uint, so exponent_bits_per_sample = 0.
+  ppf->info.bits_per_sample = header.bits_per_sample;
+  ppf->info.exponent_bits_per_sample = 0;
+  ppf->info.uses_original_profile = true;
+
+  // No alpha in PGX
+  ppf->info.alpha_bits = 0;
+  ppf->info.alpha_exponent_bits = 0;
+  ppf->info.num_color_channels = 1;  // Always grayscale
+  ppf->info.orientation = JXL_ORIENT_IDENTITY;
+
+  JxlDataType data_type;
+  if (header.bits_per_sample > 8) {
+    data_type = JXL_TYPE_UINT16;
+  } else {
+    data_type = JXL_TYPE_UINT8;
+  }
+
+  const JxlPixelFormat format{
+      /*num_channels=*/1,
+      /*data_type=*/data_type,
+      /*endianness=*/header.big_endian ? JXL_BIG_ENDIAN : JXL_LITTLE_ENDIAN,
+      /*align=*/0,
+  };
+  ppf->frames.clear();
+  // Allocates the frame buffer.
+  ppf->frames.emplace_back(header.xsize, header.ysize, format);
+  const auto& frame = ppf->frames.back();
+  size_t pgx_remaining_size = bytes.data() + bytes.size() - pos;
+  if (pgx_remaining_size < frame.color.pixels_size) {
+    return JXL_FAILURE("PGX file too small");
+  }
+  memcpy(frame.color.pixels(), pos, frame.color.pixels_size);
+  return true;
+}
+
+}  // namespace extras
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/extras/dec/pgx.h b/third_party/jpeg-xl/lib/extras/dec/pgx.h
new file mode 100644
index 0000000000..2cbd3b4dcf
--- /dev/null
+++ b/third_party/jpeg-xl/lib/extras/dec/pgx.h
@@ -0,0 +1,35 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_EXTRAS_DEC_PGX_H_
+#define LIB_EXTRAS_DEC_PGX_H_
+
+// Decodes PGX pixels in memory.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "lib/extras/dec/color_hints.h"
+#include "lib/extras/packed_image.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/padded_bytes.h"
+#include "lib/jxl/base/span.h"
+#include "lib/jxl/base/status.h"
+
+namespace jxl {
+
+struct SizeConstraints;
+
+namespace extras {
+
+// Decodes `bytes` into `ppf`.
+Status DecodeImagePGX(Span<const uint8_t> bytes, const ColorHints& color_hints,
+                      PackedPixelFile* ppf,
+                      const SizeConstraints* constraints = nullptr);
+
+}  // namespace extras
+}  // namespace jxl
+
+#endif  // LIB_EXTRAS_DEC_PGX_H_
diff --git a/third_party/jpeg-xl/lib/extras/dec/pgx_test.cc b/third_party/jpeg-xl/lib/extras/dec/pgx_test.cc
new file mode 100644
index 0000000000..78ed689d07
--- /dev/null
+++ b/third_party/jpeg-xl/lib/extras/dec/pgx_test.cc
@@ -0,0 +1,79 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/extras/dec/pgx.h"
+
+#include "lib/extras/packed_image_convert.h"
+#include "lib/jxl/image_bundle.h"
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+namespace extras {
+namespace {
+
+Span<const uint8_t> MakeSpan(const char* str) {
+  return Span<const uint8_t>(reinterpret_cast<const uint8_t*>(str),
+                             strlen(str));
+}
+
+TEST(CodecPGXTest, Test8bits) {
+  std::string pgx = "PG ML + 8 2 3\npixels";
+
+  PackedPixelFile ppf;
+  ThreadPool* pool = nullptr;
+
+  EXPECT_TRUE(DecodeImagePGX(MakeSpan(pgx.c_str()), ColorHints(), &ppf));
+  CodecInOut io;
+  EXPECT_TRUE(ConvertPackedPixelFileToCodecInOut(ppf, pool, &io));
+
+  ScaleImage(255.f, io.Main().color());
+
+  EXPECT_FALSE(io.metadata.m.bit_depth.floating_point_sample);
+  EXPECT_EQ(8u, io.metadata.m.bit_depth.bits_per_sample);
+  EXPECT_TRUE(io.metadata.m.color_encoding.IsGray());
+  EXPECT_EQ(2u, io.xsize());
+  EXPECT_EQ(3u, io.ysize());
+
+  float eps = 1e-5;
+  EXPECT_NEAR('p', io.Main().color()->Plane(0).Row(0)[0], eps);
+  EXPECT_NEAR('i', io.Main().color()->Plane(0).Row(0)[1], eps);
+  EXPECT_NEAR('x', io.Main().color()->Plane(0).Row(1)[0], eps);
+  EXPECT_NEAR('e', io.Main().color()->Plane(0).Row(1)[1], eps);
+  EXPECT_NEAR('l', io.Main().color()->Plane(0).Row(2)[0], eps);
+  EXPECT_NEAR('s', io.Main().color()->Plane(0).Row(2)[1], eps);
+}
+
+TEST(CodecPGXTest, Test16bits) {
+  std::string pgx = "PG ML + 16 2 3\np_i_x_e_l_s_";
+
+  PackedPixelFile ppf;
+  ThreadPool* pool = nullptr;
+
+  EXPECT_TRUE(DecodeImagePGX(MakeSpan(pgx.c_str()), ColorHints(), &ppf));
+  CodecInOut io;
+  EXPECT_TRUE(ConvertPackedPixelFileToCodecInOut(ppf, pool, &io));
+
+  ScaleImage(255.f, io.Main().color());
+
+  EXPECT_FALSE(io.metadata.m.bit_depth.floating_point_sample);
+  EXPECT_EQ(16u, io.metadata.m.bit_depth.bits_per_sample);
+  EXPECT_TRUE(io.metadata.m.color_encoding.IsGray());
+  EXPECT_EQ(2u, io.xsize());
+  EXPECT_EQ(3u, io.ysize());
+
+  // Comparing ~16-bit numbers in floats, only ~7 bits left.
+  float eps = 1e-3;
+  const auto& plane = io.Main().color()->Plane(0);
+  EXPECT_NEAR(256.0f * 'p' + '_', plane.Row(0)[0] * 257, eps);
+  EXPECT_NEAR(256.0f * 'i' + '_', plane.Row(0)[1] * 257, eps);
+  EXPECT_NEAR(256.0f * 'x' + '_', plane.Row(1)[0] * 257, eps);
+  EXPECT_NEAR(256.0f * 'e' + '_', plane.Row(1)[1] * 257, eps);
+  EXPECT_NEAR(256.0f * 'l' + '_', plane.Row(2)[0] * 257, eps);
+  EXPECT_NEAR(256.0f * 's' + '_', plane.Row(2)[1] * 257, eps);
+}
+
+}  // namespace
+}  // namespace extras
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/extras/dec/pnm.cc b/third_party/jpeg-xl/lib/extras/dec/pnm.cc
new file mode 100644
index 0000000000..c3c2247769
--- /dev/null
+++ b/third_party/jpeg-xl/lib/extras/dec/pnm.cc
@@ -0,0 +1,474 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/extras/dec/pnm.h"
+
+#include <stdlib.h>
+#include <string.h>
+
+#include <cmath>
+
+#include "lib/extras/size_constraints.h"
+#include "lib/jxl/base/bits.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/status.h"
+
+namespace jxl {
+namespace extras {
+namespace {
+
+struct HeaderPNM {
+  size_t xsize;
+  size_t ysize;
+  bool is_gray;    // PGM
+  bool has_alpha;  // PAM
+  size_t bits_per_sample;
+  bool floating_point;
+  bool big_endian;
+  std::vector<JxlExtraChannelType> ec_types;  // PAM
+};
+
+class Parser {
+ public:
+  explicit Parser(const Span<const uint8_t> bytes)
+      : pos_(bytes.data()), end_(pos_ + bytes.size()) {}
+
+  // Sets "pos" to the first non-header byte/pixel on success.
+  Status ParseHeader(HeaderPNM* header, const uint8_t** pos) {
+    // codec.cc ensures we have at least two bytes => no range check here.
+    if (pos_[0] != 'P') return false;
+    const uint8_t type = pos_[1];
+    pos_ += 2;
+
+    switch (type) {
+      case '4':
+        return JXL_FAILURE("pbm not supported");
+
+      case '5':
+        header->is_gray = true;
+        return ParseHeaderPNM(header, pos);
+
+      case '6':
+        header->is_gray = false;
+        return ParseHeaderPNM(header, pos);
+
+      case '7':
+        return ParseHeaderPAM(header, pos);
+
+      case 'F':
+        header->is_gray = false;
+        return ParseHeaderPFM(header, pos);
+
+      case 'f':
+        header->is_gray = true;
+        return ParseHeaderPFM(header, pos);
+    }
+    return false;
+  }
+
+  // Exposed for testing
+  Status ParseUnsigned(size_t* number) {
+    if (pos_ == end_) return JXL_FAILURE("PNM: reached end before number");
+    if (!IsDigit(*pos_)) return JXL_FAILURE("PNM: expected unsigned number");
+
+    *number = 0;
+    while (pos_ < end_ && *pos_ >= '0' && *pos_ <= '9') {
+      *number *= 10;
+      *number += *pos_ - '0';
+      ++pos_;
+    }
+
+    return true;
+  }
+
+  Status ParseSigned(double* number) {
+    if (pos_ == end_) return JXL_FAILURE("PNM: reached end before signed");
+
+    if (*pos_ != '-' && *pos_ != '+' && !IsDigit(*pos_)) {
+      return JXL_FAILURE("PNM: expected signed number");
+    }
+
+    // Skip sign
+    const bool is_neg = *pos_ == '-';
+    if (is_neg || *pos_ == '+') {
+      ++pos_;
+      if (pos_ == end_) return JXL_FAILURE("PNM: reached end before digits");
+    }
+
+    // Leading digits
+    *number = 0.0;
+    while (pos_ < end_ && *pos_ >= '0' && *pos_ <= '9') {
+      *number *= 10;
+      *number += *pos_ - '0';
+      ++pos_;
+    }
+
+    // Decimal places?
+    if (pos_ < end_ && *pos_ == '.') {
+      ++pos_;
+      double place = 0.1;
+      while (pos_ < end_ && *pos_ >= '0' && *pos_ <= '9') {
+        *number += (*pos_ - '0') * place;
+        place *= 0.1;
+        ++pos_;
+      }
+    }
+
+    if (is_neg) *number = -*number;
+    return true;
+  }
+
+ private:
+  static bool IsDigit(const uint8_t c) { return '0' <= c && c <= '9'; }
+  static bool IsLineBreak(const uint8_t c) { return c == '\r' || c == '\n'; }
+  static bool IsWhitespace(const uint8_t c) {
+    return IsLineBreak(c) || c == '\t' || c == ' ';
+  }
+
+  Status SkipBlank() {
+    if (pos_ == end_) return JXL_FAILURE("PNM: reached end before blank");
+    const uint8_t c = *pos_;
+    if (c != ' ' && c != '\n') return JXL_FAILURE("PNM: expected blank");
+    ++pos_;
+    return true;
+  }
+
+  Status SkipSingleWhitespace() {
+    if (pos_ == end_) return JXL_FAILURE("PNM: reached end before whitespace");
+    if (!IsWhitespace(*pos_)) return JXL_FAILURE("PNM: expected whitespace");
+    ++pos_;
+    return true;
+  }
+
+  Status SkipWhitespace() {
+    if (pos_ == end_) return JXL_FAILURE("PNM: reached end before whitespace");
+    if (!IsWhitespace(*pos_) && *pos_ != '#') {
+      return JXL_FAILURE("PNM: expected whitespace/comment");
+    }
+
+    while (pos_ < end_ && IsWhitespace(*pos_)) {
+      ++pos_;
+    }
+
+    // Comment(s)
+    while (pos_ != end_ && *pos_ == '#') {
+      while (pos_ != end_ && !IsLineBreak(*pos_)) {
+        ++pos_;
+      }
+      // Newline(s)
+      while (pos_ != end_ && IsLineBreak(*pos_)) pos_++;
+    }
+
+    while (pos_ < end_ && IsWhitespace(*pos_)) {
+      ++pos_;
+    }
+    return true;
+  }
+
+  Status MatchString(const char* keyword, bool skipws = true) {
+    const uint8_t* ppos = pos_;
+    while (*keyword) {
+      if (ppos >= end_) return JXL_FAILURE("PAM: unexpected end of input");
+      if (*keyword != *ppos) return false;
+      ppos++;
+      keyword++;
+    }
+    pos_ = ppos;
+    if (skipws) {
+      JXL_RETURN_IF_ERROR(SkipWhitespace());
+    } else {
+      JXL_RETURN_IF_ERROR(SkipSingleWhitespace());
+    }
+    return true;
+  }
+
+  Status ParseHeaderPAM(HeaderPNM* header, const uint8_t** pos) {
+    size_t depth = 3;
+    size_t max_val = 255;
+    JXL_RETURN_IF_ERROR(SkipWhitespace());
+    while (!MatchString("ENDHDR", /*skipws=*/false)) {
+      if (MatchString("WIDTH")) {
+        JXL_RETURN_IF_ERROR(ParseUnsigned(&header->xsize));
+        JXL_RETURN_IF_ERROR(SkipWhitespace());
+      } else if (MatchString("HEIGHT")) {
+        JXL_RETURN_IF_ERROR(ParseUnsigned(&header->ysize));
+        JXL_RETURN_IF_ERROR(SkipWhitespace());
+      } else if (MatchString("DEPTH")) {
+        JXL_RETURN_IF_ERROR(ParseUnsigned(&depth));
+        JXL_RETURN_IF_ERROR(SkipWhitespace());
+      } else if (MatchString("MAXVAL")) {
+        JXL_RETURN_IF_ERROR(ParseUnsigned(&max_val));
+        JXL_RETURN_IF_ERROR(SkipWhitespace());
+      } else if (MatchString("TUPLTYPE")) {
+        if (MatchString("RGB_ALPHA")) {
+          header->has_alpha = true;
+        } else if (MatchString("RGB")) {
+        } else if (MatchString("GRAYSCALE_ALPHA")) {
+          header->has_alpha = true;
+          header->is_gray = true;
+        } else if (MatchString("GRAYSCALE")) {
+          header->is_gray = true;
+        } else if (MatchString("BLACKANDWHITE_ALPHA")) {
+          header->has_alpha = true;
+          header->is_gray = true;
+          max_val = 1;
+        } else if (MatchString("BLACKANDWHITE")) {
+          header->is_gray = true;
+          max_val = 1;
+        } else if (MatchString("Alpha")) {
+          header->ec_types.push_back(JXL_CHANNEL_ALPHA);
+        } else if (MatchString("Depth")) {
+          header->ec_types.push_back(JXL_CHANNEL_DEPTH);
+        } else if (MatchString("SpotColor")) {
+          header->ec_types.push_back(JXL_CHANNEL_SPOT_COLOR);
+        } else if (MatchString("SelectionMask")) {
+          header->ec_types.push_back(JXL_CHANNEL_SELECTION_MASK);
+        } else if (MatchString("Black")) {
+          header->ec_types.push_back(JXL_CHANNEL_BLACK);
+        } else if (MatchString("CFA")) {
+          header->ec_types.push_back(JXL_CHANNEL_CFA);
+        } else if (MatchString("Thermal")) {
+          header->ec_types.push_back(JXL_CHANNEL_THERMAL);
+        } else {
+          return JXL_FAILURE("PAM: unknown TUPLTYPE");
+        }
+      } else {
+        constexpr size_t kMaxHeaderLength = 20;
+        char unknown_header[kMaxHeaderLength + 1];
+        size_t len = std::min<size_t>(kMaxHeaderLength, end_ - pos_);
+        strncpy(unknown_header, reinterpret_cast<const char*>(pos_), len);
+        unknown_header[len] = 0;
+        return JXL_FAILURE("PAM: unknown header keyword: %s", unknown_header);
+      }
+    }
+    size_t num_channels = header->is_gray ? 1 : 3;
+    if (header->has_alpha) num_channels++;
+    if (num_channels + header->ec_types.size() != depth) {
+      return JXL_FAILURE("PAM: bad DEPTH");
+    }
+    if (max_val == 0 || max_val >= 65536) {
+      return JXL_FAILURE("PAM: bad MAXVAL");
+    }
+    // e.g. When `max_val` is 1 , we want 1 bit:
+    header->bits_per_sample = FloorLog2Nonzero(max_val) + 1;
+    if ((1u << header->bits_per_sample) - 1 != max_val)
+      return JXL_FAILURE("PNM: unsupported MaxVal (expected 2^n - 1)");
+    // PAM does not pack bits as in PBM.
+
+    header->floating_point = false;
+    header->big_endian = true;
+    *pos = pos_;
+    return true;
+  }
+
+  Status ParseHeaderPNM(HeaderPNM* header, const uint8_t** pos) {
+    JXL_RETURN_IF_ERROR(SkipWhitespace());
+    JXL_RETURN_IF_ERROR(ParseUnsigned(&header->xsize));
+
+    JXL_RETURN_IF_ERROR(SkipWhitespace());
+    JXL_RETURN_IF_ERROR(ParseUnsigned(&header->ysize));
+
+    JXL_RETURN_IF_ERROR(SkipWhitespace());
+    size_t max_val;
+    JXL_RETURN_IF_ERROR(ParseUnsigned(&max_val));
+    if (max_val == 0 || max_val >= 65536) {
+      return JXL_FAILURE("PNM: bad MaxVal");
+    }
+    header->bits_per_sample = FloorLog2Nonzero(max_val) + 1;
+    if ((1u << header->bits_per_sample) - 1 != max_val)
+      return JXL_FAILURE("PNM: unsupported MaxVal (expected 2^n - 1)");
+    header->floating_point = false;
+    header->big_endian = true;
+
+    JXL_RETURN_IF_ERROR(SkipSingleWhitespace());
+
+    *pos = pos_;
+    return true;
+  }
+
+  Status ParseHeaderPFM(HeaderPNM* header, const uint8_t** pos) {
+    JXL_RETURN_IF_ERROR(SkipSingleWhitespace());
+    JXL_RETURN_IF_ERROR(ParseUnsigned(&header->xsize));
+
+    JXL_RETURN_IF_ERROR(SkipBlank());
+    JXL_RETURN_IF_ERROR(ParseUnsigned(&header->ysize));
+
+    JXL_RETURN_IF_ERROR(SkipSingleWhitespace());
+    // The scale has no meaning as multiplier, only its sign is used to
+    // indicate endianness. All software expects nominal range 0..1.
+    double scale;
+    JXL_RETURN_IF_ERROR(ParseSigned(&scale));
+    if (scale == 0.0) {
+      return JXL_FAILURE("PFM: bad scale factor value.");
+    } else if (std::abs(scale) != 1.0) {
+      JXL_WARNING("PFM: Discarding non-unit scale factor");
+    }
+    header->big_endian = scale > 0.0;
+    header->bits_per_sample = 32;
+    header->floating_point = true;
+
+    JXL_RETURN_IF_ERROR(SkipSingleWhitespace());
+
+    *pos = pos_;
+    return true;
+  }
+
+  const uint8_t* pos_;
+  const uint8_t* const end_;
+};
+
+Span<const uint8_t> MakeSpan(const char* str) {
+  return Span<const uint8_t>(reinterpret_cast<const uint8_t*>(str),
+                             strlen(str));
+}
+
+}  // namespace
+
+Status DecodeImagePNM(const Span<const uint8_t> bytes,
+                      const ColorHints& color_hints, PackedPixelFile* ppf,
+                      const SizeConstraints* constraints) {
+  Parser parser(bytes);
+  HeaderPNM header = {};
+  const uint8_t* pos = nullptr;
+  if (!parser.ParseHeader(&header, &pos)) return false;
+  JXL_RETURN_IF_ERROR(
+      VerifyDimensions(constraints, header.xsize, header.ysize));
+
+  if (header.bits_per_sample == 0 || header.bits_per_sample > 32) {
+    return JXL_FAILURE("PNM: bits_per_sample invalid");
+  }
+
+  // PPM specify that in the raster, the sample values are "nonlinear" (BP.709,
+  // with gamma number of 2.2). Deviate from the specification and assume
+  // `sRGB` in our implementation.
+  JXL_RETURN_IF_ERROR(ApplyColorHints(color_hints, /*color_already_set=*/false,
+                                      header.is_gray, ppf));
+
+  ppf->info.xsize = header.xsize;
+  ppf->info.ysize = header.ysize;
+  if (header.floating_point) {
+    ppf->info.bits_per_sample = 32;
+    ppf->info.exponent_bits_per_sample = 8;
+  } else {
+    ppf->info.bits_per_sample = header.bits_per_sample;
+    ppf->info.exponent_bits_per_sample = 0;
+  }
+
+  ppf->info.orientation = JXL_ORIENT_IDENTITY;
+
+  // No alpha in PNM and PFM
+  ppf->info.alpha_bits = (header.has_alpha ? ppf->info.bits_per_sample : 0);
+  ppf->info.alpha_exponent_bits = 0;
+  ppf->info.num_color_channels = (header.is_gray ? 1 : 3);
+  uint32_t num_alpha_channels = (header.has_alpha ? 1 : 0);
+  uint32_t num_interleaved_channels =
+      ppf->info.num_color_channels + num_alpha_channels;
+  ppf->info.num_extra_channels = num_alpha_channels + header.ec_types.size();
+
+  for (auto type : header.ec_types) {
+    PackedExtraChannel pec;
+    pec.ec_info.bits_per_sample = ppf->info.bits_per_sample;
+    pec.ec_info.type = type;
+    ppf->extra_channels_info.emplace_back(std::move(pec));
+  }
+
+  JxlDataType data_type;
+  if (header.floating_point) {
+    // There's no float16 pnm version.
+    data_type = JXL_TYPE_FLOAT;
+  } else {
+    if (header.bits_per_sample > 8) {
+      data_type = JXL_TYPE_UINT16;
+    } else {
+      data_type = JXL_TYPE_UINT8;
+    }
+  }
+
+  const JxlPixelFormat format{
+      /*num_channels=*/num_interleaved_channels,
+      /*data_type=*/data_type,
+      /*endianness=*/header.big_endian ? JXL_BIG_ENDIAN : JXL_LITTLE_ENDIAN,
+      /*align=*/0,
+  };
+  const JxlPixelFormat ec_format{1, format.data_type, format.endianness, 0};
+  ppf->frames.clear();
+  ppf->frames.emplace_back(header.xsize, header.ysize, format);
+  auto* frame = &ppf->frames.back();
+  for (size_t i = 0; i < header.ec_types.size(); ++i) {
+    frame->extra_channels.emplace_back(header.xsize, header.ysize, ec_format);
+  }
+  size_t pnm_remaining_size = bytes.data() + bytes.size() - pos;
+  if (pnm_remaining_size < frame->color.pixels_size) {
+    return JXL_FAILURE("PNM file too small");
+  }
+
+  uint8_t* out = reinterpret_cast<uint8_t*>(frame->color.pixels());
+  std::vector<uint8_t*> ec_out(header.ec_types.size());
+  for (size_t i = 0; i < ec_out.size(); ++i) {
+    ec_out[i] = reinterpret_cast<uint8_t*>(frame->extra_channels[i].pixels());
+  }
+  if (ec_out.empty()) {
+    const bool flipped_y = header.bits_per_sample == 32;  // PFMs are flipped
+    for (size_t y = 0; y < header.ysize; ++y) {
+      size_t y_in = flipped_y ? header.ysize - 1 - y : y;
+      const uint8_t* row_in = &pos[y_in * frame->color.stride];
+      uint8_t* row_out = &out[y * frame->color.stride];
+      memcpy(row_out, row_in, frame->color.stride);
+    }
+  } else {
+    size_t pwidth = PackedImage::BitsPerChannel(data_type) / 8;
+    for (size_t y = 0; y < header.ysize; ++y) {
+      for (size_t x = 0; x < header.xsize; ++x) {
+        memcpy(out, pos, frame->color.pixel_stride());
+        out += frame->color.pixel_stride();
+        pos += frame->color.pixel_stride();
+        for (auto& p : ec_out) {
+          memcpy(p, pos, pwidth);
+          pos += pwidth;
+          p += pwidth;
+        }
+      }
+    }
+  }
+  return true;
+}
+
+void TestCodecPNM() {
+  size_t u = 77777;  // Initialized to wrong value.
+  double d = 77.77;
+// Failing to parse invalid strings results in a crash if `JXL_CRASH_ON_ERROR`
+// is defined and hence the tests fail. Therefore we only run these tests if
+// `JXL_CRASH_ON_ERROR` is not defined.
+#ifndef JXL_CRASH_ON_ERROR
+  JXL_CHECK(false == Parser(MakeSpan("")).ParseUnsigned(&u));
+  JXL_CHECK(false == Parser(MakeSpan("+")).ParseUnsigned(&u));
+  JXL_CHECK(false == Parser(MakeSpan("-")).ParseUnsigned(&u));
+  JXL_CHECK(false == Parser(MakeSpan("A")).ParseUnsigned(&u));
+
+  JXL_CHECK(false == Parser(MakeSpan("")).ParseSigned(&d));
+  JXL_CHECK(false == Parser(MakeSpan("+")).ParseSigned(&d));
+  JXL_CHECK(false == Parser(MakeSpan("-")).ParseSigned(&d));
+  JXL_CHECK(false == Parser(MakeSpan("A")).ParseSigned(&d));
+#endif
+  JXL_CHECK(true == Parser(MakeSpan("1")).ParseUnsigned(&u));
+  JXL_CHECK(u == 1);
+
+  JXL_CHECK(true == Parser(MakeSpan("32")).ParseUnsigned(&u));
+  JXL_CHECK(u == 32);
+
+  JXL_CHECK(true == Parser(MakeSpan("1")).ParseSigned(&d));
+  JXL_CHECK(d == 1.0);
+  JXL_CHECK(true == Parser(MakeSpan("+2")).ParseSigned(&d));
+  JXL_CHECK(d == 2.0);
+  JXL_CHECK(true == Parser(MakeSpan("-3")).ParseSigned(&d));
+  JXL_CHECK(std::abs(d - -3.0) < 1E-15);
+  JXL_CHECK(true == Parser(MakeSpan("3.141592")).ParseSigned(&d));
+  JXL_CHECK(std::abs(d - 3.141592) < 1E-15);
+  JXL_CHECK(true == Parser(MakeSpan("-3.141592")).ParseSigned(&d));
+  JXL_CHECK(std::abs(d - -3.141592) < 1E-15);
+}
+
+}  // namespace extras
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/extras/dec/pnm.h b/third_party/jpeg-xl/lib/extras/dec/pnm.h
new file mode 100644
index 0000000000..0745b2f20d
--- /dev/null
+++ b/third_party/jpeg-xl/lib/extras/dec/pnm.h
@@ -0,0 +1,41 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_EXTRAS_DEC_PNM_H_
+#define LIB_EXTRAS_DEC_PNM_H_
+
+// Decodes PBM/PGM/PPM/PFM pixels in memory.
+
+#include <stddef.h>
+#include <stdint.h>
+
+// TODO(janwas): workaround for incorrect Win64 codegen (cause unknown)
+#include <hwy/highway.h>
+
+#include "lib/extras/dec/color_hints.h"
+#include "lib/extras/packed_image.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/padded_bytes.h"
+#include "lib/jxl/base/span.h"
+#include "lib/jxl/base/status.h"
+
+namespace jxl {
+
+struct SizeConstraints;
+
+namespace extras {
+
+// Decodes `bytes` into `ppf`. color_hints may specify "color_space", which
+// defaults to sRGB.
+Status DecodeImagePNM(Span<const uint8_t> bytes, const ColorHints& color_hints,
+                      PackedPixelFile* ppf,
+                      const SizeConstraints* constraints = nullptr);
+
+void TestCodecPNM();
+
+}  // namespace extras
+}  // namespace jxl
+
+#endif  // LIB_EXTRAS_DEC_PNM_H_
diff --git a/third_party/jpeg-xl/lib/extras/enc/apng.cc b/third_party/jpeg-xl/lib/extras/enc/apng.cc
new file mode 100644
index 0000000000..79d083349d
--- /dev/null
+++ b/third_party/jpeg-xl/lib/extras/enc/apng.cc
@@ -0,0 +1,371 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/extras/enc/apng.h"
+
+// Parts of this code are taken from apngdis, which has the following license:
+/* APNG Disassembler 2.8
+ *
+ * Deconstructs APNG files into individual frames.
+ *
+ * http://apngdis.sourceforge.net
+ *
+ * Copyright (c) 2010-2015 Max Stepin
+ * maxst at users.sourceforge.net
+ *
+ * zlib license
+ * ------------
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ *
+ */
+
+#include <stdio.h>
+#include <string.h>
+
+#include <string>
+#include <vector>
+
+#include "lib/extras/exif.h"
+#include "lib/jxl/base/byte_order.h"
+#include "lib/jxl/base/printf_macros.h"
+#include "png.h" /* original (unpatched) libpng is ok */
+
+namespace jxl {
+namespace extras {
+
+namespace {
+
+constexpr unsigned char kExifSignature[6] = {0x45, 0x78, 0x69,
+                                             0x66, 0x00, 0x00};
+
+class APNGEncoder : public Encoder {
+ public:
+  std::vector<JxlPixelFormat> AcceptedFormats() const override {
+    std::vector<JxlPixelFormat> formats;
+    for (const uint32_t num_channels : {1, 2, 3, 4}) {
+      for (const JxlDataType data_type : {JXL_TYPE_UINT8, JXL_TYPE_UINT16}) {
+        for (JxlEndianness endianness : {JXL_BIG_ENDIAN, JXL_LITTLE_ENDIAN}) {
+          formats.push_back(
+              JxlPixelFormat{num_channels, data_type, endianness, /*align=*/0});
+        }
+      }
+    }
+    return formats;
+  }
+  Status Encode(const PackedPixelFile& ppf, EncodedImage* encoded_image,
+                ThreadPool* pool) const override {
+    JXL_RETURN_IF_ERROR(VerifyBasicInfo(ppf.info));
+    encoded_image->icc.clear();
+    encoded_image->bitstreams.resize(1);
+    return EncodePackedPixelFileToAPNG(ppf, pool,
+                                       &encoded_image->bitstreams.front());
+  }
+
+ private:
+  Status EncodePackedPixelFileToAPNG(const PackedPixelFile& ppf,
+                                     ThreadPool* pool,
+                                     std::vector<uint8_t>* bytes) const;
+};
+
+static void PngWrite(png_structp png_ptr, png_bytep data, png_size_t length) {
+  std::vector<uint8_t>* bytes =
+      static_cast<std::vector<uint8_t>*>(png_get_io_ptr(png_ptr));
+  bytes->insert(bytes->end(), data, data + length);
+}
+
+// Stores XMP and EXIF/IPTC into key/value strings for PNG
+class BlobsWriterPNG {
+ public:
+  static Status Encode(const PackedMetadata& blobs,
+                       std::vector<std::string>* strings) {
+    if (!blobs.exif.empty()) {
+      // PNG viewers typically ignore Exif orientation but not all of them do
+      // (and e.g. cjxl doesn't), so we overwrite the Exif orientation to the
+      // identity to avoid repeated orientation.
+      std::vector<uint8_t> exif = blobs.exif;
+      ResetExifOrientation(exif);
+      // By convention, the data is prefixed with "Exif\0\0" when stored in
+      // the legacy (and non-standard) "Raw profile type exif" text chunk
+      // currently used here.
+      // TODO: Store Exif data in an eXIf chunk instead, which always begins
+      // with the TIFF header.
+      if (exif.size() >= sizeof kExifSignature &&
+          memcmp(exif.data(), kExifSignature, sizeof kExifSignature) != 0) {
+        exif.insert(exif.begin(), kExifSignature,
+                    kExifSignature + sizeof kExifSignature);
+      }
+      JXL_RETURN_IF_ERROR(EncodeBase16("exif", exif, strings));
+    }
+    if (!blobs.iptc.empty()) {
+      JXL_RETURN_IF_ERROR(EncodeBase16("iptc", blobs.iptc, strings));
+    }
+    if (!blobs.xmp.empty()) {
+      // TODO: Store XMP data in an "XML:com.adobe.xmp" text chunk instead.
+      JXL_RETURN_IF_ERROR(EncodeBase16("xmp", blobs.xmp, strings));
+    }
+    return true;
+  }
+
+ private:
+  static JXL_INLINE char EncodeNibble(const uint8_t nibble) {
+    JXL_ASSERT(nibble < 16);
+    return (nibble < 10) ? '0' + nibble : 'a' + nibble - 10;
+  }
+
+  static Status EncodeBase16(const std::string& type,
+                             const std::vector<uint8_t>& bytes,
+                             std::vector<std::string>* strings) {
+    // Encoding: base16 with newline after 72 chars.
+    const size_t base16_size =
+        2 * bytes.size() + DivCeil(bytes.size(), size_t(36)) + 1;
+    std::string base16;
+    base16.reserve(base16_size);
+    for (size_t i = 0; i < bytes.size(); ++i) {
+      if (i % 36 == 0) base16.push_back('\n');
+      base16.push_back(EncodeNibble(bytes[i] >> 4));
+      base16.push_back(EncodeNibble(bytes[i] & 0x0F));
+    }
+    base16.push_back('\n');
+    JXL_ASSERT(base16.length() == base16_size);
+
+    char key[30];
+    snprintf(key, sizeof(key), "Raw profile type %s", type.c_str());
+
+    char header[30];
+    snprintf(header, sizeof(header), "\n%s\n%8" PRIuS, type.c_str(),
+             bytes.size());
+
+    strings->push_back(std::string(key));
+    strings->push_back(std::string(header) + base16);
+    return true;
+  }
+};
+
+void MaybeAddCICP(JxlColorEncoding c_enc, png_structp png_ptr,
+                  png_infop info_ptr) {
+  png_byte cicp_data[4] = {};
+  png_unknown_chunk cicp_chunk;
+  if (c_enc.color_space != JXL_COLOR_SPACE_RGB) {
+    return;
+  }
+  if (c_enc.primaries == JXL_PRIMARIES_P3) {
+    if (c_enc.white_point == JXL_WHITE_POINT_D65) {
+      cicp_data[0] = 12;
+    } else if (c_enc.white_point == JXL_WHITE_POINT_DCI) {
+      cicp_data[0] = 11;
+    } else {
+      return;
+    }
+  } else if (c_enc.primaries != JXL_PRIMARIES_CUSTOM &&
+             c_enc.white_point == JXL_WHITE_POINT_D65) {
+    cicp_data[0] = static_cast<png_byte>(c_enc.primaries);
+  } else {
+    return;
+  }
+  if (c_enc.transfer_function == JXL_TRANSFER_FUNCTION_UNKNOWN ||
+      c_enc.transfer_function == JXL_TRANSFER_FUNCTION_GAMMA) {
+    return;
+  }
+  cicp_data[1] = static_cast<png_byte>(c_enc.transfer_function);
+  cicp_data[2] = 0;
+  cicp_data[3] = 1;
+  cicp_chunk.data = cicp_data;
+  cicp_chunk.size = sizeof(cicp_data);
+  cicp_chunk.location = PNG_HAVE_PLTE;
+  memcpy(cicp_chunk.name, "cICP", 5);
+  png_set_keep_unknown_chunks(png_ptr, 3,
+                              reinterpret_cast<const png_byte*>("cICP"), 1);
+  png_set_unknown_chunks(png_ptr, info_ptr, &cicp_chunk, 1);
+}
+
+Status APNGEncoder::EncodePackedPixelFileToAPNG(
+    const PackedPixelFile& ppf, ThreadPool* pool,
+    std::vector<uint8_t>* bytes) const {
+  size_t xsize = ppf.info.xsize;
+  size_t ysize = ppf.info.ysize;
+  bool has_alpha = ppf.info.alpha_bits != 0;
+  bool is_gray = ppf.info.num_color_channels == 1;
+  size_t color_channels = ppf.info.num_color_channels;
+  size_t num_channels = color_channels + (has_alpha ? 1 : 0);
+  size_t num_samples = num_channels * xsize * ysize;
+
+  if (!ppf.info.have_animation && ppf.frames.size() != 1) {
+    return JXL_FAILURE("Invalid number of frames");
+  }
+
+  size_t count = 0;
+  size_t anim_chunks = 0;
+
+  for (const auto& frame : ppf.frames) {
+    JXL_RETURN_IF_ERROR(VerifyPackedImage(frame.color, ppf.info));
+
+    const PackedImage& color = frame.color;
+    const JxlPixelFormat format = color.format;
+    const uint8_t* in = reinterpret_cast<const uint8_t*>(color.pixels());
+    size_t data_bits_per_sample = PackedImage::BitsPerChannel(format.data_type);
+    size_t bytes_per_sample = data_bits_per_sample / 8;
+    size_t out_bytes_per_sample = bytes_per_sample > 1 ? 2 : 1;
+    size_t out_stride = xsize * num_channels * out_bytes_per_sample;
+    size_t out_size = ysize * out_stride;
+    std::vector<uint8_t> out(out_size);
+
+    if (format.data_type == JXL_TYPE_UINT8) {
+      if (ppf.info.bits_per_sample < 8) {
+        float mul = 255.0 / ((1u << ppf.info.bits_per_sample) - 1);
+        for (size_t i = 0; i < num_samples; ++i) {
+          out[i] = static_cast<uint8_t>(in[i] * mul + 0.5);
+        }
+      } else {
+        memcpy(&out[0], in, out_size);
+      }
+    } else if (format.data_type == JXL_TYPE_UINT16) {
+      if (ppf.info.bits_per_sample < 16 ||
+          format.endianness != JXL_BIG_ENDIAN) {
+        float mul = 65535.0 / ((1u << ppf.info.bits_per_sample) - 1);
+        const uint8_t* p_in = in;
+        uint8_t* p_out = out.data();
+        for (size_t i = 0; i < num_samples; ++i, p_in += 2, p_out += 2) {
+          uint32_t val = (format.endianness == JXL_BIG_ENDIAN ? LoadBE16(p_in)
+                                                              : LoadLE16(p_in));
+          StoreBE16(static_cast<uint32_t>(val * mul + 0.5), p_out);
+        }
+      } else {
+        memcpy(&out[0], in, out_size);
+      }
+    }
+    png_structp png_ptr;
+    png_infop info_ptr;
+
+    png_ptr = png_create_write_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL);
+
+    if (!png_ptr) return JXL_FAILURE("Could not init png encoder");
+
+    info_ptr = png_create_info_struct(png_ptr);
+    if (!info_ptr) return JXL_FAILURE("Could not init png info struct");
+
+    png_set_write_fn(png_ptr, bytes, PngWrite, NULL);
+    png_set_flush(png_ptr, 0);
+
+    int width = xsize;
+    int height = ysize;
+
+    png_byte color_type = (is_gray ? PNG_COLOR_TYPE_GRAY : PNG_COLOR_TYPE_RGB);
+    if (has_alpha) color_type |= PNG_COLOR_MASK_ALPHA;
+    png_byte bit_depth = out_bytes_per_sample * 8;
+
+    png_set_IHDR(png_ptr, info_ptr, width, height, bit_depth, color_type,
+                 PNG_INTERLACE_NONE, PNG_COMPRESSION_TYPE_BASE,
+                 PNG_FILTER_TYPE_BASE);
+    if (count == 0) {
+      MaybeAddCICP(ppf.color_encoding, png_ptr, info_ptr);
+      if (!ppf.icc.empty()) {
+        png_set_benign_errors(png_ptr, 1);
+        png_set_iCCP(png_ptr, info_ptr, "1", 0, ppf.icc.data(), ppf.icc.size());
+      }
+      std::vector<std::string> textstrings;
+      JXL_RETURN_IF_ERROR(BlobsWriterPNG::Encode(ppf.metadata, &textstrings));
+      for (size_t kk = 0; kk + 1 < textstrings.size(); kk += 2) {
+        png_text text;
+        text.key = const_cast<png_charp>(textstrings[kk].c_str());
+        text.text = const_cast<png_charp>(textstrings[kk + 1].c_str());
+        text.compression = PNG_TEXT_COMPRESSION_zTXt;
+        png_set_text(png_ptr, info_ptr, &text, 1);
+      }
+
+      png_write_info(png_ptr, info_ptr);
+    } else {
+      // fake writing a header, otherwise libpng gets confused
+      size_t pos = bytes->size();
+      png_write_info(png_ptr, info_ptr);
+      bytes->resize(pos);
+    }
+
+    if (ppf.info.have_animation) {
+      if (count == 0) {
+        png_byte adata[8];
+        png_save_uint_32(adata, ppf.frames.size());
+        png_save_uint_32(adata + 4, ppf.info.animation.num_loops);
+        png_byte actl[5] = "acTL";
+        png_write_chunk(png_ptr, actl, adata, 8);
+      }
+      png_byte fdata[26];
+      // TODO(jon): also make this work for the non-coalesced case
+      png_save_uint_32(fdata, anim_chunks++);
+      png_save_uint_32(fdata + 4, width);
+      png_save_uint_32(fdata + 8, height);
+      png_save_uint_32(fdata + 12, 0);
+      png_save_uint_32(fdata + 16, 0);
+      png_save_uint_16(fdata + 20, frame.frame_info.duration *
+                                       ppf.info.animation.tps_denominator);
+      png_save_uint_16(fdata + 22, ppf.info.animation.tps_numerator);
+      fdata[24] = 1;
+      fdata[25] = 0;
+      png_byte fctl[5] = "fcTL";
+      png_write_chunk(png_ptr, fctl, fdata, 26);
+    }
+
+    std::vector<uint8_t*> rows(height);
+    for (int y = 0; y < height; ++y) {
+      rows[y] = out.data() + y * out_stride;
+    }
+
+    png_write_flush(png_ptr);
+    const size_t pos = bytes->size();
+    png_write_image(png_ptr, &rows[0]);
+    png_write_flush(png_ptr);
+    if (count > 0) {
+      std::vector<uint8_t> fdata(4);
+      png_save_uint_32(fdata.data(), anim_chunks++);
+      size_t p = pos;
+      while (p + 8 < bytes->size()) {
+        size_t len = png_get_uint_32(bytes->data() + p);
+        JXL_ASSERT(bytes->operator[](p + 4) == 'I');
+        JXL_ASSERT(bytes->operator[](p + 5) == 'D');
+        JXL_ASSERT(bytes->operator[](p + 6) == 'A');
+        JXL_ASSERT(bytes->operator[](p + 7) == 'T');
+        fdata.insert(fdata.end(), bytes->data() + p + 8,
+                     bytes->data() + p + 8 + len);
+        p += len + 12;
+      }
+      bytes->resize(pos);
+
+      png_byte fdat[5] = "fdAT";
+      png_write_chunk(png_ptr, fdat, fdata.data(), fdata.size());
+    }
+
+    count++;
+    if (count == ppf.frames.size() || !ppf.info.have_animation) {
+      png_write_end(png_ptr, NULL);
+    }
+
+    png_destroy_write_struct(&png_ptr, &info_ptr);
+  }
+
+  return true;
+}
+
+}  // namespace
+
+std::unique_ptr<Encoder> GetAPNGEncoder() {
+  return jxl::make_unique<APNGEncoder>();
+}
+
+}  // namespace extras
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/extras/enc/apng.h b/third_party/jpeg-xl/lib/extras/enc/apng.h
new file mode 100644
index 0000000000..2a2139c8fa
--- /dev/null
+++ b/third_party/jpeg-xl/lib/extras/enc/apng.h
@@ -0,0 +1,23 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_EXTRAS_ENC_APNG_H_
+#define LIB_EXTRAS_ENC_APNG_H_
+
+// Encodes APNG images in memory.
+
+#include <memory>
+
+#include "lib/extras/enc/encode.h"
+
+namespace jxl {
+namespace extras {
+
+std::unique_ptr<Encoder> GetAPNGEncoder();
+
+}  // namespace extras
+}  // namespace jxl
+
+#endif  // LIB_EXTRAS_ENC_APNG_H_
diff --git a/third_party/jpeg-xl/lib/extras/enc/encode.cc b/third_party/jpeg-xl/lib/extras/enc/encode.cc
new file mode 100644
index 0000000000..9ffba9d0dd
--- /dev/null
+++ b/third_party/jpeg-xl/lib/extras/enc/encode.cc
@@ -0,0 +1,170 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/extras/enc/encode.h"
+
+#include <locale>
+
+#if JPEGXL_ENABLE_APNG
+#include "lib/extras/enc/apng.h"
+#endif
+#if JPEGXL_ENABLE_EXR
+#include "lib/extras/enc/exr.h"
+#endif
+#if JPEGXL_ENABLE_JPEG
+#include "lib/extras/enc/jpg.h"
+#endif
+#include "lib/extras/enc/npy.h"
+#include "lib/extras/enc/pgx.h"
+#include "lib/extras/enc/pnm.h"
+#include "lib/jxl/base/printf_macros.h"
+
+namespace jxl {
+namespace extras {
+
+Status Encoder::VerifyBasicInfo(const JxlBasicInfo& info) {
+  if (info.xsize == 0 || info.ysize == 0) {
+    return JXL_FAILURE("Empty image");
+  }
+  if (info.num_color_channels != 1 && info.num_color_channels != 3) {
+    return JXL_FAILURE("Invalid number of color channels");
+  }
+  if (info.alpha_bits > 0 && info.alpha_bits != info.bits_per_sample) {
+    return JXL_FAILURE("Alpha bit depth does not match image bit depth");
+  }
+  if (info.orientation != JXL_ORIENT_IDENTITY) {
+    return JXL_FAILURE("Orientation must be identity");
+  }
+  return true;
+}
+
+Status Encoder::VerifyFormat(const JxlPixelFormat& format) const {
+  for (auto f : AcceptedFormats()) {
+    if (f.num_channels != format.num_channels) continue;
+    if (f.data_type != format.data_type) continue;
+    if (f.data_type == JXL_TYPE_UINT8 || f.endianness == format.endianness) {
+      return true;
+    }
+  }
+  return JXL_FAILURE("Format is not in the list of accepted formats.");
+}
+
+Status Encoder::VerifyBitDepth(JxlDataType data_type, uint32_t bits_per_sample,
+                               uint32_t exponent_bits) {
+  if ((data_type == JXL_TYPE_UINT8 &&
+       (bits_per_sample == 0 || bits_per_sample > 8 || exponent_bits != 0)) ||
+      (data_type == JXL_TYPE_UINT16 &&
+       (bits_per_sample <= 8 || bits_per_sample > 16 || exponent_bits != 0)) ||
+      (data_type == JXL_TYPE_FLOAT16 &&
+       (bits_per_sample != 16 || exponent_bits != 5)) ||
+      (data_type == JXL_TYPE_FLOAT &&
+       (bits_per_sample != 32 || exponent_bits != 8))) {
+    return JXL_FAILURE(
+        "Incompatible data_type %d and bit depth %u with exponent bits %u",
+        (int)data_type, bits_per_sample, exponent_bits);
+  }
+  return true;
+}
+
+Status Encoder::VerifyImageSize(const PackedImage& image,
+                                const JxlBasicInfo& info) {
+  if (image.pixels() == nullptr) {
+    return JXL_FAILURE("Invalid image.");
+  }
+  if (image.stride != image.xsize * image.pixel_stride()) {
+    return JXL_FAILURE("Invalid image stride.");
+  }
+  if (image.pixels_size != image.ysize * image.stride) {
+    return JXL_FAILURE("Invalid image size.");
+  }
+  size_t info_num_channels =
+      (info.num_color_channels + (info.alpha_bits > 0 ? 1 : 0));
+  if (image.xsize != info.xsize || image.ysize != info.ysize ||
+      image.format.num_channels != info_num_channels) {
+    return JXL_FAILURE("Frame size does not match image size");
+  }
+  return true;
+}
+
+Status Encoder::VerifyPackedImage(const PackedImage& image,
+                                  const JxlBasicInfo& info) const {
+  JXL_RETURN_IF_ERROR(VerifyImageSize(image, info));
+  JXL_RETURN_IF_ERROR(VerifyFormat(image.format));
+  JXL_RETURN_IF_ERROR(VerifyBitDepth(image.format.data_type,
+                                     info.bits_per_sample,
+                                     info.exponent_bits_per_sample));
+  return true;
+}
+
+Status SelectFormat(const std::vector<JxlPixelFormat>& accepted_formats,
+                    const JxlBasicInfo& basic_info, JxlPixelFormat* format) {
+  const size_t original_bit_depth = basic_info.bits_per_sample;
+  size_t current_bit_depth = 0;
+  size_t num_alpha_channels = (basic_info.alpha_bits != 0 ? 1 : 0);
+  size_t num_channels = basic_info.num_color_channels + num_alpha_channels;
+  for (;;) {
+    for (const JxlPixelFormat& candidate : accepted_formats) {
+      if (candidate.num_channels != num_channels) continue;
+      const size_t candidate_bit_depth =
+          PackedImage::BitsPerChannel(candidate.data_type);
+      if (
+          // Candidate bit depth is less than what we have and still enough
+          (original_bit_depth <= candidate_bit_depth &&
+           candidate_bit_depth < current_bit_depth) ||
+          // Or larger than the too-small bit depth we currently have
+          (current_bit_depth < candidate_bit_depth &&
+           current_bit_depth < original_bit_depth)) {
+        *format = candidate;
+        current_bit_depth = candidate_bit_depth;
+      }
+    }
+    if (current_bit_depth == 0) {
+      if (num_channels > basic_info.num_color_channels) {
+        // Try dropping the alpha channel.
+        --num_channels;
+        continue;
+      }
+      return JXL_FAILURE("no appropriate format found");
+    }
+    break;
+  }
+  if (current_bit_depth < original_bit_depth) {
+    JXL_WARNING("encoding %" PRIuS "-bit original to %" PRIuS " bits",
+                original_bit_depth, current_bit_depth);
+  }
+  return true;
+}
+
+std::unique_ptr<Encoder> Encoder::FromExtension(std::string extension) {
+  std::transform(
+      extension.begin(), extension.end(), extension.begin(),
+      [](char c) { return std::tolower(c, std::locale::classic()); });
+#if JPEGXL_ENABLE_APNG
+  if (extension == ".png" || extension == ".apng") return GetAPNGEncoder();
+#endif
+
+#if JPEGXL_ENABLE_JPEG
+  if (extension == ".jpg") return GetJPEGEncoder();
+  if (extension == ".jpeg") return GetJPEGEncoder();
+#endif
+
+  if (extension == ".npy") return GetNumPyEncoder();
+
+  if (extension == ".pgx") return GetPGXEncoder();
+
+  if (extension == ".pam") return GetPAMEncoder();
+  if (extension == ".pgm") return GetPGMEncoder();
+  if (extension == ".ppm") return GetPPMEncoder();
+  if (extension == ".pfm") return GetPFMEncoder();
+
+#if JPEGXL_ENABLE_EXR
+  if (extension == ".exr") return GetEXREncoder();
+#endif
+
+  return nullptr;
+}
+
+}  // namespace extras
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/extras/enc/encode.h b/third_party/jpeg-xl/lib/extras/enc/encode.h
new file mode 100644
index 0000000000..63cabaf30f
--- /dev/null
+++ b/third_party/jpeg-xl/lib/extras/enc/encode.h
@@ -0,0 +1,83 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_EXTRAS_ENC_ENCODE_H_
+#define LIB_EXTRAS_ENC_ENCODE_H_
+
+// Facade for image encoders.
+
+#include <string>
+#include <unordered_map>
+
+#include "lib/extras/dec/decode.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/status.h"
+
+namespace jxl {
+namespace extras {
+
+struct EncodedImage {
+  // One (if the format supports animations or the image has only one frame) or
+  // more sequential bitstreams.
+  std::vector<std::vector<uint8_t>> bitstreams;
+
+  // For each extra channel one or more sequential bitstreams.
+  std::vector<std::vector<std::vector<uint8_t>>> extra_channel_bitstreams;
+
+  std::vector<uint8_t> preview_bitstream;
+
+  // If the format does not support embedding color profiles into the bitstreams
+  // above, it will be present here, to be written as a separate file. If it
+  // does support them, this field will be empty.
+  std::vector<uint8_t> icc;
+
+  // Additional output for conformance testing, only filled in by NumPyEncoder.
+  std::vector<uint8_t> metadata;
+};
+
+class Encoder {
+ public:
+  static std::unique_ptr<Encoder> FromExtension(std::string extension);
+
+  virtual ~Encoder() = default;
+
+  virtual std::vector<JxlPixelFormat> AcceptedFormats() const = 0;
+
+  // Any existing data in encoded_image is discarded.
+  virtual Status Encode(const PackedPixelFile& ppf, EncodedImage* encoded_image,
+                        ThreadPool* pool = nullptr) const = 0;
+
+  void SetOption(std::string name, std::string value) {
+    options_[std::move(name)] = std::move(value);
+  }
+
+  static Status VerifyBasicInfo(const JxlBasicInfo& info);
+  static Status VerifyImageSize(const PackedImage& image,
+                                const JxlBasicInfo& info);
+  static Status VerifyBitDepth(JxlDataType data_type, uint32_t bits_per_sample,
+                               uint32_t exponent_bits);
+
+ protected:
+  const std::unordered_map<std::string, std::string>& options() const {
+    return options_;
+  }
+
+  Status VerifyFormat(const JxlPixelFormat& format) const;
+
+  Status VerifyPackedImage(const PackedImage& image,
+                           const JxlBasicInfo& info) const;
+
+ private:
+  std::unordered_map<std::string, std::string> options_;
+};
+
+// TODO(sboukortt): consider exposing this as part of the C API.
+Status SelectFormat(const std::vector<JxlPixelFormat>& accepted_formats,
+                    const JxlBasicInfo& basic_info, JxlPixelFormat* format);
+
+}  // namespace extras
+}  // namespace jxl
+
+#endif  // LIB_EXTRAS_ENC_ENCODE_H_
diff --git a/third_party/jpeg-xl/lib/extras/enc/exr.cc b/third_party/jpeg-xl/lib/extras/enc/exr.cc
new file mode 100644
index 0000000000..1d70913936
--- /dev/null
+++ b/third_party/jpeg-xl/lib/extras/enc/exr.cc
@@ -0,0 +1,200 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/extras/enc/exr.h"
+
+#include <ImfChromaticitiesAttribute.h>
+#include <ImfIO.h>
+#include <ImfRgbaFile.h>
+#include <ImfStandardAttributes.h>
+#include <jxl/codestream_header.h>
+
+#include <vector>
+
+#include "lib/extras/packed_image.h"
+#include "lib/jxl/base/byte_order.h"
+
+namespace jxl {
+namespace extras {
+
+namespace {
+
+namespace OpenEXR = OPENEXR_IMF_NAMESPACE;
+namespace Imath = IMATH_NAMESPACE;
+
+// OpenEXR::Int64 is deprecated in favor of using uint64_t directly, but using
+// uint64_t as recommended causes build failures with previous OpenEXR versions
+// on macOS, where the definition for OpenEXR::Int64 was actually not equivalent
+// to uint64_t. This alternative should work in all cases.
+using ExrInt64 = decltype(std::declval<OpenEXR::IStream>().tellg());
+
+class InMemoryOStream : public OpenEXR::OStream {
+ public:
+  // `bytes` must outlive the InMemoryOStream.
+  explicit InMemoryOStream(std::vector<uint8_t>* const bytes)
+      : OStream(/*fileName=*/""), bytes_(*bytes) {}
+
+  void write(const char c[], const int n) override {
+    if (bytes_.size() < pos_ + n) {
+      bytes_.resize(pos_ + n);
+    }
+    std::copy_n(c, n, bytes_.begin() + pos_);
+    pos_ += n;
+  }
+
+  ExrInt64 tellp() override { return pos_; }
+  void seekp(const ExrInt64 pos) override {
+    if (bytes_.size() + 1 < pos) {
+      bytes_.resize(pos - 1);
+    }
+    pos_ = pos;
+  }
+
+ private:
+  std::vector<uint8_t>& bytes_;
+  size_t pos_ = 0;
+};
+
+// Loads a Big-Endian float
+float LoadBEFloat(const uint8_t* p) {
+  uint32_t u = LoadBE32(p);
+  float result;
+  memcpy(&result, &u, 4);
+  return result;
+}
+
+// Loads a Little-Endian float
+float LoadLEFloat(const uint8_t* p) {
+  uint32_t u = LoadLE32(p);
+  float result;
+  memcpy(&result, &u, 4);
+  return result;
+}
+
+Status EncodeImageEXR(const PackedImage& image, const JxlBasicInfo& info,
+                      const JxlColorEncoding& c_enc, ThreadPool* pool,
+                      std::vector<uint8_t>* bytes) {
+  OpenEXR::setGlobalThreadCount(0);
+
+  const size_t xsize = info.xsize;
+  const size_t ysize = info.ysize;
+  const bool has_alpha = info.alpha_bits > 0;
+  const bool alpha_is_premultiplied = info.alpha_premultiplied;
+
+  if (info.num_color_channels != 3 ||
+      c_enc.color_space != JXL_COLOR_SPACE_RGB ||
+      c_enc.transfer_function != JXL_TRANSFER_FUNCTION_LINEAR) {
+    return JXL_FAILURE("Unsupported color encoding for OpenEXR output.");
+  }
+
+  const size_t num_channels = 3 + (has_alpha ? 1 : 0);
+  const JxlPixelFormat format = image.format;
+
+  if (format.data_type != JXL_TYPE_FLOAT) {
+    return JXL_FAILURE("Unsupported pixel format for OpenEXR output");
+  }
+
+  const uint8_t* in = reinterpret_cast<const uint8_t*>(image.pixels());
+  size_t in_stride = num_channels * 4 * xsize;
+
+  OpenEXR::Header header(xsize, ysize);
+  OpenEXR::Chromaticities chromaticities;
+  chromaticities.red =
+      Imath::V2f(c_enc.primaries_red_xy[0], c_enc.primaries_red_xy[1]);
+  chromaticities.green =
+      Imath::V2f(c_enc.primaries_green_xy[0], c_enc.primaries_green_xy[1]);
+  chromaticities.blue =
+      Imath::V2f(c_enc.primaries_blue_xy[0], c_enc.primaries_blue_xy[1]);
+  chromaticities.white =
+      Imath::V2f(c_enc.white_point_xy[0], c_enc.white_point_xy[1]);
+  OpenEXR::addChromaticities(header, chromaticities);
+  OpenEXR::addWhiteLuminance(header, info.intensity_target);
+
+  auto loadFloat =
+      format.endianness == JXL_BIG_ENDIAN ? LoadBEFloat : LoadLEFloat;
+  auto loadAlpha =
+      has_alpha ? loadFloat : [](const uint8_t* p) -> float { return 1.0f; };
+
+  // Ensure that the destructor of RgbaOutputFile has run before we look at the
+  // size of `bytes`.
+  {
+    InMemoryOStream os(bytes);
+    OpenEXR::RgbaOutputFile output(
+        os, header, has_alpha ? OpenEXR::WRITE_RGBA : OpenEXR::WRITE_RGB);
+    // How many rows to write at once. Again, the OpenEXR documentation
+    // recommends writing the whole image in one call.
+    const int y_chunk_size = ysize;
+    std::vector<OpenEXR::Rgba> output_rows(xsize * y_chunk_size);
+
+    for (size_t start_y = 0; start_y < ysize; start_y += y_chunk_size) {
+      // Inclusive.
+      const size_t end_y = std::min(start_y + y_chunk_size - 1, ysize - 1);
+      output.setFrameBuffer(output_rows.data() - start_y * xsize,
+                            /*xStride=*/1, /*yStride=*/xsize);
+      for (size_t y = start_y; y <= end_y; ++y) {
+        const uint8_t* in_row = &in[(y - start_y) * in_stride];
+        OpenEXR::Rgba* const JXL_RESTRICT row_data =
+            &output_rows[(y - start_y) * xsize];
+        for (size_t x = 0; x < xsize; ++x) {
+          const uint8_t* in_pixel = &in_row[4 * num_channels * x];
+          float r = loadFloat(&in_pixel[0]);
+          float g = loadFloat(&in_pixel[4]);
+          float b = loadFloat(&in_pixel[8]);
+          const float alpha = loadAlpha(&in_pixel[12]);
+          if (!alpha_is_premultiplied) {
+            r *= alpha;
+            g *= alpha;
+            b *= alpha;
+          }
+          row_data[x] = OpenEXR::Rgba(r, g, b, alpha);
+        }
+      }
+      output.writePixels(/*numScanLines=*/end_y - start_y + 1);
+    }
+  }
+
+  return true;
+}
+
+class EXREncoder : public Encoder {
+  std::vector<JxlPixelFormat> AcceptedFormats() const override {
+    std::vector<JxlPixelFormat> formats;
+    for (const uint32_t num_channels : {1, 2, 3, 4}) {
+      for (const JxlDataType data_type : {JXL_TYPE_FLOAT, JXL_TYPE_FLOAT16}) {
+        for (JxlEndianness endianness : {JXL_BIG_ENDIAN, JXL_LITTLE_ENDIAN}) {
+          formats.push_back(JxlPixelFormat{/*num_channels=*/num_channels,
+                                           /*data_type=*/data_type,
+                                           /*endianness=*/endianness,
+                                           /*align=*/0});
+        }
+      }
+    }
+    return formats;
+  }
+  Status Encode(const PackedPixelFile& ppf, EncodedImage* encoded_image,
+                ThreadPool* pool = nullptr) const override {
+    JXL_RETURN_IF_ERROR(VerifyBasicInfo(ppf.info));
+    encoded_image->icc.clear();
+    encoded_image->bitstreams.clear();
+    encoded_image->bitstreams.reserve(ppf.frames.size());
+    for (const auto& frame : ppf.frames) {
+      JXL_RETURN_IF_ERROR(VerifyPackedImage(frame.color, ppf.info));
+      encoded_image->bitstreams.emplace_back();
+      JXL_RETURN_IF_ERROR(EncodeImageEXR(frame.color, ppf.info,
+                                         ppf.color_encoding, pool,
+                                         &encoded_image->bitstreams.back()));
+    }
+    return true;
+  }
+};
+
+}  // namespace
+
+std::unique_ptr<Encoder> GetEXREncoder() {
+  return jxl::make_unique<EXREncoder>();
+}
+
+}  // namespace extras
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/extras/enc/exr.h b/third_party/jpeg-xl/lib/extras/enc/exr.h
new file mode 100644
index 0000000000..1baaa0272f
--- /dev/null
+++ b/third_party/jpeg-xl/lib/extras/enc/exr.h
@@ -0,0 +1,23 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_EXTRAS_ENC_EXR_H_
+#define LIB_EXTRAS_ENC_EXR_H_
+
+// Encodes OpenEXR images in memory.
+
+#include <memory>
+
+#include "lib/extras/enc/encode.h"
+
+namespace jxl {
+namespace extras {
+
+std::unique_ptr<Encoder> GetEXREncoder();
+
+}  // namespace extras
+}  // namespace jxl
+
+#endif  // LIB_EXTRAS_ENC_EXR_H_
diff --git a/third_party/jpeg-xl/lib/extras/enc/jpegli.cc b/third_party/jpeg-xl/lib/extras/enc/jpegli.cc
new file mode 100644
index 0000000000..43cf32a19c
--- /dev/null
+++ b/third_party/jpeg-xl/lib/extras/enc/jpegli.cc
@@ -0,0 +1,503 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/extras/enc/jpegli.h"
+
+#include <jxl/codestream_header.h>
+#include <setjmp.h>
+#include <stdint.h>
+
+#include "lib/extras/enc/encode.h"
+#include "lib/jpegli/encode.h"
+#include "lib/jxl/enc_color_management.h"
+#include "lib/jxl/enc_xyb.h"
+
+namespace jxl {
+namespace extras {
+
+namespace {
+
+void MyErrorExit(j_common_ptr cinfo) {
+  jmp_buf* env = static_cast<jmp_buf*>(cinfo->client_data);
+  (*cinfo->err->output_message)(cinfo);
+  jpegli_destroy_compress(reinterpret_cast<j_compress_ptr>(cinfo));
+  longjmp(*env, 1);
+}
+
+Status VerifyInput(const PackedPixelFile& ppf) {
+  const JxlBasicInfo& info = ppf.info;
+  JXL_RETURN_IF_ERROR(Encoder::VerifyBasicInfo(info));
+  if (info.alpha_bits > 0) {
+    return JXL_FAILURE("Alpha is not supported for JPEG output.");
+  }
+  if (ppf.frames.size() != 1) {
+    return JXL_FAILURE("JPEG input must have exactly one frame.");
+  }
+  const PackedImage& image = ppf.frames[0].color;
+  JXL_RETURN_IF_ERROR(Encoder::VerifyImageSize(image, info));
+  if (image.format.data_type == JXL_TYPE_FLOAT16) {
+    return JXL_FAILURE("FLOAT16 input is not supported.");
+  }
+  JXL_RETURN_IF_ERROR(Encoder::VerifyBitDepth(image.format.data_type,
+                                              info.bits_per_sample,
+                                              info.exponent_bits_per_sample));
+  if ((image.format.data_type == JXL_TYPE_UINT8 && info.bits_per_sample != 8) ||
+      (image.format.data_type == JXL_TYPE_UINT16 &&
+       info.bits_per_sample != 16)) {
+    return JXL_FAILURE("Only full bit depth unsigned types are supported.");
+  }
+  return true;
+}
+
+Status GetColorEncoding(const PackedPixelFile& ppf,
+                        ColorEncoding* color_encoding) {
+  if (!ppf.icc.empty()) {
+    PaddedBytes icc;
+    icc.assign(ppf.icc.data(), ppf.icc.data() + ppf.icc.size());
+    JXL_RETURN_IF_ERROR(color_encoding->SetICC(std::move(icc)));
+  } else {
+    JXL_RETURN_IF_ERROR(ConvertExternalToInternalColorEncoding(
+        ppf.color_encoding, color_encoding));
+  }
+  if (color_encoding->ICC().empty()) {
+    return JXL_FAILURE("Invalid color encoding.");
+  }
+  return true;
+}
+
+bool HasICCProfile(const std::vector<uint8_t>& app_data) {
+  size_t pos = 0;
+  while (pos < app_data.size()) {
+    if (pos + 16 > app_data.size()) return false;
+    uint8_t marker = app_data[pos + 1];
+    size_t marker_len = (app_data[pos + 2] << 8) + app_data[pos + 3] + 2;
+    if (marker == 0xe2 && memcmp(&app_data[pos + 4], "ICC_PROFILE", 12) == 0) {
+      return true;
+    }
+    pos += marker_len;
+  }
+  return false;
+}
+
+Status WriteAppData(j_compress_ptr cinfo,
+                    const std::vector<uint8_t>& app_data) {
+  size_t pos = 0;
+  while (pos < app_data.size()) {
+    if (pos + 4 > app_data.size()) {
+      return JXL_FAILURE("Incomplete APP header.");
+    }
+    uint8_t marker = app_data[pos + 1];
+    size_t marker_len = (app_data[pos + 2] << 8) + app_data[pos + 3] + 2;
+    if (app_data[pos] != 0xff || marker < 0xe0 || marker > 0xef) {
+      return JXL_FAILURE("Invalid APP marker %02x %02x", app_data[pos], marker);
+    }
+    if (marker_len <= 4) {
+      return JXL_FAILURE("Invalid APP marker length.");
+    }
+    if (pos + marker_len > app_data.size()) {
+      return JXL_FAILURE("Incomplete APP data");
+    }
+    jpegli_write_marker(cinfo, marker, &app_data[pos + 4], marker_len - 4);
+    pos += marker_len;
+  }
+  return true;
+}
+
+static constexpr int kICCMarker = 0xe2;
+constexpr unsigned char kICCSignature[12] = {
+    0x49, 0x43, 0x43, 0x5F, 0x50, 0x52, 0x4F, 0x46, 0x49, 0x4C, 0x45, 0x00};
+static constexpr uint8_t kUnknownTf = 2;
+static constexpr unsigned char kCICPTagSignature[4] = {0x63, 0x69, 0x63, 0x70};
+static constexpr size_t kCICPTagSize = 12;
+
+bool FindCICPTag(const uint8_t* icc_data, size_t len, bool is_first_chunk,
+                 size_t* cicp_offset, size_t* cicp_length, uint8_t* cicp_tag,
+                 size_t* cicp_pos) {
+  if (is_first_chunk) {
+    // Look up the offset of the CICP tag from the first chunk of ICC data.
+    if (len < 132) {
+      return false;
+    }
+    uint32_t tag_count = LoadBE32(&icc_data[128]);
+    if (len < 132 + 12 * tag_count) {
+      return false;
+    }
+    for (uint32_t i = 0; i < tag_count; ++i) {
+      if (memcmp(&icc_data[132 + 12 * i], kCICPTagSignature, 4) == 0) {
+        *cicp_offset = LoadBE32(&icc_data[136 + 12 * i]);
+        *cicp_length = LoadBE32(&icc_data[140 + 12 * i]);
+      }
+    }
+    if (*cicp_length < kCICPTagSize) {
+      return false;
+    }
+  }
+  if (*cicp_offset < len) {
+    size_t n_bytes = std::min(len - *cicp_offset, kCICPTagSize - *cicp_pos);
+    memcpy(&cicp_tag[*cicp_pos], &icc_data[*cicp_offset], n_bytes);
+    *cicp_pos += n_bytes;
+    *cicp_offset = 0;
+  } else {
+    *cicp_offset -= len;
+  }
+  return true;
+}
+
+uint8_t LookupCICPTransferFunctionFromAppData(const uint8_t* app_data,
+                                              size_t len) {
+  size_t last_index = 0;
+  size_t cicp_offset = 0;
+  size_t cicp_length = 0;
+  uint8_t cicp_tag[kCICPTagSize] = {};
+  size_t cicp_pos = 0;
+  size_t pos = 0;
+  while (pos < len) {
+    const uint8_t* marker = &app_data[pos];
+    if (pos + 4 > len) {
+      return kUnknownTf;
+    }
+    size_t marker_size = (marker[2] << 8) + marker[3] + 2;
+    if (pos + marker_size > len) {
+      return kUnknownTf;
+    }
+    if (marker_size < 18 || marker[0] != 0xff || marker[1] != kICCMarker ||
+        memcmp(&marker[4], kICCSignature, 12) != 0) {
+      pos += marker_size;
+      continue;
+    }
+    uint8_t index = marker[16];
+    uint8_t total = marker[17];
+    const uint8_t* payload = marker + 18;
+    const size_t payload_size = marker_size - 18;
+    if (index != last_index + 1 || index > total) {
+      return kUnknownTf;
+    }
+    if (!FindCICPTag(payload, payload_size, last_index == 0, &cicp_offset,
+                     &cicp_length, &cicp_tag[0], &cicp_pos)) {
+      return kUnknownTf;
+    }
+    if (cicp_pos == kCICPTagSize) {
+      break;
+    }
+    ++last_index;
+  }
+  if (cicp_pos >= kCICPTagSize && memcmp(cicp_tag, kCICPTagSignature, 4) == 0) {
+    return cicp_tag[9];
+  }
+  return kUnknownTf;
+}
+
+uint8_t LookupCICPTransferFunctionFromICCProfile(const uint8_t* icc_data,
+                                                 size_t len) {
+  size_t cicp_offset = 0;
+  size_t cicp_length = 0;
+  uint8_t cicp_tag[kCICPTagSize] = {};
+  size_t cicp_pos = 0;
+  if (!FindCICPTag(icc_data, len, true, &cicp_offset, &cicp_length,
+                   &cicp_tag[0], &cicp_pos)) {
+    return kUnknownTf;
+  }
+  if (cicp_pos >= kCICPTagSize && memcmp(cicp_tag, kCICPTagSignature, 4) == 0) {
+    return cicp_tag[9];
+  }
+  return kUnknownTf;
+}
+
+JpegliDataType ConvertDataType(JxlDataType type) {
+  switch (type) {
+    case JXL_TYPE_UINT8:
+      return JPEGLI_TYPE_UINT8;
+    case JXL_TYPE_UINT16:
+      return JPEGLI_TYPE_UINT16;
+    case JXL_TYPE_FLOAT:
+      return JPEGLI_TYPE_FLOAT;
+    default:
+      return JPEGLI_TYPE_UINT8;
+  }
+}
+
+JpegliEndianness ConvertEndianness(JxlEndianness endianness) {
+  switch (endianness) {
+    case JXL_NATIVE_ENDIAN:
+      return JPEGLI_NATIVE_ENDIAN;
+    case JXL_LITTLE_ENDIAN:
+      return JPEGLI_LITTLE_ENDIAN;
+    case JXL_BIG_ENDIAN:
+      return JPEGLI_BIG_ENDIAN;
+    default:
+      return JPEGLI_NATIVE_ENDIAN;
+  }
+}
+
+void ToFloatRow(const uint8_t* row_in, JxlPixelFormat format, size_t len,
+                float* row_out) {
+  bool is_little_endian =
+      (format.endianness == JXL_LITTLE_ENDIAN ||
+       (format.endianness == JXL_NATIVE_ENDIAN && IsLittleEndian()));
+  static constexpr double kMul8 = 1.0 / 255.0;
+  static constexpr double kMul16 = 1.0 / 65535.0;
+  if (format.data_type == JXL_TYPE_UINT8) {
+    for (size_t x = 0; x < len; ++x) {
+      row_out[x] = row_in[x] * kMul8;
+    }
+  } else if (format.data_type == JXL_TYPE_UINT16 && is_little_endian) {
+    for (size_t x = 0; x < len; ++x) {
+      row_out[x] = LoadLE16(&row_in[2 * x]) * kMul16;
+    }
+  } else if (format.data_type == JXL_TYPE_UINT16 && !is_little_endian) {
+    for (size_t x = 0; x < len; ++x) {
+      row_out[x] = LoadBE16(&row_in[2 * x]) * kMul16;
+    }
+  } else if (format.data_type == JXL_TYPE_FLOAT && is_little_endian) {
+    for (size_t x = 0; x < len; ++x) {
+      row_out[x] = LoadLEFloat(&row_in[4 * x]);
+    }
+  } else if (format.data_type == JXL_TYPE_FLOAT && !is_little_endian) {
+    for (size_t x = 0; x < len; ++x) {
+      row_out[x] = LoadBEFloat(&row_in[4 * x]);
+    }
+  }
+}
+
+Status EncodeJpegToTargetSize(const PackedPixelFile& ppf,
+                              const JpegSettings& jpeg_settings,
+                              size_t target_size, ThreadPool* pool,
+                              std::vector<uint8_t>* output) {
+  output->clear();
+  size_t best_error = std::numeric_limits<size_t>::max();
+  float distance0 = -1.0f;
+  float distance1 = -1.0f;
+  float distance = 1.0f;
+  for (int step = 0; step < 15; ++step) {
+    JpegSettings settings = jpeg_settings;
+    settings.libjpeg_quality = 0;
+    settings.distance = distance;
+    settings.target_size = 0;
+    std::vector<uint8_t> compressed;
+    JXL_RETURN_IF_ERROR(EncodeJpeg(ppf, settings, pool, &compressed));
+    size_t size = compressed.size();
+    // prefer being under the target size to being over it
+    size_t error = size < target_size
+                       ? target_size - size
+                       : static_cast<size_t>(1.2f * (size - target_size));
+    if (error < best_error) {
+      best_error = error;
+      std::swap(*output, compressed);
+    }
+    float rel_error = size * 1.0f / target_size;
+    if (std::abs(rel_error - 1.0f) < 0.002f) {
+      break;
+    }
+    if (size < target_size) {
+      distance1 = distance;
+    } else {
+      distance0 = distance;
+    }
+    if (distance1 == -1) {
+      distance *= std::pow(rel_error, 1.5) * 1.05;
+    } else if (distance0 == -1) {
+      distance *= std::pow(rel_error, 1.5) * 0.95;
+    } else {
+      distance = 0.5 * (distance0 + distance1);
+    }
+  }
+  return true;
+}
+
+}  // namespace
+
+Status EncodeJpeg(const PackedPixelFile& ppf, const JpegSettings& jpeg_settings,
+                  ThreadPool* pool, std::vector<uint8_t>* compressed) {
+  if (jpeg_settings.libjpeg_quality > 0) {
+    auto encoder = Encoder::FromExtension(".jpg");
+    encoder->SetOption("q", std::to_string(jpeg_settings.libjpeg_quality));
+    if (!jpeg_settings.libjpeg_chroma_subsampling.empty()) {
+      encoder->SetOption("chroma_subsampling",
+                         jpeg_settings.libjpeg_chroma_subsampling);
+    }
+    EncodedImage encoded;
+    JXL_RETURN_IF_ERROR(encoder->Encode(ppf, &encoded, pool));
+    size_t target_size = encoded.bitstreams[0].size();
+    return EncodeJpegToTargetSize(ppf, jpeg_settings, target_size, pool,
+                                  compressed);
+  }
+  if (jpeg_settings.target_size > 0) {
+    return EncodeJpegToTargetSize(ppf, jpeg_settings, jpeg_settings.target_size,
+                                  pool, compressed);
+  }
+  JXL_RETURN_IF_ERROR(VerifyInput(ppf));
+
+  ColorEncoding color_encoding;
+  JXL_RETURN_IF_ERROR(GetColorEncoding(ppf, &color_encoding));
+
+  ColorSpaceTransform c_transform(GetJxlCms());
+  ColorEncoding xyb_encoding;
+  if (jpeg_settings.xyb) {
+    if (ppf.info.num_color_channels != 3) {
+      return JXL_FAILURE("Only RGB input is supported in XYB mode.");
+    }
+    if (HasICCProfile(jpeg_settings.app_data)) {
+      return JXL_FAILURE("APP data ICC profile is not supported in XYB mode.");
+    }
+    const ColorEncoding& c_desired = ColorEncoding::LinearSRGB(false);
+    JXL_RETURN_IF_ERROR(
+        c_transform.Init(color_encoding, c_desired, 255.0f, ppf.info.xsize, 1));
+    xyb_encoding.SetColorSpace(jxl::ColorSpace::kXYB);
+    xyb_encoding.rendering_intent = jxl::RenderingIntent::kPerceptual;
+    JXL_RETURN_IF_ERROR(xyb_encoding.CreateICC());
+  }
+  const ColorEncoding& output_encoding =
+      jpeg_settings.xyb ? xyb_encoding : color_encoding;
+
+  // We need to declare all the non-trivial destructor local variables
+  // before the call to setjmp().
+  std::vector<uint8_t> pixels;
+  unsigned char* output_buffer = nullptr;
+  unsigned long output_size = 0;
+  std::vector<uint8_t> row_bytes;
+  size_t rowlen = RoundUpTo(ppf.info.xsize, VectorSize());
+  hwy::AlignedFreeUniquePtr<float[]> xyb_tmp =
+      hwy::AllocateAligned<float>(6 * rowlen);
+  hwy::AlignedFreeUniquePtr<float[]> premul_absorb =
+      hwy::AllocateAligned<float>(VectorSize() * 12);
+  ComputePremulAbsorb(255.0f, premul_absorb.get());
+
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    jpeg_error_mgr jerr;
+    jmp_buf env;
+    cinfo.err = jpegli_std_error(&jerr);
+    jerr.error_exit = &MyErrorExit;
+    if (setjmp(env)) {
+      return false;
+    }
+    cinfo.client_data = static_cast<void*>(&env);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &output_buffer, &output_size);
+    const JxlBasicInfo& info = ppf.info;
+    cinfo.image_width = info.xsize;
+    cinfo.image_height = info.ysize;
+    cinfo.input_components = info.num_color_channels;
+    cinfo.in_color_space =
+        cinfo.input_components == 1 ? JCS_GRAYSCALE : JCS_RGB;
+    if (jpeg_settings.xyb) {
+      jpegli_set_xyb_mode(&cinfo);
+    } else if (jpeg_settings.use_std_quant_tables) {
+      jpegli_use_standard_quant_tables(&cinfo);
+    }
+    uint8_t cicp_tf = kUnknownTf;
+    if (!jpeg_settings.app_data.empty()) {
+      cicp_tf = LookupCICPTransferFunctionFromAppData(
+          jpeg_settings.app_data.data(), jpeg_settings.app_data.size());
+    } else if (!output_encoding.IsSRGB()) {
+      cicp_tf = LookupCICPTransferFunctionFromICCProfile(
+          output_encoding.ICC().data(), output_encoding.ICC().size());
+    }
+    jpegli_set_cicp_transfer_function(&cinfo, cicp_tf);
+    jpegli_set_defaults(&cinfo);
+    if (!jpeg_settings.chroma_subsampling.empty()) {
+      if (jpeg_settings.chroma_subsampling == "444") {
+        cinfo.comp_info[0].h_samp_factor = 1;
+        cinfo.comp_info[0].v_samp_factor = 1;
+      } else if (jpeg_settings.chroma_subsampling == "440") {
+        cinfo.comp_info[0].h_samp_factor = 1;
+        cinfo.comp_info[0].v_samp_factor = 2;
+      } else if (jpeg_settings.chroma_subsampling == "422") {
+        cinfo.comp_info[0].h_samp_factor = 2;
+        cinfo.comp_info[0].v_samp_factor = 1;
+      } else if (jpeg_settings.chroma_subsampling == "420") {
+        cinfo.comp_info[0].h_samp_factor = 2;
+        cinfo.comp_info[0].v_samp_factor = 2;
+      } else {
+        return false;
+      }
+      for (int i = 1; i < cinfo.num_components; ++i) {
+        cinfo.comp_info[i].h_samp_factor = 1;
+        cinfo.comp_info[i].v_samp_factor = 1;
+      }
+    }
+    jpegli_enable_adaptive_quantization(
+        &cinfo, jpeg_settings.use_adaptive_quantization);
+    jpegli_set_distance(&cinfo, jpeg_settings.distance, TRUE);
+    jpegli_set_progressive_level(&cinfo, jpeg_settings.progressive_level);
+    cinfo.optimize_coding = jpeg_settings.optimize_coding;
+    if (!jpeg_settings.app_data.empty()) {
+      // Make sure jpegli_start_compress() does not write any APP markers.
+      cinfo.write_JFIF_header = false;
+      cinfo.write_Adobe_marker = false;
+    }
+    const PackedImage& image = ppf.frames[0].color;
+    if (jpeg_settings.xyb) {
+      jpegli_set_input_format(&cinfo, JPEGLI_TYPE_FLOAT, JPEGLI_NATIVE_ENDIAN);
+    } else {
+      jpegli_set_input_format(&cinfo, ConvertDataType(image.format.data_type),
+                              ConvertEndianness(image.format.endianness));
+    }
+    jpegli_start_compress(&cinfo, TRUE);
+    if (!jpeg_settings.app_data.empty()) {
+      JXL_RETURN_IF_ERROR(WriteAppData(&cinfo, jpeg_settings.app_data));
+    }
+    if ((jpeg_settings.app_data.empty() && !output_encoding.IsSRGB()) ||
+        jpeg_settings.xyb) {
+      jpegli_write_icc_profile(&cinfo, output_encoding.ICC().data(),
+                               output_encoding.ICC().size());
+    }
+    const uint8_t* pixels = reinterpret_cast<const uint8_t*>(image.pixels());
+    if (jpeg_settings.xyb) {
+      float* src_buf = c_transform.BufSrc(0);
+      float* dst_buf = c_transform.BufDst(0);
+      for (size_t y = 0; y < image.ysize; ++y) {
+        // convert to float
+        ToFloatRow(&pixels[y * image.stride], image.format, 3 * image.xsize,
+                   src_buf);
+        // convert to linear srgb
+        if (!c_transform.Run(0, src_buf, dst_buf)) {
+          return false;
+        }
+        // deinterleave channels
+        float* row0 = &xyb_tmp[0];
+        float* row1 = &xyb_tmp[rowlen];
+        float* row2 = &xyb_tmp[2 * rowlen];
+        for (size_t x = 0; x < image.xsize; ++x) {
+          row0[x] = dst_buf[3 * x + 0];
+          row1[x] = dst_buf[3 * x + 1];
+          row2[x] = dst_buf[3 * x + 2];
+        }
+        // convert to xyb
+        LinearRGBRowToXYB(row0, row1, row2, premul_absorb.get(), image.xsize);
+        // scale xyb
+        ScaleXYBRow(row0, row1, row2, image.xsize);
+        // interleave channels
+        float* row_out = &xyb_tmp[3 * rowlen];
+        for (size_t x = 0; x < image.xsize; ++x) {
+          row_out[3 * x + 0] = row0[x];
+          row_out[3 * x + 1] = row1[x];
+          row_out[3 * x + 2] = row2[x];
+        }
+        // feed to jpegli as native endian floats
+        JSAMPROW row[] = {reinterpret_cast<uint8_t*>(row_out)};
+        jpegli_write_scanlines(&cinfo, row, 1);
+      }
+    } else {
+      row_bytes.resize(image.stride);
+      for (size_t y = 0; y < info.ysize; ++y) {
+        memcpy(&row_bytes[0], pixels + y * image.stride, image.stride);
+        JSAMPROW row[] = {row_bytes.data()};
+        jpegli_write_scanlines(&cinfo, row, 1);
+      }
+    }
+    jpegli_finish_compress(&cinfo);
+    compressed->resize(output_size);
+    std::copy_n(output_buffer, output_size, compressed->data());
+    return true;
+  };
+  bool success = try_catch_block();
+  jpegli_destroy_compress(&cinfo);
+  if (output_buffer) free(output_buffer);
+  return success;
+}
+
+}  // namespace extras
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/extras/enc/jpegli.h b/third_party/jpeg-xl/lib/extras/enc/jpegli.h
new file mode 100644
index 0000000000..194b9f7e48
--- /dev/null
+++ b/third_party/jpeg-xl/lib/extras/enc/jpegli.h
@@ -0,0 +1,47 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_EXTRAS_ENC_JPEGLI_H_
+#define LIB_EXTRAS_ENC_JPEGLI_H_
+
+// Encodes JPG pixels and metadata in memory using the libjpegli library.
+
+#include <stdint.h>
+
+#include <string>
+#include <vector>
+
+#include "lib/extras/packed_image.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/status.h"
+
+namespace jxl {
+namespace extras {
+
+struct JpegSettings {
+  bool xyb = false;
+  size_t target_size = 0;
+  float distance = 1.f;
+  bool use_adaptive_quantization = true;
+  bool use_std_quant_tables = false;
+  int progressive_level = 2;
+  bool optimize_coding = true;
+  std::string chroma_subsampling;
+  int libjpeg_quality = 0;
+  std::string libjpeg_chroma_subsampling;
+  // If not empty, must contain concatenated APP marker segments. In this case,
+  // these and only these APP marker segments will be written to the JPEG
+  // output. In xyb mode app_data must not contain an ICC profile, in this
+  // case an additional APP2 ICC profile for the XYB colorspace will be emitted.
+  std::vector<uint8_t> app_data;
+};
+
+Status EncodeJpeg(const PackedPixelFile& ppf, const JpegSettings& jpeg_settings,
+                  ThreadPool* pool, std::vector<uint8_t>* compressed);
+
+}  // namespace extras
+}  // namespace jxl
+
+#endif  // LIB_EXTRAS_ENC_JPEGLI_H_
diff --git a/third_party/jpeg-xl/lib/extras/enc/jpg.cc b/third_party/jpeg-xl/lib/extras/enc/jpg.cc
new file mode 100644
index 0000000000..179bcbe777
--- /dev/null
+++ b/third_party/jpeg-xl/lib/extras/enc/jpg.cc
@@ -0,0 +1,427 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/extras/enc/jpg.h"
+
+#include <jpeglib.h>
+#include <setjmp.h>
+#include <stdint.h>
+
+#include <algorithm>
+#include <array>
+#include <iterator>
+#include <numeric>
+#include <sstream>
+#include <utility>
+#include <vector>
+
+#include "lib/extras/exif.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/sanitizers.h"
+#if JPEGXL_ENABLE_SJPEG
+#include "sjpeg.h"
+#endif
+
+namespace jxl {
+namespace extras {
+
+namespace {
+
+constexpr unsigned char kICCSignature[12] = {
+    0x49, 0x43, 0x43, 0x5F, 0x50, 0x52, 0x4F, 0x46, 0x49, 0x4C, 0x45, 0x00};
+constexpr int kICCMarker = JPEG_APP0 + 2;
+constexpr size_t kMaxBytesInMarker = 65533;
+
+constexpr unsigned char kExifSignature[6] = {0x45, 0x78, 0x69,
+                                             0x66, 0x00, 0x00};
+constexpr int kExifMarker = JPEG_APP0 + 1;
+
+enum class JpegEncoder {
+  kLibJpeg,
+  kSJpeg,
+};
+
+#define ARRAY_SIZE(X) (sizeof(X) / sizeof((X)[0]))
+
+// Popular jpeg scan scripts
+// The fields of the individual scans are:
+// comps_in_scan, component_index[], Ss, Se, Ah, Al
+static constexpr jpeg_scan_info kScanScript1[] = {
+    {1, {0}, 0, 0, 0, 0},   //
+    {1, {1}, 0, 0, 0, 0},   //
+    {1, {2}, 0, 0, 0, 0},   //
+    {1, {0}, 1, 8, 0, 0},   //
+    {1, {0}, 9, 63, 0, 0},  //
+    {1, {1}, 1, 63, 0, 0},  //
+    {1, {2}, 1, 63, 0, 0},  //
+};
+static constexpr size_t kNumScans1 = ARRAY_SIZE(kScanScript1);
+
+static constexpr jpeg_scan_info kScanScript2[] = {
+    {1, {0}, 0, 0, 0, 0},   //
+    {1, {1}, 0, 0, 0, 0},   //
+    {1, {2}, 0, 0, 0, 0},   //
+    {1, {0}, 1, 2, 0, 1},   //
+    {1, {0}, 3, 63, 0, 1},  //
+    {1, {0}, 1, 63, 1, 0},  //
+    {1, {1}, 1, 63, 0, 0},  //
+    {1, {2}, 1, 63, 0, 0},  //
+};
+static constexpr size_t kNumScans2 = ARRAY_SIZE(kScanScript2);
+
+static constexpr jpeg_scan_info kScanScript3[] = {
+    {1, {0}, 0, 0, 0, 0},   //
+    {1, {1}, 0, 0, 0, 0},   //
+    {1, {2}, 0, 0, 0, 0},   //
+    {1, {0}, 1, 63, 0, 2},  //
+    {1, {0}, 1, 63, 2, 1},  //
+    {1, {0}, 1, 63, 1, 0},  //
+    {1, {1}, 1, 63, 0, 0},  //
+    {1, {2}, 1, 63, 0, 0},  //
+};
+static constexpr size_t kNumScans3 = ARRAY_SIZE(kScanScript3);
+
+static constexpr jpeg_scan_info kScanScript4[] = {
+    {3, {0, 1, 2}, 0, 0, 0, 1},  //
+    {1, {0}, 1, 5, 0, 2},        //
+    {1, {2}, 1, 63, 0, 1},       //
+    {1, {1}, 1, 63, 0, 1},       //
+    {1, {0}, 6, 63, 0, 2},       //
+    {1, {0}, 1, 63, 2, 1},       //
+    {3, {0, 1, 2}, 0, 0, 1, 0},  //
+    {1, {2}, 1, 63, 1, 0},       //
+    {1, {1}, 1, 63, 1, 0},       //
+    {1, {0}, 1, 63, 1, 0},       //
+};
+static constexpr size_t kNumScans4 = ARRAY_SIZE(kScanScript4);
+
+static constexpr jpeg_scan_info kScanScript5[] = {
+    {3, {0, 1, 2}, 0, 0, 0, 1},  //
+    {1, {0}, 1, 5, 0, 2},        //
+    {1, {1}, 1, 5, 0, 2},        //
+    {1, {2}, 1, 5, 0, 2},        //
+    {1, {1}, 6, 63, 0, 2},       //
+    {1, {2}, 6, 63, 0, 2},       //
+    {1, {0}, 6, 63, 0, 2},       //
+    {1, {0}, 1, 63, 2, 1},       //
+    {1, {1}, 1, 63, 2, 1},       //
+    {1, {2}, 1, 63, 2, 1},       //
+    {3, {0, 1, 2}, 0, 0, 1, 0},  //
+    {1, {0}, 1, 63, 1, 0},       //
+    {1, {1}, 1, 63, 1, 0},       //
+    {1, {2}, 1, 63, 1, 0},       //
+};
+static constexpr size_t kNumScans5 = ARRAY_SIZE(kScanScript5);
+
+// Adapt RGB scan info to grayscale jpegs.
+void FilterScanComponents(const jpeg_compress_struct* cinfo,
+                          jpeg_scan_info* si) {
+  const int all_comps_in_scan = si->comps_in_scan;
+  si->comps_in_scan = 0;
+  for (int j = 0; j < all_comps_in_scan; ++j) {
+    const int component = si->component_index[j];
+    if (component < cinfo->input_components) {
+      si->component_index[si->comps_in_scan++] = component;
+    }
+  }
+}
+
+Status SetJpegProgression(int progressive_id,
+                          std::vector<jpeg_scan_info>* scan_infos,
+                          jpeg_compress_struct* cinfo) {
+  if (progressive_id < 0) {
+    return true;
+  }
+  if (progressive_id == 0) {
+    jpeg_simple_progression(cinfo);
+    return true;
+  }
+  constexpr const jpeg_scan_info* kScanScripts[] = {
+      kScanScript1, kScanScript2, kScanScript3, kScanScript4, kScanScript5,
+  };
+  constexpr size_t kNumScans[] = {kNumScans1, kNumScans2, kNumScans3,
+                                  kNumScans4, kNumScans5};
+  if (progressive_id > static_cast<int>(ARRAY_SIZE(kNumScans))) {
+    return JXL_FAILURE("Unknown jpeg scan script id %d", progressive_id);
+  }
+  const jpeg_scan_info* scan_script = kScanScripts[progressive_id - 1];
+  const size_t num_scans = kNumScans[progressive_id - 1];
+  // filter scan script for number of components
+  for (size_t i = 0; i < num_scans; ++i) {
+    jpeg_scan_info scan_info = scan_script[i];
+    FilterScanComponents(cinfo, &scan_info);
+    if (scan_info.comps_in_scan > 0) {
+      scan_infos->emplace_back(std::move(scan_info));
+    }
+  }
+  cinfo->scan_info = scan_infos->data();
+  cinfo->num_scans = scan_infos->size();
+  return true;
+}
+
+bool IsSRGBEncoding(const JxlColorEncoding& c) {
+  return ((c.color_space == JXL_COLOR_SPACE_RGB ||
+           c.color_space == JXL_COLOR_SPACE_GRAY) &&
+          c.primaries == JXL_PRIMARIES_SRGB &&
+          c.white_point == JXL_WHITE_POINT_D65 &&
+          c.transfer_function == JXL_TRANSFER_FUNCTION_SRGB);
+}
+
+void WriteICCProfile(jpeg_compress_struct* const cinfo,
+                     const std::vector<uint8_t>& icc) {
+  constexpr size_t kMaxIccBytesInMarker =
+      kMaxBytesInMarker - sizeof kICCSignature - 2;
+  const int num_markers =
+      static_cast<int>(DivCeil(icc.size(), kMaxIccBytesInMarker));
+  size_t begin = 0;
+  for (int current_marker = 0; current_marker < num_markers; ++current_marker) {
+    const size_t length = std::min(kMaxIccBytesInMarker, icc.size() - begin);
+    jpeg_write_m_header(
+        cinfo, kICCMarker,
+        static_cast<unsigned int>(length + sizeof kICCSignature + 2));
+    for (const unsigned char c : kICCSignature) {
+      jpeg_write_m_byte(cinfo, c);
+    }
+    jpeg_write_m_byte(cinfo, current_marker + 1);
+    jpeg_write_m_byte(cinfo, num_markers);
+    for (size_t i = 0; i < length; ++i) {
+      jpeg_write_m_byte(cinfo, icc[begin]);
+      ++begin;
+    }
+  }
+}
+void WriteExif(jpeg_compress_struct* const cinfo,
+               const std::vector<uint8_t>& exif) {
+  jpeg_write_m_header(
+      cinfo, kExifMarker,
+      static_cast<unsigned int>(exif.size() + sizeof kExifSignature));
+  for (const unsigned char c : kExifSignature) {
+    jpeg_write_m_byte(cinfo, c);
+  }
+  for (size_t i = 0; i < exif.size(); ++i) {
+    jpeg_write_m_byte(cinfo, exif[i]);
+  }
+}
+
+Status SetChromaSubsampling(const std::string& subsampling,
+                            jpeg_compress_struct* const cinfo) {
+  const std::pair<const char*,
+                  std::pair<std::array<uint8_t, 3>, std::array<uint8_t, 3>>>
+      options[] = {{"444", {{{1, 1, 1}}, {{1, 1, 1}}}},
+                   {"420", {{{2, 1, 1}}, {{2, 1, 1}}}},
+                   {"422", {{{2, 1, 1}}, {{1, 1, 1}}}},
+                   {"440", {{{1, 1, 1}}, {{2, 1, 1}}}}};
+  for (const auto& option : options) {
+    if (subsampling == option.first) {
+      for (size_t i = 0; i < 3; i++) {
+        cinfo->comp_info[i].h_samp_factor = option.second.first[i];
+        cinfo->comp_info[i].v_samp_factor = option.second.second[i];
+      }
+      return true;
+    }
+  }
+  return false;
+}
+
+Status EncodeWithLibJpeg(const PackedImage& image, const JxlBasicInfo& info,
+                         const JxlColorEncoding& color_encoding,
+                         const std::vector<uint8_t>& icc,
+                         std::vector<uint8_t> exif, size_t quality,
+                         const std::string& chroma_subsampling,
+                         int progressive_id, bool optimize_coding,
+                         std::vector<uint8_t>* bytes) {
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JSAMPLE) != 1) {
+    return JXL_FAILURE("Only 8 bit JSAMPLE is supported.");
+  }
+  jpeg_compress_struct cinfo = {};
+  jpeg_error_mgr jerr;
+  cinfo.err = jpeg_std_error(&jerr);
+  jpeg_create_compress(&cinfo);
+  unsigned char* buffer = nullptr;
+  unsigned long size = 0;
+  jpeg_mem_dest(&cinfo, &buffer, &size);
+  cinfo.image_width = image.xsize;
+  cinfo.image_height = image.ysize;
+  cinfo.input_components = info.num_color_channels;
+  cinfo.in_color_space = info.num_color_channels == 1 ? JCS_GRAYSCALE : JCS_RGB;
+  jpeg_set_defaults(&cinfo);
+  cinfo.optimize_coding = optimize_coding;
+  if (cinfo.input_components == 3) {
+    JXL_RETURN_IF_ERROR(SetChromaSubsampling(chroma_subsampling, &cinfo));
+  }
+  if (color_encoding.color_space == JXL_COLOR_SPACE_XYB) {
+    // Tell libjpeg not to convert XYB data to YCbCr.
+    jpeg_set_colorspace(&cinfo, JCS_RGB);
+  }
+  jpeg_set_quality(&cinfo, quality, TRUE);
+  std::vector<jpeg_scan_info> scan_infos;
+  JXL_RETURN_IF_ERROR(SetJpegProgression(progressive_id, &scan_infos, &cinfo));
+  jpeg_start_compress(&cinfo, TRUE);
+  if (!icc.empty()) {
+    WriteICCProfile(&cinfo, icc);
+  }
+  if (!exif.empty()) {
+    ResetExifOrientation(exif);
+    WriteExif(&cinfo, exif);
+  }
+  if (cinfo.input_components > 3 || cinfo.input_components < 0)
+    return JXL_FAILURE("invalid numbers of components");
+
+  std::vector<uint8_t> raw_bytes(image.pixels_size);
+  memcpy(&raw_bytes[0], reinterpret_cast<const uint8_t*>(image.pixels()),
+         image.pixels_size);
+  for (size_t y = 0; y < info.ysize; ++y) {
+    JSAMPROW row[] = {raw_bytes.data() + y * image.stride};
+
+    jpeg_write_scanlines(&cinfo, row, 1);
+  }
+  jpeg_finish_compress(&cinfo);
+  jpeg_destroy_compress(&cinfo);
+  bytes->resize(size);
+  // Compressed image data is initialized by libjpeg, which we are not
+  // instrumenting with msan.
+  msan::UnpoisonMemory(buffer, size);
+  std::copy_n(buffer, size, bytes->data());
+  std::free(buffer);
+  return true;
+}
+
+Status EncodeWithSJpeg(const PackedImage& image, const JxlBasicInfo& info,
+                       const std::vector<uint8_t>& icc,
+                       std::vector<uint8_t> exif, size_t quality,
+                       const std::string& chroma_subsampling,
+                       std::vector<uint8_t>* bytes) {
+#if !JPEGXL_ENABLE_SJPEG
+  return JXL_FAILURE("JPEG XL was built without sjpeg support");
+#else
+  sjpeg::EncoderParam param(quality);
+  if (!icc.empty()) {
+    param.iccp.assign(icc.begin(), icc.end());
+  }
+  if (!exif.empty()) {
+    ResetExifOrientation(exif);
+    param.exif.assign(exif.begin(), exif.end());
+  }
+  if (chroma_subsampling == "444") {
+    param.yuv_mode = SJPEG_YUV_444;
+  } else if (chroma_subsampling == "420") {
+    param.yuv_mode = SJPEG_YUV_SHARP;
+  } else {
+    return JXL_FAILURE("sjpeg does not support this chroma subsampling mode");
+  }
+  size_t stride = info.xsize * 3;
+  const uint8_t* pixels = reinterpret_cast<const uint8_t*>(image.pixels());
+  std::string output;
+  JXL_RETURN_IF_ERROR(
+      sjpeg::Encode(pixels, image.xsize, image.ysize, stride, param, &output));
+  bytes->assign(
+      reinterpret_cast<const uint8_t*>(output.data()),
+      reinterpret_cast<const uint8_t*>(output.data() + output.size()));
+  return true;
+#endif
+}
+
+Status EncodeImageJPG(const PackedImage& image, const JxlBasicInfo& info,
+                      const JxlColorEncoding& color_encoding,
+                      const std::vector<uint8_t>& icc,
+                      std::vector<uint8_t> exif, JpegEncoder encoder,
+                      size_t quality, const std::string& chroma_subsampling,
+                      int progressive_id, bool optimize_coding,
+                      ThreadPool* pool, std::vector<uint8_t>* bytes) {
+  if (image.format.data_type != JXL_TYPE_UINT8) {
+    return JXL_FAILURE("Unsupported pixel data type");
+  }
+  if (info.alpha_bits > 0) {
+    return JXL_FAILURE("alpha is not supported");
+  }
+  if (quality > 100) {
+    return JXL_FAILURE("please specify a 0-100 JPEG quality");
+  }
+
+  switch (encoder) {
+    case JpegEncoder::kLibJpeg:
+      JXL_RETURN_IF_ERROR(EncodeWithLibJpeg(
+          image, info, color_encoding, icc, std::move(exif), quality,
+          chroma_subsampling, progressive_id, optimize_coding, bytes));
+      break;
+    case JpegEncoder::kSJpeg:
+      JXL_RETURN_IF_ERROR(EncodeWithSJpeg(image, info, icc, std::move(exif),
+                                          quality, chroma_subsampling, bytes));
+      break;
+    default:
+      return JXL_FAILURE("tried to use an unknown JPEG encoder");
+  }
+
+  return true;
+}
+
+class JPEGEncoder : public Encoder {
+  std::vector<JxlPixelFormat> AcceptedFormats() const override {
+    std::vector<JxlPixelFormat> formats;
+    for (const uint32_t num_channels : {1, 3}) {
+      for (JxlEndianness endianness : {JXL_BIG_ENDIAN, JXL_LITTLE_ENDIAN}) {
+        formats.push_back(JxlPixelFormat{/*num_channels=*/num_channels,
+                                         /*data_type=*/JXL_TYPE_UINT8,
+                                         /*endianness=*/endianness,
+                                         /*align=*/0});
+      }
+    }
+    return formats;
+  }
+  Status Encode(const PackedPixelFile& ppf, EncodedImage* encoded_image,
+                ThreadPool* pool = nullptr) const override {
+    JXL_RETURN_IF_ERROR(VerifyBasicInfo(ppf.info));
+    int quality = 100;
+    std::string chroma_subsampling = "444";
+    JpegEncoder jpeg_encoder = JpegEncoder::kLibJpeg;
+    int progressive_id = -1;
+    bool optimize_coding = true;
+    for (const auto& it : options()) {
+      if (it.first == "q") {
+        std::istringstream is(it.second);
+        JXL_RETURN_IF_ERROR(static_cast<bool>(is >> quality));
+      } else if (it.first == "chroma_subsampling") {
+        chroma_subsampling = it.second;
+      } else if (it.first == "jpeg_encoder") {
+        if (it.second == "libjpeg") {
+          jpeg_encoder = JpegEncoder::kLibJpeg;
+        } else if (it.second == "sjpeg") {
+          jpeg_encoder = JpegEncoder::kSJpeg;
+        } else {
+          return JXL_FAILURE("unknown jpeg encoder \"%s\"", it.second.c_str());
+        }
+      } else if (it.first == "progressive") {
+        std::istringstream is(it.second);
+        JXL_RETURN_IF_ERROR(static_cast<bool>(is >> progressive_id));
+      } else if (it.first == "optimize" && it.second == "OFF") {
+        optimize_coding = false;
+      }
+    }
+    std::vector<uint8_t> icc;
+    if (!IsSRGBEncoding(ppf.color_encoding)) {
+      icc = ppf.icc;
+    }
+    encoded_image->bitstreams.clear();
+    encoded_image->bitstreams.reserve(ppf.frames.size());
+    for (const auto& frame : ppf.frames) {
+      JXL_RETURN_IF_ERROR(VerifyPackedImage(frame.color, ppf.info));
+      encoded_image->bitstreams.emplace_back();
+      JXL_RETURN_IF_ERROR(EncodeImageJPG(
+          frame.color, ppf.info, ppf.color_encoding, icc, ppf.metadata.exif,
+          jpeg_encoder, quality, chroma_subsampling, progressive_id,
+          optimize_coding, pool, &encoded_image->bitstreams.back()));
+    }
+    return true;
+  }
+};
+
+}  // namespace
+
+std::unique_ptr<Encoder> GetJPEGEncoder() {
+  return jxl::make_unique<JPEGEncoder>();
+}
+
+}  // namespace extras
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/extras/enc/jpg.h b/third_party/jpeg-xl/lib/extras/enc/jpg.h
new file mode 100644
index 0000000000..20b37cd168
--- /dev/null
+++ b/third_party/jpeg-xl/lib/extras/enc/jpg.h
@@ -0,0 +1,23 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_EXTRAS_ENC_JPG_H_
+#define LIB_EXTRAS_ENC_JPG_H_
+
+// Encodes JPG pixels and metadata in memory.
+
+#include <memory>
+
+#include "lib/extras/enc/encode.h"
+
+namespace jxl {
+namespace extras {
+
+std::unique_ptr<Encoder> GetJPEGEncoder();
+
+}  // namespace extras
+}  // namespace jxl
+
+#endif  // LIB_EXTRAS_ENC_JPG_H_
diff --git a/third_party/jpeg-xl/lib/extras/enc/jxl.cc b/third_party/jpeg-xl/lib/extras/enc/jxl.cc
new file mode 100644
index 0000000000..633c6f2ade
--- /dev/null
+++ b/third_party/jpeg-xl/lib/extras/enc/jxl.cc
@@ -0,0 +1,276 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/extras/enc/jxl.h"
+
+#include <jxl/encode_cxx.h>
+
+#include "lib/jxl/exif.h"
+
+namespace jxl {
+namespace extras {
+
+JxlEncoderStatus SetOption(const JXLOption& opt,
+                           JxlEncoderFrameSettings* settings) {
+  return opt.is_float
+             ? JxlEncoderFrameSettingsSetFloatOption(settings, opt.id, opt.fval)
+             : JxlEncoderFrameSettingsSetOption(settings, opt.id, opt.ival);
+}
+
+bool SetFrameOptions(const std::vector<JXLOption>& options, size_t frame_index,
+                     size_t* option_idx, JxlEncoderFrameSettings* settings) {
+  while (*option_idx < options.size()) {
+    const auto& opt = options[*option_idx];
+    if (opt.frame_index > frame_index) {
+      break;
+    }
+    if (JXL_ENC_SUCCESS != SetOption(opt, settings)) {
+      fprintf(stderr, "Setting option id %d failed.\n", opt.id);
+      return false;
+    }
+    (*option_idx)++;
+  }
+  return true;
+}
+
+bool EncodeImageJXL(const JXLCompressParams& params, const PackedPixelFile& ppf,
+                    const std::vector<uint8_t>* jpeg_bytes,
+                    std::vector<uint8_t>* compressed) {
+  auto encoder = JxlEncoderMake(/*memory_manager=*/nullptr);
+  JxlEncoder* enc = encoder.get();
+
+  if (params.allow_expert_options) {
+    JxlEncoderAllowExpertOptions(enc);
+  }
+
+  if (params.runner_opaque != nullptr &&
+      JXL_ENC_SUCCESS != JxlEncoderSetParallelRunner(enc, params.runner,
+                                                     params.runner_opaque)) {
+    fprintf(stderr, "JxlEncoderSetParallelRunner failed\n");
+    return false;
+  }
+
+  auto settings = JxlEncoderFrameSettingsCreate(enc, nullptr);
+  size_t option_idx = 0;
+  if (!SetFrameOptions(params.options, 0, &option_idx, settings)) {
+    return false;
+  }
+  if (JXL_ENC_SUCCESS !=
+      JxlEncoderSetFrameDistance(settings, params.distance)) {
+    fprintf(stderr, "Setting frame distance failed.\n");
+    return false;
+  }
+
+  bool use_boxes = !ppf.metadata.exif.empty() || !ppf.metadata.xmp.empty() ||
+                   !ppf.metadata.jumbf.empty() || !ppf.metadata.iptc.empty();
+  bool use_container = params.use_container || use_boxes ||
+                       (jpeg_bytes && params.jpeg_store_metadata);
+
+  if (JXL_ENC_SUCCESS !=
+      JxlEncoderUseContainer(enc, static_cast<int>(use_container))) {
+    fprintf(stderr, "JxlEncoderUseContainer failed.\n");
+    return false;
+  }
+
+  if (jpeg_bytes) {
+    if (params.jpeg_store_metadata &&
+        JXL_ENC_SUCCESS != JxlEncoderStoreJPEGMetadata(enc, JXL_TRUE)) {
+      fprintf(stderr, "Storing JPEG metadata failed.\n");
+      return false;
+    }
+    if (JXL_ENC_SUCCESS != JxlEncoderAddJPEGFrame(settings, jpeg_bytes->data(),
+                                                  jpeg_bytes->size())) {
+      fprintf(stderr, "JxlEncoderAddJPEGFrame() failed.\n");
+      return false;
+    }
+  } else {
+    size_t num_alpha_channels = 0;  // Adjusted below.
+    JxlBasicInfo basic_info = ppf.info;
+    if (basic_info.alpha_bits > 0) num_alpha_channels = 1;
+    if (params.intensity_target > 0) {
+      basic_info.intensity_target = params.intensity_target;
+    }
+    basic_info.num_extra_channels =
+        std::max<uint32_t>(num_alpha_channels, ppf.info.num_extra_channels);
+    basic_info.num_color_channels = ppf.info.num_color_channels;
+    const bool lossless = params.distance == 0;
+    basic_info.uses_original_profile = lossless;
+    if (params.override_bitdepth != 0) {
+      basic_info.bits_per_sample = params.override_bitdepth;
+      basic_info.exponent_bits_per_sample =
+          params.override_bitdepth == 32 ? 8 : 0;
+    }
+    if (JXL_ENC_SUCCESS !=
+        JxlEncoderSetCodestreamLevel(enc, params.codestream_level)) {
+      fprintf(stderr, "Setting --codestream_level failed.\n");
+      return false;
+    }
+    if (JXL_ENC_SUCCESS != JxlEncoderSetBasicInfo(enc, &basic_info)) {
+      fprintf(stderr, "JxlEncoderSetBasicInfo() failed.\n");
+      return false;
+    }
+    if (JXL_ENC_SUCCESS !=
+        JxlEncoderSetFrameBitDepth(settings, &params.input_bitdepth)) {
+      fprintf(stderr, "JxlEncoderSetFrameBitDepth() failed.\n");
+      return false;
+    }
+    if (num_alpha_channels != 0 &&
+        JXL_ENC_SUCCESS != JxlEncoderSetExtraChannelDistance(
+                               settings, 0, params.alpha_distance)) {
+      fprintf(stderr, "Setting alpha distance failed.\n");
+      return false;
+    }
+    if (lossless &&
+        JXL_ENC_SUCCESS != JxlEncoderSetFrameLossless(settings, JXL_TRUE)) {
+      fprintf(stderr, "JxlEncoderSetFrameLossless() failed.\n");
+      return false;
+    }
+    if (!ppf.icc.empty()) {
+      if (JXL_ENC_SUCCESS !=
+          JxlEncoderSetICCProfile(enc, ppf.icc.data(), ppf.icc.size())) {
+        fprintf(stderr, "JxlEncoderSetICCProfile() failed.\n");
+        return false;
+      }
+    } else {
+      if (JXL_ENC_SUCCESS !=
+          JxlEncoderSetColorEncoding(enc, &ppf.color_encoding)) {
+        fprintf(stderr, "JxlEncoderSetColorEncoding() failed.\n");
+        return false;
+      }
+    }
+
+    if (use_boxes) {
+      if (JXL_ENC_SUCCESS != JxlEncoderUseBoxes(enc)) {
+        fprintf(stderr, "JxlEncoderUseBoxes() failed.\n");
+        return false;
+      }
+      // Prepend 4 zero bytes to exif for tiff header offset
+      std::vector<uint8_t> exif_with_offset;
+      bool bigendian;
+      if (IsExif(ppf.metadata.exif, &bigendian)) {
+        exif_with_offset.resize(ppf.metadata.exif.size() + 4);
+        memcpy(exif_with_offset.data() + 4, ppf.metadata.exif.data(),
+               ppf.metadata.exif.size());
+      }
+      const struct BoxInfo {
+        const char* type;
+        const std::vector<uint8_t>& bytes;
+      } boxes[] = {
+          {"Exif", exif_with_offset},
+          {"xml ", ppf.metadata.xmp},
+          {"jumb", ppf.metadata.jumbf},
+          {"xml ", ppf.metadata.iptc},
+      };
+      for (size_t i = 0; i < sizeof boxes / sizeof *boxes; ++i) {
+        const BoxInfo& box = boxes[i];
+        if (!box.bytes.empty() &&
+            JXL_ENC_SUCCESS != JxlEncoderAddBox(enc, box.type, box.bytes.data(),
+                                                box.bytes.size(),
+                                                params.compress_boxes)) {
+          fprintf(stderr, "JxlEncoderAddBox() failed (%s).\n", box.type);
+          return false;
+        }
+      }
+      JxlEncoderCloseBoxes(enc);
+    }
+
+    for (size_t num_frame = 0; num_frame < ppf.frames.size(); ++num_frame) {
+      const jxl::extras::PackedFrame& pframe = ppf.frames[num_frame];
+      const jxl::extras::PackedImage& pimage = pframe.color;
+      JxlPixelFormat ppixelformat = pimage.format;
+      if (JXL_ENC_SUCCESS !=
+          JxlEncoderSetFrameHeader(settings, &pframe.frame_info)) {
+        fprintf(stderr, "JxlEncoderSetFrameHeader() failed.\n");
+        return false;
+      }
+      if (!SetFrameOptions(params.options, num_frame, &option_idx, settings)) {
+        return false;
+      }
+      if (num_alpha_channels > 0) {
+        JxlExtraChannelInfo extra_channel_info;
+        JxlEncoderInitExtraChannelInfo(JXL_CHANNEL_ALPHA, &extra_channel_info);
+        extra_channel_info.bits_per_sample = ppf.info.alpha_bits;
+        extra_channel_info.exponent_bits_per_sample =
+            ppf.info.alpha_exponent_bits;
+        if (params.premultiply != -1) {
+          if (params.premultiply != 0 && params.premultiply != 1) {
+            fprintf(stderr, "premultiply must be one of: -1, 0, 1.\n");
+            return false;
+          }
+          extra_channel_info.alpha_premultiplied = params.premultiply;
+        }
+        if (JXL_ENC_SUCCESS !=
+            JxlEncoderSetExtraChannelInfo(enc, 0, &extra_channel_info)) {
+          fprintf(stderr, "JxlEncoderSetExtraChannelInfo() failed.\n");
+          return false;
+        }
+        // We take the extra channel blend info frame_info, but don't do
+        // clamping.
+        JxlBlendInfo extra_channel_blend_info =
+            pframe.frame_info.layer_info.blend_info;
+        extra_channel_blend_info.clamp = JXL_FALSE;
+        JxlEncoderSetExtraChannelBlendInfo(settings, 0,
+                                           &extra_channel_blend_info);
+      }
+      size_t num_interleaved_alpha =
+          (ppixelformat.num_channels - ppf.info.num_color_channels);
+      // Add extra channel info for the rest of the extra channels.
+      for (size_t i = 0; i < ppf.info.num_extra_channels; ++i) {
+        if (i < ppf.extra_channels_info.size()) {
+          const auto& ec_info = ppf.extra_channels_info[i].ec_info;
+          if (JXL_ENC_SUCCESS !=
+              JxlEncoderSetExtraChannelInfo(enc, num_interleaved_alpha + i,
+                                            &ec_info)) {
+            fprintf(stderr, "JxlEncoderSetExtraChannelInfo() failed.\n");
+            return false;
+          }
+        }
+      }
+      if (JXL_ENC_SUCCESS != JxlEncoderAddImageFrame(settings, &ppixelformat,
+                                                     pimage.pixels(),
+                                                     pimage.pixels_size)) {
+        fprintf(stderr, "JxlEncoderAddImageFrame() failed.\n");
+        return false;
+      }
+      // Only set extra channel buffer if it is provided non-interleaved.
+      for (size_t i = 0; i < pframe.extra_channels.size(); ++i) {
+        if (JXL_ENC_SUCCESS !=
+            JxlEncoderSetExtraChannelBuffer(settings, &ppixelformat,
+                                            pframe.extra_channels[i].pixels(),
+                                            pframe.extra_channels[i].stride *
+                                                pframe.extra_channels[i].ysize,
+                                            num_interleaved_alpha + i)) {
+          fprintf(stderr, "JxlEncoderSetExtraChannelBuffer() failed.\n");
+          return false;
+        }
+      }
+    }
+  }
+  JxlEncoderCloseInput(enc);
+  // Reading compressed output
+  compressed->clear();
+  compressed->resize(4096);
+  uint8_t* next_out = compressed->data();
+  size_t avail_out = compressed->size() - (next_out - compressed->data());
+  JxlEncoderStatus result = JXL_ENC_NEED_MORE_OUTPUT;
+  while (result == JXL_ENC_NEED_MORE_OUTPUT) {
+    result = JxlEncoderProcessOutput(enc, &next_out, &avail_out);
+    if (result == JXL_ENC_NEED_MORE_OUTPUT) {
+      size_t offset = next_out - compressed->data();
+      compressed->resize(compressed->size() * 2);
+      next_out = compressed->data() + offset;
+      avail_out = compressed->size() - offset;
+    }
+  }
+  compressed->resize(next_out - compressed->data());
+  if (result != JXL_ENC_SUCCESS) {
+    fprintf(stderr, "JxlEncoderProcessOutput failed.\n");
+    return false;
+  }
+  return true;
+}
+
+}  // namespace extras
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/extras/enc/jxl.h b/third_party/jpeg-xl/lib/extras/enc/jxl.h
new file mode 100644
index 0000000000..3e77fce3c1
--- /dev/null
+++ b/third_party/jpeg-xl/lib/extras/enc/jxl.h
@@ -0,0 +1,78 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_EXTRAS_ENC_JXL_H_
+#define LIB_EXTRAS_ENC_JXL_H_
+
+#include <jxl/encode.h>
+#include <jxl/parallel_runner.h>
+#include <jxl/thread_parallel_runner.h>
+#include <jxl/types.h>
+#include <stdint.h>
+
+#include <vector>
+
+#include "lib/extras/packed_image.h"
+
+namespace jxl {
+namespace extras {
+
+struct JXLOption {
+  JXLOption(JxlEncoderFrameSettingId id, int64_t val, size_t frame_index)
+      : id(id), is_float(false), ival(val), frame_index(frame_index) {}
+  JXLOption(JxlEncoderFrameSettingId id, float val, size_t frame_index)
+      : id(id), is_float(true), fval(val), frame_index(frame_index) {}
+
+  JxlEncoderFrameSettingId id;
+  bool is_float;
+  union {
+    int64_t ival;
+    float fval;
+  };
+  size_t frame_index;
+};
+
+struct JXLCompressParams {
+  std::vector<JXLOption> options;
+  // Target butteraugli distance, 0.0 means lossless.
+  float distance = 1.0f;
+  float alpha_distance = 1.0f;
+  // If set to true, forces container mode.
+  bool use_container = false;
+  // Whether to enable/disable byte-exact jpeg reconstruction for jpeg inputs.
+  bool jpeg_store_metadata = true;
+  // Whether to create brob boxes.
+  bool compress_boxes = true;
+  // Upper bound on the intensity level present in the image in nits (zero means
+  // that the library chooses a default).
+  float intensity_target = 0;
+  // Overrides for bitdepth, codestream level and alpha premultiply.
+  size_t override_bitdepth = 0;
+  int32_t codestream_level = -1;
+  int32_t premultiply = -1;
+  // Override input buffer interpretation.
+  JxlBitDepth input_bitdepth = {JXL_BIT_DEPTH_FROM_PIXEL_FORMAT, 0, 0};
+  // If runner_opaque is set, the decoder uses this parallel runner.
+  JxlParallelRunner runner = JxlThreadParallelRunner;
+  void* runner_opaque = nullptr;
+
+  bool allow_expert_options = false;
+
+  void AddOption(JxlEncoderFrameSettingId id, int64_t val) {
+    options.emplace_back(JXLOption(id, val, 0));
+  }
+  void AddFloatOption(JxlEncoderFrameSettingId id, float val) {
+    options.emplace_back(JXLOption(id, val, 0));
+  }
+};
+
+bool EncodeImageJXL(const JXLCompressParams& params, const PackedPixelFile& ppf,
+                    const std::vector<uint8_t>* jpeg_bytes,
+                    std::vector<uint8_t>* compressed);
+
+}  // namespace extras
+}  // namespace jxl
+
+#endif  // LIB_EXTRAS_ENC_JXL_H_
diff --git a/third_party/jpeg-xl/lib/extras/enc/npy.cc b/third_party/jpeg-xl/lib/extras/enc/npy.cc
new file mode 100644
index 0000000000..e7a659184b
--- /dev/null
+++ b/third_party/jpeg-xl/lib/extras/enc/npy.cc
@@ -0,0 +1,322 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/extras/enc/npy.h"
+
+#include <jxl/types.h>
+#include <stdio.h>
+
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "lib/extras/packed_image.h"
+
+namespace jxl {
+namespace extras {
+namespace {
+
+// JSON value writing
+
+class JSONField {
+ public:
+  virtual ~JSONField() = default;
+  virtual void Write(std::ostream& o, uint32_t indent) const = 0;
+
+ protected:
+  JSONField() = default;
+};
+
+class JSONValue : public JSONField {
+ public:
+  template <typename T>
+  explicit JSONValue(const T& value) : value_(std::to_string(value)) {}
+
+  explicit JSONValue(const std::string& value) : value_("\"" + value + "\"") {}
+
+  explicit JSONValue(bool value) : value_(value ? "true" : "false") {}
+
+  void Write(std::ostream& o, uint32_t indent) const override { o << value_; }
+
+ private:
+  std::string value_;
+};
+
+class JSONDict : public JSONField {
+ public:
+  JSONDict() = default;
+
+  template <typename T>
+  T* AddEmpty(const std::string& key) {
+    static_assert(std::is_convertible<T*, JSONField*>::value,
+                  "T must be a JSONField");
+    T* ret = new T();
+    values_.emplace_back(
+        key, std::unique_ptr<JSONField>(static_cast<JSONField*>(ret)));
+    return ret;
+  }
+
+  template <typename T>
+  void Add(const std::string& key, const T& value) {
+    values_.emplace_back(key, std::unique_ptr<JSONField>(new JSONValue(value)));
+  }
+
+  void Write(std::ostream& o, uint32_t indent) const override {
+    std::string indent_str(indent, ' ');
+    o << "{";
+    bool is_first = true;
+    for (const auto& key_value : values_) {
+      if (!is_first) {
+        o << ",";
+      }
+      is_first = false;
+      o << std::endl << indent_str << "  \"" << key_value.first << "\": ";
+      key_value.second->Write(o, indent + 2);
+    }
+    if (!values_.empty()) {
+      o << std::endl << indent_str;
+    }
+    o << "}";
+  }
+
+ private:
+  // Dictionary with order.
+  std::vector<std::pair<std::string, std::unique_ptr<JSONField>>> values_;
+};
+
+class JSONArray : public JSONField {
+ public:
+  JSONArray() = default;
+
+  template <typename T>
+  T* AddEmpty() {
+    static_assert(std::is_convertible<T*, JSONField*>::value,
+                  "T must be a JSONField");
+    T* ret = new T();
+    values_.emplace_back(ret);
+    return ret;
+  }
+
+  template <typename T>
+  void Add(const T& value) {
+    values_.emplace_back(new JSONValue(value));
+  }
+
+  void Write(std::ostream& o, uint32_t indent) const override {
+    std::string indent_str(indent, ' ');
+    o << "[";
+    bool is_first = true;
+    for (const auto& value : values_) {
+      if (!is_first) {
+        o << ",";
+      }
+      is_first = false;
+      o << std::endl << indent_str << "  ";
+      value->Write(o, indent + 2);
+    }
+    if (!values_.empty()) {
+      o << std::endl << indent_str;
+    }
+    o << "]";
+  }
+
+ private:
+  std::vector<std::unique_ptr<JSONField>> values_;
+};
+
+void GenerateMetadata(const PackedPixelFile& ppf, std::vector<uint8_t>* out) {
+  JSONDict meta;
+  // Same order as in 18181-3 CD.
+
+  // Frames.
+  auto* meta_frames = meta.AddEmpty<JSONArray>("frames");
+  for (size_t i = 0; i < ppf.frames.size(); i++) {
+    auto* frame_i = meta_frames->AddEmpty<JSONDict>();
+    if (ppf.info.have_animation) {
+      frame_i->Add("duration",
+                   JSONValue(ppf.frames[i].frame_info.duration * 1.0f *
+                             ppf.info.animation.tps_denominator /
+                             ppf.info.animation.tps_numerator));
+    }
+
+    frame_i->Add("name", JSONValue(ppf.frames[i].name));
+
+    if (ppf.info.animation.have_timecodes) {
+      frame_i->Add("timecode", JSONValue(ppf.frames[i].frame_info.timecode));
+    }
+  }
+
+#define METADATA(FIELD) meta.Add(#FIELD, ppf.info.FIELD)
+
+  METADATA(intensity_target);
+  METADATA(min_nits);
+  METADATA(relative_to_max_display);
+  METADATA(linear_below);
+
+  if (ppf.info.have_preview) {
+    meta.AddEmpty<JSONDict>("preview");
+    // TODO(veluca): can we have duration/name/timecode here?
+  }
+
+  {
+    auto ectype = meta.AddEmpty<JSONArray>("extra_channel_type");
+    auto bps = meta.AddEmpty<JSONArray>("bits_per_sample");
+    auto ebps = meta.AddEmpty<JSONArray>("exp_bits_per_sample");
+    bps->Add(ppf.info.bits_per_sample);
+    ebps->Add(ppf.info.exponent_bits_per_sample);
+    for (size_t i = 0; i < ppf.extra_channels_info.size(); i++) {
+      switch (ppf.extra_channels_info[i].ec_info.type) {
+        case JXL_CHANNEL_ALPHA: {
+          ectype->Add(std::string("Alpha"));
+          break;
+        }
+        case JXL_CHANNEL_DEPTH: {
+          ectype->Add(std::string("Depth"));
+          break;
+        }
+        case JXL_CHANNEL_SPOT_COLOR: {
+          ectype->Add(std::string("SpotColor"));
+          break;
+        }
+        case JXL_CHANNEL_SELECTION_MASK: {
+          ectype->Add(std::string("SelectionMask"));
+          break;
+        }
+        case JXL_CHANNEL_BLACK: {
+          ectype->Add(std::string("Black"));
+          break;
+        }
+        case JXL_CHANNEL_CFA: {
+          ectype->Add(std::string("CFA"));
+          break;
+        }
+        case JXL_CHANNEL_THERMAL: {
+          ectype->Add(std::string("Thermal"));
+          break;
+        }
+        default: {
+          ectype->Add(std::string("UNKNOWN"));
+          break;
+        }
+      }
+      bps->Add(ppf.extra_channels_info[i].ec_info.bits_per_sample);
+      ebps->Add(ppf.extra_channels_info[i].ec_info.exponent_bits_per_sample);
+    }
+  }
+
+  std::ostringstream os;
+  meta.Write(os, 0);
+  out->resize(os.str().size());
+  memcpy(out->data(), os.str().data(), os.str().size());
+}
+
+void Append(std::vector<uint8_t>* out, const void* data, size_t size) {
+  size_t pos = out->size();
+  out->resize(pos + size);
+  memcpy(out->data() + pos, data, size);
+}
+
+void WriteNPYHeader(size_t xsize, size_t ysize, uint32_t num_channels,
+                    size_t num_frames, std::vector<uint8_t>* out) {
+  const uint8_t header[] = "\x93NUMPY\x01\x00";
+  Append(out, header, 8);
+  std::stringstream ss;
+  ss << "{'descr': '<f4', 'fortran_order': False, 'shape': (" << num_frames
+     << ", " << ysize << ", " << xsize << ", " << num_channels << "), }\n";
+  // 16-bit little endian header length.
+  uint8_t header_len[2] = {static_cast<uint8_t>(ss.str().size() % 256),
+                           static_cast<uint8_t>(ss.str().size() / 256)};
+  Append(out, header_len, 2);
+  Append(out, ss.str().data(), ss.str().size());
+}
+
+bool WriteFrameToNPYArray(size_t xsize, size_t ysize, const PackedFrame& frame,
+                          std::vector<uint8_t>* out) {
+  const auto& color = frame.color;
+  if (color.xsize != xsize || color.ysize != ysize) {
+    return false;
+  }
+  for (const auto& ec : frame.extra_channels) {
+    if (ec.xsize != xsize || ec.ysize != ysize) {
+      return false;
+    }
+  }
+  // interleave the samples from color and extra channels
+  for (size_t y = 0; y < ysize; ++y) {
+    for (size_t x = 0; x < xsize; ++x) {
+      {
+        size_t sample_size = color.pixel_stride();
+        size_t offset = y * color.stride + x * sample_size;
+        uint8_t* pixels = reinterpret_cast<uint8_t*>(color.pixels());
+        JXL_ASSERT(offset + sample_size <= color.pixels_size);
+        Append(out, pixels + offset, sample_size);
+      }
+      for (const auto& ec : frame.extra_channels) {
+        size_t sample_size = ec.pixel_stride();
+        size_t offset = y * ec.stride + x * sample_size;
+        uint8_t* pixels = reinterpret_cast<uint8_t*>(ec.pixels());
+        JXL_ASSERT(offset + sample_size <= ec.pixels_size);
+        Append(out, pixels + offset, sample_size);
+      }
+    }
+  }
+  return true;
+}
+
+// Writes a PackedPixelFile as a numpy 4D ndarray in binary format.
+bool WriteNPYArray(const PackedPixelFile& ppf, std::vector<uint8_t>* out) {
+  size_t xsize = ppf.info.xsize;
+  size_t ysize = ppf.info.ysize;
+  WriteNPYHeader(xsize, ysize,
+                 ppf.info.num_color_channels + ppf.extra_channels_info.size(),
+                 ppf.frames.size(), out);
+  for (const auto& frame : ppf.frames) {
+    if (!WriteFrameToNPYArray(xsize, ysize, frame, out)) {
+      return false;
+    }
+  }
+  return true;
+}
+
+class NumPyEncoder : public Encoder {
+ public:
+  Status Encode(const PackedPixelFile& ppf, EncodedImage* encoded_image,
+                ThreadPool* pool = nullptr) const override {
+    JXL_RETURN_IF_ERROR(VerifyBasicInfo(ppf.info));
+    GenerateMetadata(ppf, &encoded_image->metadata);
+    encoded_image->bitstreams.emplace_back();
+    if (!WriteNPYArray(ppf, &encoded_image->bitstreams.back())) {
+      return false;
+    }
+    if (ppf.preview_frame) {
+      size_t xsize = ppf.info.preview.xsize;
+      size_t ysize = ppf.info.preview.ysize;
+      WriteNPYHeader(xsize, ysize, ppf.info.num_color_channels, 1,
+                     &encoded_image->preview_bitstream);
+      if (!WriteFrameToNPYArray(xsize, ysize, *ppf.preview_frame,
+                                &encoded_image->preview_bitstream)) {
+        return false;
+      }
+    }
+    return true;
+  }
+  std::vector<JxlPixelFormat> AcceptedFormats() const override {
+    std::vector<JxlPixelFormat> formats;
+    for (const uint32_t num_channels : {1, 3}) {
+      formats.push_back(JxlPixelFormat{num_channels, JXL_TYPE_FLOAT,
+                                       JXL_LITTLE_ENDIAN, /*align=*/0});
+    }
+    return formats;
+  }
+};
+
+}  // namespace
+
+std::unique_ptr<Encoder> GetNumPyEncoder() {
+  return jxl::make_unique<NumPyEncoder>();
+}
+
+}  // namespace extras
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/extras/enc/npy.h b/third_party/jpeg-xl/lib/extras/enc/npy.h
new file mode 100644
index 0000000000..3ee6208ec2
--- /dev/null
+++ b/third_party/jpeg-xl/lib/extras/enc/npy.h
@@ -0,0 +1,23 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_EXTRAS_ENC_NPY_H_
+#define LIB_EXTRAS_ENC_NPY_H_
+
+// Encodes pixels to numpy array, used for conformance testing.
+
+#include <memory>
+
+#include "lib/extras/enc/encode.h"
+
+namespace jxl {
+namespace extras {
+
+std::unique_ptr<Encoder> GetNumPyEncoder();
+
+}  // namespace extras
+}  // namespace jxl
+
+#endif  // LIB_EXTRAS_ENC_NPY_H_
diff --git a/third_party/jpeg-xl/lib/extras/enc/pgx.cc b/third_party/jpeg-xl/lib/extras/enc/pgx.cc
new file mode 100644
index 0000000000..201c8b4189
--- /dev/null
+++ b/third_party/jpeg-xl/lib/extras/enc/pgx.cc
@@ -0,0 +1,123 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/extras/enc/pgx.h"
+
+#include <jxl/codestream_header.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "lib/extras/packed_image.h"
+#include "lib/jxl/base/byte_order.h"
+
+namespace jxl {
+namespace extras {
+namespace {
+
+constexpr size_t kMaxHeaderSize = 200;
+
+Status EncodeHeader(const JxlBasicInfo& info, char* header,
+                    int* chars_written) {
+  if (info.alpha_bits > 0) {
+    return JXL_FAILURE("PGX: can't store alpha");
+  }
+  if (info.num_color_channels != 1) {
+    return JXL_FAILURE("PGX: must be grayscale");
+  }
+  // TODO(lode): verify other bit depths: for other bit depths such as 1 or 4
+  // bits, have a test case to verify it works correctly. For bits > 16, we may
+  // need to change the way external_image works.
+  if (info.bits_per_sample != 8 && info.bits_per_sample != 16) {
+    return JXL_FAILURE("PGX: bits other than 8 or 16 not yet supported");
+  }
+
+  // Use ML (Big Endian), LM may not be well supported by all decoders.
+  *chars_written = snprintf(header, kMaxHeaderSize, "PG ML + %u %u %u\n",
+                            info.bits_per_sample, info.xsize, info.ysize);
+  JXL_RETURN_IF_ERROR(static_cast<unsigned int>(*chars_written) <
+                      kMaxHeaderSize);
+  return true;
+}
+
+Status EncodeImagePGX(const PackedFrame& frame, const JxlBasicInfo& info,
+                      std::vector<uint8_t>* bytes) {
+  char header[kMaxHeaderSize];
+  int header_size = 0;
+  JXL_RETURN_IF_ERROR(EncodeHeader(info, header, &header_size));
+
+  const PackedImage& color = frame.color;
+  const JxlPixelFormat format = color.format;
+  const uint8_t* in = reinterpret_cast<const uint8_t*>(color.pixels());
+  size_t data_bits_per_sample = PackedImage::BitsPerChannel(format.data_type);
+  size_t bytes_per_sample = data_bits_per_sample / kBitsPerByte;
+  size_t num_samples = info.xsize * info.ysize;
+
+  if (info.bits_per_sample != data_bits_per_sample) {
+    return JXL_FAILURE("Bit depth does not match pixel data type");
+  }
+
+  std::vector<uint8_t> pixels(num_samples * bytes_per_sample);
+
+  if (format.data_type == JXL_TYPE_UINT8) {
+    memcpy(&pixels[0], in, num_samples * bytes_per_sample);
+  } else if (format.data_type == JXL_TYPE_UINT16) {
+    if (format.endianness != JXL_BIG_ENDIAN) {
+      const uint8_t* p_in = in;
+      uint8_t* p_out = pixels.data();
+      for (size_t i = 0; i < num_samples; ++i, p_in += 2, p_out += 2) {
+        StoreBE16(LoadLE16(p_in), p_out);
+      }
+    } else {
+      memcpy(&pixels[0], in, num_samples * bytes_per_sample);
+    }
+  } else {
+    return JXL_FAILURE("Unsupported pixel data type");
+  }
+
+  bytes->resize(static_cast<size_t>(header_size) + pixels.size());
+  memcpy(bytes->data(), header, static_cast<size_t>(header_size));
+  memcpy(bytes->data() + header_size, pixels.data(), pixels.size());
+
+  return true;
+}
+
+class PGXEncoder : public Encoder {
+ public:
+  std::vector<JxlPixelFormat> AcceptedFormats() const override {
+    std::vector<JxlPixelFormat> formats;
+    for (const JxlDataType data_type : {JXL_TYPE_UINT8, JXL_TYPE_UINT16}) {
+      for (JxlEndianness endianness : {JXL_BIG_ENDIAN, JXL_LITTLE_ENDIAN}) {
+        formats.push_back(JxlPixelFormat{/*num_channels=*/1,
+                                         /*data_type=*/data_type,
+                                         /*endianness=*/endianness,
+                                         /*align=*/0});
+      }
+    }
+    return formats;
+  }
+  Status Encode(const PackedPixelFile& ppf, EncodedImage* encoded_image,
+                ThreadPool* pool) const override {
+    JXL_RETURN_IF_ERROR(VerifyBasicInfo(ppf.info));
+    encoded_image->icc.assign(ppf.icc.begin(), ppf.icc.end());
+    encoded_image->bitstreams.clear();
+    encoded_image->bitstreams.reserve(ppf.frames.size());
+    for (const auto& frame : ppf.frames) {
+      JXL_RETURN_IF_ERROR(VerifyPackedImage(frame.color, ppf.info));
+      encoded_image->bitstreams.emplace_back();
+      JXL_RETURN_IF_ERROR(
+          EncodeImagePGX(frame, ppf.info, &encoded_image->bitstreams.back()));
+    }
+    return true;
+  }
+};
+
+}  // namespace
+
+std::unique_ptr<Encoder> GetPGXEncoder() {
+  return jxl::make_unique<PGXEncoder>();
+}
+
+}  // namespace extras
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/extras/enc/pgx.h b/third_party/jpeg-xl/lib/extras/enc/pgx.h
new file mode 100644
index 0000000000..f24e391b09
--- /dev/null
+++ b/third_party/jpeg-xl/lib/extras/enc/pgx.h
@@ -0,0 +1,24 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_EXTRAS_ENC_PGX_H_
+#define LIB_EXTRAS_ENC_PGX_H_
+
+// Encodes PGX pixels in memory.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "lib/extras/enc/encode.h"
+
+namespace jxl {
+namespace extras {
+
+std::unique_ptr<Encoder> GetPGXEncoder();
+
+}  // namespace extras
+}  // namespace jxl
+
+#endif  // LIB_EXTRAS_ENC_PGX_H_
diff --git a/third_party/jpeg-xl/lib/extras/enc/pnm.cc b/third_party/jpeg-xl/lib/extras/enc/pnm.cc
new file mode 100644
index 0000000000..7cebcc2a1f
--- /dev/null
+++ b/third_party/jpeg-xl/lib/extras/enc/pnm.cc
@@ -0,0 +1,303 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/extras/enc/pnm.h"
+
+#include <stdio.h>
+#include <string.h>
+
+#include <string>
+#include <vector>
+
+#include "lib/extras/packed_image.h"
+#include "lib/jxl/base/byte_order.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/file_io.h"
+#include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/color_management.h"
+#include "lib/jxl/dec_external_image.h"
+#include "lib/jxl/enc_color_management.h"
+#include "lib/jxl/enc_external_image.h"
+#include "lib/jxl/enc_image_bundle.h"
+#include "lib/jxl/fields.h"  // AllDefault
+#include "lib/jxl/image.h"
+#include "lib/jxl/image_bundle.h"
+
+namespace jxl {
+namespace extras {
+namespace {
+
+constexpr size_t kMaxHeaderSize = 200;
+
+class PNMEncoder : public Encoder {
+ public:
+  Status Encode(const PackedPixelFile& ppf, EncodedImage* encoded_image,
+                ThreadPool* pool = nullptr) const override {
+    JXL_RETURN_IF_ERROR(VerifyBasicInfo(ppf.info));
+    if (!ppf.metadata.exif.empty() || !ppf.metadata.iptc.empty() ||
+        !ppf.metadata.jumbf.empty() || !ppf.metadata.xmp.empty()) {
+      JXL_WARNING("PNM encoder ignoring metadata - use a different codec");
+    }
+    encoded_image->icc = ppf.icc;
+    encoded_image->bitstreams.clear();
+    encoded_image->bitstreams.reserve(ppf.frames.size());
+    for (const auto& frame : ppf.frames) {
+      JXL_RETURN_IF_ERROR(VerifyPackedImage(frame.color, ppf.info));
+      encoded_image->bitstreams.emplace_back();
+      JXL_RETURN_IF_ERROR(
+          EncodeFrame(ppf, frame, &encoded_image->bitstreams.back()));
+    }
+    for (size_t i = 0; i < ppf.extra_channels_info.size(); ++i) {
+      const auto& ec_info = ppf.extra_channels_info[i].ec_info;
+      encoded_image->extra_channel_bitstreams.emplace_back();
+      auto& ec_bitstreams = encoded_image->extra_channel_bitstreams.back();
+      for (const auto& frame : ppf.frames) {
+        ec_bitstreams.emplace_back();
+        JXL_RETURN_IF_ERROR(EncodeExtraChannel(frame.extra_channels[i],
+                                               ec_info.bits_per_sample,
+                                               &ec_bitstreams.back()));
+      }
+    }
+    return true;
+  }
+
+ protected:
+  virtual Status EncodeFrame(const PackedPixelFile& ppf,
+                             const PackedFrame& frame,
+                             std::vector<uint8_t>* bytes) const = 0;
+  virtual Status EncodeExtraChannel(const PackedImage& image,
+                                    size_t bits_per_sample,
+                                    std::vector<uint8_t>* bytes) const = 0;
+};
+
+class PPMEncoder : public PNMEncoder {
+ public:
+  std::vector<JxlPixelFormat> AcceptedFormats() const override {
+    return {JxlPixelFormat{3, JXL_TYPE_UINT8, JXL_BIG_ENDIAN, 0},
+            JxlPixelFormat{3, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0}};
+  }
+  Status EncodeFrame(const PackedPixelFile& ppf, const PackedFrame& frame,
+                     std::vector<uint8_t>* bytes) const override {
+    return EncodeImage(frame.color, ppf.info.bits_per_sample, bytes);
+  }
+  Status EncodeExtraChannel(const PackedImage& image, size_t bits_per_sample,
+                            std::vector<uint8_t>* bytes) const override {
+    return EncodeImage(image, bits_per_sample, bytes);
+  }
+
+ private:
+  Status EncodeImage(const PackedImage& image, size_t bits_per_sample,
+                     std::vector<uint8_t>* bytes) const {
+    uint32_t maxval = (1u << bits_per_sample) - 1;
+    char type = image.format.num_channels == 1 ? '5' : '6';
+    char header[kMaxHeaderSize];
+    size_t header_size =
+        snprintf(header, kMaxHeaderSize, "P%c\n%" PRIuS " %" PRIuS "\n%u\n",
+                 type, image.xsize, image.ysize, maxval);
+    JXL_RETURN_IF_ERROR(header_size < kMaxHeaderSize);
+    bytes->resize(header_size + image.pixels_size);
+    memcpy(bytes->data(), header, header_size);
+    memcpy(bytes->data() + header_size,
+           reinterpret_cast<uint8_t*>(image.pixels()), image.pixels_size);
+    return true;
+  }
+};
+
+class PGMEncoder : public PPMEncoder {
+ public:
+  std::vector<JxlPixelFormat> AcceptedFormats() const override {
+    return {JxlPixelFormat{1, JXL_TYPE_UINT8, JXL_BIG_ENDIAN, 0},
+            JxlPixelFormat{1, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0}};
+  }
+};
+
+class PFMEncoder : public PNMEncoder {
+ public:
+  std::vector<JxlPixelFormat> AcceptedFormats() const override {
+    std::vector<JxlPixelFormat> formats;
+    for (const uint32_t num_channels : {1, 3}) {
+      for (JxlEndianness endianness : {JXL_BIG_ENDIAN, JXL_LITTLE_ENDIAN}) {
+        formats.push_back(JxlPixelFormat{/*num_channels=*/num_channels,
+                                         /*data_type=*/JXL_TYPE_FLOAT,
+                                         /*endianness=*/endianness,
+                                         /*align=*/0});
+      }
+    }
+    return formats;
+  }
+  Status EncodeFrame(const PackedPixelFile& ppf, const PackedFrame& frame,
+                     std::vector<uint8_t>* bytes) const override {
+    return EncodeImage(frame.color, bytes);
+  }
+  Status EncodeExtraChannel(const PackedImage& image, size_t bits_per_sample,
+                            std::vector<uint8_t>* bytes) const override {
+    return EncodeImage(image, bytes);
+  }
+
+ private:
+  Status EncodeImage(const PackedImage& image,
+                     std::vector<uint8_t>* bytes) const {
+    char type = image.format.num_channels == 1 ? 'f' : 'F';
+    double scale = image.format.endianness == JXL_LITTLE_ENDIAN ? -1.0 : 1.0;
+    char header[kMaxHeaderSize];
+    size_t header_size =
+        snprintf(header, kMaxHeaderSize, "P%c\n%" PRIuS " %" PRIuS "\n%.1f\n",
+                 type, image.xsize, image.ysize, scale);
+    JXL_RETURN_IF_ERROR(header_size < kMaxHeaderSize);
+    bytes->resize(header_size + image.pixels_size);
+    memcpy(bytes->data(), header, header_size);
+    const uint8_t* in = reinterpret_cast<const uint8_t*>(image.pixels());
+    uint8_t* out = bytes->data() + header_size;
+    for (size_t y = 0; y < image.ysize; ++y) {
+      size_t y_out = image.ysize - 1 - y;
+      const uint8_t* row_in = &in[y * image.stride];
+      uint8_t* row_out = &out[y_out * image.stride];
+      memcpy(row_out, row_in, image.stride);
+    }
+    return true;
+  }
+};
+
+class PAMEncoder : public PNMEncoder {
+ public:
+  std::vector<JxlPixelFormat> AcceptedFormats() const override {
+    std::vector<JxlPixelFormat> formats;
+    for (const uint32_t num_channels : {1, 2, 3, 4}) {
+      for (const JxlDataType data_type : {JXL_TYPE_UINT8, JXL_TYPE_UINT16}) {
+        formats.push_back(JxlPixelFormat{/*num_channels=*/num_channels,
+                                         /*data_type=*/data_type,
+                                         /*endianness=*/JXL_BIG_ENDIAN,
+                                         /*align=*/0});
+      }
+    }
+    return formats;
+  }
+  Status EncodeFrame(const PackedPixelFile& ppf, const PackedFrame& frame,
+                     std::vector<uint8_t>* bytes) const override {
+    const PackedImage& color = frame.color;
+    const auto& ec_info = ppf.extra_channels_info;
+    JXL_RETURN_IF_ERROR(frame.extra_channels.size() == ec_info.size());
+    for (const auto& ec : frame.extra_channels) {
+      if (ec.xsize != color.xsize || ec.ysize != color.ysize) {
+        return JXL_FAILURE("Extra channel and color size mismatch.");
+      }
+      if (ec.format.data_type != color.format.data_type ||
+          ec.format.endianness != color.format.endianness) {
+        return JXL_FAILURE("Extra channel and color format mismatch.");
+      }
+    }
+    if (ppf.info.bits_per_sample != ppf.info.alpha_bits) {
+      return JXL_FAILURE("Alpha bit depth does not match image bit depth");
+    }
+    for (const auto& it : ec_info) {
+      if (it.ec_info.bits_per_sample != ppf.info.bits_per_sample) {
+        return JXL_FAILURE(
+            "Extra channel bit depth does not match image bit depth");
+      }
+    }
+    const char* kColorTypes[4] = {"GRAYSCALE", "GRAYSCALE_ALPHA", "RGB",
+                                  "RGB_ALPHA"};
+    uint32_t maxval = (1u << ppf.info.bits_per_sample) - 1;
+    uint32_t depth = color.format.num_channels + ec_info.size();
+    char header[kMaxHeaderSize];
+    size_t pos = 0;
+    pos += snprintf(header + pos, kMaxHeaderSize - pos,
+                    "P7\nWIDTH %" PRIuS "\nHEIGHT %" PRIuS
+                    "\nDEPTH %u\n"
+                    "MAXVAL %u\nTUPLTYPE %s\n",
+                    color.xsize, color.ysize, depth, maxval,
+                    kColorTypes[color.format.num_channels - 1]);
+    JXL_RETURN_IF_ERROR(pos < kMaxHeaderSize);
+    for (const auto& info : ec_info) {
+      pos += snprintf(header + pos, kMaxHeaderSize - pos, "TUPLTYPE %s\n",
+                      ExtraChannelTypeName(info.ec_info.type).c_str());
+      JXL_RETURN_IF_ERROR(pos < kMaxHeaderSize);
+    }
+    pos += snprintf(header + pos, kMaxHeaderSize - pos, "ENDHDR\n");
+    JXL_RETURN_IF_ERROR(pos < kMaxHeaderSize);
+    size_t total_size = color.pixels_size;
+    for (const auto& ec : frame.extra_channels) {
+      total_size += ec.pixels_size;
+    }
+    bytes->resize(pos + total_size);
+    memcpy(bytes->data(), header, pos);
+    // If we have no extra channels, just copy color pixel data over.
+    if (frame.extra_channels.empty()) {
+      memcpy(bytes->data() + pos, reinterpret_cast<uint8_t*>(color.pixels()),
+             color.pixels_size);
+      return true;
+    }
+    // Interleave color and extra channels.
+    const uint8_t* in = reinterpret_cast<const uint8_t*>(color.pixels());
+    std::vector<const uint8_t*> ec_in(frame.extra_channels.size());
+    for (size_t i = 0; i < frame.extra_channels.size(); ++i) {
+      ec_in[i] =
+          reinterpret_cast<const uint8_t*>(frame.extra_channels[i].pixels());
+    }
+    uint8_t* out = bytes->data() + pos;
+    size_t pwidth = PackedImage::BitsPerChannel(color.format.data_type) / 8;
+    for (size_t y = 0; y < color.ysize; ++y) {
+      for (size_t x = 0; x < color.xsize; ++x) {
+        memcpy(out, in, color.pixel_stride());
+        out += color.pixel_stride();
+        in += color.pixel_stride();
+        for (auto& p : ec_in) {
+          memcpy(out, p, pwidth);
+          out += pwidth;
+          p += pwidth;
+        }
+      }
+    }
+    return true;
+  }
+  Status EncodeExtraChannel(const PackedImage& image, size_t bits_per_sample,
+                            std::vector<uint8_t>* bytes) const override {
+    return true;
+  }
+
+ private:
+  static std::string ExtraChannelTypeName(JxlExtraChannelType type) {
+    switch (type) {
+      case JXL_CHANNEL_ALPHA:
+        return std::string("Alpha");
+      case JXL_CHANNEL_DEPTH:
+        return std::string("Depth");
+      case JXL_CHANNEL_SPOT_COLOR:
+        return std::string("SpotColor");
+      case JXL_CHANNEL_SELECTION_MASK:
+        return std::string("SelectionMask");
+      case JXL_CHANNEL_BLACK:
+        return std::string("Black");
+      case JXL_CHANNEL_CFA:
+        return std::string("CFA");
+      case JXL_CHANNEL_THERMAL:
+        return std::string("Thermal");
+      default:
+        return std::string("UNKNOWN");
+    }
+  }
+};
+
+}  // namespace
+
+std::unique_ptr<Encoder> GetPPMEncoder() {
+  return jxl::make_unique<PPMEncoder>();
+}
+
+std::unique_ptr<Encoder> GetPFMEncoder() {
+  return jxl::make_unique<PFMEncoder>();
+}
+
+std::unique_ptr<Encoder> GetPGMEncoder() {
+  return jxl::make_unique<PGMEncoder>();
+}
+
+std::unique_ptr<Encoder> GetPAMEncoder() {
+  return jxl::make_unique<PAMEncoder>();
+}
+
+}  // namespace extras
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/extras/enc/pnm.h b/third_party/jpeg-xl/lib/extras/enc/pnm.h
new file mode 100644
index 0000000000..403208cecd
--- /dev/null
+++ b/third_party/jpeg-xl/lib/extras/enc/pnm.h
@@ -0,0 +1,28 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_EXTRAS_ENC_PNM_H_
+#define LIB_EXTRAS_ENC_PNM_H_
+
+// Encodes/decodes PBM/PGM/PPM/PFM pixels in memory.
+
+// TODO(janwas): workaround for incorrect Win64 codegen (cause unknown)
+#include <hwy/highway.h>
+#include <memory>
+
+#include "lib/extras/enc/encode.h"
+
+namespace jxl {
+namespace extras {
+
+std::unique_ptr<Encoder> GetPAMEncoder();
+std::unique_ptr<Encoder> GetPGMEncoder();
+std::unique_ptr<Encoder> GetPPMEncoder();
+std::unique_ptr<Encoder> GetPFMEncoder();
+
+}  // namespace extras
+}  // namespace jxl
+
+#endif  // LIB_EXTRAS_ENC_PNM_H_
diff --git a/third_party/jpeg-xl/lib/extras/exif.cc b/third_party/jpeg-xl/lib/extras/exif.cc
new file mode 100644
index 0000000000..7d926558c3
--- /dev/null
+++ b/third_party/jpeg-xl/lib/extras/exif.cc
@@ -0,0 +1,55 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/extras/exif.h"
+
+#include "lib/jxl/base/byte_order.h"
+
+namespace jxl {
+
+constexpr uint16_t kExifOrientationTag = 274;
+
+void ResetExifOrientation(std::vector<uint8_t>& exif) {
+  if (exif.size() < 12) return;  // not enough bytes for a valid exif blob
+  bool bigendian;
+  uint8_t* t = exif.data();
+  if (LoadLE32(t) == 0x2A004D4D) {
+    bigendian = true;
+  } else if (LoadLE32(t) == 0x002A4949) {
+    bigendian = false;
+  } else {
+    return;  // not a valid tiff header
+  }
+  t += 4;
+  uint32_t offset = (bigendian ? LoadBE32(t) : LoadLE32(t));
+  if (exif.size() < 12 + offset + 2 || offset < 8) return;
+  t += offset - 4;
+  uint16_t nb_tags = (bigendian ? LoadBE16(t) : LoadLE16(t));
+  t += 2;
+  while (nb_tags > 0) {
+    if (t + 12 >= exif.data() + exif.size()) return;
+    uint16_t tag = (bigendian ? LoadBE16(t) : LoadLE16(t));
+    t += 2;
+    if (tag == kExifOrientationTag) {
+      uint16_t type = (bigendian ? LoadBE16(t) : LoadLE16(t));
+      t += 2;
+      uint32_t count = (bigendian ? LoadBE32(t) : LoadLE32(t));
+      t += 4;
+      if (type == 3 && count == 1) {
+        if (bigendian) {
+          StoreBE16(1, t);
+        } else {
+          StoreLE16(1, t);
+        }
+      }
+      return;
+    } else {
+      t += 10;
+      nb_tags--;
+    }
+  }
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/extras/exif.h b/third_party/jpeg-xl/lib/extras/exif.h
new file mode 100644
index 0000000000..f22b2ccef5
--- /dev/null
+++ b/third_party/jpeg-xl/lib/extras/exif.h
@@ -0,0 +1,20 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_EXTRAS_EXIF_H_
+#define LIB_EXTRAS_EXIF_H_
+
+#include <stdint.h>
+
+#include <vector>
+
+namespace jxl {
+
+// Sets the Exif orientation to the identity, to avoid repeated orientation
+void ResetExifOrientation(std::vector<uint8_t>& exif);
+
+}  // namespace jxl
+
+#endif  // LIB_EXTRAS_EXIF_H_
diff --git a/third_party/jpeg-xl/lib/extras/hlg.cc b/third_party/jpeg-xl/lib/extras/hlg.cc
new file mode 100644
index 0000000000..e39a0807f5
--- /dev/null
+++ b/third_party/jpeg-xl/lib/extras/hlg.cc
@@ -0,0 +1,56 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/extras/hlg.h"
+
+#include <cmath>
+
+#include "lib/jxl/enc_color_management.h"
+
+namespace jxl {
+
+float GetHlgGamma(const float peak_luminance, const float surround_luminance) {
+  return 1.2f * std::pow(1.111f, std::log2(peak_luminance / 1000.f)) *
+         std::pow(0.98f, std::log2(surround_luminance / 5.f));
+}
+
+Status HlgOOTF(ImageBundle* ib, const float gamma, ThreadPool* pool) {
+  ColorEncoding linear_rec2020;
+  linear_rec2020.SetColorSpace(ColorSpace::kRGB);
+  linear_rec2020.primaries = Primaries::k2100;
+  linear_rec2020.white_point = WhitePoint::kD65;
+  linear_rec2020.tf.SetTransferFunction(TransferFunction::kLinear);
+  JXL_RETURN_IF_ERROR(linear_rec2020.CreateICC());
+  JXL_RETURN_IF_ERROR(ib->TransformTo(linear_rec2020, GetJxlCms(), pool));
+
+  JXL_RETURN_IF_ERROR(RunOnPool(
+      pool, 0, ib->ysize(), ThreadPool::NoInit,
+      [&](const int y, const int thread) {
+        float* const JXL_RESTRICT rows[3] = {ib->color()->PlaneRow(0, y),
+                                             ib->color()->PlaneRow(1, y),
+                                             ib->color()->PlaneRow(2, y)};
+        for (size_t x = 0; x < ib->xsize(); ++x) {
+          float& red = rows[0][x];
+          float& green = rows[1][x];
+          float& blue = rows[2][x];
+          const float luminance =
+              0.2627f * red + 0.6780f * green + 0.0593f * blue;
+          const float ratio = std::pow(luminance, gamma - 1);
+          if (std::isfinite(ratio)) {
+            red *= ratio;
+            green *= ratio;
+            blue *= ratio;
+          }
+        }
+      },
+      "HlgOOTF"));
+  return true;
+}
+
+Status HlgInverseOOTF(ImageBundle* ib, const float gamma, ThreadPool* pool) {
+  return HlgOOTF(ib, 1.f / gamma, pool);
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/extras/hlg.h b/third_party/jpeg-xl/lib/extras/hlg.h
new file mode 100644
index 0000000000..4cfec444f4
--- /dev/null
+++ b/third_party/jpeg-xl/lib/extras/hlg.h
@@ -0,0 +1,21 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_EXTRAS_HLG_H_
+#define LIB_EXTRAS_HLG_H_
+
+#include "lib/jxl/image_bundle.h"
+
+namespace jxl {
+
+float GetHlgGamma(float peak_luminance, float surround_luminance = 5.f);
+
+Status HlgOOTF(ImageBundle* ib, float gamma, ThreadPool* pool = nullptr);
+
+Status HlgInverseOOTF(ImageBundle* ib, float gamma, ThreadPool* pool = nullptr);
+
+}  // namespace jxl
+
+#endif  // LIB_EXTRAS_HLG_H_
diff --git a/third_party/jpeg-xl/lib/extras/jpegli_test.cc b/third_party/jpeg-xl/lib/extras/jpegli_test.cc
new file mode 100644
index 0000000000..6aa8afe4c0
--- /dev/null
+++ b/third_party/jpeg-xl/lib/extras/jpegli_test.cc
@@ -0,0 +1,405 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#if JPEGXL_ENABLE_JPEG && JPEGXL_ENABLE_JPEGLI
+
+#include "lib/extras/dec/jpegli.h"
+
+#include <jxl/color_encoding.h>
+#include <stdint.h>
+
+#include <memory>
+#include <string>
+
+#include "lib/extras/dec/color_hints.h"
+#include "lib/extras/dec/decode.h"
+#include "lib/extras/dec/jpg.h"
+#include "lib/extras/enc/encode.h"
+#include "lib/extras/enc/jpegli.h"
+#include "lib/extras/enc/jpg.h"
+#include "lib/extras/packed_image.h"
+#include "lib/jxl/base/span.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/color_encoding_internal.h"
+#include "lib/jxl/test_image.h"
+#include "lib/jxl/test_utils.h"
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+namespace extras {
+namespace {
+
+using test::Butteraugli3Norm;
+using test::ButteraugliDistance;
+using test::TestImage;
+
+Status ReadTestImage(const std::string& pathname, PackedPixelFile* ppf) {
+  const PaddedBytes encoded = jxl::test::ReadTestData(pathname);
+  ColorHints color_hints;
+  if (pathname.find(".ppm") != std::string::npos) {
+    color_hints.Add("color_space", "RGB_D65_SRG_Rel_SRG");
+  } else if (pathname.find(".pgm") != std::string::npos) {
+    color_hints.Add("color_space", "Gra_D65_Rel_SRG");
+  }
+  return DecodeBytes(Span<const uint8_t>(encoded), color_hints, ppf);
+}
+
+std::vector<uint8_t> GetAppData(const std::vector<uint8_t>& compressed) {
+  std::vector<uint8_t> result;
+  size_t pos = 2;  // After SOI
+  while (pos + 4 < compressed.size()) {
+    if (compressed[pos] != 0xff || compressed[pos + 1] < 0xe0 ||
+        compressed[pos + 1] > 0xf0) {
+      break;
+    }
+    size_t len = (compressed[pos + 2] << 8) + compressed[pos + 3] + 2;
+    if (pos + len > compressed.size()) {
+      break;
+    }
+    result.insert(result.end(), &compressed[pos], &compressed[pos] + len);
+    pos += len;
+  }
+  return result;
+}
+
+Status DecodeWithLibjpeg(const std::vector<uint8_t>& compressed,
+                         PackedPixelFile* ppf,
+                         const JPGDecompressParams* dparams = nullptr) {
+  return DecodeImageJPG(Span<const uint8_t>(compressed), ColorHints(), ppf,
+                        /*constraints=*/nullptr, dparams);
+}
+
+Status EncodeWithLibjpeg(const PackedPixelFile& ppf, int quality,
+                         std::vector<uint8_t>* compressed) {
+  std::unique_ptr<Encoder> encoder = GetJPEGEncoder();
+  encoder->SetOption("q", std::to_string(quality));
+  EncodedImage encoded;
+  JXL_RETURN_IF_ERROR(encoder->Encode(ppf, &encoded));
+  JXL_RETURN_IF_ERROR(!encoded.bitstreams.empty());
+  *compressed = std::move(encoded.bitstreams[0]);
+  return true;
+}
+
+std::string Description(const JxlColorEncoding& color_encoding) {
+  ColorEncoding c_enc;
+  JXL_CHECK(ConvertExternalToInternalColorEncoding(color_encoding, &c_enc));
+  return Description(c_enc);
+}
+
+float BitsPerPixel(const PackedPixelFile& ppf,
+                   const std::vector<uint8_t>& compressed) {
+  const size_t num_pixels = ppf.info.xsize * ppf.info.ysize;
+  return compressed.size() * 8.0 / num_pixels;
+}
+
+TEST(JpegliTest, JpegliSRGBDecodeTest) {
+  std::string testimage = "jxl/flower/flower_small.rgb.depth8.ppm";
+  PackedPixelFile ppf0;
+  ASSERT_TRUE(ReadTestImage(testimage, &ppf0));
+  EXPECT_EQ("RGB_D65_SRG_Rel_SRG", Description(ppf0.color_encoding));
+  EXPECT_EQ(8, ppf0.info.bits_per_sample);
+
+  std::vector<uint8_t> compressed;
+  ASSERT_TRUE(EncodeWithLibjpeg(ppf0, 90, &compressed));
+
+  PackedPixelFile ppf1;
+  ASSERT_TRUE(DecodeWithLibjpeg(compressed, &ppf1));
+  PackedPixelFile ppf2;
+  JpegDecompressParams dparams;
+  ASSERT_TRUE(DecodeJpeg(compressed, dparams, nullptr, &ppf2));
+  EXPECT_LT(ButteraugliDistance(ppf0, ppf2), ButteraugliDistance(ppf0, ppf1));
+}
+
+TEST(JpegliTest, JpegliGrayscaleDecodeTest) {
+  std::string testimage = "jxl/flower/flower_small.g.depth8.pgm";
+  PackedPixelFile ppf0;
+  ASSERT_TRUE(ReadTestImage(testimage, &ppf0));
+  EXPECT_EQ("Gra_D65_Rel_SRG", Description(ppf0.color_encoding));
+  EXPECT_EQ(8, ppf0.info.bits_per_sample);
+
+  std::vector<uint8_t> compressed;
+  ASSERT_TRUE(EncodeWithLibjpeg(ppf0, 90, &compressed));
+
+  PackedPixelFile ppf1;
+  ASSERT_TRUE(DecodeWithLibjpeg(compressed, &ppf1));
+  PackedPixelFile ppf2;
+  JpegDecompressParams dparams;
+  ASSERT_TRUE(DecodeJpeg(compressed, dparams, nullptr, &ppf2));
+  EXPECT_LT(ButteraugliDistance(ppf0, ppf2), ButteraugliDistance(ppf0, ppf1));
+}
+
+TEST(JpegliTest, JpegliXYBEncodeTest) {
+  std::string testimage = "jxl/flower/flower_small.rgb.depth8.ppm";
+  PackedPixelFile ppf_in;
+  ASSERT_TRUE(ReadTestImage(testimage, &ppf_in));
+  EXPECT_EQ("RGB_D65_SRG_Rel_SRG", Description(ppf_in.color_encoding));
+  EXPECT_EQ(8, ppf_in.info.bits_per_sample);
+
+  std::vector<uint8_t> compressed;
+  JpegSettings settings;
+  settings.xyb = true;
+  ASSERT_TRUE(EncodeJpeg(ppf_in, settings, nullptr, &compressed));
+
+  PackedPixelFile ppf_out;
+  ASSERT_TRUE(DecodeWithLibjpeg(compressed, &ppf_out));
+  EXPECT_THAT(BitsPerPixel(ppf_in, compressed), IsSlightlyBelow(1.45f));
+  EXPECT_THAT(ButteraugliDistance(ppf_in, ppf_out), IsSlightlyBelow(1.32f));
+}
+
+TEST(JpegliTest, JpegliDecodeTestLargeSmoothArea) {
+  TestImage t;
+  const size_t xsize = 2070;
+  const size_t ysize = 1063;
+  t.SetDimensions(xsize, ysize).SetChannels(3);
+  t.SetAllBitDepths(8).SetEndianness(JXL_NATIVE_ENDIAN);
+  TestImage::Frame frame = t.AddFrame();
+  frame.RandomFill();
+  // Create a large smooth area in the top half of the image. This is to test
+  // that the bias statistics calculation can handle many blocks with all-zero
+  // AC coefficients.
+  for (size_t y = 0; y < ysize / 2; ++y) {
+    for (size_t x = 0; x < xsize; ++x) {
+      for (size_t c = 0; c < 3; ++c) {
+        frame.SetValue(y, x, c, 0.5f);
+      }
+    }
+  }
+  const PackedPixelFile& ppf0 = t.ppf();
+
+  std::vector<uint8_t> compressed;
+  ASSERT_TRUE(EncodeWithLibjpeg(ppf0, 90, &compressed));
+
+  PackedPixelFile ppf1;
+  JpegDecompressParams dparams;
+  ASSERT_TRUE(DecodeJpeg(compressed, dparams, nullptr, &ppf1));
+  EXPECT_LT(ButteraugliDistance(ppf0, ppf1), 3.0f);
+}
+
+TEST(JpegliTest, JpegliYUVEncodeTest) {
+  std::string testimage = "jxl/flower/flower_small.rgb.depth8.ppm";
+  PackedPixelFile ppf_in;
+  ASSERT_TRUE(ReadTestImage(testimage, &ppf_in));
+  EXPECT_EQ("RGB_D65_SRG_Rel_SRG", Description(ppf_in.color_encoding));
+  EXPECT_EQ(8, ppf_in.info.bits_per_sample);
+
+  std::vector<uint8_t> compressed;
+  JpegSettings settings;
+  settings.xyb = false;
+  ASSERT_TRUE(EncodeJpeg(ppf_in, settings, nullptr, &compressed));
+
+  PackedPixelFile ppf_out;
+  ASSERT_TRUE(DecodeWithLibjpeg(compressed, &ppf_out));
+  EXPECT_THAT(BitsPerPixel(ppf_in, compressed), IsSlightlyBelow(1.7f));
+  EXPECT_THAT(ButteraugliDistance(ppf_in, ppf_out), IsSlightlyBelow(1.32f));
+}
+
+TEST(JpegliTest, JpegliYUVChromaSubsamplingEncodeTest) {
+  std::string testimage = "jxl/flower/flower_small.rgb.depth8.ppm";
+  PackedPixelFile ppf_in;
+  ASSERT_TRUE(ReadTestImage(testimage, &ppf_in));
+  EXPECT_EQ("RGB_D65_SRG_Rel_SRG", Description(ppf_in.color_encoding));
+  EXPECT_EQ(8, ppf_in.info.bits_per_sample);
+
+  std::vector<uint8_t> compressed;
+  JpegSettings settings;
+  for (const char* sampling : {"440", "422", "420"}) {
+    settings.xyb = false;
+    settings.chroma_subsampling = std::string(sampling);
+    ASSERT_TRUE(EncodeJpeg(ppf_in, settings, nullptr, &compressed));
+
+    PackedPixelFile ppf_out;
+    ASSERT_TRUE(DecodeWithLibjpeg(compressed, &ppf_out));
+    EXPECT_LE(BitsPerPixel(ppf_in, compressed), 1.55f);
+    EXPECT_LE(ButteraugliDistance(ppf_in, ppf_out), 1.82f);
+  }
+}
+
+TEST(JpegliTest, JpegliYUVEncodeTestNoAq) {
+  std::string testimage = "jxl/flower/flower_small.rgb.depth8.ppm";
+  PackedPixelFile ppf_in;
+  ASSERT_TRUE(ReadTestImage(testimage, &ppf_in));
+  EXPECT_EQ("RGB_D65_SRG_Rel_SRG", Description(ppf_in.color_encoding));
+  EXPECT_EQ(8, ppf_in.info.bits_per_sample);
+
+  std::vector<uint8_t> compressed;
+  JpegSettings settings;
+  settings.xyb = false;
+  settings.use_adaptive_quantization = false;
+  ASSERT_TRUE(EncodeJpeg(ppf_in, settings, nullptr, &compressed));
+
+  PackedPixelFile ppf_out;
+  ASSERT_TRUE(DecodeWithLibjpeg(compressed, &ppf_out));
+  EXPECT_THAT(BitsPerPixel(ppf_in, compressed), IsSlightlyBelow(1.85f));
+  EXPECT_THAT(ButteraugliDistance(ppf_in, ppf_out), IsSlightlyBelow(1.25f));
+}
+
+TEST(JpegliTest, JpegliHDRRoundtripTest) {
+  std::string testimage = "jxl/hdr_room.png";
+  PackedPixelFile ppf_in;
+  ASSERT_TRUE(ReadTestImage(testimage, &ppf_in));
+  EXPECT_EQ("RGB_D65_202_Rel_HLG", Description(ppf_in.color_encoding));
+  EXPECT_EQ(16, ppf_in.info.bits_per_sample);
+
+  std::vector<uint8_t> compressed;
+  JpegSettings settings;
+  settings.xyb = false;
+  ASSERT_TRUE(EncodeJpeg(ppf_in, settings, nullptr, &compressed));
+
+  PackedPixelFile ppf_out;
+  JpegDecompressParams dparams;
+  dparams.output_data_type = JXL_TYPE_UINT16;
+  ASSERT_TRUE(DecodeJpeg(compressed, dparams, nullptr, &ppf_out));
+  EXPECT_THAT(BitsPerPixel(ppf_in, compressed), IsSlightlyBelow(2.95f));
+  EXPECT_THAT(ButteraugliDistance(ppf_in, ppf_out), IsSlightlyBelow(1.05f));
+}
+
+TEST(JpegliTest, JpegliSetAppData) {
+  std::string testimage = "jxl/flower/flower_small.rgb.depth8.ppm";
+  PackedPixelFile ppf_in;
+  ASSERT_TRUE(ReadTestImage(testimage, &ppf_in));
+  EXPECT_EQ("RGB_D65_SRG_Rel_SRG", Description(ppf_in.color_encoding));
+  EXPECT_EQ(8, ppf_in.info.bits_per_sample);
+
+  std::vector<uint8_t> compressed;
+  JpegSettings settings;
+  settings.app_data = {0xff, 0xe3, 0, 4, 0, 1};
+  EXPECT_TRUE(EncodeJpeg(ppf_in, settings, nullptr, &compressed));
+  EXPECT_EQ(settings.app_data, GetAppData(compressed));
+
+  settings.app_data = {0xff, 0xe3, 0, 6, 0, 1, 2, 3, 0xff, 0xef, 0, 4, 0, 1};
+  EXPECT_TRUE(EncodeJpeg(ppf_in, settings, nullptr, &compressed));
+  EXPECT_EQ(settings.app_data, GetAppData(compressed));
+
+  settings.xyb = true;
+  EXPECT_TRUE(EncodeJpeg(ppf_in, settings, nullptr, &compressed));
+  EXPECT_EQ(0, memcmp(settings.app_data.data(), GetAppData(compressed).data(),
+                      settings.app_data.size()));
+
+  settings.xyb = false;
+  settings.app_data = {0};
+  EXPECT_FALSE(EncodeJpeg(ppf_in, settings, nullptr, &compressed));
+
+  settings.app_data = {0xff, 0xe0};
+  EXPECT_FALSE(EncodeJpeg(ppf_in, settings, nullptr, &compressed));
+
+  settings.app_data = {0xff, 0xe0, 0, 2};
+  EXPECT_FALSE(EncodeJpeg(ppf_in, settings, nullptr, &compressed));
+
+  settings.app_data = {0xff, 0xeb, 0, 4, 0};
+  EXPECT_FALSE(EncodeJpeg(ppf_in, settings, nullptr, &compressed));
+
+  settings.app_data = {0xff, 0xeb, 0, 4, 0, 1, 2, 3};
+  EXPECT_FALSE(EncodeJpeg(ppf_in, settings, nullptr, &compressed));
+
+  settings.app_data = {0xff, 0xab, 0, 4, 0, 1};
+  EXPECT_FALSE(EncodeJpeg(ppf_in, settings, nullptr, &compressed));
+
+  settings.xyb = false;
+  settings.app_data = {
+      0xff, 0xeb, 0,    4,    0,    1,                       //
+      0xff, 0xe2, 0,    20,   0x49, 0x43, 0x43, 0x5F, 0x50,  //
+      0x52, 0x4F, 0x46, 0x49, 0x4C, 0x45, 0x00, 0,    1,     //
+      0,    0,    0,    0,                                   //
+  };
+  EXPECT_TRUE(EncodeJpeg(ppf_in, settings, nullptr, &compressed));
+  EXPECT_EQ(settings.app_data, GetAppData(compressed));
+
+  settings.xyb = true;
+  EXPECT_FALSE(EncodeJpeg(ppf_in, settings, nullptr, &compressed));
+}
+
+struct TestConfig {
+  int num_colors;
+  int passes;
+  int dither;
+};
+
+class JpegliColorQuantTestParam : public ::testing::TestWithParam<TestConfig> {
+};
+
+TEST_P(JpegliColorQuantTestParam, JpegliColorQuantizeTest) {
+  TestConfig config = GetParam();
+  std::string testimage = "jxl/flower/flower_small.rgb.depth8.ppm";
+  PackedPixelFile ppf0;
+  ASSERT_TRUE(ReadTestImage(testimage, &ppf0));
+  EXPECT_EQ("RGB_D65_SRG_Rel_SRG", Description(ppf0.color_encoding));
+  EXPECT_EQ(8, ppf0.info.bits_per_sample);
+
+  std::vector<uint8_t> compressed;
+  ASSERT_TRUE(EncodeWithLibjpeg(ppf0, 90, &compressed));
+
+  PackedPixelFile ppf1;
+  JPGDecompressParams dparams1;
+  dparams1.two_pass_quant = (config.passes == 2);
+  dparams1.num_colors = config.num_colors;
+  dparams1.dither_mode = config.dither;
+  ASSERT_TRUE(DecodeWithLibjpeg(compressed, &ppf1, &dparams1));
+
+  PackedPixelFile ppf2;
+  JpegDecompressParams dparams2;
+  dparams2.two_pass_quant = (config.passes == 2);
+  dparams2.num_colors = config.num_colors;
+  dparams2.dither_mode = config.dither;
+  ASSERT_TRUE(DecodeJpeg(compressed, dparams2, nullptr, &ppf2));
+
+  double dist1 = Butteraugli3Norm(ppf0, ppf1);
+  double dist2 = Butteraugli3Norm(ppf0, ppf2);
+  printf("distance: %f  vs %f\n", dist2, dist1);
+  if (config.passes == 1) {
+    if (config.num_colors == 16 && config.dither == 2) {
+      // TODO(szabadka) Fix this case.
+      EXPECT_LT(dist2, dist1 * 1.5);
+    } else {
+      EXPECT_LT(dist2, dist1 * 1.05);
+    }
+  } else if (config.num_colors > 64) {
+    // TODO(szabadka) Fix 2pass quantization for <= 64 colors.
+    EXPECT_LT(dist2, dist1 * 1.1);
+  } else if (config.num_colors > 32) {
+    EXPECT_LT(dist2, dist1 * 1.2);
+  } else {
+    EXPECT_LT(dist2, dist1 * 1.7);
+  }
+}
+
+std::vector<TestConfig> GenerateTests() {
+  std::vector<TestConfig> all_tests;
+  for (int num_colors = 8; num_colors <= 256; num_colors *= 2) {
+    for (int passes = 1; passes <= 2; ++passes) {
+      for (int dither = 0; dither < 3; dither += passes) {
+        TestConfig config;
+        config.num_colors = num_colors;
+        config.passes = passes;
+        config.dither = dither;
+        all_tests.push_back(config);
+      }
+    }
+  }
+  return all_tests;
+}
+
+std::ostream& operator<<(std::ostream& os, const TestConfig& c) {
+  static constexpr const char* kDitherModeStr[] = {"No", "Ordered", "FS"};
+  os << c.passes << "pass";
+  os << c.num_colors << "colors";
+  os << kDitherModeStr[c.dither] << "dither";
+  return os;
+}
+
+std::string TestDescription(const testing::TestParamInfo<TestConfig>& info) {
+  std::stringstream name;
+  name << info.param;
+  return name.str();
+}
+
+JXL_GTEST_INSTANTIATE_TEST_SUITE_P(JpegliColorQuantTest,
+                                   JpegliColorQuantTestParam,
+                                   testing::ValuesIn(GenerateTests()),
+                                   TestDescription);
+
+}  // namespace
+}  // namespace extras
+}  // namespace jxl
+#endif  // JPEGXL_ENABLE_JPEG
diff --git a/third_party/jpeg-xl/lib/extras/packed_image.h b/third_party/jpeg-xl/lib/extras/packed_image.h
new file mode 100644
index 0000000000..3eaf5a0c6d
--- /dev/null
+++ b/third_party/jpeg-xl/lib/extras/packed_image.h
@@ -0,0 +1,170 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_EXTRAS_PACKED_IMAGE_H_
+#define LIB_EXTRAS_PACKED_IMAGE_H_
+
+// Helper class for storing external (int or float, interleaved) images. This is
+// the common format used by other libraries and in the libjxl API.
+
+#include <jxl/codestream_header.h>
+#include <jxl/encode.h>
+#include <jxl/types.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <algorithm>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "lib/jxl/common.h"
+
+namespace jxl {
+namespace extras {
+
+// Class representing an interleaved image with a bunch of channels.
+class PackedImage {
+ public:
+  PackedImage(size_t xsize, size_t ysize, const JxlPixelFormat& format)
+      : PackedImage(xsize, ysize, format, CalcStride(format, xsize)) {}
+
+  PackedImage Copy() const {
+    PackedImage copy(xsize, ysize, format);
+    memcpy(reinterpret_cast<uint8_t*>(copy.pixels()),
+           reinterpret_cast<const uint8_t*>(pixels()), pixels_size);
+    return copy;
+  }
+
+  // The interleaved pixels as defined in the storage format.
+  void* pixels() const { return pixels_.get(); }
+
+  // The image size in pixels.
+  size_t xsize;
+  size_t ysize;
+
+  // The number of bytes per row.
+  size_t stride;
+
+  // Pixel storage format and buffer size of the pixels_ pointer.
+  JxlPixelFormat format;
+  size_t pixels_size;
+
+  size_t pixel_stride() const {
+    return (BitsPerChannel(format.data_type) * format.num_channels /
+            jxl::kBitsPerByte);
+  }
+
+  static size_t BitsPerChannel(JxlDataType data_type) {
+    switch (data_type) {
+      case JXL_TYPE_UINT8:
+        return 8;
+      case JXL_TYPE_UINT16:
+        return 16;
+      case JXL_TYPE_FLOAT:
+        return 32;
+      case JXL_TYPE_FLOAT16:
+        return 16;
+      default:
+        JXL_ABORT("Unhandled JxlDataType");
+    }
+  }
+
+ private:
+  PackedImage(size_t xsize, size_t ysize, const JxlPixelFormat& format,
+              size_t stride)
+      : xsize(xsize),
+        ysize(ysize),
+        stride(stride),
+        format(format),
+        pixels_size(ysize * stride),
+        pixels_(malloc(std::max<size_t>(1, pixels_size)), free) {}
+
+  static size_t CalcStride(const JxlPixelFormat& format, size_t xsize) {
+    size_t stride = xsize * (BitsPerChannel(format.data_type) *
+                             format.num_channels / jxl::kBitsPerByte);
+    if (format.align > 1) {
+      stride = jxl::DivCeil(stride, format.align) * format.align;
+    }
+    return stride;
+  }
+
+  std::unique_ptr<void, decltype(free)*> pixels_;
+};
+
+// Helper class representing a frame, as seen from the API. Animations will have
+// multiple frames, but a single frame can have a color/grayscale channel and
+// multiple extra channels. The order of the extra channels should be the same
+// as all other frames in the same image.
+class PackedFrame {
+ public:
+  template <typename... Args>
+  explicit PackedFrame(Args&&... args) : color(std::forward<Args>(args)...) {}
+
+  PackedFrame Copy() const {
+    PackedFrame copy(color.xsize, color.ysize, color.format);
+    copy.frame_info = frame_info;
+    copy.name = name;
+    copy.color = color.Copy();
+    for (size_t i = 0; i < extra_channels.size(); ++i) {
+      PackedImage ec = extra_channels[i].Copy();
+      copy.extra_channels.emplace_back(std::move(ec));
+    }
+    return copy;
+  }
+
+  // The Frame metadata.
+  JxlFrameHeader frame_info = {};
+  std::string name;
+
+  // The pixel data for the color (or grayscale) channels.
+  PackedImage color;
+  // Extra channel image data.
+  std::vector<PackedImage> extra_channels;
+};
+
+// Optional metadata associated with a file
+class PackedMetadata {
+ public:
+  std::vector<uint8_t> exif;
+  std::vector<uint8_t> iptc;
+  std::vector<uint8_t> jumbf;
+  std::vector<uint8_t> xmp;
+};
+
+// The extra channel metadata information.
+struct PackedExtraChannel {
+  JxlExtraChannelInfo ec_info;
+  size_t index;
+  std::string name;
+};
+
+// Helper class representing a JXL image file as decoded to pixels from the API.
+class PackedPixelFile {
+ public:
+  JxlBasicInfo info = {};
+
+  std::vector<PackedExtraChannel> extra_channels_info;
+
+  // Color information of the decoded pixels.
+  // If the icc is empty, the JxlColorEncoding should be used instead.
+  std::vector<uint8_t> icc;
+  JxlColorEncoding color_encoding = {};
+  // The icc profile of the original image.
+  std::vector<uint8_t> orig_icc;
+
+  std::unique_ptr<PackedFrame> preview_frame;
+  std::vector<PackedFrame> frames;
+
+  PackedMetadata metadata;
+  PackedPixelFile() { JxlEncoderInitBasicInfo(&info); };
+};
+
+}  // namespace extras
+}  // namespace jxl
+
+#endif  // LIB_EXTRAS_PACKED_IMAGE_H_
diff --git a/third_party/jpeg-xl/lib/extras/packed_image_convert.cc b/third_party/jpeg-xl/lib/extras/packed_image_convert.cc
new file mode 100644
index 0000000000..1dd2b45a7f
--- /dev/null
+++ b/third_party/jpeg-xl/lib/extras/packed_image_convert.cc
@@ -0,0 +1,300 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/extras/packed_image_convert.h"
+
+#include <jxl/color_encoding.h>
+#include <jxl/types.h>
+
+#include <cstdint>
+
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/color_encoding_internal.h"
+#include "lib/jxl/color_management.h"
+#include "lib/jxl/dec_external_image.h"
+#include "lib/jxl/enc_color_management.h"
+#include "lib/jxl/enc_external_image.h"
+#include "lib/jxl/enc_image_bundle.h"
+#include "lib/jxl/luminance.h"
+
+namespace jxl {
+namespace extras {
+
+Status ConvertPackedFrameToImageBundle(const JxlBasicInfo& info,
+                                       const PackedFrame& frame,
+                                       const CodecInOut& io, ThreadPool* pool,
+                                       ImageBundle* bundle) {
+  JXL_ASSERT(frame.color.pixels() != nullptr);
+  const bool float_in = frame.color.format.data_type == JXL_TYPE_FLOAT16 ||
+                        frame.color.format.data_type == JXL_TYPE_FLOAT;
+  size_t frame_bits_per_sample =
+      float_in ? PackedImage::BitsPerChannel(frame.color.format.data_type)
+               : info.bits_per_sample;
+  JXL_ASSERT(frame_bits_per_sample != 0);
+  // It is ok for the frame.color.format.num_channels to not match the
+  // number of channels on the image.
+  JXL_ASSERT(1 <= frame.color.format.num_channels &&
+             frame.color.format.num_channels <= 4);
+
+  const Span<const uint8_t> span(
+      static_cast<const uint8_t*>(frame.color.pixels()),
+      frame.color.pixels_size);
+  JXL_ASSERT(Rect(frame.frame_info.layer_info.crop_x0,
+                  frame.frame_info.layer_info.crop_y0,
+                  frame.frame_info.layer_info.xsize,
+                  frame.frame_info.layer_info.ysize)
+                 .IsInside(Rect(0, 0, info.xsize, info.ysize)));
+  if (info.have_animation) {
+    bundle->duration = frame.frame_info.duration;
+    bundle->blend = frame.frame_info.layer_info.blend_info.blendmode > 0;
+    bundle->use_for_next_frame =
+        frame.frame_info.layer_info.save_as_reference > 0;
+    bundle->origin.x0 = frame.frame_info.layer_info.crop_x0;
+    bundle->origin.y0 = frame.frame_info.layer_info.crop_y0;
+  }
+  bundle->name = frame.name;  // frame.frame_info.name_length is ignored here.
+  JXL_ASSERT(io.metadata.m.color_encoding.IsGray() ==
+             (frame.color.format.num_channels <= 2));
+
+  JXL_RETURN_IF_ERROR(ConvertFromExternal(
+      span, frame.color.xsize, frame.color.ysize, io.metadata.m.color_encoding,
+      frame_bits_per_sample, frame.color.format, pool, bundle));
+
+  bundle->extra_channels().resize(io.metadata.m.extra_channel_info.size());
+  for (size_t i = 0; i < frame.extra_channels.size(); i++) {
+    const auto& ppf_ec = frame.extra_channels[i];
+    bundle->extra_channels()[i] = ImageF(ppf_ec.xsize, ppf_ec.ysize);
+    JXL_CHECK(BufferToImageF(ppf_ec.format, ppf_ec.xsize, ppf_ec.ysize,
+                             ppf_ec.pixels(), ppf_ec.pixels_size, pool,
+                             &bundle->extra_channels()[i]));
+  }
+  return true;
+}
+
+Status ConvertPackedPixelFileToCodecInOut(const PackedPixelFile& ppf,
+                                          ThreadPool* pool, CodecInOut* io) {
+  const bool has_alpha = ppf.info.alpha_bits != 0;
+  JXL_ASSERT(!ppf.frames.empty());
+  if (has_alpha) {
+    JXL_ASSERT(ppf.info.alpha_bits == ppf.info.bits_per_sample);
+    JXL_ASSERT(ppf.info.alpha_exponent_bits ==
+               ppf.info.exponent_bits_per_sample);
+  }
+
+  const bool is_gray = ppf.info.num_color_channels == 1;
+  JXL_ASSERT(ppf.info.num_color_channels == 1 ||
+             ppf.info.num_color_channels == 3);
+
+  // Convert the image metadata
+  io->SetSize(ppf.info.xsize, ppf.info.ysize);
+  io->metadata.m.bit_depth.bits_per_sample = ppf.info.bits_per_sample;
+  io->metadata.m.bit_depth.exponent_bits_per_sample =
+      ppf.info.exponent_bits_per_sample;
+  io->metadata.m.bit_depth.floating_point_sample =
+      ppf.info.exponent_bits_per_sample != 0;
+  io->metadata.m.modular_16_bit_buffer_sufficient =
+      ppf.info.exponent_bits_per_sample == 0 && ppf.info.bits_per_sample <= 12;
+
+  io->metadata.m.SetAlphaBits(ppf.info.alpha_bits,
+                              ppf.info.alpha_premultiplied);
+
+  io->metadata.m.xyb_encoded = !ppf.info.uses_original_profile;
+  JXL_ASSERT(ppf.info.orientation > 0 && ppf.info.orientation <= 8);
+  io->metadata.m.orientation = ppf.info.orientation;
+
+  // Convert animation metadata
+  JXL_ASSERT(ppf.frames.size() == 1 || ppf.info.have_animation);
+  io->metadata.m.have_animation = ppf.info.have_animation;
+  io->metadata.m.animation.tps_numerator = ppf.info.animation.tps_numerator;
+  io->metadata.m.animation.tps_denominator = ppf.info.animation.tps_denominator;
+  io->metadata.m.animation.num_loops = ppf.info.animation.num_loops;
+
+  // Convert the color encoding.
+  if (!ppf.icc.empty()) {
+    PaddedBytes icc;
+    icc.append(ppf.icc);
+    if (!io->metadata.m.color_encoding.SetICC(std::move(icc))) {
+      fprintf(stderr, "Warning: error setting ICC profile, assuming SRGB\n");
+      io->metadata.m.color_encoding = ColorEncoding::SRGB(is_gray);
+    } else {
+      if (io->metadata.m.color_encoding.IsGray() != is_gray) {
+        // E.g. JPG image has 3 channels, but gray ICC.
+        return JXL_FAILURE("Embedded ICC does not match image color type");
+      }
+    }
+  } else {
+    JXL_RETURN_IF_ERROR(ConvertExternalToInternalColorEncoding(
+        ppf.color_encoding, &io->metadata.m.color_encoding));
+    if (io->metadata.m.color_encoding.ICC().empty()) {
+      return JXL_FAILURE("Failed to serialize ICC");
+    }
+  }
+
+  // Convert the extra blobs
+  io->blobs.exif = ppf.metadata.exif;
+  io->blobs.iptc = ppf.metadata.iptc;
+  io->blobs.jumbf = ppf.metadata.jumbf;
+  io->blobs.xmp = ppf.metadata.xmp;
+
+  // Append all other extra channels.
+  for (const auto& info : ppf.extra_channels_info) {
+    ExtraChannelInfo out;
+    out.type = static_cast<jxl::ExtraChannel>(info.ec_info.type);
+    out.bit_depth.bits_per_sample = info.ec_info.bits_per_sample;
+    out.bit_depth.exponent_bits_per_sample =
+        info.ec_info.exponent_bits_per_sample;
+    out.bit_depth.floating_point_sample =
+        info.ec_info.exponent_bits_per_sample != 0;
+    out.dim_shift = info.ec_info.dim_shift;
+    out.name = info.name;
+    out.alpha_associated = (info.ec_info.alpha_premultiplied != 0);
+    out.spot_color[0] = info.ec_info.spot_color[0];
+    out.spot_color[1] = info.ec_info.spot_color[1];
+    out.spot_color[2] = info.ec_info.spot_color[2];
+    out.spot_color[3] = info.ec_info.spot_color[3];
+    io->metadata.m.extra_channel_info.push_back(std::move(out));
+  }
+
+  // Convert the preview
+  if (ppf.preview_frame) {
+    size_t preview_xsize = ppf.preview_frame->color.xsize;
+    size_t preview_ysize = ppf.preview_frame->color.ysize;
+    io->metadata.m.have_preview = true;
+    JXL_RETURN_IF_ERROR(
+        io->metadata.m.preview_size.Set(preview_xsize, preview_ysize));
+    JXL_RETURN_IF_ERROR(ConvertPackedFrameToImageBundle(
+        ppf.info, *ppf.preview_frame, *io, pool, &io->preview_frame));
+  }
+
+  // Convert the pixels
+  io->frames.clear();
+  for (const auto& frame : ppf.frames) {
+    ImageBundle bundle(&io->metadata.m);
+    JXL_RETURN_IF_ERROR(
+        ConvertPackedFrameToImageBundle(ppf.info, frame, *io, pool, &bundle));
+    io->frames.push_back(std::move(bundle));
+  }
+
+  if (ppf.info.exponent_bits_per_sample == 0) {
+    // uint case.
+    io->metadata.m.bit_depth.bits_per_sample = io->Main().DetectRealBitdepth();
+  }
+  if (ppf.info.intensity_target != 0) {
+    io->metadata.m.SetIntensityTarget(ppf.info.intensity_target);
+  } else {
+    SetIntensityTarget(&io->metadata.m);
+  }
+  io->CheckMetadata();
+  return true;
+}
+
+// Allows converting from internal CodecInOut to external PackedPixelFile
+Status ConvertCodecInOutToPackedPixelFile(const CodecInOut& io,
+                                          const JxlPixelFormat& pixel_format,
+                                          const ColorEncoding& c_desired,
+                                          ThreadPool* pool,
+                                          PackedPixelFile* ppf) {
+  const bool has_alpha = io.metadata.m.HasAlpha();
+  bool alpha_premultiplied = false;
+  JXL_ASSERT(!io.frames.empty());
+
+  if (has_alpha) {
+    JXL_ASSERT(io.metadata.m.GetAlphaBits() ==
+               io.metadata.m.bit_depth.bits_per_sample);
+    const auto* alpha_channel = io.metadata.m.Find(ExtraChannel::kAlpha);
+    JXL_ASSERT(alpha_channel->bit_depth.exponent_bits_per_sample ==
+               io.metadata.m.bit_depth.exponent_bits_per_sample);
+    alpha_premultiplied = alpha_channel->alpha_associated;
+  }
+
+  // Convert the image metadata
+  ppf->info.xsize = io.metadata.size.xsize();
+  ppf->info.ysize = io.metadata.size.ysize();
+  ppf->info.num_color_channels = io.metadata.m.color_encoding.Channels();
+  ppf->info.bits_per_sample = io.metadata.m.bit_depth.bits_per_sample;
+  ppf->info.exponent_bits_per_sample =
+      io.metadata.m.bit_depth.exponent_bits_per_sample;
+
+  ppf->info.intensity_target = io.metadata.m.tone_mapping.intensity_target;
+  ppf->info.linear_below = io.metadata.m.tone_mapping.linear_below;
+  ppf->info.min_nits = io.metadata.m.tone_mapping.min_nits;
+  ppf->info.relative_to_max_display =
+      io.metadata.m.tone_mapping.relative_to_max_display;
+
+  ppf->info.alpha_bits = io.metadata.m.GetAlphaBits();
+  ppf->info.alpha_premultiplied = alpha_premultiplied;
+
+  ppf->info.uses_original_profile = !io.metadata.m.xyb_encoded;
+  JXL_ASSERT(0 < io.metadata.m.orientation && io.metadata.m.orientation <= 8);
+  ppf->info.orientation =
+      static_cast<JxlOrientation>(io.metadata.m.orientation);
+  ppf->info.num_color_channels = io.metadata.m.color_encoding.Channels();
+
+  // Convert animation metadata
+  JXL_ASSERT(io.frames.size() == 1 || io.metadata.m.have_animation);
+  ppf->info.have_animation = io.metadata.m.have_animation;
+  ppf->info.animation.tps_numerator = io.metadata.m.animation.tps_numerator;
+  ppf->info.animation.tps_denominator = io.metadata.m.animation.tps_denominator;
+  ppf->info.animation.num_loops = io.metadata.m.animation.num_loops;
+
+  // Convert the color encoding
+  ppf->icc.assign(c_desired.ICC().begin(), c_desired.ICC().end());
+  ConvertInternalToExternalColorEncoding(c_desired, &ppf->color_encoding);
+
+  // Convert the extra blobs
+  ppf->metadata.exif = io.blobs.exif;
+  ppf->metadata.iptc = io.blobs.iptc;
+  ppf->metadata.jumbf = io.blobs.jumbf;
+  ppf->metadata.xmp = io.blobs.xmp;
+  const bool float_out = pixel_format.data_type == JXL_TYPE_FLOAT ||
+                         pixel_format.data_type == JXL_TYPE_FLOAT16;
+  // Convert the pixels
+  ppf->frames.clear();
+  for (const auto& frame : io.frames) {
+    JXL_ASSERT(frame.metadata()->bit_depth.bits_per_sample != 0);
+    // It is ok for the frame.color().kNumPlanes to not match the
+    // number of channels on the image.
+    const uint32_t num_channels =
+        frame.metadata()->color_encoding.Channels() + has_alpha;
+    JxlPixelFormat format{/*num_channels=*/num_channels,
+                          /*data_type=*/pixel_format.data_type,
+                          /*endianness=*/pixel_format.endianness,
+                          /*align=*/pixel_format.align};
+
+    PackedFrame packed_frame(frame.oriented_xsize(), frame.oriented_ysize(),
+                             format);
+    const size_t bits_per_sample =
+        float_out ? packed_frame.color.BitsPerChannel(pixel_format.data_type)
+                  : ppf->info.bits_per_sample;
+    packed_frame.name = frame.name;
+    packed_frame.frame_info.name_length = frame.name.size();
+    // Color transform
+    ImageBundle ib = frame.Copy();
+    const ImageBundle* to_color_transform = &ib;
+    ImageMetadata metadata = io.metadata.m;
+    ImageBundle store(&metadata);
+    const ImageBundle* transformed;
+    // TODO(firsching): handle the transform here.
+    JXL_RETURN_IF_ERROR(TransformIfNeeded(*to_color_transform, c_desired,
+                                          GetJxlCms(), pool, &store,
+                                          &transformed));
+
+    JXL_RETURN_IF_ERROR(ConvertToExternal(
+        *transformed, bits_per_sample, float_out, format.num_channels,
+        format.endianness,
+        /* stride_out=*/packed_frame.color.stride, pool,
+        packed_frame.color.pixels(), packed_frame.color.pixels_size,
+        /*out_callback=*/{}, frame.metadata()->GetOrientation()));
+
+    // TODO(firsching): Convert the extra channels, beside one potential alpha
+    // channel. FIXME!
+    JXL_CHECK(frame.extra_channels().size() <= has_alpha);
+    ppf->frames.push_back(std::move(packed_frame));
+  }
+
+  return true;
+}
+}  // namespace extras
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/extras/packed_image_convert.h b/third_party/jpeg-xl/lib/extras/packed_image_convert.h
new file mode 100644
index 0000000000..100adccc09
--- /dev/null
+++ b/third_party/jpeg-xl/lib/extras/packed_image_convert.h
@@ -0,0 +1,36 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_EXTRAS_PACKED_IMAGE_CONVERT_H_
+#define LIB_EXTRAS_PACKED_IMAGE_CONVERT_H_
+
+// Helper functions to convert from the external image types to the internal
+// CodecInOut to help transitioning to the external types.
+
+#include <jxl/types.h>
+
+#include "lib/extras/packed_image.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/codec_in_out.h"
+
+namespace jxl {
+namespace extras {
+
+// Converts an external PackedPixelFile to the internal CodecInOut for use with
+// internal functions directly.
+Status ConvertPackedPixelFileToCodecInOut(const PackedPixelFile& ppf,
+                                          ThreadPool* pool, CodecInOut* io);
+
+// Converts an internal CodecInOut for use with internal function to an external
+// PackedPixelFile.
+Status ConvertCodecInOutToPackedPixelFile(const CodecInOut& io,
+                                          const JxlPixelFormat& pixel_format,
+                                          const ColorEncoding& c_desired,
+                                          ThreadPool* pool,
+                                          PackedPixelFile* ppf);
+}  // namespace extras
+}  // namespace jxl
+
+#endif  // LIB_EXTRAS_PACKED_IMAGE_CONVERT_H_
diff --git a/third_party/jpeg-xl/lib/extras/size_constraints.h b/third_party/jpeg-xl/lib/extras/size_constraints.h
new file mode 100644
index 0000000000..cf06f8cb22
--- /dev/null
+++ b/third_party/jpeg-xl/lib/extras/size_constraints.h
@@ -0,0 +1,43 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_SIZE_CONSTRAINTS_H_
+#define LIB_JXL_SIZE_CONSTRAINTS_H_
+
+#include <cstdint>
+#include <type_traits>
+
+#include "lib/jxl/base/status.h"
+
+namespace jxl {
+
+struct SizeConstraints {
+  // Upper limit on pixel dimensions/area, enforced by VerifyDimensions
+  // (called from decoders). Fuzzers set smaller values to limit memory use.
+  uint32_t dec_max_xsize = 0xFFFFFFFFu;
+  uint32_t dec_max_ysize = 0xFFFFFFFFu;
+  uint64_t dec_max_pixels = 0xFFFFFFFFu;  // Might be up to ~0ull
+};
+
+template <typename T,
+          class = typename std::enable_if<std::is_unsigned<T>::value>::type>
+Status VerifyDimensions(const SizeConstraints* constraints, T xs, T ys) {
+  if (!constraints) return true;
+
+  if (xs == 0 || ys == 0) return JXL_FAILURE("Empty image.");
+  if (xs > constraints->dec_max_xsize) return JXL_FAILURE("Image too wide.");
+  if (ys > constraints->dec_max_ysize) return JXL_FAILURE("Image too tall.");
+
+  const uint64_t num_pixels = static_cast<uint64_t>(xs) * ys;
+  if (num_pixels > constraints->dec_max_pixels) {
+    return JXL_FAILURE("Image too big.");
+  }
+
+  return true;
+}
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_SIZE_CONSTRAINTS_H_
diff --git a/third_party/jpeg-xl/lib/extras/time.cc b/third_party/jpeg-xl/lib/extras/time.cc
new file mode 100644
index 0000000000..73d1b8f260
--- /dev/null
+++ b/third_party/jpeg-xl/lib/extras/time.cc
@@ -0,0 +1,60 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/extras/time.h"
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <ctime>
+
+#include "lib/jxl/base/os_macros.h"  // for JXL_OS_*
+
+#if JXL_OS_WIN
+#ifndef NOMINMAX
+#define NOMINMAX
+#endif  // NOMINMAX
+#include <windows.h>
+#endif  // JXL_OS_WIN
+
+#if JXL_OS_MAC
+#include <mach/mach.h>
+#include <mach/mach_time.h>
+#endif  // JXL_OS_MAC
+
+#if JXL_OS_HAIKU
+#include <OS.h>
+#endif  // JXL_OS_HAIKU
+
+namespace jxl {
+
+double Now() {
+#if JXL_OS_WIN
+  LARGE_INTEGER counter;
+  (void)QueryPerformanceCounter(&counter);
+  LARGE_INTEGER freq;
+  (void)QueryPerformanceFrequency(&freq);
+  return double(counter.QuadPart) / freq.QuadPart;
+#elif JXL_OS_MAC
+  const auto t = mach_absolute_time();
+  // On OSX/iOS platform the elapsed time is cpu time unit
+  // We have to query the time base information to convert it back
+  // See https://developer.apple.com/library/mac/qa/qa1398/_index.html
+  static mach_timebase_info_data_t timebase;
+  if (timebase.denom == 0) {
+    (void)mach_timebase_info(&timebase);
+  }
+  return double(t) * timebase.numer / timebase.denom * 1E-9;
+#elif JXL_OS_HAIKU
+  return double(system_time_nsecs()) * 1E-9;
+#else
+  timespec t;
+  clock_gettime(CLOCK_MONOTONIC, &t);
+  return t.tv_sec + t.tv_nsec * 1E-9;
+#endif
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/extras/time.h b/third_party/jpeg-xl/lib/extras/time.h
new file mode 100644
index 0000000000..c71414b877
--- /dev/null
+++ b/third_party/jpeg-xl/lib/extras/time.h
@@ -0,0 +1,19 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_EXTRAS_TIME_H_
+#define LIB_EXTRAS_TIME_H_
+
+// OS-specific function for timing.
+
+namespace jxl {
+
+// Returns current time [seconds] from a monotonic clock with unspecified
+// starting point - only suitable for computing elapsed time.
+double Now();
+
+}  // namespace jxl
+
+#endif  // LIB_EXTRAS_TIME_H_
diff --git a/third_party/jpeg-xl/lib/extras/tone_mapping.cc b/third_party/jpeg-xl/lib/extras/tone_mapping.cc
new file mode 100644
index 0000000000..1cdd6ed826
--- /dev/null
+++ b/third_party/jpeg-xl/lib/extras/tone_mapping.cc
@@ -0,0 +1,132 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/extras/tone_mapping.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/extras/tone_mapping.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/dec_tone_mapping-inl.h"
+#include "lib/jxl/enc_color_management.h"
+#include "lib/jxl/image_bundle.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+static constexpr float rec2020_luminances[3] = {0.2627f, 0.6780f, 0.0593f};
+
+Status ToneMapFrame(const std::pair<float, float> display_nits,
+                    ImageBundle* const ib, ThreadPool* const pool) {
+  // Perform tone mapping as described in Report ITU-R BT.2390-8, section 5.4
+  // (pp. 23-25).
+  // https://www.itu.int/pub/R-REP-BT.2390-8-2020
+
+  HWY_FULL(float) df;
+  using V = decltype(Zero(df));
+
+  ColorEncoding linear_rec2020;
+  linear_rec2020.SetColorSpace(ColorSpace::kRGB);
+  linear_rec2020.primaries = Primaries::k2100;
+  linear_rec2020.white_point = WhitePoint::kD65;
+  linear_rec2020.tf.SetTransferFunction(TransferFunction::kLinear);
+  JXL_RETURN_IF_ERROR(linear_rec2020.CreateICC());
+  JXL_RETURN_IF_ERROR(ib->TransformTo(linear_rec2020, GetJxlCms(), pool));
+
+  Rec2408ToneMapper<decltype(df)> tone_mapper(
+      {ib->metadata()->tone_mapping.min_nits,
+       ib->metadata()->IntensityTarget()},
+      display_nits, rec2020_luminances);
+
+  return RunOnPool(
+      pool, 0, ib->ysize(), ThreadPool::NoInit,
+      [&](const uint32_t y, size_t /* thread */) {
+        float* const JXL_RESTRICT row_r = ib->color()->PlaneRow(0, y);
+        float* const JXL_RESTRICT row_g = ib->color()->PlaneRow(1, y);
+        float* const JXL_RESTRICT row_b = ib->color()->PlaneRow(2, y);
+        for (size_t x = 0; x < ib->xsize(); x += Lanes(df)) {
+          V red = Load(df, row_r + x);
+          V green = Load(df, row_g + x);
+          V blue = Load(df, row_b + x);
+          tone_mapper.ToneMap(&red, &green, &blue);
+          Store(red, df, row_r + x);
+          Store(green, df, row_g + x);
+          Store(blue, df, row_b + x);
+        }
+      },
+      "ToneMap");
+}
+
+Status GamutMapFrame(ImageBundle* const ib, float preserve_saturation,
+                     ThreadPool* const pool) {
+  HWY_FULL(float) df;
+  using V = decltype(Zero(df));
+
+  ColorEncoding linear_rec2020;
+  linear_rec2020.SetColorSpace(ColorSpace::kRGB);
+  linear_rec2020.primaries = Primaries::k2100;
+  linear_rec2020.white_point = WhitePoint::kD65;
+  linear_rec2020.tf.SetTransferFunction(TransferFunction::kLinear);
+  JXL_RETURN_IF_ERROR(linear_rec2020.CreateICC());
+  JXL_RETURN_IF_ERROR(ib->TransformTo(linear_rec2020, GetJxlCms(), pool));
+
+  JXL_RETURN_IF_ERROR(RunOnPool(
+      pool, 0, ib->ysize(), ThreadPool::NoInit,
+      [&](const uint32_t y, size_t /* thread*/) {
+        float* const JXL_RESTRICT row_r = ib->color()->PlaneRow(0, y);
+        float* const JXL_RESTRICT row_g = ib->color()->PlaneRow(1, y);
+        float* const JXL_RESTRICT row_b = ib->color()->PlaneRow(2, y);
+        for (size_t x = 0; x < ib->xsize(); x += Lanes(df)) {
+          V red = Load(df, row_r + x);
+          V green = Load(df, row_g + x);
+          V blue = Load(df, row_b + x);
+          GamutMap(&red, &green, &blue, rec2020_luminances,
+                   preserve_saturation);
+          Store(red, df, row_r + x);
+          Store(green, df, row_g + x);
+          Store(blue, df, row_b + x);
+        }
+      },
+      "GamutMap"));
+
+  return true;
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+namespace {
+HWY_EXPORT(ToneMapFrame);
+HWY_EXPORT(GamutMapFrame);
+}  // namespace
+
+Status ToneMapTo(const std::pair<float, float> display_nits,
+                 CodecInOut* const io, ThreadPool* const pool) {
+  const auto tone_map_frame = HWY_DYNAMIC_DISPATCH(ToneMapFrame);
+  for (ImageBundle& ib : io->frames) {
+    JXL_RETURN_IF_ERROR(tone_map_frame(display_nits, &ib, pool));
+  }
+  io->metadata.m.SetIntensityTarget(display_nits.second);
+  return true;
+}
+
+Status GamutMap(CodecInOut* const io, float preserve_saturation,
+                ThreadPool* const pool) {
+  const auto gamut_map_frame = HWY_DYNAMIC_DISPATCH(GamutMapFrame);
+  for (ImageBundle& ib : io->frames) {
+    JXL_RETURN_IF_ERROR(gamut_map_frame(&ib, preserve_saturation, pool));
+  }
+  return true;
+}
+
+}  // namespace jxl
+#endif
diff --git a/third_party/jpeg-xl/lib/extras/tone_mapping.h b/third_party/jpeg-xl/lib/extras/tone_mapping.h
new file mode 100644
index 0000000000..1f474101eb
--- /dev/null
+++ b/third_party/jpeg-xl/lib/extras/tone_mapping.h
@@ -0,0 +1,30 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_EXTRAS_TONE_MAPPING_H_
+#define LIB_EXTRAS_TONE_MAPPING_H_
+
+#include "lib/jxl/codec_in_out.h"
+
+namespace jxl {
+
+// Important: after calling this, the result will contain many out-of-gamut
+// colors. It is very strongly recommended to call GamutMap afterwards to
+// rectify this.
+Status ToneMapTo(std::pair<float, float> display_nits, CodecInOut* io,
+                 ThreadPool* pool = nullptr);
+
+// `preserve_saturation` indicates to what extent to favor saturation over
+// luminance when mapping out-of-gamut colors to Rec. 2020. 0 preserves
+// luminance at the complete expense of saturation, while 1 gives the most
+// saturated color with the same hue that Rec. 2020 can represent even if it
+// means lowering the luminance. Values in between correspond to linear mixtures
+// of those two extremes.
+Status GamutMap(CodecInOut* io, float preserve_saturation,
+                ThreadPool* pool = nullptr);
+
+}  // namespace jxl
+
+#endif  // LIB_EXTRAS_TONE_MAPPING_H_
diff --git a/third_party/jpeg-xl/lib/extras/tone_mapping_gbench.cc b/third_party/jpeg-xl/lib/extras/tone_mapping_gbench.cc
new file mode 100644
index 0000000000..f1d5357345
--- /dev/null
+++ b/third_party/jpeg-xl/lib/extras/tone_mapping_gbench.cc
@@ -0,0 +1,40 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "benchmark/benchmark.h"
+#include "lib/extras/codec.h"
+#include "lib/extras/tone_mapping.h"
+#include "lib/jxl/enc_color_management.h"
+
+namespace jxl {
+
+static void BM_ToneMapping(benchmark::State& state) {
+  Image3F color(2268, 1512);
+  FillImage(0.5f, &color);
+
+  // Use linear Rec. 2020 so that `ToneMapTo` doesn't have to convert to it and
+  // we mainly measure the tone mapping itself.
+  ColorEncoding linear_rec2020;
+  linear_rec2020.SetColorSpace(ColorSpace::kRGB);
+  linear_rec2020.primaries = Primaries::k2100;
+  linear_rec2020.white_point = WhitePoint::kD65;
+  linear_rec2020.tf.SetTransferFunction(TransferFunction::kLinear);
+  JXL_CHECK(linear_rec2020.CreateICC());
+
+  for (auto _ : state) {
+    state.PauseTiming();
+    CodecInOut tone_mapping_input;
+    tone_mapping_input.SetFromImage(CopyImage(color), linear_rec2020);
+    tone_mapping_input.metadata.m.SetIntensityTarget(255);
+    state.ResumeTiming();
+
+    JXL_CHECK(ToneMapTo({0.1, 100}, &tone_mapping_input));
+  }
+
+  state.SetItemsProcessed(state.iterations() * color.xsize() * color.ysize());
+}
+BENCHMARK(BM_ToneMapping);
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/gbench_main.cc b/third_party/jpeg-xl/lib/gbench_main.cc
new file mode 100644
index 0000000000..1cc1772017
--- /dev/null
+++ b/third_party/jpeg-xl/lib/gbench_main.cc
@@ -0,0 +1,8 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "benchmark/benchmark.h"
+
+BENCHMARK_MAIN();
diff --git a/third_party/jpeg-xl/lib/include/jxl/butteraugli.h b/third_party/jpeg-xl/lib/include/jxl/butteraugli.h
new file mode 100644
index 0000000000..88b97f6d07
--- /dev/null
+++ b/third_party/jpeg-xl/lib/include/jxl/butteraugli.h
@@ -0,0 +1,160 @@
+/* Copyright (c) the JPEG XL Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style
+ * license that can be found in the LICENSE file.
+ */
+
+/** @addtogroup libjxl_butteraugli
+ * @{
+ * @file butteraugli.h
+ * @brief Butteraugli API for JPEG XL.
+ */
+
+#ifndef JXL_BUTTERAUGLI_H_
+#define JXL_BUTTERAUGLI_H_
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+#include <jxl/jxl_export.h>
+#include <jxl/memory_manager.h>
+#include <jxl/parallel_runner.h>
+#include <jxl/types.h>
+
+/**
+ * Opaque structure that holds a butteraugli API.
+ *
+ * Allocated and initialized with JxlButteraugliApiCreate().
+ * Cleaned up and deallocated with JxlButteraugliApiDestroy().
+ */
+typedef struct JxlButteraugliApiStruct JxlButteraugliApi;
+
+/**
+ * Opaque structure that holds intermediary butteraugli results.
+ *
+ * Allocated and initialized with JxlButteraugliCompute().
+ * Cleaned up and deallocated with JxlButteraugliResultDestroy().
+ */
+typedef struct JxlButteraugliResultStruct JxlButteraugliResult;
+
+/**
+ * Deinitializes and frees JxlButteraugliResult instance.
+ *
+ * @param result instance to be cleaned up and deallocated.
+ */
+JXL_EXPORT void JxlButteraugliResultDestroy(JxlButteraugliResult* result);
+
+/**
+ * Creates an instance of JxlButteraugliApi and initializes it.
+ *
+ * @p memory_manager will be used for all the library dynamic allocations made
+ * from this instance. The parameter may be NULL, in which case the default
+ * allocator will be used. See jxl/memory_manager.h for details.
+ *
+ * @param memory_manager custom allocator function. It may be NULL. The memory
+ *        manager will be copied internally.
+ * @return @c NULL if the instance can not be allocated or initialized
+ * @return pointer to initialized JxlEncoder otherwise
+ */
+JXL_EXPORT JxlButteraugliApi* JxlButteraugliApiCreate(
+    const JxlMemoryManager* memory_manager);
+
+/**
+ * Set the parallel runner for multithreading.
+ *
+ * @param api api instance.
+ * @param parallel_runner function pointer to runner for multithreading. A
+ * multithreaded runner should be set to reach fast performance.
+ * @param parallel_runner_opaque opaque pointer for parallel_runner.
+ */
+JXL_EXPORT void JxlButteraugliApiSetParallelRunner(
+    JxlButteraugliApi* api, JxlParallelRunner parallel_runner,
+    void* parallel_runner_opaque);
+
+/**
+ * Set the hf_asymmetry option for butteraugli.
+ *
+ * @param api api instance.
+ * @param v new hf_asymmetry value.
+ */
+JXL_EXPORT void JxlButteraugliApiSetHFAsymmetry(JxlButteraugliApi* api,
+                                                float v);
+
+/**
+ * Set the intensity_target option for butteraugli.
+ *
+ * @param api api instance.
+ * @param v new intensity_target value.
+ */
+JXL_EXPORT void JxlButteraugliApiSetIntensityTarget(JxlButteraugliApi* api,
+                                                    float v);
+
+/**
+ * Deinitializes and frees JxlButteraugliApi instance.
+ *
+ * @param api instance to be cleaned up and deallocated.
+ */
+JXL_EXPORT void JxlButteraugliApiDestroy(JxlButteraugliApi* api);
+
+/**
+ * Computes intermediary butteraugli result between an original image and a
+ * distortion.
+ *
+ * @param api api instance for this computation.
+ * @param xsize width of the compared images.
+ * @param ysize height of the compared images.
+ * @param pixel_format_orig pixel format for original image.
+ * @param buffer_orig pixel data for original image.
+ * @param size_orig size of buffer_orig in bytes.
+ * @param pixel_format_dist pixel format for distortion.
+ * @param buffer_dist pixel data for distortion.
+ * @param size_dist size of buffer_dist in bytes.
+ * @return @c NULL if the results can not be computed or initialized.
+ * @return pointer to initialized and computed intermediary result.
+ */
+JXL_EXPORT JxlButteraugliResult* JxlButteraugliCompute(
+    const JxlButteraugliApi* api, uint32_t xsize, uint32_t ysize,
+    const JxlPixelFormat* pixel_format_orig, const void* buffer_orig,
+    size_t size_orig, const JxlPixelFormat* pixel_format_dist,
+    const void* buffer_dist, size_t size_dist);
+
+/**
+ * Computes butteraugli max distance based on an intermediary butteraugli
+ * result.
+ *
+ * @param result intermediary result instance.
+ * @return max distance.
+ */
+JXL_EXPORT float JxlButteraugliResultGetMaxDistance(
+    const JxlButteraugliResult* result);
+
+/**
+ * Computes a butteraugli distance based on an intermediary butteraugli result.
+ *
+ * @param result intermediary result instance.
+ * @param pnorm pnorm to calculate.
+ * @return distance using the given pnorm.
+ */
+JXL_EXPORT float JxlButteraugliResultGetDistance(
+    const JxlButteraugliResult* result, float pnorm);
+
+/**
+ * Get a pointer to the distmap in the result.
+ *
+ * @param result intermediary result instance.
+ * @param buffer will be set to the distmap. The distance value for (x,y) will
+ * be available at buffer + y * row_stride + x.
+ * @param row_stride will be set to the row stride of the distmap.
+ */
+JXL_EXPORT void JxlButteraugliResultGetDistmap(
+    const JxlButteraugliResult* result, const float** buffer,
+    uint32_t* row_stride);
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}
+#endif
+
+#endif /* JXL_BUTTERAUGLI_H_ */
+
+/** @}*/
diff --git a/third_party/jpeg-xl/lib/include/jxl/butteraugli_cxx.h b/third_party/jpeg-xl/lib/include/jxl/butteraugli_cxx.h
new file mode 100644
index 0000000000..34f904c0df
--- /dev/null
+++ b/third_party/jpeg-xl/lib/include/jxl/butteraugli_cxx.h
@@ -0,0 +1,60 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+/// @addtogroup libjxl_butteraugli
+/// @{
+///
+/// @file butteraugli_cxx.h
+/// @brief C++ header-only helper for @ref butteraugli.h.
+///
+/// There's no binary library associated with the header since this is a header
+/// only library.
+
+#ifndef JXL_BUTTERAUGLI_CXX_H_
+#define JXL_BUTTERAUGLI_CXX_H_
+
+#include <jxl/butteraugli.h>
+
+#include <memory>
+
+#if !(defined(__cplusplus) || defined(c_plusplus))
+#error "This a C++ only header. Use jxl/butteraugli.h from C sources."
+#endif
+
+/// Struct to call JxlButteraugliApiDestroy from the JxlButteraugliApiPtr
+/// unique_ptr.
+struct JxlButteraugliApiDestroyStruct {
+  /// Calls @ref JxlButteraugliApiDestroy() on the passed api.
+  void operator()(JxlButteraugliApi* api) { JxlButteraugliApiDestroy(api); }
+};
+
+/// std::unique_ptr<> type that calls JxlButteraugliApiDestroy() when releasing
+/// the pointer.
+///
+/// Use this helper type from C++ sources to ensure the api is destroyed and
+/// their internal resources released.
+typedef std::unique_ptr<JxlButteraugliApi, JxlButteraugliApiDestroyStruct>
+    JxlButteraugliApiPtr;
+
+/// Struct to call JxlButteraugliResultDestroy from the JxlButteraugliResultPtr
+/// unique_ptr.
+struct JxlButteraugliResultDestroyStruct {
+  /// Calls @ref JxlButteraugliResultDestroy() on the passed result object.
+  void operator()(JxlButteraugliResult* result) {
+    JxlButteraugliResultDestroy(result);
+  }
+};
+
+/// std::unique_ptr<> type that calls JxlButteraugliResultDestroy() when
+/// releasing the pointer.
+///
+/// Use this helper type from C++ sources to ensure the result object is
+/// destroyed and their internal resources released.
+typedef std::unique_ptr<JxlButteraugliResult, JxlButteraugliResultDestroyStruct>
+    JxlButteraugliResultPtr;
+
+#endif  // JXL_BUTTERAUGLI_CXX_H_
+
+/// @}
diff --git a/third_party/jpeg-xl/lib/include/jxl/cms_interface.h b/third_party/jpeg-xl/lib/include/jxl/cms_interface.h
new file mode 100644
index 0000000000..684281e641
--- /dev/null
+++ b/third_party/jpeg-xl/lib/include/jxl/cms_interface.h
@@ -0,0 +1,232 @@
+/* Copyright (c) the JPEG XL Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style
+ * license that can be found in the LICENSE file.
+ */
+
+/** @addtogroup libjxl_common
+ * @{
+ * @file cms_interface.h
+ * @brief Interface to allow the injection of different color management systems
+ * (CMSes, also called color management modules, or CMMs) in JPEG XL.
+ *
+ * A CMS is needed by the JPEG XL encoder and decoder to perform colorspace
+ * conversions. This defines an interface that can be implemented for different
+ * CMSes and then passed to the library.
+ */
+
+#ifndef JXL_CMS_INTERFACE_H_
+#define JXL_CMS_INTERFACE_H_
+
+#include <jxl/color_encoding.h>
+#include <jxl/types.h>
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+/** Represents an input or output colorspace to a color transform, as a
+ * serialized ICC profile. */
+typedef struct {
+  /** The serialized ICC profile. This is guaranteed to be present and valid. */
+  struct {
+    const uint8_t* data;
+    size_t size;
+  } icc;
+
+  /** Structured representation of the colorspace, if applicable. If all fields
+   * are different from their "unknown" value, then this is equivalent to the
+   * ICC representation of the colorspace. If some are "unknown", those that are
+   * not are still valid and can still be used on their own if they are useful.
+   */
+  JxlColorEncoding color_encoding;
+
+  /** Number of components per pixel. This can be deduced from the other
+   * representations of the colorspace but is provided for convenience and
+   * validation. */
+  size_t num_channels;
+} JxlColorProfile;
+
+/** Allocates and returns the data needed for @p num_threads parallel transforms
+ * from the @p input colorspace to @p output, with up to @p pixels_per_thread
+ * pixels to transform per call to JxlCmsInterface::run. @p init_data comes
+ * directly from the JxlCmsInterface instance. Since @c run only receives the
+ * data returned by @c init, a reference to @p init_data should be kept there
+ * if access to it is desired in @c run. Likewise for JxlCmsInterface::destroy.
+ *
+ * The ICC data in @p input and @p output is guaranteed to outlive the @c init /
+ * @c run / @c destroy cycle.
+ *
+ * @param init_data JxlCmsInterface::init_data passed as-is.
+ * @param num_threads the maximum number of threads from which
+ *        JxlCmsInterface::run will be called.
+ * @param pixels_per_thread the maximum number of pixels that each call to
+ *        JxlCmsInterface::run will have to transform.
+ * @param input_profile the input colorspace for the transform.
+ * @param output_profile the colorspace to which JxlCmsInterface::run should
+ *        convert the input data.
+ * @param intensity_target for colorspaces where luminance is relative
+ *        (essentially: not PQ), indicates the luminance at which (1, 1, 1) will
+ *        be displayed. This is useful for conversions between PQ and a relative
+ *        luminance colorspace, in either direction: @p intensity_target cd/m²
+ *        in PQ should map to and from (1, 1, 1) in the relative one.\n
+ *        It is also used for conversions to and from HLG, as it is
+ *        scene-referred while other colorspaces are assumed to be
+ *        display-referred. That is, conversions from HLG should apply the OOTF
+ *        for a peak display luminance of @p intensity_target, and conversions
+ *        to HLG should undo it. The OOTF is a gamma function applied to the
+ *        luminance channel (https://www.itu.int/rec/R-REC-BT.2100-2-201807-I
+ *        page 7), with the gamma value computed as
+ *        <tt>1.2 * 1.111^log2(intensity_target / 1000)</tt> (footnote 2 page 8
+ *        of the same document).
+ * @return The data needed for the transform, or @c NULL in case of failure.
+ *         This will be passed to the other functions as @c user_data.
+ */
+typedef void* (*jpegxl_cms_init_func)(void* init_data, size_t num_threads,
+                                      size_t pixels_per_thread,
+                                      const JxlColorProfile* input_profile,
+                                      const JxlColorProfile* output_profile,
+                                      float intensity_target);
+
+/** Returns a buffer that can be used by callers of the interface to store the
+ * input of the conversion or read its result, if they pass it as the input or
+ * output of the @c run function.
+ * @param user_data the data returned by @c init.
+ * @param thread the index of the thread for which to return a buffer.
+ * @return A buffer that can be used by the caller for passing to @c run.
+ */
+typedef float* (*jpegxl_cms_get_buffer_func)(void* user_data, size_t thread);
+
+/** Executes one transform and returns true on success or false on error. It
+ * must be possible to call this from different threads with different values
+ * for @p thread, all between 0 (inclusive) and the value of @p num_threads
+ * passed to @c init (exclusive). It is allowed to implement this by locking
+ * such that the transforms are essentially performed sequentially, if such a
+ * performance profile is acceptable. @p user_data is the data returned by
+ * @c init.
+ * The buffers each contain @p num_pixels × @c num_channels interleaved floating
+ * point (0..1) samples where @c num_channels is the number of color channels of
+ * their respective color profiles. It is guaranteed that the only case in which
+ * they might overlap is if the output has fewer channels than the input, in
+ * which case the pointers may be identical.
+ * For CMYK data, 0 represents the maximum amount of ink while 1 represents no
+ * ink.
+ * @param user_data the data returned by @c init.
+ * @param thread the index of the thread from which the function is being
+ *        called.
+ * @param input_buffer the buffer containing the pixel data to be transformed.
+ * @param output_buffer the buffer receiving the transformed pixel data.
+ * @param num_pixels the number of pixels to transform from @p input to
+ * @p output.
+ * @return JXL_TRUE on success, JXL_FALSE on failure.
+ */
+typedef JXL_BOOL (*jpegxl_cms_run_func)(void* user_data, size_t thread,
+                                        const float* input_buffer,
+                                        float* output_buffer,
+                                        size_t num_pixels);
+
+/** Performs the necessary clean-up and frees the memory allocated for user
+ * data.
+ */
+typedef void (*jpegxl_cms_destroy_func)(void*);
+
+/**
+ * Interface for performing colorspace transforms. The @c init function can be
+ * called several times to instantiate several transforms, including before
+ * other transforms have been destroyed.
+ *
+ * The call sequence for a given colorspace transform could look like the
+ * following:
+ * @dot
+ * digraph calls {
+ *   newrank = true
+ *   node [shape = box, fontname = monospace]
+ *   init [label = "user_data <- init(\l\
+ *     init_data = data,\l\
+ *     num_threads = 3,\l\
+ *     pixels_per_thread = 20,\l\
+ *     input = (sRGB, 3 channels),\l\
+ *     output = (Display-P3, 3 channels),\l\
+ *     intensity_target = 255\l\
+ *   )\l"]
+ *   subgraph cluster_0 {
+ *   color = lightgrey
+ *   label = "thread 1"
+ *   labeljust = "c"
+ *   run_1_1 [label = "run(\l\
+ *     user_data,\l\
+ *     thread = 1,\l\
+ *     input = in[0],\l\
+ *     output = out[0],\l\
+ *     num_pixels = 20\l\
+ *   )\l"]
+ *   run_1_2 [label = "run(\l\
+ *     user_data,\l\
+ *     thread = 1,\l\
+ *     input = in[3],\l\
+ *     output = out[3],\l\
+ *     num_pixels = 20\l\
+ *   )\l"]
+ *   }
+ *   subgraph cluster_1 {
+ *   color = lightgrey
+ *   label = "thread 2"
+ *   labeljust = "l"
+ *   run_2_1 [label = "run(\l\
+ *     user_data,\l\
+ *     thread = 2,\l\
+ *     input = in[1],\l\
+ *     output = out[1],\l\
+ *     num_pixels = 20\l\
+ *   )\l"]
+ *   run_2_2 [label = "run(\l\
+ *     user_data,\l\
+ *     thread = 2,\l\
+ *     input = in[4],\l\
+ *     output = out[4],\l\
+ *     num_pixels = 13\l\
+ *   )\l"]
+ *   }
+ *   subgraph cluster_3 {
+ *   color = lightgrey
+ *   label = "thread 3"
+ *   labeljust = "c"
+ *   run_3_1 [label = "run(\l\
+ *     user_data,\l\
+ *     thread = 3,\l\
+ *     input = in[2],\l\
+ *     output = out[2],\l\
+ *     num_pixels = 20\l\
+ *   )\l"]
+ *   }
+ *   init -> {run_1_1; run_2_1; run_3_1; rank = same}
+ *   run_1_1 -> run_1_2
+ *   run_2_1 -> run_2_2
+ *   {run_1_2; run_2_2, run_3_1} -> "destroy(user_data)"
+ * }
+ * @enddot
+ */
+typedef struct {
+  /** CMS-specific data that will be passed to @ref init. */
+  void* init_data;
+  /** Prepares a colorspace transform as described in the documentation of @ref
+   * jpegxl_cms_init_func. */
+  jpegxl_cms_init_func init;
+  /** Returns a buffer that can be used as input to @c run. */
+  jpegxl_cms_get_buffer_func get_src_buf;
+  /** Returns a buffer that can be used as output from @c run. */
+  jpegxl_cms_get_buffer_func get_dst_buf;
+  /** Executes the transform on a batch of pixels, per @ref jpegxl_cms_run_func.
+   */
+  jpegxl_cms_run_func run;
+  /** Cleans up the transform. */
+  jpegxl_cms_destroy_func destroy;
+} JxlCmsInterface;
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}
+#endif
+
+#endif /* JXL_CMS_INTERFACE_H_ */
+
+/** @} */
diff --git a/third_party/jpeg-xl/lib/include/jxl/codestream_header.h b/third_party/jpeg-xl/lib/include/jxl/codestream_header.h
new file mode 100644
index 0000000000..66dd7df4ce
--- /dev/null
+++ b/third_party/jpeg-xl/lib/include/jxl/codestream_header.h
@@ -0,0 +1,430 @@
+/* Copyright (c) the JPEG XL Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style
+ * license that can be found in the LICENSE file.
+ */
+
+/** @addtogroup libjxl_common
+ * @{
+ * @file codestream_header.h
+ * @brief Definitions of structs and enums for the metadata from the JPEG XL
+ * codestream headers (signature, metadata, preview dimensions, ...), excluding
+ * color encoding which is in color_encoding.h.
+ */
+
+#ifndef JXL_CODESTREAM_HEADER_H_
+#define JXL_CODESTREAM_HEADER_H_
+
+#include <jxl/types.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+/** Image orientation metadata.
+ * Values 1..8 match the EXIF definitions.
+ * The name indicates the operation to perform to transform from the encoded
+ * image to the display image.
+ */
+typedef enum {
+  JXL_ORIENT_IDENTITY = 1,
+  JXL_ORIENT_FLIP_HORIZONTAL = 2,
+  JXL_ORIENT_ROTATE_180 = 3,
+  JXL_ORIENT_FLIP_VERTICAL = 4,
+  JXL_ORIENT_TRANSPOSE = 5,
+  JXL_ORIENT_ROTATE_90_CW = 6,
+  JXL_ORIENT_ANTI_TRANSPOSE = 7,
+  JXL_ORIENT_ROTATE_90_CCW = 8,
+} JxlOrientation;
+
+/** Given type of an extra channel.
+ */
+typedef enum {
+  JXL_CHANNEL_ALPHA,
+  JXL_CHANNEL_DEPTH,
+  JXL_CHANNEL_SPOT_COLOR,
+  JXL_CHANNEL_SELECTION_MASK,
+  JXL_CHANNEL_BLACK,
+  JXL_CHANNEL_CFA,
+  JXL_CHANNEL_THERMAL,
+  JXL_CHANNEL_RESERVED0,
+  JXL_CHANNEL_RESERVED1,
+  JXL_CHANNEL_RESERVED2,
+  JXL_CHANNEL_RESERVED3,
+  JXL_CHANNEL_RESERVED4,
+  JXL_CHANNEL_RESERVED5,
+  JXL_CHANNEL_RESERVED6,
+  JXL_CHANNEL_RESERVED7,
+  JXL_CHANNEL_UNKNOWN,
+  JXL_CHANNEL_OPTIONAL
+} JxlExtraChannelType;
+
+/** The codestream preview header */
+typedef struct {
+  /** Preview width in pixels */
+  uint32_t xsize;
+
+  /** Preview height in pixels */
+  uint32_t ysize;
+} JxlPreviewHeader;
+
+/** The codestream animation header, optionally present in the beginning of
+ * the codestream, and if it is it applies to all animation frames, unlike
+ * JxlFrameHeader which applies to an individual frame.
+ */
+typedef struct {
+  /** Numerator of ticks per second of a single animation frame time unit */
+  uint32_t tps_numerator;
+
+  /** Denominator of ticks per second of a single animation frame time unit */
+  uint32_t tps_denominator;
+
+  /** Amount of animation loops, or 0 to repeat infinitely */
+  uint32_t num_loops;
+
+  /** Whether animation time codes are present at animation frames in the
+   * codestream */
+  JXL_BOOL have_timecodes;
+} JxlAnimationHeader;
+
+/** Basic image information. This information is available from the file
+ * signature and first part of the codestream header.
+ */
+typedef struct {
+  /* TODO(lode): need additional fields for (transcoded) JPEG? For reusable
+   * fields orientation must be read from Exif APP1. For has_icc_profile: must
+   * look up where ICC profile is guaranteed to be in a JPEG file to be able to
+   * indicate this. */
+
+  /* TODO(lode): make struct packed, and/or make this opaque struct with getter
+   * functions (still separate struct from opaque decoder) */
+
+  /** Whether the codestream is embedded in the container format. If true,
+   * metadata information and extensions may be available in addition to the
+   * codestream.
+   */
+  JXL_BOOL have_container;
+
+  /** Width of the image in pixels, before applying orientation.
+   */
+  uint32_t xsize;
+
+  /** Height of the image in pixels, before applying orientation.
+   */
+  uint32_t ysize;
+
+  /** Original image color channel bit depth.
+   */
+  uint32_t bits_per_sample;
+
+  /** Original image color channel floating point exponent bits, or 0 if they
+   * are unsigned integer. For example, if the original data is half-precision
+   * (binary16) floating point, bits_per_sample is 16 and
+   * exponent_bits_per_sample is 5, and so on for other floating point
+   * precisions.
+   */
+  uint32_t exponent_bits_per_sample;
+
+  /** Upper bound on the intensity level present in the image in nits. For
+   * unsigned integer pixel encodings, this is the brightness of the largest
+   * representable value. The image does not necessarily contain a pixel
+   * actually this bright. An encoder is allowed to set 255 for SDR images
+   * without computing a histogram.
+   * Leaving this set to its default of 0 lets libjxl choose a sensible default
+   * value based on the color encoding.
+   */
+  float intensity_target;
+
+  /** Lower bound on the intensity level present in the image. This may be
+   * loose, i.e. lower than the actual darkest pixel. When tone mapping, a
+   * decoder will map [min_nits, intensity_target] to the display range.
+   */
+  float min_nits;
+
+  /** See the description of @see linear_below.
+   */
+  JXL_BOOL relative_to_max_display;
+
+  /** The tone mapping will leave unchanged (linear mapping) any pixels whose
+   * brightness is strictly below this. The interpretation depends on
+   * relative_to_max_display. If true, this is a ratio [0, 1] of the maximum
+   * display brightness [nits], otherwise an absolute brightness [nits].
+   */
+  float linear_below;
+
+  /** Whether the data in the codestream is encoded in the original color
+   * profile that is attached to the codestream metadata header, or is
+   * encoded in an internally supported absolute color space (which the decoder
+   * can always convert to linear or non-linear sRGB or to XYB). If the original
+   * profile is used, the decoder outputs pixel data in the color space matching
+   * that profile, but doesn't convert it to any other color space. If the
+   * original profile is not used, the decoder only outputs the data as sRGB
+   * (linear if outputting to floating point, nonlinear with standard sRGB
+   * transfer function if outputting to unsigned integers) but will not convert
+   * it to to the original color profile. The decoder also does not convert to
+   * the target display color profile. To convert the pixel data produced by
+   * the decoder to the original color profile, one of the JxlDecoderGetColor*
+   * functions needs to be called with @ref JXL_COLOR_PROFILE_TARGET_DATA to get
+   * the color profile of the decoder output, and then an external CMS can be
+   * used for conversion.
+   * Note that for lossy compression, this should be set to false for most use
+   * cases, and if needed, the image should be converted to the original color
+   * profile after decoding, as described above.
+   */
+  JXL_BOOL uses_original_profile;
+
+  /** Indicates a preview image exists near the beginning of the codestream.
+   * The preview itself or its dimensions are not included in the basic info.
+   */
+  JXL_BOOL have_preview;
+
+  /** Indicates animation frames exist in the codestream. The animation
+   * information is not included in the basic info.
+   */
+  JXL_BOOL have_animation;
+
+  /** Image orientation, value 1-8 matching the values used by JEITA CP-3451C
+   * (Exif version 2.3).
+   */
+  JxlOrientation orientation;
+
+  /** Number of color channels encoded in the image, this is either 1 for
+   * grayscale data, or 3 for colored data. This count does not include
+   * the alpha channel or other extra channels. To check presence of an alpha
+   * channel, such as in the case of RGBA color, check alpha_bits != 0.
+   * If and only if this is 1, the JxlColorSpace in the JxlColorEncoding is
+   * JXL_COLOR_SPACE_GRAY.
+   */
+  uint32_t num_color_channels;
+
+  /** Number of additional image channels. This includes the main alpha channel,
+   * but can also include additional channels such as depth, additional alpha
+   * channels, spot colors, and so on. Information about the extra channels
+   * can be queried with JxlDecoderGetExtraChannelInfo. The main alpha channel,
+   * if it exists, also has its information available in the alpha_bits,
+   * alpha_exponent_bits and alpha_premultiplied fields in this JxlBasicInfo.
+   */
+  uint32_t num_extra_channels;
+
+  /** Bit depth of the encoded alpha channel, or 0 if there is no alpha channel.
+   * If present, matches the alpha_bits value of the JxlExtraChannelInfo
+   * associated with this alpha channel.
+   */
+  uint32_t alpha_bits;
+
+  /** Alpha channel floating point exponent bits, or 0 if they are unsigned. If
+   * present, matches the alpha_bits value of the JxlExtraChannelInfo associated
+   * with this alpha channel. integer.
+   */
+  uint32_t alpha_exponent_bits;
+
+  /** Whether the alpha channel is premultiplied. Only used if there is a main
+   * alpha channel. Matches the alpha_premultiplied value of the
+   * JxlExtraChannelInfo associated with this alpha channel.
+   */
+  JXL_BOOL alpha_premultiplied;
+
+  /** Dimensions of encoded preview image, only used if have_preview is
+   * JXL_TRUE.
+   */
+  JxlPreviewHeader preview;
+
+  /** Animation header with global animation properties for all frames, only
+   * used if have_animation is JXL_TRUE.
+   */
+  JxlAnimationHeader animation;
+
+  /** Intrinsic width of the image.
+   * The intrinsic size can be different from the actual size in pixels
+   * (as given by xsize and ysize) and it denotes the recommended dimensions
+   * for displaying the image, i.e. applications are advised to resample the
+   * decoded image to the intrinsic dimensions.
+   */
+  uint32_t intrinsic_xsize;
+
+  /** Intrinsic height of the image.
+   * The intrinsic size can be different from the actual size in pixels
+   * (as given by xsize and ysize) and it denotes the recommended dimensions
+   * for displaying the image, i.e. applications are advised to resample the
+   * decoded image to the intrinsic dimensions.
+   */
+  uint32_t intrinsic_ysize;
+
+  /** Padding for forwards-compatibility, in case more fields are exposed
+   * in a future version of the library.
+   */
+  uint8_t padding[100];
+} JxlBasicInfo;
+
+/** Information for a single extra channel.
+ */
+typedef struct {
+  /** Given type of an extra channel.
+   */
+  JxlExtraChannelType type;
+
+  /** Total bits per sample for this channel.
+   */
+  uint32_t bits_per_sample;
+
+  /** Floating point exponent bits per channel, or 0 if they are unsigned
+   * integer.
+   */
+  uint32_t exponent_bits_per_sample;
+
+  /** The exponent the channel is downsampled by on each axis.
+   * TODO(lode): expand this comment to match the JPEG XL specification,
+   * specify how to upscale, how to round the size computation, and to which
+   * extra channels this field applies.
+   */
+  uint32_t dim_shift;
+
+  /** Length of the extra channel name in bytes, or 0 if no name.
+   * Excludes null termination character.
+   */
+  uint32_t name_length;
+
+  /** Whether alpha channel uses premultiplied alpha. Only applicable if
+   * type is JXL_CHANNEL_ALPHA.
+   */
+  JXL_BOOL alpha_premultiplied;
+
+  /** Spot color of the current spot channel in linear RGBA. Only applicable if
+   * type is JXL_CHANNEL_SPOT_COLOR.
+   */
+  float spot_color[4];
+
+  /** Only applicable if type is JXL_CHANNEL_CFA.
+   * TODO(lode): add comment about the meaning of this field.
+   */
+  uint32_t cfa_channel;
+} JxlExtraChannelInfo;
+
+/* TODO(lode): add API to get the codestream header extensions. */
+/** Extensions in the codestream header. */
+typedef struct {
+  /** Extension bits. */
+  uint64_t extensions;
+} JxlHeaderExtensions;
+
+/** Frame blend modes.
+ * When decoding, if coalescing is enabled (default), this can be ignored.
+ */
+typedef enum {
+  JXL_BLEND_REPLACE = 0,
+  JXL_BLEND_ADD = 1,
+  JXL_BLEND_BLEND = 2,
+  JXL_BLEND_MULADD = 3,
+  JXL_BLEND_MUL = 4,
+} JxlBlendMode;
+
+/** The information about blending the color channels or a single extra channel.
+ * When decoding, if coalescing is enabled (default), this can be ignored and
+ * the blend mode is considered to be JXL_BLEND_REPLACE.
+ * When encoding, these settings apply to the pixel data given to the encoder.
+ */
+typedef struct {
+  /** Blend mode.
+   */
+  JxlBlendMode blendmode;
+  /** Reference frame ID to use as the 'bottom' layer (0-3).
+   */
+  uint32_t source;
+  /** Which extra channel to use as the 'alpha' channel for blend modes
+   * JXL_BLEND_BLEND and JXL_BLEND_MULADD.
+   */
+  uint32_t alpha;
+  /** Clamp values to [0,1] for the purpose of blending.
+   */
+  JXL_BOOL clamp;
+} JxlBlendInfo;
+
+/** The information about layers.
+ * When decoding, if coalescing is enabled (default), this can be ignored.
+ * When encoding, these settings apply to the pixel data given to the encoder,
+ * the encoder could choose an internal representation that differs.
+ */
+typedef struct {
+  /** Whether cropping is applied for this frame. When decoding, if false,
+   * crop_x0 and crop_y0 are set to zero, and xsize and ysize to the main
+   * image dimensions. When encoding and this is false, those fields are
+   * ignored. When decoding, if coalescing is enabled (default), this is always
+   * false, regardless of the internal encoding in the JPEG XL codestream.
+   */
+  JXL_BOOL have_crop;
+
+  /** Horizontal offset of the frame (can be negative).
+   */
+  int32_t crop_x0;
+
+  /** Vertical offset of the frame (can be negative).
+   */
+  int32_t crop_y0;
+
+  /** Width of the frame (number of columns).
+   */
+  uint32_t xsize;
+
+  /** Height of the frame (number of rows).
+   */
+  uint32_t ysize;
+
+  /** The blending info for the color channels. Blending info for extra channels
+   * has to be retrieved separately using JxlDecoderGetExtraChannelBlendInfo.
+   */
+  JxlBlendInfo blend_info;
+
+  /** After blending, save the frame as reference frame with this ID (0-3).
+   * Special case: if the frame duration is nonzero, ID 0 means "will not be
+   * referenced in the future". This value is not used for the last frame.
+   * When encoding, ID 3 is reserved to frames that are generated internally by
+   * the encoder, and should not be used by applications.
+   */
+  uint32_t save_as_reference;
+} JxlLayerInfo;
+
+/** The header of one displayed frame or non-coalesced layer. */
+typedef struct {
+  /** How long to wait after rendering in ticks. The duration in seconds of a
+   * tick is given by tps_numerator and tps_denominator in JxlAnimationHeader.
+   */
+  uint32_t duration;
+
+  /** SMPTE timecode of the current frame in form 0xHHMMSSFF, or 0. The bits are
+   * interpreted from most-significant to least-significant as hour, minute,
+   * second, and frame. If timecode is nonzero, it is strictly larger than that
+   * of a previous frame with nonzero duration. These values are only available
+   * if have_timecodes in JxlAnimationHeader is JXL_TRUE.
+   * This value is only used if have_timecodes in JxlAnimationHeader is
+   * JXL_TRUE.
+   */
+  uint32_t timecode;
+
+  /** Length of the frame name in bytes, or 0 if no name.
+   * Excludes null termination character. This value is set by the decoder.
+   * For the encoder, this value is ignored and @ref JxlEncoderSetFrameName is
+   * used instead to set the name and the length.
+   */
+  uint32_t name_length;
+
+  /** Indicates this is the last animation frame. This value is set by the
+   * decoder to indicate no further frames follow. For the encoder, it is not
+   * required to set this value and it is ignored, @ref JxlEncoderCloseFrames is
+   * used to indicate the last frame to the encoder instead.
+   */
+  JXL_BOOL is_last;
+
+  /** Information about the layer in case of no coalescing.
+   */
+  JxlLayerInfo layer_info;
+} JxlFrameHeader;
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}
+#endif
+
+#endif /* JXL_CODESTREAM_HEADER_H_ */
+
+/** @}*/
diff --git a/third_party/jpeg-xl/lib/include/jxl/color_encoding.h b/third_party/jpeg-xl/lib/include/jxl/color_encoding.h
new file mode 100644
index 0000000000..b16f6a01ee
--- /dev/null
+++ b/third_party/jpeg-xl/lib/include/jxl/color_encoding.h
@@ -0,0 +1,162 @@
+/* Copyright (c) the JPEG XL Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style
+ * license that can be found in the LICENSE file.
+ */
+
+/** @addtogroup libjxl_common
+ * @{
+ * @file color_encoding.h
+ * @brief Color Encoding definitions used by JPEG XL.
+ * All CIE units are for the standard 1931 2 degree observer.
+ */
+
+#ifndef JXL_COLOR_ENCODING_H_
+#define JXL_COLOR_ENCODING_H_
+
+#include <stdint.h>
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+/** Color space of the image data. */
+typedef enum {
+  /** Tristimulus RGB */
+  JXL_COLOR_SPACE_RGB,
+  /** Luminance based, the primaries in JxlColorEncoding must be ignored. This
+   * value implies that num_color_channels in JxlBasicInfo is 1, any other value
+   * implies num_color_channels is 3. */
+  JXL_COLOR_SPACE_GRAY,
+  /** XYB (opsin) color space */
+  JXL_COLOR_SPACE_XYB,
+  /** None of the other table entries describe the color space appropriately */
+  JXL_COLOR_SPACE_UNKNOWN,
+} JxlColorSpace;
+
+/** Built-in whitepoints for color encoding. When decoding, the numerical xy
+ * whitepoint value can be read from the JxlColorEncoding white_point field
+ * regardless of the enum value. When encoding, enum values except
+ * JXL_WHITE_POINT_CUSTOM override the numerical fields. Some enum values match
+ * a subset of CICP (Rec. ITU-T H.273 | ISO/IEC 23091-2:2019(E)), however the
+ * white point and RGB primaries are separate enums here.
+ */
+typedef enum {
+  /** CIE Standard Illuminant D65: 0.3127, 0.3290 */
+  JXL_WHITE_POINT_D65 = 1,
+  /** White point must be read from the JxlColorEncoding white_point field, or
+   * as ICC profile. This enum value is not an exact match of the corresponding
+   * CICP value. */
+  JXL_WHITE_POINT_CUSTOM = 2,
+  /** CIE Standard Illuminant E (equal-energy): 1/3, 1/3 */
+  JXL_WHITE_POINT_E = 10,
+  /** DCI-P3 from SMPTE RP 431-2: 0.314, 0.351 */
+  JXL_WHITE_POINT_DCI = 11,
+} JxlWhitePoint;
+
+/** Built-in primaries for color encoding. When decoding, the primaries can be
+ * read from the JxlColorEncoding primaries_red_xy, primaries_green_xy and
+ * primaries_blue_xy fields regardless of the enum value. When encoding, the
+ * enum values except JXL_PRIMARIES_CUSTOM override the numerical fields. Some
+ * enum values match a subset of CICP (Rec. ITU-T H.273 | ISO/IEC
+ * 23091-2:2019(E)), however the white point and RGB primaries are separate
+ * enums here.
+ */
+typedef enum {
+  /** The CIE xy values of the red, green and blue primaries are: 0.639998686,
+     0.330010138; 0.300003784, 0.600003357; 0.150002046, 0.059997204 */
+  JXL_PRIMARIES_SRGB = 1,
+  /** Primaries must be read from the JxlColorEncoding primaries_red_xy,
+   * primaries_green_xy and primaries_blue_xy fields, or as ICC profile. This
+   * enum value is not an exact match of the corresponding CICP value. */
+  JXL_PRIMARIES_CUSTOM = 2,
+  /** As specified in Rec. ITU-R BT.2100-1 */
+  JXL_PRIMARIES_2100 = 9,
+  /** As specified in SMPTE RP 431-2 */
+  JXL_PRIMARIES_P3 = 11,
+} JxlPrimaries;
+
+/** Built-in transfer functions for color encoding. Enum values match a subset
+ * of CICP (Rec. ITU-T H.273 | ISO/IEC 23091-2:2019(E)) unless specified
+ * otherwise. */
+typedef enum {
+  /** As specified in SMPTE RP 431-2 */
+  JXL_TRANSFER_FUNCTION_709 = 1,
+  /** None of the other table entries describe the transfer function. */
+  JXL_TRANSFER_FUNCTION_UNKNOWN = 2,
+  /** The gamma exponent is 1 */
+  JXL_TRANSFER_FUNCTION_LINEAR = 8,
+  /** As specified in IEC 61966-2-1 sRGB */
+  JXL_TRANSFER_FUNCTION_SRGB = 13,
+  /** As specified in SMPTE ST 2084 */
+  JXL_TRANSFER_FUNCTION_PQ = 16,
+  /** As specified in SMPTE ST 428-1 */
+  JXL_TRANSFER_FUNCTION_DCI = 17,
+  /** As specified in Rec. ITU-R BT.2100-1 (HLG) */
+  JXL_TRANSFER_FUNCTION_HLG = 18,
+  /** Transfer function follows power law given by the gamma value in
+     JxlColorEncoding. Not a CICP value. */
+  JXL_TRANSFER_FUNCTION_GAMMA = 65535,
+} JxlTransferFunction;
+
+/** Renderig intent for color encoding, as specified in ISO 15076-1:2010 */
+typedef enum {
+  /** vendor-specific */
+  JXL_RENDERING_INTENT_PERCEPTUAL = 0,
+  /** media-relative */
+  JXL_RENDERING_INTENT_RELATIVE,
+  /** vendor-specific */
+  JXL_RENDERING_INTENT_SATURATION,
+  /** ICC-absolute */
+  JXL_RENDERING_INTENT_ABSOLUTE,
+} JxlRenderingIntent;
+
+/** Color encoding of the image as structured information.
+ */
+typedef struct {
+  /** Color space of the image data.
+   */
+  JxlColorSpace color_space;
+
+  /** Built-in white point. If this value is JXL_WHITE_POINT_CUSTOM, must
+   * use the numerical whitepoint values from white_point_xy.
+   */
+  JxlWhitePoint white_point;
+
+  /** Numerical whitepoint values in CIE xy space. */
+  double white_point_xy[2];
+
+  /** Built-in RGB primaries. If this value is JXL_PRIMARIES_CUSTOM, must
+   * use the numerical primaries values below. This field and the custom values
+   * below are unused and must be ignored if the color space is
+   * JXL_COLOR_SPACE_GRAY or JXL_COLOR_SPACE_XYB.
+   */
+  JxlPrimaries primaries;
+
+  /** Numerical red primary values in CIE xy space. */
+  double primaries_red_xy[2];
+
+  /** Numerical green primary values in CIE xy space. */
+  double primaries_green_xy[2];
+
+  /** Numerical blue primary values in CIE xy space. */
+  double primaries_blue_xy[2];
+
+  /** Transfer function if have_gamma is 0 */
+  JxlTransferFunction transfer_function;
+
+  /** Gamma value used when transfer_function is JXL_TRANSFER_FUNCTION_GAMMA
+   */
+  double gamma;
+
+  /** Rendering intent defined for the color profile. */
+  JxlRenderingIntent rendering_intent;
+} JxlColorEncoding;
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}
+#endif
+
+#endif /* JXL_COLOR_ENCODING_H_ */
+
+/** @}*/
diff --git a/third_party/jpeg-xl/lib/include/jxl/decode.h b/third_party/jpeg-xl/lib/include/jxl/decode.h
new file mode 100644
index 0000000000..156499ce48
--- /dev/null
+++ b/third_party/jpeg-xl/lib/include/jxl/decode.h
@@ -0,0 +1,1446 @@
+/* Copyright (c) the JPEG XL Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style
+ * license that can be found in the LICENSE file.
+ */
+
+/** @addtogroup libjxl_decoder
+ * @{
+ * @file decode.h
+ * @brief Decoding API for JPEG XL.
+ */
+
+#ifndef JXL_DECODE_H_
+#define JXL_DECODE_H_
+
+#include <jxl/cms_interface.h>
+#include <jxl/codestream_header.h>
+#include <jxl/color_encoding.h>
+#include <jxl/jxl_export.h>
+#include <jxl/memory_manager.h>
+#include <jxl/parallel_runner.h>
+#include <jxl/types.h>
+#include <jxl/version.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+/**
+ * Decoder library version.
+ *
+ * @return the decoder library version as an integer:
+ * MAJOR_VERSION * 1000000 + MINOR_VERSION * 1000 + PATCH_VERSION. For example,
+ * version 1.2.3 would return 1002003.
+ */
+JXL_EXPORT uint32_t JxlDecoderVersion(void);
+
+/** The result of @ref JxlSignatureCheck.
+ */
+typedef enum {
+  /** Not enough bytes were passed to determine if a valid signature was found.
+   */
+  JXL_SIG_NOT_ENOUGH_BYTES = 0,
+
+  /** No valid JPEG XL header was found. */
+  JXL_SIG_INVALID = 1,
+
+  /** A valid JPEG XL codestream signature was found, that is a JPEG XL image
+   * without container.
+   */
+  JXL_SIG_CODESTREAM = 2,
+
+  /** A valid container signature was found, that is a JPEG XL image embedded
+   * in a box format container.
+   */
+  JXL_SIG_CONTAINER = 3,
+} JxlSignature;
+
+/**
+ * JPEG XL signature identification.
+ *
+ * Checks if the passed buffer contains a valid JPEG XL signature. The passed @p
+ * buf of size
+ * @p size doesn't need to be a full image, only the beginning of the file.
+ *
+ * @return a flag indicating if a JPEG XL signature was found and what type.
+ *  - @ref JXL_SIG_NOT_ENOUGH_BYTES if not enough bytes were passed to
+ *    determine if a valid signature is there.
+ *  - @ref JXL_SIG_INVALID if no valid signature found for JPEG XL decoding.
+ *  - @ref JXL_SIG_CODESTREAM if a valid JPEG XL codestream signature was
+ *    found.
+ *  - @ref JXL_SIG_CONTAINER if a valid JPEG XL container signature was found.
+ */
+JXL_EXPORT JxlSignature JxlSignatureCheck(const uint8_t* buf, size_t len);
+
+/**
+ * Opaque structure that holds the JPEG XL decoder.
+ *
+ * Allocated and initialized with @ref JxlDecoderCreate().
+ * Cleaned up and deallocated with @ref JxlDecoderDestroy().
+ */
+typedef struct JxlDecoderStruct JxlDecoder;
+
+/**
+ * Creates an instance of @ref JxlDecoder and initializes it.
+ *
+ * @p memory_manager will be used for all the library dynamic allocations made
+ * from this instance. The parameter may be NULL, in which case the default
+ * allocator will be used. See jxl/memory_manager.h for details.
+ *
+ * @param memory_manager custom allocator function. It may be NULL. The memory
+ *        manager will be copied internally.
+ * @return @c NULL if the instance can not be allocated or initialized
+ * @return pointer to initialized @ref JxlDecoder otherwise
+ */
+JXL_EXPORT JxlDecoder* JxlDecoderCreate(const JxlMemoryManager* memory_manager);
+
+/**
+ * Re-initializes a @ref JxlDecoder instance, so it can be re-used for decoding
+ * another image. All state and settings are reset as if the object was
+ * newly created with @ref JxlDecoderCreate, but the memory manager is kept.
+ *
+ * @param dec instance to be re-initialized.
+ */
+JXL_EXPORT void JxlDecoderReset(JxlDecoder* dec);
+
+/**
+ * Deinitializes and frees @ref JxlDecoder instance.
+ *
+ * @param dec instance to be cleaned up and deallocated.
+ */
+JXL_EXPORT void JxlDecoderDestroy(JxlDecoder* dec);
+
+/**
+ * Return value for @ref JxlDecoderProcessInput.
+ * The values from @ref JXL_DEC_BASIC_INFO onwards are optional informative
+ * events that can be subscribed to, they are never returned if they
+ * have not been registered with @ref JxlDecoderSubscribeEvents.
+ */
+typedef enum {
+  /** Function call finished successfully, or decoding is finished and there is
+   * nothing more to be done.
+   *
+   * Note that @ref JxlDecoderProcessInput will return JXL_DEC_SUCCESS if all
+   * events that were registered with @ref JxlDecoderSubscribeEvents were
+   * processed, even before the end of the JPEG XL codestream.
+   *
+   * In this case, the return value @ref JxlDecoderReleaseInput will be the same
+   * as it was at the last signaled event. E.g. if JXL_DEC_FULL_IMAGE was
+   * subscribed to, then all bytes from the end of the JPEG XL codestream
+   * (including possible boxes needed for jpeg reconstruction) will be returned
+   * as unprocessed.
+   */
+  JXL_DEC_SUCCESS = 0,
+
+  /** An error occurred, for example invalid input file or out of memory.
+   * TODO(lode): add function to get error information from decoder.
+   */
+  JXL_DEC_ERROR = 1,
+
+  /** The decoder needs more input bytes to continue. Before the next @ref
+   * JxlDecoderProcessInput call, more input data must be set, by calling @ref
+   * JxlDecoderReleaseInput (if input was set previously) and then calling @ref
+   * JxlDecoderSetInput. @ref JxlDecoderReleaseInput returns how many bytes
+   * are not yet processed, before a next call to @ref JxlDecoderProcessInput
+   * all unprocessed bytes must be provided again (the address need not match,
+   * but the contents must), and more bytes must be concatenated after the
+   * unprocessed bytes.
+   * In most cases, @ref JxlDecoderReleaseInput will return no unprocessed bytes
+   * at this event, the only exceptions are if the previously set input ended
+   * within (a) the raw codestream signature, (b) the signature box, (c) a box
+   * header, or (d) the first 4 bytes of a brob, ftyp, or jxlp box. In any of
+   * these cases the number of unprocessed bytes is less than 20.
+   */
+  JXL_DEC_NEED_MORE_INPUT = 2,
+
+  /** The decoder is able to decode a preview image and requests setting a
+   * preview output buffer using @ref JxlDecoderSetPreviewOutBuffer. This occurs
+   * if @ref JXL_DEC_PREVIEW_IMAGE is requested and it is possible to decode a
+   * preview image from the codestream and the preview out buffer was not yet
+   * set. There is maximum one preview image in a codestream.
+   * In this case, @ref JxlDecoderReleaseInput will return all bytes from the
+   * end of the frame header (including ToC) of the preview frame as
+   * unprocessed.
+   */
+  JXL_DEC_NEED_PREVIEW_OUT_BUFFER = 3,
+
+  /** The decoder requests an output buffer to store the full resolution image,
+   * which can be set with @ref JxlDecoderSetImageOutBuffer or with @ref
+   * JxlDecoderSetImageOutCallback. This event re-occurs for new frames if
+   * there are multiple animation frames and requires setting an output again.
+   * In this case, @ref JxlDecoderReleaseInput will return all bytes from the
+   * end of the frame header (including ToC) as unprocessed.
+   */
+  JXL_DEC_NEED_IMAGE_OUT_BUFFER = 5,
+
+  /** The JPEG reconstruction buffer is too small for reconstructed JPEG
+   * codestream to fit. @ref JxlDecoderSetJPEGBuffer must be called again to
+   * make room for remaining bytes. This event may occur multiple times
+   * after @ref JXL_DEC_JPEG_RECONSTRUCTION.
+   */
+  JXL_DEC_JPEG_NEED_MORE_OUTPUT = 6,
+
+  /** The box contents output buffer is too small. @ref JxlDecoderSetBoxBuffer
+   * must be called again to make room for remaining bytes. This event may occur
+   * multiple times after @ref JXL_DEC_BOX.
+   */
+  JXL_DEC_BOX_NEED_MORE_OUTPUT = 7,
+
+  /** Informative event by @ref JxlDecoderProcessInput
+   * "JxlDecoderProcessInput": Basic information such as image dimensions and
+   * extra channels. This event occurs max once per image.
+   * In this case, @ref JxlDecoderReleaseInput will return all bytes from the
+   * end of the basic info as unprocessed (including the last byte of basic info
+   * if it did not end on a byte boundary).
+   */
+  JXL_DEC_BASIC_INFO = 0x40,
+
+  /** Informative event by @ref JxlDecoderProcessInput
+   * "JxlDecoderProcessInput": User extensions of the codestream header. This
+   * event occurs max once per image and always later than @ref
+   * JXL_DEC_BASIC_INFO and earlier than any pixel data.
+   *
+   * @deprecated The decoder no longer returns this, the header extensions,
+   * if any, are available at the JXL_DEC_BASIC_INFO event.
+   */
+  JXL_DEC_EXTENSIONS = 0x80,
+
+  /** Informative event by @ref JxlDecoderProcessInput
+   * "JxlDecoderProcessInput": Color encoding or ICC profile from the
+   * codestream header. This event occurs max once per image and always later
+   * than @ref JXL_DEC_BASIC_INFO and earlier than any pixel data.
+   * In this case, @ref JxlDecoderReleaseInput will return all bytes from the
+   * end of the image header (which is the start of the first frame) as
+   * unprocessed.
+   */
+  JXL_DEC_COLOR_ENCODING = 0x100,
+
+  /** Informative event by @ref JxlDecoderProcessInput
+   * "JxlDecoderProcessInput": Preview image, a small frame, decoded. This
+   * event can only happen if the image has a preview frame encoded. This event
+   * occurs max once for the codestream and always later than @ref
+   * JXL_DEC_COLOR_ENCODING and before @ref JXL_DEC_FRAME.
+   * In this case, @ref JxlDecoderReleaseInput will return all bytes from the
+   * end of the preview frame as unprocessed.
+   */
+  JXL_DEC_PREVIEW_IMAGE = 0x200,
+
+  /** Informative event by @ref JxlDecoderProcessInput
+   * "JxlDecoderProcessInput": Beginning of a frame. @ref
+   * JxlDecoderGetFrameHeader can be used at this point. A note on frames:
+   * a JPEG XL image can have internal frames that are not intended to be
+   * displayed (e.g. used for compositing a final frame), but this only returns
+   * displayed frames, unless @ref JxlDecoderSetCoalescing was set to JXL_FALSE:
+   * in that case, the individual layers are returned, without blending. Note
+   * that even when coalescing is disabled, only frames of type kRegularFrame
+   * are returned; frames of type kReferenceOnly and kLfFrame are always for
+   * internal purposes only and cannot be accessed. A displayed frame either has
+   * an animation duration or is the only or last frame in the image. This event
+   * occurs max once per displayed frame, always later than @ref
+   * JXL_DEC_COLOR_ENCODING, and always earlier than any pixel data. While
+   * JPEG XL supports encoding a single frame as the composition of multiple
+   * internal sub-frames also called frames, this event is not indicated for the
+   * internal frames.
+   * In this case, @ref JxlDecoderReleaseInput will return all bytes from the
+   * end of the frame header (including ToC) as unprocessed.
+   */
+  JXL_DEC_FRAME = 0x400,
+
+  /** Informative event by @ref JxlDecoderProcessInput
+   * "JxlDecoderProcessInput": full frame (or layer, in case coalescing is
+   * disabled) is decoded. @ref JxlDecoderSetImageOutBuffer must be used after
+   * getting the basic image information to be able to get the image pixels, if
+   * not this return status only indicates we're past this point in the
+   * codestream. This event occurs max once per frame.
+   * In this case, @ref JxlDecoderReleaseInput will return all bytes from the
+   * end of the frame (or if @ref JXL_DEC_JPEG_RECONSTRUCTION is subscribed to,
+   * from the end of the last box that is needed for jpeg reconstruction) as
+   * unprocessed.
+   */
+  JXL_DEC_FULL_IMAGE = 0x1000,
+
+  /** Informative event by @ref JxlDecoderProcessInput
+   * "JxlDecoderProcessInput": JPEG reconstruction data decoded. @ref
+   * JxlDecoderSetJPEGBuffer may be used to set a JPEG reconstruction buffer
+   * after getting the JPEG reconstruction data. If a JPEG reconstruction buffer
+   * is set a byte stream identical to the JPEG codestream used to encode the
+   * image will be written to the JPEG reconstruction buffer instead of pixels
+   * to the image out buffer. This event occurs max once per image and always
+   * before @ref JXL_DEC_FULL_IMAGE.
+   * In this case, @ref JxlDecoderReleaseInput will return all bytes from the
+   * end of the 'jbrd' box as unprocessed.
+   */
+  JXL_DEC_JPEG_RECONSTRUCTION = 0x2000,
+
+  /** Informative event by @ref JxlDecoderProcessInput
+   * "JxlDecoderProcessInput": The header of a box of the container format
+   * (BMFF) is decoded. The following API functions related to boxes can be used
+   * after this event:
+   *  - @ref JxlDecoderSetBoxBuffer and @ref JxlDecoderReleaseBoxBuffer
+   *    "JxlDecoderReleaseBoxBuffer": set and release a buffer to get the box
+   *    data.
+   *  - @ref JxlDecoderGetBoxType get the 4-character box typename.
+   *  - @ref JxlDecoderGetBoxSizeRaw get the size of the box as it appears in
+   *    the container file, not decompressed.
+   *  - @ref JxlDecoderSetDecompressBoxes to configure whether to get the box
+   *    data decompressed, or possibly compressed.
+   *
+   * Boxes can be compressed. This is so when their box type is
+   * "brob". In that case, they have an underlying decompressed box
+   * type and decompressed data. @ref JxlDecoderSetDecompressBoxes allows
+   * configuring which data to get. Decompressing requires
+   * Brotli. @ref JxlDecoderGetBoxType has a flag to get the compressed box
+   * type, which can be "brob", or the decompressed box type. If a box
+   * is not compressed (its compressed type is not "brob"), then
+   * the output decompressed box type and data is independent of what
+   * setting is configured.
+   *
+   * The buffer set with @ref JxlDecoderSetBoxBuffer must be set again for each
+   * next box to be obtained, or can be left unset to skip outputting this box.
+   * The output buffer contains the full box data when the next @ref JXL_DEC_BOX
+   * event or @ref JXL_DEC_SUCCESS occurs. @ref JXL_DEC_BOX occurs for all
+   * boxes, including non-metadata boxes such as the signature box or codestream
+   * boxes. To check whether the box is a metadata type for respectively EXIF,
+   * XMP or JUMBF, use @ref JxlDecoderGetBoxType and check for types "Exif",
+   * "xml " and "jumb" respectively.
+   *
+   * In this case, @ref JxlDecoderReleaseInput will return all bytes from the
+   * start of the box header as unprocessed.
+   */
+  JXL_DEC_BOX = 0x4000,
+
+  /** Informative event by @ref JxlDecoderProcessInput
+   * "JxlDecoderProcessInput": a progressive step in decoding the frame is
+   * reached. When calling @ref JxlDecoderFlushImage at this point, the flushed
+   * image will correspond exactly to this point in decoding, and not yet
+   * contain partial results (such as partially more fine detail) of a next
+   * step. By default, this event will trigger maximum once per frame, when a
+   * 8x8th resolution (DC) image is ready (the image data is still returned at
+   * full resolution, giving upscaled DC). Use @ref
+   * JxlDecoderSetProgressiveDetail to configure more fine-grainedness. The
+   * event is not guaranteed to trigger, not all images have progressive steps
+   * or DC encoded.
+   * In this case, @ref JxlDecoderReleaseInput will return all bytes from the
+   * end of the section that was needed to produce this progressive event as
+   * unprocessed.
+   */
+  JXL_DEC_FRAME_PROGRESSION = 0x8000,
+} JxlDecoderStatus;
+
+/** Rewinds decoder to the beginning. The same input must be given again from
+ * the beginning of the file and the decoder will emit events from the beginning
+ * again. When rewinding (as opposed to @ref JxlDecoderReset), the decoder can
+ * keep state about the image, which it can use to skip to a requested frame
+ * more efficiently with @ref JxlDecoderSkipFrames. Settings such as parallel
+ * runner or subscribed events are kept. After rewind, @ref
+ * JxlDecoderSubscribeEvents can be used again, and it is feasible to leave out
+ * events that were already handled before, such as @ref JXL_DEC_BASIC_INFO
+ * and @ref JXL_DEC_COLOR_ENCODING, since they will provide the same information
+ * as before.
+ * The difference to @ref JxlDecoderReset is that some state is kept, namely
+ * settings set by a call to
+ *  - @ref JxlDecoderSetCoalescing,
+ *  - @ref JxlDecoderSetDesiredIntensityTarget,
+ *  - @ref JxlDecoderSetDecompressBoxes,
+ *  - @ref JxlDecoderSetKeepOrientation,
+ *  - @ref JxlDecoderSetUnpremultiplyAlpha,
+ *  - @ref JxlDecoderSetParallelRunner,
+ *  - @ref JxlDecoderSetRenderSpotcolors, and
+ *  - @ref JxlDecoderSubscribeEvents.
+ *
+ * @param dec decoder object
+ */
+JXL_EXPORT void JxlDecoderRewind(JxlDecoder* dec);
+
+/** Makes the decoder skip the next `amount` frames. It still needs to process
+ * the input, but will not output the frame events. It can be more efficient
+ * when skipping frames, and even more so when using this after @ref
+ * JxlDecoderRewind. If the decoder is already processing a frame (could
+ * have emitted @ref JXL_DEC_FRAME but not yet @ref JXL_DEC_FULL_IMAGE), it
+ * starts skipping from the next frame. If the amount is larger than the amount
+ * of frames remaining in the image, all remaining frames are skipped. Calling
+ * this function multiple times adds the amount to skip to the already existing
+ * amount.
+ *
+ * A frame here is defined as a frame that without skipping emits events such
+ * as @ref JXL_DEC_FRAME and @ref JXL_DEC_FULL_IMAGE, frames that are internal
+ * to the file format but are not rendered as part of an animation, or are not
+ * the final still frame of a still image, are not counted.
+ *
+ * @param dec decoder object
+ * @param amount the amount of frames to skip
+ */
+JXL_EXPORT void JxlDecoderSkipFrames(JxlDecoder* dec, size_t amount);
+
+/**
+ * Skips processing the current frame. Can be called after frame processing
+ * already started, signaled by a @ref JXL_DEC_NEED_IMAGE_OUT_BUFFER event,
+ * but before the corresponding @ref JXL_DEC_FULL_IMAGE event. The next signaled
+ * event will be another @ref JXL_DEC_FRAME, or @ref JXL_DEC_SUCCESS if there
+ * are no more frames. If pixel data is required from the already processed part
+ * of the frame, @ref JxlDecoderFlushImage must be called before this.
+ *
+ * @param dec decoder object
+ * @return @ref JXL_DEC_SUCCESS if there is a frame to skip, and @ref
+ *     JXL_DEC_ERROR if the function was not called during frame processing.
+ */
+JXL_EXPORT JxlDecoderStatus JxlDecoderSkipCurrentFrame(JxlDecoder* dec);
+
+/**
+ * Get the default pixel format for this decoder.
+ *
+ * Requires that the decoder can produce JxlBasicInfo.
+ *
+ * @param dec @ref JxlDecoder to query when creating the recommended pixel
+ *     format.
+ * @param format JxlPixelFormat to populate with the recommended settings for
+ *     the data loaded into this decoder.
+ * @return @ref JXL_DEC_SUCCESS if no error, @ref JXL_DEC_NEED_MORE_INPUT if the
+ *     basic info isn't yet available, and @ref JXL_DEC_ERROR otherwise.
+ *
+ * DEPRECATED: this function will be removed in the future.
+ */
+JXL_DEPRECATED JXL_EXPORT JxlDecoderStatus
+JxlDecoderDefaultPixelFormat(const JxlDecoder* dec, JxlPixelFormat* format);
+
+/**
+ * Set the parallel runner for multithreading. May only be set before starting
+ * decoding.
+ *
+ * @param dec decoder object
+ * @param parallel_runner function pointer to runner for multithreading. It may
+ *     be NULL to use the default, single-threaded, runner. A multithreaded
+ *     runner should be set to reach fast performance.
+ * @param parallel_runner_opaque opaque pointer for parallel_runner.
+ * @return @ref JXL_DEC_SUCCESS if the runner was set, @ref JXL_DEC_ERROR
+ *     otherwise (the previous runner remains set).
+ */
+JXL_EXPORT JxlDecoderStatus
+JxlDecoderSetParallelRunner(JxlDecoder* dec, JxlParallelRunner parallel_runner,
+                            void* parallel_runner_opaque);
+
+/**
+ * Returns a hint indicating how many more bytes the decoder is expected to
+ * need to make @ref JxlDecoderGetBasicInfo available after the next @ref
+ * JxlDecoderProcessInput call. This is a suggested large enough value for
+ * the amount of bytes to provide in the next @ref JxlDecoderSetInput call, but
+ * it is not guaranteed to be an upper bound nor a lower bound. This number does
+ * not include bytes that have already been released from the input. Can be used
+ * before the first @ref JxlDecoderProcessInput call, and is correct the first
+ * time in most cases. If not, @ref JxlDecoderSizeHintBasicInfo can be called
+ * again to get an updated hint.
+ *
+ * @param dec decoder object
+ * @return the size hint in bytes if the basic info is not yet fully decoded.
+ * @return 0 when the basic info is already available.
+ */
+JXL_EXPORT size_t JxlDecoderSizeHintBasicInfo(const JxlDecoder* dec);
+
+/** Select for which informative events, i.e. @ref JXL_DEC_BASIC_INFO, etc., the
+ * decoder should return with a status. It is not required to subscribe to any
+ * events, data can still be requested from the decoder as soon as it available.
+ * By default, the decoder is subscribed to no events (events_wanted == 0), and
+ * the decoder will then only return when it cannot continue because it needs
+ * more input data or more output buffer. This function may only be be called
+ * before using @ref JxlDecoderProcessInput.
+ *
+ * @param dec decoder object
+ * @param events_wanted bitfield of desired events.
+ * @return @ref JXL_DEC_SUCCESS if no error, @ref JXL_DEC_ERROR otherwise.
+ */
+JXL_EXPORT JxlDecoderStatus JxlDecoderSubscribeEvents(JxlDecoder* dec,
+                                                      int events_wanted);
+
+/** Enables or disables preserving of as-in-bitstream pixeldata
+ * orientation. Some images are encoded with an Orientation tag
+ * indicating that the decoder must perform a rotation and/or
+ * mirroring to the encoded image data.
+ *
+ *  - If skip_reorientation is JXL_FALSE (the default): the decoder
+ *    will apply the transformation from the orientation setting, hence
+ *    rendering the image according to its specified intent. When
+ *    producing a JxlBasicInfo, the decoder will always set the
+ *    orientation field to JXL_ORIENT_IDENTITY (matching the returned
+ *    pixel data) and also align xsize and ysize so that they correspond
+ *    to the width and the height of the returned pixel data.
+ *  - If skip_reorientation is JXL_TRUE: the decoder will skip
+ *    applying the transformation from the orientation setting, returning
+ *    the image in the as-in-bitstream pixeldata orientation.
+ *    This may be faster to decode since the decoder doesn't have to apply the
+ *    transformation, but can cause wrong display of the image if the
+ *    orientation tag is not correctly taken into account by the user.
+ *
+ * By default, this option is disabled, and the returned pixel data is
+ * re-oriented according to the image's Orientation setting.
+ *
+ * This function must be called at the beginning, before decoding is performed.
+ *
+ * @see JxlBasicInfo for the orientation field, and @ref JxlOrientation for the
+ * possible values.
+ *
+ * @param dec decoder object
+ * @param skip_reorientation JXL_TRUE to enable, JXL_FALSE to disable.
+ * @return @ref JXL_DEC_SUCCESS if no error, @ref JXL_DEC_ERROR otherwise.
+ */
+JXL_EXPORT JxlDecoderStatus
+JxlDecoderSetKeepOrientation(JxlDecoder* dec, JXL_BOOL skip_reorientation);
+
+/**
+ * Enables or disables preserving of associated alpha channels. If
+ * unpremul_alpha is set to JXL_FALSE then for associated alpha channel, the
+ * pixel data is returned with premultiplied colors. If it is set to JXL_TRUE,
+ * The colors will be unpremultiplied based on the alpha channel. This function
+ * has no effect if the image does not have an associated alpha channel.
+ *
+ * By default, this option is disabled, and the returned pixel data "as is".
+ *
+ * This function must be called at the beginning, before decoding is performed.
+ *
+ * @param dec decoder object
+ * @param unpremul_alpha JXL_TRUE to enable, JXL_FALSE to disable.
+ * @return @ref JXL_DEC_SUCCESS if no error, @ref JXL_DEC_ERROR otherwise.
+ */
+JXL_EXPORT JxlDecoderStatus
+JxlDecoderSetUnpremultiplyAlpha(JxlDecoder* dec, JXL_BOOL unpremul_alpha);
+
+/** Enables or disables rendering spot colors. By default, spot colors
+ * are rendered, which is OK for viewing the decoded image. If render_spotcolors
+ * is JXL_FALSE, then spot colors are not rendered, and have to be retrieved
+ * separately using @ref JxlDecoderSetExtraChannelBuffer. This is useful for
+ * e.g. printing applications.
+ *
+ * @param dec decoder object
+ * @param render_spotcolors JXL_TRUE to enable (default), JXL_FALSE to disable.
+ * @return @ref JXL_DEC_SUCCESS if no error, @ref JXL_DEC_ERROR otherwise.
+ */
+JXL_EXPORT JxlDecoderStatus
+JxlDecoderSetRenderSpotcolors(JxlDecoder* dec, JXL_BOOL render_spotcolors);
+
+/** Enables or disables coalescing of zero-duration frames. By default, frames
+ * are returned with coalescing enabled, i.e. all frames have the image
+ * dimensions, and are blended if needed. When coalescing is disabled, frames
+ * can have arbitrary dimensions, a non-zero crop offset, and blending is not
+ * performed. For display, coalescing is recommended. For loading a multi-layer
+ * still image as separate layers (as opposed to the merged image), coalescing
+ * has to be disabled.
+ *
+ * @param dec decoder object
+ * @param coalescing JXL_TRUE to enable coalescing (default), JXL_FALSE to
+ *     disable it.
+ * @return @ref JXL_DEC_SUCCESS if no error, @ref JXL_DEC_ERROR otherwise.
+ */
+JXL_EXPORT JxlDecoderStatus JxlDecoderSetCoalescing(JxlDecoder* dec,
+                                                    JXL_BOOL coalescing);
+
+/**
+ * Decodes JPEG XL file using the available bytes. Requires input has been
+ * set with @ref JxlDecoderSetInput. After @ref JxlDecoderProcessInput, input
+ * can optionally be released with @ref JxlDecoderReleaseInput and then set
+ * again to next bytes in the stream. @ref JxlDecoderReleaseInput returns how
+ * many bytes are not yet processed, before a next call to @ref
+ * JxlDecoderProcessInput all unprocessed bytes must be provided again (the
+ * address need not match, but the contents must), and more bytes may be
+ * concatenated after the unprocessed bytes.
+ *
+ * The returned status indicates whether the decoder needs more input bytes, or
+ * more output buffer for a certain type of output data. No matter what the
+ * returned status is (other than @ref JXL_DEC_ERROR), new information, such
+ * as @ref JxlDecoderGetBasicInfo, may have become available after this call.
+ * When the return value is not @ref JXL_DEC_ERROR or @ref JXL_DEC_SUCCESS, the
+ * decoding requires more @ref JxlDecoderProcessInput calls to continue.
+ *
+ * @param dec decoder object
+ * @return @ref JXL_DEC_SUCCESS when decoding finished and all events handled.
+ *     If you still have more unprocessed input data anyway, then you can still
+ *     continue by using @ref JxlDecoderSetInput and calling @ref
+ *     JxlDecoderProcessInput again, similar to handling @ref
+ *     JXL_DEC_NEED_MORE_INPUT. @ref JXL_DEC_SUCCESS can occur instead of @ref
+ *     JXL_DEC_NEED_MORE_INPUT when, for example, the input data ended right at
+ *     the boundary of a box of the container format, all essential codestream
+ *     boxes were already decoded, but extra metadata boxes are still present in
+ *     the next data. @ref JxlDecoderProcessInput cannot return success if all
+ *     codestream boxes have not been seen yet.
+ * @return @ref JXL_DEC_ERROR when decoding failed, e.g. invalid codestream.
+ *     TODO(lode): document the input data mechanism
+ * @return @ref JXL_DEC_NEED_MORE_INPUT when more input data is necessary.
+ * @return @ref JXL_DEC_BASIC_INFO when basic info such as image dimensions is
+ *     available and this informative event is subscribed to.
+ * @return @ref JXL_DEC_COLOR_ENCODING when color profile information is
+ *     available and this informative event is subscribed to.
+ * @return @ref JXL_DEC_PREVIEW_IMAGE when preview pixel information is
+ *     available and output in the preview buffer.
+ * @return @ref JXL_DEC_FULL_IMAGE when all pixel information at highest detail
+ *     is available and has been output in the pixel buffer.
+ */
+JXL_EXPORT JxlDecoderStatus JxlDecoderProcessInput(JxlDecoder* dec);
+
+/**
+ * Sets input data for @ref JxlDecoderProcessInput. The data is owned by the
+ * caller and may be used by the decoder until @ref JxlDecoderReleaseInput is
+ * called or the decoder is destroyed or reset so must be kept alive until then.
+ * Cannot be called if @ref JxlDecoderSetInput was already called and @ref
+ * JxlDecoderReleaseInput was not yet called, and cannot be called after @ref
+ * JxlDecoderCloseInput indicating the end of input was called.
+ *
+ * @param dec decoder object
+ * @param data pointer to next bytes to read from
+ * @param size amount of bytes available starting from data
+ * @return @ref JXL_DEC_ERROR if input was already set without releasing or @ref
+ *     JxlDecoderCloseInput was already called, @ref JXL_DEC_SUCCESS otherwise.
+ */
+JXL_EXPORT JxlDecoderStatus JxlDecoderSetInput(JxlDecoder* dec,
+                                               const uint8_t* data,
+                                               size_t size);
+
+/**
+ * Releases input which was provided with @ref JxlDecoderSetInput. Between @ref
+ * JxlDecoderProcessInput and @ref JxlDecoderReleaseInput, the user may not
+ * alter the data in the buffer. Calling @ref JxlDecoderReleaseInput is required
+ * whenever any input is already set and new input needs to be added with @ref
+ * JxlDecoderSetInput, but is not required before @ref JxlDecoderDestroy or @ref
+ * JxlDecoderReset. Calling @ref JxlDecoderReleaseInput when no input is set is
+ * not an error and returns 0.
+ *
+ * @param dec decoder object
+ * @return The amount of bytes the decoder has not yet processed that are still
+ *     remaining in the data set by @ref JxlDecoderSetInput, or 0 if no input is
+ *     set or @ref JxlDecoderReleaseInput was already called. For a next call
+ *     to @ref JxlDecoderProcessInput, the buffer must start with these
+ *     unprocessed bytes. From this value it is possible to infer the position
+ *     of certain JPEG XL codestream elements (e.g. end of headers, frame
+ *     start/end). See the documentation of individual values of @ref
+ *     JxlDecoderStatus for more information.
+ */
+JXL_EXPORT size_t JxlDecoderReleaseInput(JxlDecoder* dec);
+
+/**
+ * Marks the input as finished, indicates that no more @ref JxlDecoderSetInput
+ * will be called. This function allows the decoder to determine correctly if it
+ * should return success, need more input or error in certain cases. For
+ * backwards compatibility with a previous version of the API, using this
+ * function is optional when not using the @ref JXL_DEC_BOX event (the decoder
+ * is able to determine the end of the image frames without marking the end),
+ * but using this function is required when using @ref JXL_DEC_BOX for getting
+ * metadata box contents. This function does not replace @ref
+ * JxlDecoderReleaseInput, that function should still be called if its return
+ * value is needed.
+ *
+ * @ref JxlDecoderCloseInput should be called as soon as all known input bytes
+ * are set (e.g. at the beginning when not streaming but setting all input
+ * at once), before the final @ref JxlDecoderProcessInput calls.
+ *
+ * @param dec decoder object
+ */
+JXL_EXPORT void JxlDecoderCloseInput(JxlDecoder* dec);
+
+/**
+ * Outputs the basic image information, such as image dimensions, bit depth and
+ * all other JxlBasicInfo fields, if available.
+ *
+ * @param dec decoder object
+ * @param info struct to copy the information into, or NULL to only check
+ *     whether the information is available through the return value.
+ * @return @ref JXL_DEC_SUCCESS if the value is available, @ref
+ *     JXL_DEC_NEED_MORE_INPUT if not yet available, @ref JXL_DEC_ERROR
+ *     in case of other error conditions.
+ */
+JXL_EXPORT JxlDecoderStatus JxlDecoderGetBasicInfo(const JxlDecoder* dec,
+                                                   JxlBasicInfo* info);
+
+/**
+ * Outputs information for extra channel at the given index. The index must be
+ * smaller than num_extra_channels in the associated JxlBasicInfo.
+ *
+ * @param dec decoder object
+ * @param index index of the extra channel to query.
+ * @param info struct to copy the information into, or NULL to only check
+ *     whether the information is available through the return value.
+ * @return @ref JXL_DEC_SUCCESS if the value is available, @ref
+ *     JXL_DEC_NEED_MORE_INPUT if not yet available, @ref JXL_DEC_ERROR
+ *     in case of other error conditions.
+ */
+JXL_EXPORT JxlDecoderStatus JxlDecoderGetExtraChannelInfo(
+    const JxlDecoder* dec, size_t index, JxlExtraChannelInfo* info);
+
+/**
+ * Outputs name for extra channel at the given index in UTF-8. The index must be
+ * smaller than num_extra_channels in the associated JxlBasicInfo. The buffer
+ * for name must have at least name_length + 1 bytes allocated, gotten from
+ * the associated JxlExtraChannelInfo.
+ *
+ * @param dec decoder object
+ * @param index index of the extra channel to query.
+ * @param name buffer to copy the name into
+ * @param size size of the name buffer in bytes
+ * @return @ref JXL_DEC_SUCCESS if the value is available, @ref
+ *     JXL_DEC_NEED_MORE_INPUT if not yet available, @ref JXL_DEC_ERROR
+ *     in case of other error conditions.
+ */
+JXL_EXPORT JxlDecoderStatus JxlDecoderGetExtraChannelName(const JxlDecoder* dec,
+                                                          size_t index,
+                                                          char* name,
+                                                          size_t size);
+
+/** Defines which color profile to get: the profile from the codestream
+ * metadata header, which represents the color profile of the original image,
+ * or the color profile from the pixel data produced by the decoder. Both are
+ * the same if the JxlBasicInfo has uses_original_profile set.
+ */
+typedef enum {
+  /** Get the color profile of the original image from the metadata.
+   */
+  JXL_COLOR_PROFILE_TARGET_ORIGINAL = 0,
+
+  /** Get the color profile of the pixel data the decoder outputs. */
+  JXL_COLOR_PROFILE_TARGET_DATA = 1,
+} JxlColorProfileTarget;
+
+/**
+ * Outputs the color profile as JPEG XL encoded structured data, if available.
+ * This is an alternative to an ICC Profile, which can represent a more limited
+ * amount of color spaces, but represents them exactly through enum values.
+ *
+ * It is often possible to use @ref JxlDecoderGetColorAsICCProfile as an
+ * alternative anyway. The following scenarios are possible:
+ *  - The JPEG XL image has an attached ICC Profile, in that case, the encoded
+ *    structured data is not available, this function will return an error
+ *    status. @ref JxlDecoderGetColorAsICCProfile should be called instead.
+ *  - The JPEG XL image has an encoded structured color profile, and it
+ *    represents an RGB or grayscale color space. This function will return it.
+ *    You can still use @ref JxlDecoderGetColorAsICCProfile as well as an
+ *    alternative if desired, though depending on which RGB color space is
+ *    represented, the ICC profile may be a close approximation. It is also not
+ *    always feasible to deduce from an ICC profile which named color space it
+ *    exactly represents, if any, as it can represent any arbitrary space.
+ *    HDR color spaces such as those using PQ and HLG are also potentially
+ *    problematic, in that: while ICC profiles can encode a transfer function
+ *    that happens to approximate those of PQ and HLG (HLG for only one given
+ *    system gamma at a time, and necessitating a 3D LUT if gamma is to be
+ *    different from 1), they cannot (before ICCv4.4) semantically signal that
+ *    this is the color space that they represent. Therefore, they will
+ *    typically not actually be interpreted as representing an HDR color space.
+ *    This is especially detrimental to PQ which will then be interpreted as if
+ *    the maximum signal value represented SDR white instead of 10000 cd/m^2,
+ *    meaning that the image will be displayed two orders of magnitude (5-7 EV)
+ *    too dim.
+ *  - The JPEG XL image has an encoded structured color profile, and it
+ *    indicates an unknown or xyb color space. In that case, @ref
+ *    JxlDecoderGetColorAsICCProfile is not available.
+ *
+ * When rendering an image on a system where ICC-based color management is used,
+ * @ref JxlDecoderGetColorAsICCProfile should generally be used first as it will
+ * return a ready-to-use profile (with the aforementioned caveat about HDR).
+ * When knowledge about the nominal color space is desired if available, @ref
+ * JxlDecoderGetColorAsEncodedProfile should be used first.
+ *
+ * @param dec decoder object
+ * @param unused_format deprecated, can be NULL
+ * @param target whether to get the original color profile from the metadata
+ *     or the color profile of the decoded pixels.
+ * @param color_encoding struct to copy the information into, or NULL to only
+ *     check whether the information is available through the return value.
+ * @return @ref JXL_DEC_SUCCESS if the data is available and returned, @ref
+ *     JXL_DEC_NEED_MORE_INPUT if not yet available, @ref JXL_DEC_ERROR in
+ *     case the encoded structured color profile does not exist in the
+ *     codestream.
+ */
+JXL_EXPORT JxlDecoderStatus JxlDecoderGetColorAsEncodedProfile(
+    const JxlDecoder* dec, const JxlPixelFormat* unused_format,
+    JxlColorProfileTarget target, JxlColorEncoding* color_encoding);
+
+/**
+ * Outputs the size in bytes of the ICC profile returned by @ref
+ * JxlDecoderGetColorAsICCProfile, if available, or indicates there is none
+ * available. In most cases, the image will have an ICC profile available, but
+ * if it does not, @ref JxlDecoderGetColorAsEncodedProfile must be used instead.
+ *
+ * @see JxlDecoderGetColorAsEncodedProfile for more information. The ICC
+ * profile is either the exact ICC profile attached to the codestream metadata,
+ * or a close approximation generated from JPEG XL encoded structured data,
+ * depending of what is encoded in the codestream.
+ *
+ * @param dec decoder object
+ * @param unused_format deprecated, can be NULL
+ * @param target whether to get the original color profile from the metadata
+ *     or the color profile of the decoded pixels.
+ * @param size variable to output the size into, or NULL to only check the
+ *     return status.
+ * @return @ref JXL_DEC_SUCCESS if the ICC profile is available, @ref
+ *     JXL_DEC_NEED_MORE_INPUT if the decoder has not yet received enough
+ *     input data to determine whether an ICC profile is available or what its
+ *     size is, @ref JXL_DEC_ERROR in case the ICC profile is not available and
+ *     cannot be generated.
+ */
+JXL_EXPORT JxlDecoderStatus JxlDecoderGetICCProfileSize(
+    const JxlDecoder* dec, const JxlPixelFormat* unused_format,
+    JxlColorProfileTarget target, size_t* size);
+
+/**
+ * Outputs ICC profile if available. The profile is only available if @ref
+ * JxlDecoderGetICCProfileSize returns success. The output buffer must have
+ * at least as many bytes as given by @ref JxlDecoderGetICCProfileSize.
+ *
+ * @param dec decoder object
+ * @param unused_format deprecated, can be NULL
+ * @param target whether to get the original color profile from the metadata
+ *     or the color profile of the decoded pixels.
+ * @param icc_profile buffer to copy the ICC profile into
+ * @param size size of the icc_profile buffer in bytes
+ * @return @ref JXL_DEC_SUCCESS if the profile was successfully returned is
+ *     available, @ref JXL_DEC_NEED_MORE_INPUT if not yet available, @ref
+ *     JXL_DEC_ERROR if the profile doesn't exist or the output size is not
+ *     large enough.
+ */
+JXL_EXPORT JxlDecoderStatus JxlDecoderGetColorAsICCProfile(
+    const JxlDecoder* dec, const JxlPixelFormat* unused_format,
+    JxlColorProfileTarget target, uint8_t* icc_profile, size_t size);
+
+/** Sets the desired output color profile of the decoded image by calling
+ * @ref JxlDecoderSetOutputColorProfile, passing on @c color_encoding and
+ * setting @c icc_data to NULL. See @ref JxlDecoderSetOutputColorProfile for
+ * details.
+ *
+ * @param dec decoder object
+ * @param color_encoding the default color encoding to set
+ * @return @ref JXL_DEC_SUCCESS if the preference was set successfully, @ref
+ *     JXL_DEC_ERROR otherwise.
+ */
+JXL_EXPORT JxlDecoderStatus JxlDecoderSetPreferredColorProfile(
+    JxlDecoder* dec, const JxlColorEncoding* color_encoding);
+
+/** Requests that the decoder perform tone mapping to the peak display luminance
+ * passed as @c desired_intensity_target, if appropriate.
+ * @note This is provided for convenience and the exact tone mapping that is
+ * performed is not meant to be considered authoritative in any way. It may
+ * change from version to version.
+ * @param dec decoder object
+ * @param desired_intensity_target the intended target peak luminance
+ * @return @ref JXL_DEC_SUCCESS if the preference was set successfully, @ref
+ * JXL_DEC_ERROR otherwise.
+ */
+JXL_EXPORT JxlDecoderStatus JxlDecoderSetDesiredIntensityTarget(
+    JxlDecoder* dec, float desired_intensity_target);
+
+/**
+ * Sets the desired output color profile of the decoded image either from a
+ * color encoding or an ICC profile. Valid calls of this function have either @c
+ * color_encoding or @c icc_data set to NULL and @c icc_size must be 0 if and
+ * only if @c icc_data is NULL.
+ *
+ * Depending on whether a color management system (CMS) has been set the
+ * behavior is as follows:
+ *
+ * If a color management system (CMS) has been set with @ref JxlDecoderSetCms,
+ * and the CMS supports output to the desired color encoding or ICC profile,
+ * then it will provide the output in that color encoding or ICC profile. If the
+ * desired color encoding or the ICC is not supported, then an error will be
+ * returned.
+ *
+ * If no CMS has been set with @ref JxlDecoderSetCms, there are two cases:
+ *
+ * (1) Calling this function with a color encoding will convert XYB images to
+ * the desired color encoding. In this case, if the requested color encoding has
+ * a narrower gamut, or the white points differ, then the resulting image can
+ * have significant color distortion. Non-XYB images will not be converted to
+ * the desired color space.
+ *
+ * (2) Calling this function with an ICC profile will result in an error.
+ *
+ * If called with an ICC profile (after a call to @ref JxlDecoderSetCms), the
+ * ICC profile has to be a valid RGB or grayscale color profile.
+ *
+ * Can only be set after the @ref JXL_DEC_COLOR_ENCODING event occurred and
+ * before any other event occurred, and should be used before getting
+ * JXL_COLOR_PROFILE_TARGET_DATA.
+ *
+ * This function must not be called before JxlDecoderSetCms.
+ *
+ * @param dec decoder orbject
+ * @param color_encoding the output color encoding
+ * @param icc_data bytes of the icc profile
+ * @param icc_size size of the icc profile in bytes
+ * @return @ref JXL_DEC_SUCCESS if the color profile was set successfully, @ref
+ *     JXL_DEC_ERROR otherwise.
+ */
+JXL_EXPORT JxlDecoderStatus JxlDecoderSetOutputColorProfile(
+    JxlDecoder* dec, const JxlColorEncoding* color_encoding,
+    const uint8_t* icc_data, size_t icc_size);
+
+/**
+ * Sets the color management system (CMS) that will be used for color
+ * conversion (if applicable) during decoding. May only be set before starting
+ * decoding and must not be called after @ref JxlDecoderSetOutputColorProfile.
+ *
+ * See @ref JxlDecoderSetOutputColorProfile for how color conversions are done
+ * depending on whether or not a CMS has been set with @ref JxlDecoderSetCms.
+ *
+ * @param dec decoder object.
+ * @param cms structure representing a CMS implementation. See @ref
+ * JxlCmsInterface for more details.
+ */
+JXL_EXPORT void JxlDecoderSetCms(JxlDecoder* dec, JxlCmsInterface cms);
+// TODO(firsching): add a function JxlDecoderSetDefaultCms() for setting a
+// default in case libjxl is build with a CMS.
+
+/**
+ * Returns the minimum size in bytes of the preview image output pixel buffer
+ * for the given format. This is the buffer for @ref
+ * JxlDecoderSetPreviewOutBuffer. Requires the preview header information is
+ * available in the decoder.
+ *
+ * @param dec decoder object
+ * @param format format of pixels
+ * @param size output value, buffer size in bytes
+ * @return @ref JXL_DEC_SUCCESS on success, @ref JXL_DEC_ERROR on error, such as
+ *     information not available yet.
+ */
+JXL_EXPORT JxlDecoderStatus JxlDecoderPreviewOutBufferSize(
+    const JxlDecoder* dec, const JxlPixelFormat* format, size_t* size);
+
+/**
+ * Sets the buffer to write the small resolution preview image
+ * to. The size of the buffer must be at least as large as given by @ref
+ * JxlDecoderPreviewOutBufferSize. The buffer follows the format described
+ * by JxlPixelFormat. The preview image dimensions are given by the
+ * JxlPreviewHeader. The buffer is owned by the caller.
+ *
+ * @param dec decoder object
+ * @param format format of pixels. Object owned by user and its contents are
+ *     copied internally.
+ * @param buffer buffer type to output the pixel data to
+ * @param size size of buffer in bytes
+ * @return @ref JXL_DEC_SUCCESS on success, @ref JXL_DEC_ERROR on error, such as
+ *     size too small.
+ */
+JXL_EXPORT JxlDecoderStatus JxlDecoderSetPreviewOutBuffer(
+    JxlDecoder* dec, const JxlPixelFormat* format, void* buffer, size_t size);
+
+/**
+ * Outputs the information from the frame, such as duration when have_animation.
+ * This function can be called when @ref JXL_DEC_FRAME occurred for the current
+ * frame, even when have_animation in the JxlBasicInfo is JXL_FALSE.
+ *
+ * @param dec decoder object
+ * @param header struct to copy the information into, or NULL to only check
+ *     whether the information is available through the return value.
+ * @return @ref JXL_DEC_SUCCESS if the value is available, @ref
+ *     JXL_DEC_NEED_MORE_INPUT if not yet available, @ref JXL_DEC_ERROR in
+ *     case of other error conditions.
+ */
+JXL_EXPORT JxlDecoderStatus JxlDecoderGetFrameHeader(const JxlDecoder* dec,
+                                                     JxlFrameHeader* header);
+
+/**
+ * Outputs name for the current frame. The buffer for name must have at least
+ * name_length + 1 bytes allocated, gotten from the associated JxlFrameHeader.
+ *
+ * @param dec decoder object
+ * @param name buffer to copy the name into
+ * @param size size of the name buffer in bytes, including zero termination
+ *    character, so this must be at least JxlFrameHeader.name_length + 1.
+ * @return @ref JXL_DEC_SUCCESS if the value is available, @ref
+ *     JXL_DEC_NEED_MORE_INPUT if not yet available, @ref JXL_DEC_ERROR in
+ *     case of other error conditions.
+ */
+JXL_EXPORT JxlDecoderStatus JxlDecoderGetFrameName(const JxlDecoder* dec,
+                                                   char* name, size_t size);
+
+/**
+ * Outputs the blend information for the current frame for a specific extra
+ * channel. This function can be called when @ref JXL_DEC_FRAME occurred for the
+ * current frame, even when have_animation in the JxlBasicInfo is JXL_FALSE.
+ * This information is only useful if coalescing is disabled; otherwise the
+ * decoder will have performed blending already.
+ *
+ * @param dec decoder object
+ * @param index the index of the extra channel
+ * @param blend_info struct to copy the information into
+ * @return @ref JXL_DEC_SUCCESS on success, @ref JXL_DEC_ERROR on error
+ */
+JXL_EXPORT JxlDecoderStatus JxlDecoderGetExtraChannelBlendInfo(
+    const JxlDecoder* dec, size_t index, JxlBlendInfo* blend_info);
+
+/**
+ * Returns the minimum size in bytes of the image output pixel buffer for the
+ * given format. This is the buffer for @ref JxlDecoderSetImageOutBuffer.
+ * Requires that the basic image information is available in the decoder in the
+ * case of coalescing enabled (default). In case coalescing is disabled, this
+ * can only be called after the @ref JXL_DEC_FRAME event occurs. In that case,
+ * it will return the size required to store the possibly cropped frame (which
+ * can be larger or smaller than the image dimensions).
+ *
+ * @param dec decoder object
+ * @param format format of the pixels.
+ * @param size output value, buffer size in bytes
+ * @return @ref JXL_DEC_SUCCESS on success, @ref JXL_DEC_ERROR on error, such as
+ *     information not available yet.
+ */
+JXL_EXPORT JxlDecoderStatus JxlDecoderImageOutBufferSize(
+    const JxlDecoder* dec, const JxlPixelFormat* format, size_t* size);
+
+/**
+ * Sets the buffer to write the full resolution image to. This can be set when
+ * the @ref JXL_DEC_FRAME event occurs, must be set when the @ref
+ * JXL_DEC_NEED_IMAGE_OUT_BUFFER event occurs, and applies only for the
+ * current frame. The size of the buffer must be at least as large as given
+ * by @ref JxlDecoderImageOutBufferSize. The buffer follows the format described
+ * by JxlPixelFormat. The buffer is owned by the caller.
+ *
+ * @param dec decoder object
+ * @param format format of the pixels. Object owned by user and its contents
+ *     are copied internally.
+ * @param buffer buffer type to output the pixel data to
+ * @param size size of buffer in bytes
+ * @return @ref JXL_DEC_SUCCESS on success, @ref JXL_DEC_ERROR on error, such as
+ *     size too small.
+ */
+JXL_EXPORT JxlDecoderStatus JxlDecoderSetImageOutBuffer(
+    JxlDecoder* dec, const JxlPixelFormat* format, void* buffer, size_t size);
+
+/**
+ * Function type for @ref JxlDecoderSetImageOutCallback.
+ *
+ * The callback may be called simultaneously by different threads when using a
+ * threaded parallel runner, on different pixels.
+ *
+ * @param opaque optional user data, as given to @ref
+ *     JxlDecoderSetImageOutCallback.
+ * @param x horizontal position of leftmost pixel of the pixel data.
+ * @param y vertical position of the pixel data.
+ * @param num_pixels amount of pixels included in the pixel data, horizontally.
+ *     This is not the same as xsize of the full image, it may be smaller.
+ * @param pixels pixel data as a horizontal stripe, in the format passed to @ref
+ *     JxlDecoderSetImageOutCallback. The memory is not owned by the user, and
+ *     is only valid during the time the callback is running.
+ */
+typedef void (*JxlImageOutCallback)(void* opaque, size_t x, size_t y,
+                                    size_t num_pixels, const void* pixels);
+
+/**
+ * Initialization callback for @ref JxlDecoderSetMultithreadedImageOutCallback.
+ *
+ * @param init_opaque optional user data, as given to @ref
+ *     JxlDecoderSetMultithreadedImageOutCallback.
+ * @param num_threads maximum number of threads that will call the @c run
+ *     callback concurrently.
+ * @param num_pixels_per_thread maximum number of pixels that will be passed in
+ *     one call to @c run.
+ * @return a pointer to data that will be passed to the @c run callback, or
+ *     @c NULL if initialization failed.
+ */
+typedef void* (*JxlImageOutInitCallback)(void* init_opaque, size_t num_threads,
+                                         size_t num_pixels_per_thread);
+
+/**
+ * Worker callback for @ref JxlDecoderSetMultithreadedImageOutCallback.
+ *
+ * @param run_opaque user data returned by the @c init callback.
+ * @param thread_id number in `[0, num_threads)` identifying the thread of the
+ *     current invocation of the callback.
+ * @param x horizontal position of the first (leftmost) pixel of the pixel data.
+ * @param y vertical position of the pixel data.
+ * @param num_pixels number of pixels in the pixel data. May be less than the
+ *     full @c xsize of the image, and will be at most equal to the @c
+ *     num_pixels_per_thread that was passed to @c init.
+ * @param pixels pixel data as a horizontal stripe, in the format passed to @ref
+ *     JxlDecoderSetMultithreadedImageOutCallback. The data pointed to
+ *     remains owned by the caller and is only guaranteed to outlive the current
+ *     callback invocation.
+ */
+typedef void (*JxlImageOutRunCallback)(void* run_opaque, size_t thread_id,
+                                       size_t x, size_t y, size_t num_pixels,
+                                       const void* pixels);
+
+/**
+ * Destruction callback for @ref JxlDecoderSetMultithreadedImageOutCallback,
+ * called after all invocations of the @c run callback to perform any
+ * appropriate clean-up of the @c run_opaque data returned by @c init.
+ *
+ * @param run_opaque user data returned by the @c init callback.
+ */
+typedef void (*JxlImageOutDestroyCallback)(void* run_opaque);
+
+/**
+ * Sets pixel output callback. This is an alternative to @ref
+ * JxlDecoderSetImageOutBuffer. This can be set when the @ref JXL_DEC_FRAME
+ * event occurs, must be set when the @ref JXL_DEC_NEED_IMAGE_OUT_BUFFER event
+ * occurs, and applies only for the current frame. Only one of @ref
+ * JxlDecoderSetImageOutBuffer or @ref JxlDecoderSetImageOutCallback may be used
+ * for the same frame, not both at the same time.
+ *
+ * The callback will be called multiple times, to receive the image
+ * data in small chunks. The callback receives a horizontal stripe of pixel
+ * data, 1 pixel high, xsize pixels wide, called a scanline. The xsize here is
+ * not the same as the full image width, the scanline may be a partial section,
+ * and xsize may differ between calls. The user can then process and/or copy the
+ * partial scanline to an image buffer. The callback may be called
+ * simultaneously by different threads when using a threaded parallel runner, on
+ * different pixels.
+ *
+ * If @ref JxlDecoderFlushImage is not used, then each pixel will be visited
+ * exactly once by the different callback calls, during processing with one or
+ * more @ref JxlDecoderProcessInput calls. These pixels are decoded to full
+ * detail, they are not part of a lower resolution or lower quality progressive
+ * pass, but the final pass.
+ *
+ * If @ref JxlDecoderFlushImage is used, then in addition each pixel will be
+ * visited zero or one times during the blocking @ref JxlDecoderFlushImage call.
+ * Pixels visited as a result of @ref JxlDecoderFlushImage may represent a lower
+ * resolution or lower quality intermediate progressive pass of the image. Any
+ * visited pixel will be of a quality at least as good or better than previous
+ * visits of this pixel. A pixel may be visited zero times if it cannot be
+ * decoded yet or if it was already decoded to full precision (this behavior is
+ * not guaranteed).
+ *
+ * @param dec decoder object
+ * @param format format of the pixels. Object owned by user; its contents are
+ *     copied internally.
+ * @param callback the callback function receiving partial scanlines of pixel
+ *     data.
+ * @param opaque optional user data, which will be passed on to the callback,
+ *     may be NULL.
+ * @return @ref JXL_DEC_SUCCESS on success, @ref JXL_DEC_ERROR on error, such
+ *     as @ref JxlDecoderSetImageOutBuffer already set.
+ */
+JXL_EXPORT JxlDecoderStatus
+JxlDecoderSetImageOutCallback(JxlDecoder* dec, const JxlPixelFormat* format,
+                              JxlImageOutCallback callback, void* opaque);
+
+/** Similar to @ref JxlDecoderSetImageOutCallback except that the callback is
+ * allowed an initialization phase during which it is informed of how many
+ * threads will call it concurrently, and those calls are further informed of
+ * which thread they are occurring in.
+ *
+ * @param dec decoder object
+ * @param format format of the pixels. Object owned by user; its contents are
+ *     copied internally.
+ * @param init_callback initialization callback.
+ * @param run_callback the callback function receiving partial scanlines of
+ *     pixel data.
+ * @param destroy_callback clean-up callback invoked after all calls to @c
+ *     run_callback. May be NULL if no clean-up is necessary.
+ * @param init_opaque optional user data passed to @c init_callback, may be NULL
+ *     (unlike the return value from @c init_callback which may only be NULL if
+ *     initialization failed).
+ * @return @ref JXL_DEC_SUCCESS on success, @ref JXL_DEC_ERROR on error, such
+ *     as @ref JxlDecoderSetImageOutBuffer having already been called.
+ */
+JXL_EXPORT JxlDecoderStatus JxlDecoderSetMultithreadedImageOutCallback(
+    JxlDecoder* dec, const JxlPixelFormat* format,
+    JxlImageOutInitCallback init_callback, JxlImageOutRunCallback run_callback,
+    JxlImageOutDestroyCallback destroy_callback, void* init_opaque);
+
+/**
+ * Returns the minimum size in bytes of an extra channel pixel buffer for the
+ * given format. This is the buffer for @ref JxlDecoderSetExtraChannelBuffer.
+ * Requires the basic image information is available in the decoder.
+ *
+ * @param dec decoder object
+ * @param format format of the pixels. The num_channels value is ignored and is
+ *     always treated to be 1.
+ * @param size output value, buffer size in bytes
+ * @param index which extra channel to get, matching the index used in @ref
+ *     JxlDecoderGetExtraChannelInfo. Must be smaller than num_extra_channels in
+ *     the associated JxlBasicInfo.
+ * @return @ref JXL_DEC_SUCCESS on success, @ref JXL_DEC_ERROR on error, such as
+ *     information not available yet or invalid index.
+ */
+JXL_EXPORT JxlDecoderStatus JxlDecoderExtraChannelBufferSize(
+    const JxlDecoder* dec, const JxlPixelFormat* format, size_t* size,
+    uint32_t index);
+
+/**
+ * Sets the buffer to write an extra channel to. This can be set when
+ * the @ref JXL_DEC_FRAME or @ref JXL_DEC_NEED_IMAGE_OUT_BUFFER event occurs,
+ * and applies only for the current frame. The size of the buffer must be at
+ * least as large as given by @ref JxlDecoderExtraChannelBufferSize. The buffer
+ * follows the format described by JxlPixelFormat, but where num_channels is 1.
+ * The buffer is owned by the caller. The amount of extra channels is given by
+ * the num_extra_channels field in the associated JxlBasicInfo, and the
+ * information of individual extra channels can be queried with @ref
+ * JxlDecoderGetExtraChannelInfo. To get multiple extra channels, this function
+ * must be called multiple times, once for each wanted index. Not all images
+ * have extra channels. The alpha channel is an extra channel and can be gotten
+ * as part of the color channels when using an RGBA pixel buffer with @ref
+ * JxlDecoderSetImageOutBuffer, but additionally also can be gotten
+ * separately as extra channel. The color channels themselves cannot be gotten
+ * this way.
+ *
+ *
+ * @param dec decoder object
+ * @param format format of the pixels. Object owned by user and its contents
+ *     are copied internally. The num_channels value is ignored and is always
+ *     treated to be 1.
+ * @param buffer buffer type to output the pixel data to
+ * @param size size of buffer in bytes
+ * @param index which extra channel to get, matching the index used in @ref
+ *     JxlDecoderGetExtraChannelInfo. Must be smaller than num_extra_channels in
+ *     the associated JxlBasicInfo.
+ * @return @ref JXL_DEC_SUCCESS on success, @ref JXL_DEC_ERROR on error, such as
+ *     size too small or invalid index.
+ */
+JXL_EXPORT JxlDecoderStatus
+JxlDecoderSetExtraChannelBuffer(JxlDecoder* dec, const JxlPixelFormat* format,
+                                void* buffer, size_t size, uint32_t index);
+
+/**
+ * Sets output buffer for reconstructed JPEG codestream.
+ *
+ * The data is owned by the caller and may be used by the decoder until @ref
+ * JxlDecoderReleaseJPEGBuffer is called or the decoder is destroyed or
+ * reset so must be kept alive until then.
+ *
+ * If a JPEG buffer was set before and released with @ref
+ * JxlDecoderReleaseJPEGBuffer, bytes that the decoder has already output
+ * should not be included, only the remaining bytes output must be set.
+ *
+ * @param dec decoder object
+ * @param data pointer to next bytes to write to
+ * @param size amount of bytes available starting from data
+ * @return @ref JXL_DEC_ERROR if output buffer was already set and @ref
+ *     JxlDecoderReleaseJPEGBuffer was not called on it, @ref JXL_DEC_SUCCESS
+ *     otherwise
+ */
+JXL_EXPORT JxlDecoderStatus JxlDecoderSetJPEGBuffer(JxlDecoder* dec,
+                                                    uint8_t* data, size_t size);
+
+/**
+ * Releases buffer which was provided with @ref JxlDecoderSetJPEGBuffer.
+ *
+ * Calling @ref JxlDecoderReleaseJPEGBuffer is required whenever
+ * a buffer is already set and a new buffer needs to be added with @ref
+ * JxlDecoderSetJPEGBuffer, but is not required before @ref
+ * JxlDecoderDestroy or @ref JxlDecoderReset.
+ *
+ * Calling @ref JxlDecoderReleaseJPEGBuffer when no buffer is set is
+ * not an error and returns 0.
+ *
+ * @param dec decoder object
+ * @return the amount of bytes the decoder has not yet written to of the data
+ *     set by @ref JxlDecoderSetJPEGBuffer, or 0 if no buffer is set or @ref
+ *     JxlDecoderReleaseJPEGBuffer was already called.
+ */
+JXL_EXPORT size_t JxlDecoderReleaseJPEGBuffer(JxlDecoder* dec);
+
+/**
+ * Sets output buffer for box output codestream.
+ *
+ * The data is owned by the caller and may be used by the decoder until @ref
+ * JxlDecoderReleaseBoxBuffer is called or the decoder is destroyed or
+ * reset so must be kept alive until then.
+ *
+ * If for the current box a box buffer was set before and released with @ref
+ * JxlDecoderReleaseBoxBuffer, bytes that the decoder has already output
+ * should not be included, only the remaining bytes output must be set.
+ *
+ * The @ref JxlDecoderReleaseBoxBuffer must be used at the next @ref JXL_DEC_BOX
+ * event or final @ref JXL_DEC_SUCCESS event to compute the size of the output
+ * box bytes.
+ *
+ * @param dec decoder object
+ * @param data pointer to next bytes to write to
+ * @param size amount of bytes available starting from data
+ * @return @ref JXL_DEC_ERROR if output buffer was already set and @ref
+ *     JxlDecoderReleaseBoxBuffer was not called on it, @ref JXL_DEC_SUCCESS
+ *     otherwise
+ */
+JXL_EXPORT JxlDecoderStatus JxlDecoderSetBoxBuffer(JxlDecoder* dec,
+                                                   uint8_t* data, size_t size);
+
+/**
+ * Releases buffer which was provided with @ref JxlDecoderSetBoxBuffer.
+ *
+ * Calling @ref JxlDecoderReleaseBoxBuffer is required whenever
+ * a buffer is already set and a new buffer needs to be added with @ref
+ * JxlDecoderSetBoxBuffer, but is not required before @ref
+ * JxlDecoderDestroy or @ref JxlDecoderReset.
+ *
+ * Calling @ref JxlDecoderReleaseBoxBuffer when no buffer is set is
+ * not an error and returns 0.
+ *
+ * @param dec decoder object
+ * @return the amount of bytes the decoder has not yet written to of the data
+ *     set by @ref JxlDecoderSetBoxBuffer, or 0 if no buffer is set or @ref
+ *     JxlDecoderReleaseBoxBuffer was already called.
+ */
+JXL_EXPORT size_t JxlDecoderReleaseBoxBuffer(JxlDecoder* dec);
+
+/**
+ * Configures whether to get boxes in raw mode or in decompressed mode. In raw
+ * mode, boxes are output as their bytes appear in the container file, which may
+ * be decompressed, or compressed if their type is "brob". In decompressed mode,
+ * "brob" boxes are decompressed with Brotli before outputting them. The size of
+ * the decompressed stream is not known before the decompression has already
+ * finished.
+ *
+ * The default mode is raw. This setting can only be changed before decoding, or
+ * directly after a @ref JXL_DEC_BOX event, and is remembered until the decoder
+ * is reset or destroyed.
+ *
+ * Enabling decompressed mode requires Brotli support from the library.
+ *
+ * @param dec decoder object
+ * @param decompress JXL_TRUE to transparently decompress, JXL_FALSE to get
+ *     boxes in raw mode.
+ * @return @ref JXL_DEC_ERROR if decompressed mode is set and Brotli is not
+ *     available, @ref JXL_DEC_SUCCESS otherwise.
+ */
+JXL_EXPORT JxlDecoderStatus JxlDecoderSetDecompressBoxes(JxlDecoder* dec,
+                                                         JXL_BOOL decompress);
+
+/**
+ * Outputs the type of the current box, after a @ref JXL_DEC_BOX event occurred,
+ * as 4 characters without null termination character. In case of a compressed
+ * "brob" box, this will return "brob" if the decompressed argument is
+ * JXL_FALSE, or the underlying box type if the decompressed argument is
+ * JXL_TRUE.
+ *
+ * The following box types are currently described in ISO/IEC 18181-2:
+ *  - "Exif": a box with EXIF metadata.  Starts with a 4-byte tiff header offset
+ *    (big-endian uint32) that indicates the start of the actual EXIF data
+ *    (which starts with a tiff header). Usually the offset will be zero and the
+ *    EXIF data starts immediately after the offset field. The Exif orientation
+ *    should be ignored by applications; the JPEG XL codestream orientation
+ *    takes precedence and libjxl will by default apply the correct orientation
+ *    automatically (see @ref JxlDecoderSetKeepOrientation).
+ *  - "xml ": a box with XML data, in particular XMP metadata.
+ *  - "jumb": a JUMBF superbox (JPEG Universal Metadata Box Format, ISO/IEC
+ *    19566-5).
+ *  - "JXL ": mandatory signature box, must come first, 12 bytes long including
+ *    the box header
+ *  - "ftyp": a second mandatory signature box, must come second, 20 bytes long
+ *    including the box header
+ *  - "jxll": a JXL level box. This indicates if the codestream is level 5 or
+ *    level 10 compatible. If not present, it is level 5. Level 10 allows more
+ *    features such as very high image resolution and bit-depths above 16 bits
+ *    per channel. Added automatically by the encoder when
+ *    JxlEncoderSetCodestreamLevel is used
+ *  - "jxlc": a box with the image codestream, in case the codestream is not
+ *    split across multiple boxes. The codestream contains the JPEG XL image
+ *    itself, including the basic info such as image dimensions, ICC color
+ *    profile, and all the pixel data of all the image frames.
+ *  - "jxlp": a codestream box in case it is split across multiple boxes.
+ *    The contents are the same as in case of a jxlc box, when concatenated.
+ *  - "brob": a Brotli-compressed box, which otherwise represents an existing
+ *    type of box such as Exif or "xml ". When @ref JxlDecoderSetDecompressBoxes
+ *    is set to JXL_TRUE, these boxes will be transparently decompressed by the
+ *    decoder.
+ *  - "jxli": frame index box, can list the keyframes in case of a JPEG XL
+ *    animation allowing the decoder to jump to individual frames more
+ *    efficiently.
+ *  - "jbrd": JPEG reconstruction box, contains the information required to
+ *    byte-for-byte losslessly recontruct a JPEG-1 image. The JPEG DCT
+ *    coefficients (pixel content) themselves as well as the ICC profile are
+ *    encoded in the JXL codestream (jxlc or jxlp) itself. EXIF, XMP and JUMBF
+ *    metadata is encoded in the corresponding boxes. The jbrd box itself
+ *    contains information such as the remaining app markers of the JPEG-1 file
+ *    and everything else required to fit the information together into the
+ *    exact original JPEG file.
+ *
+ * Other application-specific boxes can exist. Their typename should not begin
+ * with "jxl" or "JXL" or conflict with other existing typenames.
+ *
+ * The signature, jxl* and jbrd boxes are processed by the decoder and would
+ * typically be ignored by applications. The typical way to use this function is
+ * to check if an encountered box contains metadata that the application is
+ * interested in (e.g. EXIF or XMP metadata), in order to conditionally set a
+ * box buffer.
+ *
+ * @param dec decoder object
+ * @param type buffer to copy the type into
+ * @param decompressed which box type to get: JXL_FALSE to get the raw box type,
+ *     which can be "brob", JXL_TRUE, get the underlying box type.
+ * @return @ref JXL_DEC_SUCCESS if the value is available, @ref JXL_DEC_ERROR if
+ *     not, for example the JXL file does not use the container format.
+ */
+JXL_EXPORT JxlDecoderStatus JxlDecoderGetBoxType(JxlDecoder* dec,
+                                                 JxlBoxType type,
+                                                 JXL_BOOL decompressed);
+
+/**
+ * Returns the size of a box as it appears in the container file, after the @ref
+ * JXL_DEC_BOX event. For a non-compressed box, this is the size of the
+ * contents, excluding the 4 bytes indicating the box type. For a compressed
+ * "brob" box, this is the size of the compressed box contents plus the
+ * additional 4 byte indicating the underlying box type, but excluding the 4
+ * bytes indicating "brob". This function gives the size of the data that will
+ * be written in the output buffer when getting boxes in the default raw
+ * compressed mode. When @ref JxlDecoderSetDecompressBoxes is enabled, the
+ * return value of function does not change, and the decompressed size is not
+ * known before it has already been decompressed and output.
+ *
+ * @param dec decoder object
+ * @param size raw size of the box in bytes
+ * @return @ref JXL_DEC_ERROR if no box size is available, @ref JXL_DEC_SUCCESS
+ *     otherwise.
+ */
+JXL_EXPORT JxlDecoderStatus JxlDecoderGetBoxSizeRaw(const JxlDecoder* dec,
+                                                    uint64_t* size);
+
+/**
+ * Configures at which progressive steps in frame decoding these @ref
+ * JXL_DEC_FRAME_PROGRESSION event occurs. The default value for the level
+ * of detail if this function is never called is `kDC`.
+ *
+ * @param dec decoder object
+ * @param detail at which level of detail to trigger @ref
+ *     JXL_DEC_FRAME_PROGRESSION
+ * @return @ref JXL_DEC_SUCCESS on success, @ref JXL_DEC_ERROR on error, such as
+ *     an invalid value for the progressive detail.
+ */
+JXL_EXPORT JxlDecoderStatus
+JxlDecoderSetProgressiveDetail(JxlDecoder* dec, JxlProgressiveDetail detail);
+
+/**
+ * Returns the intended downsampling ratio for the progressive frame produced
+ * by @ref JxlDecoderFlushImage after the latest @ref JXL_DEC_FRAME_PROGRESSION
+ * event.
+ *
+ * @param dec decoder object
+ * @return The intended downsampling ratio, can be 1, 2, 4 or 8.
+ */
+JXL_EXPORT size_t JxlDecoderGetIntendedDownsamplingRatio(JxlDecoder* dec);
+
+/**
+ * Outputs progressive step towards the decoded image so far when only partial
+ * input was received. If the flush was successful, the buffer set with @ref
+ * JxlDecoderSetImageOutBuffer will contain partial image data.
+ *
+ * Can be called when @ref JxlDecoderProcessInput returns @ref
+ * JXL_DEC_NEED_MORE_INPUT, after the @ref JXL_DEC_FRAME event already occurred
+ * and before the @ref JXL_DEC_FULL_IMAGE event occurred for a frame.
+ *
+ * @param dec decoder object
+ * @return @ref JXL_DEC_SUCCESS if image data was flushed to the output buffer,
+ *     or @ref JXL_DEC_ERROR when no flush was done, e.g. if not enough image
+ *     data was available yet even for flush, or no output buffer was set yet.
+ *     This error is not fatal, it only indicates no flushed image is available
+ *     right now. Regular decoding can still be performed.
+ */
+JXL_EXPORT JxlDecoderStatus JxlDecoderFlushImage(JxlDecoder* dec);
+
+/**
+ * Sets the bit depth of the output buffer or callback.
+ *
+ * Can be called after @ref JxlDecoderSetImageOutBuffer or @ref
+ * JxlDecoderSetImageOutCallback. For float pixel data types, only the default
+ * @ref JXL_BIT_DEPTH_FROM_PIXEL_FORMAT setting is supported.
+ *
+ * @param dec decoder object
+ * @param bit_depth the bit depth setting of the pixel output
+ * @return @ref JXL_DEC_SUCCESS on success, @ref JXL_DEC_ERROR on error, such as
+ *     incompatible custom bit depth and pixel data type.
+ */
+JXL_EXPORT JxlDecoderStatus
+JxlDecoderSetImageOutBitDepth(JxlDecoder* dec, const JxlBitDepth* bit_depth);
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}
+#endif
+
+#endif /* JXL_DECODE_H_ */
+
+/** @}*/
diff --git a/third_party/jpeg-xl/lib/include/jxl/decode_cxx.h b/third_party/jpeg-xl/lib/include/jxl/decode_cxx.h
new file mode 100644
index 0000000000..bc6e8a3789
--- /dev/null
+++ b/third_party/jpeg-xl/lib/include/jxl/decode_cxx.h
@@ -0,0 +1,57 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+/// @addtogroup libjxl_decoder
+/// @{
+///
+/// @file decode_cxx.h
+/// @brief C++ header-only helper for @ref decode.h.
+///
+/// There's no binary library associated with the header since this is a header
+/// only library.
+
+#ifndef JXL_DECODE_CXX_H_
+#define JXL_DECODE_CXX_H_
+
+#include <jxl/decode.h>
+
+#include <memory>
+
+#if !(defined(__cplusplus) || defined(c_plusplus))
+#error "This a C++ only header. Use jxl/decode.h from C sources."
+#endif
+
+/// Struct to call JxlDecoderDestroy from the JxlDecoderPtr unique_ptr.
+struct JxlDecoderDestroyStruct {
+  /// Calls @ref JxlDecoderDestroy() on the passed decoder.
+  void operator()(JxlDecoder* decoder) { JxlDecoderDestroy(decoder); }
+};
+
+/// std::unique_ptr<> type that calls JxlDecoderDestroy() when releasing the
+/// decoder.
+///
+/// Use this helper type from C++ sources to ensure the decoder is destroyed and
+/// their internal resources released.
+typedef std::unique_ptr<JxlDecoder, JxlDecoderDestroyStruct> JxlDecoderPtr;
+
+/// Creates an instance of JxlDecoder into a JxlDecoderPtr and initializes it.
+///
+/// This function returns a unique_ptr that will call JxlDecoderDestroy() when
+/// releasing the pointer. See @ref JxlDecoderCreate for details on the
+/// instance creation.
+///
+/// @param memory_manager custom allocator function. It may be NULL. The memory
+///        manager will be copied internally.
+/// @return a @c NULL JxlDecoderPtr if the instance can not be allocated or
+///         initialized
+/// @return initialized JxlDecoderPtr instance otherwise.
+static inline JxlDecoderPtr JxlDecoderMake(
+    const JxlMemoryManager* memory_manager) {
+  return JxlDecoderPtr(JxlDecoderCreate(memory_manager));
+}
+
+#endif  // JXL_DECODE_CXX_H_
+
+/// @}
diff --git a/third_party/jpeg-xl/lib/include/jxl/encode.h b/third_party/jpeg-xl/lib/include/jxl/encode.h
new file mode 100644
index 0000000000..bd2906e20a
--- /dev/null
+++ b/third_party/jpeg-xl/lib/include/jxl/encode.h
@@ -0,0 +1,1213 @@
+/* Copyright (c) the JPEG XL Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style
+ * license that can be found in the LICENSE file.
+ */
+
+/** @addtogroup libjxl_encoder
+ * @{
+ * @file encode.h
+ * @brief Encoding API for JPEG XL.
+ */
+
+#ifndef JXL_ENCODE_H_
+#define JXL_ENCODE_H_
+
+#include <jxl/cms_interface.h>
+#include <jxl/codestream_header.h>
+#include <jxl/jxl_export.h>
+#include <jxl/memory_manager.h>
+#include <jxl/parallel_runner.h>
+#include <jxl/version.h>
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+/**
+ * Encoder library version.
+ *
+ * @return the encoder library version as an integer:
+ * MAJOR_VERSION * 1000000 + MINOR_VERSION * 1000 + PATCH_VERSION. For example,
+ * version 1.2.3 would return 1002003.
+ */
+JXL_EXPORT uint32_t JxlEncoderVersion(void);
+
+/**
+ * Opaque structure that holds the JPEG XL encoder.
+ *
+ * Allocated and initialized with JxlEncoderCreate().
+ * Cleaned up and deallocated with JxlEncoderDestroy().
+ */
+typedef struct JxlEncoderStruct JxlEncoder;
+
+/**
+ * Settings and metadata for a single image frame. This includes encoder options
+ * for a frame such as compression quality and speed.
+ *
+ * Allocated and initialized with JxlEncoderFrameSettingsCreate().
+ * Cleaned up and deallocated when the encoder is destroyed with
+ * JxlEncoderDestroy().
+ */
+typedef struct JxlEncoderFrameSettingsStruct JxlEncoderFrameSettings;
+
+/** DEPRECATED: Use JxlEncoderFrameSettings instead.
+ */
+typedef JxlEncoderFrameSettings JxlEncoderOptions;
+
+/**
+ * Return value for multiple encoder functions.
+ */
+typedef enum {
+  /** Function call finished successfully, or encoding is finished and there is
+   * nothing more to be done.
+   */
+  JXL_ENC_SUCCESS = 0,
+
+  /** An error occurred, for example out of memory.
+   */
+  JXL_ENC_ERROR = 1,
+
+  /** The encoder needs more output buffer to continue encoding.
+   */
+  JXL_ENC_NEED_MORE_OUTPUT = 2,
+
+  /** DEPRECATED: the encoder does not return this status and there is no need
+   * to handle or expect it.
+   * Instead, JXL_ENC_ERROR is returned with error condition
+   * JXL_ENC_ERR_NOT_SUPPORTED.
+   */
+  JXL_ENC_NOT_SUPPORTED = 3,
+
+} JxlEncoderStatus;
+
+/**
+ * Error conditions:
+ * API usage errors have the 0x80 bit set to 1
+ * Other errors have the 0x80 bit set to 0
+ */
+typedef enum {
+  /** No error
+   */
+  JXL_ENC_ERR_OK = 0,
+
+  /** Generic encoder error due to unspecified cause
+   */
+  JXL_ENC_ERR_GENERIC = 1,
+
+  /** Out of memory
+   *  TODO(jon): actually catch this and return this error
+   */
+  JXL_ENC_ERR_OOM = 2,
+
+  /** JPEG bitstream reconstruction data could not be
+   *  represented (e.g. too much tail data)
+   */
+  JXL_ENC_ERR_JBRD = 3,
+
+  /** Input is invalid (e.g. corrupt JPEG file or ICC profile)
+   */
+  JXL_ENC_ERR_BAD_INPUT = 4,
+
+  /** The encoder doesn't (yet) support this. Either no version of libjxl
+   * supports this, and the API is used incorrectly, or the libjxl version
+   * should have been checked before trying to do this.
+   */
+  JXL_ENC_ERR_NOT_SUPPORTED = 0x80,
+
+  /** The encoder API is used in an incorrect way.
+   *  In this case, a debug build of libjxl should output a specific error
+   * message. (if not, please open an issue about it)
+   */
+  JXL_ENC_ERR_API_USAGE = 0x81,
+
+} JxlEncoderError;
+
+/**
+ * Id of encoder options for a frame. This includes options such as setting
+ * encoding effort/speed or overriding the use of certain coding tools, for this
+ * frame. This does not include non-frame related encoder options such as for
+ * boxes.
+ */
+typedef enum {
+  /** Sets encoder effort/speed level without affecting decoding speed. Valid
+   * values are, from faster to slower speed: 1:lightning 2:thunder 3:falcon
+   * 4:cheetah 5:hare 6:wombat 7:squirrel 8:kitten 9:tortoise.
+   * Default: squirrel (7).
+   */
+  JXL_ENC_FRAME_SETTING_EFFORT = 0,
+
+  /** Sets the decoding speed tier for the provided options. Minimum is 0
+   * (slowest to decode, best quality/density), and maximum is 4 (fastest to
+   * decode, at the cost of some quality/density). Default is 0.
+   */
+  JXL_ENC_FRAME_SETTING_DECODING_SPEED = 1,
+
+  /** Sets resampling option. If enabled, the image is downsampled before
+   * compression, and upsampled to original size in the decoder. Integer option,
+   * use -1 for the default behavior (resampling only applied for low quality),
+   * 1 for no downsampling (1x1), 2 for 2x2 downsampling, 4 for 4x4
+   * downsampling, 8 for 8x8 downsampling.
+   */
+  JXL_ENC_FRAME_SETTING_RESAMPLING = 2,
+
+  /** Similar to JXL_ENC_FRAME_SETTING_RESAMPLING, but for extra channels.
+   * Integer option, use -1 for the default behavior (depends on encoder
+   * implementation), 1 for no downsampling (1x1), 2 for 2x2 downsampling, 4 for
+   * 4x4 downsampling, 8 for 8x8 downsampling.
+   */
+  JXL_ENC_FRAME_SETTING_EXTRA_CHANNEL_RESAMPLING = 3,
+
+  /** Indicates the frame added with @ref JxlEncoderAddImageFrame is already
+   * downsampled by the downsampling factor set with @ref
+   * JXL_ENC_FRAME_SETTING_RESAMPLING. The input frame must then be given in the
+   * downsampled resolution, not the full image resolution. The downsampled
+   * resolution is given by ceil(xsize / resampling), ceil(ysize / resampling)
+   * with xsize and ysize the dimensions given in the basic info, and resampling
+   * the factor set with @ref JXL_ENC_FRAME_SETTING_RESAMPLING.
+   * Use 0 to disable, 1 to enable. Default value is 0.
+   */
+  JXL_ENC_FRAME_SETTING_ALREADY_DOWNSAMPLED = 4,
+
+  /** Adds noise to the image emulating photographic film noise, the higher the
+   * given number, the grainier the image will be. As an example, a value of 100
+   * gives low noise whereas a value of 3200 gives a lot of noise. The default
+   * value is 0.
+   */
+  JXL_ENC_FRAME_SETTING_PHOTON_NOISE = 5,
+
+  /** Enables adaptive noise generation. This setting is not recommended for
+   * use, please use JXL_ENC_FRAME_SETTING_PHOTON_NOISE instead. Use -1 for the
+   * default (encoder chooses), 0 to disable, 1 to enable.
+   */
+  JXL_ENC_FRAME_SETTING_NOISE = 6,
+
+  /** Enables or disables dots generation. Use -1 for the default (encoder
+   * chooses), 0 to disable, 1 to enable.
+   */
+  JXL_ENC_FRAME_SETTING_DOTS = 7,
+
+  /** Enables or disables patches generation. Use -1 for the default (encoder
+   * chooses), 0 to disable, 1 to enable.
+   */
+  JXL_ENC_FRAME_SETTING_PATCHES = 8,
+
+  /** Edge preserving filter level, -1 to 3. Use -1 for the default (encoder
+   * chooses), 0 to 3 to set a strength.
+   */
+  JXL_ENC_FRAME_SETTING_EPF = 9,
+
+  /** Enables or disables the gaborish filter. Use -1 for the default (encoder
+   * chooses), 0 to disable, 1 to enable.
+   */
+  JXL_ENC_FRAME_SETTING_GABORISH = 10,
+
+  /** Enables modular encoding. Use -1 for default (encoder
+   * chooses), 0 to enforce VarDCT mode (e.g. for photographic images), 1 to
+   * enforce modular mode (e.g. for lossless images).
+   */
+  JXL_ENC_FRAME_SETTING_MODULAR = 11,
+
+  /** Enables or disables preserving color of invisible pixels. Use -1 for the
+   * default (1 if lossless, 0 if lossy), 0 to disable, 1 to enable.
+   */
+  JXL_ENC_FRAME_SETTING_KEEP_INVISIBLE = 12,
+
+  /** Determines the order in which 256x256 regions are stored in the codestream
+   * for progressive rendering. Use -1 for the encoder
+   * default, 0 for scanline order, 1 for center-first order.
+   */
+  JXL_ENC_FRAME_SETTING_GROUP_ORDER = 13,
+
+  /** Determines the horizontal position of center for the center-first group
+   * order. Use -1 to automatically use the middle of the image, 0..xsize to
+   * specifically set it.
+   */
+  JXL_ENC_FRAME_SETTING_GROUP_ORDER_CENTER_X = 14,
+
+  /** Determines the center for the center-first group order. Use -1 to
+   * automatically use the middle of the image, 0..ysize to specifically set it.
+   */
+  JXL_ENC_FRAME_SETTING_GROUP_ORDER_CENTER_Y = 15,
+
+  /** Enables or disables progressive encoding for modular mode. Use -1 for the
+   * encoder default, 0 to disable, 1 to enable.
+   */
+  JXL_ENC_FRAME_SETTING_RESPONSIVE = 16,
+
+  /** Set the progressive mode for the AC coefficients of VarDCT, using spectral
+   * progression from the DCT coefficients. Use -1 for the encoder default, 0 to
+   * disable, 1 to enable.
+   */
+  JXL_ENC_FRAME_SETTING_PROGRESSIVE_AC = 17,
+
+  /** Set the progressive mode for the AC coefficients of VarDCT, using
+   * quantization of the least significant bits. Use -1 for the encoder default,
+   * 0 to disable, 1 to enable.
+   */
+  JXL_ENC_FRAME_SETTING_QPROGRESSIVE_AC = 18,
+
+  /** Set the progressive mode using lower-resolution DC images for VarDCT. Use
+   * -1 for the encoder default, 0 to disable, 1 to have an extra 64x64 lower
+   * resolution pass, 2 to have a 512x512 and 64x64 lower resolution pass.
+   */
+  JXL_ENC_FRAME_SETTING_PROGRESSIVE_DC = 19,
+
+  /** Use Global channel palette if the amount of colors is smaller than this
+   * percentage of range. Use 0-100 to set an explicit percentage, -1 to use the
+   * encoder default. Used for modular encoding.
+   */
+  JXL_ENC_FRAME_SETTING_CHANNEL_COLORS_GLOBAL_PERCENT = 20,
+
+  /** Use Local (per-group) channel palette if the amount of colors is smaller
+   * than this percentage of range. Use 0-100 to set an explicit percentage, -1
+   * to use the encoder default. Used for modular encoding.
+   */
+  JXL_ENC_FRAME_SETTING_CHANNEL_COLORS_GROUP_PERCENT = 21,
+
+  /** Use color palette if amount of colors is smaller than or equal to this
+   * amount, or -1 to use the encoder default. Used for modular encoding.
+   */
+  JXL_ENC_FRAME_SETTING_PALETTE_COLORS = 22,
+
+  /** Enables or disables delta palette. Use -1 for the default (encoder
+   * chooses), 0 to disable, 1 to enable. Used in modular mode.
+   */
+  JXL_ENC_FRAME_SETTING_LOSSY_PALETTE = 23,
+
+  /** Color transform for internal encoding: -1 = default, 0=XYB, 1=none (RGB),
+   * 2=YCbCr. The XYB setting performs the forward XYB transform. None and
+   * YCbCr both perform no transform, but YCbCr is used to indicate that the
+   * encoded data losslessly represents YCbCr values.
+   */
+  JXL_ENC_FRAME_SETTING_COLOR_TRANSFORM = 24,
+
+  /** Reversible color transform for modular encoding: -1=default, 0-41=RCT
+   * index, e.g. index 0 = none, index 6 = YCoCg.
+   * If this option is set to a non-default value, the RCT will be globally
+   * applied to the whole frame.
+   * The default behavior is to try several RCTs locally per modular group,
+   * depending on the speed and distance setting.
+   */
+  JXL_ENC_FRAME_SETTING_MODULAR_COLOR_SPACE = 25,
+
+  /** Group size for modular encoding: -1=default, 0=128, 1=256, 2=512, 3=1024.
+   */
+  JXL_ENC_FRAME_SETTING_MODULAR_GROUP_SIZE = 26,
+
+  /** Predictor for modular encoding. -1 = default, 0=zero, 1=left, 2=top,
+   * 3=avg0, 4=select, 5=gradient, 6=weighted, 7=topright, 8=topleft,
+   * 9=leftleft, 10=avg1, 11=avg2, 12=avg3, 13=toptop predictive average 14=mix
+   * 5 and 6, 15=mix everything.
+   */
+  JXL_ENC_FRAME_SETTING_MODULAR_PREDICTOR = 27,
+
+  /** Fraction of pixels used to learn MA trees as a percentage. -1 = default,
+   * 0 = no MA and fast decode, 50 = default value, 100 = all, values above
+   * 100 are also permitted. Higher values use more encoder memory.
+   */
+  JXL_ENC_FRAME_SETTING_MODULAR_MA_TREE_LEARNING_PERCENT = 28,
+
+  /** Number of extra (previous-channel) MA tree properties to use. -1 =
+   * default, 0-11 = valid values. Recommended values are in the range 0 to 3,
+   * or 0 to amount of channels minus 1 (including all extra channels, and
+   * excluding color channels when using VarDCT mode). Higher value gives slower
+   * encoding and slower decoding.
+   */
+  JXL_ENC_FRAME_SETTING_MODULAR_NB_PREV_CHANNELS = 29,
+
+  /** Enable or disable CFL (chroma-from-luma) for lossless JPEG recompression.
+   * -1 = default, 0 = disable CFL, 1 = enable CFL.
+   */
+  JXL_ENC_FRAME_SETTING_JPEG_RECON_CFL = 30,
+
+  /** Prepare the frame for indexing in the frame index box.
+   * 0 = ignore this frame (same as not setting a value),
+   * 1 = index this frame within the Frame Index Box.
+   * If any frames are indexed, the first frame needs to
+   * be indexed, too. If the first frame is not indexed, and
+   * a later frame is attempted to be indexed, JXL_ENC_ERROR will occur.
+   * If non-keyframes, i.e., frames with cropping, blending or patches are
+   * attempted to be indexed, JXL_ENC_ERROR will occur.
+   */
+  JXL_ENC_FRAME_INDEX_BOX = 31,
+
+  /** Sets brotli encode effort for use in JPEG recompression and compressed
+   * metadata boxes (brob). Can be -1 (default) or 0 (fastest) to 11 (slowest).
+   * Default is based on the general encode effort in case of JPEG
+   * recompression, and 4 for brob boxes.
+   */
+  JXL_ENC_FRAME_SETTING_BROTLI_EFFORT = 32,
+
+  /** Enables or disables brotli compression of metadata boxes derived from
+   * a JPEG frame when using JxlEncoderAddJPEGFrame. This has no effect on boxes
+   * added using JxlEncoderAddBox.
+   * -1 = default, 0 = disable compression, 1 = enable compression.
+   */
+  JXL_ENC_FRAME_SETTING_JPEG_COMPRESS_BOXES = 33,
+
+  /** Enum value not to be used as an option. This value is added to force the
+   * C compiler to have the enum to take a known size.
+   */
+  JXL_ENC_FRAME_SETTING_FILL_ENUM = 65535,
+
+} JxlEncoderFrameSettingId;
+
+/**
+ * Creates an instance of JxlEncoder and initializes it.
+ *
+ * @p memory_manager will be used for all the library dynamic allocations made
+ * from this instance. The parameter may be NULL, in which case the default
+ * allocator will be used. See jpegxl/memory_manager.h for details.
+ *
+ * @param memory_manager custom allocator function. It may be NULL. The memory
+ *        manager will be copied internally.
+ * @return @c NULL if the instance can not be allocated or initialized
+ * @return pointer to initialized JxlEncoder otherwise
+ */
+JXL_EXPORT JxlEncoder* JxlEncoderCreate(const JxlMemoryManager* memory_manager);
+
+/**
+ * Re-initializes a JxlEncoder instance, so it can be re-used for encoding
+ * another image. All state and settings are reset as if the object was
+ * newly created with JxlEncoderCreate, but the memory manager is kept.
+ *
+ * @param enc instance to be re-initialized.
+ */
+JXL_EXPORT void JxlEncoderReset(JxlEncoder* enc);
+
+/**
+ * Deinitializes and frees JxlEncoder instance.
+ *
+ * @param enc instance to be cleaned up and deallocated.
+ */
+JXL_EXPORT void JxlEncoderDestroy(JxlEncoder* enc);
+
+/**
+ * Sets the color management system (CMS) that will be used for color conversion
+ * (if applicable) during encoding. May only be set before starting encoding. If
+ * left unset, the default CMS implementation will be used.
+ *
+ * @param enc encoder object.
+ * @param cms structure representing a CMS implementation. See JxlCmsInterface
+ * for more details.
+ */
+JXL_EXPORT void JxlEncoderSetCms(JxlEncoder* enc, JxlCmsInterface cms);
+
+/**
+ * Set the parallel runner for multithreading. May only be set before starting
+ * encoding.
+ *
+ * @param enc encoder object.
+ * @param parallel_runner function pointer to runner for multithreading. It may
+ *        be NULL to use the default, single-threaded, runner. A multithreaded
+ *        runner should be set to reach fast performance.
+ * @param parallel_runner_opaque opaque pointer for parallel_runner.
+ * @return JXL_ENC_SUCCESS if the runner was set, JXL_ENC_ERROR
+ * otherwise (the previous runner remains set).
+ */
+JXL_EXPORT JxlEncoderStatus
+JxlEncoderSetParallelRunner(JxlEncoder* enc, JxlParallelRunner parallel_runner,
+                            void* parallel_runner_opaque);
+
+/**
+ * Get the (last) error code in case JXL_ENC_ERROR was returned.
+ *
+ * @param enc encoder object.
+ * @return the JxlEncoderError that caused the (last) JXL_ENC_ERROR to be
+ * returned.
+ */
+JXL_EXPORT JxlEncoderError JxlEncoderGetError(JxlEncoder* enc);
+
+/**
+ * Encodes JPEG XL file using the available bytes. @p *avail_out indicates how
+ * many output bytes are available, and @p *next_out points to the input bytes.
+ * *avail_out will be decremented by the amount of bytes that have been
+ * processed by the encoder and *next_out will be incremented by the same
+ * amount, so *next_out will now point at the amount of *avail_out unprocessed
+ * bytes.
+ *
+ * The returned status indicates whether the encoder needs more output bytes.
+ * When the return value is not JXL_ENC_ERROR or JXL_ENC_SUCCESS, the encoding
+ * requires more JxlEncoderProcessOutput calls to continue.
+ *
+ * The caller must guarantee that *avail_out >= 32 when calling
+ * JxlEncoderProcessOutput; otherwise, JXL_ENC_NEED_MORE_OUTPUT will be
+ * returned. It is guaranteed that, if *avail_out >= 32, at least one byte of
+ * output will be written.
+ *
+ * This encodes the frames and/or boxes added so far. If the last frame or last
+ * box has been added, @ref JxlEncoderCloseInput, @ref JxlEncoderCloseFrames
+ * and/or @ref JxlEncoderCloseBoxes must be called before the next
+ * @ref JxlEncoderProcessOutput call, or the codestream won't be encoded
+ * correctly.
+ *
+ * @param enc encoder object.
+ * @param next_out pointer to next bytes to write to.
+ * @param avail_out amount of bytes available starting from *next_out.
+ * @return JXL_ENC_SUCCESS when encoding finished and all events handled.
+ * @return JXL_ENC_ERROR when encoding failed, e.g. invalid input.
+ * @return JXL_ENC_NEED_MORE_OUTPUT more output buffer is necessary.
+ */
+JXL_EXPORT JxlEncoderStatus JxlEncoderProcessOutput(JxlEncoder* enc,
+                                                    uint8_t** next_out,
+                                                    size_t* avail_out);
+
+/**
+ * Sets the frame information for this frame to the encoder. This includes
+ * animation information such as frame duration to store in the frame header.
+ * The frame header fields represent the frame as passed to the encoder, but not
+ * necessarily the exact values as they will be encoded file format: the encoder
+ * could change crop and blending options of a frame for more efficient encoding
+ * or introduce additional internal frames. Animation duration and time code
+ * information is not altered since those are immutable metadata of the frame.
+ *
+ * It is not required to use this function, however if have_animation is set
+ * to true in the basic info, then this function should be used to set the
+ * time duration of this individual frame. By default individual frames have a
+ * time duration of 0, making them form a composite still. See @ref
+ * JxlFrameHeader for more information.
+ *
+ * This information is stored in the JxlEncoderFrameSettings and so is used for
+ * any frame encoded with these JxlEncoderFrameSettings. It is ok to change
+ * between @ref JxlEncoderAddImageFrame calls, each added image frame will have
+ * the frame header that was set in the options at the time of calling
+ * JxlEncoderAddImageFrame.
+ *
+ * The is_last and name_length fields of the JxlFrameHeader are ignored, use
+ * @ref JxlEncoderCloseFrames to indicate last frame, and @ref
+ * JxlEncoderSetFrameName to indicate the name and its length instead.
+ * Calling this function will clear any name that was previously set with @ref
+ * JxlEncoderSetFrameName.
+ *
+ * @param frame_settings set of options and metadata for this frame. Also
+ * includes reference to the encoder object.
+ * @param frame_header frame header data to set. Object owned by the caller and
+ * does not need to be kept in memory, its information is copied internally.
+ * @return JXL_ENC_SUCCESS on success, JXL_ENC_ERROR on error
+ */
+JXL_EXPORT JxlEncoderStatus
+JxlEncoderSetFrameHeader(JxlEncoderFrameSettings* frame_settings,
+                         const JxlFrameHeader* frame_header);
+
+/**
+ * Sets blend info of an extra channel. The blend info of extra channels is set
+ * separately from that of the color channels, the color channels are set with
+ * @ref JxlEncoderSetFrameHeader.
+ *
+ * @param frame_settings set of options and metadata for this frame. Also
+ * includes reference to the encoder object.
+ * @param index index of the extra channel to use.
+ * @param blend_info blend info to set for the extra channel
+ * @return JXL_ENC_SUCCESS on success, JXL_ENC_ERROR on error
+ */
+JXL_EXPORT JxlEncoderStatus JxlEncoderSetExtraChannelBlendInfo(
+    JxlEncoderFrameSettings* frame_settings, size_t index,
+    const JxlBlendInfo* blend_info);
+
+/**
+ * Sets the name of the animation frame. This function is optional, frames are
+ * not required to have a name. This setting is a part of the frame header, and
+ * the same principles as for @ref JxlEncoderSetFrameHeader apply. The
+ * name_length field of JxlFrameHeader is ignored by the encoder, this function
+ * determines the name length instead as the length in bytes of the C string.
+ *
+ * The maximum possible name length is 1071 bytes (excluding terminating null
+ * character).
+ *
+ * Calling @ref JxlEncoderSetFrameHeader clears any name that was
+ * previously set.
+ *
+ * @param frame_settings set of options and metadata for this frame. Also
+ * includes reference to the encoder object.
+ * @param frame_name name of the next frame to be encoded, as a UTF-8 encoded C
+ * string (zero terminated). Owned by the caller, and copied internally.
+ * @return JXL_ENC_SUCCESS on success, JXL_ENC_ERROR on error
+ */
+JXL_EXPORT JxlEncoderStatus JxlEncoderSetFrameName(
+    JxlEncoderFrameSettings* frame_settings, const char* frame_name);
+
+/**
+ * Sets the bit depth of the input buffer.
+ *
+ * For float pixel formats, only the default JXL_BIT_DEPTH_FROM_PIXEL_FORMAT
+ * setting is allowed, while for unsigned pixel formats,
+ * JXL_BIT_DEPTH_FROM_CODESTREAM setting is also allowed. See the comment on
+ * @ref JxlEncoderAddImageFrame for the effects of the bit depth setting.
+
+ * @param frame_settings set of options and metadata for this frame. Also
+ * includes reference to the encoder object.
+ * @param bit_depth the bit depth setting of the pixel input
+ * @return JXL_ENC_SUCCESS on success, JXL_ENC_ERROR on error
+ */
+JXL_EXPORT JxlEncoderStatus JxlEncoderSetFrameBitDepth(
+    JxlEncoderFrameSettings* frame_settings, const JxlBitDepth* bit_depth);
+
+/**
+ * Sets the buffer to read JPEG encoded bytes from for the next frame to encode.
+ *
+ * If JxlEncoderSetBasicInfo has not yet been called, calling
+ * JxlEncoderAddJPEGFrame will implicitly call it with the parameters of the
+ * added JPEG frame.
+ *
+ * If JxlEncoderSetColorEncoding or JxlEncoderSetICCProfile has not yet been
+ * called, calling JxlEncoderAddJPEGFrame will implicitly call it with the
+ * parameters of the added JPEG frame.
+ *
+ * If the encoder is set to store JPEG reconstruction metadata using @ref
+ * JxlEncoderStoreJPEGMetadata and a single JPEG frame is added, it will be
+ * possible to losslessly reconstruct the JPEG codestream.
+ *
+ * If this is the last frame, @ref JxlEncoderCloseInput or @ref
+ * JxlEncoderCloseFrames must be called before the next
+ * @ref JxlEncoderProcessOutput call.
+ *
+ * @param frame_settings set of options and metadata for this frame. Also
+ * includes reference to the encoder object.
+ * @param buffer bytes to read JPEG from. Owned by the caller and its contents
+ * are copied internally.
+ * @param size size of buffer in bytes.
+ * @return JXL_ENC_SUCCESS on success, JXL_ENC_ERROR on error
+ */
+JXL_EXPORT JxlEncoderStatus
+JxlEncoderAddJPEGFrame(const JxlEncoderFrameSettings* frame_settings,
+                       const uint8_t* buffer, size_t size);
+
+/**
+ * Sets the buffer to read pixels from for the next image to encode. Must call
+ * JxlEncoderSetBasicInfo before JxlEncoderAddImageFrame.
+ *
+ * Currently only some data types for pixel formats are supported:
+ * - JXL_TYPE_UINT8, with range 0..255
+ * - JXL_TYPE_UINT16, with range 0..65535
+ * - JXL_TYPE_FLOAT16, with nominal range 0..1
+ * - JXL_TYPE_FLOAT, with nominal range 0..1
+ *
+ * Note: the sample data type in pixel_format is allowed to be different from
+ * what is described in the JxlBasicInfo. The type in pixel_format, together
+ * with an optional @ref JxlBitDepth parameter set by @ref
+ * JxlEncoderSetFrameBitDepth describes the format of the uncompressed pixel
+ * buffer. The bits_per_sample and exponent_bits_per_sample in the JxlBasicInfo
+ * describes what will actually be encoded in the JPEG XL codestream.
+ * For example, to encode a 12-bit image, you would set bits_per_sample to 12,
+ * while the input frame buffer can be in the following formats:
+ *  - if pixel format is in JXL_TYPE_UINT16 with default bit depth setting
+ *    (i.e. JXL_BIT_DEPTH_FROM_PIXEL_FORMAT), input sample values are rescaled
+ *    to 16-bit, i.e. multiplied by 65535/4095;
+ *  - if pixel format is in JXL_TYPE_UINT16 with JXL_BIT_DEPTH_FROM_CODESTREAM
+ *    bit depth setting, input sample values are provided unscaled;
+ *  - if pixel format is in JXL_TYPE_FLOAT, input sample values are rescaled
+ *    to 0..1, i.e.  multiplied by 1.f/4095.f.
+ * While it is allowed, it is obviously not recommended to use a pixel_format
+ * with lower precision than what is specified in the JxlBasicInfo.
+ *
+ * We support interleaved channels as described by the JxlPixelFormat:
+ * - single-channel data, e.g. grayscale
+ * - single-channel + alpha
+ * - trichromatic, e.g. RGB
+ * - trichromatic + alpha
+ *
+ * Extra channels not handled here need to be set by @ref
+ * JxlEncoderSetExtraChannelBuffer.
+ * If the image has alpha, and alpha is not passed here, it will implicitly be
+ * set to all-opaque (an alpha value of 1.0 everywhere).
+ *
+ * The pixels are assumed to be encoded in the original profile that is set with
+ * JxlEncoderSetColorEncoding or JxlEncoderSetICCProfile. If none of these
+ * functions were used, the pixels are assumed to be nonlinear sRGB for integer
+ * data types (JXL_TYPE_UINT8, JXL_TYPE_UINT16), and linear sRGB for floating
+ * point data types (JXL_TYPE_FLOAT16, JXL_TYPE_FLOAT).
+ *
+ * Sample values in floating-point pixel formats are allowed to be outside the
+ * nominal range, e.g. to represent out-of-sRGB-gamut colors in the
+ * uses_original_profile=false case. They are however not allowed to be NaN or
+ * +-infinity.
+ *
+ * If this is the last frame, @ref JxlEncoderCloseInput or @ref
+ * JxlEncoderCloseFrames must be called before the next
+ * @ref JxlEncoderProcessOutput call.
+ *
+ * @param frame_settings set of options and metadata for this frame. Also
+ * includes reference to the encoder object.
+ * @param pixel_format format for pixels. Object owned by the caller and its
+ * contents are copied internally.
+ * @param buffer buffer type to input the pixel data from. Owned by the caller
+ * and its contents are copied internally.
+ * @param size size of buffer in bytes. This size should match what is implied
+ * by the frame dimensions and the pixel format.
+ * @return JXL_ENC_SUCCESS on success, JXL_ENC_ERROR on error
+ */
+JXL_EXPORT JxlEncoderStatus JxlEncoderAddImageFrame(
+    const JxlEncoderFrameSettings* frame_settings,
+    const JxlPixelFormat* pixel_format, const void* buffer, size_t size);
+
+/**
+ * Sets the buffer to read pixels from for an extra channel at a given index.
+ * The index must be smaller than the num_extra_channels in the associated
+ * JxlBasicInfo. Must call @ref JxlEncoderSetExtraChannelInfo before
+ * JxlEncoderSetExtraChannelBuffer.
+ *
+ * TODO(firsching): mention what data types in pixel formats are supported.
+ *
+ * It is required to call this function for every extra channel, except for the
+ * alpha channel if that was already set through @ref JxlEncoderAddImageFrame.
+ *
+ * @param frame_settings set of options and metadata for this frame. Also
+ * includes reference to the encoder object.
+ * @param pixel_format format for pixels. Object owned by the caller and its
+ * contents are copied internally. The num_channels value is ignored, since the
+ * number of channels for an extra channel is always assumed to be one.
+ * @param buffer buffer type to input the pixel data from. Owned by the caller
+ * and its contents are copied internally.
+ * @param size size of buffer in bytes. This size should match what is implied
+ * by the frame dimensions and the pixel format.
+ * @param index index of the extra channel to use.
+ * @return JXL_ENC_SUCCESS on success, JXL_ENC_ERROR on error
+ */
+JXL_EXPORT JxlEncoderStatus JxlEncoderSetExtraChannelBuffer(
+    const JxlEncoderFrameSettings* frame_settings,
+    const JxlPixelFormat* pixel_format, const void* buffer, size_t size,
+    uint32_t index);
+
+/** Adds a metadata box to the file format. JxlEncoderProcessOutput must be used
+ * to effectively write the box to the output. @ref JxlEncoderUseBoxes must
+ * be enabled before using this function.
+ *
+ * Boxes allow inserting application-specific data and metadata (Exif, XML/XMP,
+ * JUMBF and user defined boxes).
+ *
+ * The box format follows ISO BMFF and shares features and box types with other
+ * image and video formats, including the Exif, XML and JUMBF boxes. The box
+ * format for JPEG XL is specified in ISO/IEC 18181-2.
+ *
+ * Boxes in general don't contain other boxes inside, except a JUMBF superbox.
+ * Boxes follow each other sequentially and are byte-aligned. If the container
+ * format is used, the JXL stream consists of concatenated boxes.
+ * It is also possible to use a direct codestream without boxes, but in that
+ * case metadata cannot be added.
+ *
+ * Each box generally has the following byte structure in the file:
+ * - 4 bytes: box size including box header (Big endian. If set to 0, an
+ *   8-byte 64-bit size follows instead).
+ * - 4 bytes: type, e.g. "JXL " for the signature box, "jxlc" for a codestream
+ *   box.
+ * - N bytes: box contents.
+ *
+ * Only the box contents are provided to the contents argument of this function,
+ * the encoder encodes the size header itself. Most boxes are written
+ * automatically by the encoder as needed ("JXL ", "ftyp", "jxll", "jxlc",
+ * "jxlp", "jxli", "jbrd"), and this function only needs to be called to add
+ * optional metadata when encoding from pixels (using JxlEncoderAddImageFrame).
+ * When recompressing JPEG files (using JxlEncoderAddJPEGFrame), if the input
+ * JPEG contains EXIF, XMP or JUMBF metadata, the corresponding boxes are
+ * already added automatically.
+ *
+ * Box types are given by 4 characters. The following boxes can be added with
+ * this function:
+ * - "Exif": a box with EXIF metadata, can be added by libjxl users, or is
+ *   automatically added when needed for JPEG reconstruction. The contents of
+ *   this box must be prepended by a 4-byte tiff header offset, which may
+ *   be 4 zero bytes in case the tiff header follows immediately.
+ *   The EXIF metadata must be in sync with what is encoded in the JPEG XL
+ *   codestream, specifically the image orientation. While this is not
+ *   recommended in practice, in case of conflicting metadata, the JPEG XL
+ *   codestream takes precedence.
+ * - "xml ": a box with XML data, in particular XMP metadata, can be added by
+ *   libjxl users, or is automatically added when needed for JPEG reconstruction
+ * - "jumb": a JUMBF superbox, which can contain boxes with different types of
+ *   metadata inside. This box type can be added by the encoder transparently,
+ *   and other libraries to create and handle JUMBF content exist.
+ * - Application-specific boxes. Their typename should not begin with "jxl" or
+ *   "JXL" or conflict with other existing typenames, and they should be
+ *   registered with MP4RA (mp4ra.org).
+ *
+ * These boxes can be stored uncompressed or Brotli-compressed (using a "brob"
+ * box), depending on the compress_box parameter.
+ *
+ * @param enc encoder object.
+ * @param type the box type, e.g. "Exif" for EXIF metadata, "xml " for XMP or
+ * IPTC metadata, "jumb" for JUMBF metadata.
+ * @param contents the full contents of the box, for example EXIF
+ * data. ISO BMFF box header must not be included, only the contents. Owned by
+ * the caller and its contents are copied internally.
+ * @param size size of the box contents.
+ * @param compress_box Whether to compress this box as a "brob" box. Requires
+ * Brotli support.
+ * @return JXL_ENC_SUCCESS on success, JXL_ENC_ERROR on error, such as when
+ * using this function without JxlEncoderUseContainer, or adding a box type
+ * that would result in an invalid file format.
+ */
+JXL_EXPORT JxlEncoderStatus JxlEncoderAddBox(JxlEncoder* enc,
+                                             const JxlBoxType type,
+                                             const uint8_t* contents,
+                                             size_t size,
+                                             JXL_BOOL compress_box);
+
+/**
+ * Indicates the intention to add metadata boxes. This allows @ref
+ * JxlEncoderAddBox to be used. When using this function, then it is required
+ * to use @ref JxlEncoderCloseBoxes at the end.
+ *
+ * By default the encoder assumes no metadata boxes will be added.
+ *
+ * This setting can only be set at the beginning, before encoding starts.
+ *
+ * @param enc encoder object.
+ */
+JXL_EXPORT JxlEncoderStatus JxlEncoderUseBoxes(JxlEncoder* enc);
+
+/**
+ * Declares that no further boxes will be added with @ref JxlEncoderAddBox.
+ * This function must be called after the last box is added so the encoder knows
+ * the stream will be finished. It is not necessary to use this function if
+ * @ref JxlEncoderUseBoxes is not used. Further frames may still be added.
+ *
+ * Must be called between JxlEncoderAddBox of the last box
+ * and the next call to JxlEncoderProcessOutput, or @ref JxlEncoderProcessOutput
+ * won't output the last box correctly.
+ *
+ * NOTE: if you don't need to close frames and boxes at separate times, you can
+ * use @ref JxlEncoderCloseInput instead to close both at once.
+ *
+ * @param enc encoder object.
+ */
+JXL_EXPORT void JxlEncoderCloseBoxes(JxlEncoder* enc);
+
+/**
+ * Declares that no frames will be added and @ref JxlEncoderAddImageFrame and
+ * @ref JxlEncoderAddJPEGFrame won't be called anymore. Further metadata boxes
+ * may still be added. This function or @ref JxlEncoderCloseInput must be called
+ * after adding the last frame and the next call to
+ * @ref JxlEncoderProcessOutput, or the frame won't be properly marked as last.
+ *
+ * NOTE: if you don't need to close frames and boxes at separate times, you can
+ * use @ref JxlEncoderCloseInput instead to close both at once.
+ *
+ * @param enc encoder object.
+ */
+JXL_EXPORT void JxlEncoderCloseFrames(JxlEncoder* enc);
+
+/**
+ * Closes any input to the encoder, equivalent to calling JxlEncoderCloseFrames
+ * as well as calling JxlEncoderCloseBoxes if needed. No further input of any
+ * kind may be given to the encoder, but further @ref JxlEncoderProcessOutput
+ * calls should be done to create the final output.
+ *
+ * The requirements of both @ref JxlEncoderCloseFrames and @ref
+ * JxlEncoderCloseBoxes apply to this function. Either this function or the
+ * other two must be called after the final frame and/or box, and the next
+ * @ref JxlEncoderProcessOutput call, or the codestream won't be encoded
+ * correctly.
+ *
+ * @param enc encoder object.
+ */
+JXL_EXPORT void JxlEncoderCloseInput(JxlEncoder* enc);
+
+/**
+ * Sets the original color encoding of the image encoded by this encoder. This
+ * is an alternative to JxlEncoderSetICCProfile and only one of these two must
+ * be used. This one sets the color encoding as a @ref JxlColorEncoding, while
+ * the other sets it as ICC binary data.
+ * Must be called after JxlEncoderSetBasicInfo.
+ *
+ * @param enc encoder object.
+ * @param color color encoding. Object owned by the caller and its contents are
+ * copied internally.
+ * @return JXL_ENC_SUCCESS if the operation was successful, JXL_ENC_ERROR or
+ * JXL_ENC_NOT_SUPPORTED otherwise
+ */
+JXL_EXPORT JxlEncoderStatus
+JxlEncoderSetColorEncoding(JxlEncoder* enc, const JxlColorEncoding* color);
+
+/**
+ * Sets the original color encoding of the image encoded by this encoder as an
+ * ICC color profile. This is an alternative to JxlEncoderSetColorEncoding and
+ * only one of these two must be used. This one sets the color encoding as ICC
+ * binary data, while the other defines it as a @ref JxlColorEncoding.
+ * Must be called after JxlEncoderSetBasicInfo.
+ *
+ * @param enc encoder object.
+ * @param icc_profile bytes of the original ICC profile
+ * @param size size of the icc_profile buffer in bytes
+ * @return JXL_ENC_SUCCESS if the operation was successful, JXL_ENC_ERROR or
+ * JXL_ENC_NOT_SUPPORTED otherwise
+ */
+JXL_EXPORT JxlEncoderStatus JxlEncoderSetICCProfile(JxlEncoder* enc,
+                                                    const uint8_t* icc_profile,
+                                                    size_t size);
+
+/**
+ * Initializes a JxlBasicInfo struct to default values.
+ * For forwards-compatibility, this function has to be called before values
+ * are assigned to the struct fields.
+ * The default values correspond to an 8-bit RGB image, no alpha or any
+ * other extra channels.
+ *
+ * @param info global image metadata. Object owned by the caller.
+ */
+JXL_EXPORT void JxlEncoderInitBasicInfo(JxlBasicInfo* info);
+
+/**
+ * Initializes a JxlFrameHeader struct to default values.
+ * For forwards-compatibility, this function has to be called before values
+ * are assigned to the struct fields.
+ * The default values correspond to a frame with no animation duration and the
+ * 'replace' blend mode. After using this function, For animation duration must
+ * be set, for composite still blend settings must be set.
+ *
+ * @param frame_header frame metadata. Object owned by the caller.
+ */
+JXL_EXPORT void JxlEncoderInitFrameHeader(JxlFrameHeader* frame_header);
+
+/**
+ * Initializes a JxlBlendInfo struct to default values.
+ * For forwards-compatibility, this function has to be called before values
+ * are assigned to the struct fields.
+ *
+ * @param blend_info blending info. Object owned by the caller.
+ */
+JXL_EXPORT void JxlEncoderInitBlendInfo(JxlBlendInfo* blend_info);
+
+/**
+ * Sets the global metadata of the image encoded by this encoder.
+ *
+ * If the JxlBasicInfo contains information of extra channels beyond an alpha
+ * channel, then @ref JxlEncoderSetExtraChannelInfo must be called between
+ * JxlEncoderSetBasicInfo and @ref JxlEncoderAddImageFrame. In order to indicate
+ * extra channels, the value of `info.num_extra_channels` should be set to the
+ * number of extra channels, also counting the alpha channel if present.
+ *
+ * @param enc encoder object.
+ * @param info global image metadata. Object owned by the caller and its
+ * contents are copied internally.
+ * @return JXL_ENC_SUCCESS if the operation was successful,
+ * JXL_ENC_ERROR or JXL_ENC_NOT_SUPPORTED otherwise
+ */
+JXL_EXPORT JxlEncoderStatus JxlEncoderSetBasicInfo(JxlEncoder* enc,
+                                                   const JxlBasicInfo* info);
+
+/**
+ * Initializes a JxlExtraChannelInfo struct to default values.
+ * For forwards-compatibility, this function has to be called before values
+ * are assigned to the struct fields.
+ * The default values correspond to an 8-bit channel of the provided type.
+ *
+ * @param type type of the extra channel.
+ * @param info global extra channel metadata. Object owned by the caller and its
+ * contents are copied internally.
+ */
+JXL_EXPORT void JxlEncoderInitExtraChannelInfo(JxlExtraChannelType type,
+                                               JxlExtraChannelInfo* info);
+
+/**
+ * Sets information for the extra channel at the given index. The index
+ * must be smaller than num_extra_channels in the associated JxlBasicInfo.
+ *
+ * @param enc encoder object
+ * @param index index of the extra channel to set.
+ * @param info global extra channel metadata. Object owned by the caller and its
+ * contents are copied internally.
+ * @return JXL_ENC_SUCCESS on success, JXL_ENC_ERROR on error
+ */
+JXL_EXPORT JxlEncoderStatus JxlEncoderSetExtraChannelInfo(
+    JxlEncoder* enc, size_t index, const JxlExtraChannelInfo* info);
+
+/**
+ * Sets the name for the extra channel at the given index in UTF-8. The index
+ * must be smaller than the num_extra_channels in the associated JxlBasicInfo.
+ *
+ * TODO(lode): remove size parameter for consistency with
+ * JxlEncoderSetFrameName
+ *
+ * @param enc encoder object
+ * @param index index of the extra channel to set.
+ * @param name buffer with the name of the extra channel.
+ * @param size size of the name buffer in bytes, not counting the terminating
+ * character.
+ * @return JXL_ENC_SUCCESS on success, JXL_ENC_ERROR on error
+ */
+JXL_EXPORT JxlEncoderStatus JxlEncoderSetExtraChannelName(JxlEncoder* enc,
+                                                          size_t index,
+                                                          const char* name,
+                                                          size_t size);
+
+/**
+ * Sets a frame-specific option of integer type to the encoder options.
+ * The JxlEncoderFrameSettingId argument determines which option is set.
+ *
+ * @param frame_settings set of options and metadata for this frame. Also
+ * includes reference to the encoder object.
+ * @param option ID of the option to set.
+ * @param value Integer value to set for this option.
+ * @return JXL_ENC_SUCCESS if the operation was successful, JXL_ENC_ERROR in
+ * case of an error, such as invalid or unknown option id, or invalid integer
+ * value for the given option. If an error is returned, the state of the
+ * JxlEncoderFrameSettings object is still valid and is the same as before this
+ * function was called.
+ */
+JXL_EXPORT JxlEncoderStatus JxlEncoderFrameSettingsSetOption(
+    JxlEncoderFrameSettings* frame_settings, JxlEncoderFrameSettingId option,
+    int64_t value);
+
+/**
+ * Sets a frame-specific option of float type to the encoder options.
+ * The JxlEncoderFrameSettingId argument determines which option is set.
+ *
+ * @param frame_settings set of options and metadata for this frame. Also
+ * includes reference to the encoder object.
+ * @param option ID of the option to set.
+ * @param value Float value to set for this option.
+ * @return JXL_ENC_SUCCESS if the operation was successful, JXL_ENC_ERROR in
+ * case of an error, such as invalid or unknown option id, or invalid integer
+ * value for the given option. If an error is returned, the state of the
+ * JxlEncoderFrameSettings object is still valid and is the same as before this
+ * function was called.
+ */
+JXL_EXPORT JxlEncoderStatus JxlEncoderFrameSettingsSetFloatOption(
+    JxlEncoderFrameSettings* frame_settings, JxlEncoderFrameSettingId option,
+    float value);
+
+/** Forces the encoder to use the box-based container format (BMFF) even
+ * when not necessary.
+ *
+ * When using @ref JxlEncoderUseBoxes, @ref JxlEncoderStoreJPEGMetadata or @ref
+ * JxlEncoderSetCodestreamLevel with level 10, the encoder will automatically
+ * also use the container format, it is not necessary to use
+ * JxlEncoderUseContainer for those use cases.
+ *
+ * By default this setting is disabled.
+ *
+ * This setting can only be set at the beginning, before encoding starts.
+ *
+ * @param enc encoder object.
+ * @param use_container true if the encoder should always output the JPEG XL
+ * container format, false to only output it when necessary.
+ * @return JXL_ENC_SUCCESS if the operation was successful, JXL_ENC_ERROR
+ * otherwise.
+ */
+JXL_EXPORT JxlEncoderStatus JxlEncoderUseContainer(JxlEncoder* enc,
+                                                   JXL_BOOL use_container);
+
+/**
+ * Configure the encoder to store JPEG reconstruction metadata in the JPEG XL
+ * container.
+ *
+ * If this is set to true and a single JPEG frame is added, it will be
+ * possible to losslessly reconstruct the JPEG codestream.
+ *
+ * This setting can only be set at the beginning, before encoding starts.
+ *
+ * @param enc encoder object.
+ * @param store_jpeg_metadata true if the encoder should store JPEG metadata.
+ * @return JXL_ENC_SUCCESS if the operation was successful, JXL_ENC_ERROR
+ * otherwise.
+ */
+JXL_EXPORT JxlEncoderStatus
+JxlEncoderStoreJPEGMetadata(JxlEncoder* enc, JXL_BOOL store_jpeg_metadata);
+
+/** Sets the feature level of the JPEG XL codestream. Valid values are 5 and
+ * 10, or -1 (to choose automatically). Using the minimum required level, or
+ * level 5 in most cases, is recommended for compatibility with all decoders.
+ *
+ * Level 5: for end-user image delivery, this level is the most widely
+ * supported level by image decoders and the recommended level to use unless a
+ * level 10 feature is absolutely necessary. Supports a maximum resolution
+ * 268435456 pixels total with a maximum width or height of 262144 pixels,
+ * maximum 16-bit color channel depth, maximum 120 frames per second for
+ * animation, maximum ICC color profile size of 4 MiB, it allows all color
+ * models and extra channel types except CMYK and the JXL_CHANNEL_BLACK extra
+ * channel, and a maximum of 4 extra channels in addition to the 3 color
+ * channels. It also sets boundaries to certain internally used coding tools.
+ *
+ * Level 10: this level removes or increases the bounds of most of the level
+ * 5 limitations, allows CMYK color and up to 32 bits per color channel, but
+ * may be less widely supported.
+ *
+ * The default value is -1. This means the encoder will automatically choose
+ * between level 5 and level 10 based on what information is inside the @ref
+ * JxlBasicInfo structure. Do note that some level 10 features, particularly
+ * those used by animated JPEG XL codestreams, might require level 10, even
+ * though the @ref JxlBasicInfo only suggests level 5. In this case, the level
+ * must be explicitly set to 10, otherwise the encoder will return an error.
+ * The encoder will restrict internal encoding choices to those compatible with
+ * the level setting.
+ *
+ * This setting can only be set at the beginning, before encoding starts.
+ *
+ * @param enc encoder object.
+ * @param level the level value to set, must be -1, 5, or 10.
+ * @return JXL_ENC_SUCCESS if the operation was successful, JXL_ENC_ERROR
+ * otherwise.
+ */
+JXL_EXPORT JxlEncoderStatus JxlEncoderSetCodestreamLevel(JxlEncoder* enc,
+                                                         int level);
+
+/** Returns the codestream level required to support the currently configured
+ * settings and basic info. This function can only be used at the beginning,
+ * before encoding starts, but after setting basic info.
+ *
+ * This does not support per-frame settings, only global configuration, such as
+ * the image dimensions, that are known at the time of writing the header of
+ * the JPEG XL file.
+ *
+ * If this returns 5, nothing needs to be done and the codestream can be
+ * compatible with any decoder. If this returns 10, JxlEncoderSetCodestreamLevel
+ * has to be used to set the codestream level to 10, or the encoder can be
+ * configured differently to allow using the more compatible level 5.
+ *
+ * @param enc encoder object.
+ * @return -1 if no level can support the configuration (e.g. image dimensions
+ * larger than even level 10 supports), 5 if level 5 is supported, 10 if setting
+ * the codestream level to 10 is required.
+ *
+ */
+JXL_EXPORT int JxlEncoderGetRequiredCodestreamLevel(const JxlEncoder* enc);
+
+/**
+ * Enables lossless encoding.
+ *
+ * This is not an option like the others on itself, but rather while enabled it
+ * overrides a set of existing options (such as distance, modular mode and
+ * color transform) that enables bit-for-bit lossless encoding.
+ *
+ * When disabled, those options are not overridden, but since those options
+ * could still have been manually set to a combination that operates losslessly,
+ * using this function with lossless set to JXL_DEC_FALSE does not guarantee
+ * lossy encoding, though the default set of options is lossy.
+ *
+ * @param frame_settings set of options and metadata for this frame. Also
+ * includes reference to the encoder object.
+ * @param lossless whether to override options for lossless mode
+ * @return JXL_ENC_SUCCESS if the operation was successful, JXL_ENC_ERROR
+ * otherwise.
+ */
+JXL_EXPORT JxlEncoderStatus JxlEncoderSetFrameLossless(
+    JxlEncoderFrameSettings* frame_settings, JXL_BOOL lossless);
+
+/** DEPRECATED: use JxlEncoderSetFrameLossless instead.
+ */
+JXL_EXPORT JxlEncoderStatus
+JxlEncoderOptionsSetLossless(JxlEncoderFrameSettings*, JXL_BOOL);
+
+/**
+ * @param frame_settings set of options and metadata for this frame. Also
+ * includes reference to the encoder object.
+ * @param effort the effort value to set.
+ * @return JXL_ENC_SUCCESS if the operation was successful, JXL_ENC_ERROR
+ * otherwise.
+ *
+ * DEPRECATED: use JxlEncoderFrameSettingsSetOption(frame_settings,
+ * JXL_ENC_FRAME_SETTING_EFFORT, effort) instead.
+ */
+JXL_DEPRECATED JXL_EXPORT JxlEncoderStatus
+JxlEncoderOptionsSetEffort(JxlEncoderFrameSettings* frame_settings, int effort);
+
+/**
+ * @param frame_settings set of options and metadata for this frame. Also
+ * includes reference to the encoder object.
+ * @param tier the decoding speed tier to set.
+ * @return JXL_ENC_SUCCESS if the operation was successful, JXL_ENC_ERROR
+ * otherwise.
+ *
+ * DEPRECATED: use JxlEncoderFrameSettingsSetOption(frame_settings,
+ * JXL_ENC_FRAME_SETTING_DECODING_SPEED, tier) instead.
+ */
+JXL_DEPRECATED JXL_EXPORT JxlEncoderStatus JxlEncoderOptionsSetDecodingSpeed(
+    JxlEncoderFrameSettings* frame_settings, int tier);
+
+/**
+ * Sets the distance level for lossy compression: target max butteraugli
+ * distance, lower = higher quality. Range: 0 .. 15.
+ * 0.0 = mathematically lossless (however, use JxlEncoderSetFrameLossless
+ * instead to use true lossless, as setting distance to 0 alone is not the only
+ * requirement). 1.0 = visually lossless. Recommended range: 0.5 .. 3.0. Default
+ * value: 1.0.
+ *
+ * @param frame_settings set of options and metadata for this frame. Also
+ * includes reference to the encoder object.
+ * @param distance the distance value to set.
+ * @return JXL_ENC_SUCCESS if the operation was successful, JXL_ENC_ERROR
+ * otherwise.
+ */
+JXL_EXPORT JxlEncoderStatus JxlEncoderSetFrameDistance(
+    JxlEncoderFrameSettings* frame_settings, float distance);
+
+/** DEPRECATED: use JxlEncoderSetFrameDistance instead.
+ */
+JXL_DEPRECATED JXL_EXPORT JxlEncoderStatus
+JxlEncoderOptionsSetDistance(JxlEncoderFrameSettings*, float);
+
+/**
+ * Sets the distance level for lossy compression of extra channels.
+ * The distance is as in JxlEncoderSetFrameDistance (lower = higher quality).
+ * If not set, or if set to the special value -1, the distance that was set with
+ * JxlEncoderSetFrameDistance will be used.
+ *
+ * @param frame_settings set of options and metadata for this frame. Also
+ * includes reference to the encoder object.
+ * @param index index of the extra channel to set a distance value for.
+ * @param distance the distance value to set.
+ * @return JXL_ENC_SUCCESS if the operation was successful, JXL_ENC_ERROR
+ * otherwise.
+ */
+JXL_EXPORT JxlEncoderStatus JxlEncoderSetExtraChannelDistance(
+    JxlEncoderFrameSettings* frame_settings, size_t index, float distance);
+
+/**
+ * Create a new set of encoder options, with all values initially copied from
+ * the @p source options, or set to default if @p source is NULL.
+ *
+ * The returned pointer is an opaque struct tied to the encoder and it will be
+ * deallocated by the encoder when JxlEncoderDestroy() is called. For functions
+ * taking both a @ref JxlEncoder and a @ref JxlEncoderFrameSettings, only
+ * JxlEncoderFrameSettings created with this function for the same encoder
+ * instance can be used.
+ *
+ * @param enc encoder object.
+ * @param source source options to copy initial values from, or NULL to get
+ * defaults initialized to defaults.
+ * @return the opaque struct pointer identifying a new set of encoder options.
+ */
+JXL_EXPORT JxlEncoderFrameSettings* JxlEncoderFrameSettingsCreate(
+    JxlEncoder* enc, const JxlEncoderFrameSettings* source);
+
+/** DEPRECATED: use JxlEncoderFrameSettingsCreate instead.
+ */
+JXL_DEPRECATED JXL_EXPORT JxlEncoderFrameSettings* JxlEncoderOptionsCreate(
+    JxlEncoder*, const JxlEncoderFrameSettings*);
+
+/**
+ * Sets a color encoding to be sRGB.
+ *
+ * @param color_encoding color encoding instance.
+ * @param is_gray whether the color encoding should be gray scale or color.
+ */
+JXL_EXPORT void JxlColorEncodingSetToSRGB(JxlColorEncoding* color_encoding,
+                                          JXL_BOOL is_gray);
+
+/**
+ * Sets a color encoding to be linear sRGB.
+ *
+ * @param color_encoding color encoding instance.
+ * @param is_gray whether the color encoding should be gray scale or color.
+ */
+JXL_EXPORT void JxlColorEncodingSetToLinearSRGB(
+    JxlColorEncoding* color_encoding, JXL_BOOL is_gray);
+
+/**
+ * Enables usage of expert options.
+ *
+ * At the moment, the only expert option is setting an effort value of 10,
+ * which gives the best compression for pixel-lossless modes but is very slow.
+ *
+ * @param enc encoder object.
+ */
+JXL_EXPORT void JxlEncoderAllowExpertOptions(JxlEncoder* enc);
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}
+#endif
+
+#endif /* JXL_ENCODE_H_ */
+
+/** @}*/
diff --git a/third_party/jpeg-xl/lib/include/jxl/encode_cxx.h b/third_party/jpeg-xl/lib/include/jxl/encode_cxx.h
new file mode 100644
index 0000000000..3889e12c14
--- /dev/null
+++ b/third_party/jpeg-xl/lib/include/jxl/encode_cxx.h
@@ -0,0 +1,57 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+/// @addtogroup libjxl_encoder
+///@{
+///
+/// @file encode_cxx.h
+/// @brief C++ header-only helper for @ref encode.h.
+///
+/// There's no binary library associated with the header since this is a header
+/// only library.
+
+#ifndef JXL_ENCODE_CXX_H_
+#define JXL_ENCODE_CXX_H_
+
+#include <jxl/encode.h>
+
+#include <memory>
+
+#if !(defined(__cplusplus) || defined(c_plusplus))
+#error "This a C++ only header. Use jxl/encode.h from C sources."
+#endif
+
+/// Struct to call JxlEncoderDestroy from the JxlEncoderPtr unique_ptr.
+struct JxlEncoderDestroyStruct {
+  /// Calls @ref JxlEncoderDestroy() on the passed encoder.
+  void operator()(JxlEncoder* encoder) { JxlEncoderDestroy(encoder); }
+};
+
+/// std::unique_ptr<> type that calls JxlEncoderDestroy() when releasing the
+/// encoder.
+///
+/// Use this helper type from C++ sources to ensure the encoder is destroyed and
+/// their internal resources released.
+typedef std::unique_ptr<JxlEncoder, JxlEncoderDestroyStruct> JxlEncoderPtr;
+
+/// Creates an instance of JxlEncoder into a JxlEncoderPtr and initializes it.
+///
+/// This function returns a unique_ptr that will call JxlEncoderDestroy() when
+/// releasing the pointer. See @ref JxlEncoderCreate for details on the
+/// instance creation.
+///
+/// @param memory_manager custom allocator function. It may be NULL. The memory
+///        manager will be copied internally.
+/// @return a @c NULL JxlEncoderPtr if the instance can not be allocated or
+///         initialized
+/// @return initialized JxlEncoderPtr instance otherwise.
+static inline JxlEncoderPtr JxlEncoderMake(
+    const JxlMemoryManager* memory_manager) {
+  return JxlEncoderPtr(JxlEncoderCreate(memory_manager));
+}
+
+#endif  // JXL_ENCODE_CXX_H_
+
+/// @}
diff --git a/third_party/jpeg-xl/lib/include/jxl/memory_manager.h b/third_party/jpeg-xl/lib/include/jxl/memory_manager.h
new file mode 100644
index 0000000000..52640a8beb
--- /dev/null
+++ b/third_party/jpeg-xl/lib/include/jxl/memory_manager.h
@@ -0,0 +1,72 @@
+/* Copyright (c) the JPEG XL Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style
+ * license that can be found in the LICENSE file.
+ */
+
+/** @addtogroup libjxl_common
+ * @{
+ * @file memory_manager.h
+ * @brief Abstraction functions used by JPEG XL to allocate memory.
+ */
+
+#ifndef JXL_MEMORY_MANAGER_H_
+#define JXL_MEMORY_MANAGER_H_
+
+#include <stddef.h>
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+/**
+ * Allocating function for a memory region of a given size.
+ *
+ * Allocates a contiguous memory region of size @p size bytes. The returned
+ * memory may not be aligned to a specific size or initialized at all.
+ *
+ * @param opaque custom memory manager handle provided by the caller.
+ * @param size in bytes of the requested memory region.
+ * @return @c NULL if the memory can not be allocated,
+ * @return pointer to the memory otherwise.
+ */
+typedef void* (*jpegxl_alloc_func)(void* opaque, size_t size);
+
+/**
+ * Deallocating function pointer type.
+ *
+ * This function @b MUST do nothing if @p address is @c NULL.
+ *
+ * @param opaque custom memory manager handle provided by the caller.
+ * @param address memory region pointer returned by ::jpegxl_alloc_func, or @c
+ * NULL.
+ */
+typedef void (*jpegxl_free_func)(void* opaque, void* address);
+
+/**
+ * Memory Manager struct.
+ * These functions, when provided by the caller, will be used to handle memory
+ * allocations.
+ */
+typedef struct JxlMemoryManagerStruct {
+  /** The opaque pointer that will be passed as the first parameter to all the
+   * functions in this struct. */
+  void* opaque;
+
+  /** Memory allocation function. This can be NULL if and only if also the
+   * free() member in this class is NULL. All dynamic memory will be allocated
+   * and freed with these functions if they are not NULL. */
+  jpegxl_alloc_func alloc;
+  /** Free function matching the alloc() member. */
+  jpegxl_free_func free;
+
+  /* TODO(deymo): Add cache-aligned alloc/free functions here. */
+} JxlMemoryManager;
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}
+#endif
+
+#endif /* JXL_MEMORY_MANAGER_H_ */
+
+/** @}*/
diff --git a/third_party/jpeg-xl/lib/include/jxl/parallel_runner.h b/third_party/jpeg-xl/lib/include/jxl/parallel_runner.h
new file mode 100644
index 0000000000..45394e972c
--- /dev/null
+++ b/third_party/jpeg-xl/lib/include/jxl/parallel_runner.h
@@ -0,0 +1,156 @@
+/* Copyright (c) the JPEG XL Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style
+ * license that can be found in the LICENSE file.
+ */
+
+/** @addtogroup libjxl_common
+ *  @{
+ */
+/**
+ * @file parallel_runner.h
+ */
+
+/** API for running data operations in parallel in a multi-threaded environment.
+ * This module allows the JPEG XL caller to define their own way of creating and
+ * assigning threads.
+ *
+ * The JxlParallelRunner function type defines a parallel data processing
+ * runner that may be implemented by the caller to allow the library to process
+ * in multiple threads. The multi-threaded processing in this library only
+ * requires to run the same function over each number of a range, possibly
+ * running each call in a different thread. The JPEG XL caller is responsible
+ * for implementing this logic using the thread APIs available in their system.
+ * For convenience, a C++ implementation based on std::thread is provided in
+ * jpegxl/parallel_runner_thread.h (part of the jpegxl_threads library).
+ *
+ * Thread pools usually store small numbers of heterogeneous tasks in a queue.
+ * When tasks are identical or differ only by an integer input parameter, it is
+ * much faster to store just one function of an integer parameter and call it
+ * for each value. Conventional vector-of-tasks can be run in parallel using a
+ * lambda function adapter that simply calls task_funcs[task].
+ *
+ * If no multi-threading is desired, a @c NULL value of JxlParallelRunner
+ * will use an internal implementation without multi-threading.
+ */
+
+#ifndef JXL_PARALLEL_RUNNER_H_
+#define JXL_PARALLEL_RUNNER_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+/** Return code used in the JxlParallel* functions as return value. A value
+ * of 0 means success and any other value means error. The special value
+ * JXL_PARALLEL_RET_RUNNER_ERROR can be used by the runner to indicate any
+ * other error.
+ */
+typedef int JxlParallelRetCode;
+
+/**
+ * General error returned by the JxlParallelRunInit function to indicate
+ * an error.
+ */
+#define JXL_PARALLEL_RET_RUNNER_ERROR (-1)
+
+/**
+ * Parallel run initialization callback. See JxlParallelRunner for details.
+ *
+ * This function MUST be called by the JxlParallelRunner only once, on the
+ * same thread that called JxlParallelRunner, before any parallel execution.
+ * The purpose of this call is to provide the maximum number of threads that the
+ * JxlParallelRunner will use, which can be used by JPEG XL to allocate
+ * per-thread storage if needed.
+ *
+ * @param jpegxl_opaque the @p jpegxl_opaque handle provided to
+ * JxlParallelRunner() must be passed here.
+ * @param num_threads the maximum number of threads. This value must be
+ * positive.
+ * @return 0 if the initialization process was successful.
+ * @return an error code if there was an error, which should be returned by
+ * JxlParallelRunner().
+ */
+typedef JxlParallelRetCode (*JxlParallelRunInit)(void* jpegxl_opaque,
+                                                 size_t num_threads);
+
+/**
+ * Parallel run data processing callback. See JxlParallelRunner for details.
+ *
+ * This function MUST be called once for every number in the range [start_range,
+ * end_range) (including start_range but not including end_range) passing this
+ * number as the @p value. Calls for different value may be executed from
+ * different threads in parallel.
+ *
+ * @param jpegxl_opaque the @p jpegxl_opaque handle provided to
+ * JxlParallelRunner() must be passed here.
+ * @param value the number in the range [start_range, end_range) of the call.
+ * @param thread_id the thread number where this function is being called from.
+ * This must be lower than the @p num_threads value passed to
+ * JxlParallelRunInit.
+ */
+typedef void (*JxlParallelRunFunction)(void* jpegxl_opaque, uint32_t value,
+                                       size_t thread_id);
+
+/**
+ * JxlParallelRunner function type. A parallel runner implementation can be
+ * provided by a JPEG XL caller to allow running computations in multiple
+ * threads. This function must call the initialization function @p init in the
+ * same thread that called it and then call the passed @p func once for every
+ * number in the range [start_range, end_range) (including start_range but not
+ * including end_range) possibly from different multiple threads in parallel.
+ *
+ * The JxlParallelRunner function does not need to be re-entrant. This means
+ * that the same JxlParallelRunner function with the same runner_opaque
+ * provided parameter will not be called from the library from either @p init or
+ * @p func in the same decoder or encoder instance. However, a single decoding
+ * or encoding instance may call the provided JxlParallelRunner multiple
+ * times for different parts of the decoding or encoding process.
+ *
+ * @return 0 if the @p init call succeeded (returned 0) and no other error
+ * occurred in the runner code.
+ * @return JXL_PARALLEL_RET_RUNNER_ERROR if an error occurred in the runner
+ * code, for example, setting up the threads.
+ * @return the return value of @p init() if non-zero.
+ */
+typedef JxlParallelRetCode (*JxlParallelRunner)(
+    void* runner_opaque, void* jpegxl_opaque, JxlParallelRunInit init,
+    JxlParallelRunFunction func, uint32_t start_range, uint32_t end_range);
+
+/* The following is an example of a JxlParallelRunner that doesn't use any
+ * multi-threading. Note that this implementation doesn't store any state
+ * between multiple calls of the ExampleSequentialRunner function, so the
+ * runner_opaque value is not used.
+
+  JxlParallelRetCode ExampleSequentialRunner(void* runner_opaque,
+                                                void* jpegxl_opaque,
+                                                JxlParallelRunInit init,
+                                                JxlParallelRunFunction func,
+                                                uint32_t start_range,
+                                                uint32_t end_range) {
+    // We only use one thread (the currently running thread).
+    JxlParallelRetCode init_ret = (*init)(jpegxl_opaque, 1);
+    if (init_ret != 0) return init_ret;
+
+    // In case of other initialization error (for example when initializing the
+    // threads) one can return JXL_PARALLEL_RET_RUNNER_ERROR.
+
+    for (uint32_t i = start_range; i < end_range; i++) {
+      // Every call is in the thread number 0. These don't need to be in any
+      // order.
+      (*func)(jpegxl_opaque, i, 0);
+    }
+    return 0;
+  }
+ */
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}
+#endif
+
+#endif /* JXL_PARALLEL_RUNNER_H_ */
+
+/** @}*/
diff --git a/third_party/jpeg-xl/lib/include/jxl/resizable_parallel_runner.h b/third_party/jpeg-xl/lib/include/jxl/resizable_parallel_runner.h
new file mode 100644
index 0000000000..196e66d30a
--- /dev/null
+++ b/third_party/jpeg-xl/lib/include/jxl/resizable_parallel_runner.h
@@ -0,0 +1,78 @@
+/* Copyright (c) the JPEG XL Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style
+ * license that can be found in the LICENSE file.
+ */
+
+/** @addtogroup libjxl_threads
+ * @{
+ * @file resizable_parallel_runner.h
+ * @brief implementation using std::thread of a resizeable ::JxlParallelRunner.
+ */
+
+/** Implementation of JxlParallelRunner than can be used to enable
+ * multithreading when using the JPEG XL library. This uses std::thread
+ * internally and related synchronization functions. The number of threads
+ * created can be changed after creation of the thread pool; the threads
+ * (including the main thread) are re-used for every
+ * ResizableParallelRunner::Runner call. Only one concurrent
+ * JxlResizableParallelRunner call per instance is allowed at a time.
+ *
+ * This is a scalable, lower-overhead thread pool runner, especially suitable
+ * for data-parallel computations in the fork-join model, where clients need to
+ * know when all tasks have completed.
+ *
+ * Compared to the implementation in @ref thread_parallel_runner.h, this
+ * implementation is tuned for execution on lower-powered systems, including
+ * for example ARM CPUs with big.LITTLE computation models.
+ */
+
+#ifndef JXL_RESIZABLE_PARALLEL_RUNNER_H_
+#define JXL_RESIZABLE_PARALLEL_RUNNER_H_
+
+#include <jxl/jxl_threads_export.h>
+#include <jxl/memory_manager.h>
+#include <jxl/parallel_runner.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+/** Parallel runner internally using std::thread. Use as JxlParallelRunner.
+ */
+JXL_THREADS_EXPORT JxlParallelRetCode JxlResizableParallelRunner(
+    void* runner_opaque, void* jpegxl_opaque, JxlParallelRunInit init,
+    JxlParallelRunFunction func, uint32_t start_range, uint32_t end_range);
+
+/** Creates the runner for JxlResizableParallelRunner. Use as the opaque
+ * runner. The runner will execute tasks on the calling thread until
+ * @ref JxlResizableParallelRunnerSetThreads is called.
+ */
+JXL_THREADS_EXPORT void* JxlResizableParallelRunnerCreate(
+    const JxlMemoryManager* memory_manager);
+
+/** Changes the number of threads for JxlResizableParallelRunner.
+ */
+JXL_THREADS_EXPORT void JxlResizableParallelRunnerSetThreads(
+    void* runner_opaque, size_t num_threads);
+
+/** Suggests a number of threads to use for an image of given size.
+ */
+JXL_THREADS_EXPORT uint32_t
+JxlResizableParallelRunnerSuggestThreads(uint64_t xsize, uint64_t ysize);
+
+/** Destroys the runner created by JxlResizableParallelRunnerCreate.
+ */
+JXL_THREADS_EXPORT void JxlResizableParallelRunnerDestroy(void* runner_opaque);
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}
+#endif
+
+#endif /* JXL_RESIZABLE_PARALLEL_RUNNER_H_ */
+
+/** @}*/
diff --git a/third_party/jpeg-xl/lib/include/jxl/resizable_parallel_runner_cxx.h b/third_party/jpeg-xl/lib/include/jxl/resizable_parallel_runner_cxx.h
new file mode 100644
index 0000000000..39bbbd283a
--- /dev/null
+++ b/third_party/jpeg-xl/lib/include/jxl/resizable_parallel_runner_cxx.h
@@ -0,0 +1,64 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+/// @addtogroup libjxl_threads
+/// @{
+///
+/// @file resizable_parallel_runner_cxx.h
+/// @ingroup libjxl_threads
+/// @brief C++ header-only helper for @ref resizable_parallel_runner.h.
+///
+/// There's no binary library associated with the header since this is a header
+/// only library.
+
+#ifndef JXL_RESIZABLE_PARALLEL_RUNNER_CXX_H_
+#define JXL_RESIZABLE_PARALLEL_RUNNER_CXX_H_
+
+#include <jxl/resizable_parallel_runner.h>
+
+#include <memory>
+
+#if !(defined(__cplusplus) || defined(c_plusplus))
+#error \
+    "This a C++ only header. Use jxl/jxl_resizable_parallel_runner.h from C" \
+    "sources."
+#endif
+
+/// Struct to call JxlResizableParallelRunnerDestroy from the
+/// JxlResizableParallelRunnerPtr unique_ptr.
+struct JxlResizableParallelRunnerDestroyStruct {
+  /// Calls @ref JxlResizableParallelRunnerDestroy() on the passed runner.
+  void operator()(void* runner) { JxlResizableParallelRunnerDestroy(runner); }
+};
+
+/// std::unique_ptr<> type that calls JxlResizableParallelRunnerDestroy() when
+/// releasing the runner.
+///
+/// Use this helper type from C++ sources to ensure the runner is destroyed and
+/// their internal resources released.
+typedef std::unique_ptr<void, JxlResizableParallelRunnerDestroyStruct>
+    JxlResizableParallelRunnerPtr;
+
+/// Creates an instance of JxlResizableParallelRunner into a
+/// JxlResizableParallelRunnerPtr and initializes it.
+///
+/// This function returns a unique_ptr that will call
+/// JxlResizableParallelRunnerDestroy() when releasing the pointer. See @ref
+/// JxlResizableParallelRunnerCreate for details on the instance creation.
+///
+/// @param memory_manager custom allocator function. It may be NULL. The memory
+///        manager will be copied internally.
+/// @return a @c NULL JxlResizableParallelRunnerPtr if the instance can not be
+/// allocated or initialized
+/// @return initialized JxlResizableParallelRunnerPtr instance otherwise.
+static inline JxlResizableParallelRunnerPtr JxlResizableParallelRunnerMake(
+    const JxlMemoryManager* memory_manager) {
+  return JxlResizableParallelRunnerPtr(
+      JxlResizableParallelRunnerCreate(memory_manager));
+}
+
+#endif  // JXL_RESIZABLE_PARALLEL_RUNNER_CXX_H_
+
+/// @}
diff --git a/third_party/jpeg-xl/lib/include/jxl/thread_parallel_runner.h b/third_party/jpeg-xl/lib/include/jxl/thread_parallel_runner.h
new file mode 100644
index 0000000000..715648b256
--- /dev/null
+++ b/third_party/jpeg-xl/lib/include/jxl/thread_parallel_runner.h
@@ -0,0 +1,72 @@
+/* Copyright (c) the JPEG XL Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style
+ * license that can be found in the LICENSE file.
+ */
+
+/** @addtogroup libjxl_threads
+ * @{
+ * @file thread_parallel_runner.h
+ * @brief implementation using std::thread of a ::JxlParallelRunner.
+ */
+
+/** Implementation of JxlParallelRunner than can be used to enable
+ * multithreading when using the JPEG XL library. This uses std::thread
+ * internally and related synchronization functions. The number of threads
+ * created is fixed at construction time and the threads are re-used for every
+ * ThreadParallelRunner::Runner call. Only one concurrent
+ * JxlThreadParallelRunner call per instance is allowed at a time.
+ *
+ * This is a scalable, lower-overhead thread pool runner, especially suitable
+ * for data-parallel computations in the fork-join model, where clients need to
+ * know when all tasks have completed.
+ *
+ * This thread pool can efficiently load-balance millions of tasks using an
+ * atomic counter, thus avoiding per-task virtual or system calls. With 48
+ * hyperthreads and 1M tasks that add to an atomic counter, overall runtime is
+ * 10-20x higher when using std::async, and ~200x for a queue-based thread
+ */
+
+#ifndef JXL_THREAD_PARALLEL_RUNNER_H_
+#define JXL_THREAD_PARALLEL_RUNNER_H_
+
+#include <jxl/jxl_threads_export.h>
+#include <jxl/memory_manager.h>
+#include <jxl/parallel_runner.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+/** Parallel runner internally using std::thread. Use as JxlParallelRunner.
+ */
+JXL_THREADS_EXPORT JxlParallelRetCode JxlThreadParallelRunner(
+    void* runner_opaque, void* jpegxl_opaque, JxlParallelRunInit init,
+    JxlParallelRunFunction func, uint32_t start_range, uint32_t end_range);
+
+/** Creates the runner for JxlThreadParallelRunner. Use as the opaque
+ * runner.
+ */
+JXL_THREADS_EXPORT void* JxlThreadParallelRunnerCreate(
+    const JxlMemoryManager* memory_manager, size_t num_worker_threads);
+
+/** Destroys the runner created by JxlThreadParallelRunnerCreate.
+ */
+JXL_THREADS_EXPORT void JxlThreadParallelRunnerDestroy(void* runner_opaque);
+
+/** Returns a default num_worker_threads value for
+ * JxlThreadParallelRunnerCreate.
+ */
+JXL_THREADS_EXPORT size_t JxlThreadParallelRunnerDefaultNumWorkerThreads();
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}
+#endif
+
+#endif /* JXL_THREAD_PARALLEL_RUNNER_H_ */
+
+/** @}*/
diff --git a/third_party/jpeg-xl/lib/include/jxl/thread_parallel_runner_cxx.h b/third_party/jpeg-xl/lib/include/jxl/thread_parallel_runner_cxx.h
new file mode 100644
index 0000000000..4974ffee87
--- /dev/null
+++ b/third_party/jpeg-xl/lib/include/jxl/thread_parallel_runner_cxx.h
@@ -0,0 +1,64 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+/// @addtogroup libjxl_threads
+/// @{
+///
+/// @file thread_parallel_runner_cxx.h
+/// @brief C++ header-only helper for @ref thread_parallel_runner.h.
+///
+/// There's no binary library associated with the header since this is a header
+/// only library.
+
+#ifndef JXL_THREAD_PARALLEL_RUNNER_CXX_H_
+#define JXL_THREAD_PARALLEL_RUNNER_CXX_H_
+
+#include <jxl/thread_parallel_runner.h>
+
+#include <memory>
+
+#if !(defined(__cplusplus) || defined(c_plusplus))
+#error \
+    "This a C++ only header. Use jxl/jxl_thread_parallel_runner.h from C" \
+    "sources."
+#endif
+
+/// Struct to call JxlThreadParallelRunnerDestroy from the
+/// JxlThreadParallelRunnerPtr unique_ptr.
+struct JxlThreadParallelRunnerDestroyStruct {
+  /// Calls @ref JxlThreadParallelRunnerDestroy() on the passed runner.
+  void operator()(void* runner) { JxlThreadParallelRunnerDestroy(runner); }
+};
+
+/// std::unique_ptr<> type that calls JxlThreadParallelRunnerDestroy() when
+/// releasing the runner.
+///
+/// Use this helper type from C++ sources to ensure the runner is destroyed and
+/// their internal resources released.
+typedef std::unique_ptr<void, JxlThreadParallelRunnerDestroyStruct>
+    JxlThreadParallelRunnerPtr;
+
+/// Creates an instance of JxlThreadParallelRunner into a
+/// JxlThreadParallelRunnerPtr and initializes it.
+///
+/// This function returns a unique_ptr that will call
+/// JxlThreadParallelRunnerDestroy() when releasing the pointer. See @ref
+/// JxlThreadParallelRunnerCreate for details on the instance creation.
+///
+/// @param memory_manager custom allocator function. It may be NULL. The memory
+///        manager will be copied internally.
+/// @param num_worker_threads the number of worker threads to create.
+/// @return a @c NULL JxlThreadParallelRunnerPtr if the instance can not be
+/// allocated or initialized
+/// @return initialized JxlThreadParallelRunnerPtr instance otherwise.
+static inline JxlThreadParallelRunnerPtr JxlThreadParallelRunnerMake(
+    const JxlMemoryManager* memory_manager, size_t num_worker_threads) {
+  return JxlThreadParallelRunnerPtr(
+      JxlThreadParallelRunnerCreate(memory_manager, num_worker_threads));
+}
+
+#endif  // JXL_THREAD_PARALLEL_RUNNER_CXX_H_
+
+/// @}
diff --git a/third_party/jpeg-xl/lib/include/jxl/types.h b/third_party/jpeg-xl/lib/include/jxl/types.h
new file mode 100644
index 0000000000..9ffb4c6868
--- /dev/null
+++ b/third_party/jpeg-xl/lib/include/jxl/types.h
@@ -0,0 +1,186 @@
+/* Copyright (c) the JPEG XL Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style
+ * license that can be found in the LICENSE file.
+ */
+
+/** @addtogroup libjxl_common
+ * @{
+ * @file types.h
+ * @brief Data types for the JPEG XL API, for both encoding and decoding.
+ */
+
+#ifndef JXL_TYPES_H_
+#define JXL_TYPES_H_
+
+#include <jxl/jxl_export.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+/**
+ * A portable @c bool replacement.
+ *
+ * ::JXL_BOOL is a "documentation" type: actually it is @c int, but in API it
+ * denotes a type, whose only values are ::JXL_TRUE and ::JXL_FALSE.
+ */
+#define JXL_BOOL int
+/** Portable @c true replacement. */
+#define JXL_TRUE 1
+/** Portable @c false replacement. */
+#define JXL_FALSE 0
+
+/** Data type for the sample values per channel per pixel.
+ */
+typedef enum {
+  /** Use 32-bit single-precision floating point values, with range 0.0-1.0
+   * (within gamut, may go outside this range for wide color gamut). Floating
+   * point output, either JXL_TYPE_FLOAT or JXL_TYPE_FLOAT16, is recommended
+   * for HDR and wide gamut images when color profile conversion is required. */
+  JXL_TYPE_FLOAT = 0,
+
+  /** Use type uint8_t. May clip wide color gamut data.
+   */
+  JXL_TYPE_UINT8 = 2,
+
+  /** Use type uint16_t. May clip wide color gamut data.
+   */
+  JXL_TYPE_UINT16 = 3,
+
+  /** Use 16-bit IEEE 754 half-precision floating point values */
+  JXL_TYPE_FLOAT16 = 5,
+} JxlDataType;
+
+/* DEPRECATED: bit-packed 1-bit data type. Use JXL_TYPE_UINT8 instead.
+ */
+JXL_DEPRECATED static const int JXL_TYPE_BOOLEAN = 1;
+
+/* DEPRECATED: uint32_t data type. Use JXL_TYPE_FLOAT instead.
+ */
+JXL_DEPRECATED static const int JXL_TYPE_UINT32 = 4;
+
+/** Ordering of multi-byte data.
+ */
+typedef enum {
+  /** Use the endianness of the system, either little endian or big endian,
+   * without forcing either specific endianness. Do not use if pixel data
+   * should be exported to a well defined format.
+   */
+  JXL_NATIVE_ENDIAN = 0,
+  /** Force little endian */
+  JXL_LITTLE_ENDIAN = 1,
+  /** Force big endian */
+  JXL_BIG_ENDIAN = 2,
+} JxlEndianness;
+
+/** Data type for the sample values per channel per pixel for the output buffer
+ * for pixels. This is not necessarily the same as the data type encoded in the
+ * codestream. The channels are interleaved per pixel. The pixels are
+ * organized row by row, left to right, top to bottom.
+ * TODO(lode): support different channel orders if needed (RGB, BGR, ...)
+ */
+typedef struct {
+  /** Amount of channels available in a pixel buffer.
+   * 1: single-channel data, e.g. grayscale or a single extra channel
+   * 2: single-channel + alpha
+   * 3: trichromatic, e.g. RGB
+   * 4: trichromatic + alpha
+   * TODO(lode): this needs finetuning. It is not yet defined how the user
+   * chooses output color space. CMYK+alpha needs 5 channels.
+   */
+  uint32_t num_channels;
+
+  /** Data type of each channel.
+   */
+  JxlDataType data_type;
+
+  /** Whether multi-byte data types are represented in big endian or little
+   * endian format. This applies to JXL_TYPE_UINT16, JXL_TYPE_UINT32
+   * and JXL_TYPE_FLOAT.
+   */
+  JxlEndianness endianness;
+
+  /** Align scanlines to a multiple of align bytes, or 0 to require no
+   * alignment at all (which has the same effect as value 1)
+   */
+  size_t align;
+} JxlPixelFormat;
+
+/** Settings for the interpretation of the input and output buffers.
+ */
+typedef enum {
+  /** This is the default setting, where the encoder expects the input pixels
+   * to use the full range of the pixel format data type (e.g. for UINT16, the
+   * input range is 0 .. 65535 and the value 65535 is mapped to 1.0 when
+   * converting to float), and the decoder uses the full range to output
+   * pixels. If the bit depth in the basic info is different from this, the
+   * encoder expects the values to be rescaled accordingly (e.g. multiplied by
+   * 65535/4095 for a 12-bit image using UINT16 input data type). */
+  JXL_BIT_DEPTH_FROM_PIXEL_FORMAT = 0,
+
+  /** If this setting is selected, the encoder expects the input pixels to be
+   * in the range defined by the bits_per_sample value of the basic info (e.g.
+   * for 12-bit images using UINT16 input data types, the allowed range is
+   * 0 .. 4095 and the value 4095 is mapped to 1.0 when converting to float),
+   * and the decoder outputs pixels in this range. */
+  JXL_BIT_DEPTH_FROM_CODESTREAM = 1,
+
+  /** This setting can only be used in the decoder to select a custom range for
+   * pixel output */
+  JXL_BIT_DEPTH_CUSTOM = 2,
+} JxlBitDepthType;
+
+/** Data type for describing the interpretation of the input and output buffers
+ * in terms of the range of allowed input and output pixel values. */
+typedef struct {
+  /** Bit depth setting, see comment on @ref JxlBitDepthType */
+  JxlBitDepthType type;
+
+  /** Custom bits per sample */
+  uint32_t bits_per_sample;
+
+  /** Custom exponent bits per sample */
+  uint32_t exponent_bits_per_sample;
+} JxlBitDepth;
+
+/** Data type holding the 4-character type name of an ISOBMFF box.
+ */
+typedef char JxlBoxType[4];
+
+/** Types of progressive detail.
+ * Setting a progressive detail with value N implies all progressive details
+ * with smaller or equal value. Currently only the following level of
+ * progressive detail is implemented:
+ *  - kDC (which implies kFrames)
+ *  - kLastPasses (which implies kDC and kFrames)
+ *  - kPasses (which implies kLastPasses, kDC and kFrames)
+ */
+typedef enum {
+  // after completed kRegularFrames
+  kFrames = 0,
+  // after completed DC (1:8)
+  kDC = 1,
+  // after completed AC passes that are the last pass for their resolution
+  // target.
+  kLastPasses = 2,
+  // after completed AC passes that are not the last pass for their resolution
+  // target.
+  kPasses = 3,
+  // during DC frame when lower resolution are completed (1:32, 1:16)
+  kDCProgressive = 4,
+  // after completed groups
+  kDCGroups = 5,
+  // after completed groups
+  kGroups = 6,
+} JxlProgressiveDetail;
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}
+#endif
+
+#endif /* JXL_TYPES_H_ */
+
+/** @}*/
diff --git a/third_party/jpeg-xl/lib/jpegli.cmake b/third_party/jpeg-xl/lib/jpegli.cmake
new file mode 100644
index 0000000000..4b19375a76
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli.cmake
@@ -0,0 +1,106 @@
+# Copyright (c) the JPEG XL Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+include(compatibility.cmake)
+include(jxl_lists.cmake)
+
+set(JPEGLI_INTERNAL_LIBS
+  hwy
+  Threads::Threads
+  ${ATOMICS_LIBRARIES}
+)
+
+add_library(jpegli-static STATIC EXCLUDE_FROM_ALL "${JPEGXL_INTERNAL_JPEGLI_SOURCES}")
+target_compile_options(jpegli-static PRIVATE "${JPEGXL_INTERNAL_FLAGS}")
+target_compile_options(jpegli-static PUBLIC ${JPEGXL_COVERAGE_FLAGS})
+set_property(TARGET jpegli-static PROPERTY POSITION_INDEPENDENT_CODE ON)
+target_include_directories(jpegli-static PUBLIC
+  "$<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}>"
+  "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>"
+  "$<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}/include>"
+  "${JXL_HWY_INCLUDE_DIRS}"
+)
+target_include_directories(jpegli-static PUBLIC "${JPEG_INCLUDE_DIRS}")
+target_link_libraries(jpegli-static PUBLIC ${JPEGLI_INTERNAL_LIBS})
+
+#
+# Tests for jpegli-static
+#
+
+if(BUILD_TESTING)
+# TODO(eustas): merge into jxl_tests.cmake?
+# Individual test binaries:
+file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/tests)
+foreach (TESTFILE IN LISTS JPEGXL_INTERNAL_JPEGLI_TESTS)
+  # The TESTNAME is the name without the extension or directory.
+  get_filename_component(TESTNAME ${TESTFILE} NAME_WE)
+  add_executable(${TESTNAME} ${TESTFILE} ${JPEGXL_INTERNAL_JPEGLI_TESTLIB_FILES})
+  target_compile_options(${TESTNAME} PRIVATE
+    ${JPEGXL_INTERNAL_FLAGS}
+    # Add coverage flags to the test binary so code in the private headers of
+    # the library is also instrumented when running tests that execute it.
+    ${JPEGXL_COVERAGE_FLAGS}
+  )
+  target_compile_definitions(${TESTNAME} PRIVATE
+    -DTEST_DATA_PATH="${JPEGXL_TEST_DATA_PATH}")
+  target_include_directories(${TESTNAME} PRIVATE "${PROJECT_SOURCE_DIR}")
+  target_link_libraries(${TESTNAME}
+    hwy
+    jpegli-static
+    gmock
+    GTest::GTest
+    GTest::Main
+    ${JPEG_LIBRARIES}
+  )
+  set_target_properties(${TESTNAME} PROPERTIES LINK_FLAGS "${JPEGXL_COVERAGE_LINK_FLAGS}")
+  # Output test targets in the test directory.
+  set_target_properties(${TESTNAME} PROPERTIES PREFIX "tests/")
+  if (WIN32 AND CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
+    set_target_properties(${TESTNAME} PROPERTIES COMPILE_FLAGS "-Wno-error")
+  endif ()
+  jxl_discover_tests(${TESTNAME})
+endforeach ()
+endif()
+
+#
+# Build libjpeg.so that links to libjpeg-static
+#
+
+if (JPEGXL_ENABLE_JPEGLI_LIBJPEG AND NOT APPLE AND NOT WIN32 AND NOT JPEGXL_EMSCRIPTEN)
+add_library(jpegli-libjpeg-obj OBJECT "${JPEGXL_INTERNAL_JPEGLI_WRAPPER_SOURCES}")
+target_compile_options(jpegli-libjpeg-obj PRIVATE ${JPEGXL_INTERNAL_FLAGS})
+target_compile_options(jpegli-libjpeg-obj PUBLIC ${JPEGXL_COVERAGE_FLAGS})
+set_property(TARGET jpegli-libjpeg-obj PROPERTY POSITION_INDEPENDENT_CODE ON)
+target_include_directories(jpegli-libjpeg-obj PUBLIC "${PROJECT_SOURCE_DIR}")
+target_compile_definitions(jpegli-libjpeg-obj PUBLIC
+  ${JPEGLI_LIBJPEG_OBJ_COMPILE_DEFINITIONS}
+)
+set(JPEGLI_LIBJPEG_INTERNAL_OBJECTS $<TARGET_OBJECTS:jpegli-libjpeg-obj>)
+
+file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/jpegli)
+add_library(jpeg SHARED ${JPEGLI_LIBJPEG_INTERNAL_OBJECTS})
+target_link_libraries(jpeg PUBLIC ${JPEGXL_COVERAGE_FLAGS})
+target_link_libraries(jpeg PRIVATE jpegli-static)
+set_target_properties(jpeg PROPERTIES
+  VERSION ${JPEGLI_LIBJPEG_LIBRARY_VERSION}
+  SOVERSION ${JPEGLI_LIBJPEG_LIBRARY_SOVERSION}
+  LIBRARY_OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/jpegli"
+  RUNTIME_OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/jpegli")
+
+# Add a jpeg.version file as a version script to tag symbols with the
+# appropriate version number.
+set_target_properties(jpeg PROPERTIES
+  LINK_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/jpegli/jpeg.version.${JPEGLI_LIBJPEG_LIBRARY_SOVERSION})
+set_property(TARGET jpeg APPEND_STRING PROPERTY
+  LINK_FLAGS " -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/jpegli/jpeg.version.${JPEGLI_LIBJPEG_LIBRARY_SOVERSION}")
+
+# This hides the default visibility symbols from static libraries bundled into
+# the shared library. In particular this prevents exposing symbols from hwy
+# in the shared library.
+if(LINKER_SUPPORT_EXCLUDE_LIBS)
+  set_property(TARGET jpeg APPEND_STRING PROPERTY
+    LINK_FLAGS " ${LINKER_EXCLUDE_LIBS_FLAG}")
+endif()
+endif()
diff --git a/third_party/jpeg-xl/lib/jpegli/README.md b/third_party/jpeg-xl/lib/jpegli/README.md
new file mode 100644
index 0000000000..1eef402eef
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/README.md
@@ -0,0 +1,28 @@
+# Improved JPEG decoder implementation
+
+This subdirectory contains a JPEG decoder implementation that is API and ABI
+compatible with libjpeg62.
+
+*NOTE*: This is still a work in progress, currently only API functions called
+from libjxl's benchmark_xl tool are implemented.
+
+To decompress an ```input.jpg``` file with this new library:
+
+```
+(from the libjxl root directory)
+$ ./ci.sh opt
+$ LD_PRELOAD=./build/libjpeg.so.62 ./build/tools/benchmark_xl --input input.jpg --codec=jpeg --decode_only --save_decompressed --output_dir .
+```
+
+The decompressed file will be saved as ```input.jpg.jpeg.png```.
+
+To benchmark the jpeg encoding-decoding round-trip on an ```input.png``` with
+the new library, first build a statically linked ```cjpeg-static``` binary,
+which is found in ```$PATH```, and then run:
+
+```
+(from the libjxl root directory)
+$ ./ci.sh opt
+$ LD_PRELOAD=./build/libjpeg.so.62 ./build/tools/benchmark_xl --input input.png --codec=jpeg:cjpeg-static:q90
+```
+
diff --git a/third_party/jpeg-xl/lib/jpegli/adaptive_quantization.cc b/third_party/jpeg-xl/lib/jpegli/adaptive_quantization.cc
new file mode 100644
index 0000000000..a1c0b89ad3
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/adaptive_quantization.cc
@@ -0,0 +1,563 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/adaptive_quantization.h"
+
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <algorithm>
+#include <cmath>
+#include <limits>
+#include <string>
+#include <vector>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jpegli/adaptive_quantization.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jpegli/encode_internal.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/status.h"
+HWY_BEFORE_NAMESPACE();
+namespace jpegli {
+namespace HWY_NAMESPACE {
+namespace {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::AbsDiff;
+using hwy::HWY_NAMESPACE::Add;
+using hwy::HWY_NAMESPACE::And;
+using hwy::HWY_NAMESPACE::Div;
+using hwy::HWY_NAMESPACE::Floor;
+using hwy::HWY_NAMESPACE::GetLane;
+using hwy::HWY_NAMESPACE::Max;
+using hwy::HWY_NAMESPACE::Min;
+using hwy::HWY_NAMESPACE::Mul;
+using hwy::HWY_NAMESPACE::MulAdd;
+using hwy::HWY_NAMESPACE::NegMulAdd;
+using hwy::HWY_NAMESPACE::Rebind;
+using hwy::HWY_NAMESPACE::ShiftLeft;
+using hwy::HWY_NAMESPACE::ShiftRight;
+using hwy::HWY_NAMESPACE::Sqrt;
+using hwy::HWY_NAMESPACE::Sub;
+using hwy::HWY_NAMESPACE::ZeroIfNegative;
+
+static constexpr float kInputScaling = 1.0f / 255.0f;
+
+// Primary template: default to actual division.
+template <typename T, class V>
+struct FastDivision {
+  HWY_INLINE V operator()(const V n, const V d) const { return n / d; }
+};
+// Partial specialization for float vectors.
+template <class V>
+struct FastDivision<float, V> {
+  // One Newton-Raphson iteration.
+  static HWY_INLINE V ReciprocalNR(const V x) {
+    const auto rcp = ApproximateReciprocal(x);
+    const auto sum = Add(rcp, rcp);
+    const auto x_rcp = Mul(x, rcp);
+    return NegMulAdd(x_rcp, rcp, sum);
+  }
+
+  V operator()(const V n, const V d) const {
+#if 1  // Faster on SKX
+    return Div(n, d);
+#else
+    return n * ReciprocalNR(d);
+#endif
+  }
+};
+
+// Approximates smooth functions via rational polynomials (i.e. dividing two
+// polynomials). Evaluates polynomials via Horner's scheme, which is faster than
+// Clenshaw recurrence for Chebyshev polynomials. LoadDup128 allows us to
+// specify constants (replicated 4x) independently of the lane count.
+template <size_t NP, size_t NQ, class D, class V, typename T>
+HWY_INLINE HWY_MAYBE_UNUSED V EvalRationalPolynomial(const D d, const V x,
+                                                     const T (&p)[NP],
+                                                     const T (&q)[NQ]) {
+  constexpr size_t kDegP = NP / 4 - 1;
+  constexpr size_t kDegQ = NQ / 4 - 1;
+  auto yp = LoadDup128(d, &p[kDegP * 4]);
+  auto yq = LoadDup128(d, &q[kDegQ * 4]);
+  // We use pointer arithmetic to refer to &p[(kDegP - n) * 4] to avoid a
+  // compiler warning that the index is out of bounds since we are already
+  // checking that it is not out of bounds with (kDegP >= n) and the access
+  // will be optimized away. Similarly with q and kDegQ.
+  HWY_FENCE;
+  if (kDegP >= 1) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 1) * 4)));
+  if (kDegQ >= 1) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 1) * 4)));
+  HWY_FENCE;
+  if (kDegP >= 2) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 2) * 4)));
+  if (kDegQ >= 2) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 2) * 4)));
+  HWY_FENCE;
+  if (kDegP >= 3) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 3) * 4)));
+  if (kDegQ >= 3) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 3) * 4)));
+  HWY_FENCE;
+  if (kDegP >= 4) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 4) * 4)));
+  if (kDegQ >= 4) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 4) * 4)));
+  HWY_FENCE;
+  if (kDegP >= 5) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 5) * 4)));
+  if (kDegQ >= 5) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 5) * 4)));
+  HWY_FENCE;
+  if (kDegP >= 6) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 6) * 4)));
+  if (kDegQ >= 6) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 6) * 4)));
+  HWY_FENCE;
+  if (kDegP >= 7) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 7) * 4)));
+  if (kDegQ >= 7) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 7) * 4)));
+
+  return FastDivision<T, V>()(yp, yq);
+}
+
+// Computes base-2 logarithm like std::log2. Undefined if negative / NaN.
+// L1 error ~3.9E-6
+template <class DF, class V>
+V FastLog2f(const DF df, V x) {
+  // 2,2 rational polynomial approximation of std::log1p(x) / std::log(2).
+  HWY_ALIGN const float p[4 * (2 + 1)] = {HWY_REP4(-1.8503833400518310E-06f),
+                                          HWY_REP4(1.4287160470083755E+00f),
+                                          HWY_REP4(7.4245873327820566E-01f)};
+  HWY_ALIGN const float q[4 * (2 + 1)] = {HWY_REP4(9.9032814277590719E-01f),
+                                          HWY_REP4(1.0096718572241148E+00f),
+                                          HWY_REP4(1.7409343003366853E-01f)};
+
+  const Rebind<int32_t, DF> di;
+  const auto x_bits = BitCast(di, x);
+
+  // Range reduction to [-1/3, 1/3] - 3 integer, 2 float ops
+  const auto exp_bits = Sub(x_bits, Set(di, 0x3f2aaaab));  // = 2/3
+  // Shifted exponent = log2; also used to clear mantissa.
+  const auto exp_shifted = ShiftRight<23>(exp_bits);
+  const auto mantissa = BitCast(df, Sub(x_bits, ShiftLeft<23>(exp_shifted)));
+  const auto exp_val = ConvertTo(df, exp_shifted);
+  return Add(EvalRationalPolynomial(df, Sub(mantissa, Set(df, 1.0f)), p, q),
+             exp_val);
+}
+
+// max relative error ~3e-7
+template <class DF, class V>
+V FastPow2f(const DF df, V x) {
+  const Rebind<int32_t, DF> di;
+  auto floorx = Floor(x);
+  auto exp =
+      BitCast(df, ShiftLeft<23>(Add(ConvertTo(di, floorx), Set(di, 127))));
+  auto frac = Sub(x, floorx);
+  auto num = Add(frac, Set(df, 1.01749063e+01));
+  num = MulAdd(num, frac, Set(df, 4.88687798e+01));
+  num = MulAdd(num, frac, Set(df, 9.85506591e+01));
+  num = Mul(num, exp);
+  auto den = MulAdd(frac, Set(df, 2.10242958e-01), Set(df, -2.22328856e-02));
+  den = MulAdd(den, frac, Set(df, -1.94414990e+01));
+  den = MulAdd(den, frac, Set(df, 9.85506633e+01));
+  return Div(num, den);
+}
+
+inline float FastPow2f(float f) {
+  HWY_CAPPED(float, 1) D;
+  return GetLane(FastPow2f(D, Set(D, f)));
+}
+
+// The following functions modulate an exponent (out_val) and return the updated
+// value. Their descriptor is limited to 8 lanes for 8x8 blocks.
+
+template <class D, class V>
+V ComputeMask(const D d, const V out_val) {
+  const auto kBase = Set(d, -0.74174993f);
+  const auto kMul4 = Set(d, 3.2353257320940401f);
+  const auto kMul2 = Set(d, 12.906028311180409f);
+  const auto kOffset2 = Set(d, 305.04035728311436f);
+  const auto kMul3 = Set(d, 5.0220313103171232f);
+  const auto kOffset3 = Set(d, 2.1925739705298404f);
+  const auto kOffset4 = Mul(Set(d, 0.25f), kOffset3);
+  const auto kMul0 = Set(d, 0.74760422233706747f);
+  const auto k1 = Set(d, 1.0f);
+
+  // Avoid division by zero.
+  const auto v1 = Max(Mul(out_val, kMul0), Set(d, 1e-3f));
+  const auto v2 = Div(k1, Add(v1, kOffset2));
+  const auto v3 = Div(k1, MulAdd(v1, v1, kOffset3));
+  const auto v4 = Div(k1, MulAdd(v1, v1, kOffset4));
+  // TODO(jyrki):
+  // A log or two here could make sense. In butteraugli we have effectively
+  // log(log(x + C)) for this kind of use, as a single log is used in
+  // saturating visual masking and here the modulation values are exponential,
+  // another log would counter that.
+  return Add(kBase, MulAdd(kMul4, v4, MulAdd(kMul2, v2, Mul(kMul3, v3))));
+}
+
+// mul and mul2 represent a scaling difference between jxl and butteraugli.
+static const float kSGmul = 226.0480446705883f;
+static const float kSGmul2 = 1.0f / 73.377132366608819f;
+static const float kLog2 = 0.693147181f;
+// Includes correction factor for std::log -> log2.
+static const float kSGRetMul = kSGmul2 * 18.6580932135f * kLog2;
+static const float kSGVOffset = 7.14672470003f;
+
+template <bool invert, typename D, typename V>
+V RatioOfDerivativesOfCubicRootToSimpleGamma(const D d, V v) {
+  // The opsin space in jxl is the cubic root of photons, i.e., v * v * v
+  // is related to the number of photons.
+  //
+  // SimpleGamma(v * v * v) is the psychovisual space in butteraugli.
+  // This ratio allows quantization to move from jxl's opsin space to
+  // butteraugli's log-gamma space.
+  static const float kEpsilon = 1e-2;
+  static const float kNumOffset = kEpsilon / kInputScaling / kInputScaling;
+  static const float kNumMul = kSGRetMul * 3 * kSGmul;
+  static const float kVOffset = (kSGVOffset * kLog2 + kEpsilon) / kInputScaling;
+  static const float kDenMul = kLog2 * kSGmul * kInputScaling * kInputScaling;
+
+  v = ZeroIfNegative(v);
+  const auto num_mul = Set(d, kNumMul);
+  const auto num_offset = Set(d, kNumOffset);
+  const auto den_offset = Set(d, kVOffset);
+  const auto den_mul = Set(d, kDenMul);
+
+  const auto v2 = Mul(v, v);
+
+  const auto num = MulAdd(num_mul, v2, num_offset);
+  const auto den = MulAdd(Mul(den_mul, v), v2, den_offset);
+  return invert ? Div(num, den) : Div(den, num);
+}
+
+template <bool invert = false>
+static float RatioOfDerivativesOfCubicRootToSimpleGamma(float v) {
+  using DScalar = HWY_CAPPED(float, 1);
+  auto vscalar = Load(DScalar(), &v);
+  return GetLane(
+      RatioOfDerivativesOfCubicRootToSimpleGamma<invert>(DScalar(), vscalar));
+}
+
+// TODO(veluca): this function computes an approximation of the derivative of
+// SimpleGamma with (f(x+eps)-f(x))/eps. Consider two-sided approximation or
+// exact derivatives. For reference, SimpleGamma was:
+/*
+template <typename D, typename V>
+V SimpleGamma(const D d, V v) {
+  // A simple HDR compatible gamma function.
+  const auto mul = Set(d, kSGmul);
+  const auto kRetMul = Set(d, kSGRetMul);
+  const auto kRetAdd = Set(d, kSGmul2 * -20.2789020414f);
+  const auto kVOffset = Set(d, kSGVOffset);
+
+  v *= mul;
+
+  // This should happen rarely, but may lead to a NaN, which is rather
+  // undesirable. Since negative photons don't exist we solve the NaNs by
+  // clamping here.
+  // TODO(veluca): with FastLog2f, this no longer leads to NaNs.
+  v = ZeroIfNegative(v);
+  return kRetMul * FastLog2f(d, v + kVOffset) + kRetAdd;
+}
+*/
+
+template <class D, class V>
+V GammaModulation(const D d, const size_t x, const size_t y,
+                  const RowBuffer<float>& input, const V out_val) {
+  static const float kBias = 0.16f / kInputScaling;
+  static const float kScale = kInputScaling / 64.0f;
+  auto overall_ratio = Zero(d);
+  const auto bias = Set(d, kBias);
+  const auto scale = Set(d, kScale);
+  const float* const JXL_RESTRICT block_start = input.Row(y) + x;
+  for (size_t dy = 0; dy < 8; ++dy) {
+    const float* const JXL_RESTRICT row_in = block_start + dy * input.stride();
+    for (size_t dx = 0; dx < 8; dx += Lanes(d)) {
+      const auto iny = Add(Load(d, row_in + dx), bias);
+      const auto ratio_g =
+          RatioOfDerivativesOfCubicRootToSimpleGamma</*invert=*/true>(d, iny);
+      overall_ratio = Add(overall_ratio, ratio_g);
+    }
+  }
+  overall_ratio = Mul(SumOfLanes(d, overall_ratio), scale);
+  // ideally -1.0, but likely optimal correction adds some entropy, so slightly
+  // less than that.
+  // ln(2) constant folded in because we want std::log but have FastLog2f.
+  const auto kGam = Set(d, -0.15526878023684174f * 0.693147180559945f);
+  return MulAdd(kGam, FastLog2f(d, overall_ratio), out_val);
+}
+
+// Change precision in 8x8 blocks that have high frequency content.
+template <class D, class V>
+V HfModulation(const D d, const size_t x, const size_t y,
+               const RowBuffer<float>& input, const V out_val) {
+  // Zero out the invalid differences for the rightmost value per row.
+  const Rebind<uint32_t, D> du;
+  HWY_ALIGN constexpr uint32_t kMaskRight[8] = {~0u, ~0u, ~0u, ~0u,
+                                                ~0u, ~0u, ~0u, 0};
+
+  auto sum = Zero(d);  // sum of absolute differences with right and below
+  static const float kSumCoeff = -2.0052193233688884f * kInputScaling / 112.0;
+  auto sumcoeff = Set(d, kSumCoeff);
+
+  const float* const JXL_RESTRICT block_start = input.Row(y) + x;
+  for (size_t dy = 0; dy < 8; ++dy) {
+    const float* JXL_RESTRICT row_in = block_start + dy * input.stride();
+    const float* JXL_RESTRICT row_in_next =
+        dy == 7 ? row_in : row_in + input.stride();
+
+    for (size_t dx = 0; dx < 8; dx += Lanes(d)) {
+      const auto p = Load(d, row_in + dx);
+      const auto pr = LoadU(d, row_in + dx + 1);
+      const auto mask = BitCast(d, Load(du, kMaskRight + dx));
+      sum = Add(sum, And(mask, AbsDiff(p, pr)));
+      const auto pd = Load(d, row_in_next + dx);
+      sum = Add(sum, AbsDiff(p, pd));
+    }
+  }
+
+  sum = SumOfLanes(d, sum);
+  return MulAdd(sum, sumcoeff, out_val);
+}
+
+void PerBlockModulations(const float y_quant_01, const RowBuffer<float>& input,
+                         const size_t yb0, const size_t yblen,
+                         RowBuffer<float>* aq_map) {
+  static const float kAcQuant = 0.841f;
+  float base_level = 0.48f * kAcQuant;
+  float kDampenRampStart = 9.0f;
+  float kDampenRampEnd = 65.0f;
+  float dampen = 1.0f;
+  if (y_quant_01 >= kDampenRampStart) {
+    dampen = 1.0f - ((y_quant_01 - kDampenRampStart) /
+                     (kDampenRampEnd - kDampenRampStart));
+    if (dampen < 0) {
+      dampen = 0;
+    }
+  }
+  const float mul = kAcQuant * dampen;
+  const float add = (1.0f - dampen) * base_level;
+  for (size_t iy = 0; iy < yblen; iy++) {
+    const size_t yb = yb0 + iy;
+    const size_t y = yb * 8;
+    float* const JXL_RESTRICT row_out = aq_map->Row(yb);
+    const HWY_CAPPED(float, 8) df;
+    for (size_t ix = 0; ix < aq_map->xsize(); ix++) {
+      size_t x = ix * 8;
+      auto out_val = Set(df, row_out[ix]);
+      out_val = ComputeMask(df, out_val);
+      out_val = HfModulation(df, x, y, input, out_val);
+      out_val = GammaModulation(df, x, y, input, out_val);
+      // We want multiplicative quantization field, so everything
+      // until this point has been modulating the exponent.
+      row_out[ix] = FastPow2f(GetLane(out_val) * 1.442695041f) * mul + add;
+    }
+  }
+}
+
+template <typename D, typename V>
+V MaskingSqrt(const D d, V v) {
+  static const float kLogOffset = 28;
+  static const float kMul = 211.50759899638012f;
+  const auto mul_v = Set(d, kMul * 1e8);
+  const auto offset_v = Set(d, kLogOffset);
+  return Mul(Set(d, 0.25f), Sqrt(MulAdd(v, Sqrt(mul_v), offset_v)));
+}
+
+template <typename V>
+void Sort4(V& min0, V& min1, V& min2, V& min3) {
+  const auto tmp0 = Min(min0, min1);
+  const auto tmp1 = Max(min0, min1);
+  const auto tmp2 = Min(min2, min3);
+  const auto tmp3 = Max(min2, min3);
+  const auto tmp4 = Max(tmp0, tmp2);
+  const auto tmp5 = Min(tmp1, tmp3);
+  min0 = Min(tmp0, tmp2);
+  min1 = Min(tmp4, tmp5);
+  min2 = Max(tmp4, tmp5);
+  min3 = Max(tmp1, tmp3);
+}
+
+template <typename V>
+void UpdateMin4(const V v, V& min0, V& min1, V& min2, V& min3) {
+  const auto tmp0 = Max(min0, v);
+  const auto tmp1 = Max(min1, tmp0);
+  const auto tmp2 = Max(min2, tmp1);
+  min0 = Min(min0, v);
+  min1 = Min(min1, tmp0);
+  min2 = Min(min2, tmp1);
+  min3 = Min(min3, tmp2);
+}
+
+// Computes a linear combination of the 4 lowest values of the 3x3 neighborhood
+// of each pixel. Output is downsampled 2x.
+void FuzzyErosion(const RowBuffer<float>& pre_erosion, const size_t yb0,
+                  const size_t yblen, RowBuffer<float>* tmp,
+                  RowBuffer<float>* aq_map) {
+  int xsize_blocks = aq_map->xsize();
+  int xsize = pre_erosion.xsize();
+  HWY_FULL(float) d;
+  const auto mul0 = Set(d, 0.125f);
+  const auto mul1 = Set(d, 0.075f);
+  const auto mul2 = Set(d, 0.06f);
+  const auto mul3 = Set(d, 0.05f);
+  for (size_t iy = 0; iy < 2 * yblen; ++iy) {
+    size_t y = 2 * yb0 + iy;
+    const float* JXL_RESTRICT rowt = pre_erosion.Row(y - 1);
+    const float* JXL_RESTRICT rowm = pre_erosion.Row(y);
+    const float* JXL_RESTRICT rowb = pre_erosion.Row(y + 1);
+    float* row_out = tmp->Row(y);
+    for (int x = 0; x < xsize; x += Lanes(d)) {
+      int xm1 = x - 1;
+      int xp1 = x + 1;
+      auto min0 = LoadU(d, rowm + x);
+      auto min1 = LoadU(d, rowm + xm1);
+      auto min2 = LoadU(d, rowm + xp1);
+      auto min3 = LoadU(d, rowt + xm1);
+      Sort4(min0, min1, min2, min3);
+      UpdateMin4(LoadU(d, rowt + x), min0, min1, min2, min3);
+      UpdateMin4(LoadU(d, rowt + xp1), min0, min1, min2, min3);
+      UpdateMin4(LoadU(d, rowb + xm1), min0, min1, min2, min3);
+      UpdateMin4(LoadU(d, rowb + x), min0, min1, min2, min3);
+      UpdateMin4(LoadU(d, rowb + xp1), min0, min1, min2, min3);
+      const auto v = Add(Add(Mul(mul0, min0), Mul(mul1, min1)),
+                         Add(Mul(mul2, min2), Mul(mul3, min3)));
+      Store(v, d, row_out + x);
+    }
+    if (iy % 2 == 1) {
+      const float* JXL_RESTRICT row_out0 = tmp->Row(y - 1);
+      float* JXL_RESTRICT aq_out = aq_map->Row(yb0 + iy / 2);
+      for (int bx = 0, x = 0; bx < xsize_blocks; ++bx, x += 2) {
+        aq_out[bx] =
+            (row_out[x] + row_out[x + 1] + row_out0[x] + row_out0[x + 1]);
+      }
+    }
+  }
+}
+
+void ComputePreErosion(const RowBuffer<float>& input, const size_t xsize,
+                       const size_t y0, const size_t ylen, int border,
+                       float* diff_buffer, RowBuffer<float>* pre_erosion) {
+  const size_t xsize_out = xsize / 4;
+  const size_t y0_out = y0 / 4;
+
+  // The XYB gamma is 3.0 to be able to decode faster with two muls.
+  // Butteraugli's gamma is matching the gamma of human eye, around 2.6.
+  // We approximate the gamma difference by adding one cubic root into
+  // the adaptive quantization. This gives us a total gamma of 2.6666
+  // for quantization uses.
+  static const float match_gamma_offset = 0.019 / kInputScaling;
+
+  const HWY_CAPPED(float, 8) df;
+
+  static const float limit = 0.2f;
+  // Computes image (padded to multiple of 8x8) of local pixel differences.
+  // Subsample both directions by 4.
+  for (size_t iy = 0; iy < ylen; ++iy) {
+    size_t y = y0 + iy;
+    const float* row_in = input.Row(y);
+    const float* row_in1 = input.Row(y + 1);
+    const float* row_in2 = input.Row(y - 1);
+    float* JXL_RESTRICT row_out = diff_buffer;
+    const auto match_gamma_offset_v = Set(df, match_gamma_offset);
+    const auto quarter = Set(df, 0.25f);
+    for (size_t x = 0; x < xsize; x += Lanes(df)) {
+      const auto in = LoadU(df, row_in + x);
+      const auto in_r = LoadU(df, row_in + x + 1);
+      const auto in_l = LoadU(df, row_in + x - 1);
+      const auto in_t = LoadU(df, row_in2 + x);
+      const auto in_b = LoadU(df, row_in1 + x);
+      const auto base = Mul(quarter, Add(Add(in_r, in_l), Add(in_t, in_b)));
+      const auto gammacv =
+          RatioOfDerivativesOfCubicRootToSimpleGamma</*invert=*/false>(
+              df, Add(in, match_gamma_offset_v));
+      auto diff = Mul(gammacv, Sub(in, base));
+      diff = Mul(diff, diff);
+      diff = Min(diff, Set(df, limit));
+      diff = MaskingSqrt(df, diff);
+      if ((iy & 3) != 0) {
+        diff = Add(diff, LoadU(df, row_out + x));
+      }
+      StoreU(diff, df, row_out + x);
+    }
+    if (iy % 4 == 3) {
+      size_t y_out = y0_out + iy / 4;
+      float* row_dout = pre_erosion->Row(y_out);
+      for (size_t x = 0; x < xsize_out; x++) {
+        row_dout[x] = (row_out[x * 4] + row_out[x * 4 + 1] +
+                       row_out[x * 4 + 2] + row_out[x * 4 + 3]) *
+                      0.25f;
+      }
+      pre_erosion->PadRow(y_out, xsize_out, border);
+    }
+  }
+}
+
+}  // namespace
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jpegli
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jpegli {
+HWY_EXPORT(ComputePreErosion);
+HWY_EXPORT(FuzzyErosion);
+HWY_EXPORT(PerBlockModulations);
+
+namespace {
+
+static constexpr int kPreErosionBorder = 1;
+
+}  // namespace
+
+void ComputeAdaptiveQuantField(j_compress_ptr cinfo) {
+  jpeg_comp_master* m = cinfo->master;
+  if (!m->use_adaptive_quantization) {
+    return;
+  }
+  int y_channel = cinfo->jpeg_color_space == JCS_RGB ? 1 : 0;
+  jpeg_component_info* y_comp = &cinfo->comp_info[y_channel];
+  int y_quant_01 = cinfo->quant_tbl_ptrs[y_comp->quant_tbl_no]->quantval[1];
+  if (m->next_iMCU_row == 0) {
+    m->input_buffer[y_channel].CopyRow(-1, 0, 1);
+  }
+  if (m->next_iMCU_row + 1 == cinfo->total_iMCU_rows) {
+    size_t last_row = m->ysize_blocks * DCTSIZE - 1;
+    m->input_buffer[y_channel].CopyRow(last_row + 1, last_row, 1);
+  }
+  const RowBuffer<float>& input = m->input_buffer[y_channel];
+  const size_t xsize_blocks = y_comp->width_in_blocks;
+  const size_t xsize = xsize_blocks * DCTSIZE;
+  const size_t yb0 = m->next_iMCU_row * cinfo->max_v_samp_factor;
+  const size_t yblen = cinfo->max_v_samp_factor;
+  size_t y0 = yb0 * DCTSIZE;
+  size_t ylen = cinfo->max_v_samp_factor * DCTSIZE;
+  if (y0 == 0) {
+    ylen += 4;
+  } else {
+    y0 += 4;
+  }
+  if (m->next_iMCU_row + 1 == cinfo->total_iMCU_rows) {
+    ylen -= 4;
+  }
+  HWY_DYNAMIC_DISPATCH(ComputePreErosion)
+  (input, xsize, y0, ylen, kPreErosionBorder, m->diff_buffer, &m->pre_erosion);
+  if (y0 == 0) {
+    m->pre_erosion.CopyRow(-1, 0, kPreErosionBorder);
+  }
+  if (m->next_iMCU_row + 1 == cinfo->total_iMCU_rows) {
+    size_t last_row = m->ysize_blocks * 2 - 1;
+    m->pre_erosion.CopyRow(last_row + 1, last_row, kPreErosionBorder);
+  }
+  HWY_DYNAMIC_DISPATCH(FuzzyErosion)
+  (m->pre_erosion, yb0, yblen, &m->fuzzy_erosion_tmp, &m->quant_field);
+  HWY_DYNAMIC_DISPATCH(PerBlockModulations)
+  (y_quant_01, input, yb0, yblen, &m->quant_field);
+  for (int y = 0; y < cinfo->max_v_samp_factor; ++y) {
+    float* row = m->quant_field.Row(yb0 + y);
+    for (size_t x = 0; x < xsize_blocks; ++x) {
+      row[x] = std::max(0.0f, (0.6f / row[x]) - 1.0f);
+    }
+  }
+}
+
+}  // namespace jpegli
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jpegli/adaptive_quantization.h b/third_party/jpeg-xl/lib/jpegli/adaptive_quantization.h
new file mode 100644
index 0000000000..71f2fcc0af
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/adaptive_quantization.h
@@ -0,0 +1,21 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JPEGLI_ADAPTIVE_QUANTIZATION_H_
+#define LIB_JPEGLI_ADAPTIVE_QUANTIZATION_H_
+
+/* clang-format off */
+#include <stdio.h>
+#include <jpeglib.h>
+#include <stddef.h>
+/* clang-format on */
+
+namespace jpegli {
+
+void ComputeAdaptiveQuantField(j_compress_ptr cinfo);
+
+}  // namespace jpegli
+
+#endif  // LIB_JPEGLI_ADAPTIVE_QUANTIZATION_H_
diff --git a/third_party/jpeg-xl/lib/jpegli/bit_writer.cc b/third_party/jpeg-xl/lib/jpegli/bit_writer.cc
new file mode 100644
index 0000000000..9788f35b8d
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/bit_writer.cc
@@ -0,0 +1,60 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/bit_writer.h"
+
+#include "lib/jpegli/encode_internal.h"
+
+namespace jpegli {
+
+void JpegBitWriterInit(j_compress_ptr cinfo) {
+  jpeg_comp_master* m = cinfo->master;
+  JpegBitWriter* bw = &m->bw;
+  size_t buffer_size = m->blocks_per_iMCU_row * (DCTSIZE2 * 16 + 8) + (1 << 16);
+  bw->cinfo = cinfo;
+  bw->data = Allocate<uint8_t>(cinfo, buffer_size, JPOOL_IMAGE);
+  bw->len = buffer_size;
+  bw->pos = 0;
+  bw->output_pos = 0;
+  bw->put_buffer = 0;
+  bw->free_bits = 64;
+  bw->healthy = true;
+}
+
+bool EmptyBitWriterBuffer(JpegBitWriter* bw) {
+  while (bw->output_pos < bw->pos) {
+    j_compress_ptr cinfo = bw->cinfo;
+    if (cinfo->dest->free_in_buffer == 0 &&
+        !(*cinfo->dest->empty_output_buffer)(cinfo)) {
+      return false;
+    }
+    size_t buflen = bw->pos - bw->output_pos;
+    size_t copylen = std::min<size_t>(cinfo->dest->free_in_buffer, buflen);
+    memcpy(cinfo->dest->next_output_byte, bw->data + bw->output_pos, copylen);
+    bw->output_pos += copylen;
+    cinfo->dest->free_in_buffer -= copylen;
+    cinfo->dest->next_output_byte += copylen;
+  }
+  bw->output_pos = bw->pos = 0;
+  return true;
+}
+
+void JumpToByteBoundary(JpegBitWriter* bw) {
+  size_t n_bits = bw->free_bits & 7u;
+  if (n_bits > 0) {
+    WriteBits(bw, n_bits, (1u << n_bits) - 1);
+  }
+  bw->put_buffer <<= bw->free_bits;
+  while (bw->free_bits <= 56) {
+    int c = (bw->put_buffer >> 56) & 0xFF;
+    EmitByte(bw, c);
+    bw->put_buffer <<= 8;
+    bw->free_bits += 8;
+  }
+  bw->put_buffer = 0;
+  bw->free_bits = 64;
+}
+
+}  // namespace jpegli
diff --git a/third_party/jpeg-xl/lib/jpegli/bit_writer.h b/third_party/jpeg-xl/lib/jpegli/bit_writer.h
new file mode 100644
index 0000000000..0affcdabd3
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/bit_writer.h
@@ -0,0 +1,107 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JPEGLI_BIT_WRITER_H_
+#define LIB_JPEGLI_BIT_WRITER_H_
+
+/* clang-format off */
+#include <stdio.h>
+#include <jpeglib.h>
+#include <stdint.h>
+#include <string.h>
+/* clang-format on */
+
+#include "lib/jxl/base/compiler_specific.h"
+
+namespace jpegli {
+
+// Handles the packing of bits into output bytes.
+struct JpegBitWriter {
+  j_compress_ptr cinfo;
+  uint8_t* data;
+  size_t len;
+  size_t pos;
+  size_t output_pos;
+  uint64_t put_buffer;
+  int free_bits;
+  bool healthy;
+};
+
+void JpegBitWriterInit(j_compress_ptr cinfo);
+
+bool EmptyBitWriterBuffer(JpegBitWriter* bw);
+
+void JumpToByteBoundary(JpegBitWriter* bw);
+
+// Returns non-zero if and only if x has a zero byte, i.e. one of
+// x & 0xff, x & 0xff00, ..., x & 0xff00000000000000 is zero.
+static JXL_INLINE uint64_t HasZeroByte(uint64_t x) {
+  return (x - 0x0101010101010101ULL) & ~x & 0x8080808080808080ULL;
+}
+
+/**
+ * Writes the given byte to the output, writes an extra zero if byte is 0xFF.
+ *
+ * This method is "careless" - caller must make sure that there is enough
+ * space in the output buffer. Emits up to 2 bytes to buffer.
+ */
+static JXL_INLINE void EmitByte(JpegBitWriter* bw, int byte) {
+  bw->data[bw->pos++] = byte;
+  if (byte == 0xFF) bw->data[bw->pos++] = 0;
+}
+
+static JXL_INLINE void DischargeBitBuffer(JpegBitWriter* bw) {
+  // At this point we are ready to emit the bytes of put_buffer to the output.
+  // The JPEG format requires that after every 0xff byte in the entropy
+  // coded section, there is a zero byte, therefore we first check if any of
+  // the bytes of put_buffer is 0xFF.
+  if (HasZeroByte(~bw->put_buffer)) {
+    // We have a 0xFF byte somewhere, examine each byte and append a zero
+    // byte if necessary.
+    EmitByte(bw, (bw->put_buffer >> 56) & 0xFF);
+    EmitByte(bw, (bw->put_buffer >> 48) & 0xFF);
+    EmitByte(bw, (bw->put_buffer >> 40) & 0xFF);
+    EmitByte(bw, (bw->put_buffer >> 32) & 0xFF);
+    EmitByte(bw, (bw->put_buffer >> 24) & 0xFF);
+    EmitByte(bw, (bw->put_buffer >> 16) & 0xFF);
+    EmitByte(bw, (bw->put_buffer >> 8) & 0xFF);
+    EmitByte(bw, (bw->put_buffer >> 0) & 0xFF);
+  } else {
+    // We don't have any 0xFF bytes, output all 6 bytes without checking.
+    bw->data[bw->pos] = (bw->put_buffer >> 56) & 0xFF;
+    bw->data[bw->pos + 1] = (bw->put_buffer >> 48) & 0xFF;
+    bw->data[bw->pos + 2] = (bw->put_buffer >> 40) & 0xFF;
+    bw->data[bw->pos + 3] = (bw->put_buffer >> 32) & 0xFF;
+    bw->data[bw->pos + 4] = (bw->put_buffer >> 24) & 0xFF;
+    bw->data[bw->pos + 5] = (bw->put_buffer >> 16) & 0xFF;
+    bw->data[bw->pos + 6] = (bw->put_buffer >> 8) & 0xFF;
+    bw->data[bw->pos + 7] = (bw->put_buffer >> 0) & 0xFF;
+    bw->pos += 8;
+  }
+}
+
+static JXL_INLINE void WriteBits(JpegBitWriter* bw, int nbits, uint64_t bits) {
+  // This is an optimization; if everything goes well,
+  // then |nbits| is positive; if non-existing Huffman symbol is going to be
+  // encoded, its length should be zero; later encoder could check the
+  // "health" of JpegBitWriter.
+  if (nbits == 0) {
+    bw->healthy = false;
+    return;
+  }
+  bw->free_bits -= nbits;
+  if (bw->free_bits < 0) {
+    bw->put_buffer <<= (bw->free_bits + nbits);
+    bw->put_buffer |= (bits >> -bw->free_bits);
+    DischargeBitBuffer(bw);
+    bw->free_bits += 64;
+    bw->put_buffer = nbits;
+  }
+  bw->put_buffer <<= nbits;
+  bw->put_buffer |= bits;
+}
+
+}  // namespace jpegli
+#endif  // LIB_JPEGLI_BIT_WRITER_H_
diff --git a/third_party/jpeg-xl/lib/jpegli/bitstream.cc b/third_party/jpeg-xl/lib/jpegli/bitstream.cc
new file mode 100644
index 0000000000..0313ed3071
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/bitstream.cc
@@ -0,0 +1,1136 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/bitstream.h"
+
+#include <cmath>
+
+#include "lib/jpegli/bit_writer.h"
+#include "lib/jpegli/entropy_coding.h"
+#include "lib/jpegli/error.h"
+#include "lib/jpegli/memory_manager.h"
+#include "lib/jxl/base/bits.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jpegli/bitstream.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jpegli/dct-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jpegli {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Add;
+using hwy::HWY_NAMESPACE::And;
+using hwy::HWY_NAMESPACE::AndNot;
+using hwy::HWY_NAMESPACE::Compress;
+using hwy::HWY_NAMESPACE::CountTrue;
+using hwy::HWY_NAMESPACE::Eq;
+using hwy::HWY_NAMESPACE::GetLane;
+using hwy::HWY_NAMESPACE::MaskFromVec;
+using hwy::HWY_NAMESPACE::Max;
+using hwy::HWY_NAMESPACE::Not;
+using hwy::HWY_NAMESPACE::Or;
+using hwy::HWY_NAMESPACE::ShiftRight;
+using hwy::HWY_NAMESPACE::Shl;
+using hwy::HWY_NAMESPACE::Sub;
+
+using DI = HWY_FULL(int32_t);
+constexpr DI di;
+
+int NumNonZero8x8ExceptDC(const coeff_t* block) {
+  const HWY_CAPPED(coeff_t, 8) di;
+
+  const auto zero = Zero(di);
+  // Add FFFF for every zero coefficient, negate to get #zeros.
+  auto neg_sum_zero = zero;
+  {
+    // First row has DC, so mask
+    const size_t y = 0;
+    HWY_ALIGN const coeff_t dc_mask_lanes[8] = {-1};
+
+    for (size_t x = 0; x < 8; x += Lanes(di)) {
+      const auto dc_mask = Load(di, dc_mask_lanes + x);
+
+      // DC counts as zero so we don't include it in nzeros.
+      const auto coef = AndNot(dc_mask, Load(di, &block[y * 8 + x]));
+
+      neg_sum_zero = Add(neg_sum_zero, VecFromMask(di, Eq(coef, zero)));
+    }
+  }
+  // Remaining rows: no mask
+  for (size_t y = 1; y < 8; y++) {
+    for (size_t x = 0; x < 8; x += Lanes(di)) {
+      const auto coef = Load(di, &block[y * 8 + x]);
+      neg_sum_zero = Add(neg_sum_zero, VecFromMask(di, Eq(coef, zero)));
+    }
+  }
+
+  // We want 64 - sum_zero, add because neg_sum_zero is already negated.
+  return kDCTBlockSize + GetLane(SumOfLanes(di, neg_sum_zero));
+}
+
+void ZigZagShuffle(int32_t* JXL_RESTRICT block) {
+  // TODO(szabadka) SIMDify this.
+  int32_t tmp[DCTSIZE2];
+  for (int k = 0; k < DCTSIZE2; ++k) {
+    tmp[k] = block[kJPEGNaturalOrder[k]];
+  }
+  memcpy(block, tmp, DCTSIZE2 * sizeof(tmp[0]));
+}
+
+template <typename DI, class V>
+JXL_INLINE V NumBits(DI di, const V x) {
+  // TODO(szabadka) Add faster implementations for some specific architectures.
+  const auto b1 = And(x, Set(di, 1));
+  const auto b2 = And(x, Set(di, 2));
+  const auto b3 = Sub((And(x, Set(di, 4))), Set(di, 1));
+  const auto b4 = Sub((And(x, Set(di, 8))), Set(di, 4));
+  const auto b5 = Sub((And(x, Set(di, 16))), Set(di, 11));
+  const auto b6 = Sub((And(x, Set(di, 32))), Set(di, 26));
+  const auto b7 = Sub((And(x, Set(di, 64))), Set(di, 57));
+  const auto b8 = Sub((And(x, Set(di, 128))), Set(di, 120));
+  const auto b9 = Sub((And(x, Set(di, 256))), Set(di, 247));
+  const auto b10 = Sub((And(x, Set(di, 512))), Set(di, 502));
+  const auto b11 = Sub((And(x, Set(di, 1024))), Set(di, 1013));
+  const auto b12 = Sub((And(x, Set(di, 2048))), Set(di, 2036));
+  return Max(Max(Max(Max(b1, b2), Max(b3, b4)), Max(Max(b5, b6), Max(b7, b8))),
+             Max(Max(b9, b10), Max(b11, b12)));
+}
+
+// Coefficient indexes pre-multiplied by 16 for the symbol calculation.
+HWY_ALIGN constexpr int32_t kIndexes[64] = {
+    0,   16,  32,  48,  64,  80,  96,  112, 128, 144, 160, 176,  192,
+    208, 224, 240, 256, 272, 288, 304, 320, 336, 352, 368, 384,  400,
+    416, 432, 448, 464, 480, 496, 512, 528, 544, 560, 576, 592,  608,
+    624, 640, 656, 672, 688, 704, 720, 736, 752, 768, 784, 800,  816,
+    832, 848, 864, 880, 896, 912, 928, 944, 960, 976, 992, 1008,
+};
+
+JXL_INLINE int CompactBlock(int32_t* JXL_RESTRICT block,
+                            int32_t* JXL_RESTRICT nonzero_idx) {
+  const auto zero = Zero(di);
+  HWY_ALIGN constexpr int32_t dc_mask_lanes[HWY_LANES(DI)] = {-1};
+  const auto dc_mask = MaskFromVec(Load(di, dc_mask_lanes));
+  int num_nonzeros = 0;
+  int k = 0;
+  {
+    const auto coef = Load(di, block);
+    const auto idx = Load(di, kIndexes);
+    const auto nonzero_mask = Or(dc_mask, Not(Eq(coef, zero)));
+    const auto nzero_coef = Compress(coef, nonzero_mask);
+    const auto nzero_idx = Compress(idx, nonzero_mask);
+    StoreU(nzero_coef, di, &block[num_nonzeros]);
+    StoreU(nzero_idx, di, &nonzero_idx[num_nonzeros]);
+    num_nonzeros += CountTrue(di, nonzero_mask);
+    k += Lanes(di);
+  }
+  for (; k < DCTSIZE2; k += Lanes(di)) {
+    const auto coef = Load(di, &block[k]);
+    const auto idx = Load(di, &kIndexes[k]);
+    const auto nonzero_mask = Not(Eq(coef, zero));
+    const auto nzero_coef = Compress(coef, nonzero_mask);
+    const auto nzero_idx = Compress(idx, nonzero_mask);
+    StoreU(nzero_coef, di, &block[num_nonzeros]);
+    StoreU(nzero_idx, di, &nonzero_idx[num_nonzeros]);
+    num_nonzeros += CountTrue(di, nonzero_mask);
+  }
+  return num_nonzeros;
+}
+
+JXL_INLINE void ComputeSymbols(const int num_nonzeros,
+                               int32_t* JXL_RESTRICT nonzero_idx,
+                               int32_t* JXL_RESTRICT block,
+                               int32_t* JXL_RESTRICT symbols) {
+  nonzero_idx[-1] = -16;
+  const auto one = Set(di, 1);
+  const auto offset = Set(di, 16);
+  for (int i = 0; i < num_nonzeros; i += Lanes(di)) {
+    const auto idx = Load(di, &nonzero_idx[i]);
+    const auto prev_idx = LoadU(di, &nonzero_idx[i - 1]);
+    const auto coeff = Load(di, &block[i]);
+    const auto nbits = NumBits(di, Abs(coeff));
+    const auto mask = ShiftRight<8 * sizeof(int32_t) - 1>(coeff);
+    const auto bits = And(Add(coeff, mask), Sub(Shl(one, nbits), one));
+    const auto symbol = Sub(Add(nbits, idx), Add(prev_idx, offset));
+    Store(symbol, di, symbols + i);
+    Store(bits, di, block + i);
+  }
+}
+
+void WriteBlock(int32_t* JXL_RESTRICT block, int32_t* JXL_RESTRICT symbols,
+                int32_t* JXL_RESTRICT nonzero_idx, HuffmanCodeTable* dc_huff,
+                HuffmanCodeTable* ac_huff, JpegBitWriter* bw) {
+  ZigZagShuffle(block);
+  int num_nonzeros = CompactBlock(block, nonzero_idx);
+  ComputeSymbols(num_nonzeros, nonzero_idx, block, symbols);
+  int symbol = symbols[0];
+  WriteBits(bw, dc_huff->depth[symbol], dc_huff->code[symbol] | block[0]);
+  for (int i = 1; i < num_nonzeros; ++i) {
+    symbol = symbols[i];
+    while (symbol > 255) {
+      WriteBits(bw, ac_huff->depth[0xf0], ac_huff->code[0xf0]);
+      symbol -= 256;
+    }
+    WriteBits(bw, ac_huff->depth[symbol], ac_huff->code[symbol] | block[i]);
+  }
+  if (nonzero_idx[num_nonzeros - 1] < 1008) {
+    WriteBits(bw, ac_huff->depth[0], ac_huff->code[0]);
+  }
+}
+
+void WriteiMCURow(j_compress_ptr cinfo) {
+  jpeg_comp_master* m = cinfo->master;
+  JpegBitWriter* bw = &m->bw;
+  int xsize_mcus = DivCeil(cinfo->image_width, 8 * cinfo->max_h_samp_factor);
+  int mcu_y = m->next_iMCU_row;
+  int32_t* block = m->block_tmp;
+  int32_t* symbols = m->block_tmp + DCTSIZE2;
+  int32_t* nonzero_idx = m->block_tmp + 3 * DCTSIZE2;
+  coeff_t* JXL_RESTRICT last_dc_coeff = m->last_dc_coeff;
+  const float* imcu_start[kMaxComponents];
+  for (int c = 0; c < cinfo->num_components; ++c) {
+    jpeg_component_info* comp = &cinfo->comp_info[c];
+    imcu_start[c] = m->raw_data[c]->Row(mcu_y * comp->v_samp_factor * DCTSIZE);
+  }
+  const float* qf = nullptr;
+  if (m->use_adaptive_quantization) {
+    qf = m->quant_field.Row(0);
+  }
+  const size_t qf_stride = m->quant_field.stride();
+  for (int mcu_x = 0; mcu_x < xsize_mcus; ++mcu_x) {
+    for (int c = 0; c < cinfo->num_components; ++c) {
+      jpeg_component_info* comp = &cinfo->comp_info[c];
+      HuffmanCodeTable* dc_huff = &m->huff_tables[comp->dc_tbl_no];
+      HuffmanCodeTable* ac_huff = &m->huff_tables[comp->ac_tbl_no + 4];
+      float* JXL_RESTRICT qmc = m->quant_mul[c];
+      const size_t stride = m->raw_data[c]->stride();
+      const int h_factor = m->h_factor[c];
+      const float* zero_bias_offset = m->zero_bias_offset[c];
+      const float* zero_bias_mul = m->zero_bias_mul[c];
+      float aq_strength = 0.0f;
+      for (int iy = 0; iy < comp->v_samp_factor; ++iy) {
+        for (int ix = 0; ix < comp->h_samp_factor; ++ix) {
+          size_t by = mcu_y * comp->v_samp_factor + iy;
+          size_t bx = mcu_x * comp->h_samp_factor + ix;
+          if (bx >= comp->width_in_blocks || by >= comp->height_in_blocks) {
+            WriteBits(bw, dc_huff->depth[0], dc_huff->code[0]);
+            WriteBits(bw, ac_huff->depth[0], ac_huff->code[0]);
+            continue;
+          }
+          if (m->use_adaptive_quantization) {
+            aq_strength = qf[iy * qf_stride + bx * h_factor];
+          }
+          const float* pixels = imcu_start[c] + (iy * stride + bx) * DCTSIZE;
+          ComputeCoefficientBlock(pixels, stride, qmc, aq_strength,
+                                  zero_bias_offset, zero_bias_mul,
+                                  m->dct_buffer, block);
+          block[0] -= last_dc_coeff[c];
+          last_dc_coeff[c] += block[0];
+          WriteBlock(block, symbols, nonzero_idx, dc_huff, ac_huff, bw);
+        }
+      }
+    }
+  }
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jpegli
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jpegli {
+namespace {
+HWY_EXPORT(NumNonZero8x8ExceptDC);
+
+// Holds data that is buffered between 8x8 blocks in progressive mode.
+struct DCTCodingState {
+  // The run length of end-of-band symbols in a progressive scan.
+  int eob_run_;
+  // The huffman table to be used when flushing the state.
+  HuffmanCodeTable* cur_ac_huff_;
+  // The sequence of currently buffered refinement bits for a successive
+  // approximation scan (one where Ah > 0).
+  std::vector<int> refinement_bits_;
+};
+
+void DCTCodingStateInit(DCTCodingState* s) {
+  s->eob_run_ = 0;
+  s->cur_ac_huff_ = nullptr;
+  s->refinement_bits_.clear();
+  s->refinement_bits_.reserve(kJPEGMaxCorrectionBits);
+}
+
+static JXL_INLINE void WriteSymbol(int symbol, const HuffmanCodeTable* table,
+                                   JpegBitWriter* bw) {
+  WriteBits(bw, table->depth[symbol], table->code[symbol]);
+}
+
+// Emit all buffered data to the bit stream using the given Huffman code and
+// bit writer.
+static JXL_INLINE void Flush(DCTCodingState* s, JpegBitWriter* bw) {
+  if (s->eob_run_ > 0) {
+    int nbits = jxl::FloorLog2Nonzero<uint32_t>(s->eob_run_);
+    int symbol = nbits << 4u;
+    WriteSymbol(symbol, s->cur_ac_huff_, bw);
+    if (nbits > 0) {
+      WriteBits(bw, nbits, s->eob_run_ & ((1 << nbits) - 1));
+    }
+    s->eob_run_ = 0;
+  }
+  for (size_t i = 0; i < s->refinement_bits_.size(); ++i) {
+    WriteBits(bw, 1, s->refinement_bits_[i]);
+  }
+  s->refinement_bits_.clear();
+}
+
+// Buffer some more data at the end-of-band (the last non-zero or newly
+// non-zero coefficient within the [Ss, Se] spectral band).
+static JXL_INLINE void BufferEndOfBand(DCTCodingState* s,
+                                       HuffmanCodeTable* ac_huff,
+                                       const std::vector<int>* new_bits,
+                                       JpegBitWriter* bw) {
+  if (s->eob_run_ == 0) {
+    s->cur_ac_huff_ = ac_huff;
+  }
+  ++s->eob_run_;
+  if (new_bits) {
+    s->refinement_bits_.insert(s->refinement_bits_.end(), new_bits->begin(),
+                               new_bits->end());
+  }
+  if (s->eob_run_ == 0x7FFF ||
+      s->refinement_bits_.size() > kJPEGMaxCorrectionBits - kDCTBlockSize + 1) {
+    Flush(s, bw);
+  }
+}
+
+bool BuildHuffmanCodeTable(const JPEGHuffmanCode& huff, HuffmanCodeTable* table,
+                           bool pre_shifted = false) {
+  int huff_code[kJpegHuffmanAlphabetSize];
+  // +1 for a sentinel element.
+  uint32_t huff_size[kJpegHuffmanAlphabetSize + 1];
+  int p = 0;
+  for (size_t l = 1; l <= kJpegHuffmanMaxBitLength; ++l) {
+    int i = huff.counts[l];
+    if (p + i > kJpegHuffmanAlphabetSize + 1) {
+      return false;
+    }
+    while (i--) huff_size[p++] = l;
+  }
+
+  if (p == 0) {
+    return true;
+  }
+
+  // Reuse sentinel element.
+  int last_p = p - 1;
+  huff_size[last_p] = 0;
+
+  int code = 0;
+  uint32_t si = huff_size[0];
+  p = 0;
+  while (huff_size[p]) {
+    while ((huff_size[p]) == si) {
+      huff_code[p++] = code;
+      code++;
+    }
+    code <<= 1;
+    si++;
+  }
+  for (p = 0; p < last_p; p++) {
+    int i = huff.values[p];
+    table->depth[i] = huff_size[p];
+    table->code[i] = huff_code[p];
+    if (pre_shifted) {
+      int nbits = i & 0xf;
+      table->depth[i] += nbits;
+      table->code[i] <<= nbits;
+    }
+  }
+  return true;
+}
+
+bool EncodeDCTBlockSequential(const coeff_t* block, HuffmanCodeTable* dc_huff,
+                              HuffmanCodeTable* ac_huff, coeff_t* last_dc_coeff,
+                              JpegBitWriter* bw) {
+  coeff_t temp2;
+  coeff_t temp;
+  temp2 = block[0];
+  temp = temp2 - *last_dc_coeff;
+  if (temp == 0) {
+    WriteSymbol(0, dc_huff, bw);
+  } else {
+    *last_dc_coeff = temp2;
+    temp2 = temp;
+    if (temp < 0) {
+      temp = -temp;
+      temp2--;
+    }
+    int dc_nbits = jxl::FloorLog2Nonzero<uint32_t>(temp) + 1;
+    int dc_mask = (1 << dc_nbits) - 1;
+    WriteSymbol(dc_nbits, dc_huff, bw);
+    WriteBits(bw, dc_nbits, temp2 & dc_mask);
+  }
+  int num_nonzeros = HWY_DYNAMIC_DISPATCH(NumNonZero8x8ExceptDC)(block);
+  for (int k = 1; k < 64; ++k) {
+    if (num_nonzeros == 0) {
+      WriteSymbol(0, ac_huff, bw);
+      break;
+    }
+    int r = 0;
+    while ((temp = block[kJPEGNaturalOrder[k]]) == 0) {
+      r++;
+      k++;
+    }
+    --num_nonzeros;
+    if (temp < 0) {
+      temp = -temp;
+      temp2 = ~temp;
+    } else {
+      temp2 = temp;
+    }
+    while (r > 15) {
+      WriteSymbol(0xf0, ac_huff, bw);
+      r -= 16;
+    }
+    int ac_nbits = jxl::FloorLog2Nonzero<uint32_t>(temp) + 1;
+    int ac_mask = (1 << ac_nbits) - 1;
+    int symbol = (r << 4u) + ac_nbits;
+    WriteSymbol(symbol, ac_huff, bw);
+    WriteBits(bw, ac_nbits, temp2 & ac_mask);
+  }
+  return true;
+}
+
+bool EncodeDCTBlockProgressive(const coeff_t* coeffs, HuffmanCodeTable* dc_huff,
+                               HuffmanCodeTable* ac_huff, int Ss, int Se,
+                               int Al, DCTCodingState* coding_state,
+                               coeff_t* last_dc_coeff, JpegBitWriter* bw) {
+  bool eob_run_allowed = Ss > 0;
+  coeff_t temp2;
+  coeff_t temp;
+  if (Ss == 0) {
+    temp2 = coeffs[0] >> Al;
+    temp = temp2 - *last_dc_coeff;
+    *last_dc_coeff = temp2;
+    temp2 = temp;
+    if (temp < 0) {
+      temp = -temp;
+      if (temp < 0) return false;
+      temp2--;
+    }
+    int nbits = (temp == 0) ? 0 : (jxl::FloorLog2Nonzero<uint32_t>(temp) + 1);
+    WriteSymbol(nbits, dc_huff, bw);
+    if (nbits > 0) {
+      WriteBits(bw, nbits, temp2 & ((1 << nbits) - 1));
+    }
+    ++Ss;
+  }
+  if (Ss > Se) {
+    return true;
+  }
+  int r = 0;
+  for (int k = Ss; k <= Se; ++k) {
+    if ((temp = coeffs[kJPEGNaturalOrder[k]]) == 0) {
+      r++;
+      continue;
+    }
+    if (temp < 0) {
+      temp = -temp;
+      if (temp < 0) return false;
+      temp >>= Al;
+      temp2 = ~temp;
+    } else {
+      temp >>= Al;
+      temp2 = temp;
+    }
+    if (temp == 0) {
+      r++;
+      continue;
+    }
+    Flush(coding_state, bw);
+    while (r > 15) {
+      WriteSymbol(0xf0, ac_huff, bw);
+      r -= 16;
+    }
+    int nbits = jxl::FloorLog2Nonzero<uint32_t>(temp) + 1;
+    int symbol = (r << 4u) + nbits;
+    WriteSymbol(symbol, ac_huff, bw);
+    WriteBits(bw, nbits, temp2 & ((1 << nbits) - 1));
+    r = 0;
+  }
+  if (r > 0) {
+    BufferEndOfBand(coding_state, ac_huff, nullptr, bw);
+    if (!eob_run_allowed) {
+      Flush(coding_state, bw);
+    }
+  }
+  return true;
+}
+
+bool EncodeRefinementBits(const coeff_t* coeffs, HuffmanCodeTable* ac_huff,
+                          int Ss, int Se, int Al, DCTCodingState* coding_state,
+                          JpegBitWriter* bw) {
+  bool eob_run_allowed = Ss > 0;
+  if (Ss == 0) {
+    // Emit next bit of DC component.
+    WriteBits(bw, 1, (coeffs[0] >> Al) & 1);
+    ++Ss;
+  }
+  if (Ss > Se) {
+    return true;
+  }
+  int abs_values[kDCTBlockSize];
+  int eob = 0;
+  for (int k = Ss; k <= Se; k++) {
+    const coeff_t abs_val = std::abs(coeffs[kJPEGNaturalOrder[k]]);
+    abs_values[k] = abs_val >> Al;
+    if (abs_values[k] == 1) {
+      eob = k;
+    }
+  }
+  int r = 0;
+  std::vector<int> refinement_bits;
+  refinement_bits.reserve(kDCTBlockSize);
+  for (int k = Ss; k <= Se; k++) {
+    if (abs_values[k] == 0) {
+      r++;
+      continue;
+    }
+    while (r > 15 && k <= eob) {
+      Flush(coding_state, bw);
+      WriteSymbol(0xf0, ac_huff, bw);
+      r -= 16;
+      for (int bit : refinement_bits) {
+        WriteBits(bw, 1, bit);
+      }
+      refinement_bits.clear();
+    }
+    if (abs_values[k] > 1) {
+      refinement_bits.push_back(abs_values[k] & 1u);
+      continue;
+    }
+    Flush(coding_state, bw);
+    int symbol = (r << 4u) + 1;
+    int new_non_zero_bit = (coeffs[kJPEGNaturalOrder[k]] < 0) ? 0 : 1;
+    WriteSymbol(symbol, ac_huff, bw);
+    WriteBits(bw, 1, new_non_zero_bit);
+    for (int bit : refinement_bits) {
+      WriteBits(bw, 1, bit);
+    }
+    refinement_bits.clear();
+    r = 0;
+  }
+  if (r > 0 || !refinement_bits.empty()) {
+    BufferEndOfBand(coding_state, ac_huff, &refinement_bits, bw);
+    if (!eob_run_allowed) {
+      Flush(coding_state, bw);
+    }
+  }
+  return true;
+}
+
+}  // namespace
+
+void WriteOutput(j_compress_ptr cinfo, const uint8_t* buf, size_t bufsize) {
+  size_t pos = 0;
+  while (pos < bufsize) {
+    if (cinfo->dest->free_in_buffer == 0 &&
+        !(*cinfo->dest->empty_output_buffer)(cinfo)) {
+      JPEGLI_ERROR("Destination suspension is not supported in markers.");
+    }
+    size_t len = std::min<size_t>(cinfo->dest->free_in_buffer, bufsize - pos);
+    memcpy(cinfo->dest->next_output_byte, buf + pos, len);
+    pos += len;
+    cinfo->dest->free_in_buffer -= len;
+    cinfo->dest->next_output_byte += len;
+  }
+}
+
+void WriteOutput(j_compress_ptr cinfo, const std::vector<uint8_t>& bytes) {
+  WriteOutput(cinfo, bytes.data(), bytes.size());
+}
+
+void WriteOutput(j_compress_ptr cinfo, std::initializer_list<uint8_t> bytes) {
+  WriteOutput(cinfo, bytes.begin(), bytes.size());
+}
+
+void EncodeAPP0(j_compress_ptr cinfo) {
+  WriteOutput(cinfo,
+              {0xff, 0xe0, 0, 16, 'J', 'F', 'I', 'F', '\0',
+               cinfo->JFIF_major_version, cinfo->JFIF_minor_version,
+               cinfo->density_unit, static_cast<uint8_t>(cinfo->X_density >> 8),
+               static_cast<uint8_t>(cinfo->X_density & 0xff),
+               static_cast<uint8_t>(cinfo->Y_density >> 8),
+               static_cast<uint8_t>(cinfo->Y_density & 0xff), 0, 0});
+}
+
+void EncodeAPP14(j_compress_ptr cinfo) {
+  uint8_t color_transform = cinfo->jpeg_color_space == JCS_YCbCr  ? 1
+                            : cinfo->jpeg_color_space == JCS_YCCK ? 2
+                                                                  : 0;
+  WriteOutput(cinfo, {0xff, 0xee, 0, 14, 'A', 'd', 'o', 'b', 'e', 0, 100, 0, 0,
+                      0, 0, color_transform});
+}
+
+void EncodeSOF(j_compress_ptr cinfo, bool is_baseline) {
+  if (cinfo->data_precision != kJpegPrecision) {
+    is_baseline = false;
+    JPEGLI_ERROR("Unsupported data precision %d", cinfo->data_precision);
+  }
+  const uint8_t marker = cinfo->progressive_mode ? 0xc2
+                         : is_baseline           ? 0xc0
+                                                 : 0xc1;
+  const size_t n_comps = cinfo->num_components;
+  const size_t marker_len = 8 + 3 * n_comps;
+  std::vector<uint8_t> data(marker_len + 2);
+  size_t pos = 0;
+  data[pos++] = 0xFF;
+  data[pos++] = marker;
+  data[pos++] = marker_len >> 8u;
+  data[pos++] = marker_len & 0xFFu;
+  data[pos++] = kJpegPrecision;
+  data[pos++] = cinfo->image_height >> 8u;
+  data[pos++] = cinfo->image_height & 0xFFu;
+  data[pos++] = cinfo->image_width >> 8u;
+  data[pos++] = cinfo->image_width & 0xFFu;
+  data[pos++] = n_comps;
+  for (size_t i = 0; i < n_comps; ++i) {
+    jpeg_component_info* comp = &cinfo->comp_info[i];
+    data[pos++] = comp->component_id;
+    data[pos++] = ((comp->h_samp_factor << 4u) | (comp->v_samp_factor));
+    const uint32_t quant_idx = comp->quant_tbl_no;
+    if (cinfo->quant_tbl_ptrs[quant_idx] == nullptr) {
+      JPEGLI_ERROR("Invalid component quant table index %u.", quant_idx);
+    }
+    data[pos++] = quant_idx;
+  }
+  WriteOutput(cinfo, data);
+}
+
+void EncodeSOS(j_compress_ptr cinfo, int scan_index) {
+  const jpeg_scan_info* scan_info = &cinfo->scan_info[scan_index];
+  const ScanCodingInfo& sci = cinfo->master->scan_coding_info[scan_index];
+  const size_t marker_len = 6 + 2 * scan_info->comps_in_scan;
+  std::vector<uint8_t> data(marker_len + 2);
+  size_t pos = 0;
+  data[pos++] = 0xFF;
+  data[pos++] = 0xDA;
+  data[pos++] = marker_len >> 8u;
+  data[pos++] = marker_len & 0xFFu;
+  data[pos++] = scan_info->comps_in_scan;
+  for (int i = 0; i < scan_info->comps_in_scan; ++i) {
+    int comp_idx = scan_info->component_index[i];
+    data[pos++] = cinfo->comp_info[comp_idx].component_id;
+    data[pos++] = (sci.dc_tbl_idx[i] << 4u) + (sci.ac_tbl_idx[i] - 4);
+  }
+  data[pos++] = scan_info->Ss;
+  data[pos++] = scan_info->Se;
+  data[pos++] = ((scan_info->Ah << 4u) | (scan_info->Al));
+  WriteOutput(cinfo, data);
+}
+
+void EncodeDHT(j_compress_ptr cinfo, const JPEGHuffmanCode* huffman_codes,
+               size_t num_huffman_codes, bool pre_shifted) {
+  if (num_huffman_codes == 0) {
+    return;
+  }
+
+  size_t marker_len = 2;
+  for (size_t i = 0; i < num_huffman_codes; ++i) {
+    const JPEGHuffmanCode& huff = huffman_codes[i];
+    if (huff.sent_table) continue;
+    marker_len += kJpegHuffmanMaxBitLength;
+    for (size_t j = 0; j <= kJpegHuffmanMaxBitLength; ++j) {
+      marker_len += huff.counts[j];
+    }
+  }
+  std::vector<uint8_t> data(marker_len + 2);
+  size_t pos = 0;
+  data[pos++] = 0xFF;
+  data[pos++] = 0xC4;
+  data[pos++] = marker_len >> 8u;
+  data[pos++] = marker_len & 0xFFu;
+  for (size_t i = 0; i < num_huffman_codes; ++i) {
+    const JPEGHuffmanCode& huff = huffman_codes[i];
+    size_t index = huff.slot_id;
+    HuffmanCodeTable* huff_table;
+    if (index & 0x10) {
+      huff_table = &cinfo->master->huff_tables[index - 12];
+    } else {
+      huff_table = &cinfo->master->huff_tables[index];
+    }
+    // TODO(eustas): cache
+    // TODO(eustas): set up non-existing symbols
+    if (!BuildHuffmanCodeTable(huff, huff_table, pre_shifted)) {
+      JPEGLI_ERROR("Failed to build Huffman code table.");
+    }
+    if (huff.sent_table) continue;
+    size_t total_count = 0;
+    size_t max_length = 0;
+    for (size_t i = 0; i <= kJpegHuffmanMaxBitLength; ++i) {
+      if (huff.counts[i] != 0) {
+        max_length = i;
+      }
+      total_count += huff.counts[i];
+    }
+    --total_count;
+    data[pos++] = huff.slot_id;
+    for (size_t i = 1; i <= kJpegHuffmanMaxBitLength; ++i) {
+      data[pos++] = (i == max_length ? huff.counts[i] - 1 : huff.counts[i]);
+    }
+    for (size_t i = 0; i < total_count; ++i) {
+      data[pos++] = huff.values[i];
+    }
+  }
+  if (marker_len > 2) {
+    WriteOutput(cinfo, data);
+  }
+}
+
+void EncodeDQT(j_compress_ptr cinfo, bool write_all_tables, bool* is_baseline) {
+  uint8_t data[4 + NUM_QUANT_TBLS * (1 + 2 * DCTSIZE2)];  // 520 bytes
+  size_t pos = 0;
+  data[pos++] = 0xFF;
+  data[pos++] = 0xDB;
+  pos += 2;  // Length will be filled in later.
+
+  int send_table[NUM_QUANT_TBLS] = {};
+  if (write_all_tables) {
+    for (int i = 0; i < NUM_QUANT_TBLS; ++i) {
+      if (cinfo->quant_tbl_ptrs[i]) send_table[i] = 1;
+    }
+  } else {
+    for (int c = 0; c < cinfo->num_components; ++c) {
+      send_table[cinfo->comp_info[c].quant_tbl_no] = 1;
+    }
+  }
+
+  for (int i = 0; i < NUM_QUANT_TBLS; ++i) {
+    if (!send_table[i]) continue;
+    JQUANT_TBL* quant_table = cinfo->quant_tbl_ptrs[i];
+    if (quant_table == nullptr) {
+      JPEGLI_ERROR("Missing quant table %d", i);
+    }
+    int precision = 0;
+    for (size_t k = 0; k < DCTSIZE2; ++k) {
+      if (quant_table->quantval[k] > 255) {
+        precision = 1;
+        *is_baseline = false;
+      }
+    }
+    if (quant_table->sent_table) {
+      continue;
+    }
+    data[pos++] = (precision << 4) + i;
+    for (size_t j = 0; j < DCTSIZE2; ++j) {
+      int val_idx = kJPEGNaturalOrder[j];
+      int val = quant_table->quantval[val_idx];
+      if (val == 0) {
+        JPEGLI_ERROR("Invalid quantval 0.");
+      }
+      if (precision) {
+        data[pos++] = val >> 8;
+      }
+      data[pos++] = val & 0xFFu;
+    }
+    quant_table->sent_table = TRUE;
+  }
+  if (pos > 4) {
+    data[2] = (pos - 2) >> 8u;
+    data[3] = (pos - 2) & 0xFFu;
+    WriteOutput(cinfo, data, pos);
+  }
+}
+
+bool EncodeDRI(j_compress_ptr cinfo) {
+  WriteOutput(cinfo, {0xFF, 0xDD, 0, 4,
+                      static_cast<uint8_t>(cinfo->restart_interval >> 8),
+                      static_cast<uint8_t>(cinfo->restart_interval & 0xFF)});
+  return true;
+}
+
+static JXL_INLINE void EmitMarker(JpegBitWriter* bw, int marker) {
+  bw->data[bw->pos++] = 0xFF;
+  bw->data[bw->pos++] = marker;
+}
+
+void ProgressMonitorEncodePass(j_compress_ptr cinfo, size_t scan_index,
+                               size_t mcu_y) {
+  if (cinfo->progress == nullptr) {
+    return;
+  }
+  cinfo->progress->completed_passes = 1 + scan_index;
+  cinfo->progress->pass_counter = mcu_y;
+  cinfo->progress->pass_limit = cinfo->total_iMCU_rows;
+  (*cinfo->progress->progress_monitor)(reinterpret_cast<j_common_ptr>(cinfo));
+}
+
+bool EncodeScan(j_compress_ptr cinfo, int scan_index) {
+  jpeg_comp_master* m = cinfo->master;
+  const int restart_interval = cinfo->restart_interval;
+  int restarts_to_go = restart_interval;
+  int next_restart_marker = 0;
+
+  JpegBitWriter* bw = &m->bw;
+  coeff_t last_dc_coeff[MAX_COMPS_IN_SCAN] = {0};
+  DCTCodingState coding_state;
+  DCTCodingStateInit(&coding_state);
+
+  const jpeg_scan_info* scan_info = &cinfo->scan_info[scan_index];
+  const ScanCodingInfo& sci = m->scan_coding_info[scan_index];
+  // "Non-interleaved" means color data comes in separate scans, in other words
+  // each scan can contain only one color component.
+  const bool is_interleaved = (scan_info->comps_in_scan > 1);
+  jpeg_component_info* base_comp =
+      &cinfo->comp_info[scan_info->component_index[0]];
+  // h_group / v_group act as numerators for converting number of blocks to
+  // number of MCU. In interleaved mode it is 1, so MCU is represented with
+  // max_*_samp_factor blocks. In non-interleaved mode we choose numerator to
+  // be the samping factor, consequently MCU is always represented with single
+  // block.
+  const int h_group = is_interleaved ? 1 : base_comp->h_samp_factor;
+  const int v_group = is_interleaved ? 1 : base_comp->v_samp_factor;
+  int MCUs_per_row =
+      DivCeil(cinfo->image_width * h_group, 8 * cinfo->max_h_samp_factor);
+  int MCU_rows =
+      DivCeil(cinfo->image_height * v_group, 8 * cinfo->max_v_samp_factor);
+  const bool is_progressive = cinfo->progressive_mode;
+  const int Al = scan_info->Al;
+  const int Ah = scan_info->Ah;
+  const int Ss = scan_info->Ss;
+  const int Se = scan_info->Se;
+  HWY_ALIGN constexpr coeff_t kDummyBlock[DCTSIZE2] = {0};
+
+  JBLOCKARRAY ba[MAX_COMPS_IN_SCAN];
+  for (int mcu_y = 0; mcu_y < MCU_rows; ++mcu_y) {
+    ProgressMonitorEncodePass(cinfo, scan_index, mcu_y);
+    for (int i = 0; i < scan_info->comps_in_scan; ++i) {
+      int comp_idx = scan_info->component_index[i];
+      jpeg_component_info* comp = &cinfo->comp_info[comp_idx];
+      int n_blocks_y = is_interleaved ? comp->v_samp_factor : 1;
+      int by0 = mcu_y * n_blocks_y;
+      int block_rows_left = comp->height_in_blocks - by0;
+      int max_block_rows = std::min(n_blocks_y, block_rows_left);
+      ba[i] = (*cinfo->mem->access_virt_barray)(
+          reinterpret_cast<j_common_ptr>(cinfo), m->coeff_buffers[comp_idx],
+          by0, max_block_rows, false);
+    }
+    for (int mcu_x = 0; mcu_x < MCUs_per_row; ++mcu_x) {
+      // Possibly emit a restart marker.
+      if (restart_interval > 0 && restarts_to_go == 0) {
+        Flush(&coding_state, bw);
+        JumpToByteBoundary(bw);
+        EmitMarker(bw, 0xD0 + next_restart_marker);
+        next_restart_marker += 1;
+        next_restart_marker &= 0x7;
+        restarts_to_go = restart_interval;
+        memset(last_dc_coeff, 0, sizeof(last_dc_coeff));
+      }
+      // Encode one MCU
+      for (int i = 0; i < scan_info->comps_in_scan; ++i) {
+        int comp_idx = scan_info->component_index[i];
+        jpeg_component_info* comp = &cinfo->comp_info[comp_idx];
+        HuffmanCodeTable* dc_huff = &m->huff_tables[sci.dc_tbl_idx[i]];
+        HuffmanCodeTable* ac_huff = &m->huff_tables[sci.ac_tbl_idx[i]];
+        int n_blocks_y = is_interleaved ? comp->v_samp_factor : 1;
+        int n_blocks_x = is_interleaved ? comp->h_samp_factor : 1;
+        for (int iy = 0; iy < n_blocks_y; ++iy) {
+          for (int ix = 0; ix < n_blocks_x; ++ix) {
+            size_t block_y = mcu_y * n_blocks_y + iy;
+            size_t block_x = mcu_x * n_blocks_x + ix;
+            const coeff_t* block;
+            if (block_x >= comp->width_in_blocks ||
+                block_y >= comp->height_in_blocks) {
+              block = kDummyBlock;
+            } else {
+              block = &ba[i][iy][block_x][0];
+            }
+            bool ok;
+            if (!is_progressive) {
+              ok = EncodeDCTBlockSequential(block, dc_huff, ac_huff,
+                                            last_dc_coeff + i, bw);
+            } else if (Ah == 0) {
+              ok = EncodeDCTBlockProgressive(block, dc_huff, ac_huff, Ss, Se,
+                                             Al, &coding_state,
+                                             last_dc_coeff + i, bw);
+            } else {
+              ok = EncodeRefinementBits(block, ac_huff, Ss, Se, Al,
+                                        &coding_state, bw);
+            }
+            if (!ok) return false;
+          }
+        }
+      }
+      --restarts_to_go;
+    }
+    if (!EmptyBitWriterBuffer(bw)) {
+      JPEGLI_ERROR("Output suspension is not supported in finish_compress");
+    }
+  }
+  Flush(&coding_state, bw);
+  JumpToByteBoundary(bw);
+  if (!EmptyBitWriterBuffer(bw)) {
+    JPEGLI_ERROR("Output suspension is not supported in finish_compress");
+  }
+  if (!bw->healthy) return false;
+
+  return true;
+}
+
+struct Token {
+  uint8_t histo_idx;
+  uint8_t symbol;
+  uint16_t bits;
+  Token(int i, int s, int b) : histo_idx(i), symbol(s), bits(b) {}
+};
+
+void ComputeTokensForBlock(const coeff_t* block, int histo_dc, int histo_ac,
+                           coeff_t* last_dc_coeff, Token** tokens_ptr) {
+  Token* next_token = *tokens_ptr;
+  coeff_t temp2;
+  coeff_t temp;
+  temp2 = block[0];
+  temp = temp2 - *last_dc_coeff;
+  if (temp == 0) {
+    *next_token++ = Token(histo_dc, 0, 0);
+  } else {
+    *last_dc_coeff = temp2;
+    temp2 = temp;
+    if (temp < 0) {
+      temp = -temp;
+      temp2--;
+    }
+    int dc_nbits = jxl::FloorLog2Nonzero<uint32_t>(temp) + 1;
+    int dc_mask = (1 << dc_nbits) - 1;
+    *next_token++ = Token(histo_dc, dc_nbits, temp2 & dc_mask);
+  }
+  int num_nonzeros = HWY_DYNAMIC_DISPATCH(NumNonZero8x8ExceptDC)(block);
+  for (int k = 1; k < 64; ++k) {
+    if (num_nonzeros == 0) {
+      *next_token++ = Token(histo_ac, 0, 0);
+      break;
+    }
+    int r = 0;
+    while ((temp = block[kJPEGNaturalOrder[k]]) == 0) {
+      r++;
+      k++;
+    }
+    --num_nonzeros;
+    if (temp < 0) {
+      temp = -temp;
+      temp2 = ~temp;
+    } else {
+      temp2 = temp;
+    }
+    while (r > 15) {
+      *next_token++ = Token(histo_ac, 0xf0, 0);
+      r -= 16;
+    }
+    int ac_nbits = jxl::FloorLog2Nonzero<uint32_t>(temp) + 1;
+    int ac_mask = (1 << ac_nbits) - 1;
+    int symbol = (r << 4u) + ac_nbits;
+    *next_token++ = Token(histo_ac, symbol, temp2 & ac_mask);
+  }
+  *tokens_ptr = next_token;
+}
+
+struct TokenArray {
+  Token* tokens = nullptr;
+  size_t num_tokens = 0;
+};
+
+size_t MaxNumTokensPerMCURow(j_compress_ptr cinfo) {
+  int MCUs_per_row = DivCeil(cinfo->image_width, 8 * cinfo->max_h_samp_factor);
+  size_t blocks_per_mcu = 0;
+  for (int c = 0; c < cinfo->num_components; ++c) {
+    jpeg_component_info* comp = &cinfo->comp_info[c];
+    blocks_per_mcu += comp->h_samp_factor * comp->v_samp_factor;
+  }
+  return kDCTBlockSize * blocks_per_mcu * MCUs_per_row;
+}
+
+size_t EstimateNumTokens(j_compress_ptr cinfo, size_t mcu_y, size_t ysize_mcus,
+                         size_t num_tokens, size_t max_per_row) {
+  size_t estimate;
+  if (mcu_y == 0) {
+    estimate = 16 * max_per_row;
+  } else {
+    estimate = (4 * ysize_mcus * num_tokens) / (3 * mcu_y);
+  }
+  size_t mcus_left = ysize_mcus - mcu_y;
+  return std::min(mcus_left * max_per_row,
+                  std::max(max_per_row, estimate - num_tokens));
+}
+
+void ComputeTokens(j_compress_ptr cinfo,
+                   std::vector<TokenArray>* token_arrays) {
+  jpeg_comp_master* m = cinfo->master;
+  TokenArray ta;
+  Token* next_token = ta.tokens;
+  size_t num_tokens = 0;
+  size_t total_num_tokens = 0;
+  size_t max_tokens_per_mcu_row = MaxNumTokensPerMCURow(cinfo);
+  int xsize_mcus = DivCeil(cinfo->image_width, 8 * cinfo->max_h_samp_factor);
+  int ysize_mcus = DivCeil(cinfo->image_height, 8 * cinfo->max_v_samp_factor);
+  coeff_t last_dc_coeff[MAX_COMPS_IN_SCAN] = {0};
+  JBLOCKARRAY ba[MAX_COMPS_IN_SCAN];
+  for (int mcu_y = 0; mcu_y < ysize_mcus; ++mcu_y) {
+    ProgressMonitorEncodePass(cinfo, 0, mcu_y);
+    ta.num_tokens = next_token - ta.tokens;
+    if (ta.num_tokens + max_tokens_per_mcu_row > num_tokens) {
+      if (ta.tokens) {
+        token_arrays->push_back(ta);
+        total_num_tokens += ta.num_tokens;
+      }
+      num_tokens = EstimateNumTokens(cinfo, mcu_y, ysize_mcus, total_num_tokens,
+                                     max_tokens_per_mcu_row);
+      ta.tokens = Allocate<Token>(cinfo, num_tokens, JPOOL_IMAGE);
+      next_token = ta.tokens;
+    }
+    for (int c = 0; c < cinfo->num_components; ++c) {
+      jpeg_component_info* comp = &cinfo->comp_info[c];
+      int by0 = mcu_y * comp->v_samp_factor;
+      int block_rows_left = comp->height_in_blocks - by0;
+      int max_block_rows = std::min(comp->v_samp_factor, block_rows_left);
+      ba[c] = (*cinfo->mem->access_virt_barray)(
+          reinterpret_cast<j_common_ptr>(cinfo), m->coeff_buffers[c], by0,
+          max_block_rows, false);
+    }
+    if (cinfo->max_h_samp_factor == 1 && cinfo->max_v_samp_factor == 1) {
+      for (int mcu_x = 0; mcu_x < xsize_mcus; ++mcu_x) {
+        for (int c = 0; c < cinfo->num_components; ++c) {
+          ComputeTokensForBlock(&ba[c][0][mcu_x][0], c, c + 4,
+                                &last_dc_coeff[c], &next_token);
+        }
+      }
+      continue;
+    }
+    for (int mcu_x = 0; mcu_x < xsize_mcus; ++mcu_x) {
+      for (int c = 0; c < cinfo->num_components; ++c) {
+        jpeg_component_info* comp = &cinfo->comp_info[c];
+        for (int iy = 0; iy < comp->v_samp_factor; ++iy) {
+          for (int ix = 0; ix < comp->h_samp_factor; ++ix) {
+            size_t block_y = mcu_y * comp->v_samp_factor + iy;
+            size_t block_x = mcu_x * comp->h_samp_factor + ix;
+            if (block_x >= comp->width_in_blocks ||
+                block_y >= comp->height_in_blocks) {
+              *next_token++ = Token(c, 0, 0);
+              *next_token++ = Token(c + 4, 0, 0);
+              continue;
+            }
+            ComputeTokensForBlock(&ba[c][iy][block_x][0], c, c + 4,
+                                  &last_dc_coeff[c], &next_token);
+          }
+        }
+      }
+    }
+  }
+  ta.num_tokens = next_token - ta.tokens;
+  token_arrays->push_back(ta);
+}
+
+void WriteTokens(j_compress_ptr cinfo, const Token* tokens, size_t num_tokens,
+                 const HuffmanCodeTable* huff_tables, const int* context_map,
+                 JpegBitWriter* bw) {
+  size_t cycle_len = bw->len / 8;
+  size_t next_cycle = cycle_len;
+  for (size_t i = 0; i < num_tokens; ++i) {
+    Token t = tokens[i];
+    int nbits = t.symbol & 0xf;
+    WriteSymbol(t.symbol, &huff_tables[context_map[t.histo_idx]], bw);
+    if (nbits > 0) {
+      WriteBits(bw, nbits, t.bits);
+    }
+    if (--next_cycle == 0) {
+      if (!EmptyBitWriterBuffer(bw)) {
+        JPEGLI_ERROR("Output suspension is not supported in finish_compress");
+      }
+      next_cycle = cycle_len;
+    }
+  }
+}
+
+void BuildHistograms(const Token* tokens, size_t num_tokens,
+                     Histogram* histograms) {
+  for (size_t j = 0; j < num_tokens; ++j) {
+    Token t = tokens[j];
+    ++histograms[t.histo_idx].count[t.symbol];
+  }
+}
+
+void EncodeSingleScan(j_compress_ptr cinfo) {
+  std::vector<TokenArray> token_arrays;
+  ComputeTokens(cinfo, &token_arrays);
+  Histogram histograms[8] = {};
+  for (size_t i = 0; i < token_arrays.size(); ++i) {
+    Token* tokens = token_arrays[i].tokens;
+    size_t num_tokens = token_arrays[i].num_tokens;
+    BuildHistograms(tokens, num_tokens, histograms);
+  }
+  JpegClusteredHistograms dc_clusters;
+  ClusterJpegHistograms(histograms, 4, &dc_clusters);
+  JpegClusteredHistograms ac_clusters;
+  ClusterJpegHistograms(histograms + 4, 4, &ac_clusters);
+
+  JPEGHuffmanCode* huffman_codes =
+      Allocate<JPEGHuffmanCode>(cinfo, 8, JPOOL_IMAGE);
+  size_t num_huffman_codes = 0;
+  for (size_t i = 0; i < dc_clusters.histograms.size(); ++i) {
+    AddJpegHuffmanCode(dc_clusters.histograms[i], i, huffman_codes,
+                       &num_huffman_codes);
+  }
+  for (size_t i = 0; i < ac_clusters.histograms.size(); ++i) {
+    AddJpegHuffmanCode(ac_clusters.histograms[i], 0x10 + i, huffman_codes,
+                       &num_huffman_codes);
+  }
+
+  bool is_baseline = true;
+  int context_map[8];
+  ScanCodingInfo sci = {};
+  for (int c = 0; c < cinfo->num_components; ++c) {
+    if (dc_clusters.histogram_indexes[c] > 1 ||
+        ac_clusters.histogram_indexes[c] > 1) {
+      is_baseline = false;
+    }
+    sci.dc_tbl_idx[c] = dc_clusters.histogram_indexes[c];
+    sci.ac_tbl_idx[c] = ac_clusters.histogram_indexes[c] + 4;
+    context_map[c] = sci.dc_tbl_idx[c];
+    context_map[c + 4] = sci.ac_tbl_idx[c];
+  }
+  sci.num_huffman_codes = num_huffman_codes;
+  memcpy(cinfo->master->scan_coding_info, &sci, sizeof(sci));
+  EncodeDQT(cinfo, /*write_all_tables=*/false, &is_baseline);
+  EncodeSOF(cinfo, is_baseline);
+  EncodeDHT(cinfo, huffman_codes, num_huffman_codes);
+  EncodeSOS(cinfo, 0);
+
+  JpegBitWriter* bw = &cinfo->master->bw;
+  HuffmanCodeTable* huff_tables = cinfo->master->huff_tables;
+  for (size_t i = 0; i < token_arrays.size(); ++i) {
+    Token* tokens = token_arrays[i].tokens;
+    size_t num_tokens = token_arrays[i].num_tokens;
+    WriteTokens(cinfo, tokens, num_tokens, huff_tables, context_map, bw);
+  }
+  JumpToByteBoundary(bw);
+  if (!EmptyBitWriterBuffer(bw)) {
+    JPEGLI_ERROR("Output suspension is not supported in finish_compress");
+  }
+  if (!bw->healthy) {
+    JPEGLI_ERROR("Failed to encode scan.");
+  }
+}
+
+HWY_EXPORT(WriteiMCURow);
+void WriteiMCURow(j_compress_ptr cinfo) {
+  HWY_DYNAMIC_DISPATCH(WriteiMCURow)(cinfo);
+}
+
+}  // namespace jpegli
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jpegli/bitstream.h b/third_party/jpeg-xl/lib/jpegli/bitstream.h
new file mode 100644
index 0000000000..18b6a09c29
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/bitstream.h
@@ -0,0 +1,37 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JPEGLI_BITSTREAM_H_
+#define LIB_JPEGLI_BITSTREAM_H_
+
+#include <initializer_list>
+#include <vector>
+
+#include "lib/jpegli/encode_internal.h"
+
+namespace jpegli {
+
+void WriteOutput(j_compress_ptr cinfo, const uint8_t* buf, size_t bufsize);
+void WriteOutput(j_compress_ptr cinfo, const std::vector<uint8_t>& bytes);
+void WriteOutput(j_compress_ptr cinfo, std::initializer_list<uint8_t> bytes);
+
+void EncodeAPP0(j_compress_ptr cinfo);
+void EncodeAPP14(j_compress_ptr cinfo);
+void EncodeSOF(j_compress_ptr cinfo, bool is_baseline);
+void EncodeSOS(j_compress_ptr cinfo, int scan_index);
+void EncodeDHT(j_compress_ptr cinfo, const JPEGHuffmanCode* huffman_codes,
+               size_t num_huffman_codes, bool pre_shifted = false);
+void EncodeDQT(j_compress_ptr cinfo, bool write_all_tables, bool* is_baseline);
+bool EncodeDRI(j_compress_ptr cinfo);
+
+bool EncodeScan(j_compress_ptr cinfo, int scan_index);
+
+void EncodeSingleScan(j_compress_ptr cinfo);
+
+void WriteiMCURow(j_compress_ptr cinfo);
+
+}  // namespace jpegli
+
+#endif  // LIB_JPEGLI_BITSTREAM_H_
diff --git a/third_party/jpeg-xl/lib/jpegli/color_quantize.cc b/third_party/jpeg-xl/lib/jpegli/color_quantize.cc
new file mode 100644
index 0000000000..1079c45c9f
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/color_quantize.cc
@@ -0,0 +1,533 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/color_quantize.h"
+
+#include <cmath>
+#include <limits>
+#include <unordered_map>
+
+#include "lib/jpegli/decode_internal.h"
+#include "lib/jpegli/error.h"
+
+namespace jpegli {
+
+namespace {
+
+static constexpr int kNumColorCellBits[kMaxComponents] = {3, 4, 3, 3};
+static constexpr int kCompW[kMaxComponents] = {2, 3, 1, 1};
+
+int Pow(int a, int b) {
+  int r = 1;
+  for (int i = 0; i < b; ++i) {
+    r *= a;
+  }
+  return r;
+}
+
+int ComponentOrder(j_decompress_ptr cinfo, int i) {
+  if (cinfo->out_color_components == 3) {
+    return i < 2 ? 1 - i : i;
+  }
+  return i;
+}
+
+int GetColorComponent(int i, int N) {
+  return (i * 255 + (N - 1) / 2) / (N - 1);
+}
+
+}  // namespace
+
+void ChooseColorMap1Pass(j_decompress_ptr cinfo) {
+  jpeg_decomp_master* m = cinfo->master;
+  int components = cinfo->out_color_components;
+  int desired = std::min(cinfo->desired_number_of_colors, 256);
+  int num = 1;
+  while (Pow(num + 1, components) <= desired) {
+    ++num;
+  }
+  if (num == 1) {
+    JPEGLI_ERROR("Too few colors (%d) in requested colormap", desired);
+  }
+  int actual = Pow(num, components);
+  for (int i = 0; i < components; ++i) {
+    m->num_colors_[i] = num;
+  }
+  while (actual < desired) {
+    int total = actual;
+    for (int i = 0; i < components; ++i) {
+      int c = ComponentOrder(cinfo, i);
+      int new_total = (actual / m->num_colors_[c]) * (m->num_colors_[c] + 1);
+      if (new_total <= desired) {
+        ++m->num_colors_[c];
+        actual = new_total;
+      }
+    }
+    if (actual == total) {
+      break;
+    }
+  }
+  cinfo->actual_number_of_colors = actual;
+  cinfo->colormap = (*cinfo->mem->alloc_sarray)(
+      reinterpret_cast<j_common_ptr>(cinfo), JPOOL_IMAGE, actual, components);
+  int next_color[kMaxComponents] = {0};
+  for (int i = 0; i < actual; ++i) {
+    for (int c = 0; c < components; ++c) {
+      cinfo->colormap[c][i] =
+          GetColorComponent(next_color[c], m->num_colors_[c]);
+    }
+    int c = components - 1;
+    while (c > 0 && next_color[c] + 1 == m->num_colors_[c]) {
+      next_color[c--] = 0;
+    }
+    ++next_color[c];
+  }
+  if (!m->colormap_lut_) {
+    m->colormap_lut_ = Allocate<uint8_t>(cinfo, components * 256, JPOOL_IMAGE);
+  }
+  int stride = actual;
+  for (int c = 0; c < components; ++c) {
+    int N = m->num_colors_[c];
+    stride /= N;
+    for (int i = 0; i < 256; ++i) {
+      int index = ((2 * i - 1) * (N - 1) + 254) / 510;
+      m->colormap_lut_[c * 256 + i] = index * stride;
+    }
+  }
+}
+
+namespace {
+
+// 2^13 priority levels for the PQ seems to be a good compromise between
+// accuracy, running time and stack space usage.
+static const int kMaxPriority = 1 << 13;
+static const int kMaxLevel = 3;
+
+// This function is used in the multi-resolution grid to be able to compute
+// the keys for the different resolutions by just shifting the first key.
+inline int InterlaceBitsRGB(uint8_t r, uint8_t g, uint8_t b) {
+  int z = 0;
+  for (int i = 0; i < 7; ++i) {
+    z += (r >> 5) & 4;
+    z += (g >> 6) & 2;
+    z += (b >> 7);
+    z <<= 3;
+    r <<= 1;
+    g <<= 1;
+    b <<= 1;
+  }
+  z += (r >> 5) & 4;
+  z += (g >> 6) & 2;
+  z += (b >> 7);
+  return z;
+}
+
+// This function will compute the actual priorities of the colors based on
+// the current distance from the palette, the population count and the signals
+// from the multi-resolution grid.
+inline int Priority(int d, int n, const int* density, const int* radius) {
+  int p = d * n;
+  for (int level = 0; level < kMaxLevel; ++level) {
+    if (d > radius[level]) {
+      p += density[level] * (d - radius[level]);
+    }
+  }
+  return std::min(kMaxPriority - 1, p >> 4);
+}
+
+inline int ColorIntQuadDistanceRGB(uint8_t r1, uint8_t g1, uint8_t b1,
+                                   uint8_t r2, uint8_t g2, uint8_t b2) {
+  // weights for the intensity calculation
+  static constexpr int ired = 2;
+  static constexpr int igreen = 5;
+  static constexpr int iblue = 1;
+  // normalization factor for the intensity calculation (2^ishift)
+  static constexpr int ishift = 3;
+  const int rd = r1 - r2;
+  const int gd = g1 - g2;
+  const int bd = b1 - b2;
+  const int id = ired * rd + igreen * gd + iblue * bd;
+  return rd * rd + gd * gd + bd * bd + ((id * id) >> (2 * ishift));
+}
+
+inline int ScaleQuadDistanceRGB(int d) {
+  return static_cast<int>(sqrt(d * 0.25) + 0.5);
+}
+
+// The function updates the minimal distances, the clustering and the
+// quantization error after the insertion of the new color into the palette.
+void AddToRGBPalette(const uint8_t* red, const uint8_t* green,
+                     const uint8_t* blue,
+                     const int* count,  // histogram of colors
+                     const int index,   // index of color to be added
+                     const int k,       // size of current palette
+                     const int n,       // number of colors
+                     int* dist,         // array of distances from palette
+                     int* cluster,      // mapping of color indices to palette
+                     int* center,       // the inverse mapping
+                     int64_t* error) {  // measure of the quantization error
+  center[k] = index;
+  cluster[index] = k;
+  *error -=
+      static_cast<int64_t>(dist[index]) * static_cast<int64_t>(count[index]);
+  dist[index] = 0;
+  for (int j = 0; j < n; ++j) {
+    if (dist[j] > 0) {
+      const int d = ColorIntQuadDistanceRGB(
+          red[index], green[index], blue[index], red[j], green[j], blue[j]);
+      if (d < dist[j]) {
+        *error += static_cast<int64_t>((d - dist[j])) *
+                  static_cast<int64_t>(count[j]);
+        dist[j] = d;
+        cluster[j] = k;
+      }
+    }
+  }
+}
+
+struct RGBPixelHasher {
+  // A quick but good-enough hash to get 24 bits of RGB into the lower 12 bits.
+  size_t operator()(uint32_t a) const { return (a ^ (a >> 12)) * 0x9e3779b9; }
+};
+
+struct WangHasher {
+  // Thomas Wang's Hash.  Nearly perfect and still quite fast.  Above (for
+  // pixels) we use a simpler hash because the number of hash calls is
+  // proportional to the number of pixels and that hash dominates; we want the
+  // cost to be minimal and we start with a large table.  We can use a better
+  // hash for the histogram since the number of hash calls is proportional to
+  // the number of unique colors in the image, which is hopefully much smaller.
+  // Note that the difference is slight; e.g. replacing RGBPixelHasher with
+  // WangHasher only slows things down by 5% on an Opteron.
+  size_t operator()(uint32_t a) const {
+    a = (a ^ 61) ^ (a >> 16);
+    a = a + (a << 3);
+    a = a ^ (a >> 4);
+    a = a * 0x27d4eb2d;
+    a = a ^ (a >> 15);
+    return a;
+  }
+};
+
+// Build an index of all the different colors in the input
+// image. To do this we map the 24 bit RGB representation of the colors
+// to a unique integer index assigned to the different colors in order of
+// appearence in the image.  Return the number of unique colors found.
+// The colors are pre-quantized to 3 * 6 bits precision.
+static int BuildRGBColorIndex(const uint8_t* const image, int const num_pixels,
+                              int* const count, uint8_t* const red,
+                              uint8_t* const green, uint8_t* const blue) {
+  // Impossible because rgb are in the low 24 bits, and the upper 8 bits is 0.
+  const uint32_t impossible_pixel_value = 0x10000000;
+  std::unordered_map<uint32_t, int, RGBPixelHasher> index_map(1 << 12);
+  std::unordered_map<uint32_t, int, RGBPixelHasher>::iterator index_map_lookup;
+  const uint8_t* imagep = &image[0];
+  uint32_t prev_pixel = impossible_pixel_value;
+  int index = 0;
+  int n = 0;
+  for (int i = 0; i < num_pixels; ++i) {
+    uint8_t r = ((*imagep++) & 0xfc) + 2;
+    uint8_t g = ((*imagep++) & 0xfc) + 2;
+    uint8_t b = ((*imagep++) & 0xfc) + 2;
+    uint32_t pixel = (b << 16) | (g << 8) | r;
+    if (pixel != prev_pixel) {
+      prev_pixel = pixel;
+      index_map_lookup = index_map.find(pixel);
+      if (index_map_lookup != index_map.end()) {
+        index = index_map_lookup->second;
+      } else {
+        index_map[pixel] = index = n++;
+        red[index] = r;
+        green[index] = g;
+        blue[index] = b;
+      }
+    }
+    ++count[index];
+  }
+  return n;
+}
+
+}  // namespace
+
+void ChooseColorMap2Pass(j_decompress_ptr cinfo) {
+  if (cinfo->out_color_space != JCS_RGB) {
+    JPEGLI_ERROR("Two-pass quantizer must use RGB output color space.");
+  }
+  jpeg_decomp_master* m = cinfo->master;
+  const size_t num_pixels = cinfo->output_width * cinfo->output_height;
+  const int max_color_count = std::max<size_t>(num_pixels, 1u << 18);
+  const int max_palette_size = cinfo->desired_number_of_colors;
+  std::unique_ptr<uint8_t[]> red(new uint8_t[max_color_count]);
+  std::unique_ptr<uint8_t[]> green(new uint8_t[max_color_count]);
+  std::unique_ptr<uint8_t[]> blue(new uint8_t[max_color_count]);
+  std::vector<int> count(max_color_count, 0);
+  // number of colors
+  int n = BuildRGBColorIndex(m->pixels_, num_pixels, &count[0], &red[0],
+                             &green[0], &blue[0]);
+
+  std::vector<int> dist(n, std::numeric_limits<int>::max());
+  std::vector<int> cluster(n);
+  std::vector<bool> in_palette(n, false);
+  int center[256];
+  int k = 0;  // palette size
+  const int count_threshold = (num_pixels * 4) / max_palette_size;
+  static constexpr int kAveragePixelErrorThreshold = 1;
+  const int64_t error_threshold = num_pixels * kAveragePixelErrorThreshold;
+  int64_t error = 0;  // quantization error
+
+  int max_count = 0;
+  int winner = 0;
+  for (int i = 0; i < n; ++i) {
+    if (count[i] > max_count) {
+      max_count = count[i];
+      winner = i;
+    }
+    if (!in_palette[i] && count[i] > count_threshold) {
+      AddToRGBPalette(&red[0], &green[0], &blue[0], &count[0], i, k++, n,
+                      &dist[0], &cluster[0], &center[0], &error);
+      in_palette[i] = true;
+    }
+  }
+  if (k == 0) {
+    AddToRGBPalette(&red[0], &green[0], &blue[0], &count[0], winner, k++, n,
+                    &dist[0], &cluster[0], &center[0], &error);
+    in_palette[winner] = true;
+  }
+
+  // Calculation of the multi-resolution density grid.
+  std::vector<int> density(n * kMaxLevel);
+  std::vector<int> radius(n * kMaxLevel);
+  std::unordered_map<uint32_t, int, WangHasher> histogram[kMaxLevel];
+  for (int level = 0; level < kMaxLevel; ++level) {
+    // This value is never used because key = InterlaceBitsRGB(...) >> 6
+  }
+
+  for (int i = 0; i < n; ++i) {
+    if (!in_palette[i]) {
+      const int key = InterlaceBitsRGB(red[i], green[i], blue[i]) >> 6;
+      for (int level = 0; level < kMaxLevel; ++level) {
+        histogram[level][key >> (3 * level)] += count[i];
+      }
+    }
+  }
+  for (int i = 0; i < n; ++i) {
+    if (!in_palette[i]) {
+      for (int level = 0; level < kMaxLevel; ++level) {
+        const int mask = (4 << level) - 1;
+        const int rd = std::max(red[i] & mask, mask - (red[i] & mask));
+        const int gd = std::max(green[i] & mask, mask - (green[i] & mask));
+        const int bd = std::max(blue[i] & mask, mask - (blue[i] & mask));
+        radius[i * kMaxLevel + level] =
+            ScaleQuadDistanceRGB(ColorIntQuadDistanceRGB(0, 0, 0, rd, gd, bd));
+      }
+      const int key = InterlaceBitsRGB(red[i], green[i], blue[i]) >> 6;
+      if (kMaxLevel > 0) {
+        density[i * kMaxLevel] = histogram[0][key] - count[i];
+      }
+      for (int level = 1; level < kMaxLevel; ++level) {
+        density[i * kMaxLevel + level] =
+            (histogram[level][key >> (3 * level)] -
+             histogram[level - 1][key >> (3 * level - 3)]);
+      }
+    }
+  }
+
+  // Calculate the initial error now that the palette has been initialized.
+  error = 0;
+  for (int i = 0; i < n; ++i) {
+    error += static_cast<int64_t>(dist[i]) * static_cast<int64_t>(count[i]);
+  }
+
+  std::unique_ptr<std::vector<int>[]> bucket_array(
+      new std::vector<int>[kMaxPriority]);
+  int top_priority = -1;
+  for (int i = 0; i < n; ++i) {
+    if (!in_palette[i]) {
+      int priority = Priority(ScaleQuadDistanceRGB(dist[i]), count[i],
+                              &density[i * kMaxLevel], &radius[i * kMaxLevel]);
+      bucket_array[priority].push_back(i);
+      top_priority = std::max(priority, top_priority);
+    }
+  }
+  double error_accum = 0;
+  while (top_priority >= 0 && k < max_palette_size) {
+    if (error < error_threshold) {
+      error_accum += std::min(error_threshold, error_threshold - error);
+      if (error_accum >= 10 * error_threshold) {
+        break;
+      }
+    }
+    int i = bucket_array[top_priority].back();
+    int priority = Priority(ScaleQuadDistanceRGB(dist[i]), count[i],
+                            &density[i * kMaxLevel], &radius[i * kMaxLevel]);
+    if (priority < top_priority) {
+      bucket_array[priority].push_back(i);
+    } else {
+      AddToRGBPalette(&red[0], &green[0], &blue[0], &count[0], i, k++, n,
+                      &dist[0], &cluster[0], &center[0], &error);
+    }
+    bucket_array[top_priority].pop_back();
+    while (top_priority >= 0 && bucket_array[top_priority].empty()) {
+      --top_priority;
+    }
+  }
+
+  cinfo->actual_number_of_colors = k;
+  cinfo->colormap = (*cinfo->mem->alloc_sarray)(
+      reinterpret_cast<j_common_ptr>(cinfo), JPOOL_IMAGE, k, 3);
+  for (int i = 0; i < k; ++i) {
+    int index = center[i];
+    cinfo->colormap[0][i] = red[index];
+    cinfo->colormap[1][i] = green[index];
+    cinfo->colormap[2][i] = blue[index];
+  }
+}
+
+namespace {
+
+void FindCandidatesForCell(j_decompress_ptr cinfo, int ncomp, int cell[],
+                           std::vector<uint8_t>* candidates) {
+  int cell_min[kMaxComponents];
+  int cell_max[kMaxComponents];
+  int cell_center[kMaxComponents];
+  for (int c = 0; c < ncomp; ++c) {
+    cell_min[c] = cell[c] << (8 - kNumColorCellBits[c]);
+    cell_max[c] = cell_min[c] + (1 << (8 - kNumColorCellBits[c])) - 1;
+    cell_center[c] = (cell_min[c] + cell_max[c]) >> 1;
+  }
+  int min_maxdist = std::numeric_limits<int>::max();
+  int mindist[256];
+  for (int i = 0; i < cinfo->actual_number_of_colors; ++i) {
+    int dmin = 0;
+    int dmax = 0;
+    for (int c = 0; c < ncomp; ++c) {
+      int palette_c = cinfo->colormap[c][i];
+      int dminc = 0, dmaxc;
+      if (palette_c < cell_min[c]) {
+        dminc = cell_min[c] - palette_c;
+        dmaxc = cell_max[c] - palette_c;
+      } else if (palette_c > cell_max[c]) {
+        dminc = palette_c - cell_max[c];
+        dmaxc = palette_c - cell_min[c];
+      } else if (palette_c > cell_center[c]) {
+        dmaxc = palette_c - cell_min[c];
+      } else {
+        dmaxc = cell_max[c] - palette_c;
+      }
+      dminc *= kCompW[c];
+      dmaxc *= kCompW[c];
+      dmin += dminc * dminc;
+      dmax += dmaxc * dmaxc;
+    }
+    mindist[i] = dmin;
+    min_maxdist = std::min(dmax, min_maxdist);
+  }
+  for (int i = 0; i < cinfo->actual_number_of_colors; ++i) {
+    if (mindist[i] < min_maxdist) {
+      candidates->push_back(i);
+    }
+  }
+}
+
+}  // namespace
+
+void CreateInverseColorMap(j_decompress_ptr cinfo) {
+  jpeg_decomp_master* m = cinfo->master;
+  int ncomp = cinfo->out_color_components;
+  int num_cells = 1;
+  for (int c = 0; c < ncomp; ++c) {
+    num_cells *= (1 << kNumColorCellBits[c]);
+  }
+  m->candidate_lists_.resize(num_cells);
+
+  int next_cell[kMaxComponents] = {0};
+  for (int i = 0; i < num_cells; ++i) {
+    m->candidate_lists_[i].clear();
+    FindCandidatesForCell(cinfo, ncomp, next_cell, &m->candidate_lists_[i]);
+    int c = ncomp - 1;
+    while (c > 0 && next_cell[c] + 1 == (1 << kNumColorCellBits[c])) {
+      next_cell[c--] = 0;
+    }
+    ++next_cell[c];
+  }
+  m->regenerate_inverse_colormap_ = false;
+}
+
+int LookupColorIndex(j_decompress_ptr cinfo, JSAMPLE* pixel) {
+  jpeg_decomp_master* m = cinfo->master;
+  int num_channels = cinfo->out_color_components;
+  int index = 0;
+  if (m->quant_mode_ == 1) {
+    for (int c = 0; c < num_channels; ++c) {
+      index += m->colormap_lut_[c * 256 + pixel[c]];
+    }
+  } else {
+    size_t cell_idx = 0;
+    size_t stride = 1;
+    for (int c = num_channels - 1; c >= 0; --c) {
+      cell_idx += (pixel[c] >> (8 - kNumColorCellBits[c])) * stride;
+      stride <<= kNumColorCellBits[c];
+    }
+    JXL_ASSERT(cell_idx < m->candidate_lists_.size());
+    int mindist = std::numeric_limits<int>::max();
+    const auto& candidates = m->candidate_lists_[cell_idx];
+    for (uint8_t i : candidates) {
+      int dist = 0;
+      for (int c = 0; c < num_channels; ++c) {
+        int d = (cinfo->colormap[c][i] - pixel[c]) * kCompW[c];
+        dist += d * d;
+      }
+      if (dist < mindist) {
+        mindist = dist;
+        index = i;
+      }
+    }
+  }
+  JXL_ASSERT(index < cinfo->actual_number_of_colors);
+  return index;
+}
+
+void CreateOrderedDitherTables(j_decompress_ptr cinfo) {
+  jpeg_decomp_master* m = cinfo->master;
+  static constexpr size_t kDitherSize = 4;
+  static constexpr size_t kDitherMask = kDitherSize - 1;
+  static constexpr float kBaseDitherMatrix[] = {
+      0,  8,  2,  10,  //
+      12, 4,  14, 6,   //
+      3,  11, 1,  9,   //
+      15, 7,  13, 5,   //
+  };
+  m->dither_size_ = kDitherSize;
+  m->dither_mask_ = kDitherMask;
+  size_t ncells = m->dither_size_ * m->dither_size_;
+  for (int c = 0; c < cinfo->out_color_components; ++c) {
+    float spread = 1.0f / (m->num_colors_[c] - 1);
+    float mul = spread / ncells;
+    float offset = 0.5f * spread;
+    if (m->dither_[c] == nullptr) {
+      m->dither_[c] = Allocate<float>(cinfo, ncells, JPOOL_IMAGE_ALIGNED);
+    }
+    for (size_t idx = 0; idx < ncells; ++idx) {
+      m->dither_[c][idx] = kBaseDitherMatrix[idx] * mul - offset;
+    }
+  }
+}
+
+void InitFSDitherState(j_decompress_ptr cinfo) {
+  jpeg_decomp_master* m = cinfo->master;
+  for (int c = 0; c < cinfo->out_color_components; ++c) {
+    if (m->error_row_[c] == nullptr) {
+      m->error_row_[c] =
+          Allocate<float>(cinfo, cinfo->output_width, JPOOL_IMAGE_ALIGNED);
+      m->error_row_[c + kMaxComponents] =
+          Allocate<float>(cinfo, cinfo->output_width, JPOOL_IMAGE_ALIGNED);
+    }
+    memset(m->error_row_[c], 0.0, cinfo->output_width * sizeof(float));
+    memset(m->error_row_[c + kMaxComponents], 0.0,
+           cinfo->output_width * sizeof(float));
+  }
+}
+
+}  // namespace jpegli
diff --git a/third_party/jpeg-xl/lib/jpegli/color_quantize.h b/third_party/jpeg-xl/lib/jpegli/color_quantize.h
new file mode 100644
index 0000000000..36a92d2f77
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/color_quantize.h
@@ -0,0 +1,30 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JPEGLI_COLOR_QUANTIZE_H_
+#define LIB_JPEGLI_COLOR_QUANTIZE_H_
+
+/* clang-format off */
+#include <stdio.h>
+#include <jpeglib.h>
+/* clang-format on */
+
+namespace jpegli {
+
+void ChooseColorMap1Pass(j_decompress_ptr cinfo);
+
+void ChooseColorMap2Pass(j_decompress_ptr cinfo);
+
+void CreateInverseColorMap(j_decompress_ptr cinfo);
+
+void CreateOrderedDitherTables(j_decompress_ptr cinfo);
+
+void InitFSDitherState(j_decompress_ptr cinfo);
+
+int LookupColorIndex(j_decompress_ptr cinfo, JSAMPLE* pixel);
+
+}  // namespace jpegli
+
+#endif  // LIB_JPEGLI_COLOR_QUANTIZE_H_
diff --git a/third_party/jpeg-xl/lib/jpegli/color_transform.cc b/third_party/jpeg-xl/lib/jpegli/color_transform.cc
new file mode 100644
index 0000000000..020a6fd80c
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/color_transform.cc
@@ -0,0 +1,281 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/color_transform.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jpegli/color_transform.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jpegli/decode_internal.h"
+#include "lib/jpegli/encode_internal.h"
+#include "lib/jpegli/error.h"
+#include "lib/jxl/base/compiler_specific.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jpegli {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Add;
+using hwy::HWY_NAMESPACE::Div;
+using hwy::HWY_NAMESPACE::Mul;
+using hwy::HWY_NAMESPACE::MulAdd;
+using hwy::HWY_NAMESPACE::Sub;
+
+void YCbCrToRGB(float* row[kMaxComponents], size_t xsize) {
+  const HWY_CAPPED(float, 8) df;
+  float* JXL_RESTRICT row0 = row[0];
+  float* JXL_RESTRICT row1 = row[1];
+  float* JXL_RESTRICT row2 = row[2];
+
+  // Full-range BT.601 as defined by JFIF Clause 7:
+  // https://www.itu.int/rec/T-REC-T.871-201105-I/en
+  const auto crcr = Set(df, 1.402f);
+  const auto cgcb = Set(df, -0.114f * 1.772f / 0.587f);
+  const auto cgcr = Set(df, -0.299f * 1.402f / 0.587f);
+  const auto cbcb = Set(df, 1.772f);
+
+  for (size_t x = 0; x < xsize; x += Lanes(df)) {
+    const auto y_vec = Load(df, row0 + x);
+    const auto cb_vec = Load(df, row1 + x);
+    const auto cr_vec = Load(df, row2 + x);
+    const auto r_vec = MulAdd(crcr, cr_vec, y_vec);
+    const auto g_vec = MulAdd(cgcr, cr_vec, MulAdd(cgcb, cb_vec, y_vec));
+    const auto b_vec = MulAdd(cbcb, cb_vec, y_vec);
+    Store(r_vec, df, row0 + x);
+    Store(g_vec, df, row1 + x);
+    Store(b_vec, df, row2 + x);
+  }
+}
+
+void YCCKToCMYK(float* row[kMaxComponents], size_t xsize) {
+  const HWY_CAPPED(float, 8) df;
+  float* JXL_RESTRICT row0 = row[0];
+  float* JXL_RESTRICT row1 = row[1];
+  float* JXL_RESTRICT row2 = row[2];
+  YCbCrToRGB(row, xsize);
+  const auto offset = Set(df, -1.0f / 255.0f);
+  for (size_t x = 0; x < xsize; x += Lanes(df)) {
+    Store(Sub(offset, Load(df, row0 + x)), df, row0 + x);
+    Store(Sub(offset, Load(df, row1 + x)), df, row1 + x);
+    Store(Sub(offset, Load(df, row2 + x)), df, row2 + x);
+  }
+}
+
+void RGBToYCbCr(float* row[kMaxComponents], size_t xsize) {
+  const HWY_CAPPED(float, 8) df;
+  float* JXL_RESTRICT row0 = row[0];
+  float* JXL_RESTRICT row1 = row[1];
+  float* JXL_RESTRICT row2 = row[2];
+  // Full-range BT.601 as defined by JFIF Clause 7:
+  // https://www.itu.int/rec/T-REC-T.871-201105-I/en
+  const auto c128 = Set(df, 128.0f);
+  const auto kR = Set(df, 0.299f);  // NTSC luma
+  const auto kG = Set(df, 0.587f);
+  const auto kB = Set(df, 0.114f);
+  const auto kAmpR = Set(df, 0.701f);
+  const auto kAmpB = Set(df, 0.886f);
+  const auto kDiffR = Add(kAmpR, kR);
+  const auto kDiffB = Add(kAmpB, kB);
+  const auto kNormR = Div(Set(df, 1.0f), (Add(kAmpR, Add(kG, kB))));
+  const auto kNormB = Div(Set(df, 1.0f), (Add(kR, Add(kG, kAmpB))));
+
+  for (size_t x = 0; x < xsize; x += Lanes(df)) {
+    const auto r = Load(df, row0 + x);
+    const auto g = Load(df, row1 + x);
+    const auto b = Load(df, row2 + x);
+    const auto r_base = Mul(r, kR);
+    const auto r_diff = Mul(r, kDiffR);
+    const auto g_base = Mul(g, kG);
+    const auto b_base = Mul(b, kB);
+    const auto b_diff = Mul(b, kDiffB);
+    const auto y_base = Add(r_base, Add(g_base, b_base));
+    const auto cb_vec = MulAdd(Sub(b_diff, y_base), kNormB, c128);
+    const auto cr_vec = MulAdd(Sub(r_diff, y_base), kNormR, c128);
+    Store(y_base, df, row0 + x);
+    Store(cb_vec, df, row1 + x);
+    Store(cr_vec, df, row2 + x);
+  }
+}
+
+void CMYKToYCCK(float* row[kMaxComponents], size_t xsize) {
+  const HWY_CAPPED(float, 8) df;
+  float* JXL_RESTRICT row0 = row[0];
+  float* JXL_RESTRICT row1 = row[1];
+  float* JXL_RESTRICT row2 = row[2];
+  const auto unity = Set(df, 255.0f);
+  for (size_t x = 0; x < xsize; x += Lanes(df)) {
+    Store(Sub(unity, Load(df, row0 + x)), df, row0 + x);
+    Store(Sub(unity, Load(df, row1 + x)), df, row1 + x);
+    Store(Sub(unity, Load(df, row2 + x)), df, row2 + x);
+  }
+  RGBToYCbCr(row, xsize);
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jpegli
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jpegli {
+
+HWY_EXPORT(CMYKToYCCK);
+HWY_EXPORT(YCCKToCMYK);
+HWY_EXPORT(YCbCrToRGB);
+HWY_EXPORT(RGBToYCbCr);
+
+bool CheckColorSpaceComponents(int num_components, J_COLOR_SPACE colorspace) {
+  switch (colorspace) {
+    case JCS_GRAYSCALE:
+      return num_components == 1;
+    case JCS_RGB:
+    case JCS_YCbCr:
+    case JCS_EXT_RGB:
+    case JCS_EXT_BGR:
+      return num_components == 3;
+    case JCS_CMYK:
+    case JCS_YCCK:
+    case JCS_EXT_RGBX:
+    case JCS_EXT_BGRX:
+    case JCS_EXT_XBGR:
+    case JCS_EXT_XRGB:
+    case JCS_EXT_RGBA:
+    case JCS_EXT_BGRA:
+    case JCS_EXT_ABGR:
+    case JCS_EXT_ARGB:
+      return num_components == 4;
+    default:
+      // Unrecognized colorspaces can have any number of channels, since no
+      // color transform will be performed on them.
+      return true;
+  }
+}
+
+void NullTransform(float* row[kMaxComponents], size_t len) {}
+
+void GrayscaleToRGB(float* row[kMaxComponents], size_t len) {
+  memcpy(row[1], row[0], len * sizeof(row[1][0]));
+  memcpy(row[2], row[0], len * sizeof(row[2][0]));
+}
+
+void GrayscaleToYCbCr(float* row[kMaxComponents], size_t len) {
+  memset(row[1], 0, len * sizeof(row[1][0]));
+  memset(row[2], 0, len * sizeof(row[2][0]));
+}
+
+void ChooseColorTransform(j_compress_ptr cinfo) {
+  jpeg_comp_master* m = cinfo->master;
+  if (!CheckColorSpaceComponents(cinfo->input_components,
+                                 cinfo->in_color_space)) {
+    JPEGLI_ERROR("Invalid number of input components %d for colorspace %d",
+                 cinfo->input_components, cinfo->in_color_space);
+  }
+  if (!CheckColorSpaceComponents(cinfo->num_components,
+                                 cinfo->jpeg_color_space)) {
+    JPEGLI_ERROR("Invalid number of components %d for colorspace %d",
+                 cinfo->num_components, cinfo->jpeg_color_space);
+  }
+  if (cinfo->jpeg_color_space == cinfo->in_color_space) {
+    if (cinfo->num_components != cinfo->input_components) {
+      JPEGLI_ERROR("Input/output components mismatch:  %d vs %d",
+                   cinfo->input_components, cinfo->num_components);
+    }
+    // No color transform requested.
+    m->color_transform = NullTransform;
+    return;
+  }
+
+  if (cinfo->in_color_space == JCS_RGB && m->xyb_mode) {
+    JPEGLI_ERROR("Color transform on XYB colorspace is not supported.");
+  }
+
+  m->color_transform = nullptr;
+  if (cinfo->jpeg_color_space == JCS_GRAYSCALE) {
+    if (cinfo->in_color_space == JCS_RGB) {
+      m->color_transform = HWY_DYNAMIC_DISPATCH(RGBToYCbCr);
+    } else if (cinfo->in_color_space == JCS_YCbCr ||
+               cinfo->in_color_space == JCS_YCCK) {
+      // Since the first luminance channel is the grayscale version of the
+      // image, nothing to do here
+      m->color_transform = NullTransform;
+    }
+  } else if (cinfo->jpeg_color_space == JCS_RGB) {
+    if (cinfo->in_color_space == JCS_GRAYSCALE) {
+      m->color_transform = GrayscaleToRGB;
+    }
+  } else if (cinfo->jpeg_color_space == JCS_YCbCr) {
+    if (cinfo->in_color_space == JCS_RGB) {
+      m->color_transform = HWY_DYNAMIC_DISPATCH(RGBToYCbCr);
+    } else if (cinfo->in_color_space == JCS_GRAYSCALE) {
+      m->color_transform = GrayscaleToYCbCr;
+    }
+  } else if (cinfo->jpeg_color_space == JCS_YCCK) {
+    if (cinfo->in_color_space == JCS_CMYK) {
+      m->color_transform = HWY_DYNAMIC_DISPATCH(CMYKToYCCK);
+    }
+  }
+
+  if (m->color_transform == nullptr) {
+    // TODO(szabadka) Support more color transforms.
+    JPEGLI_ERROR("Unsupported color transform %d -> %d", cinfo->in_color_space,
+                 cinfo->jpeg_color_space);
+  }
+}
+
+void ChooseColorTransform(j_decompress_ptr cinfo) {
+  jpeg_decomp_master* m = cinfo->master;
+  if (!CheckColorSpaceComponents(cinfo->out_color_components,
+                                 cinfo->out_color_space)) {
+    JPEGLI_ERROR("Invalid number of output components %d for colorspace %d",
+                 cinfo->out_color_components, cinfo->out_color_space);
+  }
+  if (!CheckColorSpaceComponents(cinfo->num_components,
+                                 cinfo->jpeg_color_space)) {
+    JPEGLI_ERROR("Invalid number of components %d for colorspace %d",
+                 cinfo->num_components, cinfo->jpeg_color_space);
+  }
+  if (cinfo->jpeg_color_space == cinfo->out_color_space) {
+    if (cinfo->num_components != cinfo->out_color_components) {
+      JPEGLI_ERROR("Input/output components mismatch:  %d vs %d",
+                   cinfo->num_components, cinfo->out_color_components);
+    }
+    // No color transform requested.
+    m->color_transform = NullTransform;
+    return;
+  }
+
+  m->color_transform = nullptr;
+  if (cinfo->jpeg_color_space == JCS_GRAYSCALE) {
+    if (cinfo->out_color_space == JCS_RGB) {
+      m->color_transform = GrayscaleToRGB;
+    }
+  } else if (cinfo->jpeg_color_space == JCS_RGB) {
+    if (cinfo->out_color_space == JCS_GRAYSCALE) {
+      m->color_transform = HWY_DYNAMIC_DISPATCH(RGBToYCbCr);
+    }
+  } else if (cinfo->jpeg_color_space == JCS_YCbCr) {
+    if (cinfo->out_color_space == JCS_RGB) {
+      m->color_transform = HWY_DYNAMIC_DISPATCH(YCbCrToRGB);
+    } else if (cinfo->out_color_space == JCS_GRAYSCALE) {
+      m->color_transform = NullTransform;
+    }
+  } else if (cinfo->jpeg_color_space == JCS_YCCK) {
+    if (cinfo->out_color_space == JCS_CMYK) {
+      m->color_transform = HWY_DYNAMIC_DISPATCH(YCCKToCMYK);
+    }
+  }
+
+  if (m->color_transform == nullptr) {
+    // TODO(szabadka) Support more color transforms.
+    JPEGLI_ERROR("Unsupported color transform %d -> %d",
+                 cinfo->jpeg_color_space, cinfo->out_color_space);
+  }
+}
+
+}  // namespace jpegli
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jpegli/color_transform.h b/third_party/jpeg-xl/lib/jpegli/color_transform.h
new file mode 100644
index 0000000000..27570858f7
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/color_transform.h
@@ -0,0 +1,24 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JPEGLI_COLOR_TRANSFORM_H_
+#define LIB_JPEGLI_COLOR_TRANSFORM_H_
+
+/* clang-format off */
+#include <stdio.h>
+#include <jpeglib.h>
+/* clang-format on */
+
+#include "lib/jxl/base/compiler_specific.h"
+
+namespace jpegli {
+
+void ChooseColorTransform(j_compress_ptr cinfo);
+
+void ChooseColorTransform(j_decompress_ptr cinfo);
+
+}  // namespace jpegli
+
+#endif  // LIB_JPEGLI_COLOR_TRANSFORM_H_
diff --git a/third_party/jpeg-xl/lib/jpegli/common.cc b/third_party/jpeg-xl/lib/jpegli/common.cc
new file mode 100644
index 0000000000..5f34372f3e
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/common.cc
@@ -0,0 +1,59 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/common.h"
+
+#include "lib/jpegli/decode_internal.h"
+#include "lib/jpegli/encode_internal.h"
+#include "lib/jpegli/memory_manager.h"
+
+void jpegli_abort(j_common_ptr cinfo) {
+  if (cinfo->mem == nullptr) return;
+  for (int pool_id = 0; pool_id < JPOOL_NUMPOOLS; ++pool_id) {
+    if (pool_id == JPOOL_PERMANENT) continue;
+    (*cinfo->mem->free_pool)(cinfo, pool_id);
+  }
+  if (cinfo->is_decompressor) {
+    cinfo->global_state = jpegli::kDecStart;
+  } else {
+    cinfo->global_state = jpegli::kEncStart;
+  }
+}
+
+void jpegli_destroy(j_common_ptr cinfo) {
+  if (cinfo->mem == nullptr) return;
+  (*cinfo->mem->self_destruct)(cinfo);
+  if (cinfo->is_decompressor) {
+    cinfo->global_state = jpegli::kDecNull;
+    delete reinterpret_cast<j_decompress_ptr>(cinfo)->master;
+  } else {
+    cinfo->global_state = jpegli::kEncNull;
+  }
+}
+
+JQUANT_TBL* jpegli_alloc_quant_table(j_common_ptr cinfo) {
+  JQUANT_TBL* table = jpegli::Allocate<JQUANT_TBL>(cinfo, 1);
+  table->sent_table = FALSE;
+  return table;
+}
+
+JHUFF_TBL* jpegli_alloc_huff_table(j_common_ptr cinfo) {
+  JHUFF_TBL* table = jpegli::Allocate<JHUFF_TBL>(cinfo, 1);
+  table->sent_table = FALSE;
+  return table;
+}
+
+int jpegli_bytes_per_sample(JpegliDataType data_type) {
+  switch (data_type) {
+    case JPEGLI_TYPE_UINT8:
+      return 1;
+    case JPEGLI_TYPE_UINT16:
+      return 2;
+    case JPEGLI_TYPE_FLOAT:
+      return 4;
+    default:
+      return 0;
+  }
+}
diff --git a/third_party/jpeg-xl/lib/jpegli/common.h b/third_party/jpeg-xl/lib/jpegli/common.h
new file mode 100644
index 0000000000..f46b751018
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/common.h
@@ -0,0 +1,67 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+//
+// This file conatins the C API of the common encoder/decoder part of libjpegli
+// library, which is based on the C API of libjpeg, with the function names
+// changed from jpeg_* to jpegli_*, while compressor and dempressor object
+// definitions are included directly from jpeglib.h
+//
+// Applications can use the libjpegli library in one of the following ways:
+//
+//  (1) Include jpegli/encode.h and/or jpegli/decode.h, update the function
+//      names of the API and link against libjpegli.
+//
+//  (2) Leave the application code unchanged, but replace the libjpeg.so library
+//      with the one built by this project that is API- and ABI-compatible with
+//      libjpeg-turbo's version of libjpeg.so.
+
+#ifndef LIB_JPEGLI_COMMON_H_
+#define LIB_JPEGLI_COMMON_H_
+
+/* clang-format off */
+#include <stdio.h>
+#include <jpeglib.h>
+/* clang-format on */
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+struct jpeg_error_mgr* jpegli_std_error(struct jpeg_error_mgr* err);
+
+void jpegli_abort(j_common_ptr cinfo);
+
+void jpegli_destroy(j_common_ptr cinfo);
+
+JQUANT_TBL* jpegli_alloc_quant_table(j_common_ptr cinfo);
+
+JHUFF_TBL* jpegli_alloc_huff_table(j_common_ptr cinfo);
+
+//
+// New API structs and functions that are not available in libjpeg
+//
+// NOTE: This part of the API is still experimental and will probably change in
+// the future.
+//
+
+typedef enum {
+  JPEGLI_TYPE_FLOAT = 0,
+  JPEGLI_TYPE_UINT8 = 2,
+  JPEGLI_TYPE_UINT16 = 3,
+} JpegliDataType;
+
+typedef enum {
+  JPEGLI_NATIVE_ENDIAN = 0,
+  JPEGLI_LITTLE_ENDIAN = 1,
+  JPEGLI_BIG_ENDIAN = 2,
+} JpegliEndianness;
+
+int jpegli_bytes_per_sample(JpegliDataType data_type);
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}  // extern "C"
+#endif
+
+#endif  // LIB_JPEGLI_COMMON_H_
diff --git a/third_party/jpeg-xl/lib/jpegli/common_internal.h b/third_party/jpeg-xl/lib/jpegli/common_internal.h
new file mode 100644
index 0000000000..248d3154e1
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/common_internal.h
@@ -0,0 +1,150 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JPEGLI_COMMON_INTERNAL_H_
+#define LIB_JPEGLI_COMMON_INTERNAL_H_
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include <algorithm>
+#include <hwy/aligned_allocator.h>
+
+#include "lib/jpegli/memory_manager.h"
+#include "lib/jpegli/simd.h"
+#include "lib/jxl/base/compiler_specific.h"  // for ssize_t
+#include "lib/jxl/base/status.h"             // for JXL_CHECK
+
+namespace jpegli {
+
+enum State {
+  kDecNull,
+  kDecStart,
+  kDecInHeader,
+  kDecHeaderDone,
+  kDecProcessMarkers,
+  kDecProcessScan,
+  kEncNull,
+  kEncStart,
+  kEncHeader,
+  kEncReadImage,
+  kEncWriteCoeffs,
+};
+
+template <typename T1, typename T2>
+constexpr inline T1 DivCeil(T1 a, T2 b) {
+  return (a + b - 1) / b;
+}
+
+template <typename T1, typename T2>
+constexpr inline T1 RoundUpTo(T1 a, T2 b) {
+  return DivCeil(a, b) * b;
+}
+
+constexpr size_t kDCTBlockSize = 64;
+// This is set to the same value as MAX_COMPS_IN_SCAN, because that is the
+// maximum number of channels the libjpeg-turbo decoder can decode.
+constexpr int kMaxComponents = 4;
+constexpr int kMaxQuantTables = 4;
+constexpr int kJpegPrecision = 8;
+constexpr int kMaxHuffmanTables = 4;
+constexpr size_t kJpegHuffmanMaxBitLength = 16;
+constexpr int kJpegHuffmanAlphabetSize = 256;
+constexpr int kJpegDCAlphabetSize = 12;
+constexpr int kMaxDHTMarkers = 512;
+constexpr int kMaxDimPixels = 65535;
+constexpr uint8_t kApp1 = 0xE1;
+constexpr uint8_t kApp2 = 0xE2;
+const uint8_t kIccProfileTag[12] = "ICC_PROFILE";
+const uint8_t kExifTag[6] = "Exif\0";
+const uint8_t kXMPTag[29] = "http://ns.adobe.com/xap/1.0/";
+
+/* clang-format off */
+constexpr uint32_t kJPEGNaturalOrder[80] = {
+  0,   1,  8, 16,  9,  2,  3, 10,
+  17, 24, 32, 25, 18, 11,  4,  5,
+  12, 19, 26, 33, 40, 48, 41, 34,
+  27, 20, 13,  6,  7, 14, 21, 28,
+  35, 42, 49, 56, 57, 50, 43, 36,
+  29, 22, 15, 23, 30, 37, 44, 51,
+  58, 59, 52, 45, 38, 31, 39, 46,
+  53, 60, 61, 54, 47, 55, 62, 63,
+  // extra entries for safety in decoder
+  63, 63, 63, 63, 63, 63, 63, 63,
+  63, 63, 63, 63, 63, 63, 63, 63
+};
+
+constexpr uint32_t kJPEGZigZagOrder[64] = {
+  0,   1,  5,  6, 14, 15, 27, 28,
+  2,   4,  7, 13, 16, 26, 29, 42,
+  3,   8, 12, 17, 25, 30, 41, 43,
+  9,  11, 18, 24, 31, 40, 44, 53,
+  10, 19, 23, 32, 39, 45, 52, 54,
+  20, 22, 33, 38, 46, 51, 55, 60,
+  21, 34, 37, 47, 50, 56, 59, 61,
+  35, 36, 48, 49, 57, 58, 62, 63
+};
+/* clang-format on */
+
+template <typename T>
+class RowBuffer {
+ public:
+  template <typename CInfoType>
+  void Allocate(CInfoType cinfo, size_t num_rows, size_t rowsize) {
+    size_t vec_size = std::max(VectorSize(), sizeof(T));
+    JXL_CHECK(vec_size % sizeof(T) == 0);
+    size_t alignment = std::max<size_t>(HWY_ALIGNMENT, vec_size);
+    size_t min_memstride = alignment + rowsize * sizeof(T) + vec_size;
+    size_t memstride = RoundUpTo(min_memstride, alignment);
+    xsize_ = rowsize;
+    ysize_ = num_rows;
+    stride_ = memstride / sizeof(T);
+    offset_ = alignment / sizeof(T);
+    data_ = ::jpegli::Allocate<T>(cinfo, ysize_ * stride_, JPOOL_IMAGE_ALIGNED);
+  }
+
+  T* Row(ssize_t y) const {
+    return &data_[((ysize_ + y) % ysize_) * stride_ + offset_];
+  }
+
+  size_t xsize() const { return xsize_; };
+  size_t ysize() const { return ysize_; };
+  size_t stride() const { return stride_; }
+
+  void PadRow(size_t y, size_t from, int border) {
+    float* row = Row(y);
+    for (int offset = -border; offset < 0; ++offset) {
+      row[offset] = row[0];
+    }
+    float last_val = row[from - 1];
+    for (size_t x = from; x < xsize_ + border; ++x) {
+      row[x] = last_val;
+    }
+  }
+
+  void CopyRow(ssize_t dst_row, ssize_t src_row, int border) {
+    memcpy(Row(dst_row) - border, Row(src_row) - border,
+           (xsize_ + 2 * border) * sizeof(T));
+  }
+
+  void FillRow(ssize_t y, T val, size_t len) {
+    T* row = Row(y);
+    for (size_t x = 0; x < len; ++x) {
+      row[x] = val;
+    }
+  }
+
+ private:
+  size_t xsize_;
+  size_t ysize_;
+  size_t stride_;
+  size_t offset_;
+  T* data_;
+};
+
+}  // namespace jpegli
+
+#endif  // LIB_JPEGLI_COMMON_INTERNAL_H_
diff --git a/third_party/jpeg-xl/lib/jpegli/dct-inl.h b/third_party/jpeg-xl/lib/jpegli/dct-inl.h
new file mode 100644
index 0000000000..0524e220d6
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/dct-inl.h
@@ -0,0 +1,266 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#if defined(LIB_JPEGLI_DCT_INL_H_) == defined(HWY_TARGET_TOGGLE)
+#ifdef LIB_JPEGLI_DCT_INL_H_
+#undef LIB_JPEGLI_DCT_INL_H_
+#else
+#define LIB_JPEGLI_DCT_INL_H_
+#endif
+
+#include "lib/jpegli/transpose-inl.h"
+#include "lib/jxl/base/compiler_specific.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jpegli {
+namespace HWY_NAMESPACE {
+namespace {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Abs;
+using hwy::HWY_NAMESPACE::Add;
+using hwy::HWY_NAMESPACE::DemoteTo;
+using hwy::HWY_NAMESPACE::Ge;
+using hwy::HWY_NAMESPACE::IfThenElseZero;
+using hwy::HWY_NAMESPACE::Mul;
+using hwy::HWY_NAMESPACE::MulAdd;
+using hwy::HWY_NAMESPACE::Rebind;
+using hwy::HWY_NAMESPACE::Round;
+using hwy::HWY_NAMESPACE::Sub;
+using hwy::HWY_NAMESPACE::Vec;
+
+using D = HWY_FULL(float);
+using DI = HWY_FULL(int32_t);
+
+template <size_t N>
+void AddReverse(const float* JXL_RESTRICT ain1, const float* JXL_RESTRICT ain2,
+                float* JXL_RESTRICT aout) {
+  HWY_CAPPED(float, 8) d8;
+  for (size_t i = 0; i < N; i++) {
+    auto in1 = Load(d8, ain1 + i * 8);
+    auto in2 = Load(d8, ain2 + (N - i - 1) * 8);
+    Store(Add(in1, in2), d8, aout + i * 8);
+  }
+}
+
+template <size_t N>
+void SubReverse(const float* JXL_RESTRICT ain1, const float* JXL_RESTRICT ain2,
+                float* JXL_RESTRICT aout) {
+  HWY_CAPPED(float, 8) d8;
+  for (size_t i = 0; i < N; i++) {
+    auto in1 = Load(d8, ain1 + i * 8);
+    auto in2 = Load(d8, ain2 + (N - i - 1) * 8);
+    Store(Sub(in1, in2), d8, aout + i * 8);
+  }
+}
+
+template <size_t N>
+void B(float* JXL_RESTRICT coeff) {
+  HWY_CAPPED(float, 8) d8;
+  constexpr float kSqrt2 = 1.41421356237f;
+  auto sqrt2 = Set(d8, kSqrt2);
+  auto in1 = Load(d8, coeff);
+  auto in2 = Load(d8, coeff + 8);
+  Store(MulAdd(in1, sqrt2, in2), d8, coeff);
+  for (size_t i = 1; i + 1 < N; i++) {
+    auto in1 = Load(d8, coeff + i * 8);
+    auto in2 = Load(d8, coeff + (i + 1) * 8);
+    Store(Add(in1, in2), d8, coeff + i * 8);
+  }
+}
+
+// Ideally optimized away by compiler (except the multiply).
+template <size_t N>
+void InverseEvenOdd(const float* JXL_RESTRICT ain, float* JXL_RESTRICT aout) {
+  HWY_CAPPED(float, 8) d8;
+  for (size_t i = 0; i < N / 2; i++) {
+    auto in1 = Load(d8, ain + i * 8);
+    Store(in1, d8, aout + 2 * i * 8);
+  }
+  for (size_t i = N / 2; i < N; i++) {
+    auto in1 = Load(d8, ain + i * 8);
+    Store(in1, d8, aout + (2 * (i - N / 2) + 1) * 8);
+  }
+}
+
+// Constants for DCT implementation. Generated by the following snippet:
+// for i in range(N // 2):
+//    print(1.0 / (2 * math.cos((i + 0.5) * math.pi / N)), end=", ")
+template <size_t N>
+struct WcMultipliers;
+
+template <>
+struct WcMultipliers<4> {
+  static constexpr float kMultipliers[] = {
+      0.541196100146197,
+      1.3065629648763764,
+  };
+};
+
+template <>
+struct WcMultipliers<8> {
+  static constexpr float kMultipliers[] = {
+      0.5097955791041592,
+      0.6013448869350453,
+      0.8999762231364156,
+      2.5629154477415055,
+  };
+};
+
+constexpr float WcMultipliers<4>::kMultipliers[];
+constexpr float WcMultipliers<8>::kMultipliers[];
+
+// Invoked on full vector.
+template <size_t N>
+void Multiply(float* JXL_RESTRICT coeff) {
+  HWY_CAPPED(float, 8) d8;
+  for (size_t i = 0; i < N / 2; i++) {
+    auto in1 = Load(d8, coeff + (N / 2 + i) * 8);
+    auto mul = Set(d8, WcMultipliers<N>::kMultipliers[i]);
+    Store(Mul(in1, mul), d8, coeff + (N / 2 + i) * 8);
+  }
+}
+
+void LoadFromBlock(const float* JXL_RESTRICT pixels, size_t pixels_stride,
+                   size_t off, float* JXL_RESTRICT coeff) {
+  HWY_CAPPED(float, 8) d8;
+  for (size_t i = 0; i < 8; i++) {
+    Store(LoadU(d8, pixels + i * pixels_stride + off), d8, coeff + i * 8);
+  }
+}
+
+void StoreToBlockAndScale(const float* JXL_RESTRICT coeff, float* output,
+                          size_t off) {
+  HWY_CAPPED(float, 8) d8;
+  auto mul = Set(d8, 1.0f / 8);
+  for (size_t i = 0; i < 8; i++) {
+    StoreU(Mul(mul, Load(d8, coeff + i * 8)), d8, output + i * 8 + off);
+  }
+}
+
+template <size_t N>
+struct DCT1DImpl;
+
+template <>
+struct DCT1DImpl<1> {
+  JXL_INLINE void operator()(float* JXL_RESTRICT mem) {}
+};
+
+template <>
+struct DCT1DImpl<2> {
+  JXL_INLINE void operator()(float* JXL_RESTRICT mem) {
+    HWY_CAPPED(float, 8) d8;
+    auto in1 = Load(d8, mem);
+    auto in2 = Load(d8, mem + 8);
+    Store(Add(in1, in2), d8, mem);
+    Store(Sub(in1, in2), d8, mem + 8);
+  }
+};
+
+template <size_t N>
+struct DCT1DImpl {
+  void operator()(float* JXL_RESTRICT mem) {
+    HWY_ALIGN float tmp[N * 8];
+    AddReverse<N / 2>(mem, mem + N * 4, tmp);
+    DCT1DImpl<N / 2>()(tmp);
+    SubReverse<N / 2>(mem, mem + N * 4, tmp + N * 4);
+    Multiply<N>(tmp);
+    DCT1DImpl<N / 2>()(tmp + N * 4);
+    B<N / 2>(tmp + N * 4);
+    InverseEvenOdd<N>(tmp, mem);
+  }
+};
+
+void DCT1D(const float* JXL_RESTRICT pixels, size_t pixels_stride,
+           float* JXL_RESTRICT output) {
+  HWY_CAPPED(float, 8) d8;
+  HWY_ALIGN float tmp[64];
+  for (size_t i = 0; i < 8; i += Lanes(d8)) {
+    // TODO(veluca): consider removing the temporary memory here (as is done in
+    // IDCT), if it turns out that some compilers don't optimize away the loads
+    // and this is performance-critical.
+    LoadFromBlock(pixels, pixels_stride, i, tmp);
+    DCT1DImpl<8>()(tmp);
+    StoreToBlockAndScale(tmp, output, i);
+  }
+}
+
+void TransformFromPixels(const float* JXL_RESTRICT pixels, size_t pixels_stride,
+                         float* JXL_RESTRICT coefficients,
+                         float* JXL_RESTRICT scratch_space) {
+  DCT1D(pixels, pixels_stride, scratch_space);
+  Transpose8x8Block(scratch_space, coefficients);
+  DCT1D(coefficients, 8, scratch_space);
+  Transpose8x8Block(scratch_space, coefficients);
+}
+
+void StoreQuantizedValue(const Vec<DI>& ival, int16_t* out) {
+  Rebind<int16_t, DI> di16;
+  Store(DemoteTo(di16, ival), di16, out);
+}
+
+void StoreQuantizedValue(const Vec<DI>& ival, int32_t* out) {
+  DI di;
+  Store(ival, di, out);
+}
+
+template <typename T>
+void QuantizeBlock(const float* dct, const float* qmc, float aq_strength,
+                   const float* zero_bias_offset, const float* zero_bias_mul,
+                   T* block) {
+  D d;
+  DI di;
+  const auto aq_mul = Set(d, aq_strength);
+  for (size_t k = 0; k < DCTSIZE2; k += Lanes(d)) {
+    const auto val = Load(d, dct + k);
+    const auto q = Load(d, qmc + k);
+    const auto qval = Mul(val, q);
+    const auto zb_offset = Load(d, zero_bias_offset + k);
+    const auto zb_mul = Load(d, zero_bias_mul + k);
+    const auto threshold = Add(zb_offset, Mul(zb_mul, aq_mul));
+    const auto nzero_mask = Ge(Abs(qval), threshold);
+    const auto ival = ConvertTo(di, IfThenElseZero(nzero_mask, Round(qval)));
+    StoreQuantizedValue(ival, block + k);
+  }
+}
+
+template <typename T>
+void QuantizeBlockNoAQ(const float* dct, const float* qmc, T* block) {
+  D d;
+  DI di;
+  for (size_t k = 0; k < DCTSIZE2; k += Lanes(d)) {
+    const auto val = Load(d, dct + k);
+    const auto q = Load(d, qmc + k);
+    const auto ival = ConvertTo(di, Round(Mul(val, q)));
+    StoreQuantizedValue(ival, block + k);
+  }
+}
+
+template <typename T>
+void ComputeCoefficientBlock(const float* JXL_RESTRICT pixels, size_t stride,
+                             const float* JXL_RESTRICT qmc, float aq_strength,
+                             const float* zero_bias_offset,
+                             const float* zero_bias_mul,
+                             float* JXL_RESTRICT tmp, T* block) {
+  float* JXL_RESTRICT dct = tmp;
+  float* JXL_RESTRICT scratch_space = tmp + DCTSIZE2;
+  TransformFromPixels(pixels, stride, dct, scratch_space);
+  if (aq_strength > 0.0f) {
+    QuantizeBlock(dct, qmc, aq_strength, zero_bias_offset, zero_bias_mul,
+                  block);
+  } else {
+    QuantizeBlockNoAQ(dct, qmc, block);
+  }
+  // Center DC values around zero.
+  static constexpr float kDCBias = 128.0f;
+  block[0] = std::round((dct[0] - kDCBias) * qmc[0]);
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace
+}  // namespace HWY_NAMESPACE
+}  // namespace jpegli
+HWY_AFTER_NAMESPACE();
+#endif  // LIB_JPEGLI_DCT_INL_H_
diff --git a/third_party/jpeg-xl/lib/jpegli/dct.cc b/third_party/jpeg-xl/lib/jpegli/dct.cc
new file mode 100644
index 0000000000..4320abe4c6
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/dct.cc
@@ -0,0 +1,75 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/dct.h"
+
+#include <cmath>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jpegli/dct.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jpegli/dct-inl.h"
+#include "lib/jpegli/encode_internal.h"
+#include "lib/jpegli/memory_manager.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jpegli {
+namespace HWY_NAMESPACE {
+namespace {
+
+void ComputeDCTCoefficients(j_compress_ptr cinfo) {
+  jpeg_comp_master* m = cinfo->master;
+  float* tmp = m->dct_buffer;
+  for (int c = 0; c < cinfo->num_components; c++) {
+    jpeg_component_info* comp = &cinfo->comp_info[c];
+    int by0 = m->next_iMCU_row * comp->v_samp_factor;
+    int block_rows_left = comp->height_in_blocks - by0;
+    int max_block_rows = std::min(comp->v_samp_factor, block_rows_left);
+    JBLOCKARRAY ba = (*cinfo->mem->access_virt_barray)(
+        reinterpret_cast<j_common_ptr>(cinfo), m->coeff_buffers[c], by0,
+        max_block_rows, true);
+    float* qmc = m->quant_mul[c];
+    RowBuffer<float>* plane = m->raw_data[c];
+    const int h_factor = m->h_factor[c];
+    const int v_factor = m->v_factor[c];
+    const float* zero_bias_offset = m->zero_bias_offset[c];
+    const float* zero_bias_mul = m->zero_bias_mul[c];
+    float aq_strength = 0.0f;
+    for (int iy = 0; iy < comp->v_samp_factor; iy++) {
+      size_t by = by0 + iy;
+      if (by >= comp->height_in_blocks) continue;
+      JBLOCKROW brow = ba[iy];
+      const float* row = plane->Row(8 * by);
+      for (size_t bx = 0; bx < comp->width_in_blocks; bx++) {
+        JCOEF* block = &brow[bx][0];
+        if (m->use_adaptive_quantization) {
+          aq_strength = m->quant_field.Row(by * v_factor)[bx * h_factor];
+        }
+        ComputeCoefficientBlock(row + 8 * bx, plane->stride(), qmc, aq_strength,
+                                zero_bias_offset, zero_bias_mul, tmp, block);
+      }
+    }
+  }
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace
+}  // namespace HWY_NAMESPACE
+}  // namespace jpegli
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jpegli {
+
+HWY_EXPORT(ComputeDCTCoefficients);
+
+void ComputeDCTCoefficients(j_compress_ptr cinfo) {
+  HWY_DYNAMIC_DISPATCH(ComputeDCTCoefficients)(cinfo);
+}
+
+}  // namespace jpegli
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jpegli/dct.h b/third_party/jpeg-xl/lib/jpegli/dct.h
new file mode 100644
index 0000000000..9ae5f9f7c2
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/dct.h
@@ -0,0 +1,20 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JPEGLI_DCT_H_
+#define LIB_JPEGLI_DCT_H_
+
+/* clang-format off */
+#include <stdio.h>
+#include <jpeglib.h>
+/* clang-format on */
+
+namespace jpegli {
+
+void ComputeDCTCoefficients(j_compress_ptr cinfo);
+
+}  // namespace jpegli
+
+#endif  // LIB_JPEGLI_DCT_H_
diff --git a/third_party/jpeg-xl/lib/jpegli/decode.cc b/third_party/jpeg-xl/lib/jpegli/decode.cc
new file mode 100644
index 0000000000..cf87673705
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/decode.cc
@@ -0,0 +1,981 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/decode.h"
+
+#include <string.h>
+
+#include <vector>
+
+#include "lib/jpegli/color_quantize.h"
+#include "lib/jpegli/decode_internal.h"
+#include "lib/jpegli/decode_marker.h"
+#include "lib/jpegli/decode_scan.h"
+#include "lib/jpegli/error.h"
+#include "lib/jpegli/memory_manager.h"
+#include "lib/jpegli/render.h"
+#include "lib/jxl/base/byte_order.h"
+#include "lib/jxl/base/status.h"
+
+namespace jpegli {
+
+void InitializeImage(j_decompress_ptr cinfo) {
+  cinfo->restart_interval = 0;
+  cinfo->saw_JFIF_marker = FALSE;
+  cinfo->JFIF_major_version = 1;
+  cinfo->JFIF_minor_version = 1;
+  cinfo->density_unit = 0;
+  cinfo->X_density = 1;
+  cinfo->Y_density = 1;
+  cinfo->saw_Adobe_marker = FALSE;
+  cinfo->Adobe_transform = 0;
+  cinfo->CCIR601_sampling = FALSE;  // not used
+  cinfo->marker_list = nullptr;
+  cinfo->comp_info = nullptr;
+  cinfo->input_scan_number = 0;
+  cinfo->input_iMCU_row = 0;
+  cinfo->output_scan_number = 0;
+  cinfo->output_iMCU_row = 0;
+  cinfo->output_scanline = 0;
+  cinfo->unread_marker = 0;
+  cinfo->coef_bits = nullptr;
+  // We set all these to zero since we don't yet support arithmetic coding.
+  memset(cinfo->arith_dc_L, 0, sizeof(cinfo->arith_dc_L));
+  memset(cinfo->arith_dc_U, 0, sizeof(cinfo->arith_dc_U));
+  memset(cinfo->arith_ac_K, 0, sizeof(cinfo->arith_ac_K));
+  // Initialize the private fields.
+  jpeg_decomp_master* m = cinfo->master;
+  m->input_buffer_.clear();
+  m->input_buffer_pos_ = 0;
+  m->codestream_bits_ahead_ = 0;
+  m->is_multiscan_ = false;
+  m->found_soi_ = false;
+  m->found_dri_ = false;
+  m->found_sof_ = false;
+  m->found_eoi_ = false;
+  m->icc_index_ = 0;
+  m->icc_total_ = 0;
+  m->icc_profile_.clear();
+  memset(m->dc_huff_lut_, 0, sizeof(m->dc_huff_lut_));
+  memset(m->ac_huff_lut_, 0, sizeof(m->ac_huff_lut_));
+  // Initialize the values to an invalid symbol so that we can recognize it
+  // when reading the bit stream using a Huffman code with space > 0.
+  for (size_t i = 0; i < kAllHuffLutSize; ++i) {
+    m->dc_huff_lut_[i].bits = 0;
+    m->dc_huff_lut_[i].value = 0xffff;
+    m->ac_huff_lut_[i].bits = 0;
+    m->ac_huff_lut_[i].value = 0xffff;
+  }
+  m->colormap_lut_ = nullptr;
+  m->pixels_ = nullptr;
+  m->scanlines_ = nullptr;
+  m->regenerate_inverse_colormap_ = true;
+  for (int i = 0; i < kMaxComponents; ++i) {
+    m->dither_[i] = nullptr;
+    m->error_row_[i] = nullptr;
+  }
+  m->output_passes_done_ = 0;
+  m->xoffset_ = 0;
+  m->dequant_ = nullptr;
+}
+
+void InitializeDecompressParams(j_decompress_ptr cinfo) {
+  cinfo->jpeg_color_space = JCS_UNKNOWN;
+  cinfo->out_color_space = JCS_UNKNOWN;
+  cinfo->scale_num = 1;
+  cinfo->scale_denom = 1;
+  cinfo->output_gamma = 0.0f;
+  cinfo->buffered_image = FALSE;
+  cinfo->raw_data_out = FALSE;
+  cinfo->dct_method = JDCT_DEFAULT;
+  cinfo->do_fancy_upsampling = TRUE;
+  cinfo->do_block_smoothing = TRUE;
+  cinfo->quantize_colors = FALSE;
+  cinfo->dither_mode = JDITHER_FS;
+  cinfo->two_pass_quantize = TRUE;
+  cinfo->desired_number_of_colors = 256;
+  cinfo->enable_1pass_quant = FALSE;
+  cinfo->enable_external_quant = FALSE;
+  cinfo->enable_2pass_quant = FALSE;
+  cinfo->actual_number_of_colors = 0;
+  cinfo->colormap = nullptr;
+}
+
+void InitProgressMonitor(j_decompress_ptr cinfo, bool coef_only) {
+  if (!cinfo->progress) return;
+  jpeg_decomp_master* m = cinfo->master;
+  int nc = cinfo->num_components;
+  int estimated_num_scans =
+      cinfo->progressive_mode ? 2 + 3 * nc : (m->is_multiscan_ ? nc : 1);
+  cinfo->progress->pass_limit = cinfo->total_iMCU_rows * estimated_num_scans;
+  cinfo->progress->pass_counter = 0;
+  if (coef_only) {
+    cinfo->progress->total_passes = 1;
+  } else {
+    int input_passes = !cinfo->buffered_image && m->is_multiscan_ ? 1 : 0;
+    bool two_pass_quant = cinfo->quantize_colors && !cinfo->colormap &&
+                          cinfo->two_pass_quantize && cinfo->enable_2pass_quant;
+    cinfo->progress->total_passes = input_passes + (two_pass_quant ? 2 : 1);
+  }
+  cinfo->progress->completed_passes = 0;
+}
+
+void InitProgressMonitorForOutput(j_decompress_ptr cinfo) {
+  if (!cinfo->progress) return;
+  jpeg_decomp_master* m = cinfo->master;
+  int passes_per_output = cinfo->enable_2pass_quant ? 2 : 1;
+  int output_passes_left = cinfo->buffered_image && !m->found_eoi_ ? 2 : 1;
+  cinfo->progress->total_passes =
+      m->output_passes_done_ + passes_per_output * output_passes_left;
+  cinfo->progress->completed_passes = m->output_passes_done_;
+}
+
+void ProgressMonitorInputPass(j_decompress_ptr cinfo) {
+  if (!cinfo->progress) return;
+  cinfo->progress->pass_counter =
+      ((cinfo->input_scan_number - 1) * cinfo->total_iMCU_rows +
+       cinfo->input_iMCU_row);
+  if (cinfo->progress->pass_counter > cinfo->progress->pass_limit) {
+    cinfo->progress->pass_limit =
+        cinfo->input_scan_number * cinfo->total_iMCU_rows;
+  }
+  (*cinfo->progress->progress_monitor)(reinterpret_cast<j_common_ptr>(cinfo));
+}
+
+void ProgressMonitorOutputPass(j_decompress_ptr cinfo) {
+  if (!cinfo->progress) return;
+  jpeg_decomp_master* m = cinfo->master;
+  int input_passes = !cinfo->buffered_image && m->is_multiscan_ ? 1 : 0;
+  cinfo->progress->pass_counter = cinfo->output_scanline;
+  cinfo->progress->pass_limit = cinfo->output_height;
+  cinfo->progress->completed_passes = input_passes + m->output_passes_done_;
+  (*cinfo->progress->progress_monitor)(reinterpret_cast<j_common_ptr>(cinfo));
+}
+
+void BuildHuffmanLookupTable(j_decompress_ptr cinfo, JHUFF_TBL* table,
+                             HuffmanTableEntry* huff_lut) {
+  uint32_t counts[kJpegHuffmanMaxBitLength + 1] = {};
+  counts[0] = 0;
+  int total_count = 0;
+  int space = 1 << kJpegHuffmanMaxBitLength;
+  int max_depth = 1;
+  for (size_t i = 1; i <= kJpegHuffmanMaxBitLength; ++i) {
+    int count = table->bits[i];
+    if (count != 0) {
+      max_depth = i;
+    }
+    counts[i] = count;
+    total_count += count;
+    space -= count * (1 << (kJpegHuffmanMaxBitLength - i));
+  }
+  uint32_t values[kJpegHuffmanAlphabetSize + 1] = {};
+  uint8_t values_seen[256] = {0};
+  for (int i = 0; i < total_count; ++i) {
+    int value = table->huffval[i];
+    if (values_seen[value]) {
+      return JPEGLI_ERROR("Duplicate Huffman code value %d", value);
+    }
+    values_seen[value] = 1;
+    values[i] = value;
+  }
+  // Add an invalid symbol that will have the all 1 code.
+  ++counts[max_depth];
+  values[total_count] = kJpegHuffmanAlphabetSize;
+  space -= (1 << (kJpegHuffmanMaxBitLength - max_depth));
+  if (space < 0) {
+    JPEGLI_ERROR("Invalid Huffman code lengths.");
+  } else if (space > 0 && huff_lut[0].value != 0xffff) {
+    // Re-initialize the values to an invalid symbol so that we can recognize
+    // it when reading the bit stream using a Huffman code with space > 0.
+    for (int i = 0; i < kJpegHuffmanLutSize; ++i) {
+      huff_lut[i].bits = 0;
+      huff_lut[i].value = 0xffff;
+    }
+  }
+  BuildJpegHuffmanTable(&counts[0], &values[0], huff_lut);
+}
+
+void PrepareForScan(j_decompress_ptr cinfo) {
+  jpeg_decomp_master* m = cinfo->master;
+  for (int i = 0; i < cinfo->comps_in_scan; ++i) {
+    int comp_idx = cinfo->cur_comp_info[i]->component_index;
+    int* prev_coef_bits = cinfo->coef_bits[comp_idx + cinfo->num_components];
+    for (int k = std::min(cinfo->Ss, 1); k <= std::max(cinfo->Se, 9); k++) {
+      prev_coef_bits[k] =
+          (cinfo->input_scan_number > 0) ? cinfo->coef_bits[comp_idx][k] : 0;
+    }
+    for (int k = cinfo->Ss; k <= cinfo->Se; ++k) {
+      cinfo->coef_bits[comp_idx][k] = cinfo->Al;
+    }
+  }
+  AddStandardHuffmanTables(reinterpret_cast<j_common_ptr>(cinfo),
+                           /*is_dc=*/false);
+  AddStandardHuffmanTables(reinterpret_cast<j_common_ptr>(cinfo),
+                           /*is_dc=*/true);
+  // Check that all the Huffman tables needed for this scan are defined and
+  // build derived lookup tables.
+  for (int i = 0; i < cinfo->comps_in_scan; ++i) {
+    if (cinfo->Ss == 0) {
+      int dc_tbl_idx = cinfo->cur_comp_info[i]->dc_tbl_no;
+      JHUFF_TBL* table = cinfo->dc_huff_tbl_ptrs[dc_tbl_idx];
+      HuffmanTableEntry* huff_lut =
+          &m->dc_huff_lut_[dc_tbl_idx * kJpegHuffmanLutSize];
+      if (!table) {
+        return JPEGLI_ERROR("DC Huffman table %d not found", dc_tbl_idx);
+      }
+      BuildHuffmanLookupTable(cinfo, table, huff_lut);
+    }
+    if (cinfo->Se > 0) {
+      int ac_tbl_idx = cinfo->cur_comp_info[i]->ac_tbl_no;
+      JHUFF_TBL* table = cinfo->ac_huff_tbl_ptrs[ac_tbl_idx];
+      HuffmanTableEntry* huff_lut =
+          &m->ac_huff_lut_[ac_tbl_idx * kJpegHuffmanLutSize];
+      if (!table) {
+        return JPEGLI_ERROR("AC Huffman table %d not found", ac_tbl_idx);
+      }
+      BuildHuffmanLookupTable(cinfo, table, huff_lut);
+    }
+  }
+  // Copy quantization tables into comp_info.
+  for (int i = 0; i < cinfo->comps_in_scan; ++i) {
+    jpeg_component_info* comp = cinfo->cur_comp_info[i];
+    if (comp->quant_table == nullptr) {
+      comp->quant_table = Allocate<JQUANT_TBL>(cinfo, 1, JPOOL_IMAGE);
+      memcpy(comp->quant_table, cinfo->quant_tbl_ptrs[comp->quant_tbl_no],
+             sizeof(JQUANT_TBL));
+    }
+  }
+  if (cinfo->comps_in_scan == 1) {
+    const auto& comp = *cinfo->cur_comp_info[0];
+    cinfo->MCUs_per_row = DivCeil(cinfo->image_width * comp.h_samp_factor,
+                                  cinfo->max_h_samp_factor * DCTSIZE);
+    cinfo->MCU_rows_in_scan = DivCeil(cinfo->image_height * comp.v_samp_factor,
+                                      cinfo->max_v_samp_factor * DCTSIZE);
+    m->mcu_rows_per_iMCU_row_ = cinfo->cur_comp_info[0]->v_samp_factor;
+  } else {
+    cinfo->MCU_rows_in_scan = cinfo->total_iMCU_rows;
+    cinfo->MCUs_per_row = m->iMCU_cols_;
+    m->mcu_rows_per_iMCU_row_ = 1;
+    size_t mcu_size = 0;
+    for (int i = 0; i < cinfo->comps_in_scan; ++i) {
+      jpeg_component_info* comp = cinfo->cur_comp_info[i];
+      mcu_size += comp->h_samp_factor * comp->v_samp_factor;
+    }
+    if (mcu_size > D_MAX_BLOCKS_IN_MCU) {
+      JPEGLI_ERROR("MCU size too big");
+    }
+  }
+  memset(m->last_dc_coeff_, 0, sizeof(m->last_dc_coeff_));
+  m->restarts_to_go_ = cinfo->restart_interval;
+  m->next_restart_marker_ = 0;
+  m->eobrun_ = -1;
+  m->scan_mcu_row_ = 0;
+  m->scan_mcu_col_ = 0;
+  m->codestream_bits_ahead_ = 0;
+  ++cinfo->input_scan_number;
+  cinfo->input_iMCU_row = 0;
+  PrepareForiMCURow(cinfo);
+  cinfo->global_state = kDecProcessScan;
+}
+
+int ConsumeInput(j_decompress_ptr cinfo) {
+  jpeg_decomp_master* m = cinfo->master;
+  if (cinfo->global_state == kDecProcessScan && m->streaming_mode_ &&
+      cinfo->input_iMCU_row > cinfo->output_iMCU_row) {
+    // Prevent input from getting ahead of output in streaming mode.
+    return JPEG_SUSPENDED;
+  }
+  jpeg_source_mgr* src = cinfo->src;
+  int status;
+  for (;;) {
+    const uint8_t* data;
+    size_t len;
+    if (m->input_buffer_.empty()) {
+      data = cinfo->src->next_input_byte;
+      len = cinfo->src->bytes_in_buffer;
+    } else {
+      data = &m->input_buffer_[m->input_buffer_pos_];
+      len = m->input_buffer_.size() - m->input_buffer_pos_;
+    }
+    size_t pos = 0;
+    if (cinfo->global_state == kDecProcessScan) {
+      status = ProcessScan(cinfo, data, len, &pos, &m->codestream_bits_ahead_);
+    } else {
+      status = ProcessMarkers(cinfo, data, len, &pos);
+    }
+    if (m->input_buffer_.empty()) {
+      cinfo->src->next_input_byte += pos;
+      cinfo->src->bytes_in_buffer -= pos;
+    } else {
+      m->input_buffer_pos_ += pos;
+      size_t bytes_left = m->input_buffer_.size() - m->input_buffer_pos_;
+      if (bytes_left <= src->bytes_in_buffer) {
+        src->next_input_byte += (src->bytes_in_buffer - bytes_left);
+        src->bytes_in_buffer = bytes_left;
+        m->input_buffer_.clear();
+        m->input_buffer_pos_ = 0;
+      }
+    }
+    if (status == kHandleRestart) {
+      JXL_DASSERT(m->input_buffer_.size() <=
+                  m->input_buffer_pos_ + src->bytes_in_buffer);
+      m->input_buffer_.clear();
+      m->input_buffer_pos_ = 0;
+      if (cinfo->unread_marker == 0xd0 + m->next_restart_marker_) {
+        cinfo->unread_marker = 0;
+      } else {
+        if (!(*cinfo->src->resync_to_restart)(cinfo, m->next_restart_marker_)) {
+          return JPEG_SUSPENDED;
+        }
+      }
+      m->next_restart_marker_ += 1;
+      m->next_restart_marker_ &= 0x7;
+      m->restarts_to_go_ = cinfo->restart_interval;
+      if (cinfo->unread_marker != 0) {
+        JPEGLI_WARN("Failed to resync to next restart marker, skipping scan.");
+        return JPEG_SCAN_COMPLETED;
+      }
+      continue;
+    }
+    if (status == kHandleMarkerProcessor) {
+      JXL_DASSERT(m->input_buffer_.size() <=
+                  m->input_buffer_pos_ + src->bytes_in_buffer);
+      m->input_buffer_.clear();
+      m->input_buffer_pos_ = 0;
+      if (!(*GetMarkerProcessor(cinfo))(cinfo)) {
+        return JPEG_SUSPENDED;
+      }
+      cinfo->unread_marker = 0;
+      continue;
+    }
+    if (status != kNeedMoreInput) {
+      break;
+    }
+    if (m->input_buffer_.empty()) {
+      JXL_DASSERT(m->input_buffer_pos_ == 0);
+      m->input_buffer_.assign(src->next_input_byte,
+                              src->next_input_byte + src->bytes_in_buffer);
+    }
+    if (!(*cinfo->src->fill_input_buffer)(cinfo)) {
+      m->input_buffer_.clear();
+      m->input_buffer_pos_ = 0;
+      return JPEG_SUSPENDED;
+    }
+    if (src->bytes_in_buffer == 0) {
+      JPEGLI_ERROR("Empty input.");
+    }
+    m->input_buffer_.insert(m->input_buffer_.end(), src->next_input_byte,
+                            src->next_input_byte + src->bytes_in_buffer);
+  }
+  if (status == JPEG_SCAN_COMPLETED) {
+    cinfo->global_state = kDecProcessMarkers;
+  } else if (status == JPEG_REACHED_SOS) {
+    if (cinfo->global_state == kDecInHeader) {
+      cinfo->global_state = kDecHeaderDone;
+    } else {
+      PrepareForScan(cinfo);
+    }
+  }
+  return status;
+}
+
+bool IsInputReady(j_decompress_ptr cinfo) {
+  if (cinfo->master->found_eoi_) {
+    return true;
+  }
+  if (cinfo->input_scan_number > cinfo->output_scan_number) {
+    return true;
+  }
+  if (cinfo->input_scan_number < cinfo->output_scan_number) {
+    return false;
+  }
+  if (cinfo->input_iMCU_row == cinfo->total_iMCU_rows) {
+    return true;
+  }
+  return cinfo->input_iMCU_row >
+         cinfo->output_iMCU_row + (cinfo->master->streaming_mode_ ? 0 : 2);
+}
+
+bool ReadOutputPass(j_decompress_ptr cinfo) {
+  jpeg_decomp_master* m = cinfo->master;
+  if (!m->pixels_) {
+    size_t stride = cinfo->out_color_components * cinfo->output_width;
+    size_t num_samples = cinfo->output_height * stride;
+    m->pixels_ = Allocate<uint8_t>(cinfo, num_samples, JPOOL_IMAGE);
+    m->scanlines_ =
+        Allocate<JSAMPROW>(cinfo, cinfo->output_height, JPOOL_IMAGE);
+    for (size_t i = 0; i < cinfo->output_height; ++i) {
+      m->scanlines_[i] = &m->pixels_[i * stride];
+    }
+  }
+  size_t num_output_rows = 0;
+  while (num_output_rows < cinfo->output_height) {
+    if (IsInputReady(cinfo)) {
+      ProgressMonitorOutputPass(cinfo);
+      ProcessOutput(cinfo, &num_output_rows, m->scanlines_,
+                    cinfo->output_height);
+    } else if (ConsumeInput(cinfo) == JPEG_SUSPENDED) {
+      return false;
+    }
+  }
+  cinfo->output_scanline = 0;
+  cinfo->output_iMCU_row = 0;
+  return true;
+}
+
+boolean PrepareQuantizedOutput(j_decompress_ptr cinfo) {
+  jpeg_decomp_master* m = cinfo->master;
+  if (cinfo->raw_data_out) {
+    JPEGLI_ERROR("Color quantization is not supported in raw data mode.");
+  }
+  if (m->output_data_type_ != JPEGLI_TYPE_UINT8) {
+    JPEGLI_ERROR("Color quantization must use 8-bit mode.");
+  }
+  if (cinfo->colormap) {
+    m->quant_mode_ = 3;
+  } else if (cinfo->two_pass_quantize && cinfo->enable_2pass_quant) {
+    m->quant_mode_ = 2;
+  } else if (cinfo->enable_1pass_quant) {
+    m->quant_mode_ = 1;
+  } else {
+    JPEGLI_ERROR("Invalid quantization mode change");
+  }
+  if (m->quant_mode_ > 1 && cinfo->dither_mode == JDITHER_ORDERED) {
+    cinfo->dither_mode = JDITHER_FS;
+  }
+  if (m->quant_mode_ == 1) {
+    ChooseColorMap1Pass(cinfo);
+  } else if (m->quant_mode_ == 2) {
+    m->quant_pass_ = 0;
+    if (!ReadOutputPass(cinfo)) {
+      return FALSE;
+    }
+    ChooseColorMap2Pass(cinfo);
+  }
+  if (m->quant_mode_ == 2 ||
+      (m->quant_mode_ == 3 && m->regenerate_inverse_colormap_)) {
+    CreateInverseColorMap(cinfo);
+  }
+  if (cinfo->dither_mode == JDITHER_ORDERED) {
+    CreateOrderedDitherTables(cinfo);
+  } else if (cinfo->dither_mode == JDITHER_FS) {
+    InitFSDitherState(cinfo);
+  }
+  m->quant_pass_ = 1;
+  return TRUE;
+}
+
+void AllocateCoefficientBuffer(j_decompress_ptr cinfo) {
+  jpeg_decomp_master* m = cinfo->master;
+  j_common_ptr comptr = reinterpret_cast<j_common_ptr>(cinfo);
+  jvirt_barray_ptr* coef_arrays = jpegli::Allocate<jvirt_barray_ptr>(
+      cinfo, cinfo->num_components, JPOOL_IMAGE);
+  for (int c = 0; c < cinfo->num_components; ++c) {
+    jpeg_component_info* comp = &cinfo->comp_info[c];
+    size_t height_in_blocks =
+        m->streaming_mode_ ? comp->v_samp_factor : comp->height_in_blocks;
+    coef_arrays[c] = (*cinfo->mem->request_virt_barray)(
+        comptr, JPOOL_IMAGE, TRUE, comp->width_in_blocks, height_in_blocks,
+        comp->v_samp_factor);
+  }
+  cinfo->master->coef_arrays = coef_arrays;
+  (*cinfo->mem->realize_virt_arrays)(comptr);
+}
+
+}  // namespace jpegli
+
+void jpegli_CreateDecompress(j_decompress_ptr cinfo, int version,
+                             size_t structsize) {
+  cinfo->mem = nullptr;
+  if (structsize != sizeof(*cinfo)) {
+    JPEGLI_ERROR("jpeg_decompress_struct has wrong size.");
+  }
+  jpegli::InitMemoryManager(reinterpret_cast<j_common_ptr>(cinfo));
+  cinfo->is_decompressor = TRUE;
+  cinfo->progress = nullptr;
+  cinfo->src = nullptr;
+  for (int i = 0; i < NUM_QUANT_TBLS; i++) {
+    cinfo->quant_tbl_ptrs[i] = nullptr;
+  }
+  for (int i = 0; i < NUM_HUFF_TBLS; i++) {
+    cinfo->dc_huff_tbl_ptrs[i] = nullptr;
+    cinfo->ac_huff_tbl_ptrs[i] = nullptr;
+  }
+  cinfo->global_state = jpegli::kDecStart;
+  cinfo->sample_range_limit = nullptr;  // not used
+  cinfo->rec_outbuf_height = 1;         // output works with any buffer height
+  cinfo->master = new jpeg_decomp_master;
+  jpeg_decomp_master* m = cinfo->master;
+  for (int i = 0; i < 16; ++i) {
+    m->app_marker_parsers[i] = nullptr;
+  }
+  m->com_marker_parser = nullptr;
+  memset(m->markers_to_save_, 0, sizeof(m->markers_to_save_));
+  jpegli::InitializeDecompressParams(cinfo);
+  jpegli::InitializeImage(cinfo);
+}
+
+void jpegli_destroy_decompress(j_decompress_ptr cinfo) {
+  jpegli_destroy(reinterpret_cast<j_common_ptr>(cinfo));
+}
+
+void jpegli_abort_decompress(j_decompress_ptr cinfo) {
+  jpegli_abort(reinterpret_cast<j_common_ptr>(cinfo));
+}
+
+void jpegli_save_markers(j_decompress_ptr cinfo, int marker_code,
+                         unsigned int length_limit) {
+  // TODO(szabadka) Limit our memory usage by taking into account length_limit.
+  jpeg_decomp_master* m = cinfo->master;
+  if (marker_code < 0xe0) {
+    JPEGLI_ERROR("jpegli_save_markers: invalid marker code %d", marker_code);
+  }
+  m->markers_to_save_[marker_code - 0xe0] = 1;
+}
+
+void jpegli_set_marker_processor(j_decompress_ptr cinfo, int marker_code,
+                                 jpeg_marker_parser_method routine) {
+  jpeg_decomp_master* m = cinfo->master;
+  if (marker_code == 0xfe) {
+    m->com_marker_parser = routine;
+  } else if (marker_code >= 0xe0 && marker_code <= 0xef) {
+    m->app_marker_parsers[marker_code - 0xe0] = routine;
+  } else {
+    JPEGLI_ERROR("jpegli_set_marker_processor: invalid marker code %d",
+                 marker_code);
+  }
+}
+
+int jpegli_consume_input(j_decompress_ptr cinfo) {
+  if (cinfo->global_state == jpegli::kDecStart) {
+    (*cinfo->err->reset_error_mgr)(reinterpret_cast<j_common_ptr>(cinfo));
+    (*cinfo->src->init_source)(cinfo);
+    jpegli::InitializeDecompressParams(cinfo);
+    jpegli::InitializeImage(cinfo);
+    cinfo->global_state = jpegli::kDecInHeader;
+  }
+  if (cinfo->global_state == jpegli::kDecHeaderDone) {
+    return JPEG_REACHED_SOS;
+  }
+  if (cinfo->master->found_eoi_) {
+    return JPEG_REACHED_EOI;
+  }
+  if (cinfo->global_state == jpegli::kDecInHeader ||
+      cinfo->global_state == jpegli::kDecProcessMarkers ||
+      cinfo->global_state == jpegli::kDecProcessScan) {
+    return jpegli::ConsumeInput(cinfo);
+  }
+  JPEGLI_ERROR("Unexpected state %d", cinfo->global_state);
+  return JPEG_REACHED_EOI;  // return value does not matter
+}
+
+int jpegli_read_header(j_decompress_ptr cinfo, boolean require_image) {
+  if (cinfo->global_state != jpegli::kDecStart &&
+      cinfo->global_state != jpegli::kDecInHeader) {
+    JPEGLI_ERROR("jpegli_read_header: unexpected state %d",
+                 cinfo->global_state);
+  }
+  if (cinfo->src == nullptr) {
+    JPEGLI_ERROR("Missing source.");
+  }
+  for (;;) {
+    int retcode = jpegli_consume_input(cinfo);
+    if (retcode == JPEG_SUSPENDED) {
+      return retcode;
+    } else if (retcode == JPEG_REACHED_SOS) {
+      break;
+    } else if (retcode == JPEG_REACHED_EOI) {
+      if (require_image) {
+        JPEGLI_ERROR("jpegli_read_header: unexpected EOI marker.");
+      }
+      jpegli_abort_decompress(cinfo);
+      return JPEG_HEADER_TABLES_ONLY;
+    }
+  };
+  return JPEG_HEADER_OK;
+}
+
+boolean jpegli_read_icc_profile(j_decompress_ptr cinfo, JOCTET** icc_data_ptr,
+                                unsigned int* icc_data_len) {
+  if (cinfo->global_state == jpegli::kDecStart ||
+      cinfo->global_state == jpegli::kDecInHeader) {
+    JPEGLI_ERROR("jpegli_read_icc_profile: unexpected state %d",
+                 cinfo->global_state);
+  }
+  if (icc_data_ptr == nullptr || icc_data_len == nullptr) {
+    JPEGLI_ERROR("jpegli_read_icc_profile: invalid output buffer");
+  }
+  jpeg_decomp_master* m = cinfo->master;
+  if (m->icc_profile_.empty()) {
+    *icc_data_ptr = nullptr;
+    *icc_data_len = 0;
+    return FALSE;
+  }
+  *icc_data_len = m->icc_profile_.size();
+  *icc_data_ptr = (JOCTET*)malloc(*icc_data_len);
+  if (*icc_data_ptr == nullptr) {
+    JPEGLI_ERROR("jpegli_read_icc_profile: Out of memory");
+  }
+  memcpy(*icc_data_ptr, m->icc_profile_.data(), *icc_data_len);
+  return TRUE;
+}
+
+void jpegli_core_output_dimensions(j_decompress_ptr cinfo) {
+  jpeg_decomp_master* m = cinfo->master;
+  if (!m->found_sof_) {
+    JPEGLI_ERROR("No SOF marker found.");
+  }
+  if (cinfo->raw_data_out) {
+    if (cinfo->scale_num != 1 || cinfo->scale_denom != 1) {
+      JPEGLI_ERROR("Output scaling is not supported in raw output mode");
+    }
+  }
+  if (cinfo->scale_num != 1 || cinfo->scale_denom != 1) {
+    int dctsize = 16;
+    while (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * (dctsize - 1)) {
+      --dctsize;
+    }
+    m->min_scaled_dct_size = dctsize;
+    cinfo->output_width =
+        jpegli::DivCeil(cinfo->image_width * dctsize, DCTSIZE);
+    cinfo->output_height =
+        jpegli::DivCeil(cinfo->image_height * dctsize, DCTSIZE);
+    for (int c = 0; c < cinfo->num_components; ++c) {
+      m->scaled_dct_size[c] = m->min_scaled_dct_size;
+    }
+  } else {
+    cinfo->output_width = cinfo->image_width;
+    cinfo->output_height = cinfo->image_height;
+    m->min_scaled_dct_size = DCTSIZE;
+    for (int c = 0; c < cinfo->num_components; ++c) {
+      m->scaled_dct_size[c] = DCTSIZE;
+    }
+  }
+}
+
+void jpegli_calc_output_dimensions(j_decompress_ptr cinfo) {
+  jpeg_decomp_master* m = cinfo->master;
+  jpegli_core_output_dimensions(cinfo);
+  for (int c = 0; c < cinfo->num_components; ++c) {
+    jpeg_component_info* comp = &cinfo->comp_info[c];
+    m->h_factor[c] = cinfo->max_h_samp_factor / comp->h_samp_factor;
+    m->v_factor[c] = cinfo->max_v_samp_factor / comp->v_samp_factor;
+  }
+  if (cinfo->scale_num != 1 || cinfo->scale_denom != 1) {
+    for (int c = 0; c < cinfo->num_components; ++c) {
+      // Prefer IDCT scaling over 2x upsampling.
+      while (m->scaled_dct_size[c] < DCTSIZE && (m->v_factor[c] % 2) == 0 &&
+             (m->h_factor[c] % 2) == 0) {
+        m->scaled_dct_size[c] *= 2;
+        m->v_factor[c] /= 2;
+        m->h_factor[c] /= 2;
+      }
+    }
+  }
+  if (cinfo->out_color_space == JCS_GRAYSCALE) {
+    cinfo->out_color_components = 1;
+  } else if (cinfo->out_color_space == JCS_RGB ||
+             cinfo->out_color_space == JCS_YCbCr) {
+    cinfo->out_color_components = 3;
+  } else if (cinfo->out_color_space == JCS_CMYK ||
+             cinfo->out_color_space == JCS_YCCK) {
+    cinfo->out_color_components = 4;
+  } else {
+    cinfo->out_color_components = cinfo->num_components;
+  }
+  cinfo->output_components =
+      cinfo->quantize_colors ? 1 : cinfo->out_color_components;
+  cinfo->rec_outbuf_height = 1;
+}
+
+boolean jpegli_has_multiple_scans(j_decompress_ptr cinfo) {
+  if (cinfo->input_scan_number == 0) {
+    JPEGLI_ERROR("No SOS marker found.");
+  }
+  return cinfo->master->is_multiscan_;
+}
+
+boolean jpegli_input_complete(j_decompress_ptr cinfo) {
+  return cinfo->master->found_eoi_;
+}
+
+boolean jpegli_start_decompress(j_decompress_ptr cinfo) {
+  jpeg_decomp_master* m = cinfo->master;
+  if (cinfo->global_state == jpegli::kDecHeaderDone) {
+    m->streaming_mode_ = !m->is_multiscan_ && !cinfo->buffered_image &&
+                         (!cinfo->quantize_colors || !cinfo->two_pass_quantize);
+    jpegli::AllocateCoefficientBuffer(cinfo);
+    jpegli_calc_output_dimensions(cinfo);
+    jpegli::PrepareForScan(cinfo);
+    if (cinfo->quantize_colors) {
+      if (cinfo->colormap != nullptr) {
+        cinfo->enable_external_quant = TRUE;
+      } else if (cinfo->two_pass_quantize &&
+                 cinfo->out_color_space == JCS_RGB) {
+        cinfo->enable_2pass_quant = TRUE;
+      } else {
+        cinfo->enable_1pass_quant = TRUE;
+      }
+    }
+    jpegli::InitProgressMonitor(cinfo, /*coef_only=*/false);
+    if (cinfo->buffered_image == TRUE) {
+      cinfo->output_scan_number = 0;
+      return TRUE;
+    }
+  } else if (!m->is_multiscan_) {
+    JPEGLI_ERROR("jpegli_start_decompress: unexpected state %d",
+                 cinfo->global_state);
+  }
+  if (m->is_multiscan_) {
+    if (cinfo->global_state != jpegli::kDecProcessScan &&
+        cinfo->global_state != jpegli::kDecProcessMarkers) {
+      JPEGLI_ERROR("jpegli_start_decompress: unexpected state %d",
+                   cinfo->global_state);
+    }
+    while (!m->found_eoi_) {
+      jpegli::ProgressMonitorInputPass(cinfo);
+      if (jpegli::ConsumeInput(cinfo) == JPEG_SUSPENDED) {
+        return FALSE;
+      }
+    }
+  }
+  cinfo->output_scan_number = cinfo->input_scan_number;
+  jpegli::PrepareForOutput(cinfo);
+  if (cinfo->quantize_colors) {
+    return jpegli::PrepareQuantizedOutput(cinfo);
+  } else {
+    return TRUE;
+  }
+}
+
+boolean jpegli_start_output(j_decompress_ptr cinfo, int scan_number) {
+  jpeg_decomp_master* m = cinfo->master;
+  if (!cinfo->buffered_image) {
+    JPEGLI_ERROR("jpegli_start_output: buffered image mode was not set");
+  }
+  if (cinfo->global_state != jpegli::kDecProcessScan &&
+      cinfo->global_state != jpegli::kDecProcessMarkers) {
+    JPEGLI_ERROR("jpegli_start_output: unexpected state %d",
+                 cinfo->global_state);
+  }
+  cinfo->output_scan_number = std::max(1, scan_number);
+  if (m->found_eoi_) {
+    cinfo->output_scan_number =
+        std::min(cinfo->output_scan_number, cinfo->input_scan_number);
+  }
+  jpegli::InitProgressMonitorForOutput(cinfo);
+  // TODO(szabadka): Figure out how much we can reuse.
+  jpegli::PrepareForOutput(cinfo);
+  if (cinfo->quantize_colors) {
+    return jpegli::PrepareQuantizedOutput(cinfo);
+  } else {
+    return TRUE;
+  }
+}
+
+boolean jpegli_finish_output(j_decompress_ptr cinfo) {
+  if (!cinfo->buffered_image) {
+    JPEGLI_ERROR("jpegli_finish_output: buffered image mode was not set");
+  }
+  if (cinfo->global_state != jpegli::kDecProcessScan &&
+      cinfo->global_state != jpegli::kDecProcessMarkers) {
+    JPEGLI_ERROR("jpegli_finish_output: unexpected state %d",
+                 cinfo->global_state);
+  }
+  // Advance input to the start of the next scan, or to the end of input.
+  while (cinfo->input_scan_number <= cinfo->output_scan_number &&
+         !cinfo->master->found_eoi_) {
+    if (jpegli::ConsumeInput(cinfo) == JPEG_SUSPENDED) {
+      return FALSE;
+    }
+  }
+  return TRUE;
+}
+
+JDIMENSION jpegli_read_scanlines(j_decompress_ptr cinfo, JSAMPARRAY scanlines,
+                                 JDIMENSION max_lines) {
+  jpeg_decomp_master* m = cinfo->master;
+  if (cinfo->global_state != jpegli::kDecProcessScan &&
+      cinfo->global_state != jpegli::kDecProcessMarkers) {
+    JPEGLI_ERROR("jpegli_read_scanlines: unexpected state %d",
+                 cinfo->global_state);
+  }
+  if (cinfo->buffered_image) {
+    if (cinfo->output_scan_number == 0) {
+      JPEGLI_ERROR(
+          "jpegli_read_scanlines: "
+          "jpegli_start_output() was not called");
+    }
+  } else if (m->is_multiscan_ && !m->found_eoi_) {
+    JPEGLI_ERROR(
+        "jpegli_read_scanlines: "
+        "jpegli_start_decompress() did not finish");
+  }
+  if (cinfo->output_scanline + max_lines > cinfo->output_height) {
+    max_lines = cinfo->output_height - cinfo->output_scanline;
+  }
+  jpegli::ProgressMonitorOutputPass(cinfo);
+  size_t num_output_rows = 0;
+  while (num_output_rows < max_lines) {
+    if (jpegli::IsInputReady(cinfo)) {
+      jpegli::ProcessOutput(cinfo, &num_output_rows, scanlines, max_lines);
+    } else if (jpegli::ConsumeInput(cinfo) == JPEG_SUSPENDED) {
+      break;
+    }
+  }
+  return num_output_rows;
+}
+
+JDIMENSION jpegli_skip_scanlines(j_decompress_ptr cinfo, JDIMENSION num_lines) {
+  // TODO(szabadka) Skip the IDCT for skipped over blocks.
+  return jpegli_read_scanlines(cinfo, nullptr, num_lines);
+}
+
+void jpegli_crop_scanline(j_decompress_ptr cinfo, JDIMENSION* xoffset,
+                          JDIMENSION* width) {
+  jpeg_decomp_master* m = cinfo->master;
+  if ((cinfo->global_state != jpegli::kDecProcessScan &&
+       cinfo->global_state != jpegli::kDecProcessMarkers) ||
+      cinfo->output_scanline != 0) {
+    JPEGLI_ERROR("jpegli_crop_decompress: unexpected state %d",
+                 cinfo->global_state);
+  }
+  if (cinfo->raw_data_out) {
+    JPEGLI_ERROR("Output cropping is not supported in raw data mode");
+  }
+  if (xoffset == nullptr || width == nullptr || *width == 0 ||
+      *xoffset + *width > cinfo->output_width) {
+    JPEGLI_ERROR("jpegli_crop_scanline: Invalid arguments");
+  }
+  // TODO(szabadka) Skip the IDCT for skipped over blocks.
+  size_t xend = *xoffset + *width;
+  size_t iMCU_width = m->min_scaled_dct_size * cinfo->max_h_samp_factor;
+  *xoffset = (*xoffset / iMCU_width) * iMCU_width;
+  *width = xend - *xoffset;
+  cinfo->master->xoffset_ = *xoffset;
+  cinfo->output_width = *width;
+}
+
+JDIMENSION jpegli_read_raw_data(j_decompress_ptr cinfo, JSAMPIMAGE data,
+                                JDIMENSION max_lines) {
+  if ((cinfo->global_state != jpegli::kDecProcessScan &&
+       cinfo->global_state != jpegli::kDecProcessMarkers) ||
+      !cinfo->raw_data_out) {
+    JPEGLI_ERROR("jpegli_read_raw_data: unexpected state %d",
+                 cinfo->global_state);
+  }
+  size_t iMCU_height = cinfo->max_v_samp_factor * DCTSIZE;
+  if (max_lines < iMCU_height) {
+    JPEGLI_ERROR("jpegli_read_raw_data: output buffer too small");
+  }
+  jpegli::ProgressMonitorOutputPass(cinfo);
+  while (!jpegli::IsInputReady(cinfo)) {
+    if (jpegli::ConsumeInput(cinfo) == JPEG_SUSPENDED) {
+      return 0;
+    }
+  }
+  if (cinfo->output_iMCU_row < cinfo->total_iMCU_rows) {
+    jpegli::ProcessRawOutput(cinfo, data);
+    return iMCU_height;
+  }
+  return 0;
+}
+
+jvirt_barray_ptr* jpegli_read_coefficients(j_decompress_ptr cinfo) {
+  jpeg_decomp_master* m = cinfo->master;
+  m->streaming_mode_ = false;
+  if (!cinfo->buffered_image && cinfo->global_state == jpegli::kDecHeaderDone) {
+    jpegli::AllocateCoefficientBuffer(cinfo);
+    jpegli_calc_output_dimensions(cinfo);
+    jpegli::InitProgressMonitor(cinfo, /*coef_only=*/true);
+    jpegli::PrepareForScan(cinfo);
+  }
+  if (cinfo->global_state != jpegli::kDecProcessScan &&
+      cinfo->global_state != jpegli::kDecProcessMarkers) {
+    JPEGLI_ERROR("jpegli_read_coefficients: unexpected state %d",
+                 cinfo->global_state);
+  }
+  if (!cinfo->buffered_image) {
+    while (!m->found_eoi_) {
+      jpegli::ProgressMonitorInputPass(cinfo);
+      if (jpegli::ConsumeInput(cinfo) == JPEG_SUSPENDED) {
+        return nullptr;
+      }
+    }
+    cinfo->output_scanline = cinfo->output_height;
+  }
+  return m->coef_arrays;
+}
+
+boolean jpegli_finish_decompress(j_decompress_ptr cinfo) {
+  if (cinfo->global_state != jpegli::kDecProcessScan &&
+      cinfo->global_state != jpegli::kDecProcessMarkers) {
+    JPEGLI_ERROR("jpegli_finish_decompress: unexpected state %d",
+                 cinfo->global_state);
+  }
+  if (!cinfo->buffered_image && cinfo->output_scanline < cinfo->output_height) {
+    JPEGLI_ERROR("Incomplete output");
+  }
+  while (!cinfo->master->found_eoi_) {
+    if (jpegli::ConsumeInput(cinfo) == JPEG_SUSPENDED) {
+      return FALSE;
+    }
+  }
+  (*cinfo->src->term_source)(cinfo);
+  jpegli_abort_decompress(cinfo);
+  return TRUE;
+}
+
+boolean jpegli_resync_to_restart(j_decompress_ptr cinfo, int desired) {
+  JPEGLI_WARN("Invalid restart marker found: 0x%02x vs 0x%02x.",
+              cinfo->unread_marker, 0xd0 + desired);
+  // This is a trivial implementation, we just let the decoder skip the entire
+  // scan and attempt to render the partial input.
+  return TRUE;
+}
+
+void jpegli_new_colormap(j_decompress_ptr cinfo) {
+  if (cinfo->global_state != jpegli::kDecProcessScan &&
+      cinfo->global_state != jpegli::kDecProcessMarkers) {
+    JPEGLI_ERROR("jpegli_new_colormap: unexpected state %d",
+                 cinfo->global_state);
+  }
+  if (!cinfo->buffered_image) {
+    JPEGLI_ERROR("jpegli_new_colormap: not in  buffered image mode");
+  }
+  if (!cinfo->enable_external_quant) {
+    JPEGLI_ERROR("external colormap quantizer was not enabled");
+  }
+  if (!cinfo->quantize_colors || cinfo->colormap == nullptr) {
+    JPEGLI_ERROR("jpegli_new_colormap: not in external colormap mode");
+  }
+  cinfo->master->regenerate_inverse_colormap_ = true;
+}
+
+void jpegli_set_output_format(j_decompress_ptr cinfo, JpegliDataType data_type,
+                              JpegliEndianness endianness) {
+  switch (data_type) {
+    case JPEGLI_TYPE_UINT8:
+    case JPEGLI_TYPE_UINT16:
+    case JPEGLI_TYPE_FLOAT:
+      cinfo->master->output_data_type_ = data_type;
+      break;
+    default:
+      JPEGLI_ERROR("Unsupported data type %d", data_type);
+  }
+  switch (endianness) {
+    case JPEGLI_NATIVE_ENDIAN:
+      cinfo->master->swap_endianness_ = false;
+      break;
+    case JPEGLI_LITTLE_ENDIAN:
+      cinfo->master->swap_endianness_ = !IsLittleEndian();
+      break;
+    case JPEGLI_BIG_ENDIAN:
+      cinfo->master->swap_endianness_ = IsLittleEndian();
+      break;
+    default:
+      JPEGLI_ERROR("Unsupported endianness %d", endianness);
+  }
+}
diff --git a/third_party/jpeg-xl/lib/jpegli/decode.h b/third_party/jpeg-xl/lib/jpegli/decode.h
new file mode 100644
index 0000000000..7b7a2034ad
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/decode.h
@@ -0,0 +1,111 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+//
+// This file conatins the C API of the decoder part of the libjpegli library,
+// which is based on the C API of libjpeg, with the function names changed from
+// jpeg_* to jpegli_*, while dempressor object definitions are included directly
+// from jpeglib.h
+//
+// Applications can use the libjpegli library in one of the following ways:
+//
+//  (1) Include jpegli/encode.h and/or jpegli/decode.h, update the function
+//      names of the API and link against libjpegli.
+//
+//  (2) Leave the application code unchanged, but replace the libjpeg.so library
+//      with the one built by this project that is API- and ABI-compatible with
+//      libjpeg-turbo's version of libjpeg.so.
+
+#ifndef LIB_JPEGLI_DECODE_H_
+#define LIB_JPEGLI_DECODE_H_
+
+/* clang-format off */
+#include <stdio.h>
+#include <jpeglib.h>
+/* clang-format on */
+
+#include "lib/jpegli/common.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+#define jpegli_create_decompress(cinfo)              \
+  jpegli_CreateDecompress((cinfo), JPEG_LIB_VERSION, \
+                          (size_t)sizeof(struct jpeg_decompress_struct))
+
+void jpegli_CreateDecompress(j_decompress_ptr cinfo, int version,
+                             size_t structsize);
+
+void jpegli_stdio_src(j_decompress_ptr cinfo, FILE *infile);
+
+void jpegli_mem_src(j_decompress_ptr cinfo, const unsigned char *inbuffer,
+                    unsigned long insize);
+
+int jpegli_read_header(j_decompress_ptr cinfo, boolean require_image);
+
+boolean jpegli_start_decompress(j_decompress_ptr cinfo);
+
+JDIMENSION jpegli_read_scanlines(j_decompress_ptr cinfo, JSAMPARRAY scanlines,
+                                 JDIMENSION max_lines);
+
+JDIMENSION jpegli_skip_scanlines(j_decompress_ptr cinfo, JDIMENSION num_lines);
+
+void jpegli_crop_scanline(j_decompress_ptr cinfo, JDIMENSION *xoffset,
+                          JDIMENSION *width);
+
+boolean jpegli_finish_decompress(j_decompress_ptr cinfo);
+
+JDIMENSION jpegli_read_raw_data(j_decompress_ptr cinfo, JSAMPIMAGE data,
+                                JDIMENSION max_lines);
+
+jvirt_barray_ptr *jpegli_read_coefficients(j_decompress_ptr cinfo);
+
+boolean jpegli_has_multiple_scans(j_decompress_ptr cinfo);
+
+boolean jpegli_start_output(j_decompress_ptr cinfo, int scan_number);
+
+boolean jpegli_finish_output(j_decompress_ptr cinfo);
+
+boolean jpegli_input_complete(j_decompress_ptr cinfo);
+
+int jpegli_consume_input(j_decompress_ptr cinfo);
+
+#if JPEG_LIB_VERSION >= 80
+void jpegli_core_output_dimensions(j_decompress_ptr cinfo);
+#endif
+void jpegli_calc_output_dimensions(j_decompress_ptr cinfo);
+
+void jpegli_save_markers(j_decompress_ptr cinfo, int marker_code,
+                         unsigned int length_limit);
+
+void jpegli_set_marker_processor(j_decompress_ptr cinfo, int marker_code,
+                                 jpeg_marker_parser_method routine);
+
+boolean jpegli_resync_to_restart(j_decompress_ptr cinfo, int desired);
+
+boolean jpegli_read_icc_profile(j_decompress_ptr cinfo, JOCTET **icc_data_ptr,
+                                unsigned int *icc_data_len);
+
+void jpegli_abort_decompress(j_decompress_ptr cinfo);
+
+void jpegli_destroy_decompress(j_decompress_ptr cinfo);
+
+void jpegli_new_colormap(j_decompress_ptr cinfo);
+
+//
+// New API functions that are not available in libjpeg
+//
+// NOTE: This part of the API is still experimental and will probably change in
+// the future.
+//
+
+void jpegli_set_output_format(j_decompress_ptr cinfo, JpegliDataType data_type,
+                              JpegliEndianness endianness);
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}  // extern "C"
+#endif
+
+#endif  // LIB_JPEGLI_DECODE_H_
diff --git a/third_party/jpeg-xl/lib/jpegli/decode_api_test.cc b/third_party/jpeg-xl/lib/jpegli/decode_api_test.cc
new file mode 100644
index 0000000000..0bb7321db4
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/decode_api_test.cc
@@ -0,0 +1,1305 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <stdio.h>
+
+#include <cmath>
+#include <vector>
+
+#include "lib/jpegli/decode.h"
+#include "lib/jpegli/encode.h"
+#include "lib/jpegli/test_utils.h"
+#include "lib/jpegli/testing.h"
+#include "lib/jxl/base/byte_order.h"
+#include "lib/jxl/base/file_io.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/sanitizers.h"
+
+namespace jpegli {
+namespace {
+
+static constexpr uint8_t kFakeEoiMarker[2] = {0xff, 0xd9};
+static constexpr size_t kNumSourceBuffers = 4;
+
+// Custom source manager that refills the input buffer in chunks, simulating
+// a file reader with a fixed buffer size.
+class SourceManager {
+ public:
+  SourceManager(const uint8_t* data, size_t len, size_t max_chunk_size)
+      : data_(data), len_(len), max_chunk_size_(max_chunk_size) {
+    pub_.skip_input_data = skip_input_data;
+    pub_.resync_to_restart = jpegli_resync_to_restart;
+    pub_.term_source = term_source;
+    pub_.init_source = init_source;
+    pub_.fill_input_buffer = fill_input_buffer;
+    if (max_chunk_size_ == 0) max_chunk_size_ = len;
+    buffers_.resize(kNumSourceBuffers, std::vector<uint8_t>(max_chunk_size_));
+    Reset();
+  }
+
+  void Reset() {
+    pub_.next_input_byte = nullptr;
+    pub_.bytes_in_buffer = 0;
+    pos_ = 0;
+    chunk_idx_ = 0;
+  }
+
+  ~SourceManager() {
+    EXPECT_EQ(0, pub_.bytes_in_buffer);
+    EXPECT_EQ(len_, pos_);
+  }
+
+ private:
+  jpeg_source_mgr pub_;
+  const uint8_t* data_;
+  size_t len_;
+  size_t chunk_idx_;
+  size_t pos_;
+  size_t max_chunk_size_;
+  std::vector<std::vector<uint8_t>> buffers_;
+
+  static void init_source(j_decompress_ptr cinfo) {}
+
+  static boolean fill_input_buffer(j_decompress_ptr cinfo) {
+    auto src = reinterpret_cast<SourceManager*>(cinfo->src);
+    if (src->pos_ < src->len_) {
+      size_t chunk_size = std::min(src->len_ - src->pos_, src->max_chunk_size_);
+      size_t next_idx = ++src->chunk_idx_ % kNumSourceBuffers;
+      uint8_t* next_buffer = src->buffers_[next_idx].data();
+      memcpy(next_buffer, src->data_ + src->pos_, chunk_size);
+      src->pub_.next_input_byte = next_buffer;
+      src->pub_.bytes_in_buffer = chunk_size;
+    } else {
+      src->pub_.next_input_byte = kFakeEoiMarker;
+      src->pub_.bytes_in_buffer = 2;
+      src->len_ += 2;
+    }
+    src->pos_ += src->pub_.bytes_in_buffer;
+    return TRUE;
+  }
+
+  static void skip_input_data(j_decompress_ptr cinfo, long num_bytes) {
+    auto src = reinterpret_cast<SourceManager*>(cinfo->src);
+    if (num_bytes <= 0) {
+      return;
+    }
+    if (src->pub_.bytes_in_buffer >= static_cast<size_t>(num_bytes)) {
+      src->pub_.bytes_in_buffer -= num_bytes;
+      src->pub_.next_input_byte += num_bytes;
+    } else {
+      src->pos_ += num_bytes - src->pub_.bytes_in_buffer;
+      src->pub_.bytes_in_buffer = 0;
+    }
+  }
+
+  static void term_source(j_decompress_ptr cinfo) {}
+};
+
+uint8_t markers_seen[kMarkerSequenceLen];
+size_t num_markers_seen = 0;
+
+uint8_t get_next_byte(j_decompress_ptr cinfo) {
+  if (cinfo->src->bytes_in_buffer == 0) {
+    (*cinfo->src->fill_input_buffer)(cinfo);
+  }
+  cinfo->src->bytes_in_buffer--;
+  return *cinfo->src->next_input_byte++;
+}
+
+boolean test_marker_processor(j_decompress_ptr cinfo) {
+  markers_seen[num_markers_seen] = cinfo->unread_marker;
+  size_t marker_len = (get_next_byte(cinfo) << 8) + get_next_byte(cinfo);
+  EXPECT_EQ(2 + ((num_markers_seen + 2) % sizeof(kMarkerData)), marker_len);
+  if (marker_len > 2) {
+    (*cinfo->src->skip_input_data)(cinfo, marker_len - 2);
+  }
+  ++num_markers_seen;
+  return TRUE;
+}
+
+void ReadOutputImage(const DecompressParams& dparams, j_decompress_ptr cinfo,
+                     TestImage* output) {
+  JDIMENSION xoffset = 0;
+  JDIMENSION yoffset = 0;
+  JDIMENSION xsize_cropped = cinfo->output_width;
+  JDIMENSION ysize_cropped = cinfo->output_height;
+  if (dparams.crop_output) {
+    xoffset = xsize_cropped = cinfo->output_width / 3;
+    yoffset = ysize_cropped = cinfo->output_height / 3;
+    jpegli_crop_scanline(cinfo, &xoffset, &xsize_cropped);
+  }
+  output->ysize = ysize_cropped;
+  output->xsize = cinfo->output_width;
+  output->components = cinfo->out_color_components;
+  output->data_type = dparams.data_type;
+  output->endianness = dparams.endianness;
+  size_t bytes_per_sample = jpegli_bytes_per_sample(dparams.data_type);
+  if (cinfo->raw_data_out) {
+    output->color_space = cinfo->jpeg_color_space;
+    for (int c = 0; c < cinfo->num_components; ++c) {
+      size_t xsize = cinfo->comp_info[c].width_in_blocks * DCTSIZE;
+      size_t ysize = cinfo->comp_info[c].height_in_blocks * DCTSIZE;
+      std::vector<uint8_t> plane(ysize * xsize * bytes_per_sample);
+      output->raw_data.emplace_back(std::move(plane));
+    }
+  } else {
+    output->color_space = cinfo->out_color_space;
+    output->AllocatePixels();
+  }
+  size_t total_output_lines = 0;
+  while (cinfo->output_scanline < cinfo->output_height) {
+    size_t max_lines;
+    size_t num_output_lines;
+    if (cinfo->raw_data_out) {
+      size_t iMCU_height = cinfo->max_v_samp_factor * DCTSIZE;
+      EXPECT_EQ(cinfo->output_scanline, cinfo->output_iMCU_row * iMCU_height);
+      max_lines = iMCU_height;
+      std::vector<std::vector<JSAMPROW>> rowdata(cinfo->num_components);
+      std::vector<JSAMPARRAY> data(cinfo->num_components);
+      for (int c = 0; c < cinfo->num_components; ++c) {
+        size_t xsize = cinfo->comp_info[c].width_in_blocks * DCTSIZE;
+        size_t ysize = cinfo->comp_info[c].height_in_blocks * DCTSIZE;
+        size_t num_lines = cinfo->comp_info[c].v_samp_factor * DCTSIZE;
+        rowdata[c].resize(num_lines);
+        size_t y0 = cinfo->output_iMCU_row * num_lines;
+        for (size_t i = 0; i < num_lines; ++i) {
+          rowdata[c][i] =
+              y0 + i < ysize ? &output->raw_data[c][(y0 + i) * xsize] : nullptr;
+        }
+        data[c] = &rowdata[c][0];
+      }
+      num_output_lines = jpegli_read_raw_data(cinfo, &data[0], max_lines);
+    } else {
+      size_t max_output_lines = dparams.max_output_lines;
+      if (max_output_lines == 0) max_output_lines = cinfo->output_height;
+      if (cinfo->output_scanline < yoffset) {
+        max_lines = yoffset - cinfo->output_scanline;
+        num_output_lines = jpegli_skip_scanlines(cinfo, max_lines);
+      } else if (cinfo->output_scanline >= yoffset + ysize_cropped) {
+        max_lines = cinfo->output_height - cinfo->output_scanline;
+        num_output_lines = jpegli_skip_scanlines(cinfo, max_lines);
+      } else {
+        size_t lines_left = yoffset + ysize_cropped - cinfo->output_scanline;
+        max_lines = std::min<size_t>(max_output_lines, lines_left);
+        size_t stride = cinfo->output_width * cinfo->out_color_components *
+                        bytes_per_sample;
+        std::vector<JSAMPROW> scanlines(max_lines);
+        for (size_t i = 0; i < max_lines; ++i) {
+          size_t yidx = cinfo->output_scanline - yoffset + i;
+          scanlines[i] = &output->pixels[yidx * stride];
+        }
+        num_output_lines =
+            jpegli_read_scanlines(cinfo, &scanlines[0], max_lines);
+        if (cinfo->quantize_colors) {
+          for (size_t i = 0; i < num_output_lines; ++i) {
+            UnmapColors(scanlines[i], cinfo->output_width,
+                        cinfo->out_color_components, cinfo->colormap,
+                        cinfo->actual_number_of_colors);
+          }
+        }
+      }
+    }
+    total_output_lines += num_output_lines;
+    EXPECT_EQ(total_output_lines, cinfo->output_scanline);
+    EXPECT_EQ(num_output_lines, max_lines);
+  }
+  EXPECT_EQ(cinfo->total_iMCU_rows,
+            DivCeil(cinfo->image_height, cinfo->max_v_samp_factor * DCTSIZE));
+}
+
+struct TestConfig {
+  std::string fn;
+  std::string fn_desc;
+  TestImage input;
+  CompressParams jparams;
+  DecompressParams dparams;
+  bool compare_to_orig = false;
+  float max_tolerance_factor = 1.01f;
+  float max_rms_dist = 1.0f;
+  float max_diff = 35.0f;
+};
+
+std::vector<uint8_t> GetTestJpegData(TestConfig& config) {
+  std::vector<uint8_t> compressed;
+  if (!config.fn.empty()) {
+    compressed = ReadTestData(config.fn.c_str());
+  } else {
+    GeneratePixels(&config.input);
+    JXL_CHECK(EncodeWithJpegli(config.input, config.jparams, &compressed));
+  }
+  if (config.dparams.size_factor < 1.0f) {
+    compressed.resize(compressed.size() * config.dparams.size_factor);
+  }
+  return compressed;
+}
+
+void TestAPINonBuffered(const CompressParams& jparams,
+                        const DecompressParams& dparams,
+                        const TestImage& expected_output,
+                        j_decompress_ptr cinfo, TestImage* output) {
+  if (jparams.add_marker) {
+    jpegli_save_markers(cinfo, kSpecialMarker0, 0xffff);
+    jpegli_save_markers(cinfo, kSpecialMarker1, 0xffff);
+    num_markers_seen = 0;
+    jpegli_set_marker_processor(cinfo, 0xe6, test_marker_processor);
+    jpegli_set_marker_processor(cinfo, 0xe7, test_marker_processor);
+    jpegli_set_marker_processor(cinfo, 0xe8, test_marker_processor);
+  }
+  if (!jparams.icc.empty()) {
+    jpegli_save_markers(cinfo, JPEG_APP0 + 2, 0xffff);
+  }
+  jpegli_read_header(cinfo, /*require_image=*/TRUE);
+  if (jparams.add_marker) {
+    EXPECT_EQ(num_markers_seen, kMarkerSequenceLen);
+    EXPECT_EQ(0, memcmp(markers_seen, kMarkerSequence, num_markers_seen));
+  }
+  if (!jparams.icc.empty()) {
+    uint8_t* icc_data = nullptr;
+    unsigned int icc_len;
+    JXL_CHECK(jpegli_read_icc_profile(cinfo, &icc_data, &icc_len));
+    JXL_CHECK(icc_data);
+    EXPECT_EQ(0, memcmp(jparams.icc.data(), icc_data, icc_len));
+    free(icc_data);
+  }
+  // Check that jpegli_calc_output_dimensions can be called multiple times
+  // even with different parameters.
+  if (!cinfo->raw_data_out) {
+    cinfo->scale_num = 1;
+    cinfo->scale_denom = 2;
+  }
+  jpegli_calc_output_dimensions(cinfo);
+  SetDecompressParams(dparams, cinfo, /*is_jpegli=*/true);
+  VerifyHeader(jparams, cinfo);
+  jpegli_calc_output_dimensions(cinfo);
+  EXPECT_LE(expected_output.xsize, cinfo->output_width);
+  if (!dparams.crop_output) {
+    EXPECT_EQ(expected_output.xsize, cinfo->output_width);
+  }
+  if (dparams.output_mode == COEFFICIENTS) {
+    jvirt_barray_ptr* coef_arrays = jpegli_read_coefficients(cinfo);
+    JXL_CHECK(coef_arrays != nullptr);
+    CopyCoefficients(cinfo, coef_arrays, output);
+  } else {
+    jpegli_start_decompress(cinfo);
+    VerifyScanHeader(jparams, cinfo);
+    ReadOutputImage(dparams, cinfo, output);
+  }
+  jpegli_finish_decompress(cinfo);
+}
+
+void TestAPIBuffered(const CompressParams& jparams,
+                     const DecompressParams& dparams, j_decompress_ptr cinfo,
+                     std::vector<TestImage>* output_progression) {
+  EXPECT_EQ(JPEG_REACHED_SOS,
+            jpegli_read_header(cinfo, /*require_image=*/TRUE));
+  cinfo->buffered_image = TRUE;
+  SetDecompressParams(dparams, cinfo, /*is_jpegli=*/true);
+  VerifyHeader(jparams, cinfo);
+  EXPECT_TRUE(jpegli_start_decompress(cinfo));
+  // start decompress should not read the whole input in buffered image mode
+  EXPECT_FALSE(jpegli_input_complete(cinfo));
+  bool has_multiple_scans = jpegli_has_multiple_scans(cinfo);
+  EXPECT_EQ(0, cinfo->output_scan_number);
+  int sos_marker_cnt = 1;  // read_header reads the first SOS marker
+  while (!jpegli_input_complete(cinfo)) {
+    EXPECT_EQ(cinfo->input_scan_number, sos_marker_cnt);
+    if (dparams.skip_scans && (cinfo->input_scan_number % 2) != 1) {
+      int result = JPEG_SUSPENDED;
+      while (result != JPEG_REACHED_SOS && result != JPEG_REACHED_EOI) {
+        result = jpegli_consume_input(cinfo);
+      }
+      if (result == JPEG_REACHED_SOS) ++sos_marker_cnt;
+      continue;
+    }
+    SetScanDecompressParams(dparams, cinfo, cinfo->input_scan_number,
+                            /*is_jpegli=*/true);
+    EXPECT_TRUE(jpegli_start_output(cinfo, cinfo->input_scan_number));
+    // start output sets output_scan_number, but does not change
+    // input_scan_number
+    EXPECT_EQ(cinfo->output_scan_number, cinfo->input_scan_number);
+    EXPECT_EQ(cinfo->input_scan_number, sos_marker_cnt);
+    VerifyScanHeader(jparams, cinfo);
+    TestImage output;
+    ReadOutputImage(dparams, cinfo, &output);
+    output_progression->emplace_back(std::move(output));
+    // read scanlines/read raw data does not change input/output scan number
+    EXPECT_EQ(cinfo->input_scan_number, sos_marker_cnt);
+    EXPECT_EQ(cinfo->output_scan_number, cinfo->input_scan_number);
+    EXPECT_TRUE(jpegli_finish_output(cinfo));
+    ++sos_marker_cnt;  // finish output reads the next SOS marker or EOI
+    if (dparams.output_mode == COEFFICIENTS) {
+      jvirt_barray_ptr* coef_arrays = jpegli_read_coefficients(cinfo);
+      JXL_CHECK(coef_arrays != nullptr);
+      CopyCoefficients(cinfo, coef_arrays, &output_progression->back());
+    }
+  }
+  jpegli_finish_decompress(cinfo);
+  if (dparams.size_factor == 1.0f) {
+    EXPECT_EQ(has_multiple_scans, cinfo->input_scan_number > 1);
+  }
+}
+
+TEST(DecodeAPITest, ReuseCinfo) {
+  TestImage input, output, expected;
+  std::vector<TestImage> output_progression, expected_output_progression;
+  CompressParams jparams;
+  DecompressParams dparams;
+  std::vector<uint8_t> compressed;
+  jpeg_decompress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_decompress(&cinfo);
+    input.xsize = 129;
+    input.ysize = 73;
+    GeneratePixels(&input);
+    for (int h_samp : {2, 1}) {
+      for (int v_samp : {2, 1}) {
+        for (int progr : {0, 2}) {
+          jparams.h_sampling = {h_samp, 1, 1};
+          jparams.v_sampling = {v_samp, 1, 1};
+          jparams.progressive_mode = progr;
+          printf(
+              "Generating input with %dx%d chroma subsampling "
+              "progressive level %d\n",
+              h_samp, v_samp, progr);
+          JXL_CHECK(EncodeWithJpegli(input, jparams, &compressed));
+          for (JpegIOMode output_mode : {PIXELS, RAW_DATA, COEFFICIENTS}) {
+            for (bool crop : {true, false}) {
+              if (crop && output_mode != PIXELS) continue;
+              for (int scale_num : {1, 2, 3, 4, 7, 8, 13, 16}) {
+                if (scale_num != 8 && output_mode != PIXELS) continue;
+                int scale_denom = 8;
+                while (scale_num % 2 == 0 && scale_denom % 2 == 0) {
+                  scale_num /= 2;
+                  scale_denom /= 2;
+                }
+                printf("Decoding with output mode %d output scaling %d/%d %s\n",
+                       output_mode, scale_num, scale_denom,
+                       crop ? "with cropped output" : "");
+                dparams.output_mode = output_mode;
+                dparams.scale_num = scale_num;
+                dparams.scale_denom = scale_denom;
+                expected.Clear();
+                DecodeWithLibjpeg(jparams, dparams, compressed, &expected);
+                output.Clear();
+                cinfo.buffered_image = false;
+                cinfo.raw_data_out = false;
+                cinfo.scale_num = cinfo.scale_denom = 1;
+                SourceManager src(compressed.data(), compressed.size(),
+                                  1u << 12);
+                cinfo.src = reinterpret_cast<jpeg_source_mgr*>(&src);
+                jpegli_read_header(&cinfo, /*require_image=*/TRUE);
+                jpegli_abort_decompress(&cinfo);
+                src.Reset();
+                TestAPINonBuffered(jparams, dparams, expected, &cinfo, &output);
+                float max_rms = output_mode == COEFFICIENTS ? 0.0f : 1.0f;
+                if (scale_num == 1 && scale_denom == 8 && h_samp != v_samp) {
+                  max_rms = 5.0f;  // libjpeg does not do fancy upsampling
+                }
+                VerifyOutputImage(expected, output, max_rms);
+                printf("Decoding in buffered image mode\n");
+                expected_output_progression.clear();
+                DecodeAllScansWithLibjpeg(jparams, dparams, compressed,
+                                          &expected_output_progression);
+                output_progression.clear();
+                src.Reset();
+                TestAPIBuffered(jparams, dparams, &cinfo, &output_progression);
+                JXL_CHECK(output_progression.size() ==
+                          expected_output_progression.size());
+                for (size_t i = 0; i < output_progression.size(); ++i) {
+                  const TestImage& output = output_progression[i];
+                  const TestImage& expected = expected_output_progression[i];
+                  VerifyOutputImage(expected, output, max_rms);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+    return true;
+  };
+  ASSERT_TRUE(try_catch_block());
+  jpegli_destroy_decompress(&cinfo);
+}
+
+std::vector<TestConfig> GenerateBasicConfigs() {
+  std::vector<TestConfig> all_configs;
+  for (int samp : {1, 2}) {
+    for (int progr : {0, 2}) {
+      TestConfig config;
+      config.input.xsize = 257 + samp * 37;
+      config.input.ysize = 265 + (progr / 2) * 17;
+      config.jparams.h_sampling = {samp, 1, 1};
+      config.jparams.v_sampling = {samp, 1, 1};
+      config.jparams.progressive_mode = progr;
+      GeneratePixels(&config.input);
+      all_configs.push_back(config);
+    }
+  }
+  return all_configs;
+}
+
+TEST(DecodeAPITest, ReuseCinfoSameMemSource) {
+  std::vector<TestConfig> all_configs = GenerateBasicConfigs();
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  {
+    jpeg_compress_struct cinfo;
+    const auto try_catch_block = [&]() -> bool {
+      ERROR_HANDLER_SETUP(jpegli);
+      jpegli_create_compress(&cinfo);
+      jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+      for (const TestConfig& config : all_configs) {
+        EncodeWithJpegli(config.input, config.jparams, &cinfo);
+      }
+      return true;
+    };
+    EXPECT_TRUE(try_catch_block());
+    jpegli_destroy_compress(&cinfo);
+  }
+  std::vector<TestImage> all_outputs(all_configs.size());
+  {
+    jpeg_decompress_struct cinfo;
+    const auto try_catch_block = [&]() -> bool {
+      ERROR_HANDLER_SETUP(jpegli);
+      jpegli_create_decompress(&cinfo);
+      jpegli_mem_src(&cinfo, buffer, buffer_size);
+      for (size_t i = 0; i < all_configs.size(); ++i) {
+        TestAPINonBuffered(all_configs[i].jparams, DecompressParams(),
+                           all_configs[i].input, &cinfo, &all_outputs[i]);
+      }
+      return true;
+    };
+    EXPECT_TRUE(try_catch_block());
+    jpegli_destroy_decompress(&cinfo);
+  }
+  for (size_t i = 0; i < all_configs.size(); ++i) {
+    VerifyOutputImage(all_configs[i].input, all_outputs[i], 2.35f);
+  }
+  if (buffer) free(buffer);
+}
+
+TEST(DecodeAPITest, ReuseCinfoSameStdSource) {
+  std::vector<TestConfig> all_configs = GenerateBasicConfigs();
+  FILE* tmpf = tmpfile();
+  JXL_CHECK(tmpf);
+  {
+    jpeg_compress_struct cinfo;
+    const auto try_catch_block = [&]() -> bool {
+      ERROR_HANDLER_SETUP(jpegli);
+      jpegli_create_compress(&cinfo);
+      jpegli_stdio_dest(&cinfo, tmpf);
+      for (const TestConfig& config : all_configs) {
+        EncodeWithJpegli(config.input, config.jparams, &cinfo);
+      }
+      return true;
+    };
+    EXPECT_TRUE(try_catch_block());
+    jpegli_destroy_compress(&cinfo);
+  }
+  rewind(tmpf);
+  std::vector<TestImage> all_outputs(all_configs.size());
+  {
+    jpeg_decompress_struct cinfo;
+    const auto try_catch_block = [&]() -> bool {
+      ERROR_HANDLER_SETUP(jpegli);
+      jpegli_create_decompress(&cinfo);
+      jpegli_stdio_src(&cinfo, tmpf);
+      for (size_t i = 0; i < all_configs.size(); ++i) {
+        TestAPINonBuffered(all_configs[i].jparams, DecompressParams(),
+                           all_configs[i].input, &cinfo, &all_outputs[i]);
+      }
+      return true;
+    };
+    EXPECT_TRUE(try_catch_block());
+    jpegli_destroy_decompress(&cinfo);
+  }
+  for (size_t i = 0; i < all_configs.size(); ++i) {
+    VerifyOutputImage(all_configs[i].input, all_outputs[i], 2.35f);
+  }
+  fclose(tmpf);
+}
+
+TEST(DecodeAPITest, AbbreviatedStreams) {
+  uint8_t* table_stream = nullptr;
+  unsigned long table_stream_size = 0;
+  uint8_t* data_stream = nullptr;
+  unsigned long data_stream_size = 0;
+  {
+    jpeg_compress_struct cinfo;
+    const auto try_catch_block = [&]() -> bool {
+      ERROR_HANDLER_SETUP(jpegli);
+      jpegli_create_compress(&cinfo);
+      jpegli_mem_dest(&cinfo, &table_stream, &table_stream_size);
+      cinfo.input_components = 3;
+      cinfo.in_color_space = JCS_RGB;
+      jpegli_set_defaults(&cinfo);
+      jpegli_write_tables(&cinfo);
+      jpegli_mem_dest(&cinfo, &data_stream, &data_stream_size);
+      cinfo.image_width = 1;
+      cinfo.image_height = 1;
+      cinfo.optimize_coding = FALSE;
+      jpegli_set_progressive_level(&cinfo, 0);
+      jpegli_start_compress(&cinfo, FALSE);
+      JSAMPLE image[3] = {0};
+      JSAMPROW row[] = {image};
+      jpegli_write_scanlines(&cinfo, row, 1);
+      jpegli_finish_compress(&cinfo);
+      return true;
+    };
+    EXPECT_TRUE(try_catch_block());
+    EXPECT_LT(data_stream_size, 50);
+    jpegli_destroy_compress(&cinfo);
+  }
+  {
+    jpeg_decompress_struct cinfo = {};
+    const auto try_catch_block = [&]() -> bool {
+      ERROR_HANDLER_SETUP(jpegli);
+      jpegli_create_decompress(&cinfo);
+      jpegli_mem_src(&cinfo, table_stream, table_stream_size);
+      jpegli_read_header(&cinfo, FALSE);
+      jpegli_mem_src(&cinfo, data_stream, data_stream_size);
+      jpegli_read_header(&cinfo, TRUE);
+      EXPECT_EQ(1, cinfo.image_width);
+      EXPECT_EQ(1, cinfo.image_height);
+      EXPECT_EQ(3, cinfo.num_components);
+      jpegli_start_decompress(&cinfo);
+      JSAMPLE image[3] = {0};
+      JSAMPROW row[] = {image};
+      jpegli_read_scanlines(&cinfo, row, 1);
+      EXPECT_EQ(0, image[0]);
+      EXPECT_EQ(0, image[1]);
+      EXPECT_EQ(0, image[2]);
+      jpegli_finish_decompress(&cinfo);
+      return true;
+    };
+    EXPECT_TRUE(try_catch_block());
+    jpegli_destroy_decompress(&cinfo);
+  }
+  if (table_stream) free(table_stream);
+  if (data_stream) free(data_stream);
+}
+
+class DecodeAPITestParam : public ::testing::TestWithParam<TestConfig> {};
+
+TEST_P(DecodeAPITestParam, TestAPI) {
+  TestConfig config = GetParam();
+  const DecompressParams& dparams = config.dparams;
+  if (dparams.skip_scans) return;
+  const std::vector<uint8_t> compressed = GetTestJpegData(config);
+  SourceManager src(compressed.data(), compressed.size(), dparams.chunk_size);
+
+  TestImage output1;
+  DecodeWithLibjpeg(config.jparams, dparams, compressed, &output1);
+
+  TestImage output0;
+  jpeg_decompress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_decompress(&cinfo);
+    cinfo.src = reinterpret_cast<jpeg_source_mgr*>(&src);
+    TestAPINonBuffered(config.jparams, dparams, output1, &cinfo, &output0);
+    return true;
+  };
+  ASSERT_TRUE(try_catch_block());
+  jpegli_destroy_decompress(&cinfo);
+
+  if (config.compare_to_orig) {
+    double rms0 = DistanceRms(config.input, output0);
+    double rms1 = DistanceRms(config.input, output1);
+    printf("rms: %f  vs  %f\n", rms0, rms1);
+    EXPECT_LE(rms0, rms1 * config.max_tolerance_factor);
+  } else {
+    VerifyOutputImage(output0, output1, config.max_rms_dist, config.max_diff);
+  }
+}
+
+class DecodeAPITestParamBuffered : public ::testing::TestWithParam<TestConfig> {
+};
+
+TEST_P(DecodeAPITestParamBuffered, TestAPI) {
+  TestConfig config = GetParam();
+  const DecompressParams& dparams = config.dparams;
+  const std::vector<uint8_t> compressed = GetTestJpegData(config);
+  SourceManager src(compressed.data(), compressed.size(), dparams.chunk_size);
+
+  std::vector<TestImage> output_progression1;
+  DecodeAllScansWithLibjpeg(config.jparams, dparams, compressed,
+                            &output_progression1);
+
+  std::vector<TestImage> output_progression0;
+  jpeg_decompress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_decompress(&cinfo);
+    cinfo.src = reinterpret_cast<jpeg_source_mgr*>(&src);
+    TestAPIBuffered(config.jparams, dparams, &cinfo, &output_progression0);
+    return true;
+  };
+  ASSERT_TRUE(try_catch_block());
+  jpegli_destroy_decompress(&cinfo);
+
+  ASSERT_EQ(output_progression0.size(), output_progression1.size());
+  for (size_t i = 0; i < output_progression0.size(); ++i) {
+    const TestImage& output = output_progression0[i];
+    const TestImage& expected = output_progression1[i];
+    if (config.compare_to_orig) {
+      double rms0 = DistanceRms(config.input, output);
+      double rms1 = DistanceRms(config.input, expected);
+      printf("rms: %f  vs  %f\n", rms0, rms1);
+      EXPECT_LE(rms0, rms1 * config.max_tolerance_factor);
+    } else {
+      VerifyOutputImage(expected, output, config.max_rms_dist, config.max_diff);
+    }
+  }
+}
+
+std::vector<TestConfig> GenerateTests(bool buffered) {
+  std::vector<TestConfig> all_tests;
+  {
+    std::vector<std::pair<std::string, std::string>> testfiles({
+        {"jxl/flower/flower.png.im_q85_420_progr.jpg", "Q85YUV420PROGR"},
+        {"jxl/flower/flower.png.im_q85_420_R13B.jpg", "Q85YUV420R13B"},
+        {"jxl/flower/flower.png.im_q85_444.jpg", "Q85YUV444"},
+    });
+    for (size_t i = 0; i < (buffered ? 1u : testfiles.size()); ++i) {
+      TestConfig config;
+      config.fn = testfiles[i].first;
+      config.fn_desc = testfiles[i].second;
+      for (size_t chunk_size : {0, 1, 64, 65536}) {
+        config.dparams.chunk_size = chunk_size;
+        for (size_t max_output_lines : {0, 1, 8, 16}) {
+          config.dparams.max_output_lines = max_output_lines;
+          config.dparams.output_mode = PIXELS;
+          all_tests.push_back(config);
+        }
+        {
+          config.dparams.max_output_lines = 16;
+          config.dparams.output_mode = RAW_DATA;
+          all_tests.push_back(config);
+        }
+      }
+    }
+  }
+
+  {
+    std::vector<std::pair<std::string, std::string>> testfiles({
+        {"jxl/flower/flower_small.q85_444_non_interleaved.jpg",
+         "Q85YUV444NonInterleaved"},
+        {"jxl/flower/flower_small.q85_420_non_interleaved.jpg",
+         "Q85YUV420NonInterleaved"},
+        {"jxl/flower/flower_small.q85_444_partially_interleaved.jpg",
+         "Q85YUV444PartiallyInterleaved"},
+        {"jxl/flower/flower_small.q85_420_partially_interleaved.jpg",
+         "Q85YUV420PartiallyInterleaved"},
+        {"jxl/flower/flower.png.im_q85_422.jpg", "Q85YUV422"},
+        {"jxl/flower/flower.png.im_q85_440.jpg", "Q85YUV440"},
+        {"jxl/flower/flower.png.im_q85_444_1x2.jpg", "Q85YUV444_1x2"},
+        {"jxl/flower/flower.png.im_q85_asymmetric.jpg", "Q85Asymmetric"},
+        {"jxl/flower/flower.png.im_q85_gray.jpg", "Q85Gray"},
+        {"jxl/flower/flower.png.im_q85_luma_subsample.jpg", "Q85LumaSubsample"},
+        {"jxl/flower/flower.png.im_q85_rgb.jpg", "Q85RGB"},
+        {"jxl/flower/flower.png.im_q85_rgb_subsample_blue.jpg",
+         "Q85RGBSubsampleBlue"},
+        {"jxl/flower/flower_small.cmyk.jpg", "CMYK"},
+    });
+    for (size_t i = 0; i < (buffered ? 4u : testfiles.size()); ++i) {
+      for (JpegIOMode output_mode : {PIXELS, RAW_DATA}) {
+        TestConfig config;
+        config.fn = testfiles[i].first;
+        config.fn_desc = testfiles[i].second;
+        config.dparams.output_mode = output_mode;
+        all_tests.push_back(config);
+      }
+    }
+  }
+
+  // Tests for common chroma subsampling and output modes.
+  for (JpegIOMode output_mode : {PIXELS, RAW_DATA, COEFFICIENTS}) {
+    for (int h_samp : {1, 2}) {
+      for (int v_samp : {1, 2}) {
+        for (bool fancy : {true, false}) {
+          if (!fancy && (output_mode != PIXELS || h_samp * v_samp == 1)) {
+            continue;
+          }
+          TestConfig config;
+          config.dparams.output_mode = output_mode;
+          config.dparams.do_fancy_upsampling = fancy;
+          config.jparams.progressive_mode = 2;
+          config.jparams.h_sampling = {h_samp, 1, 1};
+          config.jparams.v_sampling = {v_samp, 1, 1};
+          if (output_mode == COEFFICIENTS) {
+            config.max_rms_dist = 0.0f;
+          }
+          all_tests.push_back(config);
+        }
+      }
+    }
+  }
+
+  // Tests for partial input.
+  for (float size_factor : {0.1f, 0.33f, 0.5f, 0.75f}) {
+    for (int progr : {0, 1, 3}) {
+      for (int samp : {1, 2}) {
+        for (bool skip_scans : {false, true}) {
+          if (skip_scans && (progr != 1 || size_factor < 0.5f)) continue;
+          for (JpegIOMode output_mode : {PIXELS, RAW_DATA}) {
+            TestConfig config;
+            config.input.xsize = 517;
+            config.input.ysize = 523;
+            config.jparams.h_sampling = {samp, 1, 1};
+            config.jparams.v_sampling = {samp, 1, 1};
+            config.jparams.progressive_mode = progr;
+            config.dparams.size_factor = size_factor;
+            config.dparams.output_mode = output_mode;
+            config.dparams.skip_scans = skip_scans;
+            // The last partially available block can behave differently.
+            // TODO(szabadka) Figure out if we can make the behaviour more
+            // similar.
+            config.max_rms_dist = samp == 1 ? 1.75f : 3.0f;
+            config.max_diff = 255.0f;
+            all_tests.push_back(config);
+          }
+        }
+      }
+    }
+  }
+
+  // Tests for block smoothing.
+  for (float size_factor : {0.1f, 0.33f, 0.5f, 0.75f, 1.0f}) {
+    for (int samp : {1, 2}) {
+      for (bool skip_scans : {false, true}) {
+        if (skip_scans && size_factor < 0.3f) continue;
+        TestConfig config;
+        config.input.xsize = 517;
+        config.input.ysize = 523;
+        config.jparams.h_sampling = {samp, 1, 1};
+        config.jparams.v_sampling = {samp, 1, 1};
+        config.jparams.progressive_mode = 2;
+        config.dparams.size_factor = size_factor;
+        config.dparams.do_block_smoothing = true;
+        config.dparams.skip_scans = skip_scans;
+        // libjpeg does smoothing for incomplete scans differently at
+        // the border between current and previous scans.
+        config.max_rms_dist = 8.0f;
+        config.max_diff = 255.0f;
+        all_tests.push_back(config);
+      }
+    }
+  }
+
+  // Test for switching output color quantization modes between scans.
+  if (buffered) {
+    TestConfig config;
+    config.jparams.progressive_mode = 2;
+    config.dparams.quantize_colors = true;
+    config.dparams.scan_params = {
+        {3, JDITHER_NONE, CQUANT_1PASS},  {4, JDITHER_ORDERED, CQUANT_1PASS},
+        {5, JDITHER_FS, CQUANT_1PASS},    {6, JDITHER_NONE, CQUANT_EXTERNAL},
+        {8, JDITHER_NONE, CQUANT_REUSE},  {9, JDITHER_NONE, CQUANT_EXTERNAL},
+        {10, JDITHER_NONE, CQUANT_2PASS}, {11, JDITHER_NONE, CQUANT_REUSE},
+        {12, JDITHER_NONE, CQUANT_2PASS}, {13, JDITHER_FS, CQUANT_2PASS},
+    };
+    config.compare_to_orig = true;
+    config.max_tolerance_factor = 1.04f;
+    all_tests.push_back(config);
+  }
+
+  if (buffered) {
+    return all_tests;
+  }
+
+  // Tests for output color quantization.
+  for (int num_colors : {8, 64, 256}) {
+    for (ColorQuantMode mode : {CQUANT_1PASS, CQUANT_EXTERNAL, CQUANT_2PASS}) {
+      if (mode == CQUANT_EXTERNAL && num_colors != 256) continue;
+      for (J_DITHER_MODE dither : {JDITHER_NONE, JDITHER_ORDERED, JDITHER_FS}) {
+        if (mode == CQUANT_EXTERNAL && dither != JDITHER_NONE) continue;
+        if (mode != CQUANT_1PASS && dither == JDITHER_ORDERED) continue;
+        for (bool crop : {false, true}) {
+          for (bool scale : {false, true}) {
+            for (bool samp : {false, true}) {
+              if ((num_colors != 256) && (crop || scale || samp)) {
+                continue;
+              }
+              if (mode == CQUANT_2PASS && crop) continue;
+              TestConfig config;
+              config.input.xsize = 1024;
+              config.input.ysize = 768;
+              config.dparams.quantize_colors = true;
+              config.dparams.desired_number_of_colors = num_colors;
+              config.dparams.scan_params = {{kLastScan, dither, mode}};
+              config.dparams.crop_output = crop;
+              if (scale) {
+                config.dparams.scale_num = 7;
+                config.dparams.scale_denom = 8;
+              }
+              if (samp) {
+                config.jparams.h_sampling = {2, 1, 1};
+                config.jparams.v_sampling = {2, 1, 1};
+              }
+              if (!scale && !crop) {
+                config.compare_to_orig = true;
+                if (dither != JDITHER_NONE) {
+                  config.max_tolerance_factor = 1.05f;
+                }
+                if (mode == CQUANT_2PASS &&
+                    (num_colors == 8 || dither == JDITHER_FS)) {
+                  // TODO(szabadka) Lower this bound.
+                  config.max_tolerance_factor = 1.5f;
+                }
+              } else {
+                // We only test for buffer overflows, etc.
+                config.max_rms_dist = 100.0f;
+                config.max_diff = 255.0f;
+              }
+              all_tests.push_back(config);
+            }
+          }
+        }
+      }
+    }
+  }
+
+  // Tests for output formats.
+  for (JpegliDataType type :
+       {JPEGLI_TYPE_UINT8, JPEGLI_TYPE_UINT16, JPEGLI_TYPE_FLOAT}) {
+    for (JpegliEndianness endianness :
+         {JPEGLI_NATIVE_ENDIAN, JPEGLI_LITTLE_ENDIAN, JPEGLI_BIG_ENDIAN}) {
+      if (type == JPEGLI_TYPE_UINT8 && endianness != JPEGLI_NATIVE_ENDIAN) {
+        continue;
+      }
+      for (int channels = 1; channels <= 4; ++channels) {
+        TestConfig config;
+        config.dparams.data_type = type;
+        config.dparams.endianness = endianness;
+        config.input.color_space = JCS_UNKNOWN;
+        config.input.components = channels;
+        config.dparams.set_out_color_space = true;
+        config.dparams.out_color_space = JCS_UNKNOWN;
+        all_tests.push_back(config);
+      }
+    }
+  }
+  // Test for output cropping.
+  {
+    TestConfig config;
+    config.dparams.crop_output = true;
+    all_tests.push_back(config);
+  }
+  // Tests for color transforms.
+  for (J_COLOR_SPACE out_color_space : {JCS_RGB, JCS_GRAYSCALE}) {
+    TestConfig config;
+    config.input.xsize = config.input.ysize = 256;
+    config.input.color_space = JCS_GRAYSCALE;
+    config.dparams.set_out_color_space = true;
+    config.dparams.out_color_space = out_color_space;
+    all_tests.push_back(config);
+  }
+  for (J_COLOR_SPACE jpeg_color_space : {JCS_RGB, JCS_YCbCr}) {
+    for (J_COLOR_SPACE out_color_space : {JCS_RGB, JCS_YCbCr, JCS_GRAYSCALE}) {
+      if (jpeg_color_space == JCS_RGB && out_color_space == JCS_YCbCr) continue;
+      TestConfig config;
+      config.input.xsize = config.input.ysize = 256;
+      config.jparams.set_jpeg_colorspace = true;
+      config.jparams.jpeg_color_space = jpeg_color_space;
+      config.dparams.set_out_color_space = true;
+      config.dparams.out_color_space = out_color_space;
+      all_tests.push_back(config);
+    }
+  }
+  for (J_COLOR_SPACE jpeg_color_space : {JCS_CMYK, JCS_YCCK}) {
+    for (J_COLOR_SPACE out_color_space : {JCS_CMYK, JCS_YCCK}) {
+      if (jpeg_color_space == JCS_CMYK && out_color_space == JCS_YCCK) continue;
+      TestConfig config;
+      config.input.xsize = config.input.ysize = 256;
+      config.input.color_space = JCS_CMYK;
+      config.jparams.set_jpeg_colorspace = true;
+      config.jparams.jpeg_color_space = jpeg_color_space;
+      config.dparams.set_out_color_space = true;
+      config.dparams.out_color_space = out_color_space;
+      all_tests.push_back(config);
+    }
+  }
+  // Tests for progressive levels.
+  for (int p = 0; p < 3 + kNumTestScripts; ++p) {
+    TestConfig config;
+    config.jparams.progressive_mode = p;
+    all_tests.push_back(config);
+  }
+  // Tests for RST markers.
+  for (size_t r : {1, 17, 1024}) {
+    for (size_t chunk_size : {1, 65536}) {
+      for (int progr : {0, 2}) {
+        TestConfig config;
+        config.dparams.chunk_size = chunk_size;
+        config.jparams.progressive_mode = progr;
+        config.jparams.restart_interval = r;
+        all_tests.push_back(config);
+      }
+    }
+  }
+  for (size_t rr : {1, 3, 8, 100}) {
+    TestConfig config;
+    config.jparams.restart_in_rows = rr;
+    all_tests.push_back(config);
+  }
+  // Tests for custom quantization tables.
+  for (int type : {0, 1, 10, 100, 10000}) {
+    for (int scale : {1, 50, 100, 200, 500}) {
+      for (bool add_raw : {false, true}) {
+        for (bool baseline : {true, false}) {
+          if (!baseline && (add_raw || type * scale < 25500)) continue;
+          TestConfig config;
+          config.input.xsize = 64;
+          config.input.ysize = 64;
+          CustomQuantTable table;
+          table.table_type = type;
+          table.scale_factor = scale;
+          table.force_baseline = baseline;
+          table.add_raw = add_raw;
+          table.Generate();
+          config.jparams.quant_tables.push_back(table);
+          config.jparams.quant_indexes = {0, 0, 0};
+          config.compare_to_orig = true;
+          config.max_tolerance_factor = 1.02;
+          all_tests.push_back(config);
+        }
+      }
+    }
+  }
+  for (int qidx = 0; qidx < 8; ++qidx) {
+    if (qidx == 3) continue;
+    TestConfig config;
+    config.input.xsize = 256;
+    config.input.ysize = 256;
+    config.jparams.quant_indexes = {(qidx >> 2) & 1, (qidx >> 1) & 1,
+                                    (qidx >> 0) & 1};
+    all_tests.push_back(config);
+  }
+  for (int qidx = 0; qidx < 8; ++qidx) {
+    for (int slot_idx = 0; slot_idx < 2; ++slot_idx) {
+      if (qidx == 0 && slot_idx == 0) continue;
+      TestConfig config;
+      config.input.xsize = 256;
+      config.input.ysize = 256;
+      config.jparams.quant_indexes = {(qidx >> 2) & 1, (qidx >> 1) & 1,
+                                      (qidx >> 0) & 1};
+      CustomQuantTable table;
+      table.slot_idx = slot_idx;
+      table.Generate();
+      config.jparams.quant_tables.push_back(table);
+      all_tests.push_back(config);
+    }
+  }
+  for (int qidx = 0; qidx < 8; ++qidx) {
+    for (bool xyb : {false, true}) {
+      TestConfig config;
+      config.input.xsize = 256;
+      config.input.ysize = 256;
+      config.jparams.xyb_mode = xyb;
+      config.jparams.quant_indexes = {(qidx >> 2) & 1, (qidx >> 1) & 1,
+                                      (qidx >> 0) & 1};
+      {
+        CustomQuantTable table;
+        table.slot_idx = 0;
+        table.Generate();
+        config.jparams.quant_tables.push_back(table);
+      }
+      {
+        CustomQuantTable table;
+        table.slot_idx = 1;
+        table.table_type = 20;
+        table.Generate();
+        config.jparams.quant_tables.push_back(table);
+      }
+      config.compare_to_orig = true;
+      all_tests.push_back(config);
+    }
+  }
+  for (bool xyb : {false, true}) {
+    TestConfig config;
+    config.input.xsize = 256;
+    config.input.ysize = 256;
+    config.jparams.xyb_mode = xyb;
+    config.jparams.quant_indexes = {0, 1, 2};
+    {
+      CustomQuantTable table;
+      table.slot_idx = 0;
+      table.Generate();
+      config.jparams.quant_tables.push_back(table);
+    }
+    {
+      CustomQuantTable table;
+      table.slot_idx = 1;
+      table.table_type = 20;
+      table.Generate();
+      config.jparams.quant_tables.push_back(table);
+    }
+    {
+      CustomQuantTable table;
+      table.slot_idx = 2;
+      table.table_type = 30;
+      table.Generate();
+      config.jparams.quant_tables.push_back(table);
+    }
+    config.compare_to_orig = true;
+    all_tests.push_back(config);
+  }
+  // Tests for fixed (and custom) prefix codes.
+  for (J_COLOR_SPACE jpeg_color_space : {JCS_RGB, JCS_YCbCr}) {
+    for (bool flat_dc_luma : {false, true}) {
+      TestConfig config;
+      config.jparams.set_jpeg_colorspace = true;
+      config.jparams.jpeg_color_space = jpeg_color_space;
+      config.jparams.progressive_mode = 0;
+      config.jparams.optimize_coding = 0;
+      config.jparams.use_flat_dc_luma_code = flat_dc_luma;
+      all_tests.push_back(config);
+    }
+  }
+  for (J_COLOR_SPACE jpeg_color_space : {JCS_CMYK, JCS_YCCK}) {
+    for (bool flat_dc_luma : {false, true}) {
+      TestConfig config;
+      config.input.color_space = JCS_CMYK;
+      config.jparams.set_jpeg_colorspace = true;
+      config.jparams.jpeg_color_space = jpeg_color_space;
+      config.jparams.progressive_mode = 0;
+      config.jparams.optimize_coding = 0;
+      config.jparams.use_flat_dc_luma_code = flat_dc_luma;
+      all_tests.push_back(config);
+    }
+  }
+  // Test for jpeg without DHT marker.
+  {
+    TestConfig config;
+    config.jparams.progressive_mode = 0;
+    config.jparams.optimize_coding = 0;
+    config.jparams.omit_standard_tables = true;
+    all_tests.push_back(config);
+  }
+  // Test for custom component ids.
+  {
+    TestConfig config;
+    config.input.xsize = config.input.ysize = 128;
+    config.jparams.comp_ids = {7, 17, 177};
+    all_tests.push_back(config);
+  }
+  // Tests for JFIF/Adobe markers.
+  for (int override_JFIF : {-1, 0, 1}) {
+    for (int override_Adobe : {-1, 0, 1}) {
+      if (override_JFIF == -1 && override_Adobe == -1) continue;
+      TestConfig config;
+      config.input.xsize = config.input.ysize = 128;
+      config.jparams.override_JFIF = override_JFIF;
+      config.jparams.override_Adobe = override_Adobe;
+      all_tests.push_back(config);
+    }
+  }
+  // Tests for small images.
+  for (int xsize : {1, 7, 8, 9, 15, 16, 17}) {
+    for (int ysize : {1, 7, 8, 9, 15, 16, 17}) {
+      TestConfig config;
+      config.input.xsize = xsize;
+      config.input.ysize = ysize;
+      all_tests.push_back(config);
+    }
+  }
+  // Tests for custom marker processor.
+  for (size_t chunk_size : {0, 1, 64, 65536}) {
+    TestConfig config;
+    config.input.xsize = config.input.ysize = 256;
+    config.dparams.chunk_size = chunk_size;
+    config.jparams.add_marker = true;
+    all_tests.push_back(config);
+  }
+  // Tests for icc profile decoding.
+  for (size_t icc_size : {728, 70000, 1000000}) {
+    TestConfig config;
+    config.input.xsize = config.input.ysize = 256;
+    config.jparams.icc.resize(icc_size);
+    for (size_t i = 0; i < icc_size; ++i) {
+      config.jparams.icc[i] = (i * 17) & 0xff;
+    }
+    all_tests.push_back(config);
+  }
+  // Tests for unusual sampling factors.
+  for (int h0_samp : {1, 2, 3, 4}) {
+    for (int v0_samp : {1, 2, 3, 4}) {
+      for (int dxb = 0; dxb < h0_samp; ++dxb) {
+        for (int dyb = 0; dyb < v0_samp; ++dyb) {
+          for (int dx = 0; dx < 2; ++dx) {
+            for (int dy = 0; dy < 2; ++dy) {
+              TestConfig config;
+              config.input.xsize = 128 + dyb * 8 + dy;
+              config.input.ysize = 256 + dxb * 8 + dx;
+              config.jparams.progressive_mode = 2;
+              config.jparams.h_sampling = {h0_samp, 1, 1};
+              config.jparams.v_sampling = {v0_samp, 1, 1};
+              config.compare_to_orig = true;
+              all_tests.push_back(config);
+            }
+          }
+        }
+      }
+    }
+  }
+  for (int h0_samp : {1, 2, 4}) {
+    for (int v0_samp : {1, 2, 4}) {
+      for (int h2_samp : {1, 2, 4}) {
+        for (int v2_samp : {1, 2, 4}) {
+          TestConfig config;
+          config.input.xsize = 137;
+          config.input.ysize = 75;
+          config.jparams.progressive_mode = 2;
+          config.jparams.h_sampling = {h0_samp, 1, h2_samp};
+          config.jparams.v_sampling = {v0_samp, 1, v2_samp};
+          config.compare_to_orig = true;
+          all_tests.push_back(config);
+        }
+      }
+    }
+  }
+  for (int h0_samp : {1, 3}) {
+    for (int v0_samp : {1, 3}) {
+      for (int h2_samp : {1, 3}) {
+        for (int v2_samp : {1, 3}) {
+          TestConfig config;
+          config.input.xsize = 205;
+          config.input.ysize = 99;
+          config.jparams.progressive_mode = 2;
+          config.jparams.h_sampling = {h0_samp, 1, h2_samp};
+          config.jparams.v_sampling = {v0_samp, 1, v2_samp};
+          all_tests.push_back(config);
+        }
+      }
+    }
+  }
+  // Tests for output scaling.
+  for (int scale_num = 1; scale_num <= 16; ++scale_num) {
+    if (scale_num == 8) continue;
+    for (bool crop : {false, true}) {
+      for (int samp : {1, 2}) {
+        for (int progr : {0, 2}) {
+          TestConfig config;
+          config.jparams.h_sampling = {samp, 1, 1};
+          config.jparams.v_sampling = {samp, 1, 1};
+          config.jparams.progressive_mode = progr;
+          config.dparams.scale_num = scale_num;
+          config.dparams.scale_denom = 8;
+          config.dparams.crop_output = crop;
+          all_tests.push_back(config);
+        }
+      }
+    }
+  }
+  return all_tests;
+}
+
+std::string QuantMode(ColorQuantMode mode) {
+  switch (mode) {
+    case CQUANT_1PASS:
+      return "1pass";
+    case CQUANT_EXTERNAL:
+      return "External";
+    case CQUANT_2PASS:
+      return "2pass";
+    case CQUANT_REUSE:
+      return "Reuse";
+  }
+  return "";
+}
+
+std::string DitherMode(J_DITHER_MODE mode) {
+  switch (mode) {
+    case JDITHER_NONE:
+      return "No";
+    case JDITHER_ORDERED:
+      return "Ordered";
+    case JDITHER_FS:
+      return "FS";
+  }
+  return "";
+}
+
+std::ostream& operator<<(std::ostream& os, const DecompressParams& dparams) {
+  if (dparams.chunk_size == 0) {
+    os << "CompleteInput";
+  } else {
+    os << "InputChunks" << dparams.chunk_size;
+  }
+  if (dparams.size_factor < 1.0f) {
+    os << "Partial" << static_cast<int>(dparams.size_factor * 100) << "p";
+  }
+  if (dparams.max_output_lines == 0) {
+    os << "CompleteOutput";
+  } else {
+    os << "OutputLines" << dparams.max_output_lines;
+  }
+  if (dparams.output_mode == RAW_DATA) {
+    os << "RawDataOut";
+  } else if (dparams.output_mode == COEFFICIENTS) {
+    os << "CoeffsOut";
+  }
+  os << IOMethodName(dparams.data_type, dparams.endianness);
+  if (dparams.set_out_color_space) {
+    os << "OutColor" << ColorSpaceName(dparams.out_color_space);
+  }
+  if (dparams.crop_output) {
+    os << "Crop";
+  }
+  if (dparams.do_block_smoothing) {
+    os << "BlockSmoothing";
+  }
+  if (!dparams.do_fancy_upsampling) {
+    os << "NoFancyUpsampling";
+  }
+  if (dparams.scale_num != 1 || dparams.scale_denom != 1) {
+    os << "Scale" << dparams.scale_num << "_" << dparams.scale_denom;
+  }
+  if (dparams.quantize_colors) {
+    os << "Quant" << dparams.desired_number_of_colors << "colors";
+    for (size_t i = 0; i < dparams.scan_params.size(); ++i) {
+      if (i > 0) os << "_";
+      const auto& sparam = dparams.scan_params[i];
+      os << QuantMode(sparam.color_quant_mode);
+      os << DitherMode(sparam.dither_mode) << "Dither";
+    }
+  }
+  if (dparams.skip_scans) {
+    os << "SkipScans";
+  }
+  return os;
+}
+
+std::ostream& operator<<(std::ostream& os, const TestConfig& c) {
+  if (!c.fn.empty()) {
+    os << c.fn_desc;
+  } else {
+    os << c.input;
+  }
+  os << c.jparams;
+  os << c.dparams;
+  return os;
+}
+
+std::string TestDescription(const testing::TestParamInfo<TestConfig>& info) {
+  std::stringstream name;
+  name << info.param;
+  return name.str();
+}
+
+JPEGLI_INSTANTIATE_TEST_SUITE_P(DecodeAPITest, DecodeAPITestParam,
+                                testing::ValuesIn(GenerateTests(false)),
+                                TestDescription);
+
+JPEGLI_INSTANTIATE_TEST_SUITE_P(DecodeAPITestBuffered,
+                                DecodeAPITestParamBuffered,
+                                testing::ValuesIn(GenerateTests(true)),
+                                TestDescription);
+
+}  // namespace
+}  // namespace jpegli
diff --git a/third_party/jpeg-xl/lib/jpegli/decode_internal.h b/third_party/jpeg-xl/lib/jpegli/decode_internal.h
new file mode 100644
index 0000000000..1c4f248d40
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/decode_internal.h
@@ -0,0 +1,150 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JPEGLI_DECODE_INTERNAL_H_
+#define LIB_JPEGLI_DECODE_INTERNAL_H_
+
+#include <stdint.h>
+#include <sys/types.h>
+
+#include <vector>
+
+#include "lib/jpegli/common.h"
+#include "lib/jpegli/common_internal.h"
+#include "lib/jpegli/huffman.h"
+
+namespace jpegli {
+
+static constexpr int kNeedMoreInput = 100;
+static constexpr int kHandleRestart = 101;
+static constexpr int kHandleMarkerProcessor = 102;
+static constexpr int kProcessNextMarker = 103;
+static constexpr size_t kAllHuffLutSize = NUM_HUFF_TBLS * kJpegHuffmanLutSize;
+
+typedef int16_t coeff_t;
+
+// State of the decoder that has to be saved before decoding one MCU in case
+// we run out of the bitstream.
+struct MCUCodingState {
+  coeff_t last_dc_coeff[kMaxComponents];
+  int eobrun;
+  coeff_t coeffs[D_MAX_BLOCKS_IN_MCU * DCTSIZE2];
+};
+
+}  // namespace jpegli
+
+// Use this forward-declared libjpeg struct to hold all our private variables.
+// TODO(szabadka) Remove variables that have a corresponding version in cinfo.
+struct jpeg_decomp_master {
+  //
+  // Input handling state.
+  //
+  std::vector<uint8_t> input_buffer_;
+  size_t input_buffer_pos_;
+  // Number of bits after codestream_pos_ that were already processed.
+  size_t codestream_bits_ahead_;
+  bool streaming_mode_;
+
+  // Coefficient buffers
+  jvirt_barray_ptr* coef_arrays;
+  JBLOCKARRAY coeff_rows[jpegli::kMaxComponents];
+
+  //
+  // Marker data processing state.
+  //
+  bool found_soi_;
+  bool found_dri_;
+  bool found_sof_;
+  bool found_eoi_;
+  size_t icc_index_;
+  size_t icc_total_;
+  std::vector<uint8_t> icc_profile_;
+  jpegli::HuffmanTableEntry dc_huff_lut_[jpegli::kAllHuffLutSize];
+  jpegli::HuffmanTableEntry ac_huff_lut_[jpegli::kAllHuffLutSize];
+  uint8_t markers_to_save_[32];
+  jpeg_marker_parser_method app_marker_parsers[16];
+  jpeg_marker_parser_method com_marker_parser;
+  // Whether this jpeg has multiple scans (progressive or non-interleaved
+  // sequential).
+  bool is_multiscan_;
+
+  // Fields defined by SOF marker.
+  size_t iMCU_cols_;
+  int h_factor[jpegli::kMaxComponents];
+  int v_factor[jpegli::kMaxComponents];
+
+  // Initialized at strat of frame.
+  uint16_t scan_progression_[jpegli::kMaxComponents][DCTSIZE2];
+
+  //
+  // Per scan state.
+  //
+  size_t scan_mcu_row_;
+  size_t scan_mcu_col_;
+  size_t mcu_rows_per_iMCU_row_;
+  jpegli::coeff_t last_dc_coeff_[jpegli::kMaxComponents];
+  int eobrun_;
+  int restarts_to_go_;
+  int next_restart_marker_;
+
+  jpegli::MCUCodingState mcu_;
+
+  //
+  // Rendering state.
+  //
+  int output_passes_done_;
+  JpegliDataType output_data_type_ = JPEGLI_TYPE_UINT8;
+  bool swap_endianness_ = false;
+  size_t xoffset_;
+
+  int min_scaled_dct_size;
+  int scaled_dct_size[jpegli::kMaxComponents];
+
+  size_t raw_height_[jpegli::kMaxComponents];
+  jpegli::RowBuffer<float> raw_output_[jpegli::kMaxComponents];
+  jpegli::RowBuffer<float> render_output_[jpegli::kMaxComponents];
+
+  void (*inverse_transform[jpegli::kMaxComponents])(
+      const int16_t* JXL_RESTRICT qblock, const float* JXL_RESTRICT dequant,
+      const float* JXL_RESTRICT biases, float* JXL_RESTRICT scratch_space,
+      float* JXL_RESTRICT output, size_t output_stride, size_t dctsize);
+
+  void (*color_transform)(float* row[jpegli::kMaxComponents], size_t len);
+
+  float* idct_scratch_;
+  float* upsample_scratch_;
+  uint8_t* output_scratch_;
+  int16_t* smoothing_scratch_;
+  float* dequant_;
+  // 1 = 1pass, 2 = 2pass, 3 = external
+  int quant_mode_;
+  int quant_pass_;
+  int num_colors_[jpegli::kMaxComponents];
+  uint8_t* colormap_lut_;
+  uint8_t* pixels_;
+  JSAMPARRAY scanlines_;
+  std::vector<std::vector<uint8_t>> candidate_lists_;
+  bool regenerate_inverse_colormap_;
+  float* dither_[jpegli::kMaxComponents];
+  float* error_row_[2 * jpegli::kMaxComponents];
+  size_t dither_size_;
+  size_t dither_mask_;
+
+  // Per channel and per frequency statistics about the number of nonzeros and
+  // the sum of coefficient absolute values, used in dequantization bias
+  // computation.
+  int* nonzeros_;
+  int* sumabs_;
+  size_t num_processed_blocks_[jpegli::kMaxComponents];
+  float* biases_;
+#define SAVED_COEFS 10
+  // This holds the coef_bits of the scan before the current scan,
+  // i.e. the bottom half when rendering incomplete scans.
+  int (*coef_bits_latch)[SAVED_COEFS];
+  int (*prev_coef_bits_latch)[SAVED_COEFS];
+  bool apply_smoothing;
+};
+
+#endif  // LIB_JPEGLI_DECODE_INTERNAL_H_
diff --git a/third_party/jpeg-xl/lib/jpegli/decode_marker.cc b/third_party/jpeg-xl/lib/jpegli/decode_marker.cc
new file mode 100644
index 0000000000..c5c5790cdf
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/decode_marker.cc
@@ -0,0 +1,588 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/decode_marker.h"
+
+#include <string.h>
+
+#include "lib/jpegli/common.h"
+#include "lib/jpegli/decode_internal.h"
+#include "lib/jpegli/error.h"
+#include "lib/jpegli/huffman.h"
+#include "lib/jpegli/memory_manager.h"
+#include "lib/jxl/base/printf_macros.h"
+
+namespace jpegli {
+namespace {
+
+constexpr int kMaxDimPixels = 65535;
+constexpr uint8_t kIccProfileTag[12] = "ICC_PROFILE";
+
+// Macros for commonly used error conditions.
+
+#define JPEG_VERIFY_LEN(n)                                      \
+  if (pos + (n) > len) {                                        \
+    return JPEGLI_ERROR("Unexpected end of marker: pos=%" PRIuS \
+                        " need=%d len=%" PRIuS,                 \
+                        pos, static_cast<int>(n), len);         \
+  }
+
+#define JPEG_VERIFY_INPUT(var, low, high)                               \
+  if ((var) < (low) || (var) > (high)) {                                \
+    return JPEGLI_ERROR("Invalid " #var ": %d", static_cast<int>(var)); \
+  }
+
+#define JPEG_VERIFY_MARKER_END()                                  \
+  if (pos != len) {                                               \
+    return JPEGLI_ERROR("Invalid marker length: declared=%" PRIuS \
+                        " actual=%" PRIuS,                        \
+                        len, pos);                                \
+  }
+
+inline int ReadUint8(const uint8_t* data, size_t* pos) {
+  return data[(*pos)++];
+}
+
+inline int ReadUint16(const uint8_t* data, size_t* pos) {
+  int v = (data[*pos] << 8) + data[*pos + 1];
+  *pos += 2;
+  return v;
+}
+
+void ProcessSOF(j_decompress_ptr cinfo, const uint8_t* data, size_t len) {
+  jpeg_decomp_master* m = cinfo->master;
+  if (!m->found_soi_) {
+    JPEGLI_ERROR("Unexpected SOF marker.");
+  }
+  if (m->found_sof_) {
+    JPEGLI_ERROR("Duplicate SOF marker.");
+  }
+  m->found_sof_ = true;
+  cinfo->progressive_mode = (cinfo->unread_marker == 0xc2);
+  cinfo->arith_code = 0;
+  size_t pos = 2;
+  JPEG_VERIFY_LEN(6);
+  cinfo->data_precision = ReadUint8(data, &pos);
+  cinfo->image_height = ReadUint16(data, &pos);
+  cinfo->image_width = ReadUint16(data, &pos);
+  cinfo->num_components = ReadUint8(data, &pos);
+  JPEG_VERIFY_INPUT(cinfo->data_precision, kJpegPrecision, kJpegPrecision);
+  JPEG_VERIFY_INPUT(cinfo->image_height, 1, kMaxDimPixels);
+  JPEG_VERIFY_INPUT(cinfo->image_width, 1, kMaxDimPixels);
+  JPEG_VERIFY_INPUT(cinfo->num_components, 1, kMaxComponents);
+  JPEG_VERIFY_LEN(3 * cinfo->num_components);
+  cinfo->comp_info = jpegli::Allocate<jpeg_component_info>(
+      cinfo, cinfo->num_components, JPOOL_IMAGE);
+
+  // Read sampling factors and quant table index for each component.
+  uint8_t ids_seen[256] = {0};
+  cinfo->max_h_samp_factor = 1;
+  cinfo->max_v_samp_factor = 1;
+  for (int i = 0; i < cinfo->num_components; ++i) {
+    jpeg_component_info* comp = &cinfo->comp_info[i];
+    comp->component_index = i;
+    const int id = ReadUint8(data, &pos);
+    if (ids_seen[id]) {  // (cf. section B.2.2, syntax of Ci)
+      JPEGLI_ERROR("Duplicate ID %d in SOF.", id);
+    }
+    ids_seen[id] = 1;
+    comp->component_id = id;
+    int factor = ReadUint8(data, &pos);
+    int h_samp_factor = factor >> 4;
+    int v_samp_factor = factor & 0xf;
+    JPEG_VERIFY_INPUT(h_samp_factor, 1, MAX_SAMP_FACTOR);
+    JPEG_VERIFY_INPUT(v_samp_factor, 1, MAX_SAMP_FACTOR);
+    comp->h_samp_factor = h_samp_factor;
+    comp->v_samp_factor = v_samp_factor;
+    cinfo->max_h_samp_factor =
+        std::max(cinfo->max_h_samp_factor, h_samp_factor);
+    cinfo->max_v_samp_factor =
+        std::max(cinfo->max_v_samp_factor, v_samp_factor);
+    int quant_tbl_idx = ReadUint8(data, &pos);
+    JPEG_VERIFY_INPUT(quant_tbl_idx, 0, NUM_QUANT_TBLS - 1);
+    comp->quant_tbl_no = quant_tbl_idx;
+    if (cinfo->quant_tbl_ptrs[quant_tbl_idx] == nullptr) {
+      JPEGLI_ERROR("Quantization table with index %u not found", quant_tbl_idx);
+    }
+    comp->quant_table = nullptr;  // will be allocated after SOS marker
+  }
+  JPEG_VERIFY_MARKER_END();
+
+  // Set the input colorspace based on the markers we have seen and set
+  // default output colorspace.
+  if (cinfo->num_components == 1) {
+    cinfo->jpeg_color_space = JCS_GRAYSCALE;
+    cinfo->out_color_space = JCS_GRAYSCALE;
+  } else if (cinfo->num_components == 3) {
+    if (cinfo->saw_JFIF_marker) {
+      cinfo->jpeg_color_space = JCS_YCbCr;
+    } else if (cinfo->saw_Adobe_marker) {
+      cinfo->jpeg_color_space =
+          cinfo->Adobe_transform == 0 ? JCS_RGB : JCS_YCbCr;
+    } else {
+      cinfo->jpeg_color_space = JCS_YCbCr;
+      if (cinfo->comp_info[0].component_id == 'R' &&  //
+          cinfo->comp_info[1].component_id == 'G' &&  //
+          cinfo->comp_info[2].component_id == 'B') {
+        cinfo->jpeg_color_space = JCS_RGB;
+      }
+    }
+    cinfo->out_color_space = JCS_RGB;
+  } else if (cinfo->num_components == 4) {
+    if (cinfo->saw_Adobe_marker) {
+      cinfo->jpeg_color_space =
+          cinfo->Adobe_transform == 0 ? JCS_CMYK : JCS_YCCK;
+    } else {
+      cinfo->jpeg_color_space = JCS_CMYK;
+    }
+    cinfo->out_color_space = JCS_CMYK;
+  }
+
+  // We have checked above that none of the sampling factors are 0, so the max
+  // sampling factors can not be 0.
+  cinfo->total_iMCU_rows =
+      DivCeil(cinfo->image_height, cinfo->max_v_samp_factor * DCTSIZE);
+  m->iMCU_cols_ =
+      DivCeil(cinfo->image_width, cinfo->max_h_samp_factor * DCTSIZE);
+  // Compute the block dimensions for each component.
+  for (int i = 0; i < cinfo->num_components; ++i) {
+    jpeg_component_info* comp = &cinfo->comp_info[i];
+    if (cinfo->max_h_samp_factor % comp->h_samp_factor != 0 ||
+        cinfo->max_v_samp_factor % comp->v_samp_factor != 0) {
+      JPEGLI_ERROR("Non-integral subsampling ratios.");
+    }
+    m->h_factor[i] = cinfo->max_h_samp_factor / comp->h_samp_factor;
+    m->v_factor[i] = cinfo->max_v_samp_factor / comp->v_samp_factor;
+    comp->downsampled_width = DivCeil(cinfo->image_width, m->h_factor[i]);
+    comp->downsampled_height = DivCeil(cinfo->image_height, m->v_factor[i]);
+    comp->width_in_blocks = DivCeil(comp->downsampled_width, DCTSIZE);
+    comp->height_in_blocks = DivCeil(comp->downsampled_height, DCTSIZE);
+  }
+  memset(m->scan_progression_, 0, sizeof(m->scan_progression_));
+}
+
+void ProcessSOS(j_decompress_ptr cinfo, const uint8_t* data, size_t len) {
+  jpeg_decomp_master* m = cinfo->master;
+  if (!m->found_sof_) {
+    JPEGLI_ERROR("Unexpected SOS marker.");
+  }
+  size_t pos = 2;
+  JPEG_VERIFY_LEN(1);
+  cinfo->comps_in_scan = ReadUint8(data, &pos);
+  JPEG_VERIFY_INPUT(cinfo->comps_in_scan, 1, cinfo->num_components);
+  JPEG_VERIFY_INPUT(cinfo->comps_in_scan, 1, MAX_COMPS_IN_SCAN);
+
+  JPEG_VERIFY_LEN(2 * cinfo->comps_in_scan);
+  bool is_interleaved = (cinfo->comps_in_scan > 1);
+  uint8_t ids_seen[256] = {0};
+  cinfo->blocks_in_MCU = 0;
+  for (int i = 0; i < cinfo->comps_in_scan; ++i) {
+    int id = ReadUint8(data, &pos);
+    if (ids_seen[id]) {  // (cf. section B.2.3, regarding CSj)
+      return JPEGLI_ERROR("Duplicate ID %d in SOS.", id);
+    }
+    ids_seen[id] = 1;
+    jpeg_component_info* comp = nullptr;
+    for (int j = 0; j < cinfo->num_components; ++j) {
+      if (cinfo->comp_info[j].component_id == id) {
+        comp = &cinfo->comp_info[j];
+        cinfo->cur_comp_info[i] = comp;
+      }
+    }
+    if (!comp) {
+      return JPEGLI_ERROR("SOS marker: Could not find component with id %d",
+                          id);
+    }
+    int c = ReadUint8(data, &pos);
+    comp->dc_tbl_no = c >> 4;
+    comp->ac_tbl_no = c & 0xf;
+    JPEG_VERIFY_INPUT(comp->dc_tbl_no, 0, 3);
+    JPEG_VERIFY_INPUT(comp->ac_tbl_no, 0, 3);
+    comp->MCU_width = is_interleaved ? comp->h_samp_factor : 1;
+    comp->MCU_height = is_interleaved ? comp->v_samp_factor : 1;
+    comp->MCU_blocks = comp->MCU_width * comp->MCU_height;
+    if (cinfo->blocks_in_MCU + comp->MCU_blocks > D_MAX_BLOCKS_IN_MCU) {
+      JPEGLI_ERROR("Too many blocks in MCU.");
+    }
+    for (int j = 0; j < comp->MCU_blocks; ++j) {
+      cinfo->MCU_membership[cinfo->blocks_in_MCU++] = i;
+    }
+  }
+  JPEG_VERIFY_LEN(3);
+  cinfo->Ss = ReadUint8(data, &pos);
+  cinfo->Se = ReadUint8(data, &pos);
+  JPEG_VERIFY_INPUT(cinfo->Ss, 0, 63);
+  JPEG_VERIFY_INPUT(cinfo->Se, cinfo->Ss, 63);
+  int c = ReadUint8(data, &pos);
+  cinfo->Ah = c >> 4;
+  cinfo->Al = c & 0xf;
+  JPEG_VERIFY_MARKER_END();
+
+  if (cinfo->input_scan_number == 0) {
+    m->is_multiscan_ = (cinfo->comps_in_scan < cinfo->num_components ||
+                        cinfo->progressive_mode);
+  }
+  if (cinfo->Ah != 0 && cinfo->Al != cinfo->Ah - 1) {
+    // section G.1.1.1.2 : Successive approximation control only improves
+    // by one bit at a time.
+    JPEGLI_ERROR("Invalid progressive parameters: Al=%d Ah=%d", cinfo->Al,
+                 cinfo->Ah);
+  }
+  if (!cinfo->progressive_mode) {
+    cinfo->Ss = 0;
+    cinfo->Se = 63;
+    cinfo->Ah = 0;
+    cinfo->Al = 0;
+  }
+  const uint16_t scan_bitmask =
+      cinfo->Ah == 0 ? (0xffff << cinfo->Al) : (1u << cinfo->Al);
+  const uint16_t refinement_bitmask = (1 << cinfo->Al) - 1;
+  if (!cinfo->coef_bits) {
+    cinfo->coef_bits =
+        Allocate<int[DCTSIZE2]>(cinfo, cinfo->num_components * 2, JPOOL_IMAGE);
+    m->coef_bits_latch =
+        Allocate<int[SAVED_COEFS]>(cinfo, cinfo->num_components, JPOOL_IMAGE);
+    m->prev_coef_bits_latch =
+        Allocate<int[SAVED_COEFS]>(cinfo, cinfo->num_components, JPOOL_IMAGE);
+
+    for (int c = 0; c < cinfo->num_components; ++c) {
+      for (int i = 0; i < DCTSIZE2; ++i) {
+        cinfo->coef_bits[c][i] = -1;
+        if (i < SAVED_COEFS) {
+          m->coef_bits_latch[c][i] = -1;
+        }
+      }
+    }
+  }
+
+  for (int i = 0; i < cinfo->comps_in_scan; ++i) {
+    int comp_idx = cinfo->cur_comp_info[i]->component_index;
+    for (int k = cinfo->Ss; k <= cinfo->Se; ++k) {
+      if (m->scan_progression_[comp_idx][k] & scan_bitmask) {
+        return JPEGLI_ERROR(
+            "Overlapping scans: component=%d k=%d prev_mask: %u cur_mask %u",
+            comp_idx, k, m->scan_progression_[i][k], scan_bitmask);
+      }
+      if (m->scan_progression_[comp_idx][k] & refinement_bitmask) {
+        return JPEGLI_ERROR(
+            "Invalid scan order, a more refined scan was already done: "
+            "component=%d k=%d prev_mask=%u cur_mask=%u",
+            comp_idx, k, m->scan_progression_[i][k], scan_bitmask);
+      }
+      m->scan_progression_[comp_idx][k] |= scan_bitmask;
+    }
+  }
+  if (cinfo->Al > 10) {
+    return JPEGLI_ERROR("Scan parameter Al=%d is not supported.", cinfo->Al);
+  }
+}
+
+// Reads the Define Huffman Table (DHT) marker segment and builds the Huffman
+// decoding table in either dc_huff_lut_ or ac_huff_lut_, depending on the type
+// and solt_id of Huffman code being read.
+void ProcessDHT(j_decompress_ptr cinfo, const uint8_t* data, size_t len) {
+  size_t pos = 2;
+  if (pos == len) {
+    return JPEGLI_ERROR("DHT marker: no Huffman table found");
+  }
+  while (pos < len) {
+    JPEG_VERIFY_LEN(1 + kJpegHuffmanMaxBitLength);
+    // The index of the Huffman code in the current set of Huffman codes. For AC
+    // component Huffman codes, 0x10 is added to the index.
+    int slot_id = ReadUint8(data, &pos);
+    int huffman_index = slot_id;
+    int is_ac_table = (slot_id & 0x10) != 0;
+    JHUFF_TBL** table;
+    if (is_ac_table) {
+      huffman_index -= 0x10;
+      JPEG_VERIFY_INPUT(huffman_index, 0, NUM_HUFF_TBLS - 1);
+      table = &cinfo->ac_huff_tbl_ptrs[huffman_index];
+    } else {
+      JPEG_VERIFY_INPUT(huffman_index, 0, NUM_HUFF_TBLS - 1);
+      table = &cinfo->dc_huff_tbl_ptrs[huffman_index];
+    }
+    if (*table == nullptr) {
+      *table = jpegli_alloc_huff_table(reinterpret_cast<j_common_ptr>(cinfo));
+    }
+    int total_count = 0;
+    for (size_t i = 1; i <= kJpegHuffmanMaxBitLength; ++i) {
+      int count = ReadUint8(data, &pos);
+      (*table)->bits[i] = count;
+      total_count += count;
+    }
+    if (is_ac_table) {
+      JPEG_VERIFY_INPUT(total_count, 0, kJpegHuffmanAlphabetSize);
+    } else {
+      // Allow symbols up to 15 here, we check later whether any invalid symbols
+      // are actually decoded.
+      // TODO(szabadka) Make sure decoder works (does not crash) with up to
+      // 15-nbits DC symbols and then increase kJpegDCAlphabetSize.
+      JPEG_VERIFY_INPUT(total_count, 0, 16);
+    }
+    JPEG_VERIFY_LEN(total_count);
+    for (int i = 0; i < total_count; ++i) {
+      int value = ReadUint8(data, &pos);
+      if (!is_ac_table) {
+        JPEG_VERIFY_INPUT(value, 0, 15);
+      }
+      (*table)->huffval[i] = value;
+    }
+    for (int i = total_count; i < kJpegHuffmanAlphabetSize; ++i) {
+      (*table)->huffval[i] = 0;
+    }
+  }
+  JPEG_VERIFY_MARKER_END();
+}
+
+void ProcessDQT(j_decompress_ptr cinfo, const uint8_t* data, size_t len) {
+  jpeg_decomp_master* m = cinfo->master;
+  if (m->found_sof_) {
+    JPEGLI_ERROR("Updating quant tables between scans is not supported.");
+  }
+  size_t pos = 2;
+  if (pos == len) {
+    return JPEGLI_ERROR("DQT marker: no quantization table found");
+  }
+  while (pos < len) {
+    JPEG_VERIFY_LEN(1);
+    int quant_table_index = ReadUint8(data, &pos);
+    int precision = quant_table_index >> 4;
+    JPEG_VERIFY_INPUT(precision, 0, 1);
+    quant_table_index &= 0xf;
+    JPEG_VERIFY_INPUT(quant_table_index, 0, NUM_QUANT_TBLS - 1);
+    JPEG_VERIFY_LEN((precision + 1) * DCTSIZE2);
+
+    if (cinfo->quant_tbl_ptrs[quant_table_index] == nullptr) {
+      cinfo->quant_tbl_ptrs[quant_table_index] =
+          jpegli_alloc_quant_table(reinterpret_cast<j_common_ptr>(cinfo));
+    }
+    JQUANT_TBL* quant_table = cinfo->quant_tbl_ptrs[quant_table_index];
+
+    for (size_t i = 0; i < DCTSIZE2; ++i) {
+      int quant_val =
+          precision ? ReadUint16(data, &pos) : ReadUint8(data, &pos);
+      JPEG_VERIFY_INPUT(quant_val, 1, 65535);
+      quant_table->quantval[kJPEGNaturalOrder[i]] = quant_val;
+    }
+  }
+  JPEG_VERIFY_MARKER_END();
+}
+
+void ProcessDNL(j_decompress_ptr cinfo, const uint8_t* data, size_t len) {
+  // Ignore marker.
+}
+
+void ProcessDRI(j_decompress_ptr cinfo, const uint8_t* data, size_t len) {
+  jpeg_decomp_master* m = cinfo->master;
+  if (m->found_dri_) {
+    return JPEGLI_ERROR("Duplicate DRI marker.");
+  }
+  m->found_dri_ = true;
+  size_t pos = 2;
+  JPEG_VERIFY_LEN(2);
+  cinfo->restart_interval = ReadUint16(data, &pos);
+  JPEG_VERIFY_MARKER_END();
+}
+
+void ProcessAPP(j_decompress_ptr cinfo, const uint8_t* data, size_t len) {
+  jpeg_decomp_master* m = cinfo->master;
+  const uint8_t marker = cinfo->unread_marker;
+  const uint8_t* payload = data + 2;
+  size_t payload_size = len - 2;
+  if (marker == 0xE0) {
+    if (payload_size >= 14 && memcmp(payload, "JFIF", 4) == 0) {
+      cinfo->saw_JFIF_marker = TRUE;
+      cinfo->JFIF_major_version = payload[5];
+      cinfo->JFIF_minor_version = payload[6];
+      cinfo->density_unit = payload[7];
+      cinfo->X_density = (payload[8] << 8) + payload[9];
+      cinfo->Y_density = (payload[10] << 8) + payload[11];
+    }
+  } else if (marker == 0xEE) {
+    if (payload_size >= 12 && memcmp(payload, "Adobe", 5) == 0) {
+      cinfo->saw_Adobe_marker = TRUE;
+      cinfo->Adobe_transform = payload[11];
+    }
+  } else if (marker == 0xE2) {
+    if (payload_size >= sizeof(kIccProfileTag) &&
+        memcmp(payload, kIccProfileTag, sizeof(kIccProfileTag)) == 0) {
+      payload += sizeof(kIccProfileTag);
+      payload_size -= sizeof(kIccProfileTag);
+      if (payload_size < 2) {
+        return JPEGLI_ERROR("ICC chunk is too small.");
+      }
+      uint8_t index = payload[0];
+      uint8_t total = payload[1];
+      ++m->icc_index_;
+      if (m->icc_index_ != index) {
+        return JPEGLI_ERROR("Invalid ICC chunk order.");
+      }
+      if (total == 0) {
+        return JPEGLI_ERROR("Invalid ICC chunk total.");
+      }
+      if (m->icc_total_ == 0) {
+        m->icc_total_ = total;
+      } else if (m->icc_total_ != total) {
+        return JPEGLI_ERROR("Invalid ICC chunk total.");
+      }
+      if (m->icc_index_ > m->icc_total_) {
+        return JPEGLI_ERROR("Invalid ICC chunk index.");
+      }
+      m->icc_profile_.insert(m->icc_profile_.end(), payload + 2,
+                             payload + payload_size);
+    }
+  }
+}
+
+void ProcessCOM(j_decompress_ptr cinfo, const uint8_t* data, size_t len) {
+  // Ignore marker.
+}
+
+void ProcessSOI(j_decompress_ptr cinfo, const uint8_t* data, size_t len) {
+  jpeg_decomp_master* m = cinfo->master;
+  if (m->found_soi_) {
+    JPEGLI_ERROR("Duplicate SOI marker");
+  }
+  m->found_soi_ = true;
+}
+
+void ProcessEOI(j_decompress_ptr cinfo, const uint8_t* data, size_t len) {
+  cinfo->master->found_eoi_ = true;
+}
+
+void SaveMarker(j_decompress_ptr cinfo, const uint8_t* data, size_t len) {
+  const uint8_t marker = cinfo->unread_marker;
+  const uint8_t* payload = data + 2;
+  size_t payload_size = len - 2;
+
+  // Insert new saved marker to the head of the list.
+  jpeg_saved_marker_ptr next = cinfo->marker_list;
+  cinfo->marker_list =
+      jpegli::Allocate<jpeg_marker_struct>(cinfo, 1, JPOOL_IMAGE);
+  cinfo->marker_list->next = next;
+  cinfo->marker_list->marker = marker;
+  cinfo->marker_list->original_length = payload_size;
+  cinfo->marker_list->data_length = payload_size;
+  cinfo->marker_list->data =
+      jpegli::Allocate<uint8_t>(cinfo, payload_size, JPOOL_IMAGE);
+  memcpy(cinfo->marker_list->data, payload, payload_size);
+}
+
+uint8_t ProcessNextMarker(j_decompress_ptr cinfo, const uint8_t* const data,
+                          const size_t len, size_t* pos) {
+  jpeg_decomp_master* m = cinfo->master;
+  size_t num_skipped = 0;
+  uint8_t marker = cinfo->unread_marker;
+  if (marker == 0) {
+    // kIsValidMarker[i] == 1 means (0xc0 + i) is a valid marker.
+    static const uint8_t kIsValidMarker[] = {
+        1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
+        1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+        1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
+    };
+    // Skip bytes between markers.
+    while (*pos + 1 < len && (data[*pos] != 0xff || data[*pos + 1] < 0xc0 ||
+                              !kIsValidMarker[data[*pos + 1] - 0xc0])) {
+      ++(*pos);
+      ++num_skipped;
+    }
+    if (*pos + 2 > len) {
+      return kNeedMoreInput;
+    }
+    marker = data[*pos + 1];
+    if (num_skipped > 0) {
+      if (m->found_soi_) {
+        JPEGLI_WARN("Skipped %d bytes before marker 0x%02x", (int)num_skipped,
+                    marker);
+      } else {
+        JPEGLI_ERROR("Did not find SOI marker.");
+      }
+    }
+    *pos += 2;
+    cinfo->unread_marker = marker;
+  }
+  if (!m->found_soi_ && marker != 0xd8) {
+    JPEGLI_ERROR("Did not find SOI marker.");
+  }
+  if (GetMarkerProcessor(cinfo)) {
+    return kHandleMarkerProcessor;
+  }
+  const uint8_t* marker_data = &data[*pos];
+  size_t marker_len = 0;
+  if (marker != 0xd8 && marker != 0xd9) {
+    if (*pos + 2 > len) {
+      return kNeedMoreInput;
+    }
+    marker_len += (data[*pos] << 8) + data[*pos + 1];
+    if (marker_len < 2) {
+      JPEGLI_ERROR("Invalid marker length");
+    }
+    if (*pos + marker_len > len) {
+      // TODO(szabadka) Limit our memory usage by using the skip_input_data
+      // source manager callback on APP markers that are not saved.
+      return kNeedMoreInput;
+    }
+    if (marker >= 0xe0 && m->markers_to_save_[marker - 0xe0]) {
+      SaveMarker(cinfo, marker_data, marker_len);
+    }
+  }
+  if (marker == 0xc0 || marker == 0xc1 || marker == 0xc2) {
+    ProcessSOF(cinfo, marker_data, marker_len);
+  } else if (marker == 0xc4) {
+    ProcessDHT(cinfo, marker_data, marker_len);
+  } else if (marker == 0xda) {
+    ProcessSOS(cinfo, marker_data, marker_len);
+  } else if (marker == 0xdb) {
+    ProcessDQT(cinfo, marker_data, marker_len);
+  } else if (marker == 0xdc) {
+    ProcessDNL(cinfo, marker_data, marker_len);
+  } else if (marker == 0xdd) {
+    ProcessDRI(cinfo, marker_data, marker_len);
+  } else if (marker >= 0xe0 && marker <= 0xef) {
+    ProcessAPP(cinfo, marker_data, marker_len);
+  } else if (marker == 0xfe) {
+    ProcessCOM(cinfo, marker_data, marker_len);
+  } else if (marker == 0xd8) {
+    ProcessSOI(cinfo, marker_data, marker_len);
+  } else if (marker == 0xd9) {
+    ProcessEOI(cinfo, marker_data, marker_len);
+  } else {
+    JPEGLI_ERROR("Unexpected marker 0x%x", marker);
+  }
+  *pos += marker_len;
+  cinfo->unread_marker = 0;
+  if (marker == 0xda) {
+    return JPEG_REACHED_SOS;
+  } else if (marker == 0xd9) {
+    return JPEG_REACHED_EOI;
+  }
+  return kProcessNextMarker;
+}
+
+}  // namespace
+
+jpeg_marker_parser_method GetMarkerProcessor(j_decompress_ptr cinfo) {
+  jpeg_decomp_master* m = cinfo->master;
+  uint8_t marker = cinfo->unread_marker;
+  jpeg_marker_parser_method callback = nullptr;
+  if (marker >= 0xe0 && marker <= 0xef) {
+    callback = m->app_marker_parsers[marker - 0xe0];
+  } else if (marker == 0xfe) {
+    callback = m->com_marker_parser;
+  }
+  return callback;
+}
+
+int ProcessMarkers(j_decompress_ptr cinfo, const uint8_t* const data,
+                   const size_t len, size_t* pos) {
+  for (;;) {
+    int status = ProcessNextMarker(cinfo, data, len, pos);
+    if (status != kProcessNextMarker) {
+      return status;
+    }
+  }
+}
+
+}  // namespace jpegli
diff --git a/third_party/jpeg-xl/lib/jpegli/decode_marker.h b/third_party/jpeg-xl/lib/jpegli/decode_marker.h
new file mode 100644
index 0000000000..d52c335341
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/decode_marker.h
@@ -0,0 +1,34 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JPEGLI_DECODE_MARKER_H_
+#define LIB_JPEGLI_DECODE_MARKER_H_
+
+/* clang-format off */
+#include <stdint.h>
+#include <stdio.h>
+#include <jpeglib.h>
+/* clang-format on */
+
+namespace jpegli {
+
+// Reads the available input in the source manager's input buffer until either
+// the end of the next SOS marker or the end of the input.
+// The corresponding fields of cinfo are updated with the processed input data.
+// Upon return, the input buffer will be at the start or at the end of a marker
+// data segment (inter-marker data is allowed).
+// Return value is one of:
+//   * JPEG_SUSPENDED, if the current input buffer ends before the next SOS or
+//       EOI marker. Input buffer refill is handled by the caller;
+//   * JPEG_REACHED_SOS, if the the next SOS marker is found;
+//   * JPEG_REACHED_EOR, if the end of the input is found.
+int ProcessMarkers(j_decompress_ptr cinfo, const uint8_t* const data,
+                   const size_t len, size_t* pos);
+
+jpeg_marker_parser_method GetMarkerProcessor(j_decompress_ptr cinfo);
+
+}  // namespace jpegli
+
+#endif  // LIB_JPEGLI_DECODE_MARKER_H_
diff --git a/third_party/jpeg-xl/lib/jpegli/decode_scan.cc b/third_party/jpeg-xl/lib/jpegli/decode_scan.cc
new file mode 100644
index 0000000000..29c0172950
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/decode_scan.cc
@@ -0,0 +1,566 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/decode_scan.h"
+
+#include <string.h>
+
+#include <hwy/base.h>
+
+#include "lib/jpegli/decode_internal.h"
+#include "lib/jpegli/error.h"
+#include "lib/jxl/base/status.h"
+
+namespace jpegli {
+namespace {
+
+// Max 14 block per MCU (when 1 channel is subsampled)
+// Max 64 nonzero coefficients per block
+// Max 16 symbol bits plus 11 extra bits per nonzero symbol
+// Max 2 bytes per 8 bits (worst case is all bytes are escaped 0xff)
+constexpr int kMaxMCUByteSize = 6048;
+
+// Helper structure to read bits from the entropy coded data segment.
+struct BitReaderState {
+  BitReaderState(const uint8_t* data, const size_t len, size_t pos)
+      : data_(data), len_(len), start_pos_(pos) {
+    Reset(pos);
+  }
+
+  void Reset(size_t pos) {
+    pos_ = pos;
+    val_ = 0;
+    bits_left_ = 0;
+    next_marker_pos_ = len_;
+    FillBitWindow();
+  }
+
+  // Returns the next byte and skips the 0xff/0x00 escape sequences.
+  uint8_t GetNextByte() {
+    if (pos_ >= next_marker_pos_) {
+      ++pos_;
+      return 0;
+    }
+    uint8_t c = data_[pos_++];
+    if (c == 0xff) {
+      uint8_t escape = pos_ < len_ ? data_[pos_] : 0;
+      if (escape == 0) {
+        ++pos_;
+      } else {
+        // 0xff was followed by a non-zero byte, which means that we found the
+        // start of the next marker segment.
+        next_marker_pos_ = pos_ - 1;
+      }
+    }
+    return c;
+  }
+
+  void FillBitWindow() {
+    if (bits_left_ <= 16) {
+      while (bits_left_ <= 56) {
+        val_ <<= 8;
+        val_ |= (uint64_t)GetNextByte();
+        bits_left_ += 8;
+      }
+    }
+  }
+
+  int ReadBits(int nbits) {
+    FillBitWindow();
+    uint64_t val = (val_ >> (bits_left_ - nbits)) & ((1ULL << nbits) - 1);
+    bits_left_ -= nbits;
+    return val;
+  }
+
+  // Sets *pos to the next stream position, and *bit_pos to the bit position
+  // within the next byte where parsing should continue.
+  // Returns false if the stream ended too early.
+  bool FinishStream(size_t* pos, size_t* bit_pos) {
+    *bit_pos = (8 - (bits_left_ & 7)) & 7;
+    // Give back some bytes that we did not use.
+    int unused_bytes_left = DivCeil(bits_left_, 8);
+    while (unused_bytes_left-- > 0) {
+      --pos_;
+      // If we give back a 0 byte, we need to check if it was a 0xff/0x00 escape
+      // sequence, and if yes, we need to give back one more byte.
+      if (((pos_ == len_ && pos_ == next_marker_pos_) ||
+           (pos_ > 0 && pos_ < next_marker_pos_ && data_[pos_] == 0)) &&
+          (data_[pos_ - 1] == 0xff)) {
+        --pos_;
+      }
+    }
+    if (pos_ >= next_marker_pos_) {
+      *pos = next_marker_pos_;
+      if (pos_ > next_marker_pos_ || *bit_pos > 0) {
+        // Data ran out before the scan was complete.
+        return false;
+      }
+    }
+    *pos = pos_;
+    return true;
+  }
+
+  const uint8_t* data_;
+  const size_t len_;
+  size_t pos_;
+  uint64_t val_;
+  int bits_left_;
+  size_t next_marker_pos_;
+  size_t start_pos_;
+};
+
+// Returns the next Huffman-coded symbol.
+int ReadSymbol(const HuffmanTableEntry* table, BitReaderState* br) {
+  int nbits;
+  br->FillBitWindow();
+  int val = (br->val_ >> (br->bits_left_ - 8)) & 0xff;
+  table += val;
+  nbits = table->bits - 8;
+  if (nbits > 0) {
+    br->bits_left_ -= 8;
+    table += table->value;
+    val = (br->val_ >> (br->bits_left_ - nbits)) & ((1 << nbits) - 1);
+    table += val;
+  }
+  br->bits_left_ -= table->bits;
+  return table->value;
+}
+
+/**
+ * Returns the DC diff or AC value for extra bits value x and prefix code s.
+ *
+ * CCITT Rec. T.81 (1992 E)
+ * Table F.1 – Difference magnitude categories for DC coding
+ *  SSSS | DIFF values
+ * ------+--------------------------
+ *     0 | 0
+ *     1 | –1, 1
+ *     2 | –3, –2, 2, 3
+ *     3 | –7..–4, 4..7
+ * ......|..........................
+ *    11 | –2047..–1024, 1024..2047
+ *
+ * CCITT Rec. T.81 (1992 E)
+ * Table F.2 – Categories assigned to coefficient values
+ * [ Same as Table F.1, but does not include SSSS equal to 0 and 11]
+ *
+ *
+ * CCITT Rec. T.81 (1992 E)
+ * F.1.2.1.1 Structure of DC code table
+ * For each category,... additional bits... appended... to uniquely identify
+ * which difference... occurred... When DIFF is positive... SSSS... bits of DIFF
+ * are appended. When DIFF is negative... SSSS... bits of (DIFF – 1) are
+ * appended... Most significant bit... is 0 for negative differences and 1 for
+ * positive differences.
+ *
+ * In other words the upper half of extra bits range represents DIFF as is.
+ * The lower half represents the negative DIFFs with an offset.
+ */
+int HuffExtend(int x, int s) {
+  JXL_DASSERT(s >= 1);
+  int half = 1 << (s - 1);
+  if (x >= half) {
+    JXL_DASSERT(x < (1 << s));
+    return x;
+  } else {
+    return x - (1 << s) + 1;
+  }
+}
+
+// Decodes one 8x8 block of DCT coefficients from the bit stream.
+bool DecodeDCTBlock(const HuffmanTableEntry* dc_huff,
+                    const HuffmanTableEntry* ac_huff, int Ss, int Se, int Al,
+                    int* eobrun, BitReaderState* br, coeff_t* last_dc_coeff,
+                    coeff_t* coeffs) {
+  // Nowadays multiplication is even faster than variable shift.
+  int Am = 1 << Al;
+  bool eobrun_allowed = Ss > 0;
+  if (Ss == 0) {
+    int s = ReadSymbol(dc_huff, br);
+    if (s >= kJpegDCAlphabetSize) {
+      return false;
+    }
+    int diff = 0;
+    if (s > 0) {
+      int bits = br->ReadBits(s);
+      diff = HuffExtend(bits, s);
+    }
+    int coeff = diff + *last_dc_coeff;
+    const int dc_coeff = coeff * Am;
+    coeffs[0] = dc_coeff;
+    // TODO(eustas): is there a more elegant / explicit way to check this?
+    if (dc_coeff != coeffs[0]) {
+      return false;
+    }
+    *last_dc_coeff = coeff;
+    ++Ss;
+  }
+  if (Ss > Se) {
+    return true;
+  }
+  if (*eobrun > 0) {
+    --(*eobrun);
+    return true;
+  }
+  for (int k = Ss; k <= Se; k++) {
+    int sr = ReadSymbol(ac_huff, br);
+    if (sr >= kJpegHuffmanAlphabetSize) {
+      return false;
+    }
+    int r = sr >> 4;
+    int s = sr & 15;
+    if (s > 0) {
+      k += r;
+      if (k > Se) {
+        return false;
+      }
+      if (s + Al >= kJpegDCAlphabetSize) {
+        return false;
+      }
+      int bits = br->ReadBits(s);
+      int coeff = HuffExtend(bits, s);
+      coeffs[kJPEGNaturalOrder[k]] = coeff * Am;
+    } else if (r == 15) {
+      k += 15;
+    } else {
+      *eobrun = 1 << r;
+      if (r > 0) {
+        if (!eobrun_allowed) {
+          return false;
+        }
+        *eobrun += br->ReadBits(r);
+      }
+      break;
+    }
+  }
+  --(*eobrun);
+  return true;
+}
+
+bool RefineDCTBlock(const HuffmanTableEntry* ac_huff, int Ss, int Se, int Al,
+                    int* eobrun, BitReaderState* br, coeff_t* coeffs) {
+  // Nowadays multiplication is even faster than variable shift.
+  int Am = 1 << Al;
+  bool eobrun_allowed = Ss > 0;
+  if (Ss == 0) {
+    int s = br->ReadBits(1);
+    coeff_t dc_coeff = coeffs[0];
+    dc_coeff |= s * Am;
+    coeffs[0] = dc_coeff;
+    ++Ss;
+  }
+  if (Ss > Se) {
+    return true;
+  }
+  int p1 = Am;
+  int m1 = -Am;
+  int k = Ss;
+  int r;
+  int s;
+  bool in_zero_run = false;
+  if (*eobrun <= 0) {
+    for (; k <= Se; k++) {
+      s = ReadSymbol(ac_huff, br);
+      if (s >= kJpegHuffmanAlphabetSize) {
+        return false;
+      }
+      r = s >> 4;
+      s &= 15;
+      if (s) {
+        if (s != 1) {
+          return false;
+        }
+        s = br->ReadBits(1) ? p1 : m1;
+        in_zero_run = false;
+      } else {
+        if (r != 15) {
+          *eobrun = 1 << r;
+          if (r > 0) {
+            if (!eobrun_allowed) {
+              return false;
+            }
+            *eobrun += br->ReadBits(r);
+          }
+          break;
+        }
+        in_zero_run = true;
+      }
+      do {
+        coeff_t thiscoef = coeffs[kJPEGNaturalOrder[k]];
+        if (thiscoef != 0) {
+          if (br->ReadBits(1)) {
+            if ((thiscoef & p1) == 0) {
+              if (thiscoef >= 0) {
+                thiscoef += p1;
+              } else {
+                thiscoef += m1;
+              }
+            }
+          }
+          coeffs[kJPEGNaturalOrder[k]] = thiscoef;
+        } else {
+          if (--r < 0) {
+            break;
+          }
+        }
+        k++;
+      } while (k <= Se);
+      if (s) {
+        if (k > Se) {
+          return false;
+        }
+        coeffs[kJPEGNaturalOrder[k]] = s;
+      }
+    }
+  }
+  if (in_zero_run) {
+    return false;
+  }
+  if (*eobrun > 0) {
+    for (; k <= Se; k++) {
+      coeff_t thiscoef = coeffs[kJPEGNaturalOrder[k]];
+      if (thiscoef != 0) {
+        if (br->ReadBits(1)) {
+          if ((thiscoef & p1) == 0) {
+            if (thiscoef >= 0) {
+              thiscoef += p1;
+            } else {
+              thiscoef += m1;
+            }
+          }
+        }
+        coeffs[kJPEGNaturalOrder[k]] = thiscoef;
+      }
+    }
+  }
+  --(*eobrun);
+  return true;
+}
+
+void SaveMCUCodingState(j_decompress_ptr cinfo) {
+  jpeg_decomp_master* m = cinfo->master;
+  memcpy(m->mcu_.last_dc_coeff, m->last_dc_coeff_, sizeof(m->last_dc_coeff_));
+  m->mcu_.eobrun = m->eobrun_;
+  size_t offset = 0;
+  for (int i = 0; i < cinfo->comps_in_scan; ++i) {
+    const jpeg_component_info* comp = cinfo->cur_comp_info[i];
+    int c = comp->component_index;
+    size_t block_x = m->scan_mcu_col_ * comp->MCU_width;
+    for (int iy = 0; iy < comp->MCU_height; ++iy) {
+      size_t block_y = m->scan_mcu_row_ * comp->MCU_height + iy;
+      size_t biy = block_y % comp->v_samp_factor;
+      if (block_y >= comp->height_in_blocks) {
+        continue;
+      }
+      size_t nblocks =
+          std::min<size_t>(comp->MCU_width, comp->width_in_blocks - block_x);
+      size_t ncoeffs = nblocks * DCTSIZE2;
+      coeff_t* coeffs = &m->coeff_rows[c][biy][block_x][0];
+      memcpy(&m->mcu_.coeffs[offset], coeffs, ncoeffs * sizeof(coeffs[0]));
+      offset += ncoeffs;
+    }
+  }
+}
+
+void RestoreMCUCodingState(j_decompress_ptr cinfo) {
+  jpeg_decomp_master* m = cinfo->master;
+  memcpy(m->last_dc_coeff_, m->mcu_.last_dc_coeff, sizeof(m->last_dc_coeff_));
+  m->eobrun_ = m->mcu_.eobrun;
+  size_t offset = 0;
+  for (int i = 0; i < cinfo->comps_in_scan; ++i) {
+    const jpeg_component_info* comp = cinfo->cur_comp_info[i];
+    int c = comp->component_index;
+    size_t block_x = m->scan_mcu_col_ * comp->MCU_width;
+    for (int iy = 0; iy < comp->MCU_height; ++iy) {
+      size_t block_y = m->scan_mcu_row_ * comp->MCU_height + iy;
+      size_t biy = block_y % comp->v_samp_factor;
+      if (block_y >= comp->height_in_blocks) {
+        continue;
+      }
+      size_t nblocks =
+          std::min<size_t>(comp->MCU_width, comp->width_in_blocks - block_x);
+      size_t ncoeffs = nblocks * DCTSIZE2;
+      coeff_t* coeffs = &m->coeff_rows[c][biy][block_x][0];
+      memcpy(coeffs, &m->mcu_.coeffs[offset], ncoeffs * sizeof(coeffs[0]));
+      offset += ncoeffs;
+    }
+  }
+}
+
+bool FinishScan(j_decompress_ptr cinfo, const uint8_t* data, const size_t len,
+                size_t* pos, size_t* bit_pos) {
+  jpeg_decomp_master* m = cinfo->master;
+  if (m->eobrun_ > 0) {
+    JPEGLI_ERROR("End-of-block run too long.");
+  }
+  m->eobrun_ = -1;
+  memset(m->last_dc_coeff_, 0, sizeof(m->last_dc_coeff_));
+  if (*bit_pos == 0) {
+    return true;
+  }
+  if (data[*pos] == 0xff) {
+    // After last br.FinishStream we checked that there is at least 2 bytes
+    // in the buffer.
+    JXL_DASSERT(*pos + 1 < len);
+    // br.FinishStream would have detected an early marker.
+    JXL_DASSERT(data[*pos + 1] == 0);
+    *pos += 2;
+  } else {
+    *pos += 1;
+  }
+  *bit_pos = 0;
+  return true;
+}
+
+}  // namespace
+
+void PrepareForiMCURow(j_decompress_ptr cinfo) {
+  jpeg_decomp_master* m = cinfo->master;
+  for (int i = 0; i < cinfo->comps_in_scan; ++i) {
+    const jpeg_component_info* comp = cinfo->cur_comp_info[i];
+    int c = comp->component_index;
+    int by0 = cinfo->input_iMCU_row * comp->v_samp_factor;
+    int block_rows_left = comp->height_in_blocks - by0;
+    int max_block_rows = std::min(comp->v_samp_factor, block_rows_left);
+    int offset = m->streaming_mode_ ? 0 : by0;
+    m->coeff_rows[c] = (*cinfo->mem->access_virt_barray)(
+        reinterpret_cast<j_common_ptr>(cinfo), m->coef_arrays[c], offset,
+        max_block_rows, true);
+  }
+}
+
+int ProcessScan(j_decompress_ptr cinfo, const uint8_t* const data,
+                const size_t len, size_t* pos, size_t* bit_pos) {
+  if (len == 0) {
+    return kNeedMoreInput;
+  }
+  jpeg_decomp_master* m = cinfo->master;
+  for (;;) {
+    // Handle the restart intervals.
+    if (cinfo->restart_interval > 0 && m->restarts_to_go_ == 0) {
+      if (!FinishScan(cinfo, data, len, pos, bit_pos)) {
+        return kNeedMoreInput;
+      }
+      // Go to the next marker, warn if we had to skip any data.
+      size_t num_skipped = 0;
+      while (*pos + 1 < len && (data[*pos] != 0xff || data[*pos + 1] == 0 ||
+                                data[*pos + 1] == 0xff)) {
+        ++(*pos);
+        ++num_skipped;
+      }
+      if (num_skipped > 0) {
+        JPEGLI_WARN("Skipped %d bytes before restart marker", (int)num_skipped);
+      }
+      if (*pos + 2 > len) {
+        return kNeedMoreInput;
+      }
+      cinfo->unread_marker = data[*pos + 1];
+      *pos += 2;
+      return kHandleRestart;
+    }
+
+    size_t start_pos = *pos;
+    BitReaderState br(data, len, start_pos);
+    if (*bit_pos > 0) {
+      br.ReadBits(*bit_pos);
+    }
+    if (start_pos + kMaxMCUByteSize > len) {
+      SaveMCUCodingState(cinfo);
+    }
+
+    // Decode one MCU.
+    HWY_ALIGN_MAX coeff_t dummy_block[DCTSIZE2];
+    bool scan_ok = true;
+    for (int i = 0; i < cinfo->comps_in_scan; ++i) {
+      const jpeg_component_info* comp = cinfo->cur_comp_info[i];
+      int c = comp->component_index;
+      const HuffmanTableEntry* dc_lut =
+          &m->dc_huff_lut_[comp->dc_tbl_no * kJpegHuffmanLutSize];
+      const HuffmanTableEntry* ac_lut =
+          &m->ac_huff_lut_[comp->ac_tbl_no * kJpegHuffmanLutSize];
+      for (int iy = 0; iy < comp->MCU_height; ++iy) {
+        size_t block_y = m->scan_mcu_row_ * comp->MCU_height + iy;
+        int biy = block_y % comp->v_samp_factor;
+        for (int ix = 0; ix < comp->MCU_width; ++ix) {
+          size_t block_x = m->scan_mcu_col_ * comp->MCU_width + ix;
+          coeff_t* coeffs;
+          if (block_x >= comp->width_in_blocks ||
+              block_y >= comp->height_in_blocks) {
+            // Note that it is OK that dummy_block is uninitialized because
+            // it will never be used in any branches, even in the RefineDCTBlock
+            // case, because only DC scans can be interleaved and we don't use
+            // the zero-ness of the DC coeff in the DC refinement code-path.
+            coeffs = dummy_block;
+          } else {
+            coeffs = &m->coeff_rows[c][biy][block_x][0];
+          }
+          if (cinfo->Ah == 0) {
+            if (!DecodeDCTBlock(dc_lut, ac_lut, cinfo->Ss, cinfo->Se, cinfo->Al,
+                                &m->eobrun_, &br,
+                                &m->last_dc_coeff_[comp->component_index],
+                                coeffs)) {
+              scan_ok = false;
+            }
+          } else {
+            if (!RefineDCTBlock(ac_lut, cinfo->Ss, cinfo->Se, cinfo->Al,
+                                &m->eobrun_, &br, coeffs)) {
+              scan_ok = false;
+            }
+          }
+        }
+      }
+    }
+    size_t new_pos;
+    size_t new_bit_pos;
+    bool stream_ok = br.FinishStream(&new_pos, &new_bit_pos);
+    if (new_pos + 2 > len) {
+      // If reading stopped within the last two bytes, we have to request more
+      // input even if FinishStream() returned true, since the Huffman code
+      // reader could have peaked ahead some bits past the current input chunk
+      // and thus the last prefix code length could have been wrong. We can do
+      // this because a valid JPEG bit stream has two extra bytes at the end.
+      RestoreMCUCodingState(cinfo);
+      return kNeedMoreInput;
+    }
+    *pos = new_pos;
+    *bit_pos = new_bit_pos;
+    if (!stream_ok) {
+      // We hit a marker during parsing.
+      JXL_DASSERT(data[*pos] == 0xff);
+      JXL_DASSERT(data[*pos + 1] != 0);
+      RestoreMCUCodingState(cinfo);
+      JPEGLI_WARN("Incomplete scan detected.");
+      return JPEG_SCAN_COMPLETED;
+    }
+    if (!scan_ok) {
+      JPEGLI_ERROR("Failed to decode DCT block");
+    }
+    if (m->restarts_to_go_ > 0) {
+      --m->restarts_to_go_;
+    }
+    ++m->scan_mcu_col_;
+    if (m->scan_mcu_col_ == cinfo->MCUs_per_row) {
+      ++m->scan_mcu_row_;
+      m->scan_mcu_col_ = 0;
+      if (m->scan_mcu_row_ == cinfo->MCU_rows_in_scan) {
+        if (!FinishScan(cinfo, data, len, pos, bit_pos)) {
+          return kNeedMoreInput;
+        }
+        break;
+      } else if ((m->scan_mcu_row_ % m->mcu_rows_per_iMCU_row_) == 0) {
+        // Current iMCU row is done.
+        break;
+      }
+    }
+  }
+  ++cinfo->input_iMCU_row;
+  if (cinfo->input_iMCU_row < cinfo->total_iMCU_rows) {
+    PrepareForiMCURow(cinfo);
+    return JPEG_ROW_COMPLETED;
+  }
+  return JPEG_SCAN_COMPLETED;
+}
+
+}  // namespace jpegli
diff --git a/third_party/jpeg-xl/lib/jpegli/decode_scan.h b/third_party/jpeg-xl/lib/jpegli/decode_scan.h
new file mode 100644
index 0000000000..61d05c67d6
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/decode_scan.h
@@ -0,0 +1,33 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JPEGLI_DECODE_SCAN_H_
+#define LIB_JPEGLI_DECODE_SCAN_H_
+
+/* clang-format off */
+#include <stdint.h>
+#include <stdio.h>
+#include <jpeglib.h>
+/* clang-format on */
+
+namespace jpegli {
+
+// Reads the available input in the source manager's input buffer until the end
+// of the next iMCU row.
+// The corresponding fields of cinfo are updated with the processed input data.
+// Upon return, the input buffer will be at the start of an MCU, or at the end
+// of the scan.
+// Return value is one of:
+//   * JPEG_SUSPENDED, if the input buffer ends before the end of an iMCU row;
+//   * JPEG_ROW_COMPLETED, if the next iMCU row (but not the scan) is reached;
+//   * JPEG_SCAN_COMPLETED, if the end of the scan is reached.
+int ProcessScan(j_decompress_ptr cinfo, const uint8_t* const data,
+                const size_t len, size_t* pos, size_t* bit_pos);
+
+void PrepareForiMCURow(j_decompress_ptr cinfo);
+
+}  // namespace jpegli
+
+#endif  // LIB_JPEGLI_DECODE_SCAN_H_
diff --git a/third_party/jpeg-xl/lib/jpegli/destination_manager.cc b/third_party/jpeg-xl/lib/jpegli/destination_manager.cc
new file mode 100644
index 0000000000..9bc269f0c9
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/destination_manager.cc
@@ -0,0 +1,148 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <string.h>
+
+#include "lib/jpegli/encode.h"
+#include "lib/jpegli/error.h"
+#include "lib/jpegli/memory_manager.h"
+
+namespace jpegli {
+
+constexpr size_t kDestBufferSize = 64 << 10;
+
+struct StdioDestinationManager {
+  jpeg_destination_mgr pub;
+  FILE* f;
+  uint8_t* buffer;
+
+  static void init_destination(j_compress_ptr cinfo) {
+    auto dest = reinterpret_cast<StdioDestinationManager*>(cinfo->dest);
+    dest->pub.next_output_byte = dest->buffer;
+    dest->pub.free_in_buffer = kDestBufferSize;
+  }
+
+  static boolean empty_output_buffer(j_compress_ptr cinfo) {
+    auto dest = reinterpret_cast<StdioDestinationManager*>(cinfo->dest);
+    if (fwrite(dest->buffer, 1, kDestBufferSize, dest->f) != kDestBufferSize) {
+      JPEGLI_ERROR("Failed to write to output stream.");
+    }
+    dest->pub.next_output_byte = dest->buffer;
+    dest->pub.free_in_buffer = kDestBufferSize;
+    return TRUE;
+  }
+
+  static void term_destination(j_compress_ptr cinfo) {
+    auto dest = reinterpret_cast<StdioDestinationManager*>(cinfo->dest);
+    size_t bytes_left = kDestBufferSize - dest->pub.free_in_buffer;
+    if (bytes_left &&
+        fwrite(dest->buffer, 1, bytes_left, dest->f) != bytes_left) {
+      JPEGLI_ERROR("Failed to write to output stream.");
+    }
+    fflush(dest->f);
+    if (ferror(dest->f)) {
+      JPEGLI_ERROR("Failed to write to output stream.");
+    }
+  }
+};
+
+struct MemoryDestinationManager {
+  jpeg_destination_mgr pub;
+  // Output buffer supplied by the application
+  uint8_t** output;
+  unsigned long* output_size;
+  // Output buffer allocated by us.
+  uint8_t* temp_buffer;
+  // Current output buffer (either application supplied or allocated by us).
+  uint8_t* current_buffer;
+  size_t buffer_size;
+
+  static void init_destination(j_compress_ptr cinfo) {}
+
+  static boolean empty_output_buffer(j_compress_ptr cinfo) {
+    auto dest = reinterpret_cast<MemoryDestinationManager*>(cinfo->dest);
+    uint8_t* next_buffer =
+        reinterpret_cast<uint8_t*>(malloc(dest->buffer_size * 2));
+    memcpy(next_buffer, dest->current_buffer, dest->buffer_size);
+    if (dest->temp_buffer != nullptr) {
+      free(dest->temp_buffer);
+    }
+    dest->temp_buffer = next_buffer;
+    dest->current_buffer = next_buffer;
+    *dest->output = next_buffer;
+    *dest->output_size = dest->buffer_size;
+    dest->pub.next_output_byte = next_buffer + dest->buffer_size;
+    dest->pub.free_in_buffer = dest->buffer_size;
+    dest->buffer_size *= 2;
+    return TRUE;
+  }
+
+  static void term_destination(j_compress_ptr cinfo) {
+    auto dest = reinterpret_cast<MemoryDestinationManager*>(cinfo->dest);
+    *dest->output_size = dest->buffer_size - dest->pub.free_in_buffer;
+  }
+};
+
+}  // namespace jpegli
+
+void jpegli_stdio_dest(j_compress_ptr cinfo, FILE* outfile) {
+  if (outfile == nullptr) {
+    JPEGLI_ERROR("jpegli_stdio_dest: Invalid destination.");
+  }
+  if (cinfo->dest && cinfo->dest->init_destination !=
+                         jpegli::StdioDestinationManager::init_destination) {
+    JPEGLI_ERROR("jpegli_stdio_dest: a different dest manager was already set");
+  }
+  if (!cinfo->dest) {
+    cinfo->dest = reinterpret_cast<jpeg_destination_mgr*>(
+        jpegli::Allocate<jpegli::StdioDestinationManager>(cinfo, 1));
+  }
+  auto dest = reinterpret_cast<jpegli::StdioDestinationManager*>(cinfo->dest);
+  dest->f = outfile;
+  dest->buffer = jpegli::Allocate<uint8_t>(cinfo, jpegli::kDestBufferSize);
+  dest->pub.next_output_byte = dest->buffer;
+  dest->pub.free_in_buffer = jpegli::kDestBufferSize;
+  dest->pub.init_destination =
+      jpegli::StdioDestinationManager::init_destination;
+  dest->pub.empty_output_buffer =
+      jpegli::StdioDestinationManager::empty_output_buffer;
+  dest->pub.term_destination =
+      jpegli::StdioDestinationManager::term_destination;
+}
+
+void jpegli_mem_dest(j_compress_ptr cinfo, unsigned char** outbuffer,
+                     unsigned long* outsize) {
+  if (outbuffer == nullptr || outsize == nullptr) {
+    JPEGLI_ERROR("jpegli_mem_dest: Invalid destination.");
+  }
+  if (cinfo->dest && cinfo->dest->init_destination !=
+                         jpegli::MemoryDestinationManager::init_destination) {
+    JPEGLI_ERROR("jpegli_mem_dest: a different dest manager was already set");
+  }
+  if (!cinfo->dest) {
+    auto dest = jpegli::Allocate<jpegli::MemoryDestinationManager>(cinfo, 1);
+    dest->temp_buffer = nullptr;
+    cinfo->dest = reinterpret_cast<jpeg_destination_mgr*>(dest);
+  }
+  auto dest = reinterpret_cast<jpegli::MemoryDestinationManager*>(cinfo->dest);
+  dest->pub.init_destination =
+      jpegli::MemoryDestinationManager::init_destination;
+  dest->pub.empty_output_buffer =
+      jpegli::MemoryDestinationManager::empty_output_buffer;
+  dest->pub.term_destination =
+      jpegli::MemoryDestinationManager::term_destination;
+  dest->output = outbuffer;
+  dest->output_size = outsize;
+  if (*outbuffer == nullptr || *outsize == 0) {
+    dest->temp_buffer =
+        reinterpret_cast<uint8_t*>(malloc(jpegli::kDestBufferSize));
+    *outbuffer = dest->temp_buffer;
+    *outsize = jpegli::kDestBufferSize;
+  }
+  dest->current_buffer = *outbuffer;
+  dest->buffer_size = *outsize;
+  dest->pub.next_output_byte = dest->current_buffer;
+  dest->pub.free_in_buffer = dest->buffer_size;
+}
diff --git a/third_party/jpeg-xl/lib/jpegli/downsample.cc b/third_party/jpeg-xl/lib/jpegli/downsample.cc
new file mode 100644
index 0000000000..df2c156972
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/downsample.cc
@@ -0,0 +1,356 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/downsample.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jpegli/downsample.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jpegli/encode_internal.h"
+#include "lib/jpegli/error.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jpegli {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Add;
+using hwy::HWY_NAMESPACE::Mul;
+using hwy::HWY_NAMESPACE::Vec;
+
+using D = HWY_CAPPED(float, 8);
+constexpr D d;
+
+void DownsampleRow2x1(const float* row_in, size_t len, float* row_out) {
+  const size_t N = Lanes(d);
+  const size_t len_out = len / 2;
+  const auto mul = Set(d, 0.5f);
+  Vec<D> v0, v1;
+  for (size_t x = 0; x < len_out; x += N) {
+    LoadInterleaved2(d, row_in + 2 * x, v0, v1);
+    Store(Mul(mul, Add(v0, v1)), d, row_out + x);
+  }
+}
+
+void DownsampleRow3x1(const float* row_in, size_t len, float* row_out) {
+  const size_t N = Lanes(d);
+  const size_t len_out = len / 3;
+  const auto mul = Set(d, 1.0f / 3);
+  Vec<D> v0, v1, v2;
+  for (size_t x = 0; x < len_out; x += N) {
+    LoadInterleaved3(d, row_in + 3 * x, v0, v1, v2);
+    Store(Mul(mul, Add(Add(v0, v1), v2)), d, row_out + x);
+  }
+}
+
+void DownsampleRow4x1(const float* row_in, size_t len, float* row_out) {
+  const size_t N = Lanes(d);
+  const size_t len_out = len / 4;
+  const auto mul = Set(d, 0.25f);
+  Vec<D> v0, v1, v2, v3;
+  for (size_t x = 0; x < len_out; x += N) {
+    LoadInterleaved4(d, row_in + 4 * x, v0, v1, v2, v3);
+    Store(Mul(mul, Add(Add(v0, v1), Add(v2, v3))), d, row_out + x);
+  }
+}
+
+void Downsample2x1(float* rows_in[MAX_SAMP_FACTOR], size_t len,
+                   float* row_out) {
+  DownsampleRow2x1(rows_in[0], len, row_out);
+}
+
+void Downsample3x1(float* rows_in[MAX_SAMP_FACTOR], size_t len,
+                   float* row_out) {
+  DownsampleRow3x1(rows_in[0], len, row_out);
+}
+
+void Downsample4x1(float* rows_in[MAX_SAMP_FACTOR], size_t len,
+                   float* row_out) {
+  DownsampleRow4x1(rows_in[0], len, row_out);
+}
+
+void Downsample1x2(float* rows_in[MAX_SAMP_FACTOR], size_t len,
+                   float* row_out) {
+  const size_t N = Lanes(d);
+  const auto mul = Set(d, 0.5f);
+  float* row0 = rows_in[0];
+  float* row1 = rows_in[1];
+  for (size_t x = 0; x < len; x += N) {
+    Store(Mul(mul, Add(Load(d, row0 + x), Load(d, row1 + x))), d, row_out + x);
+  }
+}
+
+void Downsample2x2(float* rows_in[MAX_SAMP_FACTOR], size_t len,
+                   float* row_out) {
+  const size_t N = Lanes(d);
+  const size_t len_out = len / 2;
+  const auto mul = Set(d, 0.25f);
+  float* row0 = rows_in[0];
+  float* row1 = rows_in[1];
+  Vec<D> v0, v1, v2, v3;
+  for (size_t x = 0; x < len_out; x += N) {
+    LoadInterleaved2(d, row0 + 2 * x, v0, v1);
+    LoadInterleaved2(d, row1 + 2 * x, v2, v3);
+    Store(Mul(mul, Add(Add(v0, v1), Add(v2, v3))), d, row_out + x);
+  }
+}
+
+void Downsample3x2(float* rows_in[MAX_SAMP_FACTOR], size_t len,
+                   float* row_out) {
+  DownsampleRow3x1(rows_in[0], len, rows_in[0]);
+  DownsampleRow3x1(rows_in[1], len, rows_in[1]);
+  Downsample1x2(rows_in, len / 3, row_out);
+}
+
+void Downsample4x2(float* rows_in[MAX_SAMP_FACTOR], size_t len,
+                   float* row_out) {
+  DownsampleRow4x1(rows_in[0], len, rows_in[0]);
+  DownsampleRow4x1(rows_in[1], len, rows_in[1]);
+  Downsample1x2(rows_in, len / 4, row_out);
+}
+
+void Downsample1x3(float* rows_in[MAX_SAMP_FACTOR], size_t len,
+                   float* row_out) {
+  const size_t N = Lanes(d);
+  const auto mul = Set(d, 1.0f / 3);
+  float* row0 = rows_in[0];
+  float* row1 = rows_in[1];
+  float* row2 = rows_in[2];
+  for (size_t x = 0; x < len; x += N) {
+    const auto in0 = Load(d, row0 + x);
+    const auto in1 = Load(d, row1 + x);
+    const auto in2 = Load(d, row2 + x);
+    Store(Mul(mul, Add(Add(in0, in1), in2)), d, row_out + x);
+  }
+}
+
+void Downsample2x3(float* rows_in[MAX_SAMP_FACTOR], size_t len,
+                   float* row_out) {
+  DownsampleRow2x1(rows_in[0], len, rows_in[0]);
+  DownsampleRow2x1(rows_in[1], len, rows_in[1]);
+  DownsampleRow2x1(rows_in[2], len, rows_in[2]);
+  Downsample1x3(rows_in, len / 2, row_out);
+}
+
+void Downsample3x3(float* rows_in[MAX_SAMP_FACTOR], size_t len,
+                   float* row_out) {
+  DownsampleRow3x1(rows_in[0], len, rows_in[0]);
+  DownsampleRow3x1(rows_in[1], len, rows_in[1]);
+  DownsampleRow3x1(rows_in[2], len, rows_in[2]);
+  Downsample1x3(rows_in, len / 3, row_out);
+}
+
+void Downsample4x3(float* rows_in[MAX_SAMP_FACTOR], size_t len,
+                   float* row_out) {
+  DownsampleRow4x1(rows_in[0], len, rows_in[0]);
+  DownsampleRow4x1(rows_in[1], len, rows_in[1]);
+  DownsampleRow4x1(rows_in[2], len, rows_in[2]);
+  Downsample1x3(rows_in, len / 4, row_out);
+}
+
+void Downsample1x4(float* rows_in[MAX_SAMP_FACTOR], size_t len,
+                   float* row_out) {
+  const size_t N = Lanes(d);
+  const auto mul = Set(d, 0.25f);
+  float* row0 = rows_in[0];
+  float* row1 = rows_in[1];
+  float* row2 = rows_in[2];
+  float* row3 = rows_in[3];
+  for (size_t x = 0; x < len; x += N) {
+    const auto in0 = Load(d, row0 + x);
+    const auto in1 = Load(d, row1 + x);
+    const auto in2 = Load(d, row2 + x);
+    const auto in3 = Load(d, row3 + x);
+    Store(Mul(mul, Add(Add(in0, in1), Add(in2, in3))), d, row_out + x);
+  }
+}
+
+void Downsample2x4(float* rows_in[MAX_SAMP_FACTOR], size_t len,
+                   float* row_out) {
+  DownsampleRow2x1(rows_in[0], len, rows_in[0]);
+  DownsampleRow2x1(rows_in[1], len, rows_in[1]);
+  DownsampleRow2x1(rows_in[2], len, rows_in[2]);
+  DownsampleRow2x1(rows_in[3], len, rows_in[3]);
+  Downsample1x4(rows_in, len / 2, row_out);
+}
+
+void Downsample3x4(float* rows_in[MAX_SAMP_FACTOR], size_t len,
+                   float* row_out) {
+  DownsampleRow3x1(rows_in[0], len, rows_in[0]);
+  DownsampleRow3x1(rows_in[1], len, rows_in[1]);
+  DownsampleRow3x1(rows_in[2], len, rows_in[2]);
+  DownsampleRow3x1(rows_in[3], len, rows_in[3]);
+  Downsample1x4(rows_in, len / 3, row_out);
+}
+
+void Downsample4x4(float* rows_in[MAX_SAMP_FACTOR], size_t len,
+                   float* row_out) {
+  DownsampleRow4x1(rows_in[0], len, rows_in[0]);
+  DownsampleRow4x1(rows_in[1], len, rows_in[1]);
+  DownsampleRow4x1(rows_in[2], len, rows_in[2]);
+  DownsampleRow4x1(rows_in[3], len, rows_in[3]);
+  Downsample1x4(rows_in, len / 4, row_out);
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jpegli
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jpegli {
+
+HWY_EXPORT(Downsample1x2);
+HWY_EXPORT(Downsample1x3);
+HWY_EXPORT(Downsample1x4);
+HWY_EXPORT(Downsample2x1);
+HWY_EXPORT(Downsample2x2);
+HWY_EXPORT(Downsample2x3);
+HWY_EXPORT(Downsample2x4);
+HWY_EXPORT(Downsample3x1);
+HWY_EXPORT(Downsample3x2);
+HWY_EXPORT(Downsample3x3);
+HWY_EXPORT(Downsample3x4);
+HWY_EXPORT(Downsample4x1);
+HWY_EXPORT(Downsample4x2);
+HWY_EXPORT(Downsample4x3);
+HWY_EXPORT(Downsample4x4);
+
+void NullDownsample(float* rows_in[MAX_SAMP_FACTOR], size_t len,
+                    float* row_out) {}
+
+void ChooseDownsampleMethods(j_compress_ptr cinfo) {
+  jpeg_comp_master* m = cinfo->master;
+  for (int c = 0; c < cinfo->num_components; c++) {
+    m->downsample_method[c] = nullptr;
+    jpeg_component_info* comp = &cinfo->comp_info[c];
+    const int h_factor = cinfo->max_h_samp_factor / comp->h_samp_factor;
+    const int v_factor = cinfo->max_v_samp_factor / comp->v_samp_factor;
+    if (v_factor == 1) {
+      if (h_factor == 1) {
+        m->downsample_method[c] = NullDownsample;
+      } else if (h_factor == 2) {
+        m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample2x1);
+      } else if (h_factor == 3) {
+        m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample3x1);
+      } else if (h_factor == 4) {
+        m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample4x1);
+      }
+    } else if (v_factor == 2) {
+      if (h_factor == 1) {
+        m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample1x2);
+      } else if (h_factor == 2) {
+        m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample2x2);
+      } else if (h_factor == 3) {
+        m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample3x2);
+      } else if (h_factor == 4) {
+        m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample4x2);
+      }
+    } else if (v_factor == 3) {
+      if (h_factor == 1) {
+        m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample1x2);
+      } else if (h_factor == 2) {
+        m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample2x2);
+      } else if (h_factor == 3) {
+        m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample3x2);
+      } else if (h_factor == 4) {
+        m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample4x2);
+      }
+    } else if (v_factor == 4) {
+      if (h_factor == 1) {
+        m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample1x4);
+      } else if (h_factor == 2) {
+        m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample2x4);
+      } else if (h_factor == 3) {
+        m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample3x4);
+      } else if (h_factor == 4) {
+        m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample4x4);
+      }
+    }
+    if (m->downsample_method[c] == nullptr) {
+      JPEGLI_ERROR("Unsupported downsampling ratio %dx%d", h_factor, v_factor);
+    }
+  }
+}
+
+void DownsampleInputBuffer(j_compress_ptr cinfo) {
+  if (cinfo->max_h_samp_factor == 1 && cinfo->max_v_samp_factor == 1) {
+    return;
+  }
+  jpeg_comp_master* m = cinfo->master;
+  const size_t iMCU_height = DCTSIZE * cinfo->max_v_samp_factor;
+  const size_t y0 = m->next_iMCU_row * iMCU_height;
+  const size_t y1 = y0 + iMCU_height;
+  const size_t xsize_padded = m->xsize_blocks * DCTSIZE;
+  for (int c = 0; c < cinfo->num_components; c++) {
+    jpeg_component_info* comp = &cinfo->comp_info[c];
+    const int h_factor = cinfo->max_h_samp_factor / comp->h_samp_factor;
+    const int v_factor = cinfo->max_v_samp_factor / comp->v_samp_factor;
+    if (h_factor == 1 && v_factor == 1) {
+      continue;
+    }
+    auto& input = *m->smooth_input[c];
+    auto& output = *m->raw_data[c];
+    const size_t yout0 = y0 / v_factor;
+    float* rows_in[MAX_SAMP_FACTOR];
+    for (size_t yin = y0, yout = yout0; yin < y1; yin += v_factor, ++yout) {
+      for (int iy = 0; iy < v_factor; ++iy) {
+        rows_in[iy] = input.Row(yin + iy);
+      }
+      float* row_out = output.Row(yout);
+      (*m->downsample_method[c])(rows_in, xsize_padded, row_out);
+    }
+  }
+}
+
+void ApplyInputSmoothing(j_compress_ptr cinfo) {
+  if (!cinfo->smoothing_factor) {
+    return;
+  }
+  jpeg_comp_master* m = cinfo->master;
+  const float kW1 = cinfo->smoothing_factor / 1024.0;
+  const float kW0 = 1.0f - 8.0f * kW1;
+  const size_t iMCU_height = DCTSIZE * cinfo->max_v_samp_factor;
+  const ssize_t y0 = m->next_iMCU_row * iMCU_height;
+  const ssize_t y1 = y0 + iMCU_height;
+  const ssize_t xsize_padded = m->xsize_blocks * DCTSIZE;
+  for (int c = 0; c < cinfo->num_components; c++) {
+    auto& input = m->input_buffer[c];
+    auto& output = *m->smooth_input[c];
+    if (m->next_iMCU_row == 0) {
+      input.CopyRow(-1, 0, 1);
+    }
+    if (m->next_iMCU_row + 1 == cinfo->total_iMCU_rows) {
+      size_t last_row = m->ysize_blocks * DCTSIZE - 1;
+      input.CopyRow(last_row + 1, last_row, 1);
+    }
+    // TODO(szabadka) SIMDify this.
+    for (ssize_t y = y0; y < y1; ++y) {
+      const float* row_t = input.Row(y - 1);
+      const float* row_m = input.Row(y);
+      const float* row_b = input.Row(y + 1);
+      float* row_out = output.Row(y);
+      for (ssize_t x = 0; x < xsize_padded; ++x) {
+        float val_tl = row_t[x - 1];
+        float val_tm = row_t[x];
+        float val_tr = row_t[x + 1];
+        float val_ml = row_m[x - 1];
+        float val_mm = row_m[x];
+        float val_mr = row_m[x + 1];
+        float val_bl = row_b[x - 1];
+        float val_bm = row_b[x];
+        float val_br = row_b[x + 1];
+        float val1 = (val_tl + val_tm + val_tr + val_ml + val_mr + val_bl +
+                      val_bm + val_br);
+        row_out[x] = val_mm * kW0 + val1 * kW1;
+      }
+    }
+  }
+}
+
+}  // namespace jpegli
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jpegli/downsample.h b/third_party/jpeg-xl/lib/jpegli/downsample.h
new file mode 100644
index 0000000000..9d87047758
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/downsample.h
@@ -0,0 +1,24 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JPEGLI_DOWNSAMPLE_H_
+#define LIB_JPEGLI_DOWNSAMPLE_H_
+
+/* clang-format off */
+#include <stdio.h>
+#include <jpeglib.h>
+/* clang-format on */
+
+namespace jpegli {
+
+void ChooseDownsampleMethods(j_compress_ptr cinfo);
+
+void DownsampleInputBuffer(j_compress_ptr cinfo);
+
+void ApplyInputSmoothing(j_compress_ptr cinfo);
+
+}  // namespace jpegli
+
+#endif  // LIB_JPEGLI_DOWNSAMPLE_H_
diff --git a/third_party/jpeg-xl/lib/jpegli/encode.cc b/third_party/jpeg-xl/lib/jpegli/encode.cc
new file mode 100644
index 0000000000..6015d7d9bb
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/encode.cc
@@ -0,0 +1,1153 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/encode.h"
+
+#include <cmath>
+#include <initializer_list>
+#include <vector>
+
+#include "lib/jpegli/adaptive_quantization.h"
+#include "lib/jpegli/bit_writer.h"
+#include "lib/jpegli/bitstream.h"
+#include "lib/jpegli/color_transform.h"
+#include "lib/jpegli/dct.h"
+#include "lib/jpegli/downsample.h"
+#include "lib/jpegli/encode_internal.h"
+#include "lib/jpegli/entropy_coding.h"
+#include "lib/jpegli/error.h"
+#include "lib/jpegli/huffman.h"
+#include "lib/jpegli/input.h"
+#include "lib/jpegli/memory_manager.h"
+#include "lib/jpegli/quant.h"
+
+namespace jpegli {
+
+constexpr size_t kMaxBytesInMarker = 65533;
+
+void CheckState(j_compress_ptr cinfo, int state) {
+  if (cinfo->global_state != state) {
+    JPEGLI_ERROR("Unexpected global state %d [expected %d]",
+                 cinfo->global_state, state);
+  }
+}
+
+void CheckState(j_compress_ptr cinfo, int state1, int state2) {
+  if (cinfo->global_state != state1 && cinfo->global_state != state2) {
+    JPEGLI_ERROR("Unexpected global state %d [expected %d or %d]",
+                 cinfo->global_state, state1, state2);
+  }
+}
+
+// Initialize cinfo fields that are not dependent on input image. This is shared
+// between jpegli_CreateCompress() and jpegli_set_defaults()
+void InitializeCompressParams(j_compress_ptr cinfo) {
+  cinfo->data_precision = 8;
+  cinfo->num_scans = 0;
+  cinfo->scan_info = nullptr;
+  cinfo->raw_data_in = FALSE;
+  cinfo->arith_code = FALSE;
+  cinfo->optimize_coding = FALSE;
+  cinfo->CCIR601_sampling = FALSE;
+  cinfo->smoothing_factor = 0;
+  cinfo->dct_method = JDCT_FLOAT;
+  cinfo->restart_interval = 0;
+  cinfo->restart_in_rows = 0;
+  cinfo->write_JFIF_header = FALSE;
+  cinfo->JFIF_major_version = 1;
+  cinfo->JFIF_minor_version = 1;
+  cinfo->density_unit = 0;
+  cinfo->X_density = 1;
+  cinfo->Y_density = 1;
+#if JPEG_LIB_VERSION >= 70
+  cinfo->scale_num = 1;
+  cinfo->scale_denom = 1;
+  cinfo->do_fancy_downsampling = FALSE;
+  cinfo->min_DCT_h_scaled_size = DCTSIZE;
+  cinfo->min_DCT_v_scaled_size = DCTSIZE;
+#endif
+}
+
+float LinearQualityToDistance(int scale_factor) {
+  scale_factor = std::min(5000, std::max(0, scale_factor));
+  int quality =
+      scale_factor < 100 ? 100 - scale_factor / 2 : 5000 / scale_factor;
+  return jpegli_quality_to_distance(quality);
+}
+
+template <typename T>
+void SetSentTableFlag(T** table_ptrs, size_t num, boolean val) {
+  for (size_t i = 0; i < num; ++i) {
+    if (table_ptrs[i]) table_ptrs[i]->sent_table = val;
+  }
+}
+
+struct ProgressiveScan {
+  int Ss, Se, Ah, Al;
+  bool interleaved;
+};
+
+void SetDefaultScanScript(j_compress_ptr cinfo) {
+  int level = cinfo->master->progressive_level;
+  std::vector<ProgressiveScan> progressive_mode;
+  bool interleave_dc =
+      (cinfo->max_h_samp_factor == 1 && cinfo->max_v_samp_factor == 1);
+  if (level == 0) {
+    progressive_mode.push_back({0, 63, 0, 0, true});
+  } else if (level == 1) {
+    progressive_mode.push_back({0, 0, 0, 0, interleave_dc});
+    progressive_mode.push_back({1, 63, 0, 1, false});
+    progressive_mode.push_back({1, 63, 1, 0, false});
+  } else {
+    progressive_mode.push_back({0, 0, 0, 0, interleave_dc});
+    progressive_mode.push_back({1, 2, 0, 0, false});
+    progressive_mode.push_back({3, 63, 0, 2, false});
+    progressive_mode.push_back({3, 63, 2, 1, false});
+    progressive_mode.push_back({3, 63, 1, 0, false});
+  }
+
+  cinfo->script_space_size = 0;
+  for (const auto& scan : progressive_mode) {
+    int comps = scan.interleaved ? MAX_COMPS_IN_SCAN : 1;
+    cinfo->script_space_size += DivCeil(cinfo->num_components, comps);
+  }
+  cinfo->script_space =
+      Allocate<jpeg_scan_info>(cinfo, cinfo->script_space_size);
+
+  jpeg_scan_info* next_scan = cinfo->script_space;
+  for (const auto& scan : progressive_mode) {
+    int comps = scan.interleaved ? MAX_COMPS_IN_SCAN : 1;
+    for (int c = 0; c < cinfo->num_components; c += comps) {
+      next_scan->Ss = scan.Ss;
+      next_scan->Se = scan.Se;
+      next_scan->Ah = scan.Ah;
+      next_scan->Al = scan.Al;
+      next_scan->comps_in_scan = std::min(comps, cinfo->num_components - c);
+      for (int j = 0; j < next_scan->comps_in_scan; ++j) {
+        next_scan->component_index[j] = c + j;
+      }
+      ++next_scan;
+    }
+  }
+  JXL_ASSERT(next_scan - cinfo->script_space == cinfo->script_space_size);
+  cinfo->scan_info = cinfo->script_space;
+  cinfo->num_scans = cinfo->script_space_size;
+}
+
+void ValidateScanScript(j_compress_ptr cinfo) {
+  // Mask of coefficient bits defined by the scan script, for each component
+  // and coefficient index.
+  uint16_t comp_mask[kMaxComponents][DCTSIZE2] = {};
+  static constexpr int kMaxRefinementBit = 10;
+
+  for (int i = 0; i < cinfo->num_scans; ++i) {
+    const jpeg_scan_info& si = cinfo->scan_info[i];
+    if (si.comps_in_scan < 1 || si.comps_in_scan > MAX_COMPS_IN_SCAN) {
+      JPEGLI_ERROR("Invalid number of components in scan %d", si.comps_in_scan);
+    }
+    int last_ci = -1;
+    for (int j = 0; j < si.comps_in_scan; ++j) {
+      int ci = si.component_index[j];
+      if (ci < 0 || ci >= cinfo->num_components) {
+        JPEGLI_ERROR("Invalid component index %d in scan", ci);
+      } else if (ci == last_ci) {
+        JPEGLI_ERROR("Duplicate component index %d in scan", ci);
+      } else if (ci < last_ci) {
+        JPEGLI_ERROR("Out of order component index %d in scan", ci);
+      }
+      last_ci = ci;
+    }
+    if (si.Ss < 0 || si.Se < si.Ss || si.Se >= DCTSIZE2) {
+      JPEGLI_ERROR("Invalid spectral range %d .. %d in scan", si.Ss, si.Se);
+    }
+    if (si.Ah < 0 || si.Al < 0 || si.Al > kMaxRefinementBit) {
+      JPEGLI_ERROR("Invalid refinement bits %d/%d", si.Ah, si.Al);
+    }
+    if (!cinfo->progressive_mode) {
+      if (si.Ss != 0 || si.Se != DCTSIZE2 - 1 || si.Ah != 0 || si.Al != 0) {
+        JPEGLI_ERROR("Invalid scan for sequential mode");
+      }
+    } else {
+      if (si.Ss == 0 && si.Se != 0) {
+        JPEGLI_ERROR("DC and AC together in progressive scan");
+      }
+    }
+    if (si.Ss != 0 && si.comps_in_scan != 1) {
+      JPEGLI_ERROR("Interleaved AC only scan.");
+    }
+    for (int j = 0; j < si.comps_in_scan; ++j) {
+      int ci = si.component_index[j];
+      if (si.Ss != 0 && comp_mask[ci][0] == 0) {
+        JPEGLI_ERROR("AC before DC in component %d of scan", ci);
+      }
+      for (int k = si.Ss; k <= si.Se; ++k) {
+        if (comp_mask[ci][k] == 0) {
+          if (si.Ah != 0) {
+            JPEGLI_ERROR("Invalid first scan refinement bit");
+          }
+          comp_mask[ci][k] = ((0xffff << si.Al) & 0xffff);
+        } else {
+          if (comp_mask[ci][k] != ((0xffff << si.Ah) & 0xffff) ||
+              si.Al != si.Ah - 1) {
+            JPEGLI_ERROR("Invalid refinement bit progression.");
+          }
+          comp_mask[ci][k] |= 1 << si.Al;
+        }
+      }
+    }
+    if (si.comps_in_scan > 1) {
+      size_t mcu_size = 0;
+      for (int j = 0; j < si.comps_in_scan; ++j) {
+        int ci = si.component_index[j];
+        jpeg_component_info* comp = &cinfo->comp_info[ci];
+        mcu_size += comp->h_samp_factor * comp->v_samp_factor;
+      }
+      if (mcu_size > C_MAX_BLOCKS_IN_MCU) {
+        JPEGLI_ERROR("MCU size too big");
+      }
+    }
+  }
+  for (int c = 0; c < cinfo->num_components; ++c) {
+    for (int k = 0; k < DCTSIZE2; ++k) {
+      if (comp_mask[c][k] != 0xffff) {
+        JPEGLI_ERROR("Incomplete scan of component %d and frequency %d", c, k);
+      }
+    }
+  }
+}
+
+void ProcessCompressionParams(j_compress_ptr cinfo) {
+  if (cinfo->dest == nullptr) {
+    JPEGLI_ERROR("Missing destination.");
+  }
+  if (cinfo->image_width < 1 || cinfo->image_height < 1 ||
+      cinfo->input_components < 1) {
+    JPEGLI_ERROR("Empty input image.");
+  }
+  if (cinfo->image_width > static_cast<int>(JPEG_MAX_DIMENSION) ||
+      cinfo->image_height > static_cast<int>(JPEG_MAX_DIMENSION) ||
+      cinfo->input_components > static_cast<int>(kMaxComponents)) {
+    JPEGLI_ERROR("Input image too big.");
+  }
+  if (cinfo->num_components < 1 ||
+      cinfo->num_components > static_cast<int>(kMaxComponents)) {
+    JPEGLI_ERROR("Invalid number of components.");
+  }
+  if (cinfo->data_precision != kJpegPrecision) {
+    JPEGLI_ERROR("Invalid data precision");
+  }
+  if (cinfo->arith_code) {
+    JPEGLI_ERROR("Arithmetic coding is not implemented.");
+  }
+  if (cinfo->CCIR601_sampling) {
+    JPEGLI_ERROR("CCIR601 sampling is not implemented.");
+  }
+  if (cinfo->restart_interval > 65535u) {
+    JPEGLI_ERROR("Restart interval too big");
+  }
+  if (cinfo->smoothing_factor < 0 || cinfo->smoothing_factor > 100) {
+    JPEGLI_ERROR("Invalid smoothing factor %d", cinfo->smoothing_factor);
+  }
+  jpeg_comp_master* m = cinfo->master;
+  cinfo->max_h_samp_factor = cinfo->max_v_samp_factor = 1;
+  for (int c = 0; c < cinfo->num_components; ++c) {
+    jpeg_component_info* comp = &cinfo->comp_info[c];
+    if (comp->component_index != c) {
+      JPEGLI_ERROR("Invalid component index");
+    }
+    for (int j = 0; j < c; ++j) {
+      if (cinfo->comp_info[j].component_id == comp->component_id) {
+        JPEGLI_ERROR("Duplicate component id %d", comp->component_id);
+      }
+    }
+    if (comp->h_samp_factor <= 0 || comp->v_samp_factor <= 0 ||
+        comp->h_samp_factor > MAX_SAMP_FACTOR ||
+        comp->v_samp_factor > MAX_SAMP_FACTOR) {
+      JPEGLI_ERROR("Invalid sampling factor %d x %d", comp->h_samp_factor,
+                   comp->v_samp_factor);
+    }
+    cinfo->max_h_samp_factor =
+        std::max(comp->h_samp_factor, cinfo->max_h_samp_factor);
+    cinfo->max_v_samp_factor =
+        std::max(comp->v_samp_factor, cinfo->max_v_samp_factor);
+  }
+  if (cinfo->num_components == 1 &&
+      (cinfo->max_h_samp_factor != 1 || cinfo->max_v_samp_factor != 1)) {
+    JPEGLI_ERROR("Sampling is not supported for simgle component image.");
+  }
+  size_t iMCU_width = DCTSIZE * cinfo->max_h_samp_factor;
+  size_t iMCU_height = DCTSIZE * cinfo->max_v_samp_factor;
+  size_t total_iMCU_cols = DivCeil(cinfo->image_width, iMCU_width);
+  cinfo->total_iMCU_rows = DivCeil(cinfo->image_height, iMCU_height);
+  m->xsize_blocks = total_iMCU_cols * cinfo->max_h_samp_factor;
+  m->ysize_blocks = cinfo->total_iMCU_rows * cinfo->max_v_samp_factor;
+
+  size_t blocks_per_iMCU = 0;
+  for (int c = 0; c < cinfo->num_components; ++c) {
+    jpeg_component_info* comp = &cinfo->comp_info[c];
+    if (cinfo->max_h_samp_factor % comp->h_samp_factor != 0 ||
+        cinfo->max_v_samp_factor % comp->v_samp_factor != 0) {
+      JPEGLI_ERROR("Non-integral sampling ratios are not supported.");
+    }
+    m->h_factor[c] = cinfo->max_h_samp_factor / comp->h_samp_factor;
+    m->v_factor[c] = cinfo->max_v_samp_factor / comp->v_samp_factor;
+    comp->downsampled_width = DivCeil(cinfo->image_width, m->h_factor[c]);
+    comp->downsampled_height = DivCeil(cinfo->image_height, m->v_factor[c]);
+    comp->width_in_blocks = DivCeil(comp->downsampled_width, DCTSIZE);
+    comp->height_in_blocks = DivCeil(comp->downsampled_height, DCTSIZE);
+    blocks_per_iMCU += comp->h_samp_factor * comp->v_samp_factor;
+  }
+  m->blocks_per_iMCU_row = total_iMCU_cols * blocks_per_iMCU;
+  // Disable adaptive quantization for subsampled luma channel.
+  int y_channel = cinfo->jpeg_color_space == JCS_RGB ? 1 : 0;
+  jpeg_component_info* y_comp = &cinfo->comp_info[y_channel];
+  if (y_comp->h_samp_factor != cinfo->max_h_samp_factor ||
+      y_comp->v_samp_factor != cinfo->max_v_samp_factor) {
+    m->use_adaptive_quantization = false;
+  }
+  if (cinfo->scan_info == nullptr) {
+    SetDefaultScanScript(cinfo);
+  }
+  cinfo->progressive_mode =
+      cinfo->scan_info->Ss != 0 || cinfo->scan_info->Se != DCTSIZE2 - 1;
+  ValidateScanScript(cinfo);
+}
+
+void ResetForImage(j_compress_ptr cinfo) {
+  (*cinfo->err->reset_error_mgr)(reinterpret_cast<j_common_ptr>(cinfo));
+  (*cinfo->dest->init_destination)(cinfo);
+  jpeg_comp_master* m = cinfo->master;
+  m->next_iMCU_row = 0;
+  m->last_restart_interval = 0;
+  m->last_dht_index = 0;
+  m->num_huffman_codes = 0;
+  if (cinfo->num_scans > 0) {
+    m->scan_coding_info =
+        Allocate<ScanCodingInfo>(cinfo, cinfo->num_scans, JPOOL_IMAGE_ALIGNED);
+  }
+}
+
+bool IsStreamingSupported(j_compress_ptr cinfo) {
+  if (cinfo->global_state == kEncWriteCoeffs) {
+    return false;
+  }
+  // TODO(szabadka) Remove this restriction.
+  if (cinfo->restart_interval > 0 || cinfo->restart_in_rows > 0) {
+    return false;
+  }
+  if (cinfo->optimize_coding || cinfo->num_scans > 1) {
+    return false;
+  }
+  return true;
+}
+
+bool IsSinglePassOptimizerSupported(j_compress_ptr cinfo) {
+  return cinfo->num_scans == 1 && cinfo->optimize_coding &&
+         cinfo->restart_interval == 0 && cinfo->restart_in_rows == 0;
+}
+
+void AllocateBuffers(j_compress_ptr cinfo) {
+  jpeg_comp_master* m = cinfo->master;
+  size_t iMCU_width = DCTSIZE * cinfo->max_h_samp_factor;
+  size_t iMCU_height = DCTSIZE * cinfo->max_v_samp_factor;
+  size_t total_iMCU_cols = DivCeil(cinfo->image_width, iMCU_width);
+  size_t xsize_full = total_iMCU_cols * iMCU_width;
+  size_t ysize_full = 3 * iMCU_height;
+  if (!cinfo->raw_data_in) {
+    int num_all_components =
+        std::max(cinfo->input_components, cinfo->num_components);
+    for (int c = 0; c < num_all_components; ++c) {
+      m->input_buffer[c].Allocate(cinfo, ysize_full, xsize_full);
+    }
+  }
+  for (int c = 0; c < cinfo->num_components; ++c) {
+    jpeg_component_info* comp = &cinfo->comp_info[c];
+    size_t xsize = total_iMCU_cols * comp->h_samp_factor * DCTSIZE;
+    size_t ysize = 3 * comp->v_samp_factor * DCTSIZE;
+    if (cinfo->raw_data_in) {
+      m->input_buffer[c].Allocate(cinfo, ysize, xsize);
+    }
+    m->smooth_input[c] = &m->input_buffer[c];
+    if (!cinfo->raw_data_in && cinfo->smoothing_factor) {
+      m->smooth_input[c] = Allocate<RowBuffer<float>>(cinfo, 1, JPOOL_IMAGE);
+      m->smooth_input[c]->Allocate(cinfo, ysize_full, xsize_full);
+    }
+    m->raw_data[c] = m->smooth_input[c];
+    if (!cinfo->raw_data_in && (m->h_factor[c] > 1 || m->v_factor[c] > 1)) {
+      m->raw_data[c] = Allocate<RowBuffer<float>>(cinfo, 1, JPOOL_IMAGE);
+      m->raw_data[c]->Allocate(cinfo, ysize, xsize);
+    }
+    m->quant_mul[c] = Allocate<float>(cinfo, DCTSIZE2, JPOOL_IMAGE_ALIGNED);
+  }
+  m->dct_buffer = Allocate<float>(cinfo, 2 * DCTSIZE2, JPOOL_IMAGE_ALIGNED);
+  m->block_tmp = Allocate<int32_t>(cinfo, DCTSIZE2 * 4, JPOOL_IMAGE_ALIGNED);
+  if (!IsStreamingSupported(cinfo)) {
+    m->coeff_buffers =
+        Allocate<jvirt_barray_ptr>(cinfo, cinfo->num_components, JPOOL_IMAGE);
+    for (int c = 0; c < cinfo->num_components; ++c) {
+      jpeg_component_info* comp = &cinfo->comp_info[c];
+      const size_t xsize_blocks = comp->width_in_blocks;
+      const size_t ysize_blocks = comp->height_in_blocks;
+      m->coeff_buffers[c] = (*cinfo->mem->request_virt_barray)(
+          reinterpret_cast<j_common_ptr>(cinfo), JPOOL_IMAGE,
+          /*pre_zero=*/false, xsize_blocks, ysize_blocks, comp->v_samp_factor);
+    }
+  }
+  if (m->use_adaptive_quantization) {
+    int y_channel = cinfo->jpeg_color_space == JCS_RGB ? 1 : 0;
+    jpeg_component_info* y_comp = &cinfo->comp_info[y_channel];
+    const size_t xsize_blocks = y_comp->width_in_blocks;
+    const size_t vecsize = VectorSize();
+    const size_t xsize_padded = DivCeil(2 * xsize_blocks, vecsize) * vecsize;
+    m->diff_buffer =
+        Allocate<float>(cinfo, xsize_blocks * DCTSIZE + 8, JPOOL_IMAGE_ALIGNED);
+    m->fuzzy_erosion_tmp.Allocate(cinfo, 2, xsize_padded);
+    m->pre_erosion.Allocate(cinfo, 6 * cinfo->max_v_samp_factor, xsize_padded);
+    m->quant_field.Allocate(cinfo, cinfo->max_v_samp_factor, xsize_blocks);
+    for (int c = 0; c < cinfo->num_components; ++c) {
+      m->zero_bias_offset[c] =
+          Allocate<float>(cinfo, DCTSIZE2, JPOOL_IMAGE_ALIGNED);
+      m->zero_bias_mul[c] =
+          Allocate<float>(cinfo, DCTSIZE2, JPOOL_IMAGE_ALIGNED);
+    }
+  }
+}
+
+void ReadInputRow(j_compress_ptr cinfo, const uint8_t* scanline,
+                  float* row[kMaxComponents]) {
+  jpeg_comp_master* m = cinfo->master;
+  int num_all_components =
+      std::max(cinfo->input_components, cinfo->num_components);
+  for (int c = 0; c < num_all_components; ++c) {
+    row[c] = m->input_buffer[c].Row(m->next_input_row);
+  }
+  ++m->next_input_row;
+  if (scanline == nullptr) {
+    for (int c = 0; c < cinfo->input_components; ++c) {
+      memset(row[c], 0, cinfo->image_width * sizeof(row[c][0]));
+    }
+    return;
+  }
+  (*m->input_method)(scanline, cinfo->image_width, row);
+}
+
+void PadInputBuffer(j_compress_ptr cinfo, float* row[kMaxComponents]) {
+  jpeg_comp_master* m = cinfo->master;
+  const size_t len0 = cinfo->image_width;
+  const size_t len1 = m->xsize_blocks * DCTSIZE;
+  for (int c = 0; c < cinfo->num_components; ++c) {
+    // Pad row to a multiple of the iMCU width, plus create a border of 1
+    // repeated pixel for adaptive quant field calculation.
+    float last_val = row[c][len0 - 1];
+    for (size_t x = len0; x <= len1; ++x) {
+      row[c][x] = last_val;
+    }
+    row[c][-1] = row[c][0];
+  }
+  if (m->next_input_row == cinfo->image_height) {
+    size_t num_rows = m->ysize_blocks * DCTSIZE - cinfo->image_height;
+    for (size_t i = 0; i < num_rows; ++i) {
+      for (int c = 0; c < cinfo->num_components; ++c) {
+        float* dest = m->input_buffer[c].Row(m->next_input_row) - 1;
+        memcpy(dest, row[c] - 1, (len1 + 2) * sizeof(dest[0]));
+      }
+      ++m->next_input_row;
+    }
+  }
+}
+
+void ProcessiMCURow(j_compress_ptr cinfo) {
+  JXL_ASSERT(cinfo->master->next_iMCU_row < cinfo->total_iMCU_rows);
+  if (!cinfo->raw_data_in) {
+    ApplyInputSmoothing(cinfo);
+    DownsampleInputBuffer(cinfo);
+  }
+  ComputeAdaptiveQuantField(cinfo);
+  if (IsStreamingSupported(cinfo)) {
+    WriteiMCURow(cinfo);
+  } else {
+    ComputeDCTCoefficients(cinfo);
+  }
+  ++cinfo->master->next_iMCU_row;
+}
+
+void ProcessiMCURows(j_compress_ptr cinfo) {
+  jpeg_comp_master* m = cinfo->master;
+  size_t iMCU_height = DCTSIZE * cinfo->max_v_samp_factor;
+  // To have context rows both above and below the current iMCU row, we delay
+  // processing the first iMCU row and process two iMCU rows after we receive
+  // the last input row.
+  if (m->next_input_row % iMCU_height == 0 && m->next_input_row > iMCU_height) {
+    ProcessiMCURow(cinfo);
+  }
+  if (m->next_input_row >= cinfo->image_height) {
+    ProcessiMCURow(cinfo);
+  }
+}
+
+void InitProgressMonitor(j_compress_ptr cinfo) {
+  if (cinfo->progress == nullptr) {
+    return;
+  }
+  if (IsStreamingSupported(cinfo)) {
+    // We have only one input pass.
+    cinfo->progress->total_passes = 1;
+  } else if (IsSinglePassOptimizerSupported(cinfo)) {
+    // We have one input pass and an encode pass for each scan.
+    cinfo->progress->total_passes = 1 + cinfo->num_scans;
+  } else {
+    // We have one input pass, a histogram pass for each scan, and an encode
+    // pass for each scan.
+    cinfo->progress->total_passes = 1 + 2 * cinfo->num_scans;
+  }
+}
+
+void ProgressMonitorInputPass(j_compress_ptr cinfo) {
+  if (cinfo->progress == nullptr) {
+    return;
+  }
+  cinfo->progress->completed_passes = 0;
+  cinfo->progress->pass_counter = cinfo->next_scanline;
+  cinfo->progress->pass_limit = cinfo->image_height;
+  (*cinfo->progress->progress_monitor)(reinterpret_cast<j_common_ptr>(cinfo));
+}
+
+void WriteFileHeader(j_compress_ptr cinfo) {
+  WriteOutput(cinfo, {0xFF, 0xD8});  // SOI
+  if (cinfo->write_JFIF_header) {
+    EncodeAPP0(cinfo);
+  }
+  if (cinfo->write_Adobe_marker) {
+    EncodeAPP14(cinfo);
+  }
+}
+
+void WriteScanHeader(j_compress_ptr cinfo, size_t scan_idx) {
+  jpeg_comp_master* m = cinfo->master;
+  cinfo->restart_interval = RestartIntervalForScan(cinfo, scan_idx);
+  if (cinfo->restart_interval != m->last_restart_interval) {
+    EncodeDRI(cinfo);
+    m->last_restart_interval = cinfo->restart_interval;
+  }
+  size_t num_dht = cinfo->master->scan_coding_info[scan_idx].num_huffman_codes;
+  if (num_dht > 0) {
+    bool pre_shifted = IsStreamingSupported(cinfo);
+    EncodeDHT(cinfo, m->huffman_codes + m->last_dht_index, num_dht,
+              pre_shifted);
+    m->last_dht_index += num_dht;
+  }
+  EncodeSOS(cinfo, scan_idx);
+}
+
+void WriteHeaderMarkers(j_compress_ptr cinfo) {
+  bool is_baseline = true;
+  CopyHuffmanCodes(cinfo, &is_baseline);
+  EncodeDQT(cinfo, /*write_all_tables=*/false, &is_baseline);
+  EncodeSOF(cinfo, is_baseline);
+  WriteScanHeader(cinfo, 0);
+  memset(cinfo->master->last_dc_coeff, 0, sizeof(cinfo->master->last_dc_coeff));
+}
+
+void EncodeScans(j_compress_ptr cinfo) {
+  if (IsSinglePassOptimizerSupported(cinfo)) {
+    EncodeSingleScan(cinfo);
+    return;
+  }
+  bool is_baseline = false;
+  if (cinfo->optimize_coding || cinfo->progressive_mode) {
+    OptimizeHuffmanCodes(cinfo, &is_baseline);
+  } else {
+    CopyHuffmanCodes(cinfo, &is_baseline);
+  }
+  EncodeDQT(cinfo, /*write_all_tables=*/false, &is_baseline);
+  EncodeSOF(cinfo, is_baseline);
+  for (int i = 0; i < cinfo->num_scans; ++i) {
+    WriteScanHeader(cinfo, i);
+    if (!EncodeScan(cinfo, i)) {
+      JPEGLI_ERROR("Failed to encode scan.");
+    }
+  }
+}
+
+}  // namespace jpegli
+
+void jpegli_CreateCompress(j_compress_ptr cinfo, int version,
+                           size_t structsize) {
+  cinfo->mem = nullptr;
+  if (structsize != sizeof(*cinfo)) {
+    JPEGLI_ERROR("jpegli_compress_struct has wrong size.");
+  }
+  jpegli::InitMemoryManager(reinterpret_cast<j_common_ptr>(cinfo));
+  cinfo->progress = nullptr;
+  cinfo->is_decompressor = FALSE;
+  cinfo->global_state = jpegli::kEncStart;
+  cinfo->dest = nullptr;
+  cinfo->image_width = 0;
+  cinfo->image_height = 0;
+  cinfo->input_components = 0;
+  cinfo->in_color_space = JCS_UNKNOWN;
+  cinfo->input_gamma = 1.0f;
+  cinfo->num_components = 0;
+  cinfo->jpeg_color_space = JCS_UNKNOWN;
+  cinfo->comp_info = nullptr;
+  for (int i = 0; i < NUM_QUANT_TBLS; ++i) {
+    cinfo->quant_tbl_ptrs[i] = nullptr;
+  }
+  for (int i = 0; i < NUM_HUFF_TBLS; ++i) {
+    cinfo->dc_huff_tbl_ptrs[i] = nullptr;
+    cinfo->ac_huff_tbl_ptrs[i] = nullptr;
+  }
+  memset(cinfo->arith_dc_L, 0, sizeof(cinfo->arith_dc_L));
+  memset(cinfo->arith_dc_U, 0, sizeof(cinfo->arith_dc_U));
+  memset(cinfo->arith_ac_K, 0, sizeof(cinfo->arith_ac_K));
+  cinfo->write_Adobe_marker = false;
+  jpegli::InitializeCompressParams(cinfo);
+  cinfo->master = jpegli::Allocate<jpeg_comp_master>(cinfo, 1);
+  cinfo->master->force_baseline = true;
+  cinfo->master->xyb_mode = false;
+  cinfo->master->cicp_transfer_function = 2;  // unknown transfer function code
+  cinfo->master->use_std_tables = false;
+  cinfo->master->use_adaptive_quantization = true;
+  cinfo->master->progressive_level = jpegli::kDefaultProgressiveLevel;
+  cinfo->master->data_type = JPEGLI_TYPE_UINT8;
+  cinfo->master->endianness = JPEGLI_NATIVE_ENDIAN;
+  cinfo->master->coeff_buffers = nullptr;
+}
+
+void jpegli_set_xyb_mode(j_compress_ptr cinfo) {
+  CheckState(cinfo, jpegli::kEncStart);
+  cinfo->master->xyb_mode = true;
+}
+
+void jpegli_set_cicp_transfer_function(j_compress_ptr cinfo, int code) {
+  CheckState(cinfo, jpegli::kEncStart);
+  cinfo->master->cicp_transfer_function = code;
+}
+
+void jpegli_set_defaults(j_compress_ptr cinfo) {
+  CheckState(cinfo, jpegli::kEncStart);
+  jpegli::InitializeCompressParams(cinfo);
+  jpegli_default_colorspace(cinfo);
+  jpegli_set_quality(cinfo, 90, TRUE);
+  jpegli_set_progressive_level(cinfo, jpegli::kDefaultProgressiveLevel);
+  jpegli::AddStandardHuffmanTables(reinterpret_cast<j_common_ptr>(cinfo),
+                                   /*is_dc=*/false);
+  jpegli::AddStandardHuffmanTables(reinterpret_cast<j_common_ptr>(cinfo),
+                                   /*is_dc=*/true);
+}
+
+void jpegli_default_colorspace(j_compress_ptr cinfo) {
+  CheckState(cinfo, jpegli::kEncStart);
+  switch (cinfo->in_color_space) {
+    case JCS_GRAYSCALE:
+      jpegli_set_colorspace(cinfo, JCS_GRAYSCALE);
+      break;
+    case JCS_RGB: {
+      if (cinfo->master->xyb_mode) {
+        jpegli_set_colorspace(cinfo, JCS_RGB);
+      } else {
+        jpegli_set_colorspace(cinfo, JCS_YCbCr);
+      }
+      break;
+    }
+    case JCS_YCbCr:
+      jpegli_set_colorspace(cinfo, JCS_YCbCr);
+      break;
+    case JCS_CMYK:
+      jpegli_set_colorspace(cinfo, JCS_CMYK);
+      break;
+    case JCS_YCCK:
+      jpegli_set_colorspace(cinfo, JCS_YCCK);
+      break;
+    case JCS_UNKNOWN:
+      jpegli_set_colorspace(cinfo, JCS_UNKNOWN);
+      break;
+    default:
+      JPEGLI_ERROR("Unsupported input colorspace %d", cinfo->in_color_space);
+  }
+}
+
+void jpegli_set_colorspace(j_compress_ptr cinfo, J_COLOR_SPACE colorspace) {
+  CheckState(cinfo, jpegli::kEncStart);
+  cinfo->jpeg_color_space = colorspace;
+  switch (colorspace) {
+    case JCS_GRAYSCALE:
+      cinfo->num_components = 1;
+      break;
+    case JCS_RGB:
+    case JCS_YCbCr:
+      cinfo->num_components = 3;
+      break;
+    case JCS_CMYK:
+    case JCS_YCCK:
+      cinfo->num_components = 4;
+      break;
+    case JCS_UNKNOWN:
+      cinfo->num_components =
+          std::min<int>(jpegli::kMaxComponents, cinfo->input_components);
+      break;
+    default:
+      JPEGLI_ERROR("Unsupported jpeg colorspace %d", colorspace);
+  }
+  // Adobe marker is only needed to distinguish CMYK and YCCK JPEGs.
+  cinfo->write_Adobe_marker = (cinfo->jpeg_color_space == JCS_YCCK);
+  if (cinfo->comp_info == nullptr) {
+    cinfo->comp_info =
+        jpegli::Allocate<jpeg_component_info>(cinfo, MAX_COMPONENTS);
+  }
+  memset(cinfo->comp_info, 0,
+         jpegli::kMaxComponents * sizeof(jpeg_component_info));
+  for (int c = 0; c < cinfo->num_components; ++c) {
+    jpeg_component_info* comp = &cinfo->comp_info[c];
+    comp->component_index = c;
+    comp->component_id = c + 1;
+    comp->h_samp_factor = 1;
+    comp->v_samp_factor = 1;
+    comp->quant_tbl_no = 0;
+    comp->dc_tbl_no = 0;
+    comp->ac_tbl_no = 0;
+  }
+  if (colorspace == JCS_RGB) {
+    cinfo->comp_info[0].component_id = 'R';
+    cinfo->comp_info[1].component_id = 'G';
+    cinfo->comp_info[2].component_id = 'B';
+    if (cinfo->master->xyb_mode) {
+      // Subsample blue channel.
+      cinfo->comp_info[0].h_samp_factor = cinfo->comp_info[0].v_samp_factor = 2;
+      cinfo->comp_info[1].h_samp_factor = cinfo->comp_info[1].v_samp_factor = 2;
+      cinfo->comp_info[2].h_samp_factor = cinfo->comp_info[2].v_samp_factor = 1;
+      // Use separate quantization tables for each component
+      cinfo->comp_info[1].quant_tbl_no = 1;
+      cinfo->comp_info[2].quant_tbl_no = 2;
+    }
+  } else if (colorspace == JCS_CMYK) {
+    cinfo->comp_info[0].component_id = 'C';
+    cinfo->comp_info[1].component_id = 'M';
+    cinfo->comp_info[2].component_id = 'Y';
+    cinfo->comp_info[3].component_id = 'K';
+  } else if (colorspace == JCS_YCbCr || colorspace == JCS_YCCK) {
+    // Use separate quantization and Huffman tables for luma and chroma
+    cinfo->comp_info[1].quant_tbl_no = 1;
+    cinfo->comp_info[2].quant_tbl_no = 1;
+    cinfo->comp_info[1].dc_tbl_no = cinfo->comp_info[1].ac_tbl_no = 1;
+    cinfo->comp_info[2].dc_tbl_no = cinfo->comp_info[2].ac_tbl_no = 1;
+  }
+}
+
+void jpegli_set_distance(j_compress_ptr cinfo, float distance,
+                         boolean force_baseline) {
+  CheckState(cinfo, jpegli::kEncStart);
+  cinfo->master->force_baseline = force_baseline;
+  float distances[NUM_QUANT_TBLS] = {distance, distance, distance};
+  jpegli::SetQuantMatrices(cinfo, distances, /*add_two_chroma_tables=*/true);
+}
+
+float jpegli_quality_to_distance(int quality) {
+  return (quality >= 100  ? 0.01f
+          : quality >= 30 ? 0.1f + (100 - quality) * 0.09f
+                          : 53.0f / 3000.0f * quality * quality -
+                                23.0f / 20.0f * quality + 25.0f);
+}
+
+void jpegli_set_quality(j_compress_ptr cinfo, int quality,
+                        boolean force_baseline) {
+  CheckState(cinfo, jpegli::kEncStart);
+  cinfo->master->force_baseline = force_baseline;
+  float distance = jpegli_quality_to_distance(quality);
+  float distances[NUM_QUANT_TBLS] = {distance, distance, distance};
+  jpegli::SetQuantMatrices(cinfo, distances, /*add_two_chroma_tables=*/false);
+}
+
+void jpegli_set_linear_quality(j_compress_ptr cinfo, int scale_factor,
+                               boolean force_baseline) {
+  CheckState(cinfo, jpegli::kEncStart);
+  cinfo->master->force_baseline = force_baseline;
+  float distance = jpegli::LinearQualityToDistance(scale_factor);
+  float distances[NUM_QUANT_TBLS] = {distance, distance, distance};
+  jpegli::SetQuantMatrices(cinfo, distances, /*add_two_chroma_tables=*/false);
+}
+
+#if JPEG_LIB_VERSION >= 70
+void jpegli_default_qtables(j_compress_ptr cinfo, boolean force_baseline) {
+  CheckState(cinfo, jpegli::kEncStart);
+  cinfo->master->force_baseline = force_baseline;
+  float distances[NUM_QUANT_TBLS];
+  for (int i = 0; i < NUM_QUANT_TBLS; ++i) {
+    distances[i] = jpegli::LinearQualityToDistance(cinfo->q_scale_factor[i]);
+  }
+  jpegli::SetQuantMatrices(cinfo, distances, /*add_two_chroma_tables=*/false);
+}
+#endif
+
+int jpegli_quality_scaling(int quality) {
+  quality = std::min(100, std::max(1, quality));
+  return quality < 50 ? 5000 / quality : 200 - 2 * quality;
+}
+
+void jpegli_use_standard_quant_tables(j_compress_ptr cinfo) {
+  CheckState(cinfo, jpegli::kEncStart);
+  cinfo->master->use_std_tables = true;
+}
+
+void jpegli_add_quant_table(j_compress_ptr cinfo, int which_tbl,
+                            const unsigned int* basic_table, int scale_factor,
+                            boolean force_baseline) {
+  CheckState(cinfo, jpegli::kEncStart);
+  if (which_tbl < 0 || which_tbl > NUM_QUANT_TBLS) {
+    JPEGLI_ERROR("Invalid quant table index %d", which_tbl);
+  }
+  if (cinfo->quant_tbl_ptrs[which_tbl] == nullptr) {
+    cinfo->quant_tbl_ptrs[which_tbl] =
+        jpegli_alloc_quant_table(reinterpret_cast<j_common_ptr>(cinfo));
+  }
+  int max_qval = force_baseline ? 255 : 32767U;
+  JQUANT_TBL* quant_table = cinfo->quant_tbl_ptrs[which_tbl];
+  for (int k = 0; k < DCTSIZE2; ++k) {
+    int qval = (basic_table[k] * scale_factor + 50) / 100;
+    qval = std::max(1, std::min(qval, max_qval));
+    quant_table->quantval[k] = qval;
+  }
+  quant_table->sent_table = FALSE;
+}
+
+void jpegli_enable_adaptive_quantization(j_compress_ptr cinfo, boolean value) {
+  CheckState(cinfo, jpegli::kEncStart);
+  cinfo->master->use_adaptive_quantization = value;
+}
+
+void jpegli_simple_progression(j_compress_ptr cinfo) {
+  CheckState(cinfo, jpegli::kEncStart);
+  jpegli_set_progressive_level(cinfo, 2);
+}
+
+void jpegli_set_progressive_level(j_compress_ptr cinfo, int level) {
+  CheckState(cinfo, jpegli::kEncStart);
+  if (level < 0) {
+    JPEGLI_ERROR("Invalid progressive level %d", level);
+  }
+  cinfo->master->progressive_level = level;
+}
+
+void jpegli_set_input_format(j_compress_ptr cinfo, JpegliDataType data_type,
+                             JpegliEndianness endianness) {
+  CheckState(cinfo, jpegli::kEncStart);
+  switch (data_type) {
+    case JPEGLI_TYPE_UINT8:
+    case JPEGLI_TYPE_UINT16:
+    case JPEGLI_TYPE_FLOAT:
+      cinfo->master->data_type = data_type;
+      break;
+    default:
+      JPEGLI_ERROR("Unsupported data type %d", data_type);
+  }
+  switch (endianness) {
+    case JPEGLI_NATIVE_ENDIAN:
+    case JPEGLI_LITTLE_ENDIAN:
+    case JPEGLI_BIG_ENDIAN:
+      cinfo->master->endianness = endianness;
+      break;
+    default:
+      JPEGLI_ERROR("Unsupported endianness %d", endianness);
+  }
+}
+
+#if JPEG_LIB_VERSION >= 70
+void jpegli_calc_jpeg_dimensions(j_compress_ptr cinfo) {
+  // Since input scaling is not supported, we just copy the image dimensions.
+  cinfo->jpeg_width = cinfo->image_width;
+  cinfo->jpeg_height = cinfo->image_height;
+}
+#endif
+
+void jpegli_copy_critical_parameters(j_decompress_ptr srcinfo,
+                                     j_compress_ptr dstinfo) {
+  CheckState(dstinfo, jpegli::kEncStart);
+  // Image parameters.
+  dstinfo->image_width = srcinfo->image_width;
+  dstinfo->image_height = srcinfo->image_height;
+  dstinfo->input_components = srcinfo->num_components;
+  dstinfo->in_color_space = srcinfo->jpeg_color_space;
+  dstinfo->input_gamma = srcinfo->output_gamma;
+  // Compression parameters.
+  jpegli_set_defaults(dstinfo);
+  jpegli_set_colorspace(dstinfo, srcinfo->jpeg_color_space);
+  if (dstinfo->num_components != srcinfo->num_components) {
+    const auto& cinfo = dstinfo;
+    return JPEGLI_ERROR("Mismatch between src colorspace and components");
+  }
+  dstinfo->data_precision = srcinfo->data_precision;
+  dstinfo->CCIR601_sampling = srcinfo->CCIR601_sampling;
+  dstinfo->JFIF_major_version = srcinfo->JFIF_major_version;
+  dstinfo->JFIF_minor_version = srcinfo->JFIF_minor_version;
+  dstinfo->density_unit = srcinfo->density_unit;
+  dstinfo->X_density = srcinfo->X_density;
+  dstinfo->Y_density = srcinfo->Y_density;
+  for (int c = 0; c < dstinfo->num_components; ++c) {
+    jpeg_component_info* srccomp = &srcinfo->comp_info[c];
+    jpeg_component_info* dstcomp = &dstinfo->comp_info[c];
+    dstcomp->component_id = srccomp->component_id;
+    dstcomp->h_samp_factor = srccomp->h_samp_factor;
+    dstcomp->v_samp_factor = srccomp->v_samp_factor;
+    dstcomp->quant_tbl_no = srccomp->quant_tbl_no;
+  }
+  for (int i = 0; i < NUM_QUANT_TBLS; ++i) {
+    if (!srcinfo->quant_tbl_ptrs[i]) continue;
+    if (dstinfo->quant_tbl_ptrs[i] == nullptr) {
+      dstinfo->quant_tbl_ptrs[i] = jpegli::Allocate<JQUANT_TBL>(dstinfo, 1);
+    }
+    memcpy(dstinfo->quant_tbl_ptrs[i], srcinfo->quant_tbl_ptrs[i],
+           sizeof(JQUANT_TBL));
+    dstinfo->quant_tbl_ptrs[i]->sent_table = FALSE;
+  }
+}
+
+void jpegli_suppress_tables(j_compress_ptr cinfo, boolean suppress) {
+  jpegli::SetSentTableFlag(cinfo->quant_tbl_ptrs, NUM_QUANT_TBLS, suppress);
+  jpegli::SetSentTableFlag(cinfo->dc_huff_tbl_ptrs, NUM_HUFF_TBLS, suppress);
+  jpegli::SetSentTableFlag(cinfo->ac_huff_tbl_ptrs, NUM_HUFF_TBLS, suppress);
+}
+
+void jpegli_start_compress(j_compress_ptr cinfo, boolean write_all_tables) {
+  CheckState(cinfo, jpegli::kEncStart);
+  jpegli::ProcessCompressionParams(cinfo);
+  jpegli::InitProgressMonitor(cinfo);
+  jpegli::AllocateBuffers(cinfo);
+  jpegli::ChooseInputMethod(cinfo);
+  if (!cinfo->raw_data_in) {
+    jpegli::ChooseColorTransform(cinfo);
+    jpegli::ChooseDownsampleMethods(cinfo);
+  }
+  jpegli::InitQuantizer(cinfo);
+  if (write_all_tables) {
+    jpegli_suppress_tables(cinfo, FALSE);
+  }
+  (*cinfo->mem->realize_virt_arrays)(reinterpret_cast<j_common_ptr>(cinfo));
+  jpegli::ResetForImage(cinfo);
+  jpegli::WriteFileHeader(cinfo);
+  jpegli::JpegBitWriterInit(cinfo);
+  cinfo->next_scanline = 0;
+  cinfo->master->next_input_row = 0;
+  cinfo->global_state = jpegli::kEncHeader;
+}
+
+void jpegli_write_coefficients(j_compress_ptr cinfo,
+                               jvirt_barray_ptr* coef_arrays) {
+  CheckState(cinfo, jpegli::kEncStart);
+  jpegli::ProcessCompressionParams(cinfo);
+  jpegli::InitProgressMonitor(cinfo);
+  (*cinfo->mem->realize_virt_arrays)(reinterpret_cast<j_common_ptr>(cinfo));
+  cinfo->master->coeff_buffers = coef_arrays;
+  jpegli_suppress_tables(cinfo, FALSE);
+  jpegli::ResetForImage(cinfo);
+  jpegli::WriteFileHeader(cinfo);
+  jpegli::JpegBitWriterInit(cinfo);
+  cinfo->next_scanline = cinfo->image_height;
+  cinfo->master->next_input_row = cinfo->image_height;
+  cinfo->global_state = jpegli::kEncWriteCoeffs;
+}
+
+void jpegli_write_tables(j_compress_ptr cinfo) {
+  CheckState(cinfo, jpegli::kEncStart);
+  if (cinfo->dest == nullptr) {
+    JPEGLI_ERROR("Missing destination.");
+  }
+  jpegli::ResetForImage(cinfo);
+  bool is_baseline = true;
+  jpeg_comp_master* m = cinfo->master;
+  jpegli::WriteOutput(cinfo, {0xFF, 0xD8});  // SOI
+  jpegli::EncodeDQT(cinfo, /*write_all_tables=*/true, &is_baseline);
+  jpegli::CopyHuffmanCodes(cinfo, &is_baseline);
+  jpegli::EncodeDHT(cinfo, m->huffman_codes, m->num_huffman_codes);
+  jpegli::WriteOutput(cinfo, {0xFF, 0xD9});  // EOI
+  (*cinfo->dest->term_destination)(cinfo);
+  jpegli_suppress_tables(cinfo, TRUE);
+}
+
+void jpegli_write_m_header(j_compress_ptr cinfo, int marker,
+                           unsigned int datalen) {
+  CheckState(cinfo, jpegli::kEncHeader, jpegli::kEncWriteCoeffs);
+  if (datalen > jpegli::kMaxBytesInMarker) {
+    JPEGLI_ERROR("Invalid marker length %u", datalen);
+  }
+  if (marker != 0xfe && (marker < 0xe0 || marker > 0xef)) {
+    JPEGLI_ERROR(
+        "jpegli_write_m_header: Only APP and COM markers are supported.");
+  }
+  std::vector<uint8_t> marker_data(4 + datalen);
+  marker_data[0] = 0xff;
+  marker_data[1] = marker;
+  marker_data[2] = (datalen + 2) >> 8;
+  marker_data[3] = (datalen + 2) & 0xff;
+  jpegli::WriteOutput(cinfo, &marker_data[0], 4);
+}
+
+void jpegli_write_m_byte(j_compress_ptr cinfo, int val) {
+  uint8_t data = val;
+  jpegli::WriteOutput(cinfo, &data, 1);
+}
+
+void jpegli_write_marker(j_compress_ptr cinfo, int marker,
+                         const JOCTET* dataptr, unsigned int datalen) {
+  jpegli_write_m_header(cinfo, marker, datalen);
+  jpegli::WriteOutput(cinfo, dataptr, datalen);
+}
+
+void jpegli_write_icc_profile(j_compress_ptr cinfo, const JOCTET* icc_data_ptr,
+                              unsigned int icc_data_len) {
+  constexpr size_t kMaxIccBytesInMarker =
+      jpegli::kMaxBytesInMarker - sizeof jpegli::kICCSignature - 2;
+  const int num_markers =
+      static_cast<int>(jpegli::DivCeil(icc_data_len, kMaxIccBytesInMarker));
+  size_t begin = 0;
+  for (int current_marker = 0; current_marker < num_markers; ++current_marker) {
+    const size_t length = std::min(kMaxIccBytesInMarker, icc_data_len - begin);
+    jpegli_write_m_header(
+        cinfo, jpegli::kICCMarker,
+        static_cast<unsigned int>(length + sizeof jpegli::kICCSignature + 2));
+    for (const unsigned char c : jpegli::kICCSignature) {
+      jpegli_write_m_byte(cinfo, c);
+    }
+    jpegli_write_m_byte(cinfo, current_marker + 1);
+    jpegli_write_m_byte(cinfo, num_markers);
+    for (size_t i = 0; i < length; ++i) {
+      jpegli_write_m_byte(cinfo, icc_data_ptr[begin]);
+      ++begin;
+    }
+  }
+}
+
+JDIMENSION jpegli_write_scanlines(j_compress_ptr cinfo, JSAMPARRAY scanlines,
+                                  JDIMENSION num_lines) {
+  CheckState(cinfo, jpegli::kEncHeader, jpegli::kEncReadImage);
+  if (cinfo->raw_data_in) {
+    JPEGLI_ERROR("jpegli_write_raw_data() must be called for raw data mode.");
+  }
+  jpegli::ProgressMonitorInputPass(cinfo);
+  if (cinfo->global_state == jpegli::kEncHeader &&
+      jpegli::IsStreamingSupported(cinfo)) {
+    jpegli::WriteHeaderMarkers(cinfo);
+  }
+  cinfo->global_state = jpegli::kEncReadImage;
+  jpeg_comp_master* m = cinfo->master;
+  if (num_lines + cinfo->next_scanline > cinfo->image_height) {
+    num_lines = cinfo->image_height - cinfo->next_scanline;
+  }
+  JDIMENSION prev_scanline = cinfo->next_scanline;
+  size_t input_lag = (std::min<size_t>(cinfo->image_height, m->next_input_row) -
+                      cinfo->next_scanline);
+  if (input_lag > num_lines) {
+    JPEGLI_ERROR("Need at least %u lines to continue", input_lag);
+  }
+  if (input_lag > 0) {
+    if (!jpegli::EmptyBitWriterBuffer(&m->bw)) {
+      return 0;
+    }
+    cinfo->next_scanline += input_lag;
+  }
+  float* rows[jpegli::kMaxComponents];
+  for (size_t i = input_lag; i < num_lines; ++i) {
+    jpegli::ReadInputRow(cinfo, scanlines[i], rows);
+    (*m->color_transform)(rows, cinfo->image_width);
+    jpegli::PadInputBuffer(cinfo, rows);
+    jpegli::ProcessiMCURows(cinfo);
+    if (!jpegli::EmptyBitWriterBuffer(&m->bw)) {
+      break;
+    }
+    ++cinfo->next_scanline;
+  }
+  return cinfo->next_scanline - prev_scanline;
+}
+
+JDIMENSION jpegli_write_raw_data(j_compress_ptr cinfo, JSAMPIMAGE data,
+                                 JDIMENSION num_lines) {
+  CheckState(cinfo, jpegli::kEncHeader, jpegli::kEncReadImage);
+  if (!cinfo->raw_data_in) {
+    JPEGLI_ERROR("jpegli_write_raw_data(): raw data mode was not set");
+  }
+  jpegli::ProgressMonitorInputPass(cinfo);
+  if (cinfo->global_state == jpegli::kEncHeader &&
+      jpegli::IsStreamingSupported(cinfo)) {
+    jpegli::WriteHeaderMarkers(cinfo);
+  }
+  cinfo->global_state = jpegli::kEncReadImage;
+  jpeg_comp_master* m = cinfo->master;
+  if (cinfo->next_scanline >= cinfo->image_height) {
+    return 0;
+  }
+  size_t iMCU_height = DCTSIZE * cinfo->max_v_samp_factor;
+  if (num_lines < iMCU_height) {
+    JPEGLI_ERROR("Missing input lines, minimum is %u", iMCU_height);
+  }
+  if (cinfo->next_scanline < m->next_input_row) {
+    JXL_ASSERT(m->next_input_row - cinfo->next_scanline == iMCU_height);
+    if (!jpegli::EmptyBitWriterBuffer(&m->bw)) {
+      return 0;
+    }
+    cinfo->next_scanline = m->next_input_row;
+    return iMCU_height;
+  }
+  size_t iMCU_y = m->next_input_row / iMCU_height;
+  float* rows[jpegli::kMaxComponents];
+  for (int c = 0; c < cinfo->num_components; ++c) {
+    JSAMPARRAY plane = data[c];
+    jpeg_component_info* comp = &cinfo->comp_info[c];
+    size_t xsize = comp->width_in_blocks * DCTSIZE;
+    size_t ysize = comp->v_samp_factor * DCTSIZE;
+    size_t y0 = iMCU_y * ysize;
+    auto& buffer = m->input_buffer[c];
+    for (size_t i = 0; i < ysize; ++i) {
+      rows[0] = buffer.Row(y0 + i);
+      if (plane[i] == nullptr) {
+        memset(rows[0], 0, xsize * sizeof(rows[0][0]));
+      } else {
+        (*m->input_method)(plane[i], xsize, rows);
+      }
+      // We need a border of 1 repeated pixel for adaptive quant field.
+      buffer.PadRow(y0 + i, xsize, /*border=*/1);
+    }
+  }
+  m->next_input_row += iMCU_height;
+  jpegli::ProcessiMCURows(cinfo);
+  if (!jpegli::EmptyBitWriterBuffer(&m->bw)) {
+    return 0;
+  }
+  cinfo->next_scanline += iMCU_height;
+  return iMCU_height;
+}
+
+void jpegli_finish_compress(j_compress_ptr cinfo) {
+  CheckState(cinfo, jpegli::kEncReadImage, jpegli::kEncWriteCoeffs);
+  jpeg_comp_master* m = cinfo->master;
+  if (cinfo->next_scanline < cinfo->image_height) {
+    JPEGLI_ERROR("Incomplete image, expected %d rows, got %d",
+                 cinfo->image_height, cinfo->next_scanline);
+  }
+
+  if (jpegli::IsStreamingSupported(cinfo)) {
+    jpegli::JumpToByteBoundary(&m->bw);
+    if (!jpegli::EmptyBitWriterBuffer(&m->bw)) {
+      JPEGLI_ERROR("Output suspension is not supported in finish_compress");
+    }
+    if (!m->bw.healthy) {
+      JPEGLI_ERROR("Failed to encode scan.");
+    }
+  } else {
+    jpegli::EncodeScans(cinfo);
+  }
+
+  jpegli::WriteOutput(cinfo, {0xFF, 0xD9});  // EOI
+  (*cinfo->dest->term_destination)(cinfo);
+
+  // Release memory and reset global state.
+  jpegli_abort_compress(cinfo);
+}
+
+void jpegli_abort_compress(j_compress_ptr cinfo) {
+  jpegli_abort(reinterpret_cast<j_common_ptr>(cinfo));
+}
+
+void jpegli_destroy_compress(j_compress_ptr cinfo) {
+  jpegli_destroy(reinterpret_cast<j_common_ptr>(cinfo));
+}
diff --git a/third_party/jpeg-xl/lib/jpegli/encode.h b/third_party/jpeg-xl/lib/jpegli/encode.h
new file mode 100644
index 0000000000..075b6b855f
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/encode.h
@@ -0,0 +1,159 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+//
+// This file conatins the C API of the encoder part of the libjpegli library,
+// which is based on the C API of libjpeg, with the function names changed from
+// jpeg_* to jpegli_*, while compressor object definitions are included directly
+// from jpeglib.h
+//
+// Applications can use the libjpegli library in one of the following ways:
+//
+//  (1) Include jpegli/encode.h and/or jpegli/decode.h, update the function
+//      names of the API and link against libjpegli.
+//
+//  (2) Leave the application code unchanged, but replace the libjpeg.so library
+//      with the one built by this project that is API- and ABI-compatible with
+//      libjpeg-turbo's version of libjpeg.so.
+
+#ifndef LIB_JPEGLI_ENCODE_H_
+#define LIB_JPEGLI_ENCODE_H_
+
+/* clang-format off */
+#include <stdio.h>
+#include <jpeglib.h>
+/* clang-format on */
+
+#include "lib/jpegli/common.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+#define jpegli_create_compress(cinfo)              \
+  jpegli_CreateCompress((cinfo), JPEG_LIB_VERSION, \
+                        (size_t)sizeof(struct jpeg_compress_struct))
+void jpegli_CreateCompress(j_compress_ptr cinfo, int version,
+                           size_t structsize);
+
+void jpegli_stdio_dest(j_compress_ptr cinfo, FILE* outfile);
+
+void jpegli_mem_dest(j_compress_ptr cinfo, unsigned char** outbuffer,
+                     unsigned long* outsize);
+
+void jpegli_set_defaults(j_compress_ptr cinfo);
+
+void jpegli_default_colorspace(j_compress_ptr cinfo);
+
+void jpegli_set_colorspace(j_compress_ptr cinfo, J_COLOR_SPACE colorspace);
+
+void jpegli_set_quality(j_compress_ptr cinfo, int quality,
+                        boolean force_baseline);
+
+void jpegli_set_linear_quality(j_compress_ptr cinfo, int scale_factor,
+                               boolean force_baseline);
+
+#if JPEG_LIB_VERSION >= 70
+void jpegli_default_qtables(j_compress_ptr cinfo, boolean force_baseline);
+#endif
+
+int jpegli_quality_scaling(int quality);
+
+void jpegli_add_quant_table(j_compress_ptr cinfo, int which_tbl,
+                            const unsigned int* basic_table, int scale_factor,
+                            boolean force_baseline);
+
+void jpegli_simple_progression(j_compress_ptr cinfo);
+
+void jpegli_suppress_tables(j_compress_ptr cinfo, boolean suppress);
+
+#if JPEG_LIB_VERSION >= 70
+void jpegli_calc_jpeg_dimensions(j_compress_ptr cinfo);
+#endif
+
+void jpegli_copy_critical_parameters(j_decompress_ptr srcinfo,
+                                     j_compress_ptr dstinfo);
+
+void jpegli_write_m_header(j_compress_ptr cinfo, int marker,
+                           unsigned int datalen);
+
+void jpegli_write_m_byte(j_compress_ptr cinfo, int val);
+
+void jpegli_write_marker(j_compress_ptr cinfo, int marker,
+                         const JOCTET* dataptr, unsigned int datalen);
+
+void jpegli_write_icc_profile(j_compress_ptr cinfo, const JOCTET* icc_data_ptr,
+                              unsigned int icc_data_len);
+
+void jpegli_start_compress(j_compress_ptr cinfo, boolean write_all_tables);
+
+void jpegli_write_tables(j_compress_ptr cinfo);
+
+JDIMENSION jpegli_write_scanlines(j_compress_ptr cinfo, JSAMPARRAY scanlines,
+                                  JDIMENSION num_lines);
+
+JDIMENSION jpegli_write_raw_data(j_compress_ptr cinfo, JSAMPIMAGE data,
+                                 JDIMENSION num_lines);
+
+void jpegli_write_coefficients(j_compress_ptr cinfo,
+                               jvirt_barray_ptr* coef_arrays);
+
+void jpegli_finish_compress(j_compress_ptr cinfo);
+
+void jpegli_abort_compress(j_compress_ptr cinfo);
+
+void jpegli_destroy_compress(j_compress_ptr cinfo);
+
+//
+// New API functions that are not available in libjpeg
+//
+// NOTE: This part of the API is still experimental and will probably change in
+// the future.
+//
+
+// Sets the butteraugli target distance for the compressor. This may override
+// the default quantization table indexes based on jpeg colorspace, therefore
+// it must be called after jpegli_set_defaults() or after the last
+// jpegli_set_colorspace() or jpegli_default_colorspace() calls.
+void jpegli_set_distance(j_compress_ptr cinfo, float distance,
+                         boolean force_baseline);
+
+// Returns the butteraugli target distance for the given quality parameter.
+float jpegli_quality_to_distance(int quality);
+
+// Changes the default behaviour of the encoder in the selection of quantization
+// matrices and chroma subsampling. Must be called before jpegli_set_defaults()
+// because some default setting depend on the XYB mode.
+void jpegli_set_xyb_mode(j_compress_ptr cinfo);
+
+// Signals to the encoder that the pixel data that will be provided later
+// through jpegli_write_scanlines() has this transfer function. This must be
+// called before jpegli_set_defaults() because it changes the default
+// quantization tables.
+void jpegli_set_cicp_transfer_function(j_compress_ptr cinfo, int code);
+
+void jpegli_set_input_format(j_compress_ptr cinfo, JpegliDataType data_type,
+                             JpegliEndianness endianness);
+
+// Sets whether or not the encoder uses adaptive quantization for createing more
+// zero coefficients based on the local properties of the image.
+// Enabled by default.
+void jpegli_enable_adaptive_quantization(j_compress_ptr cinfo, boolean value);
+
+// Sets the default progression parameters, where level 0 is sequential, and
+// greater level value means more progression steps. Default is 2.
+void jpegli_set_progressive_level(j_compress_ptr cinfo, int level);
+
+// If this function is called before starting compression, the quality and
+// linear quality parameters will be used to scale the standard quantization
+// tables from Annex K of the JPEG standard. By default jpegli uses a different
+// set of quantization tables and used different scaling parameters for DC and
+// AC coefficients. Must be called before jpegli_set_defaults().
+void jpegli_use_standard_quant_tables(j_compress_ptr cinfo);
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}  // extern "C"
+#endif
+
+#endif  // LIB_JPEGLI_ENCODE_H_
diff --git a/third_party/jpeg-xl/lib/jpegli/encode_api_test.cc b/third_party/jpeg-xl/lib/jpegli/encode_api_test.cc
new file mode 100644
index 0000000000..4358b2b6e0
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/encode_api_test.cc
@@ -0,0 +1,856 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <stdio.h>
+
+#include <algorithm>
+#include <cmath>
+#include <vector>
+
+#include "lib/jpegli/encode.h"
+#include "lib/jpegli/error.h"
+#include "lib/jpegli/test_utils.h"
+#include "lib/jpegli/testing.h"
+#include "lib/jxl/sanitizers.h"
+
+namespace jpegli {
+namespace {
+
+struct TestConfig {
+  TestImage input;
+  CompressParams jparams;
+  JpegIOMode input_mode = PIXELS;
+  double max_bpp;
+  double max_dist;
+};
+
+class EncodeAPITestParam : public ::testing::TestWithParam<TestConfig> {};
+
+void GenerateInput(JpegIOMode input_mode, const CompressParams& jparams,
+                   TestImage* input) {
+  GeneratePixels(input);
+  if (input_mode == RAW_DATA) {
+    GenerateRawData(jparams, input);
+  } else if (input_mode == COEFFICIENTS) {
+    GenerateCoeffs(jparams, input);
+  }
+}
+
+TEST_P(EncodeAPITestParam, TestAPI) {
+  TestConfig config = GetParam();
+  GenerateInput(config.input_mode, config.jparams, &config.input);
+  std::vector<uint8_t> compressed;
+  ASSERT_TRUE(EncodeWithJpegli(config.input, config.jparams, &compressed));
+  if (config.jparams.icc.empty()) {
+    double bpp =
+        compressed.size() * 8.0 / (config.input.xsize * config.input.ysize);
+    printf("bpp: %f\n", bpp);
+    EXPECT_LT(bpp, config.max_bpp);
+  }
+  DecompressParams dparams;
+  dparams.output_mode =
+      config.input_mode == COEFFICIENTS ? COEFFICIENTS : PIXELS;
+  if (config.jparams.set_jpeg_colorspace &&
+      config.jparams.jpeg_color_space == JCS_GRAYSCALE) {
+    ConvertToGrayscale(&config.input);
+  } else {
+    dparams.set_out_color_space = true;
+    dparams.out_color_space = config.input.color_space;
+  }
+  TestImage output;
+  DecodeWithLibjpeg(config.jparams, dparams, compressed, &output);
+  VerifyOutputImage(config.input, output, config.max_dist);
+}
+
+TEST(EncodeAPITest, ReuseCinfoSameImageTwice) {
+  TestImage input;
+  input.xsize = 129;
+  input.ysize = 73;
+  CompressParams jparams;
+  GenerateInput(PIXELS, jparams, &input);
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  std::vector<uint8_t> compressed0;
+  std::vector<uint8_t> compressed1;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    EncodeWithJpegli(input, jparams, &cinfo);
+    compressed0.assign(buffer, buffer + buffer_size);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    EncodeWithJpegli(input, jparams, &cinfo);
+    compressed1.assign(buffer, buffer + buffer_size);
+    return true;
+  };
+  EXPECT_TRUE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+  ASSERT_EQ(compressed0.size(), compressed1.size());
+  EXPECT_EQ(0,
+            memcmp(compressed0.data(), compressed1.data(), compressed0.size()));
+}
+
+std::vector<TestConfig> GenerateBasicConfigs() {
+  std::vector<TestConfig> all_configs;
+  for (int samp : {1, 2}) {
+    for (int progr : {0, 2}) {
+      for (int optimize : {0, 1}) {
+        if (progr && optimize) continue;
+        TestConfig config;
+        config.input.xsize = 257 + samp * 37;
+        config.input.ysize = 265 + optimize * 17;
+        config.jparams.h_sampling = {samp, 1, 1};
+        config.jparams.v_sampling = {samp, 1, 1};
+        config.jparams.progressive_mode = progr;
+        config.jparams.optimize_coding = optimize;
+        config.max_dist = 2.4f;
+        GeneratePixels(&config.input);
+        all_configs.push_back(config);
+      }
+    }
+  }
+  return all_configs;
+}
+
+TEST(EncodeAPITest, ReuseCinfoSameMemOutput) {
+  std::vector<TestConfig> all_configs = GenerateBasicConfigs();
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  {
+    jpeg_compress_struct cinfo;
+    const auto try_catch_block = [&]() -> bool {
+      ERROR_HANDLER_SETUP(jpegli);
+      jpegli_create_compress(&cinfo);
+      jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+      for (const TestConfig& config : all_configs) {
+        EncodeWithJpegli(config.input, config.jparams, &cinfo);
+      }
+      return true;
+    };
+    EXPECT_TRUE(try_catch_block());
+    jpegli_destroy_compress(&cinfo);
+  }
+  std::vector<TestImage> all_outputs(all_configs.size());
+  {
+    jpeg_decompress_struct cinfo = {};
+    const auto try_catch_block = [&]() -> bool {
+      ERROR_HANDLER_SETUP(jpegli);
+      jpeg_create_decompress(&cinfo);
+      jpeg_mem_src(&cinfo, buffer, buffer_size);
+      for (size_t i = 0; i < all_configs.size(); ++i) {
+        DecodeWithLibjpeg(all_configs[i].jparams, DecompressParams(), &cinfo,
+                          &all_outputs[i]);
+      }
+      return true;
+    };
+    EXPECT_TRUE(try_catch_block());
+    jpeg_destroy_decompress(&cinfo);
+  }
+  for (size_t i = 0; i < all_configs.size(); ++i) {
+    VerifyOutputImage(all_configs[i].input, all_outputs[i],
+                      all_configs[i].max_dist);
+  }
+  if (buffer) free(buffer);
+}
+
+TEST(EncodeAPITest, ReuseCinfoSameStdOutput) {
+  std::vector<TestConfig> all_configs = GenerateBasicConfigs();
+  FILE* tmpf = tmpfile();
+  JXL_CHECK(tmpf);
+  {
+    jpeg_compress_struct cinfo;
+    const auto try_catch_block = [&]() -> bool {
+      ERROR_HANDLER_SETUP(jpegli);
+      jpegli_create_compress(&cinfo);
+      jpegli_stdio_dest(&cinfo, tmpf);
+      for (const TestConfig& config : all_configs) {
+        EncodeWithJpegli(config.input, config.jparams, &cinfo);
+      }
+      return true;
+    };
+    EXPECT_TRUE(try_catch_block());
+    jpegli_destroy_compress(&cinfo);
+  }
+  rewind(tmpf);
+  std::vector<TestImage> all_outputs(all_configs.size());
+  {
+    jpeg_decompress_struct cinfo = {};
+    const auto try_catch_block = [&]() -> bool {
+      ERROR_HANDLER_SETUP(jpegli);
+      jpeg_create_decompress(&cinfo);
+      jpeg_stdio_src(&cinfo, tmpf);
+      for (size_t i = 0; i < all_configs.size(); ++i) {
+        DecodeWithLibjpeg(all_configs[i].jparams, DecompressParams(), &cinfo,
+                          &all_outputs[i]);
+      }
+      return true;
+    };
+    EXPECT_TRUE(try_catch_block());
+    jpeg_destroy_decompress(&cinfo);
+  }
+  for (size_t i = 0; i < all_configs.size(); ++i) {
+    VerifyOutputImage(all_configs[i].input, all_outputs[i],
+                      all_configs[i].max_dist);
+  }
+  fclose(tmpf);
+}
+
+TEST(EncodeAPITest, ReuseCinfoChangeParams) {
+  TestImage input, output;
+  CompressParams jparams;
+  DecompressParams dparams;
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  std::vector<uint8_t> compressed;
+  jpeg_compress_struct cinfo;
+  const auto max_rms = [](int q, int hs, int vs) {
+    if (hs == 1 && vs == 1) return q == 90 ? 2.2 : 0.6;
+    if (hs == 2 && vs == 2) return q == 90 ? 2.8 : 1.2;
+    return q == 90 ? 2.4 : 1.0;
+  };
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    input.xsize = 129;
+    input.ysize = 73;
+    dparams.set_out_color_space = true;
+    for (JpegIOMode input_mode : {PIXELS, RAW_DATA, PIXELS, COEFFICIENTS}) {
+      for (int h_samp : {2, 1}) {
+        for (int v_samp : {2, 1}) {
+          for (int progr : {0, 2}) {
+            for (int quality : {90, 100}) {
+              input.Clear();
+              input.color_space =
+                  (input_mode == RAW_DATA ? JCS_YCbCr : JCS_RGB);
+              jparams.quality = quality;
+              jparams.h_sampling = {h_samp, 1, 1};
+              jparams.v_sampling = {v_samp, 1, 1};
+              jparams.progressive_mode = progr;
+              printf(
+                  "Generating input with quality %d chroma subsampling %dx%d "
+                  "input mode %d progressive_mode %d\n",
+                  quality, h_samp, v_samp, input_mode, progr);
+              GenerateInput(input_mode, jparams, &input);
+              jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+              if (input_mode != COEFFICIENTS) {
+                cinfo.image_width = input.xsize;
+                cinfo.image_height = input.ysize;
+                cinfo.input_components = input.components;
+                jpegli_set_defaults(&cinfo);
+                jpegli_start_compress(&cinfo, TRUE);
+                jpegli_abort_compress(&cinfo);
+                jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+              }
+              EncodeWithJpegli(input, jparams, &cinfo);
+              compressed.resize(buffer_size);
+              std::copy_n(buffer, buffer_size, compressed.data());
+              dparams.output_mode =
+                  input_mode == COEFFICIENTS ? COEFFICIENTS : PIXELS;
+              dparams.out_color_space = input.color_space;
+              output.Clear();
+              DecodeWithLibjpeg(jparams, dparams, compressed, &output);
+              VerifyOutputImage(input, output,
+                                max_rms(quality, h_samp, v_samp));
+            }
+          }
+        }
+      }
+    }
+    return true;
+  };
+  EXPECT_TRUE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncodeAPITest, AbbreviatedStreams) {
+  uint8_t* table_stream = nullptr;
+  unsigned long table_stream_size = 0;
+  uint8_t* data_stream = nullptr;
+  unsigned long data_stream_size = 0;
+  {
+    jpeg_compress_struct cinfo;
+    const auto try_catch_block = [&]() -> bool {
+      ERROR_HANDLER_SETUP(jpegli);
+      jpegli_create_compress(&cinfo);
+      jpegli_mem_dest(&cinfo, &table_stream, &table_stream_size);
+      cinfo.input_components = 3;
+      cinfo.in_color_space = JCS_RGB;
+      jpegli_set_defaults(&cinfo);
+      jpegli_write_tables(&cinfo);
+      jpegli_mem_dest(&cinfo, &data_stream, &data_stream_size);
+      cinfo.image_width = 1;
+      cinfo.image_height = 1;
+      cinfo.optimize_coding = FALSE;
+      jpegli_set_progressive_level(&cinfo, 0);
+      jpegli_start_compress(&cinfo, FALSE);
+      JSAMPLE image[3] = {0};
+      JSAMPROW row[] = {image};
+      jpegli_write_scanlines(&cinfo, row, 1);
+      jpegli_finish_compress(&cinfo);
+      return true;
+    };
+    EXPECT_TRUE(try_catch_block());
+    EXPECT_LT(data_stream_size, 50);
+    jpegli_destroy_compress(&cinfo);
+  }
+  {
+    jpeg_decompress_struct cinfo = {};
+    const auto try_catch_block = [&]() -> bool {
+      ERROR_HANDLER_SETUP(jpeg);
+      jpeg_create_decompress(&cinfo);
+      jpeg_mem_src(&cinfo, table_stream, table_stream_size);
+      jpeg_read_header(&cinfo, FALSE);
+      jpeg_mem_src(&cinfo, data_stream, data_stream_size);
+      jpeg_read_header(&cinfo, TRUE);
+      EXPECT_EQ(1, cinfo.image_width);
+      EXPECT_EQ(1, cinfo.image_height);
+      EXPECT_EQ(3, cinfo.num_components);
+      jpeg_start_decompress(&cinfo);
+      JSAMPLE image[3] = {0};
+      JSAMPROW row[] = {image};
+      jpeg_read_scanlines(&cinfo, row, 1);
+      jxl::msan::UnpoisonMemory(image, 3);
+      EXPECT_EQ(0, image[0]);
+      EXPECT_EQ(0, image[1]);
+      EXPECT_EQ(0, image[2]);
+      jpeg_finish_decompress(&cinfo);
+      return true;
+    };
+    EXPECT_TRUE(try_catch_block());
+    jpeg_destroy_decompress(&cinfo);
+  }
+  if (table_stream) free(table_stream);
+  if (data_stream) free(data_stream);
+}
+
+void CopyQuantTables(j_compress_ptr cinfo, uint16_t* quant_tables) {
+  for (int c = 0; c < cinfo->num_components; ++c) {
+    int quant_idx = cinfo->comp_info[c].quant_tbl_no;
+    JQUANT_TBL* quant_table = cinfo->quant_tbl_ptrs[quant_idx];
+    for (int k = 0; k < DCTSIZE2; ++k) {
+      quant_tables[c * DCTSIZE2 + k] = quant_table->quantval[k];
+    }
+  }
+}
+
+TEST(EncodeAPITest, QualitySettings) {
+  // Test that jpegli_set_quality, jpegli_set_linear_quality and
+  // jpegli_quality_scaling are consistent with each other.
+  uint16_t quant_tables0[3 * DCTSIZE2];
+  uint16_t quant_tables1[3 * DCTSIZE2];
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    cinfo.input_components = 3;
+    cinfo.in_color_space = JCS_RGB;
+    jpegli_set_defaults(&cinfo);
+    for (boolean baseline : {FALSE, TRUE}) {
+      for (int q = 1; q <= 100; ++q) {
+        jpegli_set_quality(&cinfo, q, baseline);
+        CopyQuantTables(&cinfo, quant_tables0);
+        jpegli_set_linear_quality(&cinfo, jpegli_quality_scaling(q), baseline);
+        CopyQuantTables(&cinfo, quant_tables1);
+        EXPECT_EQ(0,
+                  memcmp(quant_tables0, quant_tables1, sizeof(quant_tables0)));
+#if JPEG_LIB_VERSION >= 70
+        for (int i = 0; i < NUM_QUANT_TBLS; ++i) {
+          cinfo.q_scale_factor[i] = jpegli_quality_scaling(q);
+        }
+        jpegli_default_qtables(&cinfo, baseline);
+        CopyQuantTables(&cinfo, quant_tables1);
+        EXPECT_EQ(0,
+                  memcmp(quant_tables0, quant_tables1, sizeof(quant_tables0)));
+#endif
+      }
+    }
+    return true;
+  };
+  EXPECT_TRUE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  // Test jpegli_quality_scaling for some specific values .
+  EXPECT_EQ(5000, jpegli_quality_scaling(-1));
+  EXPECT_EQ(5000, jpegli_quality_scaling(0));
+  EXPECT_EQ(5000, jpegli_quality_scaling(1));
+  EXPECT_EQ(100, jpegli_quality_scaling(50));
+  EXPECT_EQ(50, jpegli_quality_scaling(75));
+  EXPECT_EQ(20, jpegli_quality_scaling(90));
+  EXPECT_EQ(0, jpegli_quality_scaling(100));
+  EXPECT_EQ(0, jpegli_quality_scaling(101));
+}
+
+std::vector<TestConfig> GenerateTests() {
+  std::vector<TestConfig> all_tests;
+  for (int h_samp : {1, 2}) {
+    for (int v_samp : {1, 2}) {
+      for (int progr : {0, 2}) {
+        for (int optimize : {0, 1}) {
+          if (progr && optimize) continue;
+          TestConfig config;
+          config.jparams.h_sampling = {h_samp, 1, 1};
+          config.jparams.v_sampling = {v_samp, 1, 1};
+          config.jparams.progressive_mode = progr;
+          if (!progr) {
+            config.jparams.optimize_coding = optimize;
+          }
+          const float kMaxBpp[4] = {1.55, 1.45, 1.45, 1.32};
+          const float kMaxDist[4] = {1.95, 2.1, 2.1, 2.0};
+          const int idx = v_samp * 2 + h_samp - 3;
+          config.max_bpp =
+              kMaxBpp[idx] * (optimize ? 0.97 : 1.0) * (progr ? 0.97 : 1.0);
+          config.max_dist = kMaxDist[idx];
+          all_tests.push_back(config);
+        }
+      }
+    }
+  }
+  {
+    TestConfig config;
+    config.jparams.quality = 100;
+    config.max_bpp = 6.6;
+    config.max_dist = 0.6;
+    all_tests.push_back(config);
+  }
+  {
+    TestConfig config;
+    config.jparams.quality = 80;
+    config.max_bpp = 1.05;
+    config.max_dist = 2.7;
+    all_tests.push_back(config);
+  }
+  for (int samp : {1, 2}) {
+    for (int progr : {0, 2}) {
+      for (int optimize : {0, 1}) {
+        if (progr && optimize) continue;
+        TestConfig config;
+        config.input.xsize = 257;
+        config.input.ysize = 265;
+        config.jparams.h_sampling = {samp, 1, 1};
+        config.jparams.v_sampling = {samp, 1, 1};
+        config.jparams.progressive_mode = progr;
+        if (!progr) {
+          config.jparams.optimize_coding = optimize;
+        }
+        config.jparams.use_adaptive_quantization = false;
+        config.max_bpp = 2.05f;
+        config.max_dist = 2.3f;
+        all_tests.push_back(config);
+      }
+    }
+  }
+  for (int h0_samp : {1, 2, 4}) {
+    for (int v0_samp : {1, 2, 4}) {
+      for (int h2_samp : {1, 2, 4}) {
+        for (int v2_samp : {1, 2, 4}) {
+          TestConfig config;
+          config.input.xsize = 137;
+          config.input.ysize = 75;
+          config.jparams.progressive_mode = 2;
+          config.jparams.h_sampling = {h0_samp, 1, h2_samp};
+          config.jparams.v_sampling = {v0_samp, 1, v2_samp};
+          config.max_bpp = 2.5;
+          config.max_dist = 12.0;
+          all_tests.push_back(config);
+        }
+      }
+    }
+  }
+  for (int h0_samp : {1, 3}) {
+    for (int v0_samp : {1, 3}) {
+      for (int h2_samp : {1, 3}) {
+        for (int v2_samp : {1, 3}) {
+          TestConfig config;
+          config.input.xsize = 205;
+          config.input.ysize = 99;
+          config.jparams.progressive_mode = 2;
+          config.jparams.h_sampling = {h0_samp, 1, h2_samp};
+          config.jparams.v_sampling = {v0_samp, 1, v2_samp};
+          config.max_bpp = 2.5;
+          config.max_dist = 10.0;
+          all_tests.push_back(config);
+        }
+      }
+    }
+  }
+  for (int h0_samp : {1, 2, 3, 4}) {
+    for (int v0_samp : {1, 2, 3, 4}) {
+      TestConfig config;
+      config.input.xsize = 217;
+      config.input.ysize = 129;
+      config.jparams.progressive_mode = 2;
+      config.jparams.h_sampling = {h0_samp, 1, 1};
+      config.jparams.v_sampling = {v0_samp, 1, 1};
+      config.max_bpp = 2.0;
+      config.max_dist = 5.5;
+      all_tests.push_back(config);
+    }
+  }
+  for (int p = 0; p < 3 + kNumTestScripts; ++p) {
+    TestConfig config;
+    config.jparams.progressive_mode = p;
+    const float kMaxBpp[] = {1.59, 1.51, 1.48, 1.59, 1.55, 1.55, 1.51};
+    config.max_bpp = kMaxBpp[p];
+    config.max_dist = 2.0;
+    all_tests.push_back(config);
+  }
+  {
+    TestConfig config;
+    config.jparams.simple_progression = true;
+    config.max_bpp = 1.48;
+    config.max_dist = 2.0;
+    all_tests.push_back(config);
+  }
+  {
+    TestConfig config;
+    config.input_mode = COEFFICIENTS;
+    config.jparams.h_sampling = {2, 1, 1};
+    config.jparams.v_sampling = {2, 1, 1};
+    config.jparams.progressive_mode = 0;
+    config.jparams.optimize_coding = 0;
+    config.max_bpp = 16;
+    config.max_dist = 0.0;
+    all_tests.push_back(config);
+  }
+  {
+    TestConfig config;
+    config.jparams.xyb_mode = true;
+    config.jparams.progressive_mode = 2;
+    config.max_bpp = 1.5;
+    config.max_dist = 3.5;
+    all_tests.push_back(config);
+  }
+  {
+    TestConfig config;
+    config.jparams.libjpeg_mode = true;
+    config.max_bpp = 2.1;
+    config.max_dist = 1.7;
+    all_tests.push_back(config);
+  }
+
+  for (J_COLOR_SPACE in_color_space : {JCS_RGB, JCS_YCbCr, JCS_GRAYSCALE}) {
+    for (J_COLOR_SPACE jpeg_color_space : {JCS_RGB, JCS_YCbCr, JCS_GRAYSCALE}) {
+      if (jpeg_color_space == JCS_RGB && in_color_space == JCS_YCbCr) continue;
+      TestConfig config;
+      config.input.xsize = config.input.ysize = 256;
+      config.input.color_space = in_color_space;
+      config.jparams.set_jpeg_colorspace = true;
+      config.jparams.jpeg_color_space = jpeg_color_space;
+      config.max_bpp = jpeg_color_space == JCS_RGB ? 4.5 : 1.85;
+      config.max_dist = jpeg_color_space == JCS_RGB ? 1.4 : 2.05;
+      all_tests.push_back(config);
+    }
+  }
+  for (J_COLOR_SPACE in_color_space : {JCS_CMYK, JCS_YCCK}) {
+    for (J_COLOR_SPACE jpeg_color_space : {JCS_CMYK, JCS_YCCK}) {
+      if (jpeg_color_space == JCS_CMYK && in_color_space == JCS_YCCK) continue;
+      TestConfig config;
+      config.input.xsize = config.input.ysize = 256;
+      config.input.color_space = in_color_space;
+      if (in_color_space != jpeg_color_space) {
+        config.jparams.set_jpeg_colorspace = true;
+        config.jparams.jpeg_color_space = jpeg_color_space;
+      }
+      config.max_bpp = jpeg_color_space == JCS_CMYK ? 4.0 : 3.6;
+      config.max_dist = jpeg_color_space == JCS_CMYK ? 1.2 : 1.5;
+      all_tests.push_back(config);
+    }
+  }
+  {
+    TestConfig config;
+    config.input.color_space = JCS_YCbCr;
+    config.max_bpp = 1.6;
+    config.max_dist = 1.35;
+    all_tests.push_back(config);
+  }
+  for (bool xyb : {false, true}) {
+    TestConfig config;
+    config.input.color_space = JCS_GRAYSCALE;
+    config.jparams.xyb_mode = xyb;
+    config.max_bpp = 1.35;
+    config.max_dist = 1.4;
+    all_tests.push_back(config);
+  }
+  for (int channels = 1; channels <= 4; ++channels) {
+    TestConfig config;
+    config.input.color_space = JCS_UNKNOWN;
+    config.input.components = channels;
+    config.max_bpp = 1.35 * channels;
+    config.max_dist = 1.4;
+    all_tests.push_back(config);
+  }
+  for (size_t r : {1, 3, 17, 1024}) {
+    for (int progr : {0, 2}) {
+      TestConfig config;
+      config.jparams.restart_interval = r;
+      config.jparams.progressive_mode = progr;
+      config.max_bpp = 1.58 + 5.5 / r;
+      config.max_dist = 2.2;
+      all_tests.push_back(config);
+    }
+  }
+  for (size_t rr : {1, 3, 8, 100}) {
+    TestConfig config;
+    config.jparams.restart_in_rows = rr;
+    config.max_bpp = 1.6;
+    config.max_dist = 2.2;
+    all_tests.push_back(config);
+  }
+  for (int type : {0, 1, 10, 100, 10000}) {
+    for (int scale : {1, 50, 100, 200, 500}) {
+      for (bool add_raw : {false, true}) {
+        for (bool baseline : {true, false}) {
+          if (!baseline && (add_raw || type * scale < 25500)) continue;
+          TestConfig config;
+          config.input.xsize = 64;
+          config.input.ysize = 64;
+          CustomQuantTable table;
+          table.table_type = type;
+          table.scale_factor = scale;
+          table.force_baseline = baseline;
+          table.add_raw = add_raw;
+          table.Generate();
+          config.jparams.optimize_coding = 1;
+          config.jparams.quant_tables.push_back(table);
+          config.jparams.quant_indexes = {0, 0, 0};
+          float q = (type == 0 ? 16 : type) * scale * 0.01f;
+          if (baseline && !add_raw) q = std::max(1.0f, std::min(255.0f, q));
+          config.max_bpp = 1.5f + 25.0f / q;
+          config.max_dist = 0.6f + 0.25f * q;
+          all_tests.push_back(config);
+        }
+      }
+    }
+  }
+  for (int qidx = 0; qidx < 8; ++qidx) {
+    if (qidx == 3) continue;
+    TestConfig config;
+    config.input.xsize = 256;
+    config.input.ysize = 256;
+    config.jparams.quant_indexes = {(qidx >> 2) & 1, (qidx >> 1) & 1,
+                                    (qidx >> 0) & 1};
+    config.max_bpp = 2.25;
+    config.max_dist = 2.8;
+    all_tests.push_back(config);
+  }
+  for (int qidx = 0; qidx < 8; ++qidx) {
+    for (int slot_idx = 0; slot_idx < 2; ++slot_idx) {
+      if (qidx == 0 && slot_idx == 0) continue;
+      TestConfig config;
+      config.input.xsize = 256;
+      config.input.ysize = 256;
+      config.jparams.quant_indexes = {(qidx >> 2) & 1, (qidx >> 1) & 1,
+                                      (qidx >> 0) & 1};
+      CustomQuantTable table;
+      table.slot_idx = slot_idx;
+      table.Generate();
+      config.jparams.quant_tables.push_back(table);
+      config.max_bpp = 2.3;
+      config.max_dist = 2.9;
+      all_tests.push_back(config);
+    }
+  }
+  for (int qidx = 0; qidx < 8; ++qidx) {
+    for (bool xyb : {false, true}) {
+      TestConfig config;
+      config.input.xsize = 256;
+      config.input.ysize = 256;
+      config.jparams.xyb_mode = xyb;
+      config.jparams.quant_indexes = {(qidx >> 2) & 1, (qidx >> 1) & 1,
+                                      (qidx >> 0) & 1};
+      {
+        CustomQuantTable table;
+        table.slot_idx = 0;
+        table.Generate();
+        config.jparams.quant_tables.push_back(table);
+      }
+      {
+        CustomQuantTable table;
+        table.slot_idx = 1;
+        table.table_type = 20;
+        table.Generate();
+        config.jparams.quant_tables.push_back(table);
+      }
+      config.max_bpp = 2.0;
+      config.max_dist = 3.85;
+      all_tests.push_back(config);
+    }
+  }
+  for (bool xyb : {false, true}) {
+    TestConfig config;
+    config.input.xsize = 256;
+    config.input.ysize = 256;
+    config.jparams.xyb_mode = xyb;
+    config.jparams.quant_indexes = {0, 1, 2};
+    {
+      CustomQuantTable table;
+      table.slot_idx = 0;
+      table.Generate();
+      config.jparams.quant_tables.push_back(table);
+    }
+    {
+      CustomQuantTable table;
+      table.slot_idx = 1;
+      table.table_type = 20;
+      table.Generate();
+      config.jparams.quant_tables.push_back(table);
+    }
+    {
+      CustomQuantTable table;
+      table.slot_idx = 2;
+      table.table_type = 30;
+      table.Generate();
+      config.jparams.quant_tables.push_back(table);
+    }
+    config.max_bpp = 1.5;
+    config.max_dist = 3.75;
+    all_tests.push_back(config);
+  }
+  {
+    TestConfig config;
+    config.jparams.comp_ids = {7, 17, 177};
+    config.input.xsize = config.input.ysize = 128;
+    config.max_bpp = 2.25;
+    config.max_dist = 2.4;
+    all_tests.push_back(config);
+  }
+  for (int override_JFIF : {-1, 0, 1}) {
+    for (int override_Adobe : {-1, 0, 1}) {
+      if (override_JFIF == -1 && override_Adobe == -1) continue;
+      TestConfig config;
+      config.input.xsize = config.input.ysize = 128;
+      config.jparams.override_JFIF = override_JFIF;
+      config.jparams.override_Adobe = override_Adobe;
+      config.max_bpp = 2.25;
+      config.max_dist = 2.4;
+      all_tests.push_back(config);
+    }
+  }
+  {
+    TestConfig config;
+    config.input.xsize = config.input.ysize = 256;
+    config.max_bpp = 1.85;
+    config.max_dist = 2.05;
+    config.jparams.add_marker = true;
+    all_tests.push_back(config);
+  }
+  for (size_t icc_size : {728, 70000, 1000000}) {
+    TestConfig config;
+    config.input.xsize = config.input.ysize = 256;
+    config.max_dist = 2.05;
+    config.jparams.icc.resize(icc_size);
+    for (size_t i = 0; i < icc_size; ++i) {
+      config.jparams.icc[i] = (i * 17) & 0xff;
+    }
+    all_tests.push_back(config);
+  }
+  for (JpegIOMode input_mode : {PIXELS, RAW_DATA, COEFFICIENTS}) {
+    TestConfig config;
+    config.input.xsize = config.input.ysize = 256;
+    config.input_mode = input_mode;
+    if (input_mode == RAW_DATA) {
+      config.input.color_space = JCS_YCbCr;
+    }
+    config.jparams.progressive_mode = 0;
+    config.jparams.optimize_coding = 0;
+    config.max_bpp = 1.85;
+    config.max_dist = 2.05;
+    if (input_mode == COEFFICIENTS) {
+      config.max_bpp = 3.5;
+      config.max_dist = 0.0;
+    }
+    all_tests.push_back(config);
+    config.jparams.use_flat_dc_luma_code = true;
+    all_tests.push_back(config);
+  }
+  for (int xsize : {640, 641, 648, 649}) {
+    for (int ysize : {640, 641, 648, 649}) {
+      for (int h_sampling : {1, 2}) {
+        for (int v_sampling : {1, 2}) {
+          if (h_sampling == 1 && v_sampling == 1) continue;
+          for (int progr : {0, 2}) {
+            TestConfig config;
+            config.input.xsize = xsize;
+            config.input.ysize = ysize;
+            config.input.color_space = JCS_YCbCr;
+            config.jparams.h_sampling = {h_sampling, 1, 1};
+            config.jparams.v_sampling = {v_sampling, 1, 1};
+            config.jparams.progressive_mode = progr;
+            config.input_mode = RAW_DATA;
+            config.max_bpp = 1.75;
+            config.max_dist = 2.0;
+            all_tests.push_back(config);
+            config.input_mode = COEFFICIENTS;
+            if (xsize & 1) {
+              config.jparams.add_marker = true;
+            }
+            config.max_bpp = 24.0;
+            all_tests.push_back(config);
+          }
+        }
+      }
+    }
+  }
+  for (JpegliDataType data_type : {JPEGLI_TYPE_UINT16, JPEGLI_TYPE_FLOAT}) {
+    for (JpegliEndianness endianness :
+         {JPEGLI_LITTLE_ENDIAN, JPEGLI_BIG_ENDIAN, JPEGLI_NATIVE_ENDIAN}) {
+      J_COLOR_SPACE colorspace[4] = {JCS_GRAYSCALE, JCS_UNKNOWN, JCS_RGB,
+                                     JCS_CMYK};
+      float max_bpp[4] = {1.32, 2.7, 1.6, 4.0};
+      for (int channels = 1; channels <= 4; ++channels) {
+        TestConfig config;
+        config.input.data_type = data_type;
+        config.input.endianness = endianness;
+        config.input.components = channels;
+        config.input.color_space = colorspace[channels - 1];
+        config.max_bpp = max_bpp[channels - 1];
+        config.max_dist = 2.2;
+        all_tests.push_back(config);
+      }
+    }
+  }
+  for (int smoothing : {1, 5, 50, 100}) {
+    for (int h_sampling : {1, 2}) {
+      for (int v_sampling : {1, 2}) {
+        TestConfig config;
+        config.input.xsize = 257;
+        config.input.ysize = 265;
+        config.jparams.smoothing_factor = smoothing;
+        config.jparams.h_sampling = {h_sampling, 1, 1};
+        config.jparams.v_sampling = {v_sampling, 1, 1};
+        config.max_bpp = 1.85;
+        config.max_dist = 3.05f;
+        all_tests.push_back(config);
+      }
+    }
+  }
+  return all_tests;
+};
+
+std::ostream& operator<<(std::ostream& os, const TestConfig& c) {
+  os << c.input;
+  os << c.jparams;
+  if (c.input_mode == RAW_DATA) {
+    os << "RawDataIn";
+  } else if (c.input_mode == COEFFICIENTS) {
+    os << "WriteCoeffs";
+  }
+  return os;
+}
+
+std::string TestDescription(
+    const testing::TestParamInfo<EncodeAPITestParam::ParamType>& info) {
+  std::stringstream name;
+  name << info.param;
+  return name.str();
+}
+
+JPEGLI_INSTANTIATE_TEST_SUITE_P(EncodeAPITest, EncodeAPITestParam,
+                                testing::ValuesIn(GenerateTests()),
+                                TestDescription);
+}  // namespace
+}  // namespace jpegli
diff --git a/third_party/jpeg-xl/lib/jpegli/encode_internal.h b/third_party/jpeg-xl/lib/jpegli/encode_internal.h
new file mode 100644
index 0000000000..8f08272fd2
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/encode_internal.h
@@ -0,0 +1,101 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JPEGLI_ENCODE_INTERNAL_H_
+#define LIB_JPEGLI_ENCODE_INTERNAL_H_
+
+/* clang-format off */
+#include <stdint.h>
+#include <stdio.h>
+#include <jpeglib.h>
+/* clang-format on */
+
+#include "lib/jpegli/bit_writer.h"
+#include "lib/jpegli/common_internal.h"
+#include "lib/jpegli/encode.h"
+
+namespace jpegli {
+
+constexpr unsigned char kICCSignature[12] = {
+    0x49, 0x43, 0x43, 0x5F, 0x50, 0x52, 0x4F, 0x46, 0x49, 0x4C, 0x45, 0x00};
+constexpr int kICCMarker = JPEG_APP0 + 2;
+
+struct JPEGHuffmanCode {
+  // Bit length histogram.
+  uint32_t counts[kJpegHuffmanMaxBitLength + 1];
+  // Symbol values sorted by increasing bit lengths.
+  uint32_t values[kJpegHuffmanAlphabetSize + 1];
+  // The index of the Huffman code in the current set of Huffman codes. For AC
+  // component Huffman codes, 0x10 is added to the index.
+  int slot_id;
+  boolean sent_table;
+};
+
+// DCTCodingState: maximum number of correction bits to buffer
+const int kJPEGMaxCorrectionBits = 1u << 16;
+
+constexpr int kDefaultProgressiveLevel = 0;
+
+struct HuffmanCodeTable {
+  int depth[256];
+  int code[256];
+};
+
+struct ScanCodingInfo {
+  uint32_t dc_tbl_idx[MAX_COMPS_IN_SCAN];
+  uint32_t ac_tbl_idx[MAX_COMPS_IN_SCAN];
+  // Number of Huffman codes defined in the DHT segment preceding this scan.
+  size_t num_huffman_codes;
+};
+
+typedef int16_t coeff_t;
+
+}  // namespace jpegli
+
+struct jpeg_comp_master {
+  jpegli::RowBuffer<float> input_buffer[jpegli::kMaxComponents];
+  jpegli::RowBuffer<float>* smooth_input[jpegli::kMaxComponents];
+  jpegli::RowBuffer<float>* raw_data[jpegli::kMaxComponents];
+  bool force_baseline;
+  bool xyb_mode;
+  uint8_t cicp_transfer_function;
+  bool use_std_tables;
+  bool use_adaptive_quantization;
+  int progressive_level;
+  size_t xsize_blocks;
+  size_t ysize_blocks;
+  size_t blocks_per_iMCU_row;
+  jpegli::ScanCodingInfo* scan_coding_info;
+  JpegliDataType data_type;
+  JpegliEndianness endianness;
+  void (*input_method)(const uint8_t* row_in, size_t len,
+                       float* row_out[jpegli::kMaxComponents]);
+  void (*color_transform)(float* row[jpegli::kMaxComponents], size_t len);
+  void (*downsample_method[jpegli::kMaxComponents])(
+      float* rows_in[MAX_SAMP_FACTOR], size_t len, float* row_out);
+  float* quant_mul[jpegli::kMaxComponents];
+  float* zero_bias_offset[jpegli::kMaxComponents];
+  float* zero_bias_mul[jpegli::kMaxComponents];
+  int h_factor[jpegli::kMaxComponents];
+  int v_factor[jpegli::kMaxComponents];
+  jpegli::JPEGHuffmanCode* huffman_codes;
+  size_t num_huffman_codes;
+  jpegli::HuffmanCodeTable huff_tables[8];
+  float* diff_buffer;
+  jpegli::RowBuffer<float> fuzzy_erosion_tmp;
+  jpegli::RowBuffer<float> pre_erosion;
+  jpegli::RowBuffer<float> quant_field;
+  jvirt_barray_ptr* coeff_buffers;
+  size_t next_input_row;
+  size_t next_iMCU_row;
+  size_t last_dht_index;
+  size_t last_restart_interval;
+  JCOEF last_dc_coeff[MAX_COMPS_IN_SCAN];
+  jpegli::JpegBitWriter bw;
+  float* dct_buffer;
+  int32_t* block_tmp;
+};
+
+#endif  // LIB_JPEGLI_ENCODE_INTERNAL_H_
diff --git a/third_party/jpeg-xl/lib/jpegli/entropy_coding.cc b/third_party/jpeg-xl/lib/jpegli/entropy_coding.cc
new file mode 100644
index 0000000000..110a36a3e9
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/entropy_coding.cc
@@ -0,0 +1,605 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/entropy_coding.h"
+
+#include "lib/jpegli/encode_internal.h"
+#include "lib/jpegli/error.h"
+#include "lib/jpegli/huffman.h"
+#include "lib/jxl/base/bits.h"
+
+namespace jpegli {
+namespace {
+
+float HistogramCost(const Histogram& histo) {
+  std::vector<uint32_t> counts(kJpegHuffmanAlphabetSize + 1);
+  std::vector<uint8_t> depths(kJpegHuffmanAlphabetSize + 1);
+  for (size_t i = 0; i < kJpegHuffmanAlphabetSize; ++i) {
+    counts[i] = histo.count[i];
+  }
+  counts[kJpegHuffmanAlphabetSize] = 1;
+  CreateHuffmanTree(counts.data(), counts.size(), kJpegHuffmanMaxBitLength,
+                    &depths[0]);
+  size_t header_bits = (1 + kJpegHuffmanMaxBitLength) * 8;
+  size_t data_bits = 0;
+  for (size_t i = 0; i < kJpegHuffmanAlphabetSize; ++i) {
+    if (depths[i] > 0) {
+      header_bits += 8;
+      data_bits += counts[i] * depths[i];
+    }
+  }
+  return header_bits + data_bits;
+}
+
+void AddHistograms(const Histogram& a, const Histogram& b, Histogram* c) {
+  for (size_t i = 0; i < kJpegHuffmanAlphabetSize; ++i) {
+    c->count[i] = a.count[i] + b.count[i];
+  }
+}
+
+bool IsEmptyHistogram(const Histogram& histo) {
+  for (size_t i = 0; i < kJpegHuffmanAlphabetSize; ++i) {
+    if (histo.count[i]) return false;
+  }
+  return true;
+}
+
+}  // namespace
+
+void ClusterJpegHistograms(const Histogram* histograms, size_t num,
+                           JpegClusteredHistograms* clusters) {
+  clusters->histogram_indexes.resize(num);
+  std::vector<uint32_t> slot_histograms;
+  std::vector<float> slot_costs;
+  for (size_t i = 0; i < num; ++i) {
+    const Histogram& cur = histograms[i];
+    if (IsEmptyHistogram(cur)) {
+      continue;
+    }
+    float best_cost = HistogramCost(cur);
+    size_t best_slot = slot_histograms.size();
+    for (size_t j = 0; j < slot_histograms.size(); ++j) {
+      size_t prev_idx = slot_histograms[j];
+      const Histogram& prev = clusters->histograms[prev_idx];
+      Histogram combined;
+      AddHistograms(prev, cur, &combined);
+      float combined_cost = HistogramCost(combined);
+      float cost = combined_cost - slot_costs[j];
+      if (cost < best_cost) {
+        best_cost = cost;
+        best_slot = j;
+      }
+    }
+    if (best_slot == slot_histograms.size()) {
+      // Create new histogram.
+      size_t histogram_index = clusters->histograms.size();
+      clusters->histograms.push_back(cur);
+      clusters->histogram_indexes[i] = histogram_index;
+      if (best_slot < 4) {
+        // We have a free slot, so we put the new histogram there.
+        slot_histograms.push_back(histogram_index);
+        slot_costs.push_back(best_cost);
+      } else {
+        // TODO(szabadka) Find the best histogram to replce.
+        best_slot = (clusters->slot_ids.back() + 1) % 4;
+      }
+      slot_histograms[best_slot] = histogram_index;
+      slot_costs[best_slot] = best_cost;
+      clusters->slot_ids.push_back(best_slot);
+    } else {
+      // Merge this histogram with a previous one.
+      size_t histogram_index = slot_histograms[best_slot];
+      const Histogram& prev = clusters->histograms[histogram_index];
+      AddHistograms(prev, cur, &clusters->histograms[histogram_index]);
+      clusters->histogram_indexes[i] = histogram_index;
+      JXL_ASSERT(clusters->slot_ids[histogram_index] == best_slot);
+      slot_costs[best_slot] += best_cost;
+    }
+  }
+}
+
+void BuildJpegHuffmanCode(const Histogram& histo, JPEGHuffmanCode* huff) {
+  std::vector<uint32_t> counts(kJpegHuffmanAlphabetSize + 1);
+  std::vector<uint8_t> depths(kJpegHuffmanAlphabetSize + 1);
+  for (size_t j = 0; j < kJpegHuffmanAlphabetSize; ++j) {
+    counts[j] = histo.count[j];
+  }
+  counts[kJpegHuffmanAlphabetSize] = 1;
+  CreateHuffmanTree(counts.data(), counts.size(), kJpegHuffmanMaxBitLength,
+                    &depths[0]);
+  std::fill(std::begin(huff->counts), std::end(huff->counts), 0);
+  std::fill(std::begin(huff->values), std::end(huff->values), 0);
+  for (size_t i = 0; i <= kJpegHuffmanAlphabetSize; ++i) {
+    if (depths[i] > 0) {
+      ++huff->counts[depths[i]];
+    }
+  }
+  int offset[kJpegHuffmanMaxBitLength + 1] = {0};
+  for (size_t i = 1; i <= kJpegHuffmanMaxBitLength; ++i) {
+    offset[i] = offset[i - 1] + huff->counts[i - 1];
+  }
+  for (size_t i = 0; i <= kJpegHuffmanAlphabetSize; ++i) {
+    if (depths[i] > 0) {
+      huff->values[offset[depths[i]]++] = i;
+    }
+  }
+}
+
+void AddJpegHuffmanCode(const Histogram& histogram, size_t slot_id,
+                        JPEGHuffmanCode* huff_codes, size_t* num_huff_codes) {
+  JPEGHuffmanCode huff_code = {};
+  huff_code.slot_id = slot_id;
+  BuildJpegHuffmanCode(histogram, &huff_code);
+  memcpy(&huff_codes[*num_huff_codes], &huff_code, sizeof(huff_code));
+  ++(*num_huff_codes);
+}
+
+namespace {
+void SetJpegHuffmanCode(const JpegClusteredHistograms& clusters,
+                        size_t histogram_id, size_t slot_id_offset,
+                        std::vector<uint32_t>& slot_histograms,
+                        uint32_t* slot_id, bool* is_baseline,
+                        JPEGHuffmanCode* huff_codes, size_t* num_huff_codes) {
+  JXL_ASSERT(histogram_id < clusters.histogram_indexes.size());
+  uint32_t histogram_index = clusters.histogram_indexes[histogram_id];
+  uint32_t id = clusters.slot_ids[histogram_index];
+  if (id > 1) {
+    *is_baseline = false;
+  }
+  *slot_id = id + (slot_id_offset / 4);
+  if (slot_histograms[id] != histogram_index) {
+    AddJpegHuffmanCode(clusters.histograms[histogram_index],
+                       slot_id_offset + id, huff_codes, num_huff_codes);
+    slot_histograms[id] = histogram_index;
+  }
+}
+
+struct DCTState {
+  int eob_run = 0;
+  size_t num_refinement_bits = 0;
+  Histogram* ac_histo = nullptr;
+};
+
+static JXL_INLINE void ProcessFlush(DCTState* s) {
+  if (s->eob_run > 0) {
+    int nbits = jxl::FloorLog2Nonzero<uint32_t>(s->eob_run);
+    int symbol = nbits << 4u;
+    ++s->ac_histo->count[symbol];
+    s->eob_run = 0;
+  }
+  s->num_refinement_bits = 0;
+}
+
+static JXL_INLINE void ProcessEndOfBand(DCTState* s, size_t new_refinement_bits,
+                                        Histogram* new_ac_histo) {
+  if (s->eob_run == 0) {
+    s->ac_histo = new_ac_histo;
+  }
+  ++s->eob_run;
+  s->num_refinement_bits += new_refinement_bits;
+  if (s->eob_run == 0x7FFF ||
+      s->num_refinement_bits > kJPEGMaxCorrectionBits - kDCTBlockSize + 1) {
+    ProcessFlush(s);
+  }
+}
+
+bool ProcessDCTBlockSequential(const coeff_t* coeffs, Histogram* dc_histo,
+                               Histogram* ac_histo, coeff_t* last_dc_coeff) {
+  coeff_t temp2;
+  coeff_t temp;
+  temp2 = coeffs[0];
+  temp = temp2 - *last_dc_coeff;
+  *last_dc_coeff = temp2;
+  temp2 = temp;
+  if (temp < 0) {
+    temp = -temp;
+    if (temp < 0) return false;
+    temp2--;
+  }
+  int dc_nbits = (temp == 0) ? 0 : (jxl::FloorLog2Nonzero<uint32_t>(temp) + 1);
+  ++dc_histo->count[dc_nbits];
+  if (dc_nbits >= 12) return false;
+  int r = 0;
+  for (int k = 1; k < 64; ++k) {
+    if ((temp = coeffs[kJPEGNaturalOrder[k]]) == 0) {
+      r++;
+      continue;
+    }
+    if (temp < 0) {
+      temp = -temp;
+      if (temp < 0) return false;
+      temp2 = ~temp;
+    } else {
+      temp2 = temp;
+    }
+    while (r > 15) {
+      ++ac_histo->count[0xf0];
+      r -= 16;
+    }
+    int ac_nbits = jxl::FloorLog2Nonzero<uint32_t>(temp) + 1;
+    if (ac_nbits >= 16) return false;
+    int symbol = (r << 4u) + ac_nbits;
+    ++ac_histo->count[symbol];
+    r = 0;
+  }
+  if (r > 0) {
+    ++ac_histo->count[0];
+  }
+  return true;
+}
+
+bool ProcessDCTBlockProgressive(const coeff_t* coeffs, Histogram* dc_histo,
+                                Histogram* ac_histo, int Ss, int Se, int Al,
+                                DCTState* s, coeff_t* last_dc_coeff) {
+  bool eob_run_allowed = Ss > 0;
+  coeff_t temp2;
+  coeff_t temp;
+  if (Ss == 0) {
+    temp2 = coeffs[0] >> Al;
+    temp = temp2 - *last_dc_coeff;
+    *last_dc_coeff = temp2;
+    temp2 = temp;
+    if (temp < 0) {
+      temp = -temp;
+      if (temp < 0) return false;
+      temp2--;
+    }
+    int nbits = (temp == 0) ? 0 : (jxl::FloorLog2Nonzero<uint32_t>(temp) + 1);
+    ++dc_histo->count[nbits];
+    ++Ss;
+  }
+  if (Ss > Se) {
+    return true;
+  }
+  int r = 0;
+  for (int k = Ss; k <= Se; ++k) {
+    if ((temp = coeffs[kJPEGNaturalOrder[k]]) == 0) {
+      r++;
+      continue;
+    }
+    if (temp < 0) {
+      temp = -temp;
+      if (temp < 0) return false;
+      temp >>= Al;
+      temp2 = ~temp;
+    } else {
+      temp >>= Al;
+      temp2 = temp;
+    }
+    if (temp == 0) {
+      r++;
+      continue;
+    }
+    ProcessFlush(s);
+    while (r > 15) {
+      ++ac_histo->count[0xf0];
+      r -= 16;
+    }
+    int nbits = jxl::FloorLog2Nonzero<uint32_t>(temp) + 1;
+    int symbol = (r << 4u) + nbits;
+    ++ac_histo->count[symbol];
+    r = 0;
+  }
+  if (r > 0) {
+    ProcessEndOfBand(s, 0, ac_histo);
+    if (!eob_run_allowed) {
+      ProcessFlush(s);
+    }
+  }
+  return true;
+}
+
+bool ProcessRefinementBits(const coeff_t* coeffs, Histogram* ac_histo, int Ss,
+                           int Se, int Al, DCTState* s) {
+  bool eob_run_allowed = Ss > 0;
+  if (Ss == 0) {
+    ++Ss;
+  }
+  if (Ss > Se) {
+    return true;
+  }
+  int abs_values[kDCTBlockSize];
+  int eob = 0;
+  for (int k = Ss; k <= Se; k++) {
+    const coeff_t abs_val = std::abs(coeffs[kJPEGNaturalOrder[k]]);
+    abs_values[k] = abs_val >> Al;
+    if (abs_values[k] == 1) {
+      eob = k;
+    }
+  }
+  int r = 0;
+  size_t num_refinement_bits = 0;
+  for (int k = Ss; k <= Se; k++) {
+    if (abs_values[k] == 0) {
+      r++;
+      continue;
+    }
+    while (r > 15 && k <= eob) {
+      ProcessFlush(s);
+      ++ac_histo->count[0xf0];
+      r -= 16;
+      num_refinement_bits = 0;
+    }
+    if (abs_values[k] > 1) {
+      ++num_refinement_bits;
+      continue;
+    }
+    ProcessFlush(s);
+    int symbol = (r << 4u) + 1;
+    ++ac_histo->count[symbol];
+    num_refinement_bits = 0;
+    r = 0;
+  }
+  if (r > 0 || num_refinement_bits > 0) {
+    ProcessEndOfBand(s, num_refinement_bits, ac_histo);
+    if (!eob_run_allowed) {
+      ProcessFlush(s);
+    }
+  }
+  return true;
+}
+
+void ProgressMonitorHistogramPass(j_compress_ptr cinfo, size_t scan_index,
+                                  size_t mcu_y) {
+  if (cinfo->progress == nullptr) {
+    return;
+  }
+  cinfo->progress->completed_passes = 1 + scan_index;
+  cinfo->progress->pass_counter = mcu_y;
+  cinfo->progress->pass_limit = cinfo->total_iMCU_rows;
+  (*cinfo->progress->progress_monitor)(reinterpret_cast<j_common_ptr>(cinfo));
+}
+
+bool ProcessScan(j_compress_ptr cinfo,
+                 size_t scan_index, int* histo_index, Histogram* dc_histograms,
+                 Histogram* ac_histograms) {
+  jpeg_comp_master* m = cinfo->master;
+  size_t restart_interval = RestartIntervalForScan(cinfo, scan_index);
+  int restarts_to_go = restart_interval;
+  coeff_t last_dc_coeff[MAX_COMPS_IN_SCAN] = {0};
+  DCTState s;
+
+  const jpeg_scan_info* scan_info = &cinfo->scan_info[scan_index];
+  // "Non-interleaved" means color data comes in separate scans, in other words
+  // each scan can contain only one color component.
+  const bool is_interleaved = (scan_info->comps_in_scan > 1);
+  jpeg_component_info* base_comp =
+      &cinfo->comp_info[scan_info->component_index[0]];
+  // h_group / v_group act as numerators for converting number of blocks to
+  // number of MCU. In interleaved mode it is 1, so MCU is represented with
+  // max_*_samp_factor blocks. In non-interleaved mode we choose numerator to
+  // be the samping factor, consequently MCU is always represented with single
+  // block.
+  const int h_group = is_interleaved ? 1 : base_comp->h_samp_factor;
+  const int v_group = is_interleaved ? 1 : base_comp->v_samp_factor;
+  int MCUs_per_row =
+      DivCeil(cinfo->image_width * h_group, 8 * cinfo->max_h_samp_factor);
+  int MCU_rows =
+      DivCeil(cinfo->image_height * v_group, 8 * cinfo->max_v_samp_factor);
+  const bool is_progressive = cinfo->progressive_mode;
+  const int Al = scan_info->Al;
+  const int Ah = scan_info->Ah;
+  const int Ss = scan_info->Ss;
+  const int Se = scan_info->Se;
+  constexpr coeff_t kDummyBlock[DCTSIZE2] = {0};
+
+  JBLOCKARRAY ba[MAX_COMPS_IN_SCAN];
+  for (int mcu_y = 0; mcu_y < MCU_rows; ++mcu_y) {
+    ProgressMonitorHistogramPass(cinfo, scan_index, mcu_y);
+    for (int i = 0; i < scan_info->comps_in_scan; ++i) {
+      int comp_idx = scan_info->component_index[i];
+      jpeg_component_info* comp = &cinfo->comp_info[comp_idx];
+      int n_blocks_y = is_interleaved ? comp->v_samp_factor : 1;
+      int by0 = mcu_y * n_blocks_y;
+      int block_rows_left = comp->height_in_blocks - by0;
+      int max_block_rows = std::min(n_blocks_y, block_rows_left);
+      ba[i] = (*cinfo->mem->access_virt_barray)(
+          reinterpret_cast<j_common_ptr>(cinfo), m->coeff_buffers[comp_idx],
+          by0, max_block_rows, false);
+    }
+    for (int mcu_x = 0; mcu_x < MCUs_per_row; ++mcu_x) {
+      // Possibly emit a restart marker.
+      if (restart_interval > 0 && restarts_to_go == 0) {
+        ProcessFlush(&s);
+        restarts_to_go = restart_interval;
+        memset(last_dc_coeff, 0, sizeof(last_dc_coeff));
+      }
+      // Encode one MCU
+      for (int i = 0; i < scan_info->comps_in_scan; ++i) {
+        int comp_idx = scan_info->component_index[i];
+        jpeg_component_info* comp = &cinfo->comp_info[comp_idx];
+        int histo_idx = *histo_index + i;
+        Histogram* dc_histo = &dc_histograms[histo_idx];
+        Histogram* ac_histo = &ac_histograms[histo_idx];
+        int n_blocks_y = is_interleaved ? comp->v_samp_factor : 1;
+        int n_blocks_x = is_interleaved ? comp->h_samp_factor : 1;
+        for (int iy = 0; iy < n_blocks_y; ++iy) {
+          for (int ix = 0; ix < n_blocks_x; ++ix) {
+            size_t block_y = mcu_y * n_blocks_y + iy;
+            size_t block_x = mcu_x * n_blocks_x + ix;
+            const coeff_t* block;
+            if (block_x >= comp->width_in_blocks ||
+                block_y >= comp->height_in_blocks) {
+              block = kDummyBlock;
+            } else {
+              block = &ba[i][iy][block_x][0];
+            }
+            bool ok;
+            if (!is_progressive) {
+              ok = ProcessDCTBlockSequential(block, dc_histo, ac_histo,
+                                             last_dc_coeff + i);
+            } else if (Ah == 0) {
+              ok = ProcessDCTBlockProgressive(block, dc_histo, ac_histo, Ss, Se,
+                                              Al, &s, last_dc_coeff + i);
+            } else {
+              ok = ProcessRefinementBits(block, ac_histo, Ss, Se, Al, &s);
+            }
+            if (!ok) return false;
+          }
+        }
+      }
+      --restarts_to_go;
+    }
+  }
+  ProcessFlush(&s);
+  *histo_index += scan_info->comps_in_scan;
+  return true;
+}
+
+void ProcessJpeg(j_compress_ptr cinfo,
+                 std::vector<Histogram>* dc_histograms,
+                 std::vector<Histogram>* ac_histograms) {
+  int histo_index = 0;
+  for (int i = 0; i < cinfo->num_scans; ++i) {
+    if (!ProcessScan(cinfo, i, &histo_index, &(*dc_histograms)[0],
+                     &(*ac_histograms)[0])) {
+      JPEGLI_ERROR("Invalid scan.");
+    }
+  }
+}
+
+void CopyHuffmanTable(j_compress_ptr cinfo, int index, bool is_dc,
+                      JPEGHuffmanCode* huffman_codes,
+                      size_t* num_huffman_codes) {
+  const char* type = is_dc ? "DC" : "AC";
+  if (index < 0 || index >= NUM_HUFF_TBLS) {
+    JPEGLI_ERROR("Invalid %s Huffman table index %d", type, index);
+  }
+  JHUFF_TBL* table =
+      is_dc ? cinfo->dc_huff_tbl_ptrs[index] : cinfo->ac_huff_tbl_ptrs[index];
+  if (table == nullptr) {
+    JPEGLI_ERROR("Missing %s Huffman table %d", type, index);
+  }
+  ValidateHuffmanTable(reinterpret_cast<j_common_ptr>(cinfo), table, is_dc);
+  JPEGHuffmanCode huff = {};
+  size_t max_depth = 0;
+  for (size_t i = 1; i <= kJpegHuffmanMaxBitLength; ++i) {
+    if (table->bits[i] != 0) max_depth = i;
+    huff.counts[i] = table->bits[i];
+  }
+  ++huff.counts[max_depth];
+  for (size_t i = 0; i < kJpegHuffmanAlphabetSize; ++i) {
+    huff.values[i] = table->huffval[i];
+  }
+  huff.slot_id = index + (is_dc ? 0 : 0x10);
+  huff.sent_table = table->sent_table;
+  bool have_slot = false;
+  for (size_t i = 0; i < *num_huffman_codes; ++i) {
+    if (huffman_codes[i].slot_id == huff.slot_id) have_slot = true;
+  }
+  if (!have_slot) {
+    memcpy(&huffman_codes[*num_huffman_codes], &huff, sizeof(huff));
+    ++(*num_huffman_codes);
+  }
+}
+
+}  // namespace
+
+void CopyHuffmanCodes(j_compress_ptr cinfo, bool* is_baseline) {
+  jpeg_comp_master* m = cinfo->master;
+  m->huffman_codes =
+      Allocate<JPEGHuffmanCode>(cinfo, 2 * cinfo->num_components, JPOOL_IMAGE);
+  for (int c = 0; c < cinfo->num_components; ++c) {
+    jpeg_component_info* comp = &cinfo->comp_info[c];
+    if (comp->dc_tbl_no > 1 || comp->ac_tbl_no > 1) {
+      *is_baseline = false;
+    }
+    CopyHuffmanTable(cinfo, comp->dc_tbl_no, /*is_dc=*/true, m->huffman_codes,
+                     &m->num_huffman_codes);
+    CopyHuffmanTable(cinfo, comp->ac_tbl_no, /*is_dc=*/false, m->huffman_codes,
+                     &m->num_huffman_codes);
+  }
+  for (int i = 0; i < cinfo->num_scans; ++i) {
+    const jpeg_scan_info* si = &cinfo->scan_info[i];
+    ScanCodingInfo sci = {};
+    for (int j = 0; j < si->comps_in_scan; ++j) {
+      int ci = si->component_index[j];
+      sci.dc_tbl_idx[j] = cinfo->comp_info[ci].dc_tbl_no;
+      sci.ac_tbl_idx[j] = cinfo->comp_info[ci].ac_tbl_no + 4;
+    }
+    if (i == 0) {
+      sci.num_huffman_codes = m->num_huffman_codes;
+    }
+    memcpy(&m->scan_coding_info[i], &sci, sizeof(sci));
+  }
+}
+
+size_t RestartIntervalForScan(j_compress_ptr cinfo, size_t scan_index) {
+  if (cinfo->restart_in_rows <= 0) {
+    return cinfo->restart_interval;
+  } else {
+    const jpeg_scan_info* scan_info = &cinfo->scan_info[scan_index];
+    const bool is_interleaved = (scan_info->comps_in_scan > 1);
+    jpeg_component_info* base_comp =
+        &cinfo->comp_info[scan_info->component_index[0]];
+    const int h_group = is_interleaved ? 1 : base_comp->h_samp_factor;
+    int MCUs_per_row =
+        DivCeil(cinfo->image_width * h_group, 8 * cinfo->max_h_samp_factor);
+    return std::min<size_t>(MCUs_per_row * cinfo->restart_in_rows, 65535u);
+  }
+}
+
+void OptimizeHuffmanCodes(j_compress_ptr cinfo, bool* is_baseline) {
+  jpeg_comp_master* m = cinfo->master;
+  // Gather histograms.
+  size_t num_histo = 0;
+  for (int i = 0; i < cinfo->num_scans; ++i) {
+    num_histo += cinfo->scan_info[i].comps_in_scan;
+  }
+  std::vector<Histogram> dc_histograms(num_histo);
+  std::vector<Histogram> ac_histograms(num_histo);
+  ProcessJpeg(cinfo, &dc_histograms, &ac_histograms);
+
+  // Cluster DC histograms.
+  JpegClusteredHistograms dc_clusters;
+  ClusterJpegHistograms(dc_histograms.data(), dc_histograms.size(),
+                        &dc_clusters);
+
+  // Cluster AC histograms.
+  JpegClusteredHistograms ac_clusters;
+  ClusterJpegHistograms(ac_histograms.data(), ac_histograms.size(),
+                        &ac_clusters);
+
+  // Add the first 4 DC and AC histograms in the first DHT segment.
+  std::vector<uint32_t> dc_slot_histograms;
+  std::vector<uint32_t> ac_slot_histograms;
+  m->huffman_codes = Allocate<JPEGHuffmanCode>(cinfo, num_histo, JPOOL_IMAGE);
+  for (size_t i = 0; i < dc_clusters.histograms.size(); ++i) {
+    if (i >= 4) break;
+    JXL_ASSERT(dc_clusters.slot_ids[i] == i);
+    AddJpegHuffmanCode(dc_clusters.histograms[i], i, m->huffman_codes,
+                       &m->num_huffman_codes);
+    dc_slot_histograms.push_back(i);
+  }
+  for (size_t i = 0; i < ac_clusters.histograms.size(); ++i) {
+    if (i >= 4) break;
+    JXL_ASSERT(ac_clusters.slot_ids[i] == i);
+    AddJpegHuffmanCode(ac_clusters.histograms[i], 0x10 + i, m->huffman_codes,
+                       &m->num_huffman_codes);
+    ac_slot_histograms.push_back(i);
+  }
+
+  // Set the Huffman table indexes in the scan_infos and emit additional DHT
+  // segments if necessary.
+  size_t histogram_id = 0;
+  size_t num_huffman_codes_sent = 0;
+  for (int i = 0; i < cinfo->num_scans; ++i) {
+    ScanCodingInfo sci = {};
+    for (int c = 0; c < cinfo->scan_info[i].comps_in_scan; ++c) {
+      SetJpegHuffmanCode(dc_clusters, histogram_id, 0, dc_slot_histograms,
+                         &sci.dc_tbl_idx[c], is_baseline, m->huffman_codes,
+                         &m->num_huffman_codes);
+      SetJpegHuffmanCode(ac_clusters, histogram_id, 0x10, ac_slot_histograms,
+                         &sci.ac_tbl_idx[c], is_baseline, m->huffman_codes,
+                         &m->num_huffman_codes);
+      ++histogram_id;
+    }
+    sci.num_huffman_codes = m->num_huffman_codes - num_huffman_codes_sent;
+    num_huffman_codes_sent = m->num_huffman_codes;
+    memcpy(&m->scan_coding_info[i], &sci, sizeof(sci));
+  }
+}
+
+}  // namespace jpegli
diff --git a/third_party/jpeg-xl/lib/jpegli/entropy_coding.h b/third_party/jpeg-xl/lib/jpegli/entropy_coding.h
new file mode 100644
index 0000000000..6d9dd2303b
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/entropy_coding.h
@@ -0,0 +1,45 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JPEGLI_ENTROPY_CODING_H_
+#define LIB_JPEGLI_ENTROPY_CODING_H_
+
+/* clang-format off */
+#include <stdio.h>
+#include <jpeglib.h>
+/* clang-format on */
+
+#include <vector>
+
+#include "lib/jpegli/encode_internal.h"
+
+namespace jpegli {
+
+void CopyHuffmanCodes(j_compress_ptr cinfo, bool* is_baseline);
+
+size_t RestartIntervalForScan(j_compress_ptr cinfo, size_t scan_index);
+
+struct Histogram {
+  int count[kJpegHuffmanAlphabetSize];
+  Histogram() { memset(count, 0, sizeof(count)); }
+};
+
+struct JpegClusteredHistograms {
+  std::vector<Histogram> histograms;
+  std::vector<uint32_t> histogram_indexes;
+  std::vector<uint32_t> slot_ids;
+};
+
+void ClusterJpegHistograms(const Histogram* histograms, size_t num,
+                           JpegClusteredHistograms* clusters);
+
+void AddJpegHuffmanCode(const Histogram& histogram, size_t slot_id,
+                        JPEGHuffmanCode* huff_codes, size_t* num_huff_codes);
+
+void OptimizeHuffmanCodes(j_compress_ptr cinfo, bool* is_baseline);
+
+}  // namespace jpegli
+
+#endif  // LIB_JPEGLI_ENTROPY_CODING_H_
diff --git a/third_party/jpeg-xl/lib/jpegli/error.cc b/third_party/jpeg-xl/lib/jpegli/error.cc
new file mode 100644
index 0000000000..289261672d
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/error.cc
@@ -0,0 +1,102 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/error.h"
+
+#include <setjmp.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <string>
+
+#include "lib/jpegli/common.h"
+
+namespace jpegli {
+
+const char* const kErrorMessageTable[] = {
+    "Message codes are not supported, error message is in msg_parm.s string",
+};
+
+bool FormatString(char* buffer, const char* format, ...) {
+  va_list args;
+  va_start(args, format);
+  vsnprintf(buffer, JMSG_STR_PARM_MAX, format, args);
+  va_end(args);
+  return false;
+}
+
+void ExitWithAbort(j_common_ptr cinfo) {
+  (*cinfo->err->output_message)(cinfo);
+  jpegli_destroy(cinfo);
+  exit(EXIT_FAILURE);
+}
+
+void EmitMessage(j_common_ptr cinfo, int msg_level) {
+  if (msg_level < 0) {
+    if (cinfo->err->num_warnings <= 5 || cinfo->err->trace_level >= 3) {
+      (*cinfo->err->output_message)(cinfo);
+    }
+    ++cinfo->err->num_warnings;
+  } else if (cinfo->err->trace_level >= msg_level) {
+    (*cinfo->err->output_message)(cinfo);
+  }
+}
+
+void OutputMessage(j_common_ptr cinfo) {
+  char buffer[JMSG_LENGTH_MAX];
+  (*cinfo->err->format_message)(cinfo, buffer);
+  fprintf(stderr, "%s\n", buffer);
+}
+
+void FormatMessage(j_common_ptr cinfo, char* buffer) {
+  jpeg_error_mgr* err = cinfo->err;
+  int code = err->msg_code;
+  if (code == 0) {
+    memcpy(buffer, cinfo->err->msg_parm.s, JMSG_STR_PARM_MAX);
+  } else if (err->addon_message_table != nullptr &&
+             code >= err->first_addon_message &&
+             code <= err->last_addon_message) {
+    std::string msg(err->addon_message_table[code - err->first_addon_message]);
+    if (msg.find("%s") != std::string::npos) {
+      snprintf(buffer, JMSG_LENGTH_MAX, msg.data(), err->msg_parm.s);
+    } else {
+      snprintf(buffer, JMSG_LENGTH_MAX, msg.data(), err->msg_parm.i[0],
+               err->msg_parm.i[1], err->msg_parm.i[2], err->msg_parm.i[3],
+               err->msg_parm.i[4], err->msg_parm.i[5], err->msg_parm.i[6],
+               err->msg_parm.i[7]);
+    }
+  } else {
+    snprintf(buffer, JMSG_LENGTH_MAX, "%s", kErrorMessageTable[0]);
+  }
+}
+
+void ResetErrorManager(j_common_ptr cinfo) {
+  memset(cinfo->err->msg_parm.s, 0, JMSG_STR_PARM_MAX);
+  cinfo->err->msg_code = 0;
+  cinfo->err->num_warnings = 0;
+}
+
+}  // namespace jpegli
+
+struct jpeg_error_mgr* jpegli_std_error(struct jpeg_error_mgr* err) {
+  err->error_exit = jpegli::ExitWithAbort;
+  err->emit_message = jpegli::EmitMessage;
+  err->output_message = jpegli::OutputMessage;
+  err->format_message = jpegli::FormatMessage;
+  err->reset_error_mgr = jpegli::ResetErrorManager;
+  memset(err->msg_parm.s, 0, JMSG_STR_PARM_MAX);
+  err->trace_level = 0;
+  err->num_warnings = 0;
+  // We don't support message codes and message table, but we define one here
+  // in case the application has a custom format_message and tries to access
+  // these fields there.
+  err->msg_code = 0;
+  err->jpeg_message_table = jpegli::kErrorMessageTable;
+  err->last_jpeg_message = 0;
+  err->addon_message_table = nullptr;
+  err->first_addon_message = 0;
+  err->last_addon_message = 0;
+  return err;
+}
diff --git a/third_party/jpeg-xl/lib/jpegli/error.h b/third_party/jpeg-xl/lib/jpegli/error.h
new file mode 100644
index 0000000000..9de030ab3b
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/error.h
@@ -0,0 +1,39 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JPEGLI_ERROR_H_
+#define LIB_JPEGLI_ERROR_H_
+
+/* clang-format off */
+#include <stdint.h>
+#include <stdio.h>
+#include <jpeglib.h>
+#include <stdarg.h>
+/* clang-format on */
+
+namespace jpegli {
+
+bool FormatString(char* buffer, const char* format, ...);
+
+}  // namespace jpegli
+
+#define JPEGLI_ERROR(format, ...)                                            \
+  jpegli::FormatString(cinfo->err->msg_parm.s, ("%s:%d: " format), __FILE__, \
+                       __LINE__, ##__VA_ARGS__),                             \
+      (*cinfo->err->error_exit)(reinterpret_cast<j_common_ptr>(cinfo))
+
+#define JPEGLI_WARN(format, ...)                                             \
+  jpegli::FormatString(cinfo->err->msg_parm.s, ("%s:%d: " format), __FILE__, \
+                       __LINE__, ##__VA_ARGS__),                             \
+      (*cinfo->err->emit_message)(reinterpret_cast<j_common_ptr>(cinfo), -1)
+
+#define JPEGLI_TRACE(level, format, ...)                                     \
+  if (cinfo->err->trace_level >= (level))                                    \
+  jpegli::FormatString(cinfo->err->msg_parm.s, ("%s:%d: " format), __FILE__, \
+                       __LINE__, ##__VA_ARGS__),                             \
+      (*cinfo->err->emit_message)(reinterpret_cast<j_common_ptr>(cinfo),     \
+                                  (level))
+
+#endif  // LIB_JPEGLI_ERROR_H_
diff --git a/third_party/jpeg-xl/lib/jpegli/error_handling_test.cc b/third_party/jpeg-xl/lib/jpegli/error_handling_test.cc
new file mode 100644
index 0000000000..f652993827
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/error_handling_test.cc
@@ -0,0 +1,1290 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/decode.h"
+#include "lib/jpegli/encode.h"
+#include "lib/jpegli/error.h"
+#include "lib/jpegli/test_utils.h"
+#include "lib/jpegli/testing.h"
+#include "lib/jxl/sanitizers.h"
+
+namespace jpegli {
+namespace {
+
+TEST(EncoderErrorHandlingTest, MinimalSuccess) {
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  {
+    jpeg_compress_struct cinfo;
+    const auto try_catch_block = [&]() -> bool {
+      ERROR_HANDLER_SETUP(jpegli);
+      jpegli_create_compress(&cinfo);
+      jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+      cinfo.image_width = 1;
+      cinfo.image_height = 1;
+      cinfo.input_components = 1;
+      jpegli_set_defaults(&cinfo);
+      jpegli_start_compress(&cinfo, TRUE);
+      JSAMPLE image[1] = {0};
+      JSAMPROW row[] = {image};
+      jpegli_write_scanlines(&cinfo, row, 1);
+      jpegli_finish_compress(&cinfo);
+      return true;
+    };
+    EXPECT_TRUE(try_catch_block());
+    jpegli_destroy_compress(&cinfo);
+  }
+  {
+    jpeg_decompress_struct cinfo = {};
+    const auto try_catch_block = [&]() -> bool {
+      ERROR_HANDLER_SETUP(jpeg);
+      jpeg_create_decompress(&cinfo);
+      jpeg_mem_src(&cinfo, buffer, buffer_size);
+      jpeg_read_header(&cinfo, TRUE);
+      EXPECT_EQ(1, cinfo.image_width);
+      EXPECT_EQ(1, cinfo.image_height);
+      jpeg_start_decompress(&cinfo);
+      JSAMPLE image[1];
+      JSAMPROW row[] = {image};
+      jpeg_read_scanlines(&cinfo, row, 1);
+      jxl::msan::UnpoisonMemory(image, 1);
+      EXPECT_EQ(0, image[0]);
+      jpeg_finish_decompress(&cinfo);
+      return true;
+    };
+    EXPECT_TRUE(try_catch_block());
+    jpeg_destroy_decompress(&cinfo);
+  }
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, NoDestination) {
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 1;
+    jpegli_set_defaults(&cinfo);
+    jpegli_start_compress(&cinfo, TRUE);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+}
+
+TEST(EncoderErrorHandlingTest, NoImageDimensions) {
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.input_components = 1;
+    jpegli_set_defaults(&cinfo);
+    jpegli_start_compress(&cinfo, TRUE);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, ImageTooBig) {
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 100000;
+    cinfo.image_height = 1;
+    cinfo.input_components = 1;
+    jpegli_set_defaults(&cinfo);
+    jpegli_start_compress(&cinfo, TRUE);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, NoInputComponents) {
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    jpegli_set_defaults(&cinfo);
+    jpegli_start_compress(&cinfo, TRUE);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, TooManyInputComponents) {
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 1000;
+    jpegli_set_defaults(&cinfo);
+    jpegli_start_compress(&cinfo, TRUE);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, NoSetDefaults) {
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 1;
+    jpegli_start_compress(&cinfo, TRUE);
+    JSAMPLE image[1] = {0};
+    JSAMPROW row[] = {image};
+    jpegli_write_scanlines(&cinfo, row, 1);
+    jpegli_finish_compress(&cinfo);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, NoStartCompress) {
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 1;
+    jpegli_set_defaults(&cinfo);
+    JSAMPLE image[1] = {0};
+    JSAMPROW row[] = {image};
+    jpegli_write_scanlines(&cinfo, row, 1);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, NoWriteScanlines) {
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 1;
+    jpegli_set_defaults(&cinfo);
+    jpegli_start_compress(&cinfo, TRUE);
+    jpegli_finish_compress(&cinfo);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, NoWriteAllScanlines) {
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 2;
+    cinfo.input_components = 1;
+    jpegli_set_defaults(&cinfo);
+    jpegli_start_compress(&cinfo, TRUE);
+    JSAMPLE image[1] = {0};
+    JSAMPROW row[] = {image};
+    jpegli_write_scanlines(&cinfo, row, 1);
+    jpegli_finish_compress(&cinfo);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, InvalidQuantValue) {
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 1;
+    jpegli_set_defaults(&cinfo);
+    cinfo.quant_tbl_ptrs[0] = jpegli_alloc_quant_table((j_common_ptr)&cinfo);
+    for (size_t k = 0; k < DCTSIZE2; ++k) {
+      cinfo.quant_tbl_ptrs[0]->quantval[k] = 0;
+    }
+    jpegli_start_compress(&cinfo, TRUE);
+    JSAMPLE image[1] = {0};
+    JSAMPROW row[] = {image};
+    jpegli_write_scanlines(&cinfo, row, 1);
+    jpegli_finish_compress(&cinfo);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, InvalidQuantTableIndex) {
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 1;
+    jpegli_set_defaults(&cinfo);
+    cinfo.comp_info[0].quant_tbl_no = 3;
+    jpegli_start_compress(&cinfo, TRUE);
+    JSAMPLE image[1] = {0};
+    JSAMPROW row[] = {image};
+    jpegli_write_scanlines(&cinfo, row, 1);
+    jpegli_finish_compress(&cinfo);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, NumberOfComponentsMismatch1) {
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 1;
+    jpegli_set_defaults(&cinfo);
+    cinfo.num_components = 100;
+    jpegli_start_compress(&cinfo, TRUE);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, NumberOfComponentsMismatch2) {
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 1;
+    jpegli_set_defaults(&cinfo);
+    cinfo.num_components = 2;
+    jpegli_start_compress(&cinfo, TRUE);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, NumberOfComponentsMismatch3) {
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 1;
+    jpegli_set_defaults(&cinfo);
+    cinfo.num_components = 2;
+    cinfo.comp_info[1].h_samp_factor = cinfo.comp_info[1].v_samp_factor = 1;
+    jpegli_start_compress(&cinfo, TRUE);
+    JSAMPLE image[1] = {0};
+    JSAMPROW row[] = {image};
+    jpegli_write_scanlines(&cinfo, row, 1);
+    jpegli_finish_compress(&cinfo);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, NumberOfComponentsMismatch4) {
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 1;
+    cinfo.in_color_space = JCS_RGB;
+    jpegli_set_defaults(&cinfo);
+    jpegli_start_compress(&cinfo, TRUE);
+    JSAMPLE image[1] = {0};
+    JSAMPROW row[] = {image};
+    jpegli_write_scanlines(&cinfo, row, 1);
+    jpegli_finish_compress(&cinfo);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, NumberOfComponentsMismatch5) {
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 3;
+    cinfo.in_color_space = JCS_GRAYSCALE;
+    jpegli_set_defaults(&cinfo);
+    jpegli_start_compress(&cinfo, TRUE);
+    JSAMPLE image[3] = {0};
+    JSAMPROW row[] = {image};
+    jpegli_write_scanlines(&cinfo, row, 1);
+    jpegli_finish_compress(&cinfo);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, NumberOfComponentsMismatch6) {
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 3;
+    cinfo.in_color_space = JCS_RGB;
+    jpegli_set_defaults(&cinfo);
+    cinfo.num_components = 2;
+    jpegli_start_compress(&cinfo, TRUE);
+    JSAMPLE image[3] = {0};
+    JSAMPROW row[] = {image};
+    jpegli_write_scanlines(&cinfo, row, 1);
+    jpegli_finish_compress(&cinfo);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, InvalidColorTransform) {
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 3;
+    cinfo.in_color_space = JCS_YCbCr;
+    jpegli_set_defaults(&cinfo);
+    cinfo.jpeg_color_space = JCS_RGB;
+    jpegli_start_compress(&cinfo, TRUE);
+    JSAMPLE image[3] = {0};
+    JSAMPROW row[] = {image};
+    jpegli_write_scanlines(&cinfo, row, 1);
+    jpegli_finish_compress(&cinfo);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, DuplicateComponentIds) {
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 3;
+    jpegli_set_defaults(&cinfo);
+    cinfo.comp_info[0].component_id = 0;
+    cinfo.comp_info[1].component_id = 0;
+    jpegli_start_compress(&cinfo, TRUE);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, InvalidComponentIndex) {
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 3;
+    jpegli_set_defaults(&cinfo);
+    cinfo.comp_info[0].component_index = 17;
+    jpegli_start_compress(&cinfo, TRUE);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, ArithmeticCoding) {
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 3;
+    jpegli_set_defaults(&cinfo);
+    cinfo.arith_code = TRUE;
+    jpegli_start_compress(&cinfo, TRUE);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, CCIR601Sampling) {
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 3;
+    jpegli_set_defaults(&cinfo);
+    cinfo.CCIR601_sampling = TRUE;
+    jpegli_start_compress(&cinfo, TRUE);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, InvalidScanScript1) {
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 1;
+    jpegli_set_defaults(&cinfo);
+    static constexpr jpeg_scan_info kScript[] = {{1, {0}, 0, 63, 0, 0}};  //
+    cinfo.scan_info = kScript;
+    cinfo.num_scans = 0;
+    jpegli_start_compress(&cinfo, TRUE);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, InvalidScanScript2) {
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 1;
+    jpegli_set_defaults(&cinfo);
+    static constexpr jpeg_scan_info kScript[] = {{2, {0, 1}, 0, 63, 0, 0}};  //
+    cinfo.scan_info = kScript;
+    cinfo.num_scans = ARRAY_SIZE(kScript);
+    jpegli_start_compress(&cinfo, TRUE);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, InvalidScanScript3) {
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 1;
+    jpegli_set_defaults(&cinfo);
+    static constexpr jpeg_scan_info kScript[] = {{5, {0}, 0, 63, 0, 0}};  //
+    cinfo.scan_info = kScript;
+    cinfo.num_scans = ARRAY_SIZE(kScript);
+    jpegli_start_compress(&cinfo, TRUE);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, InvalidScanScript4) {
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 2;
+    jpegli_set_defaults(&cinfo);
+    static constexpr jpeg_scan_info kScript[] = {{2, {0, 0}, 0, 63, 0, 0}};  //
+    cinfo.scan_info = kScript;
+    cinfo.num_scans = ARRAY_SIZE(kScript);
+    jpegli_start_compress(&cinfo, TRUE);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, InvalidScanScript5) {
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 2;
+    jpegli_set_defaults(&cinfo);
+    static constexpr jpeg_scan_info kScript[] = {{2, {1, 0}, 0, 63, 0, 0}};  //
+    cinfo.scan_info = kScript;
+    cinfo.num_scans = ARRAY_SIZE(kScript);
+    jpegli_start_compress(&cinfo, TRUE);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, InvalidScanScript6) {
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 1;
+    jpegli_set_defaults(&cinfo);
+    static constexpr jpeg_scan_info kScript[] = {{1, {0}, 0, 64, 0, 0}};  //
+    cinfo.scan_info = kScript;
+    cinfo.num_scans = ARRAY_SIZE(kScript);
+    jpegli_start_compress(&cinfo, TRUE);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, InvalidScanScript7) {
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 1;
+    jpegli_set_defaults(&cinfo);
+    static constexpr jpeg_scan_info kScript[] = {{1, {0}, 2, 1, 0, 0}};  //
+    cinfo.scan_info = kScript;
+    cinfo.num_scans = ARRAY_SIZE(kScript);
+    jpegli_start_compress(&cinfo, TRUE);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, InvalidScanScript8) {
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 2;
+    jpegli_set_defaults(&cinfo);
+    static constexpr jpeg_scan_info kScript[] = {
+        {1, {0}, 0, 63, 0, 0}, {1, {1}, 0, 0, 0, 0}, {1, {1}, 1, 63, 0, 0}  //
+    };
+    cinfo.scan_info = kScript;
+    cinfo.num_scans = ARRAY_SIZE(kScript);
+    jpegli_start_compress(&cinfo, TRUE);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, InvalidScanScript9) {
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 1;
+    jpegli_set_defaults(&cinfo);
+    static constexpr jpeg_scan_info kScript[] = {
+        {1, {0}, 0, 1, 0, 0}, {1, {0}, 2, 63, 0, 0},  //
+    };
+    cinfo.scan_info = kScript;
+    cinfo.num_scans = ARRAY_SIZE(kScript);
+    jpegli_start_compress(&cinfo, TRUE);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, InvalidScanScript10) {
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 2;
+    jpegli_set_defaults(&cinfo);
+    static constexpr jpeg_scan_info kScript[] = {
+        {2, {0, 1}, 0, 0, 0, 0}, {2, {0, 1}, 1, 63, 0, 0}  //
+    };
+    cinfo.scan_info = kScript;
+    cinfo.num_scans = ARRAY_SIZE(kScript);
+    jpegli_start_compress(&cinfo, TRUE);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, InvalidScanScript11) {
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 1;
+    jpegli_set_defaults(&cinfo);
+    static constexpr jpeg_scan_info kScript[] = {
+        {1, {0}, 1, 63, 0, 0}, {1, {0}, 0, 0, 0, 0}  //
+    };
+    cinfo.scan_info = kScript;
+    cinfo.num_scans = ARRAY_SIZE(kScript);
+    jpegli_start_compress(&cinfo, TRUE);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, InvalidScanScript12) {
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 1;
+    jpegli_set_defaults(&cinfo);
+    static constexpr jpeg_scan_info kScript[] = {
+        {1, {0}, 0, 0, 10, 1}, {1, {0}, 0, 0, 1, 0}, {1, {0}, 1, 63, 0, 0}  //
+    };
+    cinfo.scan_info = kScript;
+    cinfo.num_scans = ARRAY_SIZE(kScript);
+    jpegli_start_compress(&cinfo, TRUE);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, InvalidScanScript13) {
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 1;
+    jpegli_set_defaults(&cinfo);
+    static constexpr jpeg_scan_info kScript[] = {
+        {1, {0}, 0, 0, 0, 2},
+        {1, {0}, 0, 0, 1, 0},
+        {1, {0}, 0, 0, 2, 1},  //
+        {1, {0}, 1, 63, 0, 0}  //
+    };
+    cinfo.scan_info = kScript;
+    cinfo.num_scans = ARRAY_SIZE(kScript);
+    jpegli_start_compress(&cinfo, TRUE);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, MCUSizeTooBig) {
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 3;
+    jpegli_set_defaults(&cinfo);
+    jpegli_set_progressive_level(&cinfo, 0);
+    cinfo.comp_info[0].h_samp_factor = 3;
+    cinfo.comp_info[0].v_samp_factor = 3;
+    jpegli_start_compress(&cinfo, TRUE);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, RestartIntervalTooBig) {
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 1;
+    jpegli_set_defaults(&cinfo);
+    cinfo.restart_interval = 1000000;
+    jpegli_start_compress(&cinfo, TRUE);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, SamplingFactorTooBig) {
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 3;
+    jpegli_set_defaults(&cinfo);
+    cinfo.comp_info[0].h_samp_factor = 5;
+    jpegli_start_compress(&cinfo, TRUE);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, NonIntegralSamplingRatio) {
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 3;
+    jpegli_set_defaults(&cinfo);
+    cinfo.comp_info[0].h_samp_factor = 3;
+    cinfo.comp_info[1].h_samp_factor = 2;
+    jpegli_start_compress(&cinfo, TRUE);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+constexpr const char* kAddOnTable[] = {"First message",
+                                       "Second message with int param %d",
+                                       "Third message with string param %s"};
+
+TEST(EncoderErrorHandlingTest, AddOnTableNoParam) {
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    cinfo.err->addon_message_table = kAddOnTable;
+    cinfo.err->first_addon_message = 10000;
+    cinfo.err->last_addon_message = 10002;
+    cinfo.err->msg_code = 10000;
+    (*cinfo.err->error_exit)(reinterpret_cast<j_common_ptr>(&cinfo));
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+}
+
+TEST(EncoderErrorHandlingTest, AddOnTableIntParam) {
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    cinfo.err->addon_message_table = kAddOnTable;
+    cinfo.err->first_addon_message = 10000;
+    cinfo.err->last_addon_message = 10002;
+    cinfo.err->msg_code = 10001;
+    cinfo.err->msg_parm.i[0] = 17;
+    (*cinfo.err->error_exit)(reinterpret_cast<j_common_ptr>(&cinfo));
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+}
+
+TEST(EncoderErrorHandlingTest, AddOnTableNoStringParam) {
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    cinfo.err->addon_message_table = kAddOnTable;
+    cinfo.err->first_addon_message = 10000;
+    cinfo.err->last_addon_message = 10002;
+    cinfo.err->msg_code = 10002;
+    memcpy(cinfo.err->msg_parm.s, "MESSAGE PARAM", 14);
+    (*cinfo.err->error_exit)(reinterpret_cast<j_common_ptr>(&cinfo));
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+}
+
+static const uint8_t kCompressed0[] = {
+    // SOI
+    0xff, 0xd8,  //
+    // DQT
+    0xff, 0xdb, 0x00, 0x43, 0x00, 0x03, 0x02, 0x02, 0x03, 0x02,  //
+    0x02, 0x03, 0x03, 0x03, 0x03, 0x04, 0x03, 0x03, 0x04, 0x05,  //
+    0x08, 0x05, 0x05, 0x04, 0x04, 0x05, 0x0a, 0x07, 0x07, 0x06,  //
+    0x08, 0x0c, 0x0a, 0x0c, 0x0c, 0x0b, 0x0a, 0x0b, 0x0b, 0x0d,  //
+    0x0e, 0x12, 0x10, 0x0d, 0x0e, 0x11, 0x0e, 0x0b, 0x0b, 0x10,  //
+    0x16, 0x10, 0x11, 0x13, 0x14, 0x15, 0x15, 0x15, 0x0c, 0x0f,  //
+    0x17, 0x18, 0x16, 0x14, 0x18, 0x12, 0x14, 0x15, 0x14,        //
+    // SOF
+    0xff, 0xc0, 0x00, 0x0b, 0x08, 0x00, 0x01, 0x00, 0x01, 0x01,  //
+    0x01, 0x11, 0x00,                                            //
+    // DHT
+    0xff, 0xc4, 0x00, 0xd2, 0x00, 0x00, 0x01, 0x05, 0x01, 0x01,  //
+    0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  //
+    0x00, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,  //
+    0x09, 0x0a, 0x0b, 0x10, 0x00, 0x02, 0x01, 0x03, 0x03, 0x02,  //
+    0x04, 0x03, 0x05, 0x05, 0x04, 0x04, 0x00, 0x00, 0x01, 0x7d,  //
+    0x01, 0x02, 0x03, 0x00, 0x04, 0x11, 0x05, 0x12, 0x21, 0x31,  //
+    0x41, 0x06, 0x13, 0x51, 0x61, 0x07, 0x22, 0x71, 0x14, 0x32,  //
+    0x81, 0x91, 0xa1, 0x08, 0x23, 0x42, 0xb1, 0xc1, 0x15, 0x52,  //
+    0xd1, 0xf0, 0x24, 0x33, 0x62, 0x72, 0x82, 0x09, 0x0a, 0x16,  //
+    0x17, 0x18, 0x19, 0x1a, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a,  //
+    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x43, 0x44, 0x45,  //
+    0x46, 0x47, 0x48, 0x49, 0x4a, 0x53, 0x54, 0x55, 0x56, 0x57,  //
+    0x58, 0x59, 0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,  //
+    0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x83,  //
+    0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x92, 0x93, 0x94,  //
+    0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0xa2, 0xa3, 0xa4, 0xa5,  //
+    0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6,  //
+    0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,  //
+    0xc8, 0xc9, 0xca, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8,  //
+    0xd9, 0xda, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8,  //
+    0xe9, 0xea, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8,  //
+    0xf9, 0xfa,                                                  //
+    // SOS
+    0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x00, 0x3f, 0x00,  //
+    // entropy coded data
+    0xfc, 0xaa, 0xaf,  //
+    // EOI
+    0xff, 0xd9,  //
+};
+static const size_t kLen0 = sizeof(kCompressed0);
+
+static const size_t kDQTOffset = 2;
+static const size_t kSOFOffset = 71;
+static const size_t kDHTOffset = 84;
+static const size_t kSOSOffset = 296;
+
+TEST(DecoderErrorHandlingTest, MinimalSuccess) {
+  JXL_CHECK(kCompressed0[kDQTOffset] == 0xff);
+  JXL_CHECK(kCompressed0[kSOFOffset] == 0xff);
+  JXL_CHECK(kCompressed0[kDHTOffset] == 0xff);
+  JXL_CHECK(kCompressed0[kSOSOffset] == 0xff);
+  jpeg_decompress_struct cinfo = {};
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_decompress(&cinfo);
+    jpegli_mem_src(&cinfo, kCompressed0, kLen0);
+    jpegli_read_header(&cinfo, TRUE);
+    EXPECT_EQ(1, cinfo.image_width);
+    EXPECT_EQ(1, cinfo.image_height);
+    jpegli_start_decompress(&cinfo);
+    JSAMPLE image[1];
+    JSAMPROW row[] = {image};
+    jpegli_read_scanlines(&cinfo, row, 1);
+    EXPECT_EQ(0, image[0]);
+    jpegli_finish_decompress(&cinfo);
+    return true;
+  };
+  EXPECT_TRUE(try_catch_block());
+  jpegli_destroy_decompress(&cinfo);
+}
+
+TEST(DecoderErrorHandlingTest, NoSource) {
+  jpeg_decompress_struct cinfo = {};
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_decompress(&cinfo);
+    jpegli_read_header(&cinfo, TRUE);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_decompress(&cinfo);
+}
+
+TEST(DecoderErrorHandlingTest, NoReadHeader) {
+  jpeg_decompress_struct cinfo = {};
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_decompress(&cinfo);
+    jpegli_mem_src(&cinfo, kCompressed0, kLen0);
+    jpegli_start_decompress(&cinfo);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_decompress(&cinfo);
+}
+
+TEST(DecoderErrorHandlingTest, NoStartDecompress) {
+  jpeg_decompress_struct cinfo = {};
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_decompress(&cinfo);
+    jpegli_mem_src(&cinfo, kCompressed0, kLen0);
+    jpegli_read_header(&cinfo, TRUE);
+    EXPECT_EQ(1, cinfo.image_width);
+    EXPECT_EQ(1, cinfo.image_height);
+    JSAMPLE image[1];
+    JSAMPROW row[] = {image};
+    jpegli_read_scanlines(&cinfo, row, 1);
+    EXPECT_EQ(0, image[0]);
+    jpegli_finish_decompress(&cinfo);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_decompress(&cinfo);
+}
+
+TEST(DecoderErrorHandlingTest, NoReadScanlines) {
+  jpeg_decompress_struct cinfo = {};
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_decompress(&cinfo);
+    jpegli_mem_src(&cinfo, kCompressed0, kLen0);
+    jpegli_read_header(&cinfo, TRUE);
+    EXPECT_EQ(1, cinfo.image_width);
+    EXPECT_EQ(1, cinfo.image_height);
+    jpegli_start_decompress(&cinfo);
+    jpegli_finish_decompress(&cinfo);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_decompress(&cinfo);
+}
+
+static const size_t kMaxImageWidth = 0xffff;
+JSAMPLE kOutputBuffer[MAX_COMPONENTS * kMaxImageWidth];
+
+bool ParseCompressed(const std::vector<uint8_t>& compressed) {
+  jpeg_decompress_struct cinfo = {};
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_decompress(&cinfo);
+    jpegli_mem_src(&cinfo, compressed.data(), compressed.size());
+    jpegli_read_header(&cinfo, TRUE);
+    jpegli_start_decompress(&cinfo);
+    for (JDIMENSION i = 0; i < cinfo.output_height; ++i) {
+      JSAMPROW row[] = {kOutputBuffer};
+      jpegli_read_scanlines(&cinfo, row, 1);
+    }
+    jpegli_finish_decompress(&cinfo);
+    return true;
+  };
+  bool retval = try_catch_block();
+  jpegli_destroy_decompress(&cinfo);
+  return retval;
+}
+
+TEST(DecoderErrorHandlingTest, NoSOI) {
+  for (int pos : {0, 1}) {
+    std::vector<uint8_t> compressed(kCompressed0, kCompressed0 + kLen0);
+    compressed[pos] = 0;
+    EXPECT_FALSE(ParseCompressed(compressed));
+  }
+}
+
+TEST(DecoderErrorHandlingTest, InvalidDQT) {
+  // Bad marker length
+  for (int diff : {-2, -1, 1, 2}) {
+    std::vector<uint8_t> compressed(kCompressed0, kCompressed0 + kLen0);
+    compressed[kDQTOffset + 3] += diff;
+    EXPECT_FALSE(ParseCompressed(compressed));
+  }
+  // inavlid table index / precision
+  for (int val : {0x20, 0x05}) {
+    std::vector<uint8_t> compressed(kCompressed0, kCompressed0 + kLen0);
+    compressed[kDQTOffset + 4] = val;
+    EXPECT_FALSE(ParseCompressed(compressed));
+  }
+  // zero quant value
+  for (int k : {0, 1, 17, 63}) {
+    std::vector<uint8_t> compressed(kCompressed0, kCompressed0 + kLen0);
+    compressed[kDQTOffset + 5 + k] = 0;
+    EXPECT_FALSE(ParseCompressed(compressed));
+  }
+}
+
+TEST(DecoderErrorHandlingTest, InvalidSOF) {
+  // Bad marker length
+  for (int diff : {-2, -1, 1, 2}) {
+    std::vector<uint8_t> compressed(kCompressed0, kCompressed0 + kLen0);
+    compressed[kSOFOffset + 3] += diff;
+    EXPECT_FALSE(ParseCompressed(compressed));
+  }
+  // zero width, height or num_components
+  for (int pos : {6, 8, 9}) {
+    std::vector<uint8_t> compressed(kCompressed0, kCompressed0 + kLen0);
+    compressed[kSOFOffset + pos] = 0;
+    EXPECT_FALSE(ParseCompressed(compressed));
+  }
+  // invalid data precision
+  for (int val : {0, 1, 127}) {
+    std::vector<uint8_t> compressed(kCompressed0, kCompressed0 + kLen0);
+    compressed[kSOFOffset + 4] = val;
+    EXPECT_FALSE(ParseCompressed(compressed));
+  }
+  // too many num_components
+  for (int val : {5, 255}) {
+    std::vector<uint8_t> compressed(kCompressed0, kCompressed0 + kLen0);
+    compressed[kSOFOffset + 9] = val;
+    EXPECT_FALSE(ParseCompressed(compressed));
+  }
+  // invalid sampling factors
+  for (int val : {0x00, 0x01, 0x10, 0x15, 0x51}) {
+    std::vector<uint8_t> compressed(kCompressed0, kCompressed0 + kLen0);
+    compressed[kSOFOffset + 11] = val;
+    EXPECT_FALSE(ParseCompressed(compressed));
+  }
+  // invalid quant table index
+  for (int val : {5, 17}) {
+    std::vector<uint8_t> compressed(kCompressed0, kCompressed0 + kLen0);
+    compressed[kSOFOffset + 12] = val;
+    EXPECT_FALSE(ParseCompressed(compressed));
+  }
+}
+
+TEST(DecoderErrorHandlingTest, InvalidDHT) {
+  // Bad marker length
+  for (int diff : {-2, -1, 1, 2}) {
+    std::vector<uint8_t> compressed(kCompressed0, kCompressed0 + kLen0);
+    compressed[kDHTOffset + 3] += diff;
+    EXPECT_FALSE(ParseCompressed(compressed));
+  }
+  {
+    std::vector<uint8_t> compressed(kCompressed0, kCompressed0 + kLen0);
+    compressed[kDHTOffset + 2] += 17;
+    EXPECT_FALSE(ParseCompressed(compressed));
+  }
+  // inavlid table slot_id
+  for (int val : {0x05, 0x15, 0x20}) {
+    std::vector<uint8_t> compressed(kCompressed0, kCompressed0 + kLen0);
+    compressed[kDHTOffset + 4] = val;
+    EXPECT_FALSE(ParseCompressed(compressed));
+  }
+}
+
+TEST(DecoderErrorHandlingTest, InvalidSOS) {
+  // Invalid comps_in_scan
+  for (int val : {2, 5, 17}) {
+    std::vector<uint8_t> compressed(kCompressed0, kCompressed0 + kLen0);
+    compressed[kSOSOffset + 4] = val;
+    EXPECT_FALSE(ParseCompressed(compressed));
+  }
+  // invalid Huffman table indexes
+  for (int val : {0x05, 0x50, 0x15, 0x51}) {
+    std::vector<uint8_t> compressed(kCompressed0, kCompressed0 + kLen0);
+    compressed[kSOSOffset + 6] = val;
+    EXPECT_FALSE(ParseCompressed(compressed));
+  }
+  // invalid Ss/Se
+  for (int pos : {7, 8}) {
+    std::vector<uint8_t> compressed(kCompressed0, kCompressed0 + kLen0);
+    compressed[kSOSOffset + pos] = 64;
+    EXPECT_FALSE(ParseCompressed(compressed));
+  }
+}
+
+TEST(DecoderErrorHandlingTest, MutateSingleBytes) {
+  for (size_t pos = 0; pos < kLen0; ++pos) {
+    std::vector<uint8_t> compressed(kCompressed0, kCompressed0 + kLen0);
+    for (int val : {0x00, 0x0f, 0xf0, 0xff}) {
+      compressed[pos] = val;
+      ParseCompressed(compressed);
+    }
+  }
+}
+
+}  // namespace
+}  // namespace jpegli
diff --git a/third_party/jpeg-xl/lib/jpegli/huffman.cc b/third_party/jpeg-xl/lib/jpegli/huffman.cc
new file mode 100644
index 0000000000..1cf88a5536
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/huffman.cc
@@ -0,0 +1,321 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/huffman.h"
+
+#include <limits>
+#include <vector>
+
+#include "lib/jpegli/common.h"
+#include "lib/jpegli/error.h"
+
+namespace jpegli {
+
+// Returns the table width of the next 2nd level table, count is the histogram
+// of bit lengths for the remaining symbols, len is the code length of the next
+// processed symbol.
+static inline int NextTableBitSize(const int* count, int len) {
+  int left = 1 << (len - kJpegHuffmanRootTableBits);
+  while (len < static_cast<int>(kJpegHuffmanMaxBitLength)) {
+    left -= count[len];
+    if (left <= 0) break;
+    ++len;
+    left <<= 1;
+  }
+  return len - kJpegHuffmanRootTableBits;
+}
+
+void BuildJpegHuffmanTable(const uint32_t* count, const uint32_t* symbols,
+                           HuffmanTableEntry* lut) {
+  HuffmanTableEntry code;    // current table entry
+  HuffmanTableEntry* table;  // next available space in table
+  int len;                   // current code length
+  int idx;                   // symbol index
+  int key;                   // prefix code
+  int reps;                  // number of replicate key values in current table
+  int low;                   // low bits for current root entry
+  int table_bits;            // key length of current table
+  int table_size;            // size of current table
+
+  // Make a local copy of the input bit length histogram.
+  int tmp_count[kJpegHuffmanMaxBitLength + 1] = {0};
+  int total_count = 0;
+  for (len = 1; len <= static_cast<int>(kJpegHuffmanMaxBitLength); ++len) {
+    tmp_count[len] = count[len];
+    total_count += tmp_count[len];
+  }
+
+  table = lut;
+  table_bits = kJpegHuffmanRootTableBits;
+  table_size = 1 << table_bits;
+
+  // Special case code with only one value.
+  if (total_count == 1) {
+    code.bits = 0;
+    code.value = symbols[0];
+    for (key = 0; key < table_size; ++key) {
+      table[key] = code;
+    }
+    return;
+  }
+
+  // Fill in root table.
+  key = 0;
+  idx = 0;
+  for (len = 1; len <= kJpegHuffmanRootTableBits; ++len) {
+    for (; tmp_count[len] > 0; --tmp_count[len]) {
+      code.bits = len;
+      code.value = symbols[idx++];
+      reps = 1 << (kJpegHuffmanRootTableBits - len);
+      while (reps--) {
+        table[key++] = code;
+      }
+    }
+  }
+
+  // Fill in 2nd level tables and add pointers to root table.
+  table += table_size;
+  table_size = 0;
+  low = 0;
+  for (len = kJpegHuffmanRootTableBits + 1;
+       len <= static_cast<int>(kJpegHuffmanMaxBitLength); ++len) {
+    for (; tmp_count[len] > 0; --tmp_count[len]) {
+      // Start a new sub-table if the previous one is full.
+      if (low >= table_size) {
+        table += table_size;
+        table_bits = NextTableBitSize(tmp_count, len);
+        table_size = 1 << table_bits;
+        low = 0;
+        lut[key].bits = table_bits + kJpegHuffmanRootTableBits;
+        lut[key].value = (table - lut) - key;
+        ++key;
+      }
+      code.bits = len - kJpegHuffmanRootTableBits;
+      code.value = symbols[idx++];
+      reps = 1 << (table_bits - code.bits);
+      while (reps--) {
+        table[low++] = code;
+      }
+    }
+  }
+}
+
+// A node of a Huffman tree.
+struct HuffmanTree {
+  HuffmanTree(uint32_t count, int16_t left, int16_t right)
+      : total_count(count), index_left(left), index_right_or_value(right) {}
+  uint32_t total_count;
+  int16_t index_left;
+  int16_t index_right_or_value;
+};
+
+void SetDepth(const HuffmanTree& p, HuffmanTree* pool, uint8_t* depth,
+              uint8_t level) {
+  if (p.index_left >= 0) {
+    ++level;
+    SetDepth(pool[p.index_left], pool, depth, level);
+    SetDepth(pool[p.index_right_or_value], pool, depth, level);
+  } else {
+    depth[p.index_right_or_value] = level;
+  }
+}
+
+// Sort the root nodes, least popular first.
+static JXL_INLINE bool Compare(const HuffmanTree& v0, const HuffmanTree& v1) {
+  return v0.total_count < v1.total_count;
+}
+
+// This function will create a Huffman tree.
+//
+// The catch here is that the tree cannot be arbitrarily deep.
+// Brotli specifies a maximum depth of 15 bits for "code trees"
+// and 7 bits for "code length code trees."
+//
+// count_limit is the value that is to be faked as the minimum value
+// and this minimum value is raised until the tree matches the
+// maximum length requirement.
+//
+// This algorithm is not of excellent performance for very long data blocks,
+// especially when population counts are longer than 2**tree_limit, but
+// we are not planning to use this with extremely long blocks.
+//
+// See http://en.wikipedia.org/wiki/Huffman_coding
+void CreateHuffmanTree(const uint32_t* data, const size_t length,
+                       const int tree_limit, uint8_t* depth) {
+  // For block sizes below 64 kB, we never need to do a second iteration
+  // of this loop. Probably all of our block sizes will be smaller than
+  // that, so this loop is mostly of academic interest. If we actually
+  // would need this, we would be better off with the Katajainen algorithm.
+  for (uint32_t count_limit = 1;; count_limit *= 2) {
+    std::vector<HuffmanTree> tree;
+    tree.reserve(2 * length + 1);
+
+    for (size_t i = length; i != 0;) {
+      --i;
+      if (data[i]) {
+        const uint32_t count = std::max(data[i], count_limit - 1);
+        tree.emplace_back(count, -1, static_cast<int16_t>(i));
+      }
+    }
+
+    const size_t n = tree.size();
+    if (n == 1) {
+      // Fake value; will be fixed on upper level.
+      depth[tree[0].index_right_or_value] = 1;
+      break;
+    }
+
+    std::stable_sort(tree.begin(), tree.end(), Compare);
+
+    // The nodes are:
+    // [0, n): the sorted leaf nodes that we start with.
+    // [n]: we add a sentinel here.
+    // [n + 1, 2n): new parent nodes are added here, starting from
+    //              (n+1). These are naturally in ascending order.
+    // [2n]: we add a sentinel at the end as well.
+    // There will be (2n+1) elements at the end.
+    const HuffmanTree sentinel(std::numeric_limits<uint32_t>::max(), -1, -1);
+    tree.push_back(sentinel);
+    tree.push_back(sentinel);
+
+    size_t i = 0;      // Points to the next leaf node.
+    size_t j = n + 1;  // Points to the next non-leaf node.
+    for (size_t k = n - 1; k != 0; --k) {
+      size_t left, right;
+      if (tree[i].total_count <= tree[j].total_count) {
+        left = i;
+        ++i;
+      } else {
+        left = j;
+        ++j;
+      }
+      if (tree[i].total_count <= tree[j].total_count) {
+        right = i;
+        ++i;
+      } else {
+        right = j;
+        ++j;
+      }
+
+      // The sentinel node becomes the parent node.
+      size_t j_end = tree.size() - 1;
+      tree[j_end].total_count =
+          tree[left].total_count + tree[right].total_count;
+      tree[j_end].index_left = static_cast<int16_t>(left);
+      tree[j_end].index_right_or_value = static_cast<int16_t>(right);
+
+      // Add back the last sentinel node.
+      tree.push_back(sentinel);
+    }
+    JXL_DASSERT(tree.size() == 2 * n + 1);
+    SetDepth(tree[2 * n - 1], &tree[0], depth, 0);
+
+    // We need to pack the Huffman tree in tree_limit bits.
+    // If this was not successful, add fake entities to the lowest values
+    // and retry.
+    if (*std::max_element(&depth[0], &depth[length]) <= tree_limit) {
+      break;
+    }
+  }
+}
+
+void ValidateHuffmanTable(j_common_ptr cinfo, const JHUFF_TBL* table,
+                          bool is_dc) {
+  size_t total_symbols = 0;
+  size_t total_p = 0;
+  size_t max_depth = 0;
+  for (size_t d = 1; d <= kJpegHuffmanMaxBitLength; ++d) {
+    uint8_t count = table->bits[d];
+    if (count) {
+      total_symbols += count;
+      total_p += (1u << (kJpegHuffmanMaxBitLength - d)) * count;
+      max_depth = d;
+    }
+  }
+  total_p += 1u << (kJpegHuffmanMaxBitLength - max_depth);  // sentinel symbol
+  if (total_symbols == 0) {
+    JPEGLI_ERROR("Empty Huffman table");
+  }
+  if (total_symbols > kJpegHuffmanAlphabetSize) {
+    JPEGLI_ERROR("Too many symbols in Huffman table");
+  }
+  if (total_p != (1u << kJpegHuffmanMaxBitLength)) {
+    JPEGLI_ERROR("Invalid bit length distribution");
+  }
+  uint8_t symbol_seen[kJpegHuffmanAlphabetSize] = {};
+  for (size_t i = 0; i < total_symbols; ++i) {
+    uint8_t symbol = table->huffval[i];
+    if (symbol_seen[symbol]) {
+      JPEGLI_ERROR("Duplicate symbol %d in Huffman table", symbol);
+    }
+    symbol_seen[symbol] = 1;
+  }
+}
+
+void AddStandardHuffmanTables(j_common_ptr cinfo, bool is_dc) {
+  // Huffman tables from the JPEG standard.
+  static constexpr JHUFF_TBL kStandardDCTables[2] = {
+      // DC luma
+      {{0, 0, 1, 5, 1, 1, 1, 1, 1, 1},
+       {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11},
+       FALSE},
+      // DC chroma
+      {{0, 0, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1},
+       {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11},
+       FALSE}};
+  static constexpr JHUFF_TBL kStandardACTables[2] = {
+      // AC luma
+      {{0, 0, 2, 1, 3, 3, 2, 4, 3, 5, 5, 4, 4, 0, 0, 1, 125},
+       {0x01, 0x02, 0x03, 0x00, 0x04, 0x11, 0x05, 0x12, 0x21, 0x31, 0x41, 0x06,
+        0x13, 0x51, 0x61, 0x07, 0x22, 0x71, 0x14, 0x32, 0x81, 0x91, 0xa1, 0x08,
+        0x23, 0x42, 0xb1, 0xc1, 0x15, 0x52, 0xd1, 0xf0, 0x24, 0x33, 0x62, 0x72,
+        0x82, 0x09, 0x0a, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x25, 0x26, 0x27, 0x28,
+        0x29, 0x2a, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x43, 0x44, 0x45,
+        0x46, 0x47, 0x48, 0x49, 0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59,
+        0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x73, 0x74, 0x75,
+        0x76, 0x77, 0x78, 0x79, 0x7a, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89,
+        0x8a, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0xa2, 0xa3,
+        0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6,
+        0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9,
+        0xca, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xe1, 0xe2,
+        0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xf1, 0xf2, 0xf3, 0xf4,
+        0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa},
+       FALSE},
+      // AC chroma
+      {{0, 0, 2, 1, 2, 4, 4, 3, 4, 7, 5, 4, 4, 0, 1, 2, 119},
+       {0x00, 0x01, 0x02, 0x03, 0x11, 0x04, 0x05, 0x21, 0x31, 0x06, 0x12, 0x41,
+        0x51, 0x07, 0x61, 0x71, 0x13, 0x22, 0x32, 0x81, 0x08, 0x14, 0x42, 0x91,
+        0xa1, 0xb1, 0xc1, 0x09, 0x23, 0x33, 0x52, 0xf0, 0x15, 0x62, 0x72, 0xd1,
+        0x0a, 0x16, 0x24, 0x34, 0xe1, 0x25, 0xf1, 0x17, 0x18, 0x19, 0x1a, 0x26,
+        0x27, 0x28, 0x29, 0x2a, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x43, 0x44,
+        0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58,
+        0x59, 0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x73, 0x74,
+        0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+        0x88, 0x89, 0x8a, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a,
+        0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4,
+        0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
+        0xc8, 0xc9, 0xca, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda,
+        0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xf2, 0xf3, 0xf4,
+        0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa},
+       FALSE}};
+  const JHUFF_TBL* std_tables = is_dc ? kStandardDCTables : kStandardACTables;
+  JHUFF_TBL** tables;
+  if (cinfo->is_decompressor) {
+    j_decompress_ptr cinfo_d = reinterpret_cast<j_decompress_ptr>(cinfo);
+    tables = is_dc ? cinfo_d->dc_huff_tbl_ptrs : cinfo_d->ac_huff_tbl_ptrs;
+  } else {
+    j_compress_ptr cinfo_c = reinterpret_cast<j_compress_ptr>(cinfo);
+    tables = is_dc ? cinfo_c->dc_huff_tbl_ptrs : cinfo_c->ac_huff_tbl_ptrs;
+  }
+  for (int i = 0; i < 2; ++i) {
+    if (tables[i] == nullptr) {
+      tables[i] = jpegli_alloc_huff_table(cinfo);
+      memcpy(tables[i], &std_tables[i], sizeof(JHUFF_TBL));
+      ValidateHuffmanTable(cinfo, tables[i], is_dc);
+    }
+  }
+}
+
+}  // namespace jpegli
diff --git a/third_party/jpeg-xl/lib/jpegli/huffman.h b/third_party/jpeg-xl/lib/jpegli/huffman.h
new file mode 100644
index 0000000000..f0e5e1de40
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/huffman.h
@@ -0,0 +1,50 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JPEGLI_HUFFMAN_H_
+#define LIB_JPEGLI_HUFFMAN_H_
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#include "lib/jpegli/common_internal.h"
+
+namespace jpegli {
+
+constexpr int kJpegHuffmanRootTableBits = 8;
+// Maximum huffman lookup table size.
+// According to zlib/examples/enough.c, 758 entries are always enough for
+// an alphabet of 257 symbols (256 + 1 special symbol for the all 1s code) and
+// max bit length 16 if the root table has 8 bits.
+constexpr int kJpegHuffmanLutSize = 758;
+
+struct HuffmanTableEntry {
+  uint8_t bits;    // number of bits used for this symbol
+  uint16_t value;  // symbol value or table offset
+};
+
+void BuildJpegHuffmanTable(const uint32_t* count, const uint32_t* symbols,
+                           HuffmanTableEntry* lut);
+
+// This function will create a Huffman tree.
+//
+// The (data,length) contains the population counts.
+// The tree_limit is the maximum bit depth of the Huffman codes.
+//
+// The depth contains the tree, i.e., how many bits are used for
+// the symbol.
+//
+// See http://en.wikipedia.org/wiki/Huffman_coding
+void CreateHuffmanTree(const uint32_t* data, size_t length, int tree_limit,
+                       uint8_t* depth);
+
+void ValidateHuffmanTable(j_common_ptr cinfo, const JHUFF_TBL* table,
+                          bool is_dc);
+
+void AddStandardHuffmanTables(j_common_ptr cinfo, bool is_dc);
+
+}  // namespace jpegli
+
+#endif  // LIB_JPEGLI_HUFFMAN_H_
diff --git a/third_party/jpeg-xl/lib/jpegli/idct.cc b/third_party/jpeg-xl/lib/jpegli/idct.cc
new file mode 100644
index 0000000000..4d10563583
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/idct.cc
@@ -0,0 +1,692 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/idct.h"
+
+#include <cmath>
+
+#include "lib/jpegli/decode_internal.h"
+#include "lib/jxl/base/status.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jpegli/idct.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jpegli/transpose-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jpegli {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Abs;
+using hwy::HWY_NAMESPACE::Add;
+using hwy::HWY_NAMESPACE::Gt;
+using hwy::HWY_NAMESPACE::IfThenElseZero;
+using hwy::HWY_NAMESPACE::Mul;
+using hwy::HWY_NAMESPACE::MulAdd;
+using hwy::HWY_NAMESPACE::NegMulAdd;
+using hwy::HWY_NAMESPACE::Rebind;
+using hwy::HWY_NAMESPACE::Sub;
+using hwy::HWY_NAMESPACE::Vec;
+using hwy::HWY_NAMESPACE::Xor;
+
+using D = HWY_FULL(float);
+using DI = HWY_FULL(int32_t);
+constexpr D d;
+constexpr DI di;
+
+using D8 = HWY_CAPPED(float, 8);
+constexpr D8 d8;
+
+void DequantBlock(const int16_t* JXL_RESTRICT qblock,
+                  const float* JXL_RESTRICT dequant,
+                  const float* JXL_RESTRICT biases, float* JXL_RESTRICT block) {
+  for (size_t k = 0; k < 64; k += Lanes(d)) {
+    const auto mul = Load(d, dequant + k);
+    const auto bias = Load(d, biases + k);
+    const Rebind<int16_t, DI> di16;
+    const Vec<DI> quant_i = PromoteTo(di, Load(di16, qblock + k));
+    const Rebind<float, DI> df;
+    const auto quant = ConvertTo(df, quant_i);
+    const auto abs_quant = Abs(quant);
+    const auto not_0 = Gt(abs_quant, Zero(df));
+    const auto sign_quant = Xor(quant, abs_quant);
+    const auto biased_quant = Sub(quant, Xor(bias, sign_quant));
+    const auto dequant = IfThenElseZero(not_0, Mul(biased_quant, mul));
+    Store(dequant, d, block + k);
+  }
+}
+
+template <size_t N>
+void ForwardEvenOdd(const float* JXL_RESTRICT ain, size_t ain_stride,
+                    float* JXL_RESTRICT aout) {
+  for (size_t i = 0; i < N / 2; i++) {
+    auto in1 = LoadU(d8, ain + 2 * i * ain_stride);
+    Store(in1, d8, aout + i * 8);
+  }
+  for (size_t i = N / 2; i < N; i++) {
+    auto in1 = LoadU(d8, ain + (2 * (i - N / 2) + 1) * ain_stride);
+    Store(in1, d8, aout + i * 8);
+  }
+}
+
+template <size_t N>
+void BTranspose(float* JXL_RESTRICT coeff) {
+  for (size_t i = N - 1; i > 0; i--) {
+    auto in1 = Load(d8, coeff + i * 8);
+    auto in2 = Load(d8, coeff + (i - 1) * 8);
+    Store(Add(in1, in2), d8, coeff + i * 8);
+  }
+  constexpr float kSqrt2 = 1.41421356237f;
+  auto sqrt2 = Set(d8, kSqrt2);
+  auto in1 = Load(d8, coeff);
+  Store(Mul(in1, sqrt2), d8, coeff);
+}
+
+// Constants for DCT implementation. Generated by the following snippet:
+// for i in range(N // 2):
+//    print(1.0 / (2 * math.cos((i + 0.5) * math.pi / N)), end=", ")
+template <size_t N>
+struct WcMultipliers;
+
+template <>
+struct WcMultipliers<4> {
+  static constexpr float kMultipliers[] = {
+      0.541196100146197,
+      1.3065629648763764,
+  };
+};
+
+template <>
+struct WcMultipliers<8> {
+  static constexpr float kMultipliers[] = {
+      0.5097955791041592,
+      0.6013448869350453,
+      0.8999762231364156,
+      2.5629154477415055,
+  };
+};
+
+constexpr float WcMultipliers<4>::kMultipliers[];
+constexpr float WcMultipliers<8>::kMultipliers[];
+
+template <size_t N>
+void MultiplyAndAdd(const float* JXL_RESTRICT coeff, float* JXL_RESTRICT out,
+                    size_t out_stride) {
+  for (size_t i = 0; i < N / 2; i++) {
+    auto mul = Set(d8, WcMultipliers<N>::kMultipliers[i]);
+    auto in1 = Load(d8, coeff + i * 8);
+    auto in2 = Load(d8, coeff + (N / 2 + i) * 8);
+    auto out1 = MulAdd(mul, in2, in1);
+    auto out2 = NegMulAdd(mul, in2, in1);
+    StoreU(out1, d8, out + i * out_stride);
+    StoreU(out2, d8, out + (N - i - 1) * out_stride);
+  }
+}
+
+template <size_t N>
+struct IDCT1DImpl;
+
+template <>
+struct IDCT1DImpl<1> {
+  JXL_INLINE void operator()(const float* from, size_t from_stride, float* to,
+                             size_t to_stride) {
+    StoreU(LoadU(d8, from), d8, to);
+  }
+};
+
+template <>
+struct IDCT1DImpl<2> {
+  JXL_INLINE void operator()(const float* from, size_t from_stride, float* to,
+                             size_t to_stride) {
+    JXL_DASSERT(from_stride >= 8);
+    JXL_DASSERT(to_stride >= 8);
+    auto in1 = LoadU(d8, from);
+    auto in2 = LoadU(d8, from + from_stride);
+    StoreU(Add(in1, in2), d8, to);
+    StoreU(Sub(in1, in2), d8, to + to_stride);
+  }
+};
+
+template <size_t N>
+struct IDCT1DImpl {
+  void operator()(const float* from, size_t from_stride, float* to,
+                  size_t to_stride) {
+    JXL_DASSERT(from_stride >= 8);
+    JXL_DASSERT(to_stride >= 8);
+    HWY_ALIGN float tmp[64];
+    ForwardEvenOdd<N>(from, from_stride, tmp);
+    IDCT1DImpl<N / 2>()(tmp, 8, tmp, 8);
+    BTranspose<N / 2>(tmp + N * 4);
+    IDCT1DImpl<N / 2>()(tmp + N * 4, 8, tmp + N * 4, 8);
+    MultiplyAndAdd<N>(tmp, to, to_stride);
+  }
+};
+
+template <size_t N>
+void IDCT1D(float* JXL_RESTRICT from, float* JXL_RESTRICT output,
+            size_t output_stride) {
+  for (size_t i = 0; i < 8; i += Lanes(d8)) {
+    IDCT1DImpl<N>()(from + i, 8, output + i, output_stride);
+  }
+}
+
+void ComputeScaledIDCT(float* JXL_RESTRICT block0, float* JXL_RESTRICT block1,
+                       float* JXL_RESTRICT output, size_t output_stride) {
+  Transpose8x8Block(block0, block1);
+  IDCT1D<8>(block1, block0, 8);
+  Transpose8x8Block(block0, block1);
+  IDCT1D<8>(block1, output, output_stride);
+}
+
+void InverseTransformBlock8x8(const int16_t* JXL_RESTRICT qblock,
+                              const float* JXL_RESTRICT dequant,
+                              const float* JXL_RESTRICT biases,
+                              float* JXL_RESTRICT scratch_space,
+                              float* JXL_RESTRICT output, size_t output_stride,
+                              size_t dctsize) {
+  float* JXL_RESTRICT block0 = scratch_space;
+  float* JXL_RESTRICT block1 = scratch_space + DCTSIZE2;
+  DequantBlock(qblock, dequant, biases, block0);
+  ComputeScaledIDCT(block0, block1, output, output_stride);
+}
+
+// Computes the N-point IDCT of in[], and stores the result in out[]. The in[]
+// array is at most 8 values long, values in[8:N-1] are assumed to be 0.
+void Compute1dIDCT(float* in, float* out, size_t N) {
+  switch (N) {
+    case 3: {
+      static constexpr float kC3[3] = {
+          1.414213562373,
+          1.224744871392,
+          0.707106781187,
+      };
+      float even0 = in[0] + kC3[2] * in[2];
+      float even1 = in[0] - kC3[0] * in[2];
+      float odd0 = kC3[1] * in[1];
+      out[0] = even0 + odd0;
+      out[2] = even0 - odd0;
+      out[1] = even1;
+      break;
+    }
+    case 5: {
+      static constexpr float kC5[5] = {
+          1.414213562373, 1.344997023928, 1.144122805635,
+          0.831253875555, 0.437016024449,
+      };
+      float even0 = in[0] + kC5[2] * in[2] + kC5[4] * in[4];
+      float even1 = in[0] - kC5[4] * in[2] - kC5[2] * in[4];
+      float even2 = in[0] - kC5[0] * in[2] + kC5[0] * in[4];
+      float odd0 = kC5[1] * in[1] + kC5[3] * in[3];
+      float odd1 = kC5[3] * in[1] - kC5[1] * in[3];
+      out[0] = even0 + odd0;
+      out[4] = even0 - odd0;
+      out[1] = even1 + odd1;
+      out[3] = even1 - odd1;
+      out[2] = even2;
+      break;
+    }
+    case 6: {
+      static constexpr float kC6[6] = {
+          1.414213562373, 1.366025403784, 1.224744871392,
+          1.000000000000, 0.707106781187, 0.366025403784,
+      };
+      float even0 = in[0] + kC6[2] * in[2] + kC6[4] * in[4];
+      float even1 = in[0] - kC6[0] * in[4];
+      float even2 = in[0] - kC6[2] * in[2] + kC6[4] * in[4];
+      float odd0 = kC6[1] * in[1] + kC6[3] * in[3] + kC6[5] * in[5];
+      float odd1 = kC6[3] * in[1] - kC6[3] * in[3] - kC6[3] * in[5];
+      float odd2 = kC6[5] * in[1] - kC6[3] * in[3] + kC6[1] * in[5];
+      out[0] = even0 + odd0;
+      out[5] = even0 - odd0;
+      out[1] = even1 + odd1;
+      out[4] = even1 - odd1;
+      out[2] = even2 + odd2;
+      out[3] = even2 - odd2;
+      break;
+    }
+    case 7: {
+      static constexpr float kC7[7] = {
+          1.414213562373, 1.378756275744, 1.274162392264, 1.105676685997,
+          0.881747733790, 0.613604268353, 0.314692122713,
+      };
+      float even0 = in[0] + kC7[2] * in[2] + kC7[4] * in[4] + kC7[6] * in[6];
+      float even1 = in[0] + kC7[6] * in[2] - kC7[2] * in[4] - kC7[4] * in[6];
+      float even2 = in[0] - kC7[4] * in[2] - kC7[6] * in[4] + kC7[2] * in[6];
+      float even3 = in[0] - kC7[0] * in[2] + kC7[0] * in[4] - kC7[0] * in[6];
+      float odd0 = kC7[1] * in[1] + kC7[3] * in[3] + kC7[5] * in[5];
+      float odd1 = kC7[3] * in[1] - kC7[5] * in[3] - kC7[1] * in[5];
+      float odd2 = kC7[5] * in[1] - kC7[1] * in[3] + kC7[3] * in[5];
+      out[0] = even0 + odd0;
+      out[6] = even0 - odd0;
+      out[1] = even1 + odd1;
+      out[5] = even1 - odd1;
+      out[2] = even2 + odd2;
+      out[4] = even2 - odd2;
+      out[3] = even3;
+      break;
+    }
+    case 9: {
+      static constexpr float kC9[9] = {
+          1.414213562373, 1.392728480640, 1.328926048777,
+          1.224744871392, 1.083350440839, 0.909038955344,
+          0.707106781187, 0.483689525296, 0.245575607938,
+      };
+      float even0 = in[0] + kC9[2] * in[2] + kC9[4] * in[4] + kC9[6] * in[6];
+      float even1 = in[0] + kC9[6] * in[2] - kC9[6] * in[4] - kC9[0] * in[6];
+      float even2 = in[0] - kC9[8] * in[2] - kC9[2] * in[4] + kC9[6] * in[6];
+      float even3 = in[0] - kC9[4] * in[2] + kC9[8] * in[4] + kC9[6] * in[6];
+      float even4 = in[0] - kC9[0] * in[2] + kC9[0] * in[4] - kC9[0] * in[6];
+      float odd0 =
+          kC9[1] * in[1] + kC9[3] * in[3] + kC9[5] * in[5] + kC9[7] * in[7];
+      float odd1 = kC9[3] * in[1] - kC9[3] * in[5] - kC9[3] * in[7];
+      float odd2 =
+          kC9[5] * in[1] - kC9[3] * in[3] - kC9[7] * in[5] + kC9[1] * in[7];
+      float odd3 =
+          kC9[7] * in[1] - kC9[3] * in[3] + kC9[1] * in[5] - kC9[5] * in[7];
+      out[0] = even0 + odd0;
+      out[8] = even0 - odd0;
+      out[1] = even1 + odd1;
+      out[7] = even1 - odd1;
+      out[2] = even2 + odd2;
+      out[6] = even2 - odd2;
+      out[3] = even3 + odd3;
+      out[5] = even3 - odd3;
+      out[4] = even4;
+      break;
+    }
+    case 10: {
+      static constexpr float kC10[10] = {
+          1.414213562373, 1.396802246667, 1.344997023928, 1.260073510670,
+          1.144122805635, 1.000000000000, 0.831253875555, 0.642039521920,
+          0.437016024449, 0.221231742082,
+      };
+      float even0 = in[0] + kC10[2] * in[2] + kC10[4] * in[4] + kC10[6] * in[6];
+      float even1 = in[0] + kC10[6] * in[2] - kC10[8] * in[4] - kC10[2] * in[6];
+      float even2 = in[0] - kC10[0] * in[4];
+      float even3 = in[0] - kC10[6] * in[2] - kC10[8] * in[4] + kC10[2] * in[6];
+      float even4 = in[0] - kC10[2] * in[2] + kC10[4] * in[4] - kC10[6] * in[6];
+      float odd0 =
+          kC10[1] * in[1] + kC10[3] * in[3] + kC10[5] * in[5] + kC10[7] * in[7];
+      float odd1 =
+          kC10[3] * in[1] + kC10[9] * in[3] - kC10[5] * in[5] - kC10[1] * in[7];
+      float odd2 =
+          kC10[5] * in[1] - kC10[5] * in[3] - kC10[5] * in[5] + kC10[5] * in[7];
+      float odd3 =
+          kC10[7] * in[1] - kC10[1] * in[3] + kC10[5] * in[5] + kC10[9] * in[7];
+      float odd4 =
+          kC10[9] * in[1] - kC10[7] * in[3] + kC10[5] * in[5] - kC10[3] * in[7];
+      out[0] = even0 + odd0;
+      out[9] = even0 - odd0;
+      out[1] = even1 + odd1;
+      out[8] = even1 - odd1;
+      out[2] = even2 + odd2;
+      out[7] = even2 - odd2;
+      out[3] = even3 + odd3;
+      out[6] = even3 - odd3;
+      out[4] = even4 + odd4;
+      out[5] = even4 - odd4;
+      break;
+    }
+    case 11: {
+      static constexpr float kC11[11] = {
+          1.414213562373, 1.399818907436, 1.356927976287, 1.286413904599,
+          1.189712155524, 1.068791297809, 0.926112931411, 0.764581576418,
+          0.587485545401, 0.398430002847, 0.201263574413,
+      };
+      float even0 = in[0] + kC11[2] * in[2] + kC11[4] * in[4] + kC11[6] * in[6];
+      float even1 =
+          in[0] + kC11[6] * in[2] - kC11[10] * in[4] - kC11[4] * in[6];
+      float even2 =
+          in[0] + kC11[10] * in[2] - kC11[2] * in[4] - kC11[8] * in[6];
+      float even3 = in[0] - kC11[8] * in[2] - kC11[6] * in[4] + kC11[2] * in[6];
+      float even4 =
+          in[0] - kC11[4] * in[2] + kC11[8] * in[4] + kC11[10] * in[6];
+      float even5 = in[0] - kC11[0] * in[2] + kC11[0] * in[4] - kC11[0] * in[6];
+      float odd0 =
+          kC11[1] * in[1] + kC11[3] * in[3] + kC11[5] * in[5] + kC11[7] * in[7];
+      float odd1 =
+          kC11[3] * in[1] + kC11[9] * in[3] - kC11[7] * in[5] - kC11[1] * in[7];
+      float odd2 =
+          kC11[5] * in[1] - kC11[7] * in[3] - kC11[3] * in[5] + kC11[9] * in[7];
+      float odd3 =
+          kC11[7] * in[1] - kC11[1] * in[3] + kC11[9] * in[5] + kC11[5] * in[7];
+      float odd4 =
+          kC11[9] * in[1] - kC11[5] * in[3] + kC11[1] * in[5] - kC11[3] * in[7];
+      out[0] = even0 + odd0;
+      out[10] = even0 - odd0;
+      out[1] = even1 + odd1;
+      out[9] = even1 - odd1;
+      out[2] = even2 + odd2;
+      out[8] = even2 - odd2;
+      out[3] = even3 + odd3;
+      out[7] = even3 - odd3;
+      out[4] = even4 + odd4;
+      out[6] = even4 - odd4;
+      out[5] = even5;
+      break;
+    }
+    case 12: {
+      static constexpr float kC12[12] = {
+          1.414213562373, 1.402114769300, 1.366025403784, 1.306562964876,
+          1.224744871392, 1.121971053594, 1.000000000000, 0.860918669154,
+          0.707106781187, 0.541196100146, 0.366025403784, 0.184591911283,
+      };
+      float even0 = in[0] + kC12[2] * in[2] + kC12[4] * in[4] + kC12[6] * in[6];
+      float even1 = in[0] + kC12[6] * in[2] - kC12[6] * in[6];
+      float even2 =
+          in[0] + kC12[10] * in[2] - kC12[4] * in[4] - kC12[6] * in[6];
+      float even3 =
+          in[0] - kC12[10] * in[2] - kC12[4] * in[4] + kC12[6] * in[6];
+      float even4 = in[0] - kC12[6] * in[2] + kC12[6] * in[6];
+      float even5 = in[0] - kC12[2] * in[2] + kC12[4] * in[4] - kC12[6] * in[6];
+      float odd0 =
+          kC12[1] * in[1] + kC12[3] * in[3] + kC12[5] * in[5] + kC12[7] * in[7];
+      float odd1 =
+          kC12[3] * in[1] + kC12[9] * in[3] - kC12[9] * in[5] - kC12[3] * in[7];
+      float odd2 = kC12[5] * in[1] - kC12[9] * in[3] - kC12[1] * in[5] -
+                   kC12[11] * in[7];
+      float odd3 = kC12[7] * in[1] - kC12[3] * in[3] - kC12[11] * in[5] +
+                   kC12[1] * in[7];
+      float odd4 =
+          kC12[9] * in[1] - kC12[3] * in[3] + kC12[3] * in[5] - kC12[9] * in[7];
+      float odd5 = kC12[11] * in[1] - kC12[9] * in[3] + kC12[7] * in[5] -
+                   kC12[5] * in[7];
+      out[0] = even0 + odd0;
+      out[11] = even0 - odd0;
+      out[1] = even1 + odd1;
+      out[10] = even1 - odd1;
+      out[2] = even2 + odd2;
+      out[9] = even2 - odd2;
+      out[3] = even3 + odd3;
+      out[8] = even3 - odd3;
+      out[4] = even4 + odd4;
+      out[7] = even4 - odd4;
+      out[5] = even5 + odd5;
+      out[6] = even5 - odd5;
+      break;
+    }
+    case 13: {
+      static constexpr float kC13[13] = {
+          1.414213562373, 1.403902353238, 1.373119086479, 1.322312651445,
+          1.252223920364, 1.163874944761, 1.058554051646, 0.937797056801,
+          0.803364869133, 0.657217812653, 0.501487040539, 0.338443458124,
+          0.170464607981,
+      };
+      float even0 = in[0] + kC13[2] * in[2] + kC13[4] * in[4] + kC13[6] * in[6];
+      float even1 =
+          in[0] + kC13[6] * in[2] + kC13[12] * in[4] - kC13[8] * in[6];
+      float even2 =
+          in[0] + kC13[10] * in[2] - kC13[6] * in[4] - kC13[4] * in[6];
+      float even3 =
+          in[0] - kC13[12] * in[2] - kC13[2] * in[4] + kC13[10] * in[6];
+      float even4 =
+          in[0] - kC13[8] * in[2] - kC13[10] * in[4] + kC13[2] * in[6];
+      float even5 =
+          in[0] - kC13[4] * in[2] + kC13[8] * in[4] - kC13[12] * in[6];
+      float even6 = in[0] - kC13[0] * in[2] + kC13[0] * in[4] - kC13[0] * in[6];
+      float odd0 =
+          kC13[1] * in[1] + kC13[3] * in[3] + kC13[5] * in[5] + kC13[7] * in[7];
+      float odd1 = kC13[3] * in[1] + kC13[9] * in[3] - kC13[11] * in[5] -
+                   kC13[5] * in[7];
+      float odd2 = kC13[5] * in[1] - kC13[11] * in[3] - kC13[1] * in[5] -
+                   kC13[9] * in[7];
+      float odd3 =
+          kC13[7] * in[1] - kC13[5] * in[3] - kC13[9] * in[5] + kC13[3] * in[7];
+      float odd4 = kC13[9] * in[1] - kC13[1] * in[3] + kC13[7] * in[5] +
+                   kC13[11] * in[7];
+      float odd5 = kC13[11] * in[1] - kC13[7] * in[3] + kC13[3] * in[5] -
+                   kC13[1] * in[7];
+      out[0] = even0 + odd0;
+      out[12] = even0 - odd0;
+      out[1] = even1 + odd1;
+      out[11] = even1 - odd1;
+      out[2] = even2 + odd2;
+      out[10] = even2 - odd2;
+      out[3] = even3 + odd3;
+      out[9] = even3 - odd3;
+      out[4] = even4 + odd4;
+      out[8] = even4 - odd4;
+      out[5] = even5 + odd5;
+      out[7] = even5 - odd5;
+      out[6] = even6;
+      break;
+    }
+    case 14: {
+      static constexpr float kC14[14] = {
+          1.414213562373, 1.405321284327, 1.378756275744, 1.334852607020,
+          1.274162392264, 1.197448846138, 1.105676685997, 1.000000000000,
+          0.881747733790, 0.752406978226, 0.613604268353, 0.467085128785,
+          0.314692122713, 0.158341680609,
+      };
+      float even0 = in[0] + kC14[2] * in[2] + kC14[4] * in[4] + kC14[6] * in[6];
+      float even1 =
+          in[0] + kC14[6] * in[2] + kC14[12] * in[4] - kC14[10] * in[6];
+      float even2 =
+          in[0] + kC14[10] * in[2] - kC14[8] * in[4] - kC14[2] * in[6];
+      float even3 = in[0] - kC14[0] * in[4];
+      float even4 =
+          in[0] - kC14[10] * in[2] - kC14[8] * in[4] + kC14[2] * in[6];
+      float even5 =
+          in[0] - kC14[6] * in[2] + kC14[12] * in[4] + kC14[10] * in[6];
+      float even6 = in[0] - kC14[2] * in[2] + kC14[4] * in[4] - kC14[6] * in[6];
+      float odd0 =
+          kC14[1] * in[1] + kC14[3] * in[3] + kC14[5] * in[5] + kC14[7] * in[7];
+      float odd1 = kC14[3] * in[1] + kC14[9] * in[3] - kC14[13] * in[5] -
+                   kC14[7] * in[7];
+      float odd2 = kC14[5] * in[1] - kC14[13] * in[3] - kC14[3] * in[5] -
+                   kC14[7] * in[7];
+      float odd3 =
+          kC14[7] * in[1] - kC14[7] * in[3] - kC14[7] * in[5] + kC14[7] * in[7];
+      float odd4 = kC14[9] * in[1] - kC14[1] * in[3] + kC14[11] * in[5] +
+                   kC14[7] * in[7];
+      float odd5 = kC14[11] * in[1] - kC14[5] * in[3] + kC14[1] * in[5] -
+                   kC14[7] * in[7];
+      float odd6 = kC14[13] * in[1] - kC14[11] * in[3] + kC14[9] * in[5] -
+                   kC14[7] * in[7];
+      out[0] = even0 + odd0;
+      out[13] = even0 - odd0;
+      out[1] = even1 + odd1;
+      out[12] = even1 - odd1;
+      out[2] = even2 + odd2;
+      out[11] = even2 - odd2;
+      out[3] = even3 + odd3;
+      out[10] = even3 - odd3;
+      out[4] = even4 + odd4;
+      out[9] = even4 - odd4;
+      out[5] = even5 + odd5;
+      out[8] = even5 - odd5;
+      out[6] = even6 + odd6;
+      out[7] = even6 - odd6;
+      break;
+    }
+    case 15: {
+      static constexpr float kC15[15] = {
+          1.414213562373, 1.406466352507, 1.383309602960, 1.344997023928,
+          1.291948376043, 1.224744871392, 1.144122805635, 1.050965490998,
+          0.946293578512, 0.831253875555, 0.707106781187, 0.575212476952,
+          0.437016024449, 0.294031532930, 0.147825570407,
+      };
+      float even0 = in[0] + kC15[2] * in[2] + kC15[4] * in[4] + kC15[6] * in[6];
+      float even1 =
+          in[0] + kC15[6] * in[2] + kC15[12] * in[4] - kC15[12] * in[6];
+      float even2 =
+          in[0] + kC15[10] * in[2] - kC15[10] * in[4] - kC15[0] * in[6];
+      float even3 =
+          in[0] + kC15[14] * in[2] - kC15[2] * in[4] - kC15[12] * in[6];
+      float even4 =
+          in[0] - kC15[12] * in[2] - kC15[6] * in[4] + kC15[6] * in[6];
+      float even5 =
+          in[0] - kC15[8] * in[2] - kC15[14] * in[4] + kC15[6] * in[6];
+      float even6 =
+          in[0] - kC15[4] * in[2] + kC15[8] * in[4] - kC15[12] * in[6];
+      float even7 = in[0] - kC15[0] * in[2] + kC15[0] * in[4] - kC15[0] * in[6];
+      float odd0 =
+          kC15[1] * in[1] + kC15[3] * in[3] + kC15[5] * in[5] + kC15[7] * in[7];
+      float odd1 = kC15[3] * in[1] + kC15[9] * in[3] - kC15[9] * in[7];
+      float odd2 = kC15[5] * in[1] - kC15[5] * in[5] - kC15[5] * in[7];
+      float odd3 = kC15[7] * in[1] - kC15[9] * in[3] - kC15[5] * in[5] +
+                   kC15[11] * in[7];
+      float odd4 = kC15[9] * in[1] - kC15[3] * in[3] + kC15[3] * in[7];
+      float odd5 = kC15[11] * in[1] - kC15[3] * in[3] + kC15[5] * in[5] -
+                   kC15[13] * in[7];
+      float odd6 = kC15[13] * in[1] - kC15[9] * in[3] + kC15[5] * in[5] -
+                   kC15[1] * in[7];
+      out[0] = even0 + odd0;
+      out[14] = even0 - odd0;
+      out[1] = even1 + odd1;
+      out[13] = even1 - odd1;
+      out[2] = even2 + odd2;
+      out[12] = even2 - odd2;
+      out[3] = even3 + odd3;
+      out[11] = even3 - odd3;
+      out[4] = even4 + odd4;
+      out[10] = even4 - odd4;
+      out[5] = even5 + odd5;
+      out[9] = even5 - odd5;
+      out[6] = even6 + odd6;
+      out[8] = even6 - odd6;
+      out[7] = even7;
+      break;
+    }
+    case 16: {
+      static constexpr float kC16[16] = {
+          1.414213562373, 1.407403737526, 1.387039845322, 1.353318001174,
+          1.306562964876, 1.247225012987, 1.175875602419, 1.093201867002,
+          1.000000000000, 0.897167586343, 0.785694958387, 0.666655658478,
+          0.541196100146, 0.410524527522, 0.275899379283, 0.138617169199,
+      };
+      float even0 = in[0] + kC16[2] * in[2] + kC16[4] * in[4] + kC16[6] * in[6];
+      float even1 =
+          in[0] + kC16[6] * in[2] + kC16[12] * in[4] - kC16[14] * in[6];
+      float even2 =
+          in[0] + kC16[10] * in[2] - kC16[12] * in[4] - kC16[2] * in[6];
+      float even3 =
+          in[0] + kC16[14] * in[2] - kC16[4] * in[4] - kC16[10] * in[6];
+      float even4 =
+          in[0] - kC16[14] * in[2] - kC16[4] * in[4] + kC16[10] * in[6];
+      float even5 =
+          in[0] - kC16[10] * in[2] - kC16[12] * in[4] + kC16[2] * in[6];
+      float even6 =
+          in[0] - kC16[6] * in[2] + kC16[12] * in[4] + kC16[14] * in[6];
+      float even7 = in[0] - kC16[2] * in[2] + kC16[4] * in[4] - kC16[6] * in[6];
+      float odd0 = (kC16[1] * in[1] + kC16[3] * in[3] + kC16[5] * in[5] +
+                    kC16[7] * in[7]);
+      float odd1 = (kC16[3] * in[1] + kC16[9] * in[3] + kC16[15] * in[5] -
+                    kC16[11] * in[7]);
+      float odd2 = (kC16[5] * in[1] + kC16[15] * in[3] - kC16[7] * in[5] -
+                    kC16[3] * in[7]);
+      float odd3 = (kC16[7] * in[1] - kC16[11] * in[3] - kC16[3] * in[5] +
+                    kC16[15] * in[7]);
+      float odd4 = (kC16[9] * in[1] - kC16[5] * in[3] - kC16[13] * in[5] +
+                    kC16[1] * in[7]);
+      float odd5 = (kC16[11] * in[1] - kC16[1] * in[3] + kC16[9] * in[5] +
+                    kC16[13] * in[7]);
+      float odd6 = (kC16[13] * in[1] - kC16[7] * in[3] + kC16[1] * in[5] -
+                    kC16[5] * in[7]);
+      float odd7 = (kC16[15] * in[1] - kC16[13] * in[3] + kC16[11] * in[5] -
+                    kC16[9] * in[7]);
+      out[0] = even0 + odd0;
+      out[15] = even0 - odd0;
+      out[1] = even1 + odd1;
+      out[14] = even1 - odd1;
+      out[2] = even2 + odd2;
+      out[13] = even2 - odd2;
+      out[3] = even3 + odd3;
+      out[12] = even3 - odd3;
+      out[4] = even4 + odd4;
+      out[11] = even4 - odd4;
+      out[5] = even5 + odd5;
+      out[10] = even5 - odd5;
+      out[6] = even6 + odd6;
+      out[9] = even6 - odd6;
+      out[7] = even7 + odd7;
+      out[8] = even7 - odd7;
+      break;
+    }
+  }
+}
+
+void InverseTransformBlockGeneric(const int16_t* JXL_RESTRICT qblock,
+                                  const float* JXL_RESTRICT dequant,
+                                  const float* JXL_RESTRICT biases,
+                                  float* JXL_RESTRICT scratch_space,
+                                  float* JXL_RESTRICT output,
+                                  size_t output_stride, size_t dctsize) {
+  float* JXL_RESTRICT block0 = scratch_space;
+  float* JXL_RESTRICT block1 = scratch_space + DCTSIZE2;
+  DequantBlock(qblock, dequant, biases, block0);
+  if (dctsize == 1) {
+    *output = *block0;
+  } else if (dctsize == 2 || dctsize == 4) {
+    float* JXL_RESTRICT block2 = scratch_space + 2 * DCTSIZE2;
+    ComputeScaledIDCT(block0, block1, block2, 8);
+    if (dctsize == 4) {
+      for (size_t iy = 0; iy < 4; ++iy) {
+        for (size_t ix = 0; ix < 4; ++ix) {
+          float* block = &block2[16 * iy + 2 * ix];
+          output[iy * output_stride + ix] =
+              0.25f * (block[0] + block[1] + block[8] + block[9]);
+        }
+      }
+    } else {
+      for (size_t iy = 0; iy < 2; ++iy) {
+        for (size_t ix = 0; ix < 2; ++ix) {
+          float* block = &block2[32 * iy + 4 * ix];
+          output[iy * output_stride + ix] =
+              0.0625f *
+              (block[0] + block[1] + block[2] + block[3] + block[8] + block[9] +
+               block[10] + block[11] + block[16] + block[17] + block[18] +
+               block[19] + block[24] + block[25] + block[26] + block[27]);
+        }
+      }
+    }
+  } else {
+    float dctin[DCTSIZE];
+    float dctout[DCTSIZE * 2];
+    size_t insize = std::min<size_t>(dctsize, DCTSIZE);
+    for (size_t ix = 0; ix < insize; ++ix) {
+      for (size_t iy = 0; iy < insize; ++iy) {
+        dctin[iy] = block0[iy * DCTSIZE + ix];
+      }
+      Compute1dIDCT(dctin, dctout, dctsize);
+      for (size_t iy = 0; iy < dctsize; ++iy) {
+        block1[iy * dctsize + ix] = dctout[iy];
+      }
+    }
+    for (size_t iy = 0; iy < dctsize; ++iy) {
+      Compute1dIDCT(block1 + iy * dctsize, output + iy * output_stride,
+                    dctsize);
+    }
+  }
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jpegli
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jpegli {
+
+HWY_EXPORT(InverseTransformBlock8x8);
+HWY_EXPORT(InverseTransformBlockGeneric);
+
+void ChooseInverseTransform(j_decompress_ptr cinfo) {
+  jpeg_decomp_master* m = cinfo->master;
+  for (int c = 0; c < cinfo->num_components; ++c) {
+    if (m->scaled_dct_size[c] == DCTSIZE) {
+      m->inverse_transform[c] = HWY_DYNAMIC_DISPATCH(InverseTransformBlock8x8);
+    } else {
+      m->inverse_transform[c] =
+          HWY_DYNAMIC_DISPATCH(InverseTransformBlockGeneric);
+    }
+  }
+}
+
+}  // namespace jpegli
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jpegli/idct.h b/third_party/jpeg-xl/lib/jpegli/idct.h
new file mode 100644
index 0000000000..21c5c452e6
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/idct.h
@@ -0,0 +1,24 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JPEGLI_IDCT_H_
+#define LIB_JPEGLI_IDCT_H_
+
+/* clang-format off */
+#include <stdio.h>
+#include <jpeglib.h>
+#include <stddef.h>
+#include <stdint.h>
+/* clang-format on */
+
+#include "lib/jxl/base/compiler_specific.h"
+
+namespace jpegli {
+
+void ChooseInverseTransform(j_decompress_ptr cinfo);
+
+}  // namespace jpegli
+
+#endif  // LIB_JPEGLI_IDCT_H_
diff --git a/third_party/jpeg-xl/lib/jpegli/input.cc b/third_party/jpeg-xl/lib/jpegli/input.cc
new file mode 100644
index 0000000000..765bf98946
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/input.cc
@@ -0,0 +1,414 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/input.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jpegli/input.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jpegli/encode_internal.h"
+#include "lib/jpegli/error.h"
+#include "lib/jxl/base/byte_order.h"
+#include "lib/jxl/base/compiler_specific.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jpegli {
+namespace HWY_NAMESPACE {
+
+using hwy::HWY_NAMESPACE::Mul;
+using hwy::HWY_NAMESPACE::Rebind;
+using hwy::HWY_NAMESPACE::Vec;
+
+using D = HWY_FULL(float);
+using DU = HWY_FULL(uint32_t);
+using DU8 = Rebind<uint8_t, D>;
+using DU16 = Rebind<uint16_t, D>;
+
+constexpr D d;
+constexpr DU du;
+constexpr DU8 du8;
+constexpr DU16 du16;
+
+static constexpr double kMul16 = 1.0 / 257.0;
+static constexpr double kMulFloat = 255.0;
+
+template <size_t C>
+void ReadUint8Row(const uint8_t* row_in, size_t x0, size_t len,
+                  float* row_out[kMaxComponents]) {
+  for (size_t x = x0; x < len; ++x) {
+    for (size_t c = 0; c < C; ++c) {
+      row_out[c][x] = row_in[C * x + c];
+    }
+  }
+}
+
+template <size_t C, bool swap_endianness = false>
+void ReadUint16Row(const uint8_t* row_in, size_t x0, size_t len,
+                   float* row_out[kMaxComponents]) {
+  const uint16_t* row16 = reinterpret_cast<const uint16_t*>(row_in);
+  for (size_t x = x0; x < len; ++x) {
+    for (size_t c = 0; c < C; ++c) {
+      uint16_t val = row16[C * x + c];
+      if (swap_endianness) val = JXL_BSWAP16(val);
+      row_out[c][x] = val * kMul16;
+    }
+  }
+}
+
+template <size_t C, bool swap_endianness = false>
+void ReadFloatRow(const uint8_t* row_in, size_t x0, size_t len,
+                  float* row_out[kMaxComponents]) {
+  const float* rowf = reinterpret_cast<const float*>(row_in);
+  for (size_t x = x0; x < len; ++x) {
+    for (size_t c = 0; c < C; ++c) {
+      float val = rowf[C * x + c];
+      if (swap_endianness) val = BSwapFloat(val);
+      row_out[c][x] = val * kMulFloat;
+    }
+  }
+}
+
+void ReadUint8RowSingle(const uint8_t* row_in, size_t len,
+                        float* row_out[kMaxComponents]) {
+  const size_t N = Lanes(d);
+  const size_t simd_len = len & (~(N - 1));
+  float* JXL_RESTRICT const row0 = row_out[0];
+  for (size_t x = 0; x < simd_len; x += N) {
+    Store(ConvertTo(d, PromoteTo(du, LoadU(du8, row_in + x))), d, row0 + x);
+  }
+  ReadUint8Row<1>(row_in, simd_len, len, row_out);
+}
+
+void ReadUint8RowInterleaved2(const uint8_t* row_in, size_t len,
+                              float* row_out[kMaxComponents]) {
+  const size_t N = Lanes(d);
+  const size_t simd_len = len & (~(N - 1));
+  float* JXL_RESTRICT const row0 = row_out[0];
+  float* JXL_RESTRICT const row1 = row_out[1];
+  Vec<DU8> out0, out1;
+  for (size_t x = 0; x < simd_len; x += N) {
+    LoadInterleaved2(du8, row_in + 2 * x, out0, out1);
+    Store(ConvertTo(d, PromoteTo(du, out0)), d, row0 + x);
+    Store(ConvertTo(d, PromoteTo(du, out1)), d, row1 + x);
+  }
+  ReadUint8Row<2>(row_in, simd_len, len, row_out);
+}
+
+void ReadUint8RowInterleaved3(const uint8_t* row_in, size_t len,
+                              float* row_out[kMaxComponents]) {
+  const size_t N = Lanes(d);
+  const size_t simd_len = len & (~(N - 1));
+  float* JXL_RESTRICT const row0 = row_out[0];
+  float* JXL_RESTRICT const row1 = row_out[1];
+  float* JXL_RESTRICT const row2 = row_out[2];
+  Vec<DU8> out0, out1, out2;
+  for (size_t x = 0; x < simd_len; x += N) {
+    LoadInterleaved3(du8, row_in + 3 * x, out0, out1, out2);
+    Store(ConvertTo(d, PromoteTo(du, out0)), d, row0 + x);
+    Store(ConvertTo(d, PromoteTo(du, out1)), d, row1 + x);
+    Store(ConvertTo(d, PromoteTo(du, out2)), d, row2 + x);
+  }
+  ReadUint8Row<3>(row_in, simd_len, len, row_out);
+}
+
+void ReadUint8RowInterleaved4(const uint8_t* row_in, size_t len,
+                              float* row_out[kMaxComponents]) {
+  const size_t N = Lanes(d);
+  const size_t simd_len = len & (~(N - 1));
+  float* JXL_RESTRICT const row0 = row_out[0];
+  float* JXL_RESTRICT const row1 = row_out[1];
+  float* JXL_RESTRICT const row2 = row_out[2];
+  float* JXL_RESTRICT const row3 = row_out[3];
+  Vec<DU8> out0, out1, out2, out3;
+  for (size_t x = 0; x < simd_len; x += N) {
+    LoadInterleaved4(du8, row_in + 4 * x, out0, out1, out2, out3);
+    Store(ConvertTo(d, PromoteTo(du, out0)), d, row0 + x);
+    Store(ConvertTo(d, PromoteTo(du, out1)), d, row1 + x);
+    Store(ConvertTo(d, PromoteTo(du, out2)), d, row2 + x);
+    Store(ConvertTo(d, PromoteTo(du, out3)), d, row3 + x);
+  }
+  ReadUint8Row<4>(row_in, simd_len, len, row_out);
+}
+
+void ReadUint16RowSingle(const uint8_t* row_in, size_t len,
+                         float* row_out[kMaxComponents]) {
+  const size_t N = Lanes(d);
+  const size_t simd_len = len & (~(N - 1));
+  const auto mul = Set(d, kMul16);
+  const uint16_t* JXL_RESTRICT const row =
+      reinterpret_cast<const uint16_t*>(row_in);
+  float* JXL_RESTRICT const row0 = row_out[0];
+  for (size_t x = 0; x < simd_len; x += N) {
+    Store(Mul(mul, ConvertTo(d, PromoteTo(du, LoadU(du16, row + x)))), d,
+          row0 + x);
+  }
+  ReadUint16Row<1>(row_in, simd_len, len, row_out);
+}
+
+void ReadUint16RowInterleaved2(const uint8_t* row_in, size_t len,
+                               float* row_out[kMaxComponents]) {
+  const size_t N = Lanes(d);
+  const size_t simd_len = len & (~(N - 1));
+  const auto mul = Set(d, kMul16);
+  const uint16_t* JXL_RESTRICT const row =
+      reinterpret_cast<const uint16_t*>(row_in);
+  float* JXL_RESTRICT const row0 = row_out[0];
+  float* JXL_RESTRICT const row1 = row_out[1];
+  Vec<DU16> out0, out1;
+  for (size_t x = 0; x < simd_len; x += N) {
+    LoadInterleaved2(du16, row + 2 * x, out0, out1);
+    Store(Mul(mul, ConvertTo(d, PromoteTo(du, out0))), d, row0 + x);
+    Store(Mul(mul, ConvertTo(d, PromoteTo(du, out1))), d, row1 + x);
+  }
+  ReadUint16Row<2>(row_in, simd_len, len, row_out);
+}
+
+void ReadUint16RowInterleaved3(const uint8_t* row_in, size_t len,
+                               float* row_out[kMaxComponents]) {
+  const size_t N = Lanes(d);
+  const size_t simd_len = len & (~(N - 1));
+  const auto mul = Set(d, kMul16);
+  const uint16_t* JXL_RESTRICT const row =
+      reinterpret_cast<const uint16_t*>(row_in);
+  float* JXL_RESTRICT const row0 = row_out[0];
+  float* JXL_RESTRICT const row1 = row_out[1];
+  float* JXL_RESTRICT const row2 = row_out[2];
+  Vec<DU16> out0, out1, out2;
+  for (size_t x = 0; x < simd_len; x += N) {
+    LoadInterleaved3(du16, row + 3 * x, out0, out1, out2);
+    Store(Mul(mul, ConvertTo(d, PromoteTo(du, out0))), d, row0 + x);
+    Store(Mul(mul, ConvertTo(d, PromoteTo(du, out1))), d, row1 + x);
+    Store(Mul(mul, ConvertTo(d, PromoteTo(du, out2))), d, row2 + x);
+  }
+  ReadUint16Row<3>(row_in, simd_len, len, row_out);
+}
+
+void ReadUint16RowInterleaved4(const uint8_t* row_in, size_t len,
+                               float* row_out[kMaxComponents]) {
+  const size_t N = Lanes(d);
+  const size_t simd_len = len & (~(N - 1));
+  const auto mul = Set(d, kMul16);
+  const uint16_t* JXL_RESTRICT const row =
+      reinterpret_cast<const uint16_t*>(row_in);
+  float* JXL_RESTRICT const row0 = row_out[0];
+  float* JXL_RESTRICT const row1 = row_out[1];
+  float* JXL_RESTRICT const row2 = row_out[2];
+  float* JXL_RESTRICT const row3 = row_out[3];
+  Vec<DU16> out0, out1, out2, out3;
+  for (size_t x = 0; x < simd_len; x += N) {
+    LoadInterleaved4(du16, row + 4 * x, out0, out1, out2, out3);
+    Store(Mul(mul, ConvertTo(d, PromoteTo(du, out0))), d, row0 + x);
+    Store(Mul(mul, ConvertTo(d, PromoteTo(du, out1))), d, row1 + x);
+    Store(Mul(mul, ConvertTo(d, PromoteTo(du, out2))), d, row2 + x);
+    Store(Mul(mul, ConvertTo(d, PromoteTo(du, out3))), d, row3 + x);
+  }
+  ReadUint16Row<4>(row_in, simd_len, len, row_out);
+}
+
+void ReadUint16RowSingleSwap(const uint8_t* row_in, size_t len,
+                             float* row_out[kMaxComponents]) {
+  ReadUint16Row<1, true>(row_in, 0, len, row_out);
+}
+
+void ReadUint16RowInterleaved2Swap(const uint8_t* row_in, size_t len,
+                                   float* row_out[kMaxComponents]) {
+  ReadUint16Row<2, true>(row_in, 0, len, row_out);
+}
+
+void ReadUint16RowInterleaved3Swap(const uint8_t* row_in, size_t len,
+                                   float* row_out[kMaxComponents]) {
+  ReadUint16Row<3, true>(row_in, 0, len, row_out);
+}
+
+void ReadUint16RowInterleaved4Swap(const uint8_t* row_in, size_t len,
+                                   float* row_out[kMaxComponents]) {
+  ReadUint16Row<4, true>(row_in, 0, len, row_out);
+}
+
+void ReadFloatRowSingle(const uint8_t* row_in, size_t len,
+                        float* row_out[kMaxComponents]) {
+  const size_t N = Lanes(d);
+  const size_t simd_len = len & (~(N - 1));
+  const auto mul = Set(d, kMulFloat);
+  const float* JXL_RESTRICT const row = reinterpret_cast<const float*>(row_in);
+  float* JXL_RESTRICT const row0 = row_out[0];
+  for (size_t x = 0; x < simd_len; x += N) {
+    Store(Mul(mul, LoadU(d, row + x)), d, row0 + x);
+  }
+  ReadFloatRow<1>(row_in, simd_len, len, row_out);
+}
+
+void ReadFloatRowInterleaved2(const uint8_t* row_in, size_t len,
+                              float* row_out[kMaxComponents]) {
+  const size_t N = Lanes(d);
+  const size_t simd_len = len & (~(N - 1));
+  const auto mul = Set(d, kMulFloat);
+  const float* JXL_RESTRICT const row = reinterpret_cast<const float*>(row_in);
+  float* JXL_RESTRICT const row0 = row_out[0];
+  float* JXL_RESTRICT const row1 = row_out[1];
+  Vec<D> out0, out1;
+  for (size_t x = 0; x < simd_len; x += N) {
+    LoadInterleaved2(d, row + 2 * x, out0, out1);
+    Store(Mul(mul, out0), d, row0 + x);
+    Store(Mul(mul, out1), d, row1 + x);
+  }
+  ReadFloatRow<2>(row_in, simd_len, len, row_out);
+}
+
+void ReadFloatRowInterleaved3(const uint8_t* row_in, size_t len,
+                              float* row_out[kMaxComponents]) {
+  const size_t N = Lanes(d);
+  const size_t simd_len = len & (~(N - 1));
+  const auto mul = Set(d, kMulFloat);
+  const float* JXL_RESTRICT const row = reinterpret_cast<const float*>(row_in);
+  float* JXL_RESTRICT const row0 = row_out[0];
+  float* JXL_RESTRICT const row1 = row_out[1];
+  float* JXL_RESTRICT const row2 = row_out[2];
+  Vec<D> out0, out1, out2;
+  for (size_t x = 0; x < simd_len; x += N) {
+    LoadInterleaved3(d, row + 3 * x, out0, out1, out2);
+    Store(Mul(mul, out0), d, row0 + x);
+    Store(Mul(mul, out1), d, row1 + x);
+    Store(Mul(mul, out2), d, row2 + x);
+  }
+  ReadFloatRow<3>(row_in, simd_len, len, row_out);
+}
+
+void ReadFloatRowInterleaved4(const uint8_t* row_in, size_t len,
+                              float* row_out[kMaxComponents]) {
+  const size_t N = Lanes(d);
+  const size_t simd_len = len & (~(N - 1));
+  const auto mul = Set(d, kMulFloat);
+  const float* JXL_RESTRICT const row = reinterpret_cast<const float*>(row_in);
+  float* JXL_RESTRICT const row0 = row_out[0];
+  float* JXL_RESTRICT const row1 = row_out[1];
+  float* JXL_RESTRICT const row2 = row_out[2];
+  float* JXL_RESTRICT const row3 = row_out[3];
+  Vec<D> out0, out1, out2, out3;
+  for (size_t x = 0; x < simd_len; x += N) {
+    LoadInterleaved4(d, row + 4 * x, out0, out1, out2, out3);
+    Store(Mul(mul, out0), d, row0 + x);
+    Store(Mul(mul, out1), d, row1 + x);
+    Store(Mul(mul, out2), d, row2 + x);
+    Store(Mul(mul, out3), d, row3 + x);
+  }
+  ReadFloatRow<4>(row_in, simd_len, len, row_out);
+}
+
+void ReadFloatRowSingleSwap(const uint8_t* row_in, size_t len,
+                            float* row_out[kMaxComponents]) {
+  ReadFloatRow<1, true>(row_in, 0, len, row_out);
+}
+
+void ReadFloatRowInterleaved2Swap(const uint8_t* row_in, size_t len,
+                                  float* row_out[kMaxComponents]) {
+  ReadFloatRow<2, true>(row_in, 0, len, row_out);
+}
+
+void ReadFloatRowInterleaved3Swap(const uint8_t* row_in, size_t len,
+                                  float* row_out[kMaxComponents]) {
+  ReadFloatRow<3, true>(row_in, 0, len, row_out);
+}
+
+void ReadFloatRowInterleaved4Swap(const uint8_t* row_in, size_t len,
+                                  float* row_out[kMaxComponents]) {
+  ReadFloatRow<4, true>(row_in, 0, len, row_out);
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jpegli
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jpegli {
+
+HWY_EXPORT(ReadUint8RowSingle);
+HWY_EXPORT(ReadUint8RowInterleaved2);
+HWY_EXPORT(ReadUint8RowInterleaved3);
+HWY_EXPORT(ReadUint8RowInterleaved4);
+HWY_EXPORT(ReadUint16RowSingle);
+HWY_EXPORT(ReadUint16RowInterleaved2);
+HWY_EXPORT(ReadUint16RowInterleaved3);
+HWY_EXPORT(ReadUint16RowInterleaved4);
+HWY_EXPORT(ReadUint16RowSingleSwap);
+HWY_EXPORT(ReadUint16RowInterleaved2Swap);
+HWY_EXPORT(ReadUint16RowInterleaved3Swap);
+HWY_EXPORT(ReadUint16RowInterleaved4Swap);
+HWY_EXPORT(ReadFloatRowSingle);
+HWY_EXPORT(ReadFloatRowInterleaved2);
+HWY_EXPORT(ReadFloatRowInterleaved3);
+HWY_EXPORT(ReadFloatRowInterleaved4);
+HWY_EXPORT(ReadFloatRowSingleSwap);
+HWY_EXPORT(ReadFloatRowInterleaved2Swap);
+HWY_EXPORT(ReadFloatRowInterleaved3Swap);
+HWY_EXPORT(ReadFloatRowInterleaved4Swap);
+
+void ChooseInputMethod(j_compress_ptr cinfo) {
+  jpeg_comp_master* m = cinfo->master;
+  bool swap_endianness =
+      (m->endianness == JPEGLI_LITTLE_ENDIAN && !IsLittleEndian()) ||
+      (m->endianness == JPEGLI_BIG_ENDIAN && IsLittleEndian());
+  m->input_method = nullptr;
+  if (m->data_type == JPEGLI_TYPE_UINT8) {
+    if (cinfo->raw_data_in || cinfo->input_components == 1) {
+      m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint8RowSingle);
+    } else if (cinfo->input_components == 2) {
+      m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint8RowInterleaved2);
+    } else if (cinfo->input_components == 3) {
+      m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint8RowInterleaved3);
+    } else if (cinfo->input_components == 4) {
+      m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint8RowInterleaved4);
+    }
+  } else if (m->data_type == JPEGLI_TYPE_UINT16 && !swap_endianness) {
+    if (cinfo->raw_data_in || cinfo->input_components == 1) {
+      m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint16RowSingle);
+    } else if (cinfo->input_components == 2) {
+      m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint16RowInterleaved2);
+    } else if (cinfo->input_components == 3) {
+      m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint16RowInterleaved3);
+    } else if (cinfo->input_components == 4) {
+      m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint16RowInterleaved4);
+    }
+  } else if (m->data_type == JPEGLI_TYPE_UINT16 && swap_endianness) {
+    if (cinfo->raw_data_in || cinfo->input_components == 1) {
+      m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint16RowSingleSwap);
+    } else if (cinfo->input_components == 2) {
+      m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint16RowInterleaved2Swap);
+    } else if (cinfo->input_components == 3) {
+      m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint16RowInterleaved3Swap);
+    } else if (cinfo->input_components == 4) {
+      m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint16RowInterleaved4Swap);
+    }
+  } else if (m->data_type == JPEGLI_TYPE_FLOAT && !swap_endianness) {
+    if (cinfo->raw_data_in || cinfo->input_components == 1) {
+      m->input_method = HWY_DYNAMIC_DISPATCH(ReadFloatRowSingle);
+    } else if (cinfo->input_components == 2) {
+      m->input_method = HWY_DYNAMIC_DISPATCH(ReadFloatRowInterleaved2);
+    } else if (cinfo->input_components == 3) {
+      m->input_method = HWY_DYNAMIC_DISPATCH(ReadFloatRowInterleaved3);
+    } else if (cinfo->input_components == 4) {
+      m->input_method = HWY_DYNAMIC_DISPATCH(ReadFloatRowInterleaved4);
+    }
+  } else if (m->data_type == JPEGLI_TYPE_FLOAT && swap_endianness) {
+    if (cinfo->raw_data_in || cinfo->input_components == 1) {
+      m->input_method = HWY_DYNAMIC_DISPATCH(ReadFloatRowSingleSwap);
+    } else if (cinfo->input_components == 2) {
+      m->input_method = HWY_DYNAMIC_DISPATCH(ReadFloatRowInterleaved2Swap);
+    } else if (cinfo->input_components == 3) {
+      m->input_method = HWY_DYNAMIC_DISPATCH(ReadFloatRowInterleaved3Swap);
+    } else if (cinfo->input_components == 4) {
+      m->input_method = HWY_DYNAMIC_DISPATCH(ReadFloatRowInterleaved4Swap);
+    }
+  }
+  if (m->input_method == nullptr) {
+    JPEGLI_ERROR("Could not find input method.");
+  }
+}
+
+}  // namespace jpegli
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jpegli/input.h b/third_party/jpeg-xl/lib/jpegli/input.h
new file mode 100644
index 0000000000..27b0e80fdb
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/input.h
@@ -0,0 +1,20 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JPEGLI_INPUT_H_
+#define LIB_JPEGLI_INPUT_H_
+
+/* clang-format off */
+#include <stdio.h>
+#include <jpeglib.h>
+/* clang-format on */
+
+namespace jpegli {
+
+void ChooseInputMethod(j_compress_ptr cinfo);
+
+}  // namespace jpegli
+
+#endif  // LIB_JPEGLI_INPUT_H_
diff --git a/third_party/jpeg-xl/lib/jpegli/input_suspension_test.cc b/third_party/jpeg-xl/lib/jpegli/input_suspension_test.cc
new file mode 100644
index 0000000000..4914e5e34b
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/input_suspension_test.cc
@@ -0,0 +1,612 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <stdio.h>
+
+#include <cmath>
+#include <vector>
+
+#include "lib/jpegli/decode.h"
+#include "lib/jpegli/test_utils.h"
+#include "lib/jpegli/testing.h"
+#include "lib/jxl/base/byte_order.h"
+#include "lib/jxl/base/file_io.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/sanitizers.h"
+
+namespace jpegli {
+namespace {
+
+static constexpr uint8_t kFakeEoiMarker[2] = {0xff, 0xd9};
+
+struct SourceManager {
+  SourceManager(const uint8_t* data, size_t len, size_t max_chunk_size,
+                bool is_partial_file)
+      : data_(data),
+        len_(len),
+        pos_(0),
+        max_chunk_size_(max_chunk_size),
+        is_partial_file_(is_partial_file) {
+    pub_.init_source = init_source;
+    pub_.fill_input_buffer = fill_input_buffer;
+    pub_.next_input_byte = nullptr;
+    pub_.bytes_in_buffer = 0;
+    pub_.skip_input_data = skip_input_data;
+    pub_.resync_to_restart = jpegli_resync_to_restart;
+    pub_.term_source = term_source;
+    if (max_chunk_size_ == 0) max_chunk_size_ = len;
+  }
+
+  ~SourceManager() {
+    EXPECT_EQ(0, pub_.bytes_in_buffer);
+    if (!is_partial_file_) {
+      EXPECT_EQ(len_, pos_);
+    }
+  }
+
+  bool LoadNextChunk() {
+    if (pos_ >= len_ && !is_partial_file_) {
+      return false;
+    }
+    if (pub_.bytes_in_buffer > 0) {
+      EXPECT_LE(pub_.bytes_in_buffer, buffer_.size());
+      memmove(&buffer_[0], pub_.next_input_byte, pub_.bytes_in_buffer);
+    }
+    size_t chunk_size =
+        pos_ < len_ ? std::min(len_ - pos_, max_chunk_size_) : 2;
+    buffer_.resize(pub_.bytes_in_buffer + chunk_size);
+    memcpy(&buffer_[pub_.bytes_in_buffer],
+           pos_ < len_ ? data_ + pos_ : kFakeEoiMarker, chunk_size);
+    pub_.next_input_byte = &buffer_[0];
+    pub_.bytes_in_buffer += chunk_size;
+    pos_ += chunk_size;
+    return true;
+  }
+
+ private:
+  jpeg_source_mgr pub_;
+  std::vector<uint8_t> buffer_;
+  const uint8_t* data_;
+  size_t len_;
+  size_t pos_;
+  size_t max_chunk_size_;
+  bool is_partial_file_;
+
+  static void init_source(j_decompress_ptr cinfo) {
+    auto src = reinterpret_cast<SourceManager*>(cinfo->src);
+    src->pub_.next_input_byte = nullptr;
+    src->pub_.bytes_in_buffer = 0;
+  }
+
+  static boolean fill_input_buffer(j_decompress_ptr cinfo) { return FALSE; }
+
+  static void skip_input_data(j_decompress_ptr cinfo, long num_bytes) {
+    auto src = reinterpret_cast<SourceManager*>(cinfo->src);
+    if (num_bytes <= 0) {
+      return;
+    }
+    if (src->pub_.bytes_in_buffer >= static_cast<size_t>(num_bytes)) {
+      src->pub_.bytes_in_buffer -= num_bytes;
+      src->pub_.next_input_byte += num_bytes;
+    } else {
+      src->pos_ += num_bytes - src->pub_.bytes_in_buffer;
+      src->pub_.bytes_in_buffer = 0;
+    }
+  }
+
+  static void term_source(j_decompress_ptr cinfo) {}
+};
+
+uint8_t markers_seen[kMarkerSequenceLen];
+size_t num_markers_seen = 0;
+
+uint8_t get_next_byte(j_decompress_ptr cinfo) {
+  cinfo->src->bytes_in_buffer--;
+  return *cinfo->src->next_input_byte++;
+}
+
+boolean test_marker_processor(j_decompress_ptr cinfo) {
+  markers_seen[num_markers_seen] = cinfo->unread_marker;
+  if (cinfo->src->bytes_in_buffer < 2) {
+    return FALSE;
+  }
+  size_t marker_len = (get_next_byte(cinfo) << 8) + get_next_byte(cinfo);
+  EXPECT_EQ(2 + ((num_markers_seen + 2) % sizeof(kMarkerData)), marker_len);
+  if (marker_len > 2) {
+    (*cinfo->src->skip_input_data)(cinfo, marker_len - 2);
+  }
+  ++num_markers_seen;
+  return TRUE;
+}
+
+void ReadOutputImage(const DecompressParams& dparams, j_decompress_ptr cinfo,
+                     SourceManager* src, TestImage* output) {
+  output->ysize = cinfo->output_height;
+  output->xsize = cinfo->output_width;
+  output->components = cinfo->num_components;
+  if (cinfo->raw_data_out) {
+    output->color_space = cinfo->jpeg_color_space;
+    for (int c = 0; c < cinfo->num_components; ++c) {
+      size_t xsize = cinfo->comp_info[c].width_in_blocks * DCTSIZE;
+      size_t ysize = cinfo->comp_info[c].height_in_blocks * DCTSIZE;
+      std::vector<uint8_t> plane(ysize * xsize);
+      output->raw_data.emplace_back(std::move(plane));
+    }
+  } else {
+    output->color_space = cinfo->out_color_space;
+    output->AllocatePixels();
+  }
+  size_t total_output_lines = 0;
+  while (cinfo->output_scanline < cinfo->output_height) {
+    size_t max_lines;
+    size_t num_output_lines;
+    if (cinfo->raw_data_out) {
+      size_t iMCU_height = cinfo->max_v_samp_factor * DCTSIZE;
+      EXPECT_EQ(cinfo->output_scanline, cinfo->output_iMCU_row * iMCU_height);
+      max_lines = iMCU_height;
+      std::vector<std::vector<JSAMPROW>> rowdata(cinfo->num_components);
+      std::vector<JSAMPARRAY> data(cinfo->num_components);
+      for (int c = 0; c < cinfo->num_components; ++c) {
+        size_t xsize = cinfo->comp_info[c].width_in_blocks * DCTSIZE;
+        size_t ysize = cinfo->comp_info[c].height_in_blocks * DCTSIZE;
+        size_t num_lines = cinfo->comp_info[c].v_samp_factor * DCTSIZE;
+        rowdata[c].resize(num_lines);
+        size_t y0 = cinfo->output_iMCU_row * num_lines;
+        for (size_t i = 0; i < num_lines; ++i) {
+          rowdata[c][i] =
+              y0 + i < ysize ? &output->raw_data[c][(y0 + i) * xsize] : nullptr;
+        }
+        data[c] = &rowdata[c][0];
+      }
+      while ((num_output_lines =
+                  jpegli_read_raw_data(cinfo, &data[0], max_lines)) == 0) {
+        JXL_CHECK(src && src->LoadNextChunk());
+      }
+    } else {
+      size_t max_output_lines = dparams.max_output_lines;
+      if (max_output_lines == 0) max_output_lines = cinfo->output_height;
+      size_t lines_left = cinfo->output_height - cinfo->output_scanline;
+      max_lines = std::min<size_t>(max_output_lines, lines_left);
+      size_t stride = cinfo->output_width * cinfo->num_components;
+      std::vector<JSAMPROW> scanlines(max_lines);
+      for (size_t i = 0; i < max_lines; ++i) {
+        size_t yidx = cinfo->output_scanline + i;
+        scanlines[i] = &output->pixels[yidx * stride];
+      }
+      while ((num_output_lines = jpegli_read_scanlines(cinfo, &scanlines[0],
+                                                       max_lines)) == 0) {
+        JXL_CHECK(src && src->LoadNextChunk());
+      }
+    }
+    total_output_lines += num_output_lines;
+    EXPECT_EQ(total_output_lines, cinfo->output_scanline);
+    if (num_output_lines < max_lines) {
+      JXL_CHECK(src && src->LoadNextChunk());
+    }
+  }
+}
+
+struct TestConfig {
+  std::string fn;
+  std::string fn_desc;
+  TestImage input;
+  CompressParams jparams;
+  DecompressParams dparams;
+  float max_rms_dist = 1.0f;
+};
+
+std::vector<uint8_t> GetTestJpegData(TestConfig& config) {
+  if (!config.fn.empty()) {
+    return ReadTestData(config.fn.c_str());
+  }
+  GeneratePixels(&config.input);
+  std::vector<uint8_t> compressed;
+  JXL_CHECK(EncodeWithJpegli(config.input, config.jparams, &compressed));
+  return compressed;
+}
+
+bool IsSequential(const TestConfig& config) {
+  if (!config.fn.empty()) {
+    return config.fn_desc.find("PROGR") == std::string::npos;
+  }
+  return config.jparams.progressive_mode <= 0;
+}
+
+class InputSuspensionTestParam : public ::testing::TestWithParam<TestConfig> {};
+
+TEST_P(InputSuspensionTestParam, InputOutputLockStepNonBuffered) {
+  TestConfig config = GetParam();
+  const DecompressParams& dparams = config.dparams;
+  std::vector<uint8_t> compressed = GetTestJpegData(config);
+  bool is_partial = config.dparams.size_factor < 1.0f;
+  if (is_partial) {
+    compressed.resize(compressed.size() * config.dparams.size_factor);
+  }
+  SourceManager src(compressed.data(), compressed.size(), dparams.chunk_size,
+                    is_partial);
+  TestImage output0;
+  jpeg_decompress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_decompress(&cinfo);
+    cinfo.src = reinterpret_cast<jpeg_source_mgr*>(&src);
+
+    if (config.jparams.add_marker) {
+      jpegli_save_markers(&cinfo, kSpecialMarker0, 0xffff);
+      jpegli_save_markers(&cinfo, kSpecialMarker1, 0xffff);
+      num_markers_seen = 0;
+      jpegli_set_marker_processor(&cinfo, 0xe6, test_marker_processor);
+      jpegli_set_marker_processor(&cinfo, 0xe7, test_marker_processor);
+      jpegli_set_marker_processor(&cinfo, 0xe8, test_marker_processor);
+    }
+    while (jpegli_read_header(&cinfo, TRUE) == JPEG_SUSPENDED) {
+      JXL_CHECK(src.LoadNextChunk());
+    }
+    SetDecompressParams(dparams, &cinfo, true);
+    if (config.jparams.add_marker) {
+      EXPECT_EQ(num_markers_seen, kMarkerSequenceLen);
+      EXPECT_EQ(0, memcmp(markers_seen, kMarkerSequence, num_markers_seen));
+    }
+    VerifyHeader(config.jparams, &cinfo);
+    cinfo.raw_data_out = dparams.output_mode == RAW_DATA;
+
+    if (dparams.output_mode == COEFFICIENTS) {
+      jvirt_barray_ptr* coef_arrays;
+      while ((coef_arrays = jpegli_read_coefficients(&cinfo)) == nullptr) {
+        JXL_CHECK(src.LoadNextChunk());
+      }
+      CopyCoefficients(&cinfo, coef_arrays, &output0);
+    } else {
+      while (!jpegli_start_decompress(&cinfo)) {
+        JXL_CHECK(src.LoadNextChunk());
+      }
+      ReadOutputImage(dparams, &cinfo, &src, &output0);
+    }
+
+    while (!jpegli_finish_decompress(&cinfo)) {
+      JXL_CHECK(src.LoadNextChunk());
+    }
+    return true;
+  };
+  ASSERT_TRUE(try_catch_block());
+  jpegli_destroy_decompress(&cinfo);
+
+  TestImage output1;
+  DecodeWithLibjpeg(config.jparams, dparams, compressed, &output1);
+  VerifyOutputImage(output1, output0, config.max_rms_dist);
+}
+
+TEST_P(InputSuspensionTestParam, InputOutputLockStepBuffered) {
+  TestConfig config = GetParam();
+  if (config.jparams.add_marker) return;
+  const DecompressParams& dparams = config.dparams;
+  std::vector<uint8_t> compressed = GetTestJpegData(config);
+  bool is_partial = config.dparams.size_factor < 1.0f;
+  if (is_partial) {
+    compressed.resize(compressed.size() * config.dparams.size_factor);
+  }
+  SourceManager src(compressed.data(), compressed.size(), dparams.chunk_size,
+                    is_partial);
+  std::vector<TestImage> output_progression0;
+  jpeg_decompress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_decompress(&cinfo);
+
+    cinfo.src = reinterpret_cast<jpeg_source_mgr*>(&src);
+
+    while (jpegli_read_header(&cinfo, TRUE) == JPEG_SUSPENDED) {
+      JXL_CHECK(src.LoadNextChunk());
+    }
+    SetDecompressParams(dparams, &cinfo, true);
+
+    cinfo.buffered_image = TRUE;
+    cinfo.raw_data_out = dparams.output_mode == RAW_DATA;
+
+    EXPECT_TRUE(jpegli_start_decompress(&cinfo));
+    EXPECT_FALSE(jpegli_input_complete(&cinfo));
+    EXPECT_EQ(0, cinfo.output_scan_number);
+
+    int sos_marker_cnt = 1;  // read_header reads the first SOS marker
+    while (!jpegli_input_complete(&cinfo)) {
+      EXPECT_EQ(cinfo.input_scan_number, sos_marker_cnt);
+      EXPECT_TRUE(jpegli_start_output(&cinfo, cinfo.input_scan_number));
+      // start output sets output_scan_number, but does not change
+      // input_scan_number
+      EXPECT_EQ(cinfo.output_scan_number, cinfo.input_scan_number);
+      EXPECT_EQ(cinfo.input_scan_number, sos_marker_cnt);
+      TestImage output;
+      ReadOutputImage(dparams, &cinfo, &src, &output);
+      output_progression0.emplace_back(std::move(output));
+      // read scanlines/read raw data does not change input/output scan number
+      EXPECT_EQ(cinfo.input_scan_number, sos_marker_cnt);
+      EXPECT_EQ(cinfo.output_scan_number, cinfo.input_scan_number);
+      while (!jpegli_finish_output(&cinfo)) {
+        JXL_CHECK(src.LoadNextChunk());
+      }
+      ++sos_marker_cnt;  // finish output reads the next SOS marker or EOI
+      if (dparams.output_mode == COEFFICIENTS) {
+        jvirt_barray_ptr* coef_arrays = jpegli_read_coefficients(&cinfo);
+        JXL_CHECK(coef_arrays != nullptr);
+        CopyCoefficients(&cinfo, coef_arrays, &output_progression0.back());
+      }
+    }
+
+    EXPECT_TRUE(jpegli_finish_decompress(&cinfo));
+    return true;
+  };
+  ASSERT_TRUE(try_catch_block());
+  jpegli_destroy_decompress(&cinfo);
+
+  std::vector<TestImage> output_progression1;
+  DecodeAllScansWithLibjpeg(config.jparams, dparams, compressed,
+                            &output_progression1);
+  ASSERT_EQ(output_progression0.size(), output_progression1.size());
+  for (size_t i = 0; i < output_progression0.size(); ++i) {
+    const TestImage& output = output_progression0[i];
+    const TestImage& expected = output_progression1[i];
+    VerifyOutputImage(expected, output, config.max_rms_dist);
+  }
+}
+
+TEST_P(InputSuspensionTestParam, PreConsumeInputBuffered) {
+  TestConfig config = GetParam();
+  if (config.jparams.add_marker) return;
+  const DecompressParams& dparams = config.dparams;
+  std::vector<uint8_t> compressed = GetTestJpegData(config);
+  bool is_partial = config.dparams.size_factor < 1.0f;
+  if (is_partial) {
+    compressed.resize(compressed.size() * config.dparams.size_factor);
+  }
+  std::vector<TestImage> output_progression1;
+  DecodeAllScansWithLibjpeg(config.jparams, dparams, compressed,
+                            &output_progression1);
+  SourceManager src(compressed.data(), compressed.size(), dparams.chunk_size,
+                    is_partial);
+  TestImage output0;
+  jpeg_decompress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_decompress(&cinfo);
+    cinfo.src = reinterpret_cast<jpeg_source_mgr*>(&src);
+
+    int status;
+    while ((status = jpegli_consume_input(&cinfo)) != JPEG_REACHED_SOS) {
+      if (status == JPEG_SUSPENDED) {
+        JXL_CHECK(src.LoadNextChunk());
+      }
+    }
+    EXPECT_EQ(JPEG_REACHED_SOS, jpegli_consume_input(&cinfo));
+    cinfo.buffered_image = TRUE;
+    cinfo.raw_data_out = dparams.output_mode == RAW_DATA;
+    cinfo.do_block_smoothing = dparams.do_block_smoothing;
+
+    EXPECT_TRUE(jpegli_start_decompress(&cinfo));
+    EXPECT_FALSE(jpegli_input_complete(&cinfo));
+    EXPECT_EQ(1, cinfo.input_scan_number);
+    EXPECT_EQ(0, cinfo.output_scan_number);
+
+    while ((status = jpegli_consume_input(&cinfo)) != JPEG_REACHED_EOI) {
+      if (status == JPEG_SUSPENDED) {
+        JXL_CHECK(src.LoadNextChunk());
+      }
+    }
+
+    EXPECT_TRUE(jpegli_input_complete(&cinfo));
+    EXPECT_EQ(output_progression1.size(), cinfo.input_scan_number);
+    EXPECT_EQ(0, cinfo.output_scan_number);
+
+    EXPECT_TRUE(jpegli_start_output(&cinfo, cinfo.input_scan_number));
+    EXPECT_EQ(output_progression1.size(), cinfo.input_scan_number);
+    EXPECT_EQ(cinfo.output_scan_number, cinfo.input_scan_number);
+
+    ReadOutputImage(dparams, &cinfo, nullptr, &output0);
+    EXPECT_EQ(output_progression1.size(), cinfo.input_scan_number);
+    EXPECT_EQ(cinfo.output_scan_number, cinfo.input_scan_number);
+
+    EXPECT_TRUE(jpegli_finish_output(&cinfo));
+    if (dparams.output_mode == COEFFICIENTS) {
+      jvirt_barray_ptr* coef_arrays = jpegli_read_coefficients(&cinfo);
+      JXL_CHECK(coef_arrays != nullptr);
+      CopyCoefficients(&cinfo, coef_arrays, &output0);
+    }
+    EXPECT_TRUE(jpegli_finish_decompress(&cinfo));
+    return true;
+  };
+  ASSERT_TRUE(try_catch_block());
+  jpegli_destroy_decompress(&cinfo);
+
+  VerifyOutputImage(output_progression1.back(), output0, config.max_rms_dist);
+}
+
+TEST_P(InputSuspensionTestParam, PreConsumeInputNonBuffered) {
+  TestConfig config = GetParam();
+  if (config.jparams.add_marker || IsSequential(config)) return;
+  const DecompressParams& dparams = config.dparams;
+  std::vector<uint8_t> compressed = GetTestJpegData(config);
+  bool is_partial = config.dparams.size_factor < 1.0f;
+  if (is_partial) {
+    compressed.resize(compressed.size() * config.dparams.size_factor);
+  }
+  SourceManager src(compressed.data(), compressed.size(), dparams.chunk_size,
+                    is_partial);
+  TestImage output0;
+  jpeg_decompress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_decompress(&cinfo);
+    cinfo.src = reinterpret_cast<jpeg_source_mgr*>(&src);
+
+    int status;
+    while ((status = jpegli_consume_input(&cinfo)) != JPEG_REACHED_SOS) {
+      if (status == JPEG_SUSPENDED) {
+        JXL_CHECK(src.LoadNextChunk());
+      }
+    }
+    EXPECT_EQ(JPEG_REACHED_SOS, jpegli_consume_input(&cinfo));
+    cinfo.raw_data_out = dparams.output_mode == RAW_DATA;
+    cinfo.do_block_smoothing = dparams.do_block_smoothing;
+
+    if (dparams.output_mode == COEFFICIENTS) {
+      jpegli_read_coefficients(&cinfo);
+    } else {
+      while (!jpegli_start_decompress(&cinfo)) {
+        JXL_CHECK(src.LoadNextChunk());
+      }
+    }
+
+    while ((status = jpegli_consume_input(&cinfo)) != JPEG_REACHED_EOI) {
+      if (status == JPEG_SUSPENDED) {
+        JXL_CHECK(src.LoadNextChunk());
+      }
+    }
+
+    if (dparams.output_mode == COEFFICIENTS) {
+      jvirt_barray_ptr* coef_arrays = jpegli_read_coefficients(&cinfo);
+      JXL_CHECK(coef_arrays != nullptr);
+      CopyCoefficients(&cinfo, coef_arrays, &output0);
+    } else {
+      ReadOutputImage(dparams, &cinfo, nullptr, &output0);
+    }
+
+    EXPECT_TRUE(jpegli_finish_decompress(&cinfo));
+    return true;
+  };
+  ASSERT_TRUE(try_catch_block());
+  jpegli_destroy_decompress(&cinfo);
+
+  TestImage output1;
+  DecodeWithLibjpeg(config.jparams, dparams, compressed, &output1);
+  VerifyOutputImage(output1, output0, config.max_rms_dist);
+}
+
+std::vector<TestConfig> GenerateTests() {
+  std::vector<TestConfig> all_tests;
+  std::vector<std::pair<std::string, std::string>> testfiles({
+      {"jxl/flower/flower.png.im_q85_444.jpg", "Q85YUV444"},
+      {"jxl/flower/flower.png.im_q85_420_R13B.jpg", "Q85YUV420R13B"},
+      {"jxl/flower/flower.png.im_q85_420_progr.jpg", "Q85YUV420PROGR"},
+  });
+  for (const auto& it : testfiles) {
+    for (size_t chunk_size : {1, 64, 65536}) {
+      for (size_t max_output_lines : {0, 1, 8, 16}) {
+        TestConfig config;
+        config.fn = it.first;
+        config.fn_desc = it.second;
+        config.dparams.chunk_size = chunk_size;
+        config.dparams.max_output_lines = max_output_lines;
+        all_tests.push_back(config);
+        if (max_output_lines == 16) {
+          config.dparams.output_mode = RAW_DATA;
+          all_tests.push_back(config);
+          config.dparams.output_mode = COEFFICIENTS;
+          all_tests.push_back(config);
+        }
+      }
+    }
+  }
+  for (size_t r : {1, 17, 1024}) {
+    for (size_t chunk_size : {1, 65536}) {
+      TestConfig config;
+      config.dparams.chunk_size = chunk_size;
+      config.jparams.progressive_mode = 2;
+      config.jparams.restart_interval = r;
+      all_tests.push_back(config);
+    }
+  }
+  for (size_t chunk_size : {1, 4, 1024}) {
+    TestConfig config;
+    config.input.xsize = 256;
+    config.input.ysize = 256;
+    config.dparams.chunk_size = chunk_size;
+    config.jparams.add_marker = true;
+    all_tests.push_back(config);
+  }
+  // Tests for partial input.
+  for (float size_factor : {0.1f, 0.33f, 0.5f, 0.75f}) {
+    for (int progr : {0, 1, 3}) {
+      for (int samp : {1, 2}) {
+        for (JpegIOMode output_mode : {PIXELS, RAW_DATA}) {
+          TestConfig config;
+          config.input.xsize = 517;
+          config.input.ysize = 523;
+          config.jparams.h_sampling = {samp, 1, 1};
+          config.jparams.v_sampling = {samp, 1, 1};
+          config.jparams.progressive_mode = progr;
+          config.dparams.size_factor = size_factor;
+          config.dparams.output_mode = output_mode;
+          // The last partially available block can behave differently.
+          // TODO(szabadka) Figure out if we can make the behaviour more
+          // similar.
+          config.max_rms_dist = samp == 1 ? 1.75f : 3.0f;
+          all_tests.push_back(config);
+        }
+      }
+    }
+  }
+  // Tests for block smoothing.
+  for (float size_factor : {0.1f, 0.33f, 0.5f, 0.75f, 1.0f}) {
+    for (int samp : {1, 2}) {
+      TestConfig config;
+      config.input.xsize = 517;
+      config.input.ysize = 523;
+      config.jparams.h_sampling = {samp, 1, 1};
+      config.jparams.v_sampling = {samp, 1, 1};
+      config.jparams.progressive_mode = 2;
+      config.dparams.size_factor = size_factor;
+      config.dparams.do_block_smoothing = true;
+      // libjpeg does smoothing for incomplete scans differently at
+      // the border between current and previous scans.
+      config.max_rms_dist = 8.0f;
+      all_tests.push_back(config);
+    }
+  }
+  return all_tests;
+}
+
+std::ostream& operator<<(std::ostream& os, const TestConfig& c) {
+  if (!c.fn.empty()) {
+    os << c.fn_desc;
+  } else {
+    os << c.input;
+  }
+  os << c.jparams;
+  if (c.dparams.chunk_size == 0) {
+    os << "CompleteInput";
+  } else {
+    os << "InputChunks" << c.dparams.chunk_size;
+  }
+  if (c.dparams.size_factor < 1.0f) {
+    os << "Partial" << static_cast<int>(c.dparams.size_factor * 100) << "p";
+  }
+  if (c.dparams.max_output_lines == 0) {
+    os << "CompleteOutput";
+  } else {
+    os << "OutputLines" << c.dparams.max_output_lines;
+  }
+  if (c.dparams.output_mode == RAW_DATA) {
+    os << "RawDataOut";
+  } else if (c.dparams.output_mode == COEFFICIENTS) {
+    os << "CoeffsOut";
+  }
+  if (c.dparams.do_block_smoothing) {
+    os << "BlockSmoothing";
+  }
+  return os;
+}
+
+std::string TestDescription(
+    const testing::TestParamInfo<InputSuspensionTestParam::ParamType>& info) {
+  std::stringstream name;
+  name << info.param;
+  return name.str();
+}
+
+JPEGLI_INSTANTIATE_TEST_SUITE_P(InputSuspensionTest, InputSuspensionTestParam,
+                                testing::ValuesIn(GenerateTests()),
+                                TestDescription);
+
+}  // namespace
+}  // namespace jpegli
diff --git a/third_party/jpeg-xl/lib/jpegli/jpeg.version.62 b/third_party/jpeg-xl/lib/jpegli/jpeg.version.62
new file mode 100644
index 0000000000..3a8d1f5ec5
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/jpeg.version.62
@@ -0,0 +1,11 @@
+LIBJPEG_6.2 {
+  global:
+    jpeg*;
+};
+
+LIBJPEGTURBO_6.2 {
+  global:
+    jpeg_mem_src*;
+    jpeg_mem_dest*;
+    tj*;
+};
+\ No newline at end of file
diff --git a/third_party/jpeg-xl/lib/jpegli/jpeg.version.8 b/third_party/jpeg-xl/lib/jpegli/jpeg.version.8
new file mode 100644
index 0000000000..aa891f8571
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/jpeg.version.8
@@ -0,0 +1,9 @@
+LIBJPEG_8.0 {
+  global:
+    jpeg*;
+};
+
+LIBJPEGTURBO_8.0 {
+  global:
+    tj*;
+};
diff --git a/third_party/jpeg-xl/lib/jpegli/libjpeg_wrapper.cc b/third_party/jpeg-xl/lib/jpegli/libjpeg_wrapper.cc
new file mode 100644
index 0000000000..ef5ef224d3
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/libjpeg_wrapper.cc
@@ -0,0 +1,260 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+//
+// This file contains wrapper-functions that are used to build the libjpeg.so
+// shared library that is API- and ABI-compatible with libjpeg-turbo's version
+// of libjpeg.so.
+
+/* clang-format off */
+#include <stdio.h>
+#include <jpeglib.h>
+/* clang-format on */
+
+#include "lib/jpegli/common.h"
+#include "lib/jpegli/decode.h"
+#include "lib/jpegli/encode.h"
+#include "lib/jpegli/error.h"
+
+struct jpeg_error_mgr *jpeg_std_error(struct jpeg_error_mgr *err) {
+  return jpegli_std_error(err);
+}
+
+void jpeg_abort(j_common_ptr cinfo) { jpegli_abort(cinfo); }
+
+void jpeg_destroy(j_common_ptr cinfo) { jpegli_destroy(cinfo); }
+
+JQUANT_TBL *jpeg_alloc_quant_table(j_common_ptr cinfo) {
+  return jpegli_alloc_quant_table(cinfo);
+}
+
+JHUFF_TBL *jpeg_alloc_huff_table(j_common_ptr cinfo) {
+  return jpegli_alloc_huff_table(cinfo);
+}
+
+void jpeg_CreateDecompress(j_decompress_ptr cinfo, int version,
+                           size_t structsize) {
+  jpegli_CreateDecompress(cinfo, version, structsize);
+}
+
+void jpeg_stdio_src(j_decompress_ptr cinfo, FILE *infile) {
+  jpegli_stdio_src(cinfo, infile);
+}
+
+void jpeg_mem_src(j_decompress_ptr cinfo, const unsigned char *inbuffer,
+                  unsigned long insize) {
+  jpegli_mem_src(cinfo, inbuffer, insize);
+}
+
+int jpeg_read_header(j_decompress_ptr cinfo, boolean require_image) {
+  return jpegli_read_header(cinfo, require_image);
+}
+
+boolean jpeg_start_decompress(j_decompress_ptr cinfo) {
+  return jpegli_start_decompress(cinfo);
+}
+
+JDIMENSION jpeg_read_scanlines(j_decompress_ptr cinfo, JSAMPARRAY scanlines,
+                               JDIMENSION max_lines) {
+  return jpegli_read_scanlines(cinfo, scanlines, max_lines);
+}
+
+JDIMENSION jpeg_skip_scanlines(j_decompress_ptr cinfo, JDIMENSION num_lines) {
+  return jpegli_skip_scanlines(cinfo, num_lines);
+}
+
+void jpeg_crop_scanline(j_decompress_ptr cinfo, JDIMENSION *xoffset,
+                        JDIMENSION *width) {
+  jpegli_crop_scanline(cinfo, xoffset, width);
+}
+
+boolean jpeg_finish_decompress(j_decompress_ptr cinfo) {
+  return jpegli_finish_decompress(cinfo);
+}
+
+JDIMENSION jpeg_read_raw_data(j_decompress_ptr cinfo, JSAMPIMAGE data,
+                              JDIMENSION max_lines) {
+  return jpegli_read_raw_data(cinfo, data, max_lines);
+}
+
+jvirt_barray_ptr *jpeg_read_coefficients(j_decompress_ptr cinfo) {
+  return jpegli_read_coefficients(cinfo);
+}
+
+boolean jpeg_has_multiple_scans(j_decompress_ptr cinfo) {
+  return jpegli_has_multiple_scans(cinfo);
+}
+
+boolean jpeg_start_output(j_decompress_ptr cinfo, int scan_number) {
+  return jpegli_start_output(cinfo, scan_number);
+}
+
+boolean jpeg_finish_output(j_decompress_ptr cinfo) {
+  return jpegli_finish_output(cinfo);
+}
+
+boolean jpeg_input_complete(j_decompress_ptr cinfo) {
+  return jpegli_input_complete(cinfo);
+}
+
+int jpeg_consume_input(j_decompress_ptr cinfo) {
+  return jpegli_consume_input(cinfo);
+}
+
+#if JPEG_LIB_VERSION >= 80
+void jpeg_core_output_dimensions(j_decompress_ptr cinfo) {
+  jpegli_core_output_dimensions(cinfo);
+}
+#endif
+void jpeg_calc_output_dimensions(j_decompress_ptr cinfo) {
+  jpegli_calc_output_dimensions(cinfo);
+}
+
+void jpeg_save_markers(j_decompress_ptr cinfo, int marker_code,
+                       unsigned int length_limit) {
+  jpegli_save_markers(cinfo, marker_code, length_limit);
+}
+
+void jpeg_set_marker_processor(j_decompress_ptr cinfo, int marker_code,
+                               jpeg_marker_parser_method routine) {
+  jpegli_set_marker_processor(cinfo, marker_code, routine);
+}
+
+boolean jpeg_read_icc_profile(j_decompress_ptr cinfo, JOCTET **icc_data_ptr,
+                              unsigned int *icc_data_len) {
+  return jpegli_read_icc_profile(cinfo, icc_data_ptr, icc_data_len);
+}
+
+void jpeg_abort_decompress(j_decompress_ptr cinfo) {
+  return jpegli_abort_decompress(cinfo);
+}
+
+void jpeg_destroy_decompress(j_decompress_ptr cinfo) {
+  return jpegli_destroy_decompress(cinfo);
+}
+
+void jpeg_CreateCompress(j_compress_ptr cinfo, int version, size_t structsize) {
+  jpegli_CreateCompress(cinfo, version, structsize);
+}
+
+void jpeg_stdio_dest(j_compress_ptr cinfo, FILE *outfile) {
+  jpegli_stdio_dest(cinfo, outfile);
+}
+
+void jpeg_mem_dest(j_compress_ptr cinfo, unsigned char **outbuffer,
+                   unsigned long *outsize) {
+  jpegli_mem_dest(cinfo, outbuffer, outsize);
+}
+
+void jpeg_set_defaults(j_compress_ptr cinfo) { jpegli_set_defaults(cinfo); }
+
+void jpeg_default_colorspace(j_compress_ptr cinfo) {
+  jpegli_default_colorspace(cinfo);
+}
+
+void jpeg_set_colorspace(j_compress_ptr cinfo, J_COLOR_SPACE colorspace) {
+  jpegli_set_colorspace(cinfo, colorspace);
+}
+
+void jpeg_set_quality(j_compress_ptr cinfo, int quality,
+                      boolean force_baseline) {
+  jpegli_set_quality(cinfo, quality, force_baseline);
+}
+
+void jpeg_set_linear_quality(j_compress_ptr cinfo, int scale_factor,
+                             boolean force_baseline) {
+  jpegli_set_linear_quality(cinfo, scale_factor, force_baseline);
+}
+
+#if JPEG_LIB_VERSION >= 70
+void jpeg_default_qtables(j_compress_ptr cinfo, boolean force_baseline) {
+  jpegli_default_qtables(cinfo, force_baseline);
+}
+#endif
+
+int jpeg_quality_scaling(int quality) {
+  return jpegli_quality_scaling(quality);
+}
+
+void jpeg_add_quant_table(j_compress_ptr cinfo, int which_tbl,
+                          const unsigned int *basic_table, int scale_factor,
+                          boolean force_baseline) {
+  jpegli_add_quant_table(cinfo, which_tbl, basic_table, scale_factor,
+                         force_baseline);
+}
+
+void jpeg_simple_progression(j_compress_ptr cinfo) {
+  jpegli_simple_progression(cinfo);
+}
+
+void jpeg_suppress_tables(j_compress_ptr cinfo, boolean suppress) {
+  jpegli_suppress_tables(cinfo, suppress);
+}
+
+#if JPEG_LIB_VERSION >= 70
+void jpeg_calc_jpeg_dimensions(j_compress_ptr cinfo) {
+  jpegli_calc_jpeg_dimensions(cinfo);
+}
+#endif
+
+void jpeg_copy_critical_parameters(j_decompress_ptr srcinfo,
+                                   j_compress_ptr dstinfo) {
+  jpegli_copy_critical_parameters(srcinfo, dstinfo);
+}
+
+void jpeg_write_m_header(j_compress_ptr cinfo, int marker,
+                         unsigned int datalen) {
+  jpegli_write_m_header(cinfo, marker, datalen);
+}
+
+void jpeg_write_m_byte(j_compress_ptr cinfo, int val) {
+  jpegli_write_m_byte(cinfo, val);
+}
+
+void jpeg_write_marker(j_compress_ptr cinfo, int marker, const JOCTET *dataptr,
+                       unsigned int datalen) {
+  jpegli_write_marker(cinfo, marker, dataptr, datalen);
+}
+
+void jpeg_write_icc_profile(j_compress_ptr cinfo, const JOCTET *icc_data_ptr,
+                            unsigned int icc_data_len) {
+  jpegli_write_icc_profile(cinfo, icc_data_ptr, icc_data_len);
+}
+
+void jpeg_start_compress(j_compress_ptr cinfo, boolean write_all_tables) {
+  jpegli_start_compress(cinfo, write_all_tables);
+}
+
+void jpeg_write_tables(j_compress_ptr cinfo) { jpegli_write_tables(cinfo); }
+
+JDIMENSION jpeg_write_scanlines(j_compress_ptr cinfo, JSAMPARRAY scanlines,
+                                JDIMENSION num_lines) {
+  return jpegli_write_scanlines(cinfo, scanlines, num_lines);
+}
+
+JDIMENSION jpeg_write_raw_data(j_compress_ptr cinfo, JSAMPIMAGE data,
+                               JDIMENSION num_lines) {
+  return jpegli_write_raw_data(cinfo, data, num_lines);
+}
+
+void jpeg_write_coefficients(j_compress_ptr cinfo,
+                             jvirt_barray_ptr *coef_arrays) {
+  jpegli_write_coefficients(cinfo, coef_arrays);
+}
+
+void jpeg_finish_compress(j_compress_ptr cinfo) {
+  jpegli_finish_compress(cinfo);
+}
+
+void jpeg_abort_compress(j_compress_ptr cinfo) { jpegli_abort_compress(cinfo); }
+
+void jpeg_destroy_compress(j_compress_ptr cinfo) {
+  jpegli_destroy_compress(cinfo);
+}
+
+boolean jpeg_resync_to_restart(j_decompress_ptr cinfo, int desired) {
+  return jpegli_resync_to_restart(cinfo, desired);
+}
+
+void jpeg_new_colormap(j_decompress_ptr cinfo) { jpegli_new_colormap(cinfo); }
diff --git a/third_party/jpeg-xl/lib/jpegli/memory_manager.cc b/third_party/jpeg-xl/lib/jpegli/memory_manager.cc
new file mode 100644
index 0000000000..f6530d8f02
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/memory_manager.cc
@@ -0,0 +1,181 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/memory_manager.h"
+
+#include <string.h>
+
+#include <hwy/aligned_allocator.h>
+#include <vector>
+
+#include "lib/jpegli/common_internal.h"
+#include "lib/jpegli/error.h"
+
+struct jvirt_sarray_control {
+  JSAMPARRAY full_buffer;
+  size_t numrows;
+  JDIMENSION maxaccess;
+};
+
+struct jvirt_barray_control {
+  JBLOCKARRAY full_buffer;
+  size_t numrows;
+  JDIMENSION maxaccess;
+};
+
+namespace jpegli {
+
+namespace {
+
+struct MemoryManager {
+  struct jpeg_memory_mgr pub;
+  std::vector<void*> owned_ptrs[2 * JPOOL_NUMPOOLS];
+  uint64_t pool_memory_usage[2 * JPOOL_NUMPOOLS];
+  uint64_t total_memory_usage;
+  uint64_t peak_memory_usage;
+};
+
+void* Alloc(j_common_ptr cinfo, int pool_id, size_t sizeofobject) {
+  MemoryManager* mem = reinterpret_cast<MemoryManager*>(cinfo->mem);
+  if (pool_id < 0 || pool_id >= 2 * JPOOL_NUMPOOLS) {
+    JPEGLI_ERROR("Invalid pool id %d", pool_id);
+  }
+  if (mem->pub.max_memory_to_use > 0 &&
+      mem->total_memory_usage + static_cast<uint64_t>(sizeofobject) >
+          static_cast<uint64_t>(mem->pub.max_memory_to_use)) {
+    JPEGLI_ERROR("Total memory usage exceeding %ld",
+                 mem->pub.max_memory_to_use);
+  }
+  void* p;
+  if (pool_id < JPOOL_NUMPOOLS) {
+    p = malloc(sizeofobject);
+  } else {
+    p = hwy::AllocateAlignedBytes(sizeofobject, nullptr, nullptr);
+  }
+  if (p == nullptr) {
+    JPEGLI_ERROR("Out of memory");
+  }
+  mem->owned_ptrs[pool_id].push_back(p);
+  mem->pool_memory_usage[pool_id] += sizeofobject;
+  mem->total_memory_usage += sizeofobject;
+  mem->peak_memory_usage =
+      std::max(mem->peak_memory_usage, mem->total_memory_usage);
+  return p;
+}
+
+template <typename T>
+T** Alloc2dArray(j_common_ptr cinfo, int pool_id, JDIMENSION samplesperrow,
+                 JDIMENSION numrows) {
+  T** array = Allocate<T*>(cinfo, numrows, pool_id);
+  // Always use aligned allocator for large 2d arrays.
+  if (pool_id < JPOOL_NUMPOOLS) {
+    pool_id += JPOOL_NUMPOOLS;
+  }
+  size_t stride = RoundUpTo(samplesperrow, HWY_ALIGNMENT);
+  T* buffer = Allocate<T>(cinfo, numrows * stride, pool_id);
+  for (size_t i = 0; i < numrows; ++i) {
+    array[i] = &buffer[i * stride];
+  }
+  return array;
+}
+
+template <typename Control, typename T>
+Control* RequestVirtualArray(j_common_ptr cinfo, int pool_id, boolean pre_zero,
+                             JDIMENSION samplesperrow, JDIMENSION numrows,
+                             JDIMENSION maxaccess) {
+  if (pool_id != JPOOL_IMAGE) {
+    JPEGLI_ERROR("Only image lifetime virtual arrays are supported.");
+  }
+  Control* p = Allocate<Control>(cinfo, 1, pool_id);
+  p->full_buffer = Alloc2dArray<T>(cinfo, pool_id, samplesperrow, numrows);
+  p->numrows = numrows;
+  p->maxaccess = maxaccess;
+  if (pre_zero) {
+    for (size_t i = 0; i < numrows; ++i) {
+      memset(p->full_buffer[i], 0, samplesperrow * sizeof(T));
+    }
+  }
+  return p;
+}
+
+void RealizeVirtualArrays(j_common_ptr cinfo) {
+  // Nothing to do, the full arrays were realized at request time already.
+}
+
+template <typename Control, typename T>
+T** AccessVirtualArray(j_common_ptr cinfo, Control* ptr, JDIMENSION start_row,
+                       JDIMENSION num_rows, boolean writable) {
+  if (num_rows > ptr->maxaccess) {
+    JPEGLI_ERROR("Invalid virtual array access, num rows %u vs max rows %u",
+                 num_rows, ptr->maxaccess);
+  }
+  if (start_row + num_rows > ptr->numrows) {
+    JPEGLI_ERROR("Invalid virtual array access, %u vs %u total rows",
+                 start_row + num_rows, ptr->numrows);
+  }
+  if (ptr->full_buffer == nullptr) {
+    JPEGLI_ERROR("Invalid virtual array access, array not realized.");
+  }
+  return ptr->full_buffer + start_row;
+}
+
+void ClearPool(j_common_ptr cinfo, int pool_id) {
+  MemoryManager* mem = reinterpret_cast<MemoryManager*>(cinfo->mem);
+  mem->owned_ptrs[pool_id].clear();
+  mem->total_memory_usage -= mem->pool_memory_usage[pool_id];
+  mem->pool_memory_usage[pool_id] = 0;
+}
+
+void FreePool(j_common_ptr cinfo, int pool_id) {
+  MemoryManager* mem = reinterpret_cast<MemoryManager*>(cinfo->mem);
+  if (pool_id < 0 || pool_id >= JPOOL_NUMPOOLS) {
+    JPEGLI_ERROR("Invalid pool id %d", pool_id);
+  }
+  for (void* ptr : mem->owned_ptrs[pool_id]) {
+    free(ptr);
+  }
+  ClearPool(cinfo, pool_id);
+  for (void* ptr : mem->owned_ptrs[JPOOL_NUMPOOLS + pool_id]) {
+    hwy::FreeAlignedBytes(ptr, nullptr, nullptr);
+  }
+  ClearPool(cinfo, JPOOL_NUMPOOLS + pool_id);
+}
+
+void SelfDestruct(j_common_ptr cinfo) {
+  MemoryManager* mem = reinterpret_cast<MemoryManager*>(cinfo->mem);
+  for (int pool_id = 0; pool_id < JPOOL_NUMPOOLS; ++pool_id) {
+    FreePool(cinfo, pool_id);
+  }
+  delete mem;
+  cinfo->mem = nullptr;
+}
+
+}  // namespace
+
+void InitMemoryManager(j_common_ptr cinfo) {
+  MemoryManager* mem = new MemoryManager;
+  mem->pub.alloc_small = jpegli::Alloc;
+  mem->pub.alloc_large = jpegli::Alloc;
+  mem->pub.alloc_sarray = jpegli::Alloc2dArray<JSAMPLE>;
+  mem->pub.alloc_barray = jpegli::Alloc2dArray<JBLOCK>;
+  mem->pub.request_virt_sarray =
+      jpegli::RequestVirtualArray<jvirt_sarray_control, JSAMPLE>;
+  mem->pub.request_virt_barray =
+      jpegli::RequestVirtualArray<jvirt_barray_control, JBLOCK>;
+  mem->pub.realize_virt_arrays = jpegli::RealizeVirtualArrays;
+  mem->pub.access_virt_sarray =
+      jpegli::AccessVirtualArray<jvirt_sarray_control, JSAMPLE>;
+  mem->pub.access_virt_barray =
+      jpegli::AccessVirtualArray<jvirt_barray_control, JBLOCK>;
+  mem->pub.free_pool = jpegli::FreePool;
+  mem->pub.self_destruct = jpegli::SelfDestruct;
+  mem->pub.max_memory_to_use = 0;
+  mem->total_memory_usage = 0;
+  mem->peak_memory_usage = 0;
+  memset(mem->pool_memory_usage, 0, sizeof(mem->pool_memory_usage));
+  cinfo->mem = reinterpret_cast<struct jpeg_memory_mgr*>(mem);
+}
+
+}  // namespace jpegli
diff --git a/third_party/jpeg-xl/lib/jpegli/memory_manager.h b/third_party/jpeg-xl/lib/jpegli/memory_manager.h
new file mode 100644
index 0000000000..238f85a308
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/memory_manager.h
@@ -0,0 +1,40 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JPEGLI_MEMORY_MANAGER_H_
+#define LIB_JPEGLI_MEMORY_MANAGER_H_
+
+/* clang-format off */
+#include <stdio.h>
+#include <jpeglib.h>
+#include <stdlib.h>
+/* clang-format on */
+
+#define JPOOL_PERMANENT_ALIGNED (JPOOL_NUMPOOLS + JPOOL_PERMANENT)
+#define JPOOL_IMAGE_ALIGNED (JPOOL_NUMPOOLS + JPOOL_IMAGE)
+
+namespace jpegli {
+
+void InitMemoryManager(j_common_ptr cinfo);
+
+template <typename T>
+T* Allocate(j_common_ptr cinfo, size_t len, int pool_id = JPOOL_PERMANENT) {
+  void* p = (*cinfo->mem->alloc_small)(cinfo, pool_id, len * sizeof(T));
+  return reinterpret_cast<T*>(p);
+}
+
+template <typename T>
+T* Allocate(j_decompress_ptr cinfo, size_t len, int pool_id = JPOOL_PERMANENT) {
+  return Allocate<T>(reinterpret_cast<j_common_ptr>(cinfo), len, pool_id);
+}
+
+template <typename T>
+T* Allocate(j_compress_ptr cinfo, size_t len, int pool_id = JPOOL_PERMANENT) {
+  return Allocate<T>(reinterpret_cast<j_common_ptr>(cinfo), len, pool_id);
+}
+
+}  // namespace jpegli
+
+#endif  // LIB_JPEGLI_MEMORY_MANAGER_H_
diff --git a/third_party/jpeg-xl/lib/jpegli/output_suspension_test.cc b/third_party/jpeg-xl/lib/jpegli/output_suspension_test.cc
new file mode 100644
index 0000000000..73db791727
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/output_suspension_test.cc
@@ -0,0 +1,219 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/encode.h"
+#include "lib/jpegli/test_utils.h"
+#include "lib/jpegli/testing.h"
+
+namespace jpegli {
+namespace {
+
+static constexpr size_t kInitialBufferSize = 1024;
+static constexpr size_t kFinalBufferSize = 18;
+
+struct DestinationManager {
+  jpeg_destination_mgr pub;
+  std::vector<uint8_t> buffer;
+
+  DestinationManager() {
+    pub.init_destination = init_destination;
+    pub.empty_output_buffer = empty_output_buffer;
+    pub.term_destination = term_destination;
+  }
+
+  void Rewind() {
+    pub.next_output_byte = buffer.data();
+    pub.free_in_buffer = buffer.size();
+  }
+
+  void EmptyTo(std::vector<uint8_t>* output, size_t new_size = 0) {
+    output->insert(output->end(), buffer.data(), pub.next_output_byte);
+    if (new_size > 0) {
+      buffer.resize(new_size);
+    }
+    Rewind();
+  }
+
+  static void init_destination(j_compress_ptr cinfo) {
+    auto us = reinterpret_cast<DestinationManager*>(cinfo->dest);
+    us->buffer.resize(kInitialBufferSize);
+    us->Rewind();
+  }
+
+  static boolean empty_output_buffer(j_compress_ptr cinfo) { return FALSE; }
+
+  static void term_destination(j_compress_ptr cinfo) {}
+};
+
+struct TestConfig {
+  TestImage input;
+  CompressParams jparams;
+  size_t buffer_size;
+  size_t lines_batch_size;
+};
+
+class OutputSuspensionTestParam : public ::testing::TestWithParam<TestConfig> {
+};
+
+TEST_P(OutputSuspensionTestParam, PixelData) {
+  jpeg_compress_struct cinfo = {};
+  TestConfig config = GetParam();
+  TestImage& input = config.input;
+  GeneratePixels(&input);
+  DestinationManager dest;
+  std::vector<uint8_t> compressed;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    cinfo.dest = reinterpret_cast<jpeg_destination_mgr*>(&dest);
+
+    cinfo.image_width = input.xsize;
+    cinfo.image_height = input.ysize;
+    cinfo.input_components = input.components;
+    cinfo.in_color_space = JCS_RGB;
+    jpegli_set_defaults(&cinfo);
+    cinfo.comp_info[0].v_samp_factor = config.jparams.v_sampling[0];
+    jpegli_set_progressive_level(&cinfo, 0);
+    cinfo.optimize_coding = FALSE;
+    jpegli_start_compress(&cinfo, TRUE);
+
+    size_t stride = cinfo.image_width * cinfo.input_components;
+    std::vector<uint8_t> row_bytes(config.lines_batch_size * stride);
+    while (cinfo.next_scanline < cinfo.image_height) {
+      size_t lines_left = cinfo.image_height - cinfo.next_scanline;
+      size_t num_lines = std::min(config.lines_batch_size, lines_left);
+      memcpy(&row_bytes[0], &input.pixels[cinfo.next_scanline * stride],
+             num_lines * stride);
+      std::vector<JSAMPROW> rows(num_lines);
+      for (size_t i = 0; i < num_lines; ++i) {
+        rows[i] = &row_bytes[i * stride];
+      }
+      size_t lines_done = 0;
+      while (lines_done < num_lines) {
+        lines_done += jpegli_write_scanlines(&cinfo, &rows[lines_done],
+                                             num_lines - lines_done);
+        if (lines_done < num_lines) {
+          dest.EmptyTo(&compressed, config.buffer_size);
+        }
+      }
+    }
+    dest.EmptyTo(&compressed, kFinalBufferSize);
+    jpegli_finish_compress(&cinfo);
+    dest.EmptyTo(&compressed);
+    return true;
+  };
+  ASSERT_TRUE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  TestImage output;
+  DecodeWithLibjpeg(CompressParams(), DecompressParams(), compressed, &output);
+  VerifyOutputImage(input, output, 2.5);
+}
+
+TEST_P(OutputSuspensionTestParam, RawData) {
+  jpeg_compress_struct cinfo = {};
+  TestConfig config = GetParam();
+  if (config.lines_batch_size != 1) return;
+  TestImage& input = config.input;
+  input.color_space = JCS_YCbCr;
+  GeneratePixels(&input);
+  GenerateRawData(config.jparams, &input);
+  DestinationManager dest;
+  std::vector<uint8_t> compressed;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    cinfo.dest = reinterpret_cast<jpeg_destination_mgr*>(&dest);
+    cinfo.image_width = input.xsize;
+    cinfo.image_height = input.ysize;
+    cinfo.input_components = input.components;
+    cinfo.in_color_space = JCS_YCbCr;
+    jpegli_set_defaults(&cinfo);
+    cinfo.comp_info[0].v_samp_factor = config.jparams.v_sampling[0];
+    jpegli_set_progressive_level(&cinfo, 0);
+    cinfo.optimize_coding = FALSE;
+    cinfo.raw_data_in = TRUE;
+    jpegli_start_compress(&cinfo, TRUE);
+
+    std::vector<std::vector<uint8_t>> raw_data = input.raw_data;
+    size_t max_lines = config.jparams.max_v_sample() * DCTSIZE;
+    std::vector<std::vector<JSAMPROW>> rowdata(cinfo.num_components);
+    std::vector<JSAMPARRAY> data(cinfo.num_components);
+    for (int c = 0; c < cinfo.num_components; ++c) {
+      rowdata[c].resize(config.jparams.v_samp(c) * DCTSIZE);
+      data[c] = &rowdata[c][0];
+    }
+    while (cinfo.next_scanline < cinfo.image_height) {
+      for (int c = 0; c < cinfo.num_components; ++c) {
+        size_t cwidth = cinfo.comp_info[c].width_in_blocks * DCTSIZE;
+        size_t cheight = cinfo.comp_info[c].height_in_blocks * DCTSIZE;
+        size_t num_lines = config.jparams.v_samp(c) * DCTSIZE;
+        size_t y0 = (cinfo.next_scanline / max_lines) * num_lines;
+        for (size_t i = 0; i < num_lines; ++i) {
+          rowdata[c][i] =
+              (y0 + i < cheight ? &raw_data[c][(y0 + i) * cwidth] : nullptr);
+        }
+      }
+      while (jpegli_write_raw_data(&cinfo, &data[0], max_lines) == 0) {
+        dest.EmptyTo(&compressed, config.buffer_size);
+      }
+    }
+    dest.EmptyTo(&compressed, kFinalBufferSize);
+    jpegli_finish_compress(&cinfo);
+    dest.EmptyTo(&compressed);
+    return true;
+  };
+  try_catch_block();
+  jpegli_destroy_compress(&cinfo);
+  DecompressParams dparams;
+  dparams.output_mode = RAW_DATA;
+  TestImage output;
+  DecodeWithLibjpeg(CompressParams(), dparams, compressed, &output);
+  VerifyOutputImage(input, output, 3.5);
+}
+
+std::vector<TestConfig> GenerateTests() {
+  std::vector<TestConfig> all_tests;
+  const size_t xsize0 = 1920;
+  const size_t ysize0 = 1080;
+  for (int dysize : {0, 1, 8, 9}) {
+    for (int v_sampling : {1, 2}) {
+      for (int nlines : {1, 8, 117}) {
+        for (int bufsize : {1, 16, 16 << 10}) {
+          TestConfig config;
+          config.lines_batch_size = nlines;
+          config.buffer_size = bufsize;
+          config.input.xsize = xsize0;
+          config.input.ysize = ysize0 + dysize;
+          config.jparams.h_sampling = {1, 1, 1};
+          config.jparams.v_sampling = {v_sampling, 1, 1};
+          all_tests.push_back(config);
+        }
+      }
+    }
+  }
+  return all_tests;
+}
+
+std::ostream& operator<<(std::ostream& os, const TestConfig& c) {
+  os << c.input;
+  os << c.jparams;
+  os << "Lines" << c.lines_batch_size;
+  os << "BufSize" << c.buffer_size;
+  return os;
+}
+
+std::string TestDescription(
+    const testing::TestParamInfo<OutputSuspensionTestParam::ParamType>& info) {
+  std::stringstream name;
+  name << info.param;
+  return name.str();
+}
+
+JPEGLI_INSTANTIATE_TEST_SUITE_P(OutputSuspensionTest, OutputSuspensionTestParam,
+                                testing::ValuesIn(GenerateTests()),
+                                TestDescription);
+
+}  // namespace
+}  // namespace jpegli
diff --git a/third_party/jpeg-xl/lib/jpegli/quant.cc b/third_party/jpeg-xl/lib/jpegli/quant.cc
new file mode 100644
index 0000000000..3ab9bcf856
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/quant.cc
@@ -0,0 +1,748 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/quant.h"
+
+#include <algorithm>
+#include <cmath>
+#include <vector>
+
+#include "lib/jpegli/adaptive_quantization.h"
+#include "lib/jpegli/common.h"
+#include "lib/jpegli/encode_internal.h"
+#include "lib/jpegli/error.h"
+#include "lib/jpegli/memory_manager.h"
+#include "lib/jxl/base/byte_order.h"
+#include "lib/jxl/base/status.h"
+
+namespace jpegli {
+
+namespace {
+
+// Global scale is chosen in a way that butteraugli 3-norm matches libjpeg
+// with the same quality setting. Fitted for quality 90 on jyrki31 corpus.
+constexpr float kGlobalScaleXYB = 1.43951668f;
+constexpr float kGlobalScaleYCbCr = 1.66986909f;
+
+static constexpr float kBaseQuantMatrixXYB[] = {
+    // c = 0
+    7.5629935265f,
+    19.8247814178f,
+    22.5724945068f,
+    20.6706695557f,
+    22.6864585876f,
+    23.5696277618f,
+    25.8129081726f,
+    36.3307571411f,
+    19.8247814178f,
+    21.5503177643f,
+    19.9372234344f,
+    20.5424213409f,
+    21.8645496368f,
+    23.9041385651f,
+    28.2844066620f,
+    32.6609764099f,
+    22.5724945068f,
+    19.9372234344f,
+    21.9017257690f,
+    19.1223449707f,
+    21.7515811920f,
+    24.6724700928f,
+    25.4249649048f,
+    32.6653823853f,
+    20.6706695557f,
+    20.5424213409f,
+    19.1223449707f,
+    20.1610221863f,
+    25.3719692230f,
+    25.9668903351f,
+    30.9804954529f,
+    31.3406009674f,
+    22.6864585876f,
+    21.8645496368f,
+    21.7515811920f,
+    25.3719692230f,
+    26.2431850433f,
+    40.5992202759f,
+    43.2624626160f,
+    63.3010940552f,
+    23.5696277618f,
+    23.9041385651f,
+    24.6724700928f,
+    25.9668903351f,
+    40.5992202759f,
+    48.3026771545f,
+    34.0964355469f,
+    61.9852142334f,
+    25.8129081726f,
+    28.2844066620f,
+    25.4249649048f,
+    30.9804954529f,
+    43.2624626160f,
+    34.0964355469f,
+    34.4937438965f,
+    66.9702758789f,
+    36.3307571411f,
+    32.6609764099f,
+    32.6653823853f,
+    31.3406009674f,
+    63.3010940552f,
+    61.9852142334f,
+    66.9702758789f,
+    39.9652709961f,
+    // c = 1
+    1.6262000799f,
+    3.2199242115f,
+    3.4903779030f,
+    3.9148359299f,
+    4.8337211609f,
+    4.9108843803f,
+    5.3137121201f,
+    6.1676793098f,
+    3.2199242115f,
+    3.4547898769f,
+    3.6036829948f,
+    4.2652835846f,
+    4.8368387222f,
+    4.8226222992f,
+    5.6120514870f,
+    6.3431472778f,
+    3.4903779030f,
+    3.6036829948f,
+    3.9044559002f,
+    4.3374395370f,
+    4.8435096741f,
+    5.4057979584f,
+    5.6066360474f,
+    6.1075134277f,
+    3.9148359299f,
+    4.2652835846f,
+    4.3374395370f,
+    4.6064834595f,
+    5.1751475334f,
+    5.4013924599f,
+    6.0399808884f,
+    6.7825231552f,
+    4.8337211609f,
+    4.8368387222f,
+    4.8435096741f,
+    5.1751475334f,
+    5.3748049736f,
+    6.1410837173f,
+    7.6529307365f,
+    7.5235214233f,
+    4.9108843803f,
+    4.8226222992f,
+    5.4057979584f,
+    5.4013924599f,
+    6.1410837173f,
+    6.3431472778f,
+    7.1083049774f,
+    7.6008300781f,
+    5.3137121201f,
+    5.6120514870f,
+    5.6066360474f,
+    6.0399808884f,
+    7.6529307365f,
+    7.1083049774f,
+    7.0943155289f,
+    7.0478363037f,
+    6.1676793098f,
+    6.3431472778f,
+    6.1075134277f,
+    6.7825231552f,
+    7.5235214233f,
+    7.6008300781f,
+    7.0478363037f,
+    6.9186143875f,
+    // c = 2
+    3.3038473129f,
+    10.0689258575f,
+    12.2785224915f,
+    14.6041173935f,
+    16.2107315063f,
+    19.2314529419f,
+    28.0129547119f,
+    55.6682891846f,
+    10.0689258575f,
+    11.4085016251f,
+    11.3871345520f,
+    15.4934167862f,
+    16.5364933014f,
+    14.9153423309f,
+    26.3748722076f,
+    40.8614425659f,
+    12.2785224915f,
+    11.3871345520f,
+    17.0886878967f,
+    13.9500350952f,
+    16.0003223419f,
+    28.5660629272f,
+    26.2124195099f,
+    30.1260128021f,
+    14.6041173935f,
+    15.4934167862f,
+    13.9500350952f,
+    21.1235027313f,
+    26.1579780579f,
+    25.5579223633f,
+    40.6859359741f,
+    33.8056335449f,
+    16.2107315063f,
+    16.5364933014f,
+    16.0003223419f,
+    26.1579780579f,
+    26.8042831421f,
+    26.1587715149f,
+    35.7343978882f,
+    43.6857032776f,
+    19.2314529419f,
+    14.9153423309f,
+    28.5660629272f,
+    25.5579223633f,
+    26.1587715149f,
+    34.5418128967f,
+    41.3197937012f,
+    48.7867660522f,
+    28.0129547119f,
+    26.3748722076f,
+    26.2124195099f,
+    40.6859359741f,
+    35.7343978882f,
+    41.3197937012f,
+    47.6329460144f,
+    55.3498458862f,
+    55.6682891846f,
+    40.8614425659f,
+    30.1260128021f,
+    33.8056335449f,
+    43.6857032776f,
+    48.7867660522f,
+    55.3498458862f,
+    63.6065597534f,
+};
+
+static const float kBaseQuantMatrixYCbCr[] = {
+    // c = 0
+    1.4076321125f,
+    2.6927082539f,
+    2.6927735806f,
+    2.9220938683f,
+    3.0870633125f,
+    3.4968640804f,
+    3.5730612278f,
+    3.5978596210f,
+    2.6927082539f,
+    2.6926636696f,
+    2.7195601463f,
+    2.9238407612f,
+    3.1882488728f,
+    3.0607142448f,
+    3.1882314682f,
+    3.8304426670f,
+    2.6927735806f,
+    2.7195601463f,
+    2.9532215595f,
+    3.5562388897f,
+    3.7088179588f,
+    3.0576279163f,
+    3.7443304062f,
+    4.2484717369f,
+    2.9220938683f,
+    2.9238407612f,
+    3.5562388897f,
+    3.0594384670f,
+    4.1780085564f,
+    4.9221563339f,
+    4.7842588425f,
+    4.6059336662f,
+    3.0870633125f,
+    3.1882488728f,
+    3.7088179588f,
+    4.1780085564f,
+    4.3475294113f,
+    5.5422372818f,
+    5.5741071701f,
+    5.4531836510f,
+    3.4968640804f,
+    3.0607142448f,
+    3.0576279163f,
+    4.9221563339f,
+    5.5422372818f,
+    5.4393601418f,
+    5.1039180756f,
+    6.0990614891f,
+    3.5730612278f,
+    3.1882314682f,
+    3.7443304062f,
+    4.7842588425f,
+    5.5741071701f,
+    5.1039180756f,
+    5.4144043922f,
+    5.4524297714f,
+    3.5978596210f,
+    3.8304426670f,
+    4.2484717369f,
+    4.6059336662f,
+    5.4531836510f,
+    6.0990614891f,
+    5.4524297714f,
+    4.3595433235f,
+    // c = 1
+    2.8152642250f,
+    10.4298934937f,
+    16.1451492310f,
+    15.3725156784f,
+    17.6543502808f,
+    19.1104965210f,
+    17.5021877289f,
+    29.5177459717f,
+    10.4298934937f,
+    15.7448558807f,
+    16.8441677094f,
+    15.3214502335f,
+    17.5918464661f,
+    16.8787574768f,
+    27.0867996216f,
+    21.3443832397f,
+    16.1451492310f,
+    16.8441677094f,
+    14.7525558472f,
+    18.0765247345f,
+    18.2206096649f,
+    23.2126445770f,
+    98.1291885376f,
+    23.6039886475f,
+    15.3725156784f,
+    15.3214502335f,
+    18.0765247345f,
+    17.2925109863f,
+    16.1435356140f,
+    24.0464611053f,
+    27.1577339172f,
+    35.3269882202f,
+    17.6543502808f,
+    17.5918464661f,
+    18.2206096649f,
+    16.1435356140f,
+    19.2819595337f,
+    16.2939300537f,
+    19.6862888336f,
+    51.0941123962f,
+    19.1104965210f,
+    16.8787574768f,
+    23.2126445770f,
+    24.0464611053f,
+    16.2939300537f,
+    32.3153648376f,
+    45.7272338867f,
+    64.6245880127f,
+    17.5021877289f,
+    27.0867996216f,
+    98.1291885376f,
+    27.1577339172f,
+    19.6862888336f,
+    45.7272338867f,
+    61.8331909180f,
+    85.0626754761f,
+    29.5177459717f,
+    21.3443832397f,
+    23.6039886475f,
+    35.3269882202f,
+    51.0941123962f,
+    64.6245880127f,
+    85.0626754761f,
+    112.7605514526f,
+    // c = 2
+    2.8152642250f,
+    5.4735932350f,
+    7.3637795448f,
+    6.5195322037f,
+    8.1501169205f,
+    8.7243938446f,
+    8.7219915390f,
+    9.3618907928f,
+    5.4735932350f,
+    7.1514792442f,
+    7.2054982185f,
+    8.1126995087f,
+    8.1497650146f,
+    7.1335659027f,
+    7.8453893661f,
+    8.3512821198f,
+    7.3637795448f,
+    7.2054982185f,
+    6.9224662781f,
+    8.0766754150f,
+    9.1168527603f,
+    7.3714752197f,
+    7.3646650314f,
+    8.6790895462f,
+    6.5195322037f,
+    8.1126995087f,
+    8.0766754150f,
+    7.8294739723f,
+    7.7385902405f,
+    7.8628563881f,
+    7.4404106140f,
+    8.4759435654f,
+    8.1501169205f,
+    8.1497650146f,
+    9.1168527603f,
+    7.7385902405f,
+    7.0960793495f,
+    8.9185447693f,
+    8.2047510147f,
+    7.8465061188f,
+    8.7243938446f,
+    7.1335659027f,
+    7.3714752197f,
+    7.8628563881f,
+    8.9185447693f,
+    8.6063842773f,
+    9.7156696320f,
+    64.6700744629f,
+    8.7219915390f,
+    7.8453893661f,
+    7.3646650314f,
+    7.4404106140f,
+    8.2047510147f,
+    9.7156696320f,
+    61.9934043884f,
+    83.2930450439f,
+    9.3618907928f,
+    8.3512821198f,
+    8.6790895462f,
+    8.4759435654f,
+    7.8465061188f,
+    64.6700744629f,
+    83.2930450439f,
+    113.0502548218f,
+};
+
+static const float k420GlobalScale = 1.2;
+static const float k420Rescale[64] = {
+    0.6386, 0.4213, 0.3994, 0.3333, 0.3143, 0.3367, 0.3612, 0.3794,  //
+    0.4213, 0.4026, 0.3309, 0.3344, 0.3059, 0.3118, 0.4069, 0.3595,  //
+    0.3994, 0.3309, 0.4080, 0.2531, 0.2645, 0.3630, 0.3502, 0.3231,  //
+    0.3333, 0.3344, 0.2531, 0.2960, 0.3153, 0.3476, 0.3430, 0.4004,  //
+    0.3143, 0.3059, 0.2645, 0.3153, 0.2733, 0.3296, 0.3338, 0.3418,  //
+    0.3367, 0.3118, 0.3630, 0.3476, 0.3296, 0.3144, 0.2262, 0.1326,  //
+    0.3612, 0.4069, 0.3502, 0.3430, 0.3338, 0.2262, 0.1000, 0.1000,  //
+    0.3794, 0.3595, 0.3231, 0.4004, 0.3418, 0.1326, 0.1000, 0.3366,  //
+};
+
+static const float kBaseQuantMatrixStd[] = {
+    // c = 0
+    16.0f, 11.0f, 10.0f, 16.0f, 24.0f, 40.0f, 51.0f, 61.0f,      //
+    12.0f, 12.0f, 14.0f, 19.0f, 26.0f, 58.0f, 60.0f, 55.0f,      //
+    14.0f, 13.0f, 16.0f, 24.0f, 40.0f, 57.0f, 69.0f, 56.0f,      //
+    14.0f, 17.0f, 22.0f, 29.0f, 51.0f, 87.0f, 80.0f, 62.0f,      //
+    18.0f, 22.0f, 37.0f, 56.0f, 68.0f, 109.0f, 103.0f, 77.0f,    //
+    24.0f, 35.0f, 55.0f, 64.0f, 81.0f, 104.0f, 113.0f, 92.0f,    //
+    49.0f, 64.0f, 78.0f, 87.0f, 103.0f, 121.0f, 120.0f, 101.0f,  //
+    72.0f, 92.0f, 95.0f, 98.0f, 112.0f, 100.0f, 103.0f, 99.0f,   //
+    // c = 1
+    17.0f, 18.0f, 24.0f, 47.0f, 99.0f, 99.0f, 99.0f, 99.0f,  //
+    18.0f, 21.0f, 26.0f, 66.0f, 99.0f, 99.0f, 99.0f, 99.0f,  //
+    24.0f, 26.0f, 56.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f,  //
+    47.0f, 66.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f,  //
+    99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f,  //
+    99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f,  //
+    99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f,  //
+    99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f,  //
+};
+
+static const float kZeroBiasMulYCbCrLQ[] = {
+    // c = 0
+    0.6190f, 0.0568f, 0.3880f, 0.6190f, 0.6190f, 0.4490f, 0.4490f, 0.6187f,  //
+    0.0568f, 0.5829f, 0.6189f, 0.6190f, 0.6190f, 0.7190f, 0.6190f, 0.6189f,  //
+    0.3880f, 0.6189f, 0.6190f, 0.6190f, 0.6190f, 0.6190f, 0.6187f, 0.6100f,  //
+    0.6190f, 0.6190f, 0.6190f, 0.6190f, 0.5890f, 0.3839f, 0.7160f, 0.6190f,  //
+    0.6190f, 0.6190f, 0.6190f, 0.5890f, 0.6190f, 0.3880f, 0.5860f, 0.4790f,  //
+    0.4490f, 0.7190f, 0.6190f, 0.3839f, 0.3880f, 0.6190f, 0.6190f, 0.6190f,  //
+    0.4490f, 0.6190f, 0.6187f, 0.7160f, 0.5860f, 0.6190f, 0.6204f, 0.6190f,  //
+    0.6187f, 0.6189f, 0.6100f, 0.6190f, 0.4790f, 0.6190f, 0.6190f, 0.3480f,  //
+    // c = 1
+    0.9430f, 1.1640f, 0.9373f, 1.1319f, 0.8016f, 0.9136f, 1.1530f, 0.9430f,  //
+    1.1640f, 0.9188f, 0.9160f, 1.1980f, 1.1830f, 0.9758f, 0.9430f, 0.9430f,  //
+    0.9373f, 0.9160f, 0.8430f, 1.1720f, 0.7083f, 0.9430f, 0.9430f, 0.9430f,  //
+    1.1319f, 1.1980f, 1.1720f, 1.1490f, 0.8547f, 0.9430f, 0.9430f, 0.9430f,  //
+    0.8016f, 1.1830f, 0.7083f, 0.8547f, 0.9430f, 0.9430f, 0.9430f, 0.9430f,  //
+    0.9136f, 0.9758f, 0.9430f, 0.9430f, 0.9430f, 0.9430f, 0.9430f, 0.9430f,  //
+    1.1530f, 0.9430f, 0.9430f, 0.9430f, 0.9430f, 0.9430f, 0.9430f, 0.9480f,  //
+    0.9430f, 0.9430f, 0.9430f, 0.9430f, 0.9430f, 0.9430f, 0.9480f, 0.9430f,  //
+    // c = 2
+    0.3060f, 1.3190f, 0.4308f, 0.4460f, 0.0661f, 0.0660f, 0.2660f, 0.2960f,  //
+    1.3190f, 0.3280f, 0.3093f, 0.0750f, 0.0505f, 0.1594f, 0.3060f, 0.2113f,  //
+    0.4308f, 0.3093f, 0.3060f, 0.1182f, 0.0500f, 0.3060f, 0.3915f, 0.2426f,  //
+    0.4460f, 0.0750f, 0.1182f, 0.0512f, 0.0500f, 0.2130f, 0.3930f, 0.1590f,  //
+    0.0661f, 0.0505f, 0.0500f, 0.0500f, 0.3055f, 0.3360f, 0.5148f, 0.5403f,  //
+    0.0660f, 0.1594f, 0.3060f, 0.2130f, 0.3360f, 0.5060f, 0.5874f, 0.3060f,  //
+    0.2660f, 0.3060f, 0.3915f, 0.3930f, 0.5148f, 0.5874f, 0.3060f, 0.3060f,  //
+    0.2960f, 0.2113f, 0.2426f, 0.1590f, 0.5403f, 0.3060f, 0.3060f, 0.3060f,  //
+};
+
+static const float kZeroBiasMulYCbCrHQ[] = {
+    // c = 0
+    0.7830f, 0.0044f, 0.2521f, 0.6547f, 0.8161f, 0.6130f, 0.8841f, 0.8155f,  //
+    0.0044f, 0.6831f, 0.6553f, 0.6295f, 0.7848f, 0.7843f, 0.8474f, 0.7836f,  //
+    0.2521f, 0.6553f, 0.7834f, 0.7829f, 0.8161f, 0.8072f, 0.7743f, 0.9242f,  //
+    0.6547f, 0.6295f, 0.7829f, 0.8654f, 0.7829f, 0.6986f, 0.7818f, 0.7726f,  //
+    0.8161f, 0.7848f, 0.8161f, 0.7829f, 0.7471f, 0.7827f, 0.7843f, 0.7653f,  //
+    0.6130f, 0.7843f, 0.8072f, 0.6986f, 0.7827f, 0.7848f, 0.9508f, 0.7653f,  //
+    0.8841f, 0.8474f, 0.7743f, 0.7818f, 0.7843f, 0.9508f, 0.7839f, 0.8437f,  //
+    0.8155f, 0.7836f, 0.9242f, 0.7726f, 0.7653f, 0.7653f, 0.8437f, 0.7819f,  //
+    // c = 1
+    1.0540f, 1.0816f, 1.0556f, 1.2876f, 1.1554f, 1.1567f, 1.8851f, 0.5488f,  //
+    1.0816f, 1.1537f, 1.1850f, 1.0712f, 1.1671f, 2.0719f, 1.0544f, 1.4764f,  //
+    1.0556f, 1.1850f, 1.2870f, 1.1981f, 1.8181f, 1.2618f, 1.0564f, 1.1191f,  //
+    1.2876f, 1.0712f, 1.1981f, 1.4753f, 2.0609f, 1.0564f, 1.2645f, 1.0564f,  //
+    1.1554f, 1.1671f, 1.8181f, 2.0609f, 0.7324f, 1.1163f, 0.8464f, 1.0564f,  //
+    1.1567f, 2.0719f, 1.2618f, 1.0564f, 1.1163f, 1.0040f, 1.0564f, 1.0564f,  //
+    1.8851f, 1.0544f, 1.0564f, 1.2645f, 0.8464f, 1.0564f, 1.0564f, 1.0564f,  //
+    0.5488f, 1.4764f, 1.1191f, 1.0564f, 1.0564f, 1.0564f, 1.0564f, 1.0564f,  //
+    // c = 2
+    0.6620f, 0.5392f, 0.6659f, 0.8968f, 0.6829f, 0.6328f, 0.5802f, 0.4836f,  //
+    0.5392f, 0.6746f, 0.6760f, 0.6102f, 0.6015f, 0.6958f, 0.7327f, 0.4897f,  //
+    0.6659f, 0.6760f, 0.6957f, 0.6543f, 0.4396f, 0.6330f, 0.7081f, 0.2583f,  //
+    0.8968f, 0.6102f, 0.6543f, 0.5913f, 0.6457f, 0.5828f, 0.5139f, 0.3565f,  //
+    0.6829f, 0.6015f, 0.4396f, 0.6457f, 0.5633f, 0.4263f, 0.6371f, 0.5949f,  //
+    0.6328f, 0.6958f, 0.6330f, 0.5828f, 0.4263f, 0.2847f, 0.2909f, 0.6629f,  //
+    0.5802f, 0.7327f, 0.7081f, 0.5139f, 0.6371f, 0.2909f, 0.6644f, 0.6644f,  //
+    0.4836f, 0.4897f, 0.2583f, 0.3565f, 0.5949f, 0.6629f, 0.6644f, 0.6644f,  //
+};
+
+static const float kZeroBiasOffsetYCbCr[] = {
+    0.59082f,
+    0.58146f,
+    0.57988f,
+};
+
+constexpr uint8_t kTransferFunctionPQ = 16;
+constexpr uint8_t kTransferFunctionHLG = 18;
+
+float DistanceToLinearQuality(float distance) {
+  if (distance <= 0.1f) {
+    return 1.0f;
+  } else if (distance <= 4.6f) {
+    return (200.0f / 9.0f) * (distance - 0.1f);
+  } else if (distance <= 6.4f) {
+    return 5000.0f / (100.0f - (distance - 0.1f) / 0.09f);
+  } else if (distance < 25.0f) {
+    return 530000.0f /
+           (3450.0f -
+            300.0f * std::sqrt((848.0f * distance - 5330.0f) / 120.0f));
+  } else {
+    return 5000.0f;
+  }
+}
+
+constexpr float kExponent[DCTSIZE2] = {
+    1.00f, 0.51f, 0.67f, 0.74f, 1.00f, 1.00f, 1.00f, 1.00f,  //
+    0.51f, 0.66f, 0.69f, 0.87f, 1.00f, 1.00f, 1.00f, 1.00f,  //
+    0.67f, 0.69f, 0.84f, 0.83f, 0.96f, 1.00f, 1.00f, 1.00f,  //
+    0.74f, 0.87f, 0.83f, 1.00f, 1.00f, 0.91f, 0.91f, 1.00f,  //
+    1.00f, 1.00f, 0.96f, 1.00f, 1.00f, 1.00f, 1.00f, 1.00f,  //
+    1.00f, 1.00f, 1.00f, 0.91f, 1.00f, 1.00f, 1.00f, 1.00f,  //
+    1.00f, 1.00f, 1.00f, 0.91f, 1.00f, 1.00f, 1.00f, 1.00f,  //
+    1.00f, 1.00f, 1.00f, 1.00f, 1.00f, 1.00f, 1.00f, 1.00f,  //
+};
+constexpr float kDist0 = 1.5f;  // distance where non-linearity kicks in.
+
+float DistanceToScale(float distance, int k) {
+  if (distance < kDist0) {
+    return distance;
+  }
+  const float exp = kExponent[k];
+  const float mul = std::pow(kDist0, 1.0 - exp);
+  return std::max<float>(0.5f * distance, mul * std::pow(distance, exp));
+}
+
+float ScaleToDistance(float scale, int k) {
+  if (scale < kDist0) {
+    return scale;
+  }
+  const float exp = 1.0 / kExponent[k];
+  const float mul = std::pow(kDist0, 1.0 - exp);
+  return std::min<float>(2.0f * scale, mul * std::pow(scale, exp));
+}
+
+float QuantValsToDistance(j_compress_ptr cinfo) {
+  jpeg_comp_master* m = cinfo->master;
+  float global_scale = kGlobalScaleYCbCr;
+  if (m->cicp_transfer_function == kTransferFunctionPQ) {
+    global_scale *= .4f;
+  } else if (m->cicp_transfer_function == kTransferFunctionHLG) {
+    global_scale *= .5f;
+  }
+  int quant_max = m->force_baseline ? 255 : 32767U;
+  static const float kDistMax = 10000.0f;
+  float dist_min = 0.0f;
+  float dist_max = kDistMax;
+  for (int c = 0; c < cinfo->num_components; ++c) {
+    int quant_idx = cinfo->comp_info[c].quant_tbl_no;
+    uint16_t* quantval = cinfo->quant_tbl_ptrs[quant_idx]->quantval;
+    const float* base_qm = &kBaseQuantMatrixYCbCr[quant_idx * DCTSIZE2];
+    for (int k = 0; k < DCTSIZE2; ++k) {
+      float dmin = 0.0;
+      float dmax = kDistMax;
+      float invq = 1.0f / base_qm[k] / global_scale;
+      int qval = quantval[k];
+      if (qval > 1) {
+        float scale_min = (qval - 0.5f) * invq;
+        dmin = ScaleToDistance(scale_min, k);
+      }
+      if (qval < quant_max) {
+        float scale_max = (qval + 0.5f) * invq;
+        dmax = ScaleToDistance(scale_max, k);
+      }
+      if (dmin <= dist_max) {
+        dist_min = std::max(dmin, dist_min);
+      }
+      if (dmax >= dist_min) {
+        dist_max = std::min(dist_max, dmax);
+      }
+    }
+  }
+  float distance;
+  if (dist_min == 0) {
+    distance = dist_max;
+  } else if (dist_max == kDistMax) {
+    distance = dist_min;
+  } else {
+    distance = 0.5f * (dist_min + dist_max);
+  }
+  return distance;
+}
+
+bool IsYUV420(j_compress_ptr cinfo) {
+  return (cinfo->jpeg_color_space == JCS_YCbCr &&
+          cinfo->comp_info[0].h_samp_factor == 2 &&
+          cinfo->comp_info[0].v_samp_factor == 2 &&
+          cinfo->comp_info[1].h_samp_factor == 1 &&
+          cinfo->comp_info[1].v_samp_factor == 1 &&
+          cinfo->comp_info[2].h_samp_factor == 1 &&
+          cinfo->comp_info[2].v_samp_factor == 1);
+}
+
+}  // namespace
+
+void SetQuantMatrices(j_compress_ptr cinfo, float distances[NUM_QUANT_TBLS],
+                      bool add_two_chroma_tables) {
+  jpeg_comp_master* m = cinfo->master;
+  const bool xyb = m->xyb_mode && cinfo->jpeg_color_space == JCS_RGB;
+  const bool is_yuv420 = IsYUV420(cinfo);
+
+  float global_scale;
+  bool non_linear_scaling = true;
+  const float* base_quant_matrix[NUM_QUANT_TBLS];
+  int num_base_tables;
+
+  if (xyb) {
+    global_scale = kGlobalScaleXYB;
+    num_base_tables = 3;
+    base_quant_matrix[0] = kBaseQuantMatrixXYB;
+    base_quant_matrix[1] = kBaseQuantMatrixXYB + DCTSIZE2;
+    base_quant_matrix[2] = kBaseQuantMatrixXYB + 2 * DCTSIZE2;
+  } else if (cinfo->jpeg_color_space == JCS_YCbCr && !m->use_std_tables) {
+    global_scale = kGlobalScaleYCbCr;
+    if (m->cicp_transfer_function == kTransferFunctionPQ) {
+      global_scale *= .4f;
+    } else if (m->cicp_transfer_function == kTransferFunctionHLG) {
+      global_scale *= .5f;
+    }
+    if (is_yuv420) {
+      global_scale *= k420GlobalScale;
+    }
+    if (add_two_chroma_tables) {
+      cinfo->comp_info[2].quant_tbl_no = 2;
+      num_base_tables = 3;
+      base_quant_matrix[0] = kBaseQuantMatrixYCbCr;
+      base_quant_matrix[1] = kBaseQuantMatrixYCbCr + DCTSIZE2;
+      base_quant_matrix[2] = kBaseQuantMatrixYCbCr + 2 * DCTSIZE2;
+    } else {
+      num_base_tables = 2;
+      base_quant_matrix[0] = kBaseQuantMatrixYCbCr;
+      // Use the Cr table for both Cb and Cr.
+      base_quant_matrix[1] = kBaseQuantMatrixYCbCr + 2 * DCTSIZE2;
+    }
+  } else {
+    global_scale = 0.01f;
+    non_linear_scaling = false;
+    num_base_tables = 2;
+    base_quant_matrix[0] = kBaseQuantMatrixStd;
+    base_quant_matrix[1] = kBaseQuantMatrixStd + DCTSIZE2;
+  }
+
+  int quant_max = m->force_baseline ? 255 : 32767U;
+  for (int quant_idx = 0; quant_idx < num_base_tables; ++quant_idx) {
+    const float* base_qm = base_quant_matrix[quant_idx];
+    JQUANT_TBL** qtable = &cinfo->quant_tbl_ptrs[quant_idx];
+    if (*qtable == nullptr) {
+      *qtable = jpegli_alloc_quant_table(reinterpret_cast<j_common_ptr>(cinfo));
+    }
+    for (int k = 0; k < DCTSIZE2; ++k) {
+      float scale = global_scale;
+      if (non_linear_scaling) {
+        scale *= DistanceToScale(distances[quant_idx], k);
+        if (is_yuv420 && quant_idx > 0) {
+          scale *= k420Rescale[k];
+        }
+      } else {
+        scale *= DistanceToLinearQuality(distances[quant_idx]);
+      }
+      int qval = std::round(scale * base_qm[k]);
+      (*qtable)->quantval[k] = std::max(1, std::min(qval, quant_max));
+    }
+    (*qtable)->sent_table = FALSE;
+  }
+}
+
+void InitQuantizer(j_compress_ptr cinfo) {
+  jpeg_comp_master* m = cinfo->master;
+  // Compute quantization multupliers from the quant table values.
+  for (int c = 0; c < cinfo->num_components; ++c) {
+    int quant_idx = cinfo->comp_info[c].quant_tbl_no;
+    JQUANT_TBL* quant_table = cinfo->quant_tbl_ptrs[quant_idx];
+    if (!quant_table) {
+      JPEGLI_ERROR("Missing quantization table %d for component %d", quant_idx,
+                   c);
+    }
+    for (size_t k = 0; k < DCTSIZE2; k++) {
+      int val = quant_table->quantval[k];
+      if (val == 0) {
+        JPEGLI_ERROR("Invalid quantval 0.");
+      }
+      m->quant_mul[c][k] = 8.0f / val;
+    }
+  }
+  if (m->use_adaptive_quantization) {
+    for (int c = 0; c < cinfo->num_components; ++c) {
+      for (int k = 0; k < DCTSIZE2; ++k) {
+        m->zero_bias_mul[c][k] = 0.5f;
+        m->zero_bias_offset[c][k] = 0.5f;
+      }
+    }
+    if (cinfo->jpeg_color_space == JCS_YCbCr) {
+      float distance = QuantValsToDistance(cinfo);
+      static const float kDistHQ = 1.0f;
+      static const float kDistLQ = 3.0f;
+      float mix0 = (distance - kDistHQ) / (kDistLQ - kDistHQ);
+      mix0 = std::max(0.0f, std::min(1.0f, mix0));
+      float mix1 = 1.0f - mix0;
+      for (int c = 0; c < cinfo->num_components; ++c) {
+        for (int k = 0; k < DCTSIZE2; ++k) {
+          float mul0 = kZeroBiasMulYCbCrLQ[c * DCTSIZE2 + k];
+          float mul1 = kZeroBiasMulYCbCrHQ[c * DCTSIZE2 + k];
+          m->zero_bias_mul[c][k] = mix0 * mul0 + mix1 * mul1;
+          m->zero_bias_offset[c][k] = kZeroBiasOffsetYCbCr[c];
+        }
+      }
+    }
+  }
+}
+
+}  // namespace jpegli
diff --git a/third_party/jpeg-xl/lib/jpegli/quant.h b/third_party/jpeg-xl/lib/jpegli/quant.h
new file mode 100644
index 0000000000..44deb48d45
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/quant.h
@@ -0,0 +1,23 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JPEGLI_QUANT_H_
+#define LIB_JPEGLI_QUANT_H_
+
+/* clang-format off */
+#include <stdio.h>
+#include <jpeglib.h>
+/* clang-format on */
+
+namespace jpegli {
+
+void SetQuantMatrices(j_compress_ptr cinfo, float distances[NUM_QUANT_TBLS],
+                      bool add_two_chroma_tables);
+
+void InitQuantizer(j_compress_ptr cinfo);
+
+}  // namespace jpegli
+
+#endif  // LIB_JPEGLI_QUANT_H_
diff --git a/third_party/jpeg-xl/lib/jpegli/render.cc b/third_party/jpeg-xl/lib/jpegli/render.cc
new file mode 100644
index 0000000000..026345552a
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/render.cc
@@ -0,0 +1,802 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/render.h"
+
+#include <string.h>
+
+#include <array>
+#include <atomic>
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <hwy/aligned_allocator.h>
+
+#include "lib/jpegli/color_quantize.h"
+#include "lib/jpegli/color_transform.h"
+#include "lib/jpegli/decode_internal.h"
+#include "lib/jpegli/error.h"
+#include "lib/jpegli/idct.h"
+#include "lib/jpegli/upsample.h"
+#include "lib/jxl/base/byte_order.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/status.h"
+
+#ifdef MEMORY_SANITIZER
+#define JXL_MEMORY_SANITIZER 1
+#elif defined(__has_feature)
+#if __has_feature(memory_sanitizer)
+#define JXL_MEMORY_SANITIZER 1
+#else
+#define JXL_MEMORY_SANITIZER 0
+#endif
+#else
+#define JXL_MEMORY_SANITIZER 0
+#endif
+
+#if JXL_MEMORY_SANITIZER
+#include "sanitizer/msan_interface.h"
+#endif
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jpegli/render.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+HWY_BEFORE_NAMESPACE();
+namespace jpegli {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Abs;
+using hwy::HWY_NAMESPACE::Add;
+using hwy::HWY_NAMESPACE::Clamp;
+using hwy::HWY_NAMESPACE::Gt;
+using hwy::HWY_NAMESPACE::IfThenElseZero;
+using hwy::HWY_NAMESPACE::Mul;
+using hwy::HWY_NAMESPACE::NearestInt;
+using hwy::HWY_NAMESPACE::Or;
+using hwy::HWY_NAMESPACE::Rebind;
+using hwy::HWY_NAMESPACE::ShiftLeftSame;
+using hwy::HWY_NAMESPACE::ShiftRightSame;
+using hwy::HWY_NAMESPACE::Vec;
+using D = HWY_FULL(float);
+using DI = HWY_FULL(int32_t);
+constexpr D d;
+constexpr DI di;
+
+void GatherBlockStats(const int16_t* JXL_RESTRICT coeffs,
+                      const size_t coeffs_size, int32_t* JXL_RESTRICT nonzeros,
+                      int32_t* JXL_RESTRICT sumabs) {
+  for (size_t i = 0; i < coeffs_size; i += Lanes(d)) {
+    size_t k = i % DCTSIZE2;
+    const Rebind<int16_t, DI> di16;
+    const Vec<DI> coeff = PromoteTo(di, Load(di16, coeffs + i));
+    const auto abs_coeff = Abs(coeff);
+    const auto not_0 = Gt(abs_coeff, Zero(di));
+    const auto nzero = IfThenElseZero(not_0, Set(di, 1));
+    Store(Add(nzero, Load(di, nonzeros + k)), di, nonzeros + k);
+    Store(Add(abs_coeff, Load(di, sumabs + k)), di, sumabs + k);
+  }
+}
+
+void DecenterRow(float* row, size_t xsize) {
+  const HWY_CAPPED(float, 8) df;
+  const auto c128 = Set(df, 128.0f / 255);
+  for (size_t x = 0; x < xsize; x += Lanes(df)) {
+    Store(Add(Load(df, row + x), c128), df, row + x);
+  }
+}
+
+void DitherRow(j_decompress_ptr cinfo, float* row, int c, size_t y,
+               size_t xsize) {
+  jpeg_decomp_master* m = cinfo->master;
+  if (!m->dither_[c]) return;
+  const float* dither_row =
+      &m->dither_[c][(y & m->dither_mask_) * m->dither_size_];
+  for (size_t x = 0; x < xsize; ++x) {
+    row[x] += dither_row[x & m->dither_mask_];
+  }
+}
+
+template <typename T>
+void StoreUnsignedRow(float* JXL_RESTRICT input[], size_t x0, size_t len,
+                      size_t num_channels, float multiplier, T* output) {
+  const HWY_CAPPED(float, 8) d;
+  auto zero = Zero(d);
+  auto mul = Set(d, multiplier);
+  const Rebind<T, decltype(d)> du;
+#if JXL_MEMORY_SANITIZER
+  const size_t padding = hwy::RoundUpTo(len, Lanes(d)) - len;
+  for (size_t c = 0; c < num_channels; ++c) {
+    __msan_unpoison(input[c] + x0 + len, sizeof(input[c][0]) * padding);
+  }
+#endif
+  if (num_channels == 1) {
+    for (size_t i = 0; i < len; i += Lanes(d)) {
+      auto v0 = Clamp(zero, Mul(LoadU(d, &input[0][x0 + i]), mul), mul);
+      StoreU(DemoteTo(du, NearestInt(v0)), du, &output[i]);
+    }
+  } else if (num_channels == 2) {
+    for (size_t i = 0; i < len; i += Lanes(d)) {
+      auto v0 = Clamp(zero, Mul(LoadU(d, &input[0][x0 + i]), mul), mul);
+      auto v1 = Clamp(zero, Mul(LoadU(d, &input[1][x0 + i]), mul), mul);
+      StoreInterleaved2(DemoteTo(du, NearestInt(v0)),
+                        DemoteTo(du, NearestInt(v1)), du, &output[2 * i]);
+    }
+  } else if (num_channels == 3) {
+    for (size_t i = 0; i < len; i += Lanes(d)) {
+      auto v0 = Clamp(zero, Mul(LoadU(d, &input[0][x0 + i]), mul), mul);
+      auto v1 = Clamp(zero, Mul(LoadU(d, &input[1][x0 + i]), mul), mul);
+      auto v2 = Clamp(zero, Mul(LoadU(d, &input[2][x0 + i]), mul), mul);
+      StoreInterleaved3(DemoteTo(du, NearestInt(v0)),
+                        DemoteTo(du, NearestInt(v1)),
+                        DemoteTo(du, NearestInt(v2)), du, &output[3 * i]);
+    }
+  } else if (num_channels == 4) {
+    for (size_t i = 0; i < len; i += Lanes(d)) {
+      auto v0 = Clamp(zero, Mul(LoadU(d, &input[0][x0 + i]), mul), mul);
+      auto v1 = Clamp(zero, Mul(LoadU(d, &input[1][x0 + i]), mul), mul);
+      auto v2 = Clamp(zero, Mul(LoadU(d, &input[2][x0 + i]), mul), mul);
+      auto v3 = Clamp(zero, Mul(LoadU(d, &input[3][x0 + i]), mul), mul);
+      StoreInterleaved4(DemoteTo(du, NearestInt(v0)),
+                        DemoteTo(du, NearestInt(v1)),
+                        DemoteTo(du, NearestInt(v2)),
+                        DemoteTo(du, NearestInt(v3)), du, &output[4 * i]);
+    }
+  }
+#if JXL_MEMORY_SANITIZER
+  __msan_poison(output + num_channels * len,
+                sizeof(output[0]) * num_channels * padding);
+#endif
+}
+
+void StoreFloatRow(float* JXL_RESTRICT input[3], size_t x0, size_t len,
+                   size_t num_channels, float* output) {
+  const HWY_CAPPED(float, 8) d;
+  if (num_channels == 1) {
+    memcpy(output, input[0] + x0, len * sizeof(output[0]));
+  } else if (num_channels == 2) {
+    for (size_t i = 0; i < len; i += Lanes(d)) {
+      StoreInterleaved2(LoadU(d, &input[0][x0 + i]),
+                        LoadU(d, &input[1][x0 + i]), d, &output[2 * i]);
+    }
+  } else if (num_channels == 3) {
+    for (size_t i = 0; i < len; i += Lanes(d)) {
+      StoreInterleaved3(LoadU(d, &input[0][x0 + i]),
+                        LoadU(d, &input[1][x0 + i]),
+                        LoadU(d, &input[2][x0 + i]), d, &output[3 * i]);
+    }
+  } else if (num_channels == 4) {
+    for (size_t i = 0; i < len; i += Lanes(d)) {
+      StoreInterleaved4(LoadU(d, &input[0][x0 + i]),
+                        LoadU(d, &input[1][x0 + i]),
+                        LoadU(d, &input[2][x0 + i]),
+                        LoadU(d, &input[3][x0 + i]), d, &output[4 * i]);
+    }
+  }
+}
+
+static constexpr float kFSWeightMR = 7.0f / 16.0f;
+static constexpr float kFSWeightBL = 3.0f / 16.0f;
+static constexpr float kFSWeightBM = 5.0f / 16.0f;
+static constexpr float kFSWeightBR = 1.0f / 16.0f;
+
+float LimitError(float error) {
+  float abserror = std::abs(error);
+  if (abserror > 48.0f) {
+    abserror = 32.0f;
+  } else if (abserror > 16.0f) {
+    abserror = 0.5f * abserror + 8.0f;
+  }
+  return error > 0.0f ? abserror : -abserror;
+}
+
+void WriteToOutput(j_decompress_ptr cinfo, float* JXL_RESTRICT rows[],
+                   size_t xoffset, size_t len, size_t num_channels,
+                   uint8_t* JXL_RESTRICT output) {
+  jpeg_decomp_master* m = cinfo->master;
+  uint8_t* JXL_RESTRICT scratch_space = m->output_scratch_;
+  if (cinfo->quantize_colors && m->quant_pass_ == 1) {
+    float* error_row[kMaxComponents];
+    float* next_error_row[kMaxComponents];
+    if (cinfo->dither_mode == JDITHER_ORDERED) {
+      for (size_t c = 0; c < num_channels; ++c) {
+        DitherRow(cinfo, &rows[c][xoffset], c, cinfo->output_scanline,
+                  cinfo->output_width);
+      }
+    } else if (cinfo->dither_mode == JDITHER_FS) {
+      for (size_t c = 0; c < num_channels; ++c) {
+        if (cinfo->output_scanline % 2 == 0) {
+          error_row[c] = m->error_row_[c];
+          next_error_row[c] = m->error_row_[c + kMaxComponents];
+        } else {
+          error_row[c] = m->error_row_[c + kMaxComponents];
+          next_error_row[c] = m->error_row_[c];
+        }
+        memset(next_error_row[c], 0.0, cinfo->output_width * sizeof(float));
+      }
+    }
+    const float mul = 255.0f;
+    if (cinfo->dither_mode != JDITHER_FS) {
+      StoreUnsignedRow(rows, xoffset, len, num_channels, mul, scratch_space);
+    }
+    for (size_t i = 0; i < len; ++i) {
+      uint8_t* pixel = &scratch_space[num_channels * i];
+      if (cinfo->dither_mode == JDITHER_FS) {
+        for (size_t c = 0; c < num_channels; ++c) {
+          float val = rows[c][i] * mul + LimitError(error_row[c][i]);
+          pixel[c] = std::round(std::min(255.0f, std::max(0.0f, val)));
+        }
+      }
+      int index = LookupColorIndex(cinfo, pixel);
+      output[i] = index;
+      if (cinfo->dither_mode == JDITHER_FS) {
+        size_t prev_i = i > 0 ? i - 1 : 0;
+        size_t next_i = i + 1 < len ? i + 1 : len - 1;
+        for (size_t c = 0; c < num_channels; ++c) {
+          float error = pixel[c] - cinfo->colormap[c][index];
+          error_row[c][next_i] += kFSWeightMR * error;
+          next_error_row[c][prev_i] += kFSWeightBL * error;
+          next_error_row[c][i] += kFSWeightBM * error;
+          next_error_row[c][next_i] += kFSWeightBR * error;
+        }
+      }
+    }
+  } else if (m->output_data_type_ == JPEGLI_TYPE_UINT8) {
+    const float mul = 255.0;
+    StoreUnsignedRow(rows, xoffset, len, num_channels, mul, scratch_space);
+    memcpy(output, scratch_space, len * num_channels);
+  } else if (m->output_data_type_ == JPEGLI_TYPE_UINT16) {
+    const float mul = 65535.0;
+    uint16_t* tmp = reinterpret_cast<uint16_t*>(scratch_space);
+    StoreUnsignedRow(rows, xoffset, len, num_channels, mul, tmp);
+    if (m->swap_endianness_) {
+      const HWY_CAPPED(uint16_t, 8) du;
+      size_t output_len = len * num_channels;
+      for (size_t j = 0; j < output_len; j += Lanes(du)) {
+        auto v = LoadU(du, tmp + j);
+        auto vswap = Or(ShiftRightSame(v, 8), ShiftLeftSame(v, 8));
+        StoreU(vswap, du, tmp + j);
+      }
+    }
+    memcpy(output, tmp, len * num_channels * 2);
+  } else if (m->output_data_type_ == JPEGLI_TYPE_FLOAT) {
+    float* tmp = reinterpret_cast<float*>(scratch_space);
+    StoreFloatRow(rows, xoffset, len, num_channels, tmp);
+    if (m->swap_endianness_) {
+      size_t output_len = len * num_channels;
+      for (size_t j = 0; j < output_len; ++j) {
+        tmp[j] = BSwapFloat(tmp[j]);
+      }
+    }
+    memcpy(output, tmp, len * num_channels * 4);
+  }
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jpegli
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace jpegli {
+
+HWY_EXPORT(GatherBlockStats);
+HWY_EXPORT(WriteToOutput);
+HWY_EXPORT(DecenterRow);
+
+void GatherBlockStats(const int16_t* JXL_RESTRICT coeffs,
+                      const size_t coeffs_size, int32_t* JXL_RESTRICT nonzeros,
+                      int32_t* JXL_RESTRICT sumabs) {
+  return HWY_DYNAMIC_DISPATCH(GatherBlockStats)(coeffs, coeffs_size, nonzeros,
+                                                sumabs);
+}
+
+void WriteToOutput(j_decompress_ptr cinfo, float* JXL_RESTRICT rows[],
+                   size_t xoffset, size_t len, size_t num_channels,
+                   uint8_t* JXL_RESTRICT output) {
+  return HWY_DYNAMIC_DISPATCH(WriteToOutput)(cinfo, rows, xoffset, len,
+                                             num_channels, output);
+}
+
+void DecenterRow(float* row, size_t xsize) {
+  return HWY_DYNAMIC_DISPATCH(DecenterRow)(row, xsize);
+}
+
+// Padding for horizontal chroma upsampling.
+constexpr size_t kPaddingLeft = 64;
+constexpr size_t kPaddingRight = 64;
+
+bool ShouldApplyDequantBiases(j_decompress_ptr cinfo, int ci) {
+  const auto& compinfo = cinfo->comp_info[ci];
+  return (compinfo.h_samp_factor == cinfo->max_h_samp_factor &&
+          compinfo.v_samp_factor == cinfo->max_v_samp_factor);
+}
+
+// See the following article for the details:
+// J. R. Price and M. Rabbani, "Dequantization bias for JPEG decompression"
+// Proceedings International Conference on Information Technology: Coding and
+// Computing (Cat. No.PR00540), 2000, pp. 30-35, doi: 10.1109/ITCC.2000.844179.
+void ComputeOptimalLaplacianBiases(const int num_blocks, const int* nonzeros,
+                                   const int* sumabs, float* biases) {
+  for (size_t k = 1; k < DCTSIZE2; ++k) {
+    if (nonzeros[k] == 0) {
+      biases[k] = 0.5f;
+      continue;
+    }
+    // Notation adapted from the article
+    float N = num_blocks;
+    float N1 = nonzeros[k];
+    float N0 = num_blocks - N1;
+    float S = sumabs[k];
+    // Compute gamma from N0, N1, N, S (eq. 11), with A and B being just
+    // temporary grouping of terms.
+    float A = 4.0 * S + 2.0 * N;
+    float B = 4.0 * S - 2.0 * N1;
+    float gamma = (-1.0 * N0 + std::sqrt(N0 * N0 * 1.0 + A * B)) / A;
+    float gamma2 = gamma * gamma;
+    // The bias is computed from gamma with (eq. 5), where the quantization
+    // multiplier Q can be factored out and thus the bias can be applied
+    // directly on the quantized coefficient.
+    biases[k] =
+        0.5 * (((1.0 + gamma2) / (1.0 - gamma2)) + 1.0 / std::log(gamma));
+  }
+}
+
+constexpr std::array<int, SAVED_COEFS> Q_POS = {0, 1, 8,  16, 9,
+                                                2, 3, 10, 17, 24};
+
+bool is_nonzero_quantizers(const JQUANT_TBL* qtable) {
+  return std::all_of(Q_POS.begin(), Q_POS.end(),
+                     [&](int pos) { return qtable->quantval[pos] != 0; });
+}
+
+// Determine whether smoothing should be applied during decompression
+bool do_smoothing(j_decompress_ptr cinfo) {
+  jpeg_decomp_master* m = cinfo->master;
+  bool smoothing_useful = false;
+
+  if (!cinfo->progressive_mode || cinfo->coef_bits == nullptr) {
+    return false;
+  }
+  auto coef_bits_latch = m->coef_bits_latch;
+  auto prev_coef_bits_latch = m->prev_coef_bits_latch;
+
+  for (int ci = 0; ci < cinfo->num_components; ci++) {
+    jpeg_component_info* compptr = &cinfo->comp_info[ci];
+    JQUANT_TBL* qtable = compptr->quant_table;
+    int* coef_bits = cinfo->coef_bits[ci];
+    int* prev_coef_bits = cinfo->coef_bits[ci + cinfo->num_components];
+
+    // Return early if conditions for smoothing are not met
+    if (qtable == nullptr || !is_nonzero_quantizers(qtable) ||
+        coef_bits[0] < 0) {
+      return false;
+    }
+
+    coef_bits_latch[ci][0] = coef_bits[0];
+
+    for (int coefi = 1; coefi < SAVED_COEFS; coefi++) {
+      prev_coef_bits_latch[ci][coefi] =
+          cinfo->input_scan_number > 1 ? prev_coef_bits[coefi] : -1;
+      if (coef_bits[coefi] != 0) {
+        smoothing_useful = true;
+      }
+      coef_bits_latch[ci][coefi] = coef_bits[coefi];
+    }
+  }
+
+  return smoothing_useful;
+}
+
+void PredictSmooth(j_decompress_ptr cinfo, JBLOCKARRAY blocks, int component,
+                   size_t bx, int iy) {
+  const size_t imcu_row = cinfo->output_iMCU_row;
+  int16_t* scratch = cinfo->master->smoothing_scratch_;
+  std::vector<int> Q_VAL(SAVED_COEFS);
+  int* coef_bits;
+
+  std::array<std::array<int, 5>, 5> dc_values;
+  auto& compinfo = cinfo->comp_info[component];
+  const size_t by0 = imcu_row * compinfo.v_samp_factor;
+  const size_t by = by0 + iy;
+
+  int prev_iy = by > 0 ? iy - 1 : 0;
+  int prev_prev_iy = by > 1 ? iy - 2 : prev_iy;
+  int next_iy = by + 1 < compinfo.height_in_blocks ? iy + 1 : iy;
+  int next_next_iy = by + 2 < compinfo.height_in_blocks ? iy + 2 : next_iy;
+
+  const int16_t* cur_row = blocks[iy][bx];
+  const int16_t* prev_row = blocks[prev_iy][bx];
+  const int16_t* prev_prev_row = blocks[prev_prev_iy][bx];
+  const int16_t* next_row = blocks[next_iy][bx];
+  const int16_t* next_next_row = blocks[next_next_iy][bx];
+
+  int prev_block_ind = bx ? -DCTSIZE2 : 0;
+  int prev_prev_block_ind = bx > 1 ? -2 * DCTSIZE2 : prev_block_ind;
+  int next_block_ind = bx + 1 < compinfo.width_in_blocks ? DCTSIZE2 : 0;
+  int next_next_block_ind =
+      bx + 2 < compinfo.width_in_blocks ? DCTSIZE2 * 2 : next_block_ind;
+
+  std::array<const int16_t*, 5> row_ptrs = {prev_prev_row, prev_row, cur_row,
+                                            next_row, next_next_row};
+  std::array<int, 5> block_inds = {prev_prev_block_ind, prev_block_ind, 0,
+                                   next_block_ind, next_next_block_ind};
+
+  memcpy(scratch, cur_row, DCTSIZE2 * sizeof(cur_row[0]));
+
+  for (int r = 0; r < 5; ++r) {
+    for (int c = 0; c < 5; ++c) {
+      dc_values[r][c] = row_ptrs[r][block_inds[c]];
+    }
+  }
+  // Get the correct coef_bits: In case of an incomplete scan, we use the
+  // prev coeficients.
+  if (cinfo->output_iMCU_row + 1 > cinfo->input_iMCU_row) {
+    coef_bits = cinfo->master->prev_coef_bits_latch[component];
+  } else {
+    coef_bits = cinfo->master->coef_bits_latch[component];
+  }
+
+  bool change_dc = true;
+  for (int i = 1; i < SAVED_COEFS; i++) {
+    if (coef_bits[i] != -1) {
+      change_dc = false;
+      break;
+    }
+  }
+
+  JQUANT_TBL* quanttbl = cinfo->quant_tbl_ptrs[compinfo.quant_tbl_no];
+  for (size_t i = 0; i < 6; ++i) {
+    Q_VAL[i] = quanttbl->quantval[Q_POS[i]];
+  }
+  if (change_dc) {
+    for (size_t i = 6; i < SAVED_COEFS; ++i) {
+      Q_VAL[i] = quanttbl->quantval[Q_POS[i]];
+    }
+  }
+  auto calculate_dct_value = [&](int coef_index) {
+    int64_t num = 0;
+    int pred;
+    int Al;
+    // we use the symmetry of the smoothing matrices by transposing the 5x5 dc
+    // matrix in that case.
+    bool swap_indices = coef_index == 2 || coef_index == 5 || coef_index == 8 ||
+                        coef_index == 9;
+    auto dc = [&](int i, int j) {
+      return swap_indices ? dc_values[j][i] : dc_values[i][j];
+    };
+    Al = coef_bits[coef_index];
+    switch (coef_index) {
+      case 0:
+        // set the DC
+        num = (-2 * dc(0, 0) - 6 * dc(0, 1) - 8 * dc(0, 2) - 6 * dc(0, 3) -
+               2 * dc(0, 4) - 6 * dc(1, 0) + 6 * dc(1, 1) + 42 * dc(1, 2) +
+               6 * dc(1, 3) - 6 * dc(1, 4) - 8 * dc(2, 0) + 42 * dc(2, 1) +
+               152 * dc(2, 2) + 42 * dc(2, 3) - 8 * dc(2, 4) - 6 * dc(3, 0) +
+               6 * dc(3, 1) + 42 * dc(3, 2) + 6 * dc(3, 3) - 6 * dc(3, 4) -
+               2 * dc(4, 0) - 6 * dc(4, 1) - 8 * dc(4, 2) - 6 * dc(4, 3) -
+               2 * dc(4, 4));
+        // special case: for the DC the dequantization is different
+        Al = 0;
+        break;
+      case 1:
+      case 2:
+        // set Q01 or Q10
+        num = (change_dc ? (-dc(0, 0) - dc(0, 1) + dc(0, 3) + dc(0, 4) -
+                            3 * dc(1, 0) + 13 * dc(1, 1) - 13 * dc(1, 3) +
+                            3 * dc(1, 4) - 3 * dc(2, 0) + 38 * dc(2, 1) -
+                            38 * dc(2, 3) + 3 * dc(2, 4) - 3 * dc(3, 0) +
+                            13 * dc(3, 1) - 13 * dc(3, 3) + 3 * dc(3, 4) -
+                            dc(4, 0) - dc(4, 1) + dc(4, 3) + dc(4, 4))
+                         : (-7 * dc(2, 0) + 50 * dc(2, 1) - 50 * dc(2, 3) +
+                            7 * dc(2, 4)));
+        break;
+      case 3:
+      case 5:
+        // set Q02 or Q20
+        num = (change_dc
+                   ? dc(0, 2) + 2 * dc(1, 1) + 7 * dc(1, 2) + 2 * dc(1, 3) -
+                         5 * dc(2, 1) - 14 * dc(2, 2) - 5 * dc(2, 3) +
+                         2 * dc(3, 1) + 7 * dc(3, 2) + 2 * dc(3, 3) + dc(4, 2)
+                   : (-dc(0, 2) + 13 * dc(1, 2) - 24 * dc(2, 2) +
+                      13 * dc(3, 2) - dc(4, 2)));
+        break;
+      case 4:
+        // set Q11
+        num =
+            (change_dc ? -dc(0, 0) + dc(0, 4) + 9 * dc(1, 1) - 9 * dc(1, 3) -
+                             9 * dc(3, 1) + 9 * dc(3, 3) + dc(4, 0) - dc(4, 4)
+                       : (dc(1, 4) + dc(3, 0) - 10 * dc(3, 1) + 10 * dc(3, 3) -
+                          dc(0, 1) - dc(3, 4) + dc(4, 1) - dc(4, 3) + dc(0, 3) -
+                          dc(1, 0) + 10 * dc(1, 1) - 10 * dc(1, 3)));
+        break;
+      case 6:
+      case 9:
+        // set Q03 or Q30
+        num = (dc(1, 1) - dc(1, 3) + 2 * dc(2, 1) - 2 * dc(2, 3) + dc(3, 1) -
+               dc(3, 3));
+        break;
+      case 7:
+      case 8:
+        // set Q12 and Q21
+        num = (dc(1, 1) - 3 * dc(1, 2) + dc(1, 3) - dc(3, 1) + 3 * dc(3, 2) -
+               dc(3, 3));
+        break;
+    }
+    num = Q_VAL[0] * num;
+    if (num >= 0) {
+      pred = ((Q_VAL[coef_index] << 7) + num) / (Q_VAL[coef_index] << 8);
+      if (Al > 0 && pred >= (1 << Al)) pred = (1 << Al) - 1;
+    } else {
+      pred = ((Q_VAL[coef_index] << 7) - num) / (Q_VAL[coef_index] << 8);
+      if (Al > 0 && pred >= (1 << Al)) pred = (1 << Al) - 1;
+      pred = -pred;
+    }
+    return static_cast<int16_t>(pred);
+  };
+
+  int loop_end = change_dc ? SAVED_COEFS : 6;
+  for (int i = 1; i < loop_end; ++i) {
+    if (coef_bits[i] != 0 && scratch[Q_POS[i]] == 0) {
+      scratch[Q_POS[i]] = calculate_dct_value(i);
+    }
+  }
+  if (change_dc) {
+    scratch[0] = calculate_dct_value(0);
+  }
+}
+
+void PrepareForOutput(j_decompress_ptr cinfo) {
+  jpeg_decomp_master* m = cinfo->master;
+  size_t iMCU_width = cinfo->max_h_samp_factor * m->min_scaled_dct_size;
+  size_t output_stride = m->iMCU_cols_ * iMCU_width;
+  for (int c = 0; c < cinfo->num_components; ++c) {
+    const auto& comp = cinfo->comp_info[c];
+    size_t cheight = comp.v_samp_factor * m->scaled_dct_size[c];
+    m->raw_height_[c] = cinfo->total_iMCU_rows * cheight;
+    m->raw_output_[c].Allocate(cinfo, 3 * cheight, output_stride);
+  }
+  int num_all_components =
+      std::max(cinfo->out_color_components, cinfo->num_components);
+  for (int c = 0; c < num_all_components; ++c) {
+    m->render_output_[c].Allocate(cinfo, cinfo->max_v_samp_factor,
+                                  output_stride);
+  }
+  m->idct_scratch_ = Allocate<float>(cinfo, 5 * DCTSIZE2, JPOOL_IMAGE_ALIGNED);
+  m->upsample_scratch_ = Allocate<float>(
+      cinfo, output_stride + kPaddingLeft + kPaddingRight, JPOOL_IMAGE_ALIGNED);
+  size_t bytes_per_sample = jpegli_bytes_per_sample(m->output_data_type_);
+  size_t bytes_per_pixel = cinfo->out_color_components * bytes_per_sample;
+  size_t scratch_stride = RoundUpTo(output_stride, HWY_ALIGNMENT);
+  m->output_scratch_ = Allocate<uint8_t>(
+      cinfo, bytes_per_pixel * scratch_stride, JPOOL_IMAGE_ALIGNED);
+  m->smoothing_scratch_ =
+      Allocate<int16_t>(cinfo, DCTSIZE2, JPOOL_IMAGE_ALIGNED);
+  bool smoothing = do_smoothing(cinfo);
+  m->apply_smoothing = smoothing && cinfo->do_block_smoothing;
+  size_t coeffs_per_block = cinfo->num_components * DCTSIZE2;
+  m->nonzeros_ = Allocate<int>(cinfo, coeffs_per_block, JPOOL_IMAGE_ALIGNED);
+  m->sumabs_ = Allocate<int>(cinfo, coeffs_per_block, JPOOL_IMAGE_ALIGNED);
+  memset(m->nonzeros_, 0, coeffs_per_block * sizeof(m->nonzeros_[0]));
+  memset(m->sumabs_, 0, coeffs_per_block * sizeof(m->sumabs_[0]));
+  memset(m->num_processed_blocks_, 0, sizeof(m->num_processed_blocks_));
+  m->biases_ = Allocate<float>(cinfo, coeffs_per_block, JPOOL_IMAGE_ALIGNED);
+  memset(m->biases_, 0, coeffs_per_block * sizeof(m->biases_[0]));
+  cinfo->output_iMCU_row = 0;
+  cinfo->output_scanline = 0;
+  const float kDequantScale = 1.0f / (8 * 255);
+  if (m->dequant_ == nullptr) {
+    m->dequant_ = Allocate<float>(cinfo, coeffs_per_block, JPOOL_IMAGE_ALIGNED);
+    memset(m->dequant_, 0, coeffs_per_block * sizeof(float));
+  }
+  for (int c = 0; c < cinfo->num_components; c++) {
+    const auto& comp = cinfo->comp_info[c];
+    JQUANT_TBL* table = comp.quant_table;
+    if (table == nullptr) continue;
+    for (size_t k = 0; k < DCTSIZE2; ++k) {
+      m->dequant_[c * DCTSIZE2 + k] = table->quantval[k] * kDequantScale;
+    }
+  }
+  ChooseInverseTransform(cinfo);
+  ChooseColorTransform(cinfo);
+}
+
+void DecodeCurrentiMCURow(j_decompress_ptr cinfo) {
+  jpeg_decomp_master* m = cinfo->master;
+  const size_t imcu_row = cinfo->output_iMCU_row;
+  JBLOCKARRAY ba[kMaxComponents];
+  for (int c = 0; c < cinfo->num_components; ++c) {
+    const jpeg_component_info* comp = &cinfo->comp_info[c];
+    int by0 = imcu_row * comp->v_samp_factor;
+    int block_rows_left = comp->height_in_blocks - by0;
+    int max_block_rows = std::min(comp->v_samp_factor, block_rows_left);
+    int offset = m->streaming_mode_ ? 0 : by0;
+    ba[c] = (*cinfo->mem->access_virt_barray)(
+        reinterpret_cast<j_common_ptr>(cinfo), m->coef_arrays[c], offset,
+        max_block_rows, false);
+  }
+  for (int c = 0; c < cinfo->num_components; ++c) {
+    size_t k0 = c * DCTSIZE2;
+    auto& compinfo = cinfo->comp_info[c];
+    size_t block_row = imcu_row * compinfo.v_samp_factor;
+    if (ShouldApplyDequantBiases(cinfo, c)) {
+      // Update statistics for this iMCU row.
+      for (int iy = 0; iy < compinfo.v_samp_factor; ++iy) {
+        size_t by = block_row + iy;
+        if (by >= compinfo.height_in_blocks) {
+          continue;
+        }
+        int16_t* JXL_RESTRICT coeffs = &ba[c][iy][0][0];
+        size_t num = compinfo.width_in_blocks * DCTSIZE2;
+        GatherBlockStats(coeffs, num, &m->nonzeros_[k0], &m->sumabs_[k0]);
+        m->num_processed_blocks_[c] += compinfo.width_in_blocks;
+      }
+      if (imcu_row % 4 == 3) {
+        // Re-compute optimal biases every few iMCU-rows.
+        ComputeOptimalLaplacianBiases(m->num_processed_blocks_[c],
+                                      &m->nonzeros_[k0], &m->sumabs_[k0],
+                                      &m->biases_[k0]);
+      }
+    }
+    RowBuffer<float>* raw_out = &m->raw_output_[c];
+    for (int iy = 0; iy < compinfo.v_samp_factor; ++iy) {
+      size_t by = block_row + iy;
+      if (by >= compinfo.height_in_blocks) {
+        continue;
+      }
+      size_t dctsize = m->scaled_dct_size[c];
+      int16_t* JXL_RESTRICT row_in = &ba[c][iy][0][0];
+      float* JXL_RESTRICT row_out = raw_out->Row(by * dctsize);
+      for (size_t bx = 0; bx < compinfo.width_in_blocks; ++bx) {
+        if (m->apply_smoothing) {
+          PredictSmooth(cinfo, ba[c], c, bx, iy);
+          (*m->inverse_transform[c])(m->smoothing_scratch_, &m->dequant_[k0],
+                                     &m->biases_[k0], m->idct_scratch_,
+                                     &row_out[bx * dctsize], raw_out->stride(),
+                                     dctsize);
+        } else {
+          (*m->inverse_transform[c])(&row_in[bx * DCTSIZE2], &m->dequant_[k0],
+                                     &m->biases_[k0], m->idct_scratch_,
+                                     &row_out[bx * dctsize], raw_out->stride(),
+                                     dctsize);
+        }
+      }
+      if (m->streaming_mode_) {
+        memset(row_in, 0, compinfo.width_in_blocks * sizeof(JBLOCK));
+      }
+    }
+  }
+}
+
+void ProcessRawOutput(j_decompress_ptr cinfo, JSAMPIMAGE data) {
+  jpegli::DecodeCurrentiMCURow(cinfo);
+  jpeg_decomp_master* m = cinfo->master;
+  for (int c = 0; c < cinfo->num_components; ++c) {
+    const auto& compinfo = cinfo->comp_info[c];
+    size_t comp_width = compinfo.width_in_blocks * DCTSIZE;
+    size_t comp_height = compinfo.height_in_blocks * DCTSIZE;
+    size_t comp_nrows = compinfo.v_samp_factor * DCTSIZE;
+    size_t y0 = cinfo->output_iMCU_row * compinfo.v_samp_factor * DCTSIZE;
+    size_t y1 = std::min(y0 + comp_nrows, comp_height);
+    for (size_t y = y0; y < y1; ++y) {
+      float* rows[1] = {m->raw_output_[c].Row(y)};
+      uint8_t* output = data[c][y - y0];
+      DecenterRow(rows[0], comp_width);
+      WriteToOutput(cinfo, rows, 0, comp_width, 1, output);
+    }
+  }
+  ++cinfo->output_iMCU_row;
+  cinfo->output_scanline += cinfo->max_v_samp_factor * DCTSIZE;
+  if (cinfo->output_scanline >= cinfo->output_height) {
+    ++m->output_passes_done_;
+  }
+}
+
+void ProcessOutput(j_decompress_ptr cinfo, size_t* num_output_rows,
+                   JSAMPARRAY scanlines, size_t max_output_rows) {
+  jpeg_decomp_master* m = cinfo->master;
+  const int vfactor = cinfo->max_v_samp_factor;
+  const int hfactor = cinfo->max_h_samp_factor;
+  const size_t imcu_row = cinfo->output_iMCU_row;
+  const size_t imcu_height = vfactor * m->min_scaled_dct_size;
+  const size_t imcu_width = hfactor * m->min_scaled_dct_size;
+  const size_t output_width = m->iMCU_cols_ * imcu_width;
+  if (imcu_row == cinfo->total_iMCU_rows ||
+      (imcu_row > 1 && cinfo->output_scanline < (imcu_row - 1) * imcu_height)) {
+    // We are ready to output some scanlines.
+    size_t ybegin = cinfo->output_scanline;
+    size_t yend =
+        (imcu_row == cinfo->total_iMCU_rows ? cinfo->output_height
+                                            : (imcu_row - 1) * imcu_height);
+    yend = std::min<size_t>(yend, ybegin + max_output_rows - *num_output_rows);
+    size_t yb = (ybegin / vfactor) * vfactor;
+    size_t ye = DivCeil(yend, vfactor) * vfactor;
+    for (size_t y = yb; y < ye; y += vfactor) {
+      for (int c = 0; c < cinfo->num_components; ++c) {
+        RowBuffer<float>* raw_out = &m->raw_output_[c];
+        RowBuffer<float>* render_out = &m->render_output_[c];
+        int line_groups = vfactor / m->v_factor[c];
+        size_t yc = y / m->v_factor[c];
+        for (int dy = 0; dy < line_groups; ++dy) {
+          if (cinfo->do_fancy_upsampling && m->v_factor[c] == 2) {
+            size_t ymid = yc + dy;
+            const float* JXL_RESTRICT row_mid = raw_out->Row(ymid);
+            const float* JXL_RESTRICT row_top =
+                ymid == 0 ? row_mid : raw_out->Row(ymid - 1);
+            const float* JXL_RESTRICT row_bot = ymid + 1 == m->raw_height_[c]
+                                                    ? row_mid
+                                                    : raw_out->Row(ymid + 1);
+            Upsample2Vertical(row_top, row_mid, row_bot,
+                              render_out->Row(2 * dy),
+                              render_out->Row(2 * dy + 1), output_width);
+          } else {
+            for (int yix = 0; yix < m->v_factor[c]; ++yix) {
+              size_t ymid = yc + dy;
+              memcpy(render_out->Row(m->v_factor[c] * dy + yix),
+                     raw_out->Row(ymid), raw_out->xsize() * sizeof(float));
+            }
+          }
+        }
+      }
+      for (int yix = 0; yix < vfactor; ++yix) {
+        if (y + yix < ybegin || y + yix >= yend) continue;
+        float* rows[kMaxComponents];
+        int num_all_components =
+            std::max(cinfo->out_color_components, cinfo->num_components);
+        for (int c = 0; c < num_all_components; ++c) {
+          rows[c] = m->render_output_[c].Row(yix);
+        }
+        (*m->color_transform)(rows, output_width);
+        for (int c = 0; c < cinfo->out_color_components; ++c) {
+          // Undo the centering of the sample values around zero.
+          DecenterRow(rows[c], output_width);
+        }
+        if (scanlines) {
+          uint8_t* output = scanlines[*num_output_rows];
+          WriteToOutput(cinfo, rows, m->xoffset_, cinfo->output_width,
+                        cinfo->out_color_components, output);
+        }
+        JXL_ASSERT(cinfo->output_scanline == y + yix);
+        ++cinfo->output_scanline;
+        ++(*num_output_rows);
+        if (cinfo->output_scanline == cinfo->output_height) {
+          ++m->output_passes_done_;
+        }
+      }
+    }
+  } else {
+    DecodeCurrentiMCURow(cinfo);
+    for (int c = 0; c < cinfo->num_components; ++c) {
+      if (m->h_factor[c] == 1) continue;
+      const auto& compinfo = cinfo->comp_info[c];
+      RowBuffer<float>* raw_out = &m->raw_output_[c];
+      size_t cheight = compinfo.v_samp_factor * m->scaled_dct_size[c];
+      size_t y0 = imcu_row * cheight;
+      if (cinfo->do_fancy_upsampling && m->h_factor[c] == 2) {
+        for (size_t iy = 0; iy < cheight; ++iy) {
+          float* JXL_RESTRICT row = raw_out->Row(y0 + iy);
+          Upsample2Horizontal(row, m->upsample_scratch_, output_width);
+        }
+      } else {
+        for (size_t iy = 0; iy < cheight; ++iy) {
+          float* JXL_RESTRICT row = raw_out->Row(y0 + iy);
+          float* JXL_RESTRICT tmp = m->upsample_scratch_;
+          // TODO(szabadka) SIMDify this.
+          for (size_t x = 0; x < output_width; ++x) {
+            tmp[x] = row[x / m->h_factor[c]];
+          }
+          memcpy(row, tmp, output_width * sizeof(tmp[0]));
+        }
+      }
+    }
+    ++cinfo->output_iMCU_row;
+  }
+}
+
+}  // namespace jpegli
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jpegli/render.h b/third_party/jpeg-xl/lib/jpegli/render.h
new file mode 100644
index 0000000000..93b80d975a
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/render.h
@@ -0,0 +1,28 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JPEGLI_RENDER_H_
+#define LIB_JPEGLI_RENDER_H_
+
+/* clang-format off */
+#include <stdint.h>
+#include <stdio.h>
+#include <jpeglib.h>
+/* clang-format on */
+
+#include <vector>
+
+namespace jpegli {
+
+void PrepareForOutput(j_decompress_ptr cinfo);
+
+void ProcessOutput(j_decompress_ptr cinfo, size_t* num_output_rows,
+                   JSAMPARRAY scanlines, size_t max_output_rows);
+
+void ProcessRawOutput(j_decompress_ptr cinfo, JSAMPIMAGE data);
+
+}  // namespace jpegli
+
+#endif  // LIB_JPEGLI_RENDER_H_
diff --git a/third_party/jpeg-xl/lib/jpegli/simd.cc b/third_party/jpeg-xl/lib/jpegli/simd.cc
new file mode 100644
index 0000000000..5e84939342
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/simd.cc
@@ -0,0 +1,38 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/simd.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jpegli/simd.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+HWY_BEFORE_NAMESPACE();
+namespace jpegli {
+namespace HWY_NAMESPACE {
+
+size_t GetVectorSize() { return HWY_LANES(uint8_t); }
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jpegli
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jpegli {
+namespace {
+
+HWY_EXPORT(GetVectorSize);  // Local function.
+
+}  // namespace
+
+size_t VectorSize() {
+  static size_t bytes = HWY_DYNAMIC_DISPATCH(GetVectorSize)();
+  return bytes;
+}
+
+}  // namespace jpegli
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jpegli/simd.h b/third_party/jpeg-xl/lib/jpegli/simd.h
new file mode 100644
index 0000000000..aec772e2d4
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/simd.h
@@ -0,0 +1,18 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JPEGLI_SIMD_H_
+#define LIB_JPEGLI_SIMD_H_
+
+#include <stddef.h>
+
+namespace jpegli {
+
+// Returns SIMD vector size in bytes.
+size_t VectorSize();
+
+}  // namespace jpegli
+
+#endif  // LIB_JPEGLI_SIMD_H_
diff --git a/third_party/jpeg-xl/lib/jpegli/source_manager.cc b/third_party/jpeg-xl/lib/jpegli/source_manager.cc
new file mode 100644
index 0000000000..0b8e0a5c8c
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/source_manager.cc
@@ -0,0 +1,90 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/decode.h"
+#include "lib/jpegli/error.h"
+#include "lib/jpegli/memory_manager.h"
+
+namespace jpegli {
+
+void init_mem_source(j_decompress_ptr cinfo) {}
+void init_stdio_source(j_decompress_ptr cinfo) {}
+
+void skip_input_data(j_decompress_ptr cinfo, long num_bytes) {
+  if (num_bytes <= 0) return;
+  while (num_bytes > static_cast<long>(cinfo->src->bytes_in_buffer)) {
+    num_bytes -= cinfo->src->bytes_in_buffer;
+    (*cinfo->src->fill_input_buffer)(cinfo);
+  }
+  cinfo->src->next_input_byte += num_bytes;
+  cinfo->src->bytes_in_buffer -= num_bytes;
+}
+
+void term_source(j_decompress_ptr cinfo) {}
+
+boolean EmitFakeEoiMarker(j_decompress_ptr cinfo) {
+  static constexpr uint8_t kFakeEoiMarker[2] = {0xff, 0xd9};
+  cinfo->src->next_input_byte = kFakeEoiMarker;
+  cinfo->src->bytes_in_buffer = 2;
+  return TRUE;
+}
+
+constexpr size_t kStdioBufferSize = 64 << 10;
+
+struct StdioSourceManager {
+  jpeg_source_mgr pub;
+  FILE* f;
+  uint8_t* buffer;
+
+  static boolean fill_input_buffer(j_decompress_ptr cinfo) {
+    auto src = reinterpret_cast<StdioSourceManager*>(cinfo->src);
+    size_t num_bytes_read = fread(src->buffer, 1, kStdioBufferSize, src->f);
+    if (num_bytes_read == 0) {
+      return EmitFakeEoiMarker(cinfo);
+    }
+    src->pub.next_input_byte = src->buffer;
+    src->pub.bytes_in_buffer = num_bytes_read;
+    return TRUE;
+  }
+};
+
+}  // namespace jpegli
+
+void jpegli_mem_src(j_decompress_ptr cinfo, const unsigned char* inbuffer,
+                    unsigned long insize) {
+  if (cinfo->src && cinfo->src->init_source != jpegli::init_mem_source) {
+    JPEGLI_ERROR("jpegli_mem_src: a different source manager was already set");
+  }
+  if (!cinfo->src) {
+    cinfo->src = jpegli::Allocate<jpeg_source_mgr>(cinfo, 1);
+  }
+  cinfo->src->next_input_byte = inbuffer;
+  cinfo->src->bytes_in_buffer = insize;
+  cinfo->src->init_source = jpegli::init_mem_source;
+  cinfo->src->fill_input_buffer = jpegli::EmitFakeEoiMarker;
+  cinfo->src->skip_input_data = jpegli::skip_input_data;
+  cinfo->src->resync_to_restart = jpegli_resync_to_restart;
+  cinfo->src->term_source = jpegli::term_source;
+}
+
+void jpegli_stdio_src(j_decompress_ptr cinfo, FILE* infile) {
+  if (cinfo->src && cinfo->src->init_source != jpegli::init_stdio_source) {
+    JPEGLI_ERROR("jpeg_stdio_src: a different source manager was already set");
+  }
+  if (!cinfo->src) {
+    cinfo->src = reinterpret_cast<jpeg_source_mgr*>(
+        jpegli::Allocate<jpegli::StdioSourceManager>(cinfo, 1));
+  }
+  auto src = reinterpret_cast<jpegli::StdioSourceManager*>(cinfo->src);
+  src->f = infile;
+  src->buffer = jpegli::Allocate<uint8_t>(cinfo, jpegli::kStdioBufferSize);
+  src->pub.next_input_byte = src->buffer;
+  src->pub.bytes_in_buffer = 0;
+  src->pub.init_source = jpegli::init_stdio_source;
+  src->pub.fill_input_buffer = jpegli::StdioSourceManager::fill_input_buffer;
+  src->pub.skip_input_data = jpegli::skip_input_data;
+  src->pub.resync_to_restart = jpegli_resync_to_restart;
+  src->pub.term_source = jpegli::term_source;
+}
diff --git a/third_party/jpeg-xl/lib/jpegli/source_manager_test.cc b/third_party/jpeg-xl/lib/jpegli/source_manager_test.cc
new file mode 100644
index 0000000000..c8d1fbc053
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/source_manager_test.cc
@@ -0,0 +1,141 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <stdio.h>
+
+#include <cmath>
+#include <vector>
+
+#include "lib/jpegli/decode.h"
+#include "lib/jpegli/test_utils.h"
+#include "lib/jpegli/testing.h"
+#include "lib/jxl/base/file_io.h"
+#include "lib/jxl/base/status.h"
+
+namespace jpegli {
+namespace {
+
+void ReadOutputImage(j_decompress_ptr cinfo, TestImage* output) {
+  jpegli_read_header(cinfo, /*require_image=*/TRUE);
+  jpegli_start_decompress(cinfo);
+  output->ysize = cinfo->output_height;
+  output->xsize = cinfo->output_width;
+  output->components = cinfo->num_components;
+  output->AllocatePixels();
+  size_t stride = cinfo->output_width * cinfo->num_components;
+  while (cinfo->output_scanline < cinfo->output_height) {
+    JSAMPROW scanline = &output->pixels[cinfo->output_scanline * stride];
+    jpegli_read_scanlines(cinfo, &scanline, 1);
+  }
+  jpegli_finish_decompress(cinfo);
+}
+
+struct TestConfig {
+  std::string fn;
+  std::string fn_desc;
+  DecompressParams dparams;
+};
+
+class SourceManagerTestParam : public ::testing::TestWithParam<TestConfig> {};
+
+TEST_P(SourceManagerTestParam, TestStdioSourceManager) {
+  TestConfig config = GetParam();
+  jxl::FileWrapper testfile(GetTestDataPath(config.fn), "rb");
+  FILE* src = nullptr;
+  if (config.dparams.size_factor != 1.0) {
+    std::vector<uint8_t> compressed = ReadTestData(config.fn.c_str());
+    compressed.resize(compressed.size() * config.dparams.size_factor);
+    src = tmpfile();
+    ASSERT_TRUE(src != nullptr);
+    fwrite(compressed.data(), 1, compressed.size(), src);
+    rewind(src);
+    return;
+  } else {
+    src = testfile;
+  }
+  ASSERT_TRUE(src != nullptr);
+  TestImage output0;
+  jpeg_decompress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_decompress(&cinfo);
+    jpegli_stdio_src(&cinfo, src);
+    ReadOutputImage(&cinfo, &output0);
+    return true;
+  };
+  ASSERT_TRUE(try_catch_block());
+  jpegli_destroy_decompress(&cinfo);
+
+  TestImage output1;
+  DecodeWithLibjpeg(CompressParams(), DecompressParams(),
+                    ReadTestData(config.fn.c_str()), &output1);
+  VerifyOutputImage(output1, output0, 1.0f);
+}
+
+TEST_P(SourceManagerTestParam, TestMemSourceManager) {
+  TestConfig config = GetParam();
+  std::vector<uint8_t> compressed = ReadTestData(config.fn.c_str());
+  if (config.dparams.size_factor < 1.0f) {
+    compressed.resize(compressed.size() * config.dparams.size_factor);
+  }
+  TestImage output0;
+  jpeg_decompress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_decompress(&cinfo);
+    jpegli_mem_src(&cinfo, compressed.data(), compressed.size());
+    ReadOutputImage(&cinfo, &output0);
+    return true;
+  };
+  ASSERT_TRUE(try_catch_block());
+  jpegli_destroy_decompress(&cinfo);
+
+  TestImage output1;
+  DecodeWithLibjpeg(CompressParams(), DecompressParams(), compressed, &output1);
+  VerifyOutputImage(output1, output0, 1.0f);
+}
+
+std::vector<TestConfig> GenerateTests() {
+  std::vector<TestConfig> all_tests;
+  {
+    std::vector<std::pair<std::string, std::string>> testfiles({
+        {"jxl/flower/flower.png.im_q85_444.jpg", "Q85YUV444"},
+        {"jxl/flower/flower.png.im_q85_420.jpg", "Q85YUV420"},
+        {"jxl/flower/flower.png.im_q85_420_R13B.jpg", "Q85YUV420R13B"},
+    });
+    for (const auto& it : testfiles) {
+      for (float size_factor : {0.1f, 0.33f, 0.5f, 0.75f}) {
+        TestConfig config;
+        config.fn = it.first;
+        config.fn_desc = it.second;
+        config.dparams.size_factor = size_factor;
+        all_tests.push_back(config);
+      }
+    }
+    return all_tests;
+  }
+}
+
+std::ostream& operator<<(std::ostream& os, const TestConfig& c) {
+  os << c.fn_desc;
+  if (c.dparams.size_factor < 1.0f) {
+    os << "Partial" << static_cast<int>(c.dparams.size_factor * 100) << "p";
+  }
+  return os;
+}
+
+std::string TestDescription(
+    const testing::TestParamInfo<SourceManagerTestParam::ParamType>& info) {
+  std::stringstream name;
+  name << info.param;
+  return name.str();
+}
+
+JPEGLI_INSTANTIATE_TEST_SUITE_P(SourceManagerTest, SourceManagerTestParam,
+                                testing::ValuesIn(GenerateTests()),
+                                TestDescription);
+
+}  // namespace
+}  // namespace jpegli
diff --git a/third_party/jpeg-xl/lib/jpegli/streaming_test.cc b/third_party/jpeg-xl/lib/jpegli/streaming_test.cc
new file mode 100644
index 0000000000..4a8981ee6a
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/streaming_test.cc
@@ -0,0 +1,233 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/decode.h"
+#include "lib/jpegli/encode.h"
+#include "lib/jpegli/test_utils.h"
+#include "lib/jpegli/testing.h"
+
+namespace jpegli {
+namespace {
+
+// A simple suspending source manager with an input buffer.
+struct SourceManager {
+  jpeg_source_mgr pub;
+  std::vector<uint8_t> buffer;
+
+  SourceManager() {
+    pub.next_input_byte = nullptr;
+    pub.bytes_in_buffer = 0;
+    pub.init_source = init_source;
+    pub.fill_input_buffer = fill_input_buffer;
+    pub.skip_input_data = skip_input_data;
+    pub.resync_to_restart = jpegli_resync_to_restart;
+    pub.term_source = term_source;
+  }
+
+  static void init_source(j_decompress_ptr cinfo) {}
+  static boolean fill_input_buffer(j_decompress_ptr cinfo) { return FALSE; }
+  static void skip_input_data(j_decompress_ptr cinfo, long num_bytes) {}
+  static void term_source(j_decompress_ptr cinfo) {}
+};
+
+// A destination manager that empties its output buffer into a SourceManager's
+// input buffer. The buffer size is kept short because empty_output_buffer() is
+// called only when the output buffer is full, and we want to update the decoder
+// input frequently to demostrate that streaming works.
+static constexpr size_t kOutputBufferSize = 1024;
+struct DestinationManager {
+  jpeg_destination_mgr pub;
+  std::vector<uint8_t> buffer;
+  SourceManager* dest;
+
+  DestinationManager(SourceManager* src)
+      : buffer(kOutputBufferSize), dest(src) {
+    pub.next_output_byte = buffer.data();
+    pub.free_in_buffer = buffer.size();
+    pub.init_destination = init_destination;
+    pub.empty_output_buffer = empty_output_buffer;
+    pub.term_destination = term_destination;
+  }
+
+  static void init_destination(j_compress_ptr cinfo) {}
+
+  static boolean empty_output_buffer(j_compress_ptr cinfo) {
+    auto us = reinterpret_cast<DestinationManager*>(cinfo->dest);
+    jpeg_destination_mgr* src = &us->pub;
+    jpeg_source_mgr* dst = &us->dest->pub;
+    std::vector<uint8_t>& src_buf = us->buffer;
+    std::vector<uint8_t>& dst_buf = us->dest->buffer;
+    if (dst->bytes_in_buffer > 0 && dst->bytes_in_buffer < dst_buf.size()) {
+      memmove(dst_buf.data(), dst->next_input_byte, dst->bytes_in_buffer);
+    }
+    size_t src_len = src_buf.size() - src->free_in_buffer;
+    dst_buf.resize(dst->bytes_in_buffer + src_len);
+    memcpy(&dst_buf[dst->bytes_in_buffer], src_buf.data(), src_len);
+    dst->next_input_byte = dst_buf.data();
+    dst->bytes_in_buffer = dst_buf.size();
+    src->next_output_byte = src_buf.data();
+    src->free_in_buffer = src_buf.size();
+    return true;
+  }
+
+  static void term_destination(j_compress_ptr cinfo) {
+    empty_output_buffer(cinfo);
+  }
+};
+
+struct TestConfig {
+  TestImage input;
+  CompressParams jparams;
+};
+
+class StreamingTestParam : public ::testing::TestWithParam<TestConfig> {};
+
+TEST_P(StreamingTestParam, TestStreaming) {
+  jpeg_decompress_struct dinfo = {};
+  jpeg_compress_struct cinfo = {};
+  TestConfig config = GetParam();
+  TestImage& input = config.input;
+  TestImage output;
+  GeneratePixels(&input);
+  const auto try_catch_block = [&]() {
+    ERROR_HANDLER_SETUP(jpegli);
+    dinfo.err = cinfo.err;
+    dinfo.client_data = cinfo.client_data;
+    // Create a pair of compressor and decompressor objects, where the
+    // compressor's output is connected to the decompressor's input.
+    jpegli_create_decompress(&dinfo);
+    jpegli_create_compress(&cinfo);
+    SourceManager src;
+    dinfo.src = reinterpret_cast<jpeg_source_mgr*>(&src);
+    DestinationManager dest(&src);
+    cinfo.dest = reinterpret_cast<jpeg_destination_mgr*>(&dest);
+
+    cinfo.image_width = input.xsize;
+    cinfo.image_height = input.ysize;
+    cinfo.input_components = input.components;
+    cinfo.in_color_space = input.color_space;
+    jpegli_set_defaults(&cinfo);
+    cinfo.comp_info[0].v_samp_factor = config.jparams.v_sampling[0];
+    jpegli_set_progressive_level(&cinfo, 0);
+    cinfo.optimize_coding = FALSE;
+    jpegli_start_compress(&cinfo, TRUE);
+
+    size_t stride = cinfo.image_width * cinfo.input_components;
+    size_t iMCU_height = 8 * cinfo.max_v_samp_factor;
+    std::vector<uint8_t> row_bytes(iMCU_height * stride);
+    size_t yin = 0;
+    size_t yout = 0;
+    while (yin < cinfo.image_height) {
+      // Feed one iMCU row at a time to the compressor.
+      size_t lines_in = std::min(iMCU_height, cinfo.image_height - yin);
+      memcpy(&row_bytes[0], &input.pixels[yin * stride], lines_in * stride);
+      std::vector<JSAMPROW> rows_in(lines_in);
+      for (size_t i = 0; i < lines_in; ++i) {
+        rows_in[i] = &row_bytes[i * stride];
+      }
+      EXPECT_EQ(lines_in,
+                jpegli_write_scanlines(&cinfo, &rows_in[0], lines_in));
+      yin += lines_in;
+      if (yin == cinfo.image_height) {
+        jpegli_finish_compress(&cinfo);
+      }
+
+      // Atfer the first iMCU row, we don't yet expect any output because the
+      // compressor delays processing to have context rows after the iMCU row.
+      if (yin < std::min<size_t>(2 * iMCU_height, cinfo.image_height)) {
+        continue;
+      }
+
+      // After two iMCU rows, the compressor has started emitting compressed
+      // data. We check here that at least the scan header was output, because
+      // we expect that the compressor's output buffer was filled at least once
+      // while emitting the first compressed iMCU row.
+      if (yin == std::min<size_t>(2 * iMCU_height, cinfo.image_height)) {
+        EXPECT_EQ(JPEG_REACHED_SOS,
+                  jpegli_read_header(&dinfo, /*require_image=*/TRUE));
+        output.xsize = dinfo.image_width;
+        output.ysize = dinfo.image_height;
+        output.components = dinfo.num_components;
+        EXPECT_EQ(output.xsize, input.xsize);
+        EXPECT_EQ(output.ysize, input.ysize);
+        EXPECT_EQ(output.components, input.components);
+        EXPECT_TRUE(jpegli_start_decompress(&dinfo));
+        output.pixels.resize(output.ysize * stride);
+        if (yin < cinfo.image_height) {
+          continue;
+        }
+      }
+
+      // After six iMCU rows, the compressor has emitted five iMCU rows of
+      // compressed data, of which we expect four full iMCU row of compressed
+      // data to be in the decoder's input buffer, but since the decoder also
+      // needs context rows for upsampling and smoothing, we don't expect any
+      // output to be ready yet.
+      if (yin < 7 * iMCU_height && yin < cinfo.image_height) {
+        continue;
+      }
+
+      // After five iMCU rows, we expect the decoder to have rendered the output
+      // with four iMCU rows of delay.
+      // TODO(szabadka) Reduce the processing delay in the decoder if possible.
+      size_t lines_out =
+          (yin == cinfo.image_height ? cinfo.image_height - yout : iMCU_height);
+      std::vector<JSAMPROW> rows_out(lines_out);
+      for (size_t i = 0; i < lines_out; ++i) {
+        rows_out[i] =
+            reinterpret_cast<JSAMPLE*>(&output.pixels[(yout + i) * stride]);
+      }
+      EXPECT_EQ(lines_out,
+                jpegli_read_scanlines(&dinfo, &rows_out[0], lines_out));
+      VerifyOutputImage(input, output, yout, lines_out, 3.8f);
+      yout += lines_out;
+
+      if (yout == cinfo.image_height) {
+        EXPECT_TRUE(jpegli_finish_decompress(&dinfo));
+      }
+    }
+    return true;
+  };
+  EXPECT_TRUE(try_catch_block());
+  jpegli_destroy_decompress(&dinfo);
+  jpegli_destroy_compress(&cinfo);
+}
+
+std::vector<TestConfig> GenerateTests() {
+  std::vector<TestConfig> all_tests;
+  const size_t xsize0 = 1920;
+  const size_t ysize0 = 1080;
+  for (int dysize : {0, 1, 8, 9}) {
+    for (int v_sampling : {1, 2}) {
+      TestConfig config;
+      config.input.xsize = xsize0;
+      config.input.ysize = ysize0 + dysize;
+      config.jparams.h_sampling = {1, 1, 1};
+      config.jparams.v_sampling = {v_sampling, 1, 1};
+      all_tests.push_back(config);
+    }
+  }
+  return all_tests;
+}
+
+std::ostream& operator<<(std::ostream& os, const TestConfig& c) {
+  os << c.input;
+  os << c.jparams;
+  return os;
+}
+
+std::string TestDescription(
+    const testing::TestParamInfo<StreamingTestParam::ParamType>& info) {
+  std::stringstream name;
+  name << info.param;
+  return name.str();
+}
+
+JPEGLI_INSTANTIATE_TEST_SUITE_P(StreamingTest, StreamingTestParam,
+                                testing::ValuesIn(GenerateTests()),
+                                TestDescription);
+
+}  // namespace
+}  // namespace jpegli
diff --git a/third_party/jpeg-xl/lib/jpegli/test_utils.cc b/third_party/jpeg-xl/lib/jpegli/test_utils.cc
new file mode 100644
index 0000000000..1ae5483d4a
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/test_utils.cc
@@ -0,0 +1,1240 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/test_utils.h"
+
+#include <cmath>
+
+#include "lib/jpegli/decode.h"
+#include "lib/jpegli/encode.h"
+#include "lib/jxl/base/byte_order.h"
+#include "lib/jxl/base/file_io.h"
+#include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/sanitizers.h"
+
+#if !defined(TEST_DATA_PATH)
+#include "tools/cpp/runfiles/runfiles.h"
+#endif
+
+namespace jpegli {
+
+#if defined(TEST_DATA_PATH)
+std::string GetTestDataPath(const std::string& filename) {
+  return std::string(TEST_DATA_PATH "/") + filename;
+}
+#else
+using bazel::tools::cpp::runfiles::Runfiles;
+const std::unique_ptr<Runfiles> kRunfiles(Runfiles::Create(""));
+std::string GetTestDataPath(const std::string& filename) {
+  std::string root(JPEGXL_ROOT_PACKAGE "/testdata/");
+  return kRunfiles->Rlocation(root + filename);
+}
+#endif
+
+std::vector<uint8_t> ReadTestData(const std::string& filename) {
+  std::string full_path = GetTestDataPath(filename);
+  std::vector<uint8_t> data;
+  fprintf(stderr, "ReadTestData %s\n", full_path.c_str());
+  JXL_CHECK(jxl::ReadFile(full_path, &data));
+  printf("Test data %s is %d bytes long.\n", filename.c_str(),
+         static_cast<int>(data.size()));
+  return data;
+}
+
+void CustomQuantTable::Generate() {
+  basic_table.resize(DCTSIZE2);
+  quantval.resize(DCTSIZE2);
+  switch (table_type) {
+    case 0: {
+      for (int k = 0; k < DCTSIZE2; ++k) {
+        basic_table[k] = k + 1;
+      }
+      break;
+    }
+    default:
+      for (int k = 0; k < DCTSIZE2; ++k) {
+        basic_table[k] = table_type;
+      }
+  }
+  for (int k = 0; k < DCTSIZE2; ++k) {
+    quantval[k] = (basic_table[k] * scale_factor + 50U) / 100U;
+    quantval[k] = std::max(quantval[k], 1U);
+    quantval[k] = std::min(quantval[k], 65535U);
+    if (!add_raw) {
+      quantval[k] = std::min(quantval[k], force_baseline ? 255U : 32767U);
+    }
+  }
+}
+
+bool PNMParser::ParseHeader(const uint8_t** pos, size_t* xsize, size_t* ysize,
+                            size_t* num_channels, size_t* bitdepth) {
+  if (pos_[0] != 'P' || (pos_[1] != '5' && pos_[1] != '6')) {
+    fprintf(stderr, "Invalid PNM header.");
+    return false;
+  }
+  *num_channels = (pos_[1] == '5' ? 1 : 3);
+  pos_ += 2;
+
+  size_t maxval;
+  if (!SkipWhitespace() || !ParseUnsigned(xsize) || !SkipWhitespace() ||
+      !ParseUnsigned(ysize) || !SkipWhitespace() || !ParseUnsigned(&maxval) ||
+      !SkipWhitespace()) {
+    return false;
+  }
+  if (maxval == 0 || maxval >= 65536) {
+    fprintf(stderr, "Invalid maxval value.\n");
+    return false;
+  }
+  bool found_bitdepth = false;
+  for (int bits = 1; bits <= 16; ++bits) {
+    if (maxval == (1u << bits) - 1) {
+      *bitdepth = bits;
+      found_bitdepth = true;
+      break;
+    }
+  }
+  if (!found_bitdepth) {
+    fprintf(stderr, "Invalid maxval value.\n");
+    return false;
+  }
+
+  *pos = pos_;
+  return true;
+}
+
+bool PNMParser::ParseUnsigned(size_t* number) {
+  if (pos_ == end_ || *pos_ < '0' || *pos_ > '9') {
+    fprintf(stderr, "Expected unsigned number.\n");
+    return false;
+  }
+  *number = 0;
+  while (pos_ < end_ && *pos_ >= '0' && *pos_ <= '9') {
+    *number *= 10;
+    *number += *pos_ - '0';
+    ++pos_;
+  }
+
+  return true;
+}
+
+bool PNMParser::SkipWhitespace() {
+  if (pos_ == end_ || !IsWhitespace(*pos_)) {
+    fprintf(stderr, "Expected whitespace.\n");
+    return false;
+  }
+  while (pos_ < end_ && IsWhitespace(*pos_)) {
+    ++pos_;
+  }
+  return true;
+}
+
+bool ReadPNM(const std::vector<uint8_t>& data, size_t* xsize, size_t* ysize,
+             size_t* num_channels, size_t* bitdepth,
+             std::vector<uint8_t>* pixels) {
+  if (data.size() < 2) {
+    fprintf(stderr, "PNM file too small.\n");
+    return false;
+  }
+  PNMParser parser(data.data(), data.size());
+  const uint8_t* pos = nullptr;
+  if (!parser.ParseHeader(&pos, xsize, ysize, num_channels, bitdepth)) {
+    return false;
+  }
+  pixels->resize(data.data() + data.size() - pos);
+  memcpy(&(*pixels)[0], pos, pixels->size());
+  return true;
+}
+
+std::string ColorSpaceName(J_COLOR_SPACE colorspace) {
+  switch (colorspace) {
+    case JCS_UNKNOWN:
+      return "UNKNOWN";
+    case JCS_GRAYSCALE:
+      return "GRAYSCALE";
+    case JCS_RGB:
+      return "RGB";
+    case JCS_YCbCr:
+      return "YCbCr";
+    case JCS_CMYK:
+      return "CMYK";
+    case JCS_YCCK:
+      return "YCCK";
+    default:
+      return "";
+  }
+}
+
+std::string IOMethodName(JpegliDataType data_type,
+                         JpegliEndianness endianness) {
+  std::string retval;
+  if (data_type == JPEGLI_TYPE_UINT8) {
+    return "";
+  } else if (data_type == JPEGLI_TYPE_UINT16) {
+    retval = "UINT16";
+  } else if (data_type == JPEGLI_TYPE_FLOAT) {
+    retval = "FLOAT";
+  }
+  if (endianness == JPEGLI_LITTLE_ENDIAN) {
+    retval += "LE";
+  } else if (endianness == JPEGLI_BIG_ENDIAN) {
+    retval += "BE";
+  }
+  return retval;
+}
+
+std::string SamplingId(const CompressParams& jparams) {
+  std::stringstream os;
+  JXL_CHECK(jparams.h_sampling.size() == jparams.v_sampling.size());
+  if (!jparams.h_sampling.empty()) {
+    size_t len = jparams.h_sampling.size();
+    while (len > 1 && jparams.h_sampling[len - 1] == 1 &&
+           jparams.v_sampling[len - 1] == 1) {
+      --len;
+    }
+    os << "SAMP";
+    for (size_t i = 0; i < len; ++i) {
+      if (i > 0) os << "_";
+      os << jparams.h_sampling[i] << "x" << jparams.v_sampling[i];
+    }
+  }
+  return os.str();
+}
+
+std::ostream& operator<<(std::ostream& os, const TestImage& input) {
+  os << input.xsize << "x" << input.ysize;
+  os << IOMethodName(input.data_type, input.endianness);
+  if (input.color_space != JCS_RGB) {
+    os << "InputColor" << ColorSpaceName(input.color_space);
+  }
+  if (input.color_space == JCS_UNKNOWN) {
+    os << input.components;
+  }
+  return os;
+}
+
+std::ostream& operator<<(std::ostream& os, const CompressParams& jparams) {
+  os << "Q" << jparams.quality;
+  os << SamplingId(jparams);
+  if (jparams.set_jpeg_colorspace) {
+    os << "JpegColor" << ColorSpaceName(jparams.jpeg_color_space);
+  }
+  if (!jparams.comp_ids.empty()) {
+    os << "CID";
+    for (size_t i = 0; i < jparams.comp_ids.size(); ++i) {
+      os << jparams.comp_ids[i];
+    }
+  }
+  if (!jparams.quant_indexes.empty()) {
+    os << "QIDX";
+    for (size_t i = 0; i < jparams.quant_indexes.size(); ++i) {
+      os << jparams.quant_indexes[i];
+    }
+    for (const auto& table : jparams.quant_tables) {
+      os << "TABLE" << table.slot_idx << "T" << table.table_type << "F"
+         << table.scale_factor
+         << (table.add_raw          ? "R"
+             : table.force_baseline ? "B"
+                                    : "");
+    }
+  }
+  if (jparams.progressive_mode >= 0) {
+    os << "P" << jparams.progressive_mode;
+  } else if (jparams.simple_progression) {
+    os << "Psimple";
+  }
+  if (jparams.optimize_coding == 1) {
+    JXL_CHECK(jparams.progressive_mode <= 0 && !jparams.simple_progression);
+    os << "OptimizedCode";
+  } else if (jparams.optimize_coding == 0) {
+    JXL_CHECK(jparams.progressive_mode <= 0 && !jparams.simple_progression);
+    os << "FixedCode";
+    if (jparams.use_flat_dc_luma_code) {
+      os << "FlatDCLuma";
+    } else if (jparams.omit_standard_tables) {
+      os << "OmitDHT";
+    }
+  }
+  if (!jparams.use_adaptive_quantization) {
+    os << "NoAQ";
+  }
+  if (jparams.restart_interval > 0) {
+    os << "R" << jparams.restart_interval;
+  }
+  if (jparams.restart_in_rows > 0) {
+    os << "RR" << jparams.restart_in_rows;
+  }
+  if (jparams.xyb_mode) {
+    os << "XYB";
+  } else if (jparams.libjpeg_mode) {
+    os << "Libjpeg";
+  }
+  if (jparams.override_JFIF >= 0) {
+    os << (jparams.override_JFIF ? "AddJFIF" : "NoJFIF");
+  }
+  if (jparams.override_Adobe >= 0) {
+    os << (jparams.override_Adobe ? "AddAdobe" : "NoAdobe");
+  }
+  if (jparams.add_marker) {
+    os << "AddMarker";
+  }
+  if (!jparams.icc.empty()) {
+    os << "ICCSize" << jparams.icc.size();
+  }
+  if (jparams.smoothing_factor != 0) {
+    os << "SF" << jparams.smoothing_factor;
+  }
+  return os;
+}
+
+void SetNumChannels(J_COLOR_SPACE colorspace, size_t* channels) {
+  if (colorspace == JCS_GRAYSCALE) {
+    *channels = 1;
+  } else if (colorspace == JCS_RGB || colorspace == JCS_YCbCr) {
+    *channels = 3;
+  } else if (colorspace == JCS_CMYK || colorspace == JCS_YCCK) {
+    *channels = 4;
+  } else if (colorspace == JCS_UNKNOWN) {
+    JXL_CHECK(*channels <= 4);
+  } else {
+    JXL_ABORT();
+  }
+}
+
+void RGBToYCbCr(float r, float g, float b, float* y, float* cb, float* cr) {
+  *y = 0.299f * r + 0.587f * g + 0.114f * b;
+  *cb = -0.168736f * r - 0.331264f * g + 0.5f * b + 0.5f;
+  *cr = 0.5f * r - 0.418688f * g - 0.081312f * b + 0.5f;
+}
+
+void ConvertPixel(const uint8_t* input_rgb, uint8_t* out,
+                  J_COLOR_SPACE colorspace, size_t num_channels,
+                  JpegliDataType data_type = JPEGLI_TYPE_UINT8,
+                  bool swap_endianness = JPEGLI_NATIVE_ENDIAN) {
+  const float kMul = 255.0f;
+  float r = input_rgb[0] / kMul;
+  float g = input_rgb[1] / kMul;
+  float b = input_rgb[2] / kMul;
+  uint8_t out8[MAX_COMPONENTS];
+  if (colorspace == JCS_GRAYSCALE) {
+    const float Y = 0.299f * r + 0.587f * g + 0.114f * b;
+    out8[0] = static_cast<uint8_t>(std::round(Y * kMul));
+  } else if (colorspace == JCS_RGB || colorspace == JCS_UNKNOWN) {
+    for (size_t c = 0; c < num_channels; ++c) {
+      out8[c] = input_rgb[std::min<size_t>(2, c)];
+    }
+  } else if (colorspace == JCS_YCbCr) {
+    float Y, Cb, Cr;
+    RGBToYCbCr(r, g, b, &Y, &Cb, &Cr);
+    out8[0] = static_cast<uint8_t>(std::round(Y * kMul));
+    out8[1] = static_cast<uint8_t>(std::round(Cb * kMul));
+    out8[2] = static_cast<uint8_t>(std::round(Cr * kMul));
+  } else if (colorspace == JCS_CMYK || colorspace == JCS_YCCK) {
+    float K = 1.0f - std::max(r, std::max(g, b));
+    float scaleK = 1.0f / (1.0f - K);
+    r *= scaleK;
+    g *= scaleK;
+    b *= scaleK;
+    if (colorspace == JCS_CMYK) {
+      out8[0] = static_cast<uint8_t>(std::round((1.0f - r) * kMul));
+      out8[1] = static_cast<uint8_t>(std::round((1.0f - g) * kMul));
+      out8[2] = static_cast<uint8_t>(std::round((1.0f - b) * kMul));
+    } else if (colorspace == JCS_YCCK) {
+      float Y, Cb, Cr;
+      RGBToYCbCr(r, g, b, &Y, &Cb, &Cr);
+      out8[0] = static_cast<uint8_t>(std::round(Y * kMul));
+      out8[1] = static_cast<uint8_t>(std::round(Cb * kMul));
+      out8[2] = static_cast<uint8_t>(std::round(Cr * kMul));
+    }
+    out8[3] = static_cast<uint8_t>(std::round(K * kMul));
+  } else {
+    JXL_ABORT("Colorspace %d not supported", colorspace);
+  }
+  if (data_type == JPEGLI_TYPE_UINT8) {
+    memcpy(out, out8, num_channels);
+  } else if (data_type == JPEGLI_TYPE_UINT16) {
+    for (size_t c = 0; c < num_channels; ++c) {
+      uint16_t val = (out8[c] << 8) + out8[c];
+      val |= 0x40;  // Make little-endian and big-endian asymmetric
+      if (swap_endianness) {
+        val = JXL_BSWAP16(val);
+      }
+      memcpy(&out[sizeof(val) * c], &val, sizeof(val));
+    }
+  } else if (data_type == JPEGLI_TYPE_FLOAT) {
+    for (size_t c = 0; c < num_channels; ++c) {
+      float val = out8[c] / 255.0f;
+      if (swap_endianness) {
+        val = BSwapFloat(val);
+      }
+      memcpy(&out[sizeof(val) * c], &val, sizeof(val));
+    }
+  }
+}
+
+void ConvertToGrayscale(TestImage* img) {
+  if (img->color_space == JCS_GRAYSCALE) return;
+  JXL_CHECK(img->data_type == JPEGLI_TYPE_UINT8);
+  for (size_t i = 0; i < img->pixels.size(); i += 3) {
+    if (img->color_space == JCS_RGB) {
+      ConvertPixel(&img->pixels[i], &img->pixels[i / 3], JCS_GRAYSCALE, 1);
+    } else if (img->color_space == JCS_YCbCr) {
+      img->pixels[i / 3] = img->pixels[i];
+    }
+  }
+  img->pixels.resize(img->pixels.size() / 3);
+  img->color_space = JCS_GRAYSCALE;
+  img->components = 1;
+}
+
+void GeneratePixels(TestImage* img) {
+  const std::vector<uint8_t> imgdata = ReadTestData("jxl/flower/flower.pnm");
+  size_t xsize, ysize, channels, bitdepth;
+  std::vector<uint8_t> pixels;
+  JXL_CHECK(ReadPNM(imgdata, &xsize, &ysize, &channels, &bitdepth, &pixels));
+  if (img->xsize == 0) img->xsize = xsize;
+  if (img->ysize == 0) img->ysize = ysize;
+  JXL_CHECK(img->xsize <= xsize);
+  JXL_CHECK(img->ysize <= ysize);
+  JXL_CHECK(3 == channels);
+  JXL_CHECK(8 == bitdepth);
+  size_t in_bytes_per_pixel = channels;
+  size_t in_stride = xsize * in_bytes_per_pixel;
+  size_t x0 = (xsize - img->xsize) / 2;
+  size_t y0 = (ysize - img->ysize) / 2;
+  SetNumChannels(img->color_space, &img->components);
+  size_t out_bytes_per_pixel =
+      jpegli_bytes_per_sample(img->data_type) * img->components;
+  size_t out_stride = img->xsize * out_bytes_per_pixel;
+  bool swap_endianness =
+      (img->endianness == JPEGLI_LITTLE_ENDIAN && !IsLittleEndian()) ||
+      (img->endianness == JPEGLI_BIG_ENDIAN && IsLittleEndian());
+  img->pixels.resize(img->ysize * out_stride);
+  for (size_t iy = 0; iy < img->ysize; ++iy) {
+    size_t y = y0 + iy;
+    for (size_t ix = 0; ix < img->xsize; ++ix) {
+      size_t x = x0 + ix;
+      size_t idx_in = y * in_stride + x * in_bytes_per_pixel;
+      size_t idx_out = iy * out_stride + ix * out_bytes_per_pixel;
+      ConvertPixel(&pixels[idx_in], &img->pixels[idx_out], img->color_space,
+                   img->components, img->data_type, swap_endianness);
+    }
+  }
+}
+
+void GenerateRawData(const CompressParams& jparams, TestImage* img) {
+  for (size_t c = 0; c < img->components; ++c) {
+    size_t xsize = jparams.comp_width(*img, c);
+    size_t ysize = jparams.comp_height(*img, c);
+    size_t factor_y = jparams.max_v_sample() / jparams.v_samp(c);
+    size_t factor_x = jparams.max_h_sample() / jparams.h_samp(c);
+    size_t factor = factor_x * factor_y;
+    std::vector<uint8_t> plane(ysize * xsize);
+    size_t bytes_per_pixel = img->components;
+    for (size_t y = 0; y < ysize; ++y) {
+      for (size_t x = 0; x < xsize; ++x) {
+        int result = 0;
+        for (size_t iy = 0; iy < factor_y; ++iy) {
+          size_t yy = std::min(y * factor_y + iy, img->ysize - 1);
+          for (size_t ix = 0; ix < factor_x; ++ix) {
+            size_t xx = std::min(x * factor_x + ix, img->xsize - 1);
+            size_t pixel_ix = (yy * img->xsize + xx) * bytes_per_pixel + c;
+            result += img->pixels[pixel_ix];
+          }
+        }
+        result = static_cast<uint8_t>((result + factor / 2) / factor);
+        plane[y * xsize + x] = result;
+      }
+    }
+    img->raw_data.emplace_back(std::move(plane));
+  }
+}
+
+void GenerateCoeffs(const CompressParams& jparams, TestImage* img) {
+  for (size_t c = 0; c < img->components; ++c) {
+    int xsize_blocks = jparams.comp_width(*img, c) / DCTSIZE;
+    int ysize_blocks = jparams.comp_height(*img, c) / DCTSIZE;
+    std::vector<JCOEF> plane(ysize_blocks * xsize_blocks * DCTSIZE2);
+    for (int by = 0; by < ysize_blocks; ++by) {
+      for (int bx = 0; bx < xsize_blocks; ++bx) {
+        JCOEF* block = &plane[(by * xsize_blocks + bx) * DCTSIZE2];
+        for (int k = 0; k < DCTSIZE2; ++k) {
+          block[k] = (bx - by) / (k + 1);
+        }
+      }
+    }
+    img->coeffs.emplace_back(std::move(plane));
+  }
+}
+
+void EncodeWithJpegli(const TestImage& input, const CompressParams& jparams,
+                      j_compress_ptr cinfo) {
+  cinfo->image_width = input.xsize;
+  cinfo->image_height = input.ysize;
+  cinfo->input_components = input.components;
+  if (jparams.xyb_mode) {
+    jpegli_set_xyb_mode(cinfo);
+  }
+  if (jparams.libjpeg_mode) {
+    jpegli_enable_adaptive_quantization(cinfo, FALSE);
+    jpegli_use_standard_quant_tables(cinfo);
+    jpegli_set_progressive_level(cinfo, 0);
+  }
+  jpegli_set_defaults(cinfo);
+  cinfo->in_color_space = input.color_space;
+  jpegli_default_colorspace(cinfo);
+  if (jparams.override_JFIF >= 0) {
+    cinfo->write_JFIF_header = jparams.override_JFIF;
+  }
+  if (jparams.override_Adobe >= 0) {
+    cinfo->write_Adobe_marker = jparams.override_Adobe;
+  }
+  if (jparams.set_jpeg_colorspace) {
+    jpegli_set_colorspace(cinfo, jparams.jpeg_color_space);
+  }
+  if (!jparams.comp_ids.empty()) {
+    for (int c = 0; c < cinfo->num_components; ++c) {
+      cinfo->comp_info[c].component_id = jparams.comp_ids[c];
+    }
+  }
+  if (!jparams.h_sampling.empty()) {
+    for (int c = 0; c < cinfo->num_components; ++c) {
+      cinfo->comp_info[c].h_samp_factor = jparams.h_sampling[c];
+      cinfo->comp_info[c].v_samp_factor = jparams.v_sampling[c];
+    }
+  }
+  jpegli_set_quality(cinfo, jparams.quality, TRUE);
+  if (!jparams.quant_indexes.empty()) {
+    for (int c = 0; c < cinfo->num_components; ++c) {
+      cinfo->comp_info[c].quant_tbl_no = jparams.quant_indexes[c];
+    }
+    for (const auto& table : jparams.quant_tables) {
+      if (table.add_raw) {
+        cinfo->quant_tbl_ptrs[table.slot_idx] =
+            jpegli_alloc_quant_table((j_common_ptr)cinfo);
+        for (int k = 0; k < DCTSIZE2; ++k) {
+          cinfo->quant_tbl_ptrs[table.slot_idx]->quantval[k] =
+              table.quantval[k];
+        }
+        cinfo->quant_tbl_ptrs[table.slot_idx]->sent_table = FALSE;
+      } else {
+        jpegli_add_quant_table(cinfo, table.slot_idx, &table.basic_table[0],
+                               table.scale_factor, table.force_baseline);
+      }
+    }
+  }
+  if (jparams.simple_progression) {
+    jpegli_simple_progression(cinfo);
+    JXL_CHECK(jparams.progressive_mode == -1);
+  }
+  if (jparams.progressive_mode > 2) {
+    const ScanScript& script = kTestScript[jparams.progressive_mode - 3];
+    cinfo->scan_info = script.scans;
+    cinfo->num_scans = script.num_scans;
+  } else if (jparams.progressive_mode >= 0) {
+    jpegli_set_progressive_level(cinfo, jparams.progressive_mode);
+  }
+  jpegli_set_input_format(cinfo, input.data_type, input.endianness);
+  jpegli_enable_adaptive_quantization(cinfo, jparams.use_adaptive_quantization);
+  cinfo->restart_interval = jparams.restart_interval;
+  cinfo->restart_in_rows = jparams.restart_in_rows;
+  cinfo->smoothing_factor = jparams.smoothing_factor;
+  if (jparams.optimize_coding == 1) {
+    cinfo->optimize_coding = TRUE;
+  } else if (jparams.optimize_coding == 0) {
+    cinfo->optimize_coding = FALSE;
+  }
+  cinfo->raw_data_in = !input.raw_data.empty();
+  if (jparams.optimize_coding == 0 && jparams.use_flat_dc_luma_code) {
+    JHUFF_TBL* tbl = cinfo->dc_huff_tbl_ptrs[0];
+    memset(tbl, 0, sizeof(*tbl));
+    tbl->bits[4] = 15;
+    for (int i = 0; i < 15; ++i) tbl->huffval[i] = i;
+  }
+  if (input.coeffs.empty()) {
+    bool write_all_tables = TRUE;
+    if (jparams.optimize_coding == 0 && !jparams.use_flat_dc_luma_code &&
+        jparams.omit_standard_tables) {
+      write_all_tables = FALSE;
+      cinfo->dc_huff_tbl_ptrs[0]->sent_table = TRUE;
+      cinfo->dc_huff_tbl_ptrs[1]->sent_table = TRUE;
+      cinfo->ac_huff_tbl_ptrs[0]->sent_table = TRUE;
+      cinfo->ac_huff_tbl_ptrs[1]->sent_table = TRUE;
+    }
+    jpegli_start_compress(cinfo, write_all_tables);
+    if (jparams.add_marker) {
+      jpegli_write_marker(cinfo, kSpecialMarker0, kMarkerData,
+                          sizeof(kMarkerData));
+      jpegli_write_m_header(cinfo, kSpecialMarker1, sizeof(kMarkerData));
+      for (size_t p = 0; p < sizeof(kMarkerData); ++p) {
+        jpegli_write_m_byte(cinfo, kMarkerData[p]);
+      }
+      for (size_t i = 0; i < kMarkerSequenceLen; ++i) {
+        jpegli_write_marker(cinfo, kMarkerSequence[i], kMarkerData,
+                            ((i + 2) % sizeof(kMarkerData)));
+      }
+    }
+    if (!jparams.icc.empty()) {
+      jpegli_write_icc_profile(cinfo, jparams.icc.data(), jparams.icc.size());
+    }
+  }
+  if (cinfo->raw_data_in) {
+    // Need to copy because jpeg API requires non-const pointers.
+    std::vector<std::vector<uint8_t>> raw_data = input.raw_data;
+    size_t max_lines = jparams.max_v_sample() * DCTSIZE;
+    std::vector<std::vector<JSAMPROW>> rowdata(cinfo->num_components);
+    std::vector<JSAMPARRAY> data(cinfo->num_components);
+    for (int c = 0; c < cinfo->num_components; ++c) {
+      rowdata[c].resize(jparams.v_samp(c) * DCTSIZE);
+      data[c] = &rowdata[c][0];
+    }
+    while (cinfo->next_scanline < cinfo->image_height) {
+      for (int c = 0; c < cinfo->num_components; ++c) {
+        size_t cwidth = cinfo->comp_info[c].width_in_blocks * DCTSIZE;
+        size_t cheight = cinfo->comp_info[c].height_in_blocks * DCTSIZE;
+        size_t num_lines = jparams.v_samp(c) * DCTSIZE;
+        size_t y0 = (cinfo->next_scanline / max_lines) * num_lines;
+        for (size_t i = 0; i < num_lines; ++i) {
+          rowdata[c][i] =
+              (y0 + i < cheight ? &raw_data[c][(y0 + i) * cwidth] : nullptr);
+        }
+      }
+      size_t num_lines = jpegli_write_raw_data(cinfo, &data[0], max_lines);
+      JXL_CHECK(num_lines == max_lines);
+    }
+  } else if (!input.coeffs.empty()) {
+    j_common_ptr comptr = reinterpret_cast<j_common_ptr>(cinfo);
+    jvirt_barray_ptr* coef_arrays = reinterpret_cast<jvirt_barray_ptr*>((
+        *cinfo->mem->alloc_small)(
+        comptr, JPOOL_IMAGE, cinfo->num_components * sizeof(jvirt_barray_ptr)));
+    for (int c = 0; c < cinfo->num_components; ++c) {
+      size_t xsize_blocks = jparams.comp_width(input, c) / DCTSIZE;
+      size_t ysize_blocks = jparams.comp_height(input, c) / DCTSIZE;
+      coef_arrays[c] = (*cinfo->mem->request_virt_barray)(
+          comptr, JPOOL_IMAGE, FALSE, xsize_blocks, ysize_blocks,
+          cinfo->comp_info[c].v_samp_factor);
+    }
+    jpegli_write_coefficients(cinfo, coef_arrays);
+    if (jparams.add_marker) {
+      jpegli_write_marker(cinfo, kSpecialMarker0, kMarkerData,
+                          sizeof(kMarkerData));
+      jpegli_write_m_header(cinfo, kSpecialMarker1, sizeof(kMarkerData));
+      for (size_t p = 0; p < sizeof(kMarkerData); ++p) {
+        jpegli_write_m_byte(cinfo, kMarkerData[p]);
+      }
+    }
+    for (int c = 0; c < cinfo->num_components; ++c) {
+      jpeg_component_info* comp = &cinfo->comp_info[c];
+      for (size_t by = 0; by < comp->height_in_blocks; ++by) {
+        JBLOCKARRAY ba = (*cinfo->mem->access_virt_barray)(
+            comptr, coef_arrays[c], by, 1, true);
+        size_t stride = comp->width_in_blocks * sizeof(JBLOCK);
+        size_t offset = by * comp->width_in_blocks * DCTSIZE2;
+        memcpy(ba[0], &input.coeffs[c][offset], stride);
+      }
+    }
+  } else {
+    size_t stride = cinfo->image_width * cinfo->input_components *
+                    jpegli_bytes_per_sample(input.data_type);
+    std::vector<uint8_t> row_bytes(stride);
+    for (size_t y = 0; y < cinfo->image_height; ++y) {
+      memcpy(&row_bytes[0], &input.pixels[y * stride], stride);
+      JSAMPROW row[] = {row_bytes.data()};
+      jpegli_write_scanlines(cinfo, row, 1);
+    }
+  }
+  jpegli_finish_compress(cinfo);
+}
+
+bool EncodeWithJpegli(const TestImage& input, const CompressParams& jparams,
+                      std::vector<uint8_t>* compressed) {
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    EncodeWithJpegli(input, jparams, &cinfo);
+    return true;
+  };
+  bool success = try_catch_block();
+  jpegli_destroy_compress(&cinfo);
+  if (success) {
+    compressed->resize(buffer_size);
+    std::copy_n(buffer, buffer_size, compressed->data());
+  }
+  if (buffer) std::free(buffer);
+  return success;
+}
+
+void SetScanDecompressParams(const DecompressParams& dparams,
+                             j_decompress_ptr cinfo, int scan_number,
+                             bool is_jpegli) {
+  const ScanDecompressParams* sparams = nullptr;
+  for (const auto& sp : dparams.scan_params) {
+    if (scan_number <= sp.max_scan_number) {
+      sparams = &sp;
+      break;
+    }
+  }
+  if (sparams == nullptr) {
+    return;
+  }
+  if (dparams.quantize_colors) {
+    cinfo->dither_mode = sparams->dither_mode;
+    if (sparams->color_quant_mode == CQUANT_1PASS) {
+      cinfo->two_pass_quantize = FALSE;
+      cinfo->colormap = nullptr;
+    } else if (sparams->color_quant_mode == CQUANT_2PASS) {
+      JXL_CHECK(cinfo->out_color_space = JCS_RGB);
+      cinfo->two_pass_quantize = TRUE;
+      cinfo->colormap = nullptr;
+    } else if (sparams->color_quant_mode == CQUANT_EXTERNAL) {
+      JXL_CHECK(cinfo->out_color_space = JCS_RGB);
+      cinfo->two_pass_quantize = FALSE;
+      bool have_colormap = cinfo->colormap != nullptr;
+      cinfo->actual_number_of_colors = kTestColorMapNumColors;
+      cinfo->colormap = (*cinfo->mem->alloc_sarray)(
+          reinterpret_cast<j_common_ptr>(cinfo), JPOOL_IMAGE,
+          cinfo->actual_number_of_colors, 3);
+      jxl::msan::UnpoisonMemory(cinfo->colormap, 3 * sizeof(JSAMPROW));
+      for (int i = 0; i < kTestColorMapNumColors; ++i) {
+        cinfo->colormap[0][i] = (kTestColorMap[i] >> 16) & 0xff;
+        cinfo->colormap[1][i] = (kTestColorMap[i] >> 8) & 0xff;
+        cinfo->colormap[2][i] = (kTestColorMap[i] >> 0) & 0xff;
+      }
+      if (have_colormap) {
+        if (is_jpegli) {
+          jpegli_new_colormap(cinfo);
+        } else {
+          jpeg_new_colormap(cinfo);
+        }
+      }
+    } else if (sparams->color_quant_mode == CQUANT_REUSE) {
+      JXL_CHECK(cinfo->out_color_space = JCS_RGB);
+      JXL_CHECK(cinfo->colormap);
+    }
+  }
+}
+
+void SetDecompressParams(const DecompressParams& dparams,
+                         j_decompress_ptr cinfo, bool is_jpegli) {
+  cinfo->do_block_smoothing = dparams.do_block_smoothing;
+  cinfo->do_fancy_upsampling = dparams.do_fancy_upsampling;
+  if (dparams.output_mode == RAW_DATA) {
+    cinfo->raw_data_out = TRUE;
+  }
+  if (dparams.set_out_color_space) {
+    cinfo->out_color_space = dparams.out_color_space;
+    if (dparams.out_color_space == JCS_UNKNOWN) {
+      cinfo->jpeg_color_space = JCS_UNKNOWN;
+    }
+  }
+  cinfo->scale_num = dparams.scale_num;
+  cinfo->scale_denom = dparams.scale_denom;
+  cinfo->quantize_colors = dparams.quantize_colors;
+  cinfo->desired_number_of_colors = dparams.desired_number_of_colors;
+  if (!dparams.scan_params.empty()) {
+    if (cinfo->buffered_image) {
+      for (const auto& sparams : dparams.scan_params) {
+        if (sparams.color_quant_mode == CQUANT_1PASS) {
+          cinfo->enable_1pass_quant = TRUE;
+        } else if (sparams.color_quant_mode == CQUANT_2PASS) {
+          cinfo->enable_2pass_quant = TRUE;
+        } else if (sparams.color_quant_mode == CQUANT_EXTERNAL) {
+          cinfo->enable_external_quant = TRUE;
+        }
+      }
+      SetScanDecompressParams(dparams, cinfo, 1, is_jpegli);
+    } else {
+      SetScanDecompressParams(dparams, cinfo, kLastScan, is_jpegli);
+    }
+  }
+  if (is_jpegli) {
+    jpegli_set_output_format(cinfo, dparams.data_type, dparams.endianness);
+  }
+}
+
+void CheckMarkerPresent(j_decompress_ptr cinfo, uint8_t marker_type) {
+  bool marker_found = false;
+  for (jpeg_saved_marker_ptr marker = cinfo->marker_list; marker != nullptr;
+       marker = marker->next) {
+    jxl::msan::UnpoisonMemory(marker, sizeof(*marker));
+    jxl::msan::UnpoisonMemory(marker->data, marker->data_length);
+    if (marker->marker == marker_type &&
+        marker->data_length == sizeof(kMarkerData) &&
+        memcmp(marker->data, kMarkerData, sizeof(kMarkerData)) == 0) {
+      marker_found = true;
+    }
+  }
+  JXL_CHECK(marker_found);
+}
+
+void VerifyHeader(const CompressParams& jparams, j_decompress_ptr cinfo) {
+  if (jparams.set_jpeg_colorspace) {
+    JXL_CHECK(cinfo->jpeg_color_space == jparams.jpeg_color_space);
+  }
+  if (jparams.override_JFIF >= 0) {
+    JXL_CHECK(cinfo->saw_JFIF_marker == jparams.override_JFIF);
+  }
+  if (jparams.override_Adobe >= 0) {
+    JXL_CHECK(cinfo->saw_Adobe_marker == jparams.override_Adobe);
+  }
+  if (jparams.add_marker) {
+    CheckMarkerPresent(cinfo, kSpecialMarker0);
+    CheckMarkerPresent(cinfo, kSpecialMarker1);
+  }
+  jxl::msan::UnpoisonMemory(
+      cinfo->comp_info, cinfo->num_components * sizeof(cinfo->comp_info[0]));
+  int max_h_samp_factor = 1;
+  int max_v_samp_factor = 1;
+  for (int i = 0; i < cinfo->num_components; ++i) {
+    jpeg_component_info* comp = &cinfo->comp_info[i];
+    if (!jparams.comp_ids.empty()) {
+      JXL_CHECK(comp->component_id == jparams.comp_ids[i]);
+    }
+    if (!jparams.h_sampling.empty()) {
+      JXL_CHECK(comp->h_samp_factor == jparams.h_sampling[i]);
+    }
+    if (!jparams.v_sampling.empty()) {
+      JXL_CHECK(comp->v_samp_factor == jparams.v_sampling[i]);
+    }
+    if (!jparams.quant_indexes.empty()) {
+      JXL_CHECK(comp->quant_tbl_no == jparams.quant_indexes[i]);
+    }
+    max_h_samp_factor = std::max(max_h_samp_factor, comp->h_samp_factor);
+    max_v_samp_factor = std::max(max_v_samp_factor, comp->v_samp_factor);
+  }
+  JXL_CHECK(max_h_samp_factor == cinfo->max_h_samp_factor);
+  JXL_CHECK(max_v_samp_factor == cinfo->max_v_samp_factor);
+  int referenced_tables[NUM_QUANT_TBLS] = {};
+  for (int i = 0; i < cinfo->num_components; ++i) {
+    jpeg_component_info* comp = &cinfo->comp_info[i];
+    JXL_CHECK(comp->width_in_blocks ==
+              DivCeil(cinfo->image_width * comp->h_samp_factor,
+                      max_h_samp_factor * DCTSIZE));
+    JXL_CHECK(comp->height_in_blocks ==
+              DivCeil(cinfo->image_height * comp->v_samp_factor,
+                      max_v_samp_factor * DCTSIZE));
+    referenced_tables[comp->quant_tbl_no] = 1;
+  }
+  for (const auto& table : jparams.quant_tables) {
+    JQUANT_TBL* quant_table = cinfo->quant_tbl_ptrs[table.slot_idx];
+    if (!referenced_tables[table.slot_idx]) {
+      JXL_CHECK(quant_table == nullptr);
+      continue;
+    }
+    JXL_CHECK(quant_table != nullptr);
+    jxl::msan::UnpoisonMemory(quant_table, sizeof(*quant_table));
+    for (int k = 0; k < DCTSIZE2; ++k) {
+      JXL_CHECK(quant_table->quantval[k] == table.quantval[k]);
+    }
+  }
+}
+
+void VerifyScanHeader(const CompressParams& jparams, j_decompress_ptr cinfo) {
+  JXL_CHECK(cinfo->input_scan_number > 0);
+  if (cinfo->progressive_mode) {
+    JXL_CHECK(cinfo->Ss != 0 || cinfo->Se != 63);
+  } else {
+    JXL_CHECK(cinfo->Ss == 0 && cinfo->Se == 63);
+  }
+  if (jparams.progressive_mode > 2) {
+    JXL_CHECK(jparams.progressive_mode < 3 + kNumTestScripts);
+    const ScanScript& script = kTestScript[jparams.progressive_mode - 3];
+    JXL_CHECK(cinfo->input_scan_number <= script.num_scans);
+    const jpeg_scan_info& scan = script.scans[cinfo->input_scan_number - 1];
+    JXL_CHECK(cinfo->comps_in_scan == scan.comps_in_scan);
+    for (int i = 0; i < cinfo->comps_in_scan; ++i) {
+      JXL_CHECK(cinfo->cur_comp_info[i]->component_index ==
+                scan.component_index[i]);
+    }
+    JXL_CHECK(cinfo->Ss == scan.Ss);
+    JXL_CHECK(cinfo->Se == scan.Se);
+    JXL_CHECK(cinfo->Ah == scan.Ah);
+    JXL_CHECK(cinfo->Al == scan.Al);
+  }
+  if (jparams.restart_interval > 0) {
+    JXL_CHECK(cinfo->restart_interval == jparams.restart_interval);
+  } else if (jparams.restart_in_rows > 0) {
+    JXL_CHECK(cinfo->restart_interval ==
+              jparams.restart_in_rows * cinfo->MCUs_per_row);
+  }
+  if (jparams.progressive_mode == 0 && jparams.optimize_coding == 0) {
+    if (cinfo->jpeg_color_space == JCS_RGB) {
+      JXL_CHECK(cinfo->comp_info[0].dc_tbl_no == 0);
+      JXL_CHECK(cinfo->comp_info[1].dc_tbl_no == 0);
+      JXL_CHECK(cinfo->comp_info[2].dc_tbl_no == 0);
+      JXL_CHECK(cinfo->comp_info[0].ac_tbl_no == 0);
+      JXL_CHECK(cinfo->comp_info[1].ac_tbl_no == 0);
+      JXL_CHECK(cinfo->comp_info[2].ac_tbl_no == 0);
+    } else if (cinfo->jpeg_color_space == JCS_YCbCr) {
+      JXL_CHECK(cinfo->comp_info[0].dc_tbl_no == 0);
+      JXL_CHECK(cinfo->comp_info[1].dc_tbl_no == 1);
+      JXL_CHECK(cinfo->comp_info[2].dc_tbl_no == 1);
+      JXL_CHECK(cinfo->comp_info[0].ac_tbl_no == 0);
+      JXL_CHECK(cinfo->comp_info[1].ac_tbl_no == 1);
+      JXL_CHECK(cinfo->comp_info[2].ac_tbl_no == 1);
+    } else if (cinfo->jpeg_color_space == JCS_CMYK) {
+      JXL_CHECK(cinfo->comp_info[0].dc_tbl_no == 0);
+      JXL_CHECK(cinfo->comp_info[1].dc_tbl_no == 0);
+      JXL_CHECK(cinfo->comp_info[2].dc_tbl_no == 0);
+      JXL_CHECK(cinfo->comp_info[3].dc_tbl_no == 0);
+      JXL_CHECK(cinfo->comp_info[0].ac_tbl_no == 0);
+      JXL_CHECK(cinfo->comp_info[1].ac_tbl_no == 0);
+      JXL_CHECK(cinfo->comp_info[2].ac_tbl_no == 0);
+      JXL_CHECK(cinfo->comp_info[3].ac_tbl_no == 0);
+    } else if (cinfo->jpeg_color_space == JCS_YCCK) {
+      JXL_CHECK(cinfo->comp_info[0].dc_tbl_no == 0);
+      JXL_CHECK(cinfo->comp_info[1].dc_tbl_no == 1);
+      JXL_CHECK(cinfo->comp_info[2].dc_tbl_no == 1);
+      JXL_CHECK(cinfo->comp_info[3].dc_tbl_no == 0);
+      JXL_CHECK(cinfo->comp_info[0].ac_tbl_no == 0);
+      JXL_CHECK(cinfo->comp_info[1].ac_tbl_no == 1);
+      JXL_CHECK(cinfo->comp_info[2].ac_tbl_no == 1);
+      JXL_CHECK(cinfo->comp_info[3].ac_tbl_no == 0);
+    }
+    if (jparams.use_flat_dc_luma_code) {
+      JHUFF_TBL* tbl = cinfo->dc_huff_tbl_ptrs[0];
+      jxl::msan::UnpoisonMemory(tbl, sizeof(*tbl));
+      for (int i = 0; i < 15; ++i) {
+        JXL_CHECK(tbl->huffval[i] == i);
+      }
+    }
+  }
+}
+
+void UnmapColors(uint8_t* row, size_t xsize, int components,
+                 JSAMPARRAY colormap, size_t num_colors) {
+  JXL_CHECK(colormap != nullptr);
+  std::vector<uint8_t> tmp(xsize * components);
+  for (size_t x = 0; x < xsize; ++x) {
+    JXL_CHECK(row[x] < num_colors);
+    for (int c = 0; c < components; ++c) {
+      tmp[x * components + c] = colormap[c][row[x]];
+    }
+  }
+  memcpy(row, tmp.data(), tmp.size());
+}
+
+void ReadOutputPass(j_decompress_ptr cinfo, const DecompressParams& dparams,
+                    TestImage* output) {
+  JDIMENSION xoffset = 0;
+  JDIMENSION yoffset = 0;
+  JDIMENSION xsize_cropped = cinfo->output_width;
+  JDIMENSION ysize_cropped = cinfo->output_height;
+  if (dparams.crop_output) {
+    xoffset = xsize_cropped = cinfo->output_width / 3;
+    yoffset = ysize_cropped = cinfo->output_height / 3;
+    jpeg_crop_scanline(cinfo, &xoffset, &xsize_cropped);
+    JXL_CHECK(xsize_cropped == cinfo->output_width);
+  }
+  output->xsize = xsize_cropped;
+  output->ysize = ysize_cropped;
+  output->components = cinfo->out_color_components;
+  if (cinfo->quantize_colors) {
+    jxl::msan::UnpoisonMemory(cinfo->colormap, cinfo->out_color_components *
+                                                   sizeof(cinfo->colormap[0]));
+    for (int c = 0; c < cinfo->out_color_components; ++c) {
+      jxl::msan::UnpoisonMemory(
+          cinfo->colormap[c],
+          cinfo->actual_number_of_colors * sizeof(cinfo->colormap[c][0]));
+    }
+  }
+  if (!cinfo->raw_data_out) {
+    size_t stride = output->xsize * output->components;
+    output->pixels.resize(output->ysize * stride);
+    output->color_space = cinfo->out_color_space;
+    if (yoffset > 0) {
+      jpeg_skip_scanlines(cinfo, yoffset);
+    }
+    for (size_t y = 0; y < output->ysize; ++y) {
+      JSAMPROW rows[] = {
+          reinterpret_cast<JSAMPLE*>(&output->pixels[y * stride])};
+      JXL_CHECK(1 == jpeg_read_scanlines(cinfo, rows, 1));
+      jxl::msan::UnpoisonMemory(
+          rows[0], sizeof(JSAMPLE) * cinfo->output_components * output->xsize);
+      if (cinfo->quantize_colors) {
+        UnmapColors(rows[0], cinfo->output_width, cinfo->out_color_components,
+                    cinfo->colormap, cinfo->actual_number_of_colors);
+      }
+    }
+    if (cinfo->output_scanline < cinfo->output_height) {
+      jpeg_skip_scanlines(cinfo, cinfo->output_height - cinfo->output_scanline);
+    }
+  } else {
+    output->color_space = cinfo->jpeg_color_space;
+    for (int c = 0; c < cinfo->num_components; ++c) {
+      size_t xsize = cinfo->comp_info[c].width_in_blocks * DCTSIZE;
+      size_t ysize = cinfo->comp_info[c].height_in_blocks * DCTSIZE;
+      std::vector<uint8_t> plane(ysize * xsize);
+      output->raw_data.emplace_back(std::move(plane));
+    }
+    while (cinfo->output_scanline < cinfo->output_height) {
+      size_t iMCU_height = cinfo->max_v_samp_factor * DCTSIZE;
+      JXL_CHECK(cinfo->output_scanline == cinfo->output_iMCU_row * iMCU_height);
+      std::vector<std::vector<JSAMPROW>> rowdata(cinfo->num_components);
+      std::vector<JSAMPARRAY> data(cinfo->num_components);
+      for (int c = 0; c < cinfo->num_components; ++c) {
+        size_t xsize = cinfo->comp_info[c].width_in_blocks * DCTSIZE;
+        size_t ysize = cinfo->comp_info[c].height_in_blocks * DCTSIZE;
+        size_t num_lines = cinfo->comp_info[c].v_samp_factor * DCTSIZE;
+        rowdata[c].resize(num_lines);
+        size_t y0 = cinfo->output_iMCU_row * num_lines;
+        for (size_t i = 0; i < num_lines; ++i) {
+          rowdata[c][i] =
+              y0 + i < ysize ? &output->raw_data[c][(y0 + i) * xsize] : nullptr;
+        }
+        data[c] = &rowdata[c][0];
+      }
+      JXL_CHECK(iMCU_height ==
+                jpeg_read_raw_data(cinfo, &data[0], iMCU_height));
+    }
+  }
+  JXL_CHECK(cinfo->total_iMCU_rows ==
+            DivCeil(cinfo->image_height, cinfo->max_v_samp_factor * DCTSIZE));
+}
+
+void CopyCoefficients(j_decompress_ptr cinfo, jvirt_barray_ptr* coef_arrays,
+                      TestImage* output) {
+  output->xsize = cinfo->image_width;
+  output->ysize = cinfo->image_height;
+  output->components = cinfo->num_components;
+  output->color_space = cinfo->out_color_space;
+  j_common_ptr comptr = reinterpret_cast<j_common_ptr>(cinfo);
+  for (int c = 0; c < cinfo->num_components; ++c) {
+    jpeg_component_info* comp = &cinfo->comp_info[c];
+    std::vector<JCOEF> coeffs(comp->width_in_blocks * comp->height_in_blocks *
+                              DCTSIZE2);
+    for (size_t by = 0; by < comp->height_in_blocks; ++by) {
+      JBLOCKARRAY ba = (*cinfo->mem->access_virt_barray)(comptr, coef_arrays[c],
+                                                         by, 1, true);
+      size_t stride = comp->width_in_blocks * sizeof(JBLOCK);
+      size_t offset = by * comp->width_in_blocks * DCTSIZE2;
+      memcpy(&coeffs[offset], ba[0], stride);
+    }
+    output->coeffs.emplace_back(std::move(coeffs));
+  }
+}
+
+// Verifies that an image encoded with libjpegli can be decoded with libjpeg,
+// and checks that the jpeg coding metadata matches jparams.
+void DecodeAllScansWithLibjpeg(const CompressParams& jparams,
+                               const DecompressParams& dparams,
+                               const std::vector<uint8_t>& compressed,
+                               std::vector<TestImage>* output_progression) {
+  jpeg_decompress_struct cinfo = {};
+  const auto try_catch_block = [&]() {
+    ERROR_HANDLER_SETUP(jpeg);
+    jpeg_create_decompress(&cinfo);
+    jpeg_mem_src(&cinfo, compressed.data(), compressed.size());
+    if (jparams.add_marker) {
+      jpeg_save_markers(&cinfo, kSpecialMarker0, 0xffff);
+      jpeg_save_markers(&cinfo, kSpecialMarker1, 0xffff);
+    }
+    JXL_CHECK(JPEG_REACHED_SOS ==
+              jpeg_read_header(&cinfo, /*require_image=*/TRUE));
+    cinfo.buffered_image = TRUE;
+    SetDecompressParams(dparams, &cinfo, /*is_jpegli=*/false);
+    VerifyHeader(jparams, &cinfo);
+    JXL_CHECK(jpeg_start_decompress(&cinfo));
+    // start decompress should not read the whole input in buffered image mode
+    JXL_CHECK(!jpeg_input_complete(&cinfo));
+    JXL_CHECK(cinfo.output_scan_number == 0);
+    int sos_marker_cnt = 1;  // read header reads the first SOS marker
+    while (!jpeg_input_complete(&cinfo)) {
+      JXL_CHECK(cinfo.input_scan_number == sos_marker_cnt);
+      if (dparams.skip_scans && (cinfo.input_scan_number % 2) != 1) {
+        int result = JPEG_SUSPENDED;
+        while (result != JPEG_REACHED_SOS && result != JPEG_REACHED_EOI) {
+          result = jpeg_consume_input(&cinfo);
+        }
+        if (result == JPEG_REACHED_SOS) ++sos_marker_cnt;
+        continue;
+      }
+      SetScanDecompressParams(dparams, &cinfo, cinfo.input_scan_number,
+                              /*is_jpegli=*/false);
+      JXL_CHECK(jpeg_start_output(&cinfo, cinfo.input_scan_number));
+      // start output sets output_scan_number, but does not change
+      // input_scan_number
+      JXL_CHECK(cinfo.output_scan_number == cinfo.input_scan_number);
+      JXL_CHECK(cinfo.input_scan_number == sos_marker_cnt);
+      VerifyScanHeader(jparams, &cinfo);
+      TestImage output;
+      ReadOutputPass(&cinfo, dparams, &output);
+      output_progression->emplace_back(std::move(output));
+      // read scanlines/read raw data does not change input/output scan number
+      if (!cinfo.progressive_mode) {
+        JXL_CHECK(cinfo.input_scan_number == sos_marker_cnt);
+        JXL_CHECK(cinfo.output_scan_number == cinfo.input_scan_number);
+      }
+      JXL_CHECK(jpeg_finish_output(&cinfo));
+      ++sos_marker_cnt;  // finish output reads the next SOS marker or EOI
+      if (dparams.output_mode == COEFFICIENTS) {
+        jvirt_barray_ptr* coef_arrays = jpeg_read_coefficients(&cinfo);
+        JXL_CHECK(coef_arrays != nullptr);
+        CopyCoefficients(&cinfo, coef_arrays, &output_progression->back());
+      }
+    }
+    JXL_CHECK(jpeg_finish_decompress(&cinfo));
+    return true;
+  };
+  JXL_CHECK(try_catch_block());
+  jpeg_destroy_decompress(&cinfo);
+}
+
+void DecodeWithLibjpeg(const CompressParams& jparams,
+                       const DecompressParams& dparams, j_decompress_ptr cinfo,
+                       TestImage* output) {
+  if (jparams.add_marker) {
+    jpeg_save_markers(cinfo, kSpecialMarker0, 0xffff);
+    jpeg_save_markers(cinfo, kSpecialMarker1, 0xffff);
+  }
+  if (!jparams.icc.empty()) {
+    jpeg_save_markers(cinfo, JPEG_APP0 + 2, 0xffff);
+  }
+  JXL_CHECK(JPEG_REACHED_SOS ==
+            jpeg_read_header(cinfo, /*require_image=*/TRUE));
+  if (!jparams.icc.empty()) {
+    uint8_t* icc_data = nullptr;
+    unsigned int icc_len;
+    JXL_CHECK(jpeg_read_icc_profile(cinfo, &icc_data, &icc_len));
+    JXL_CHECK(icc_data);
+    jxl::msan::UnpoisonMemory(icc_data, icc_len);
+    JXL_CHECK(0 == memcmp(jparams.icc.data(), icc_data, icc_len));
+    free(icc_data);
+  }
+  SetDecompressParams(dparams, cinfo, /*is_jpegli=*/false);
+  VerifyHeader(jparams, cinfo);
+  if (dparams.output_mode == COEFFICIENTS) {
+    jvirt_barray_ptr* coef_arrays = jpeg_read_coefficients(cinfo);
+    JXL_CHECK(coef_arrays != nullptr);
+    CopyCoefficients(cinfo, coef_arrays, output);
+  } else {
+    JXL_CHECK(jpeg_start_decompress(cinfo));
+    VerifyScanHeader(jparams, cinfo);
+    ReadOutputPass(cinfo, dparams, output);
+  }
+  JXL_CHECK(jpeg_finish_decompress(cinfo));
+}
+
+void DecodeWithLibjpeg(const CompressParams& jparams,
+                       const DecompressParams& dparams,
+                       const std::vector<uint8_t>& compressed,
+                       TestImage* output) {
+  jpeg_decompress_struct cinfo = {};
+  const auto try_catch_block = [&]() {
+    ERROR_HANDLER_SETUP(jpeg);
+    jpeg_create_decompress(&cinfo);
+    jpeg_mem_src(&cinfo, compressed.data(), compressed.size());
+    DecodeWithLibjpeg(jparams, dparams, &cinfo, output);
+    return true;
+  };
+  JXL_CHECK(try_catch_block());
+  jpeg_destroy_decompress(&cinfo);
+}
+
+void DumpImage(const TestImage& image, const std::string fn) {
+  JXL_CHECK(image.components == 1 || image.components == 3);
+  jxl::FileWrapper f(fn.c_str(), "wb");
+  size_t bytes_per_sample = jpegli_bytes_per_sample(image.data_type);
+  uint32_t maxval = (1u << (8 * bytes_per_sample)) - 1;
+  char type = image.components == 1 ? '5' : '6';
+  fprintf(f, "P%c\n%" PRIuS " %" PRIuS "\n%u\n", type, image.xsize, image.ysize,
+          maxval);
+  fwrite(image.pixels.data(), 1, image.pixels.size(), f);
+}
+
+double DistanceRms(const TestImage& input, const TestImage& output,
+                   size_t start_line, size_t num_lines, double* max_diff) {
+  size_t stride = input.xsize * input.components;
+  size_t start_offset = start_line * stride;
+  auto get_sample = [&](const TestImage& im, const std::vector<uint8_t>& data,
+                        size_t idx) -> double {
+    size_t bytes_per_sample = jpegli_bytes_per_sample(im.data_type);
+    bool is_little_endian =
+        (im.endianness == JPEGLI_LITTLE_ENDIAN ||
+         (im.endianness == JPEGLI_NATIVE_ENDIAN && IsLittleEndian()));
+    size_t offset = start_offset + idx * bytes_per_sample;
+    JXL_CHECK(offset < data.size());
+    const uint8_t* p = &data[offset];
+    if (im.data_type == JPEGLI_TYPE_UINT8) {
+      static const double mul8 = 1.0 / 255.0;
+      return p[0] * mul8;
+    } else if (im.data_type == JPEGLI_TYPE_UINT16) {
+      static const double mul16 = 1.0 / 65535.0;
+      return (is_little_endian ? LoadLE16(p) : LoadBE16(p)) * mul16;
+    } else if (im.data_type == JPEGLI_TYPE_FLOAT) {
+      return (is_little_endian ? LoadLEFloat(p) : LoadBEFloat(p));
+    }
+    return 0.0;
+  };
+  double diff2 = 0.0;
+  size_t num_samples = 0;
+  if (max_diff) *max_diff = 0.0;
+  if (!input.pixels.empty() && !output.pixels.empty()) {
+    num_samples = num_lines * stride;
+    for (size_t i = 0; i < num_samples; ++i) {
+      double sample_orig = get_sample(input, input.pixels, i);
+      double sample_output = get_sample(output, output.pixels, i);
+      double diff = sample_orig - sample_output;
+      if (max_diff) *max_diff = std::max(*max_diff, 255.0 * std::abs(diff));
+      diff2 += diff * diff;
+    }
+  } else {
+    JXL_CHECK(!input.raw_data.empty());
+    JXL_CHECK(!output.raw_data.empty());
+    for (size_t c = 0; c < input.raw_data.size(); ++c) {
+      JXL_CHECK(c < output.raw_data.size());
+      num_samples += input.raw_data[c].size();
+      for (size_t i = 0; i < input.raw_data[c].size(); ++i) {
+        double sample_orig = get_sample(input, input.raw_data[c], i);
+        double sample_output = get_sample(output, output.raw_data[c], i);
+        double diff = sample_orig - sample_output;
+        if (max_diff) *max_diff = std::max(*max_diff, 255.0 * std::abs(diff));
+        diff2 += diff * diff;
+      }
+    }
+  }
+  return std::sqrt(diff2 / num_samples) * 255.0;
+}
+
+double DistanceRms(const TestImage& input, const TestImage& output,
+                   double* max_diff) {
+  return DistanceRms(input, output, 0, output.ysize, max_diff);
+}
+
+void VerifyOutputImage(const TestImage& input, const TestImage& output,
+                       size_t start_line, size_t num_lines, double max_rms,
+                       double max_diff) {
+  double max_d;
+  double rms = DistanceRms(input, output, start_line, num_lines, &max_d);
+  printf("rms: %f, max_rms: %f, max_d: %f,  max_diff: %f\n", rms, max_rms,
+         max_d, max_diff);
+  JXL_CHECK(rms <= max_rms);
+  JXL_CHECK(max_d <= max_diff);
+}
+
+void VerifyOutputImage(const TestImage& input, const TestImage& output,
+                       double max_rms, double max_diff) {
+  JXL_CHECK(output.xsize == input.xsize);
+  JXL_CHECK(output.ysize == input.ysize);
+  JXL_CHECK(output.components == input.components);
+  JXL_CHECK(output.color_space == input.color_space);
+  if (!input.coeffs.empty()) {
+    JXL_CHECK(input.coeffs.size() == input.components);
+    JXL_CHECK(output.coeffs.size() == input.components);
+    for (size_t c = 0; c < input.components; ++c) {
+      JXL_CHECK(output.coeffs[c].size() == input.coeffs[c].size());
+      JXL_CHECK(0 == memcmp(input.coeffs[c].data(), output.coeffs[c].data(),
+                            input.coeffs[c].size()));
+    }
+  } else {
+    VerifyOutputImage(input, output, 0, output.ysize, max_rms, max_diff);
+  }
+}
+
+}  // namespace jpegli
diff --git a/third_party/jpeg-xl/lib/jpegli/test_utils.h b/third_party/jpeg-xl/lib/jpegli/test_utils.h
new file mode 100644
index 0000000000..f300b5de9e
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/test_utils.h
@@ -0,0 +1,318 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JPEGLI_TEST_UTILS_H_
+#define LIB_JPEGLI_TEST_UTILS_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <algorithm>
+#include <string>
+#include <vector>
+
+/* clang-format off */
+#include <stdio.h>
+#include <jpeglib.h>
+#include <setjmp.h>
+/* clang-format on */
+
+#include "lib/jpegli/common.h"
+
+namespace jpegli {
+
+// We define this here as well to make sure that the *_api_test.cc tests only
+// use the public API and therefore we don't include any *_internal.h headers.
+template <typename T1, typename T2>
+constexpr inline T1 DivCeil(T1 a, T2 b) {
+  return (a + b - 1) / b;
+}
+
+#define ERROR_HANDLER_SETUP(flavor)                                \
+  jpeg_error_mgr jerr;                                             \
+  jmp_buf env;                                                     \
+  cinfo.err = flavor##_std_error(&jerr);                           \
+  if (setjmp(env)) {                                               \
+    return false;                                                  \
+  }                                                                \
+  cinfo.client_data = reinterpret_cast<void*>(&env);               \
+  cinfo.err->error_exit = [](j_common_ptr cinfo) {                 \
+    (*cinfo->err->output_message)(cinfo);                          \
+    jmp_buf* env = reinterpret_cast<jmp_buf*>(cinfo->client_data); \
+    flavor##_destroy(cinfo);                                       \
+    longjmp(*env, 1);                                              \
+  };
+
+#define ARRAY_SIZE(X) (sizeof(X) / sizeof((X)[0]))
+
+static constexpr int kSpecialMarker0 = 0xe5;
+static constexpr int kSpecialMarker1 = 0xe9;
+static constexpr uint8_t kMarkerData[] = {0, 1, 255, 0, 17};
+static constexpr uint8_t kMarkerSequence[] = {0xe6, 0xe8, 0xe7,
+                                              0xe6, 0xe7, 0xe8};
+static constexpr size_t kMarkerSequenceLen = ARRAY_SIZE(kMarkerSequence);
+
+static constexpr jpeg_scan_info kScript1[] = {
+    {1, {0}, 0, 63, 0, 0},
+    {1, {1}, 0, 63, 0, 0},
+    {1, {2}, 0, 63, 0, 0},
+};
+
+static constexpr jpeg_scan_info kScript2[] = {
+    {3, {0, 1, 2}, 0, 0, 0, 0},
+    {1, {0}, 1, 63, 0, 0},
+    {1, {1}, 1, 63, 0, 0},
+    {1, {2}, 1, 63, 0, 0},
+};
+static constexpr jpeg_scan_info kScript3[] = {
+    {1, {0}, 0, 0, 0, 0},  {1, {1}, 0, 0, 0, 0},  {1, {2}, 0, 0, 0, 0},
+    {1, {0}, 1, 63, 0, 0}, {1, {1}, 1, 63, 0, 0}, {1, {2}, 1, 63, 0, 0},
+};
+static constexpr jpeg_scan_info kScript4[] = {
+    {3, {0, 1, 2}, 0, 0, 0, 0}, {1, {0}, 1, 63, 0, 1}, {1, {1}, 1, 63, 0, 1},
+    {1, {2}, 1, 63, 0, 1},      {1, {0}, 1, 63, 1, 0}, {1, {1}, 1, 63, 1, 0},
+    {1, {2}, 1, 63, 1, 0},
+};
+
+struct ScanScript {
+  int num_scans;
+  const jpeg_scan_info* scans;
+};
+
+static constexpr ScanScript kTestScript[] = {
+    {ARRAY_SIZE(kScript1), kScript1},
+    {ARRAY_SIZE(kScript2), kScript2},
+    {ARRAY_SIZE(kScript3), kScript3},
+    {ARRAY_SIZE(kScript4), kScript4},
+};
+static constexpr int kNumTestScripts = ARRAY_SIZE(kTestScript);
+
+static constexpr int kLastScan = 0xffff;
+
+static uint32_t kTestColorMap[] = {
+    0x000000, 0xff0000, 0x00ff00, 0x0000ff, 0xffff00, 0x00ffff,
+    0xff00ff, 0xffffff, 0x6251fc, 0x45d9c7, 0xa7f059, 0xd9a945,
+    0xfa4e44, 0xceaffc, 0xbad7db, 0xc1f0b1, 0xdbca9a, 0xfacac5,
+    0xf201ff, 0x0063db, 0x00f01c, 0xdbb204, 0xf12f0c, 0x7ba1dc};
+static constexpr int kTestColorMapNumColors = ARRAY_SIZE(kTestColorMap);
+
+std::string IOMethodName(JpegliDataType data_type, JpegliEndianness endianness);
+
+std::string ColorSpaceName(J_COLOR_SPACE colorspace);
+
+enum JpegIOMode {
+  PIXELS,
+  RAW_DATA,
+  COEFFICIENTS,
+};
+
+struct CustomQuantTable {
+  int slot_idx = 0;
+  uint16_t table_type = 0;
+  int scale_factor = 100;
+  bool add_raw = false;
+  bool force_baseline = true;
+  std::vector<unsigned int> basic_table;
+  std::vector<unsigned int> quantval;
+  void Generate();
+};
+
+struct TestImage {
+  size_t xsize = 2268;
+  size_t ysize = 1512;
+  J_COLOR_SPACE color_space = JCS_RGB;
+  size_t components = 3;
+  JpegliDataType data_type = JPEGLI_TYPE_UINT8;
+  JpegliEndianness endianness = JPEGLI_NATIVE_ENDIAN;
+  std::vector<uint8_t> pixels;
+  std::vector<std::vector<uint8_t>> raw_data;
+  std::vector<std::vector<JCOEF>> coeffs;
+  void AllocatePixels() {
+    pixels.resize(ysize * xsize * components *
+                  jpegli_bytes_per_sample(data_type));
+  }
+  void Clear() {
+    pixels.clear();
+    raw_data.clear();
+    coeffs.clear();
+  }
+};
+
+std::ostream& operator<<(std::ostream& os, const TestImage& input);
+
+struct CompressParams {
+  int quality = 90;
+  bool set_jpeg_colorspace = false;
+  J_COLOR_SPACE jpeg_color_space = JCS_UNKNOWN;
+  std::vector<int> quant_indexes;
+  std::vector<CustomQuantTable> quant_tables;
+  std::vector<int> h_sampling;
+  std::vector<int> v_sampling;
+  std::vector<int> comp_ids;
+  int override_JFIF = -1;
+  int override_Adobe = -1;
+  bool add_marker = false;
+  bool simple_progression = false;
+  // -1 is library default
+  // 0, 1, 2 is set through jpegli_set_progressive_level()
+  // 2 + N is kScriptN
+  int progressive_mode = -1;
+  unsigned int restart_interval = 0;
+  int restart_in_rows = 0;
+  int smoothing_factor = 0;
+  int optimize_coding = -1;
+  bool use_flat_dc_luma_code = false;
+  bool omit_standard_tables = false;
+  bool xyb_mode = false;
+  bool libjpeg_mode = false;
+  bool use_adaptive_quantization = true;
+  std::vector<uint8_t> icc;
+
+  int h_samp(int c) const { return h_sampling.empty() ? 1 : h_sampling[c]; }
+  int v_samp(int c) const { return v_sampling.empty() ? 1 : v_sampling[c]; }
+  int max_h_sample() const {
+    auto it = std::max_element(h_sampling.begin(), h_sampling.end());
+    return it == h_sampling.end() ? 1 : *it;
+  }
+  int max_v_sample() const {
+    auto it = std::max_element(v_sampling.begin(), v_sampling.end());
+    return it == v_sampling.end() ? 1 : *it;
+  }
+  int comp_width(const TestImage& input, int c) const {
+    return DivCeil(input.xsize * h_samp(c), max_h_sample() * DCTSIZE) * DCTSIZE;
+  }
+  int comp_height(const TestImage& input, int c) const {
+    return DivCeil(input.ysize * v_samp(c), max_v_sample() * DCTSIZE) * DCTSIZE;
+  }
+};
+
+std::ostream& operator<<(std::ostream& os, const CompressParams& jparams);
+
+void VerifyHeader(const CompressParams& jparams, j_decompress_ptr cinfo);
+void VerifyScanHeader(const CompressParams& jparams, j_decompress_ptr cinfo);
+
+enum ColorQuantMode {
+  CQUANT_1PASS,
+  CQUANT_2PASS,
+  CQUANT_EXTERNAL,
+  CQUANT_REUSE,
+};
+
+struct ScanDecompressParams {
+  int max_scan_number;
+  J_DITHER_MODE dither_mode;
+  ColorQuantMode color_quant_mode;
+};
+
+struct DecompressParams {
+  float size_factor = 1.0f;
+  size_t chunk_size = 65536;
+  size_t max_output_lines = 16;
+  JpegIOMode output_mode = PIXELS;
+  JpegliDataType data_type = JPEGLI_TYPE_UINT8;
+  JpegliEndianness endianness = JPEGLI_NATIVE_ENDIAN;
+  bool set_out_color_space = false;
+  J_COLOR_SPACE out_color_space = JCS_UNKNOWN;
+  bool crop_output = false;
+  bool do_block_smoothing = false;
+  bool do_fancy_upsampling = true;
+  bool skip_scans = false;
+  int scale_num = 1;
+  int scale_denom = 1;
+  bool quantize_colors = false;
+  int desired_number_of_colors = 256;
+  std::vector<ScanDecompressParams> scan_params;
+};
+
+void SetDecompressParams(const DecompressParams& dparams,
+                         j_decompress_ptr cinfo, bool is_jpegli);
+
+void SetScanDecompressParams(const DecompressParams& dparams,
+                             j_decompress_ptr cinfo, int scan_number,
+                             bool is_jpegli);
+
+void CopyCoefficients(j_decompress_ptr cinfo, jvirt_barray_ptr* coef_arrays,
+                      TestImage* output);
+
+void UnmapColors(uint8_t* row, size_t xsize, int components,
+                 JSAMPARRAY colormap, size_t num_colors);
+
+std::string GetTestDataPath(const std::string& filename);
+std::vector<uint8_t> ReadTestData(const std::string& filename);
+
+class PNMParser {
+ public:
+  explicit PNMParser(const uint8_t* data, const size_t len)
+      : pos_(data), end_(data + len) {}
+
+  // Sets "pos" to the first non-header byte/pixel on success.
+  bool ParseHeader(const uint8_t** pos, size_t* xsize, size_t* ysize,
+                   size_t* num_channels, size_t* bitdepth);
+
+ private:
+  static bool IsLineBreak(const uint8_t c) { return c == '\r' || c == '\n'; }
+  static bool IsWhitespace(const uint8_t c) {
+    return IsLineBreak(c) || c == '\t' || c == ' ';
+  }
+
+  bool ParseUnsigned(size_t* number);
+
+  bool SkipWhitespace();
+
+  const uint8_t* pos_;
+  const uint8_t* const end_;
+};
+
+bool ReadPNM(const std::vector<uint8_t>& data, size_t* xsize, size_t* ysize,
+             size_t* num_channels, size_t* bitdepth,
+             std::vector<uint8_t>* pixels);
+
+void SetNumChannels(J_COLOR_SPACE colorspace, size_t* channels);
+
+void ConvertToGrayscale(TestImage* img);
+
+void GeneratePixels(TestImage* img);
+
+void GenerateRawData(const CompressParams& jparams, TestImage* img);
+
+void GenerateCoeffs(const CompressParams& jparams, TestImage* img);
+
+void EncodeWithJpegli(const TestImage& input, const CompressParams& jparams,
+                      j_compress_ptr cinfo);
+
+bool EncodeWithJpegli(const TestImage& input, const CompressParams& jparams,
+                      std::vector<uint8_t>* compressed);
+
+// Verifies that an image encoded with libjpegli can be decoded with libjpeg,
+// and checks that the jpeg coding metadata matches jparams.
+void DecodeAllScansWithLibjpeg(const CompressParams& jparams,
+                               const DecompressParams& dparams,
+                               const std::vector<uint8_t>& compressed,
+                               std::vector<TestImage>* output_progression);
+void DecodeWithLibjpeg(const CompressParams& jparams,
+                       const DecompressParams& dparams, j_decompress_ptr cinfo,
+                       TestImage* output);
+void DecodeWithLibjpeg(const CompressParams& jparams,
+                       const DecompressParams& dparams,
+                       const std::vector<uint8_t>& compressed,
+                       TestImage* output);
+
+double DistanceRms(const TestImage& input, const TestImage& output,
+                   size_t start_line, size_t num_lines,
+                   double* max_diff = nullptr);
+
+double DistanceRms(const TestImage& input, const TestImage& output,
+                   double* max_diff = nullptr);
+
+void VerifyOutputImage(const TestImage& input, const TestImage& output,
+                       size_t start_line, size_t num_lines, double max_rms,
+                       double max_diff = 255.0);
+
+void VerifyOutputImage(const TestImage& input, const TestImage& output,
+                       double max_rms, double max_diff = 255.0);
+
+}  // namespace jpegli
+
+#endif  // LIB_JPEGLI_TEST_UTILS_H_
diff --git a/third_party/jpeg-xl/lib/jpegli/testing.h b/third_party/jpeg-xl/lib/jpegli/testing.h
new file mode 100644
index 0000000000..873a0171e7
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/testing.h
@@ -0,0 +1,35 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JPEGLI_TESTING_H_
+#define LIB_JPEGLI_TESTING_H_
+
+// GTest/GMock specific macros / wrappers.
+
+// gmock unconditionally redefines those macros (to wrong values).
+// Lets include it only here and mitigate the problem.
+#pragma push_macro("PRIdS")
+#pragma push_macro("PRIuS")
+#include "gmock/gmock.h"
+#pragma pop_macro("PRIuS")
+#pragma pop_macro("PRIdS")
+
+#include "gtest/gtest.h"
+
+// googletest before 1.10 didn't define INSTANTIATE_TEST_SUITE_P() but instead
+// used INSTANTIATE_TEST_CASE_P which is now deprecated.
+#ifdef INSTANTIATE_TEST_SUITE_P
+#define JPEGLI_INSTANTIATE_TEST_SUITE_P INSTANTIATE_TEST_SUITE_P
+#else
+#define JPEGLI_INSTANTIATE_TEST_SUITE_P INSTANTIATE_TEST_CASE_P
+#endif
+
+// Ensures that we don't make our test bounds too lax, effectively disabling the
+// tests.
+MATCHER_P(IsSlightlyBelow, max, "") {
+  return max * 0.75 <= arg && arg <= max * 1.0;
+}
+
+#endif  // LIB_JPEGLI_TESTING_H_
diff --git a/third_party/jpeg-xl/lib/jpegli/transcode_api_test.cc b/third_party/jpeg-xl/lib/jpegli/transcode_api_test.cc
new file mode 100644
index 0000000000..1d99ce37fa
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/transcode_api_test.cc
@@ -0,0 +1,133 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <vector>
+
+#include "lib/jpegli/decode.h"
+#include "lib/jpegli/encode.h"
+#include "lib/jpegli/test_utils.h"
+#include "lib/jpegli/testing.h"
+#include "lib/jxl/base/status.h"
+
+namespace jpegli {
+namespace {
+
+void TranscodeWithJpegli(const std::vector<uint8_t>& jpeg_input,
+                         const CompressParams& jparams,
+                         std::vector<uint8_t>* jpeg_output) {
+  jpeg_decompress_struct dinfo = {};
+  jpeg_compress_struct cinfo = {};
+  uint8_t* transcoded_data = nullptr;
+  unsigned long transcoded_size;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    dinfo.err = cinfo.err;
+    dinfo.client_data = cinfo.client_data;
+    jpegli_create_decompress(&dinfo);
+    jpegli_mem_src(&dinfo, jpeg_input.data(), jpeg_input.size());
+    EXPECT_EQ(JPEG_REACHED_SOS,
+              jpegli_read_header(&dinfo, /*require_image=*/TRUE));
+    jvirt_barray_ptr* coef_arrays = jpegli_read_coefficients(&dinfo);
+    JXL_CHECK(coef_arrays != nullptr);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &transcoded_data, &transcoded_size);
+    jpegli_copy_critical_parameters(&dinfo, &cinfo);
+    jpegli_set_progressive_level(&cinfo, jparams.progressive_mode);
+    cinfo.optimize_coding = jparams.optimize_coding;
+    jpegli_write_coefficients(&cinfo, coef_arrays);
+    jpegli_finish_compress(&cinfo);
+    jpegli_finish_decompress(&dinfo);
+    return true;
+  };
+  ASSERT_TRUE(try_catch_block());
+  jpegli_destroy_decompress(&dinfo);
+  jpegli_destroy_compress(&cinfo);
+  if (transcoded_data) {
+    jpeg_output->assign(transcoded_data, transcoded_data + transcoded_size);
+    free(transcoded_data);
+  }
+}
+
+struct TestConfig {
+  TestImage input;
+  CompressParams jparams;
+};
+
+class TranscodeAPITestParam : public ::testing::TestWithParam<TestConfig> {};
+
+TEST_P(TranscodeAPITestParam, TestAPI) {
+  TestConfig config = GetParam();
+  CompressParams& jparams = config.jparams;
+  GeneratePixels(&config.input);
+
+  // Start with sequential non-optimized jpeg.
+  jparams.progressive_mode = 0;
+  jparams.optimize_coding = 0;
+  std::vector<uint8_t> compressed;
+  ASSERT_TRUE(EncodeWithJpegli(config.input, jparams, &compressed));
+  TestImage output0;
+  DecodeWithLibjpeg(jparams, DecompressParams(), compressed, &output0);
+
+  // Transcode first to a sequential optimized jpeg, and then further to
+  // a progressive jpeg.
+  for (int progr : {0, 2}) {
+    std::vector<uint8_t> transcoded;
+    jparams.progressive_mode = progr;
+    jparams.optimize_coding = 1;
+    TranscodeWithJpegli(compressed, jparams, &transcoded);
+
+    // We expect a size reduction of at least 2%.
+    EXPECT_LT(transcoded.size(), compressed.size() * 0.98f);
+
+    // Verify that transcoding is lossless.
+    TestImage output1;
+    DecodeWithLibjpeg(jparams, DecompressParams(), transcoded, &output1);
+    ASSERT_EQ(output0.pixels.size(), output1.pixels.size());
+    EXPECT_EQ(0, memcmp(output0.pixels.data(), output1.pixels.data(),
+                        output0.pixels.size()));
+    compressed = transcoded;
+  }
+}
+
+std::vector<TestConfig> GenerateTests() {
+  std::vector<TestConfig> all_tests;
+  const size_t xsize0 = 1024;
+  const size_t ysize0 = 768;
+  for (int dxsize : {0, 1, 8, 9}) {
+    for (int dysize : {0, 1, 8, 9}) {
+      for (int h_sampling : {1, 2}) {
+        for (int v_sampling : {1, 2}) {
+          TestConfig config;
+          config.input.xsize = xsize0 + dxsize;
+          config.input.ysize = ysize0 + dysize;
+          config.jparams.h_sampling = {h_sampling, 1, 1};
+          config.jparams.v_sampling = {v_sampling, 1, 1};
+          all_tests.push_back(config);
+        }
+      }
+    }
+  }
+  return all_tests;
+}
+
+std::ostream& operator<<(std::ostream& os, const TestConfig& c) {
+  os << c.input;
+  os << c.jparams;
+  return os;
+}
+
+std::string TestDescription(
+    const testing::TestParamInfo<TranscodeAPITestParam::ParamType>& info) {
+  std::stringstream name;
+  name << info.param;
+  return name.str();
+}
+
+JPEGLI_INSTANTIATE_TEST_SUITE_P(TranscodeAPITest, TranscodeAPITestParam,
+                                testing::ValuesIn(GenerateTests()),
+                                TestDescription);
+
+}  // namespace
+}  // namespace jpegli
diff --git a/third_party/jpeg-xl/lib/jpegli/transpose-inl.h b/third_party/jpeg-xl/lib/jpegli/transpose-inl.h
new file mode 100644
index 0000000000..9fdd222f4e
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/transpose-inl.h
@@ -0,0 +1,111 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#if defined(LIB_JPEGLI_TRANSPOSE_INL_H_) == defined(HWY_TARGET_TOGGLE)
+#ifdef LIB_JPEGLI_TRANSPOSE_INL_H_
+#undef LIB_JPEGLI_TRANSPOSE_INL_H_
+#else
+#define LIB_JPEGLI_TRANSPOSE_INL_H_
+#endif
+
+#include "lib/jxl/base/compiler_specific.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jpegli {
+namespace HWY_NAMESPACE {
+namespace {
+
+#if HWY_CAP_GE256
+static JXL_INLINE void Transpose8x8Block(const float* JXL_RESTRICT from,
+                                         float* JXL_RESTRICT to) {
+  const HWY_CAPPED(float, 8) d;
+  auto i0 = Load(d, from);
+  auto i1 = Load(d, from + 1 * 8);
+  auto i2 = Load(d, from + 2 * 8);
+  auto i3 = Load(d, from + 3 * 8);
+  auto i4 = Load(d, from + 4 * 8);
+  auto i5 = Load(d, from + 5 * 8);
+  auto i6 = Load(d, from + 6 * 8);
+  auto i7 = Load(d, from + 7 * 8);
+
+  const auto q0 = InterleaveLower(d, i0, i2);
+  const auto q1 = InterleaveLower(d, i1, i3);
+  const auto q2 = InterleaveUpper(d, i0, i2);
+  const auto q3 = InterleaveUpper(d, i1, i3);
+  const auto q4 = InterleaveLower(d, i4, i6);
+  const auto q5 = InterleaveLower(d, i5, i7);
+  const auto q6 = InterleaveUpper(d, i4, i6);
+  const auto q7 = InterleaveUpper(d, i5, i7);
+
+  const auto r0 = InterleaveLower(d, q0, q1);
+  const auto r1 = InterleaveUpper(d, q0, q1);
+  const auto r2 = InterleaveLower(d, q2, q3);
+  const auto r3 = InterleaveUpper(d, q2, q3);
+  const auto r4 = InterleaveLower(d, q4, q5);
+  const auto r5 = InterleaveUpper(d, q4, q5);
+  const auto r6 = InterleaveLower(d, q6, q7);
+  const auto r7 = InterleaveUpper(d, q6, q7);
+
+  i0 = ConcatLowerLower(d, r4, r0);
+  i1 = ConcatLowerLower(d, r5, r1);
+  i2 = ConcatLowerLower(d, r6, r2);
+  i3 = ConcatLowerLower(d, r7, r3);
+  i4 = ConcatUpperUpper(d, r4, r0);
+  i5 = ConcatUpperUpper(d, r5, r1);
+  i6 = ConcatUpperUpper(d, r6, r2);
+  i7 = ConcatUpperUpper(d, r7, r3);
+
+  Store(i0, d, to);
+  Store(i1, d, to + 1 * 8);
+  Store(i2, d, to + 2 * 8);
+  Store(i3, d, to + 3 * 8);
+  Store(i4, d, to + 4 * 8);
+  Store(i5, d, to + 5 * 8);
+  Store(i6, d, to + 6 * 8);
+  Store(i7, d, to + 7 * 8);
+}
+#elif HWY_TARGET != HWY_SCALAR
+static JXL_INLINE void Transpose8x8Block(const float* JXL_RESTRICT from,
+                                         float* JXL_RESTRICT to) {
+  const HWY_CAPPED(float, 4) d;
+  for (size_t n = 0; n < 8; n += 4) {
+    for (size_t m = 0; m < 8; m += 4) {
+      auto p0 = Load(d, from + n * 8 + m);
+      auto p1 = Load(d, from + (n + 1) * 8 + m);
+      auto p2 = Load(d, from + (n + 2) * 8 + m);
+      auto p3 = Load(d, from + (n + 3) * 8 + m);
+      const auto q0 = InterleaveLower(d, p0, p2);
+      const auto q1 = InterleaveLower(d, p1, p3);
+      const auto q2 = InterleaveUpper(d, p0, p2);
+      const auto q3 = InterleaveUpper(d, p1, p3);
+
+      const auto r0 = InterleaveLower(d, q0, q1);
+      const auto r1 = InterleaveUpper(d, q0, q1);
+      const auto r2 = InterleaveLower(d, q2, q3);
+      const auto r3 = InterleaveUpper(d, q2, q3);
+      Store(r0, d, to + m * 8 + n);
+      Store(r1, d, to + (1 + m) * 8 + n);
+      Store(r2, d, to + (2 + m) * 8 + n);
+      Store(r3, d, to + (3 + m) * 8 + n);
+    }
+  }
+}
+#else
+static JXL_INLINE void Transpose8x8Block(const float* JXL_RESTRICT from,
+                                         float* JXL_RESTRICT to) {
+  for (size_t n = 0; n < 8; ++n) {
+    for (size_t m = 0; m < 8; ++m) {
+      to[8 * n + m] = from[8 * m + n];
+    }
+  }
+}
+#endif
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace
+}  // namespace HWY_NAMESPACE
+}  // namespace jpegli
+HWY_AFTER_NAMESPACE();
+#endif  // LIB_JPEGLI_TRANSPOSE_INL_H_
diff --git a/third_party/jpeg-xl/lib/jpegli/upsample.cc b/third_party/jpeg-xl/lib/jpegli/upsample.cc
new file mode 100644
index 0000000000..5559aa78a6
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/upsample.cc
@@ -0,0 +1,137 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/upsample.h"
+
+#include <string.h>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jpegli/upsample.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+HWY_BEFORE_NAMESPACE();
+namespace jpegli {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Mul;
+using hwy::HWY_NAMESPACE::MulAdd;
+using hwy::HWY_NAMESPACE::Vec;
+
+#if HWY_CAP_GE512
+using hwy::HWY_NAMESPACE::Half;
+using hwy::HWY_NAMESPACE::Vec;
+template <size_t i, class DF, class V>
+HWY_INLINE Vec<Half<Half<DF>>> Quarter(const DF df, V v) {
+  using HF = Half<DF>;
+  using HHF = Half<HF>;
+  auto half = i >= 2 ? UpperHalf(HF(), v) : LowerHalf(HF(), v);
+  return i & 1 ? UpperHalf(HHF(), half) : LowerHalf(HHF(), half);
+}
+
+template <class DF, class V>
+HWY_INLINE Vec<DF> Concat4(const DF df, V v0, V v1, V v2, V v3) {
+  using HF = Half<DF>;
+  return Combine(DF(), Combine(HF(), v3, v2), Combine(HF(), v1, v0));
+}
+
+#endif
+
+// Stores v0[0], v1[0], v0[1], v1[1], ... to mem, in this order. Mem must be
+// aligned.
+template <class DF, class V, typename T>
+void StoreInterleaved(const DF df, V v0, V v1, T* mem) {
+  static_assert(sizeof(T) == 4, "only use StoreInterleaved for 4-byte types");
+#if HWY_TARGET == HWY_SCALAR
+  Store(v0, df, mem);
+  Store(v1, df, mem + 1);
+#elif !HWY_CAP_GE256
+  Store(InterleaveLower(df, v0, v1), df, mem);
+  Store(InterleaveUpper(df, v0, v1), df, mem + Lanes(df));
+#else
+  if (!HWY_CAP_GE512 || Lanes(df) == 8) {
+    auto t0 = InterleaveLower(df, v0, v1);
+    auto t1 = InterleaveUpper(df, v0, v1);
+    Store(ConcatLowerLower(df, t1, t0), df, mem);
+    Store(ConcatUpperUpper(df, t1, t0), df, mem + Lanes(df));
+  } else {
+#if HWY_CAP_GE512
+    auto t0 = InterleaveLower(df, v0, v1);
+    auto t1 = InterleaveUpper(df, v0, v1);
+    Store(Concat4(df, Quarter<0>(df, t0), Quarter<0>(df, t1),
+                  Quarter<1>(df, t0), Quarter<1>(df, t1)),
+          df, mem);
+    Store(Concat4(df, Quarter<2>(df, t0), Quarter<2>(df, t1),
+                  Quarter<3>(df, t0), Quarter<3>(df, t1)),
+          df, mem + Lanes(df));
+#endif
+  }
+#endif
+}
+
+void Upsample2Horizontal(float* JXL_RESTRICT row,
+                         float* JXL_RESTRICT scratch_space, size_t len_out) {
+  HWY_FULL(float) df;
+  auto threefour = Set(df, 0.75f);
+  auto onefour = Set(df, 0.25f);
+  const size_t len_in = (len_out + 1) >> 1;
+  memcpy(scratch_space, row, len_in * sizeof(row[0]));
+  scratch_space[-1] = scratch_space[0];
+  scratch_space[len_in] = scratch_space[len_in - 1];
+  for (size_t x = 0; x < len_in; x += Lanes(df)) {
+    auto current = Mul(Load(df, scratch_space + x), threefour);
+    auto prev = LoadU(df, scratch_space + x - 1);
+    auto next = LoadU(df, scratch_space + x + 1);
+    auto left = MulAdd(onefour, prev, current);
+    auto right = MulAdd(onefour, next, current);
+    StoreInterleaved(df, left, right, row + x * 2);
+  }
+}
+
+void Upsample2Vertical(const float* JXL_RESTRICT row_top,
+                       const float* JXL_RESTRICT row_mid,
+                       const float* JXL_RESTRICT row_bot,
+                       float* JXL_RESTRICT row_out0,
+                       float* JXL_RESTRICT row_out1, size_t len) {
+  HWY_FULL(float) df;
+  auto threefour = Set(df, 0.75f);
+  auto onefour = Set(df, 0.25f);
+  for (size_t x = 0; x < len; x += Lanes(df)) {
+    auto it = Load(df, row_top + x);
+    auto im = Load(df, row_mid + x);
+    auto ib = Load(df, row_bot + x);
+    auto im_scaled = Mul(im, threefour);
+    Store(MulAdd(it, onefour, im_scaled), df, row_out0 + x);
+    Store(MulAdd(ib, onefour, im_scaled), df, row_out1 + x);
+  }
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jpegli
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jpegli {
+
+HWY_EXPORT(Upsample2Horizontal);
+HWY_EXPORT(Upsample2Vertical);
+
+void Upsample2Horizontal(float* JXL_RESTRICT row,
+                         float* JXL_RESTRICT scratch_space, size_t len_out) {
+  return HWY_DYNAMIC_DISPATCH(Upsample2Horizontal)(row, scratch_space, len_out);
+}
+
+void Upsample2Vertical(const float* JXL_RESTRICT row_top,
+                       const float* JXL_RESTRICT row_mid,
+                       const float* JXL_RESTRICT row_bot,
+                       float* JXL_RESTRICT row_out0,
+                       float* JXL_RESTRICT row_out1, size_t len) {
+  return HWY_DYNAMIC_DISPATCH(Upsample2Vertical)(row_top, row_mid, row_bot,
+                                                 row_out0, row_out1, len);
+}
+}  // namespace jpegli
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jpegli/upsample.h b/third_party/jpeg-xl/lib/jpegli/upsample.h
new file mode 100644
index 0000000000..1a057208dc
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jpegli/upsample.h
@@ -0,0 +1,26 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JPEGLI_UPSAMPLE_H_
+#define LIB_JPEGLI_UPSAMPLE_H_
+
+#include <stddef.h>
+
+#include "lib/jxl/base/compiler_specific.h"
+
+namespace jpegli {
+
+void Upsample2Horizontal(float* JXL_RESTRICT row,
+                         float* JXL_RESTRICT scratch_space, size_t len_out);
+
+void Upsample2Vertical(const float* JXL_RESTRICT row_top,
+                       const float* JXL_RESTRICT row_mid,
+                       const float* JXL_RESTRICT row_bot,
+                       float* JXL_RESTRICT row_out0,
+                       float* JXL_RESTRICT row_out1, size_t len);
+
+}  // namespace jpegli
+
+#endif  // LIB_JPEGLI_UPSAMPLE_H_
diff --git a/third_party/jpeg-xl/lib/jxl.cmake b/third_party/jpeg-xl/lib/jxl.cmake
new file mode 100644
index 0000000000..2672cb4c77
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl.cmake
@@ -0,0 +1,329 @@
+# Copyright (c) the JPEG XL Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+include(compatibility.cmake)
+include(jxl_lists.cmake)
+
+if (JPEGXL_ENABLE_TOOLS OR JPEGXL_ENABLE_DEVTOOLS OR JPEGXL_ENABLE_BOXES)
+list(APPEND JPEGXL_INTERNAL_DEC_SOURCES ${JPEGXL_INTERNAL_DEC_BOX_SOURCES})
+endif()
+
+if (JPEGXL_ENABLE_TRANSCODE_JPEG OR JPEGXL_ENABLE_TOOLS OR JPEGXL_ENABLE_DEVTOOLS)
+list(APPEND JPEGXL_INTERNAL_DEC_SOURCES ${JPEGXL_INTERNAL_DEC_JPEG_SOURCES})
+endif()
+
+set_source_files_properties(jxl/enc_fast_lossless.cc PROPERTIES COMPILE_FLAGS -O3)
+
+set(JPEGXL_DEC_INTERNAL_LIBS
+  hwy
+  Threads::Threads
+  ${ATOMICS_LIBRARIES}
+)
+
+if (JPEGXL_ENABLE_TRANSCODE_JPEG OR JPEGXL_ENABLE_BOXES)
+list(APPEND JPEGXL_DEC_INTERNAL_LIBS brotlidec brotlicommon)
+endif()
+
+set(JPEGXL_INTERNAL_LIBS
+  ${JPEGXL_DEC_INTERNAL_LIBS}
+  brotlienc
+)
+
+if (JPEGXL_ENABLE_SKCMS)
+  list(APPEND JPEGXL_INTERNAL_FLAGS -DJPEGXL_ENABLE_SKCMS=1)
+  if (JPEGXL_BUNDLE_SKCMS)
+    list(APPEND JPEGXL_INTERNAL_FLAGS -DJPEGXL_BUNDLE_SKCMS=1)
+    # skcms objects are later added to JPEGXL_INTERNAL_OBJECTS
+  else ()
+    list(APPEND JPEGXL_INTERNAL_LIBS skcms)
+  endif ()
+else ()
+  list(APPEND JPEGXL_INTERNAL_LIBS lcms2)
+endif ()
+
+if (JPEGXL_ENABLE_TRANSCODE_JPEG)
+  list(APPEND JPEGXL_INTERNAL_FLAGS -DJPEGXL_ENABLE_TRANSCODE_JPEG=1)
+else()
+  list(APPEND JPEGXL_INTERNAL_FLAGS -DJPEGXL_ENABLE_TRANSCODE_JPEG=0)
+endif ()
+
+if (JPEGXL_ENABLE_BOXES)
+  list(APPEND JPEGXL_INTERNAL_FLAGS -DJPEGXL_ENABLE_BOXES=1)
+else()
+  list(APPEND JPEGXL_INTERNAL_FLAGS -DJPEGXL_ENABLE_BOXES=0)
+endif ()
+
+set(OBJ_COMPILE_DEFINITIONS
+  JPEGXL_MAJOR_VERSION=${JPEGXL_MAJOR_VERSION}
+  JPEGXL_MINOR_VERSION=${JPEGXL_MINOR_VERSION}
+  JPEGXL_PATCH_VERSION=${JPEGXL_PATCH_VERSION}
+  # Used to determine if we are building the library when defined or just
+  # including the library when not defined. This is public so libjxl shared
+  # library gets this define too.
+  JXL_INTERNAL_LIBRARY_BUILD
+)
+
+# Generate version.h
+configure_file("jxl/version.h.in" "include/jxl/version.h")
+
+# Headers for exporting/importing public headers
+include(GenerateExportHeader)
+
+# CMake does not allow generate_export_header for INTERFACE library, so we
+# add this stub library just for file generation.
+add_library(jxl_export OBJECT ${JPEGXL_INTERNAL_PUBLIC_HEADERS})
+set_target_properties(jxl_export PROPERTIES
+  CXX_VISIBILITY_PRESET hidden
+  VISIBILITY_INLINES_HIDDEN 1
+  DEFINE_SYMBOL JXL_INTERNAL_LIBRARY_BUILD
+  LINKER_LANGUAGE CXX
+)
+generate_export_header(jxl_export
+  BASE_NAME JXL
+  EXPORT_FILE_NAME include/jxl/jxl_export.h)
+# Place all public headers in a single directory.
+foreach(path ${JPEGXL_INTERNAL_PUBLIC_HEADERS})
+  configure_file(
+    ${path}
+    ${path}
+    COPYONLY
+  )
+endforeach()
+
+add_library(jxl_includes INTERFACE)
+target_include_directories(jxl_includes SYSTEM INTERFACE
+  "$<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}/include>"
+)
+add_dependencies(jxl_includes jxl_export)
+
+# Base headers / utilities.
+add_library(jxl_base-obj OBJECT ${JPEGXL_INTERNAL_BASE_SOURCES})
+target_compile_options(jxl_base-obj PRIVATE ${JPEGXL_INTERNAL_FLAGS})
+target_compile_options(jxl_base-obj PUBLIC ${JPEGXL_COVERAGE_FLAGS})
+set_property(TARGET jxl_base-obj PROPERTY POSITION_INDEPENDENT_CODE ON)
+target_include_directories(jxl_base-obj PUBLIC
+  ${PROJECT_SOURCE_DIR}
+  ${JXL_HWY_INCLUDE_DIRS}
+)
+
+jxl_link_libraries(jxl_base-obj jxl_includes)
+
+if(JPEGXL_ENABLE_PROFILER)
+  target_compile_definitions(jxl_base-obj PUBLIC -DJXL_PROFILER_ENABLED=1)
+endif()
+
+# Decoder-only object library
+add_library(jxl_dec-obj OBJECT ${JPEGXL_INTERNAL_DEC_SOURCES})
+target_compile_options(jxl_dec-obj PRIVATE ${JPEGXL_INTERNAL_FLAGS})
+target_compile_options(jxl_dec-obj PUBLIC ${JPEGXL_COVERAGE_FLAGS})
+set_property(TARGET jxl_dec-obj PROPERTY POSITION_INDEPENDENT_CODE ON)
+target_include_directories(jxl_dec-obj PUBLIC
+  "$<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}>"
+  "${JXL_HWY_INCLUDE_DIRS}"
+  "$<BUILD_INTERFACE:$<TARGET_PROPERTY:brotlicommon,INTERFACE_INCLUDE_DIRECTORIES>>"
+)
+target_compile_definitions(jxl_dec-obj PUBLIC
+  ${OBJ_COMPILE_DEFINITIONS}
+)
+jxl_link_libraries(jxl_dec-obj jxl_base-obj)
+
+# Object library. This is used to hold the set of objects and properties.
+add_library(jxl_enc-obj OBJECT ${JPEGXL_INTERNAL_ENC_SOURCES})
+target_compile_options(jxl_enc-obj PRIVATE ${JPEGXL_INTERNAL_FLAGS})
+target_compile_options(jxl_enc-obj PUBLIC ${JPEGXL_COVERAGE_FLAGS})
+set_property(TARGET jxl_enc-obj PROPERTY POSITION_INDEPENDENT_CODE ON)
+target_include_directories(jxl_enc-obj PUBLIC
+  ${PROJECT_SOURCE_DIR}
+  ${JXL_HWY_INCLUDE_DIRS}
+  $<TARGET_PROPERTY:brotlicommon,INTERFACE_INCLUDE_DIRECTORIES>
+)
+target_compile_definitions(jxl_enc-obj PUBLIC
+  ${OBJ_COMPILE_DEFINITIONS}
+)
+jxl_link_libraries(jxl_enc-obj jxl_base-obj)
+
+#TODO(lode): don't depend on CMS for the core library
+if (JPEGXL_ENABLE_SKCMS)
+  target_include_directories(jxl_enc-obj PRIVATE
+    $<TARGET_PROPERTY:skcms,INCLUDE_DIRECTORIES>
+  )
+else ()
+  target_include_directories(jxl_enc-obj PRIVATE
+    $<TARGET_PROPERTY:lcms2,INCLUDE_DIRECTORIES>
+  )
+endif ()
+
+set_target_properties(jxl_dec-obj PROPERTIES
+  CXX_VISIBILITY_PRESET hidden
+  VISIBILITY_INLINES_HIDDEN 1
+  DEFINE_SYMBOL JXL_INTERNAL_LIBRARY_BUILD
+)
+
+set_target_properties(jxl_enc-obj PROPERTIES
+  CXX_VISIBILITY_PRESET hidden
+  VISIBILITY_INLINES_HIDDEN 1
+  DEFINE_SYMBOL JXL_INTERNAL_LIBRARY_BUILD
+)
+
+# Private static library. This exposes all the internal functions and is used
+# for tests.
+add_library(jxl_dec-static STATIC
+  $<TARGET_OBJECTS:jxl_base-obj>
+  $<TARGET_OBJECTS:jxl_dec-obj>
+)
+target_link_libraries(jxl_dec-static
+  PUBLIC ${JPEGXL_COVERAGE_FLAGS} ${JPEGXL_DEC_INTERNAL_LIBS} jxl_includes)
+
+# The list of objects in the static and shared libraries.
+set(JPEGXL_INTERNAL_OBJECTS
+  $<TARGET_OBJECTS:jxl_base-obj>
+  $<TARGET_OBJECTS:jxl_enc-obj>
+  $<TARGET_OBJECTS:jxl_dec-obj>
+)
+if (JPEGXL_ENABLE_SKCMS AND JPEGXL_BUNDLE_SKCMS)
+  list(APPEND JPEGXL_INTERNAL_OBJECTS $<TARGET_OBJECTS:skcms-obj>)
+endif()
+
+# Private static library. This exposes all the internal functions and is used
+# for tests.
+# TODO(lode): once the source files are correctly split so that it is possible
+# to do, remove $<TARGET_OBJECTS:jxl_dec-obj> here and depend on jxl_dec-static
+add_library(jxl-static STATIC ${JPEGXL_INTERNAL_OBJECTS})
+target_link_libraries(jxl-static
+  PUBLIC ${JPEGXL_COVERAGE_FLAGS} ${JPEGXL_INTERNAL_LIBS} jxl_includes)
+target_include_directories(jxl-static PUBLIC
+  "$<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}>")
+
+# JXL_EXPORT is defined to "__declspec(dllimport)" automatically by CMake
+# in Windows builds when including headers from the C API and compiling from
+# outside the jxl library. This is required when using the shared library,
+# however in windows this causes the function to not be found when linking
+# against the static library. This define JXL_EXPORT= here forces it to not
+# use dllimport in tests and other tools that require the static library.
+target_compile_definitions(jxl-static INTERFACE -DJXL_EXPORT=)
+target_compile_definitions(jxl_dec-static INTERFACE -DJXL_EXPORT=)
+
+# TODO(deymo): Move TCMalloc linkage to the tools/ directory since the library
+# shouldn't do any allocs anyway.
+if(JPEGXL_ENABLE_TCMALLOC)
+  pkg_check_modules(TCMallocMinimal REQUIRED IMPORTED_TARGET
+      libtcmalloc_minimal)
+  # tcmalloc 2.8 has concurrency issues that makes it sometimes return nullptr
+  # for large allocs. See https://github.com/gperftools/gperftools/issues/1204
+  # for details.
+  if(TCMallocMinimal_VERSION VERSION_EQUAL 2.8)
+    message(FATAL_ERROR
+        "tcmalloc version 2.8 has a concurrency bug. You have installed "
+        "version ${TCMallocMinimal_VERSION}, please either downgrade tcmalloc "
+        "to version 2.7, upgrade to 2.8.1 or newer or pass "
+        "-DJPEGXL_ENABLE_TCMALLOC=OFF to jpeg-xl cmake line. See the following "
+        "bug for details:\n"
+        "   https://github.com/gperftools/gperftools/issues/1204\n")
+  endif()
+  target_link_libraries(jxl-static PUBLIC PkgConfig::TCMallocMinimal)
+endif()  # JPEGXL_ENABLE_TCMALLOC
+
+# Install the static library too, but as jxl.a file without the -static except
+# in Windows.
+if (NOT WIN32 OR MINGW)
+  set_target_properties(jxl-static PROPERTIES OUTPUT_NAME "jxl")
+  set_target_properties(jxl_dec-static PROPERTIES OUTPUT_NAME "jxl_dec")
+endif()
+install(TARGETS jxl-static DESTINATION ${CMAKE_INSTALL_LIBDIR})
+install(TARGETS jxl_dec-static DESTINATION ${CMAKE_INSTALL_LIBDIR})
+
+if (BUILD_SHARED_LIBS)
+
+# Public shared library.
+add_library(jxl SHARED ${JPEGXL_INTERNAL_OBJECTS})
+strip_static(JPEGXL_INTERNAL_SHARED_LIBS JPEGXL_INTERNAL_LIBS)
+target_link_libraries(jxl PUBLIC ${JPEGXL_COVERAGE_FLAGS} jxl_includes)
+target_link_libraries(jxl PRIVATE ${JPEGXL_INTERNAL_SHARED_LIBS})
+# Shared library include path contains only the "include/" paths.
+set_target_properties(jxl PROPERTIES
+  VERSION ${JPEGXL_LIBRARY_VERSION}
+  SOVERSION ${JPEGXL_LIBRARY_SOVERSION}
+  LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
+  RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}")
+
+# Public shared decoder library.
+add_library(jxl_dec SHARED $<TARGET_OBJECTS:jxl_base-obj> $<TARGET_OBJECTS:jxl_dec-obj>)
+strip_static(JPEGXL_DEC_INTERNAL_SHARED_LIBS JPEGXL_DEC_INTERNAL_LIBS)
+target_link_libraries(jxl_dec PUBLIC ${JPEGXL_COVERAGE_FLAGS} jxl_includes)
+target_link_libraries(jxl_dec PRIVATE ${JPEGXL_DEC_INTERNAL_SHARED_LIBS})
+# Shared library include path contains only the "include/" paths.
+set_target_properties(jxl_dec PROPERTIES
+  VERSION ${JPEGXL_LIBRARY_VERSION}
+  SOVERSION ${JPEGXL_LIBRARY_SOVERSION}
+  LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
+  RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}")
+
+# Check whether the linker support excluding libs
+set(LINKER_EXCLUDE_LIBS_FLAG "-Wl,--exclude-libs=ALL")
+include(CheckCSourceCompiles)
+list(APPEND CMAKE_EXE_LINKER_FLAGS ${LINKER_EXCLUDE_LIBS_FLAG})
+check_c_source_compiles("int main(){return 0;}" LINKER_SUPPORT_EXCLUDE_LIBS)
+list(REMOVE_ITEM CMAKE_EXE_LINKER_FLAGS ${LINKER_EXCLUDE_LIBS_FLAG})
+
+# Add a jxl.version file as a version script to tag symbols with the
+# appropriate version number. This script is also used to limit what's exposed
+# in the shared library from the static dependencies bundled here.
+foreach(target IN ITEMS jxl jxl_dec)
+  set_target_properties(${target} PROPERTIES
+      LINK_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/jxl/jxl.version)
+  if(APPLE)
+  set_property(TARGET ${target} APPEND_STRING PROPERTY
+      LINK_FLAGS "-Wl,-exported_symbols_list,${CMAKE_CURRENT_SOURCE_DIR}/jxl/jxl_osx.syms")
+  elseif(WIN32)
+    # Nothing needed here, we use __declspec(dllexport) (jxl_export.h)
+  else()
+  set_property(TARGET ${target} APPEND_STRING PROPERTY
+      LINK_FLAGS " -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/jxl/jxl.version")
+  endif()  # APPLE
+  # This hides the default visibility symbols from static libraries bundled into
+  # the shared library. In particular this prevents exposing symbols from hwy
+  # and skcms in the shared library.
+  if(LINKER_SUPPORT_EXCLUDE_LIBS)
+    set_property(TARGET ${target} APPEND_STRING PROPERTY
+        LINK_FLAGS " ${LINKER_EXCLUDE_LIBS_FLAG}")
+  endif()
+endforeach()
+
+# Only install libjxl shared library. The libjxl_dec is not installed since it
+# contains symbols also in libjxl which would conflict if programs try to use
+# both.
+install(TARGETS jxl
+  RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
+  LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+  ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR})
+else()
+add_library(jxl ALIAS jxl-static)
+add_library(jxl_dec ALIAS jxl_dec-static)
+endif()  # BUILD_SHARED_LIBS
+
+# Add a pkg-config file for libjxl.
+set(JPEGXL_LIBRARY_REQUIRES
+    "libhwy libbrotlienc libbrotlidec")
+if(NOT JPEGXL_ENABLE_SKCMS)
+  set(JPEGXL_LIBRARY_REQUIRES "${JPEGXL_LIBRARY_REQUIRES} lcms2")
+endif()
+
+# Allow adding prefix if CMAKE_INSTALL_INCLUDEDIR not absolute.
+if(IS_ABSOLUTE "${CMAKE_INSTALL_INCLUDEDIR}")
+    set(PKGCONFIG_TARGET_INCLUDES "${CMAKE_INSTALL_INCLUDEDIR}")
+else()
+    set(PKGCONFIG_TARGET_INCLUDES "\${prefix}/${CMAKE_INSTALL_INCLUDEDIR}")
+endif()
+# Allow adding prefix if CMAKE_INSTALL_LIBDIR not absolute.
+if(IS_ABSOLUTE "${CMAKE_INSTALL_LIBDIR}")
+    set(PKGCONFIG_TARGET_LIBS "${CMAKE_INSTALL_LIBDIR}")
+else()
+    set(PKGCONFIG_TARGET_LIBS "\${exec_prefix}/${CMAKE_INSTALL_LIBDIR}")
+endif()
+
+configure_file("${CMAKE_CURRENT_SOURCE_DIR}/jxl/libjxl.pc.in"
+               "libjxl.pc" @ONLY)
+install(FILES "${CMAKE_CURRENT_BINARY_DIR}/libjxl.pc"
+  DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig")
diff --git a/third_party/jpeg-xl/lib/jxl/ac_context.h b/third_party/jpeg-xl/lib/jxl/ac_context.h
new file mode 100644
index 0000000000..a2b9e046d1
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/ac_context.h
@@ -0,0 +1,149 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_AC_CONTEXT_H_
+#define LIB_JXL_AC_CONTEXT_H_
+
+#include <algorithm>
+#include <vector>
+
+#include "lib/jxl/base/bits.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/coeff_order_fwd.h"
+
+namespace jxl {
+
+// Block context used for scanning order, number of non-zeros, AC coefficients.
+// Equal to the channel.
+constexpr uint32_t kDCTOrderContextStart = 0;
+
+// The number of predicted nonzeros goes from 0 to 1008. We use
+// ceil(log2(predicted+1)) as a context for the number of nonzeros, so from 0 to
+// 10, inclusive.
+constexpr uint32_t kNonZeroBuckets = 37;
+
+static const uint16_t kCoeffFreqContext[64] = {
+    0xBAD, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14,
+    15,    15, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22,
+    23,    23, 23, 23, 24, 24, 24, 24, 25, 25, 25, 25, 26, 26, 26, 26,
+    27,    27, 27, 27, 28, 28, 28, 28, 29, 29, 29, 29, 30, 30, 30, 30,
+};
+
+static const uint16_t kCoeffNumNonzeroContext[64] = {
+    0xBAD, 0,   31,  62,  62,  93,  93,  93,  93,  123, 123, 123, 123,
+    152,   152, 152, 152, 152, 152, 152, 152, 180, 180, 180, 180, 180,
+    180,   180, 180, 180, 180, 180, 180, 206, 206, 206, 206, 206, 206,
+    206,   206, 206, 206, 206, 206, 206, 206, 206, 206, 206, 206, 206,
+    206,   206, 206, 206, 206, 206, 206, 206, 206, 206, 206, 206,
+};
+
+// Supremum of ZeroDensityContext(x, y) + 1, when x + y < 64.
+constexpr int kZeroDensityContextCount = 458;
+// Supremum of ZeroDensityContext(x, y) + 1.
+constexpr int kZeroDensityContextLimit = 474;
+
+/* This function is used for entropy-sources pre-clustering.
+ *
+ * Ideally, each combination of |nonzeros_left| and |k| should go to its own
+ * bucket; but it implies (64 * 63 / 2) == 2016 buckets. If there is other
+ * dimension (e.g. block context), then number of primary clusters becomes too
+ * big.
+ *
+ * To solve this problem, |nonzeros_left| and |k| values are clustered. It is
+ * known that their sum is at most 64, consequently, the total number buckets
+ * is at most A(64) * B(64).
+ */
+// TODO(user): investigate, why disabling pre-clustering makes entropy code
+// less dense. Perhaps we would need to add HQ clustering algorithm that would
+// be able to squeeze better by spending more CPU cycles.
+static JXL_INLINE size_t ZeroDensityContext(size_t nonzeros_left, size_t k,
+                                            size_t covered_blocks,
+                                            size_t log2_covered_blocks,
+                                            size_t prev) {
+  JXL_DASSERT((1u << log2_covered_blocks) == covered_blocks);
+  nonzeros_left = (nonzeros_left + covered_blocks - 1) >> log2_covered_blocks;
+  k >>= log2_covered_blocks;
+  JXL_DASSERT(k > 0);
+  JXL_DASSERT(k < 64);
+  JXL_DASSERT(nonzeros_left > 0);
+  // Asserting nonzeros_left + k < 65 here causes crashes in debug mode with
+  // invalid input, since the (hot) decoding loop does not check this condition.
+  // As no out-of-bound memory reads are issued even if that condition is
+  // broken, we check this simpler condition which holds anyway. The decoder
+  // will still mark a file in which that condition happens as not valid at the
+  // end of the decoding loop, as `nzeros` will not be `0`.
+  JXL_DASSERT(nonzeros_left < 64);
+  return (kCoeffNumNonzeroContext[nonzeros_left] + kCoeffFreqContext[k]) * 2 +
+         prev;
+}
+
+struct BlockCtxMap {
+  std::vector<int> dc_thresholds[3];
+  std::vector<uint32_t> qf_thresholds;
+  std::vector<uint8_t> ctx_map;
+  size_t num_ctxs, num_dc_ctxs;
+
+  static constexpr uint8_t kDefaultCtxMap[] = {
+      // Default ctx map clusters all the large transforms together.
+      0, 1, 2, 2, 3,  3,  4,  5,  6,  6,  6,  6,  6,   //
+      7, 8, 9, 9, 10, 11, 12, 13, 14, 14, 14, 14, 14,  //
+      7, 8, 9, 9, 10, 11, 12, 13, 14, 14, 14, 14, 14,  //
+  };
+  static_assert(3 * kNumOrders ==
+                    sizeof(kDefaultCtxMap) / sizeof *kDefaultCtxMap,
+                "Update default context map");
+
+  size_t Context(int dc_idx, uint32_t qf, size_t ord, size_t c) const {
+    size_t qf_idx = 0;
+    for (uint32_t t : qf_thresholds) {
+      if (qf > t) qf_idx++;
+    }
+    size_t idx = c < 2 ? c ^ 1 : 2;
+    idx = idx * kNumOrders + ord;
+    idx = idx * (qf_thresholds.size() + 1) + qf_idx;
+    idx = idx * num_dc_ctxs + dc_idx;
+    return ctx_map[idx];
+  }
+  // Non-zero context is based on number of non-zeros and block context.
+  // For better clustering, contexts with same number of non-zeros are grouped.
+  constexpr uint32_t ZeroDensityContextsOffset(uint32_t block_ctx) const {
+    return num_ctxs * kNonZeroBuckets + kZeroDensityContextCount * block_ctx;
+  }
+
+  // Context map for AC coefficients consists of 2 blocks:
+  //  |num_ctxs x                : context for number of non-zeros in the block
+  //   kNonZeroBuckets|            computed from block context and predicted
+  //                               value (based top and left values)
+  //  |num_ctxs x                : context for AC coefficient symbols,
+  //   kZeroDensityContextCount|   computed from block context,
+  //                               number of non-zeros left and
+  //                               index in scan order
+  constexpr uint32_t NumACContexts() const {
+    return num_ctxs * (kNonZeroBuckets + kZeroDensityContextCount);
+  }
+
+  // Non-zero context is based on number of non-zeros and block context.
+  // For better clustering, contexts with same number of non-zeros are grouped.
+  inline uint32_t NonZeroContext(uint32_t non_zeros, uint32_t block_ctx) const {
+    uint32_t ctx;
+    if (non_zeros >= 64) non_zeros = 64;
+    if (non_zeros < 8) {
+      ctx = non_zeros;
+    } else {
+      ctx = 4 + non_zeros / 2;
+    }
+    return ctx * num_ctxs + block_ctx;
+  }
+
+  BlockCtxMap() {
+    ctx_map.assign(std::begin(kDefaultCtxMap), std::end(kDefaultCtxMap));
+    num_ctxs = *std::max_element(ctx_map.begin(), ctx_map.end()) + 1;
+    num_dc_ctxs = 1;
+  }
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_AC_CONTEXT_H_
diff --git a/third_party/jpeg-xl/lib/jxl/ac_strategy.cc b/third_party/jpeg-xl/lib/jxl/ac_strategy.cc
new file mode 100644
index 0000000000..ada3bcb6f5
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/ac_strategy.cc
@@ -0,0 +1,108 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/ac_strategy.h"
+
+#include <string.h>
+
+#include <algorithm>
+#include <numeric>  // iota
+#include <type_traits>
+#include <utility>
+
+#include "lib/jxl/base/bits.h"
+#include "lib/jxl/base/profiler.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/image_ops.h"
+
+namespace jxl {
+
+// Tries to generalize zig-zag order to non-square blocks. Surprisingly, in
+// square block frequency along the (i + j == const) diagonals is roughly the
+// same. For historical reasons, consecutive diagonals are traversed
+// in alternating directions - so called "zig-zag" (or "snake") order.
+template <bool is_lut>
+static void CoeffOrderAndLut(AcStrategy acs, coeff_order_t* out) {
+  size_t cx = acs.covered_blocks_x();
+  size_t cy = acs.covered_blocks_y();
+  CoefficientLayout(&cy, &cx);
+
+  // CoefficientLayout ensures cx >= cy.
+  // We compute the zigzag order for a cx x cx block, then discard all the
+  // lines that are not multiple of the ratio between cx and cy.
+  size_t xs = cx / cy;
+  size_t xsm = xs - 1;
+  size_t xss = CeilLog2Nonzero(xs);
+  // First half of the block
+  size_t cur = cx * cy;
+  for (size_t i = 0; i < cx * kBlockDim; i++) {
+    for (size_t j = 0; j <= i; j++) {
+      size_t x = j;
+      size_t y = i - j;
+      if (i % 2) std::swap(x, y);
+      if ((y & xsm) != 0) continue;
+      y >>= xss;
+      size_t val = 0;
+      if (x < cx && y < cy) {
+        val = y * cx + x;
+      } else {
+        val = cur++;
+      }
+      if (is_lut) {
+        out[y * cx * kBlockDim + x] = val;
+      } else {
+        out[val] = y * cx * kBlockDim + x;
+      }
+    }
+  }
+  // Second half
+  for (size_t ip = cx * kBlockDim - 1; ip > 0; ip--) {
+    size_t i = ip - 1;
+    for (size_t j = 0; j <= i; j++) {
+      size_t x = cx * kBlockDim - 1 - (i - j);
+      size_t y = cx * kBlockDim - 1 - j;
+      if (i % 2) std::swap(x, y);
+      if ((y & xsm) != 0) continue;
+      y >>= xss;
+      size_t val = cur++;
+      if (is_lut) {
+        out[y * cx * kBlockDim + x] = val;
+      } else {
+        out[val] = y * cx * kBlockDim + x;
+      }
+    }
+  }
+}
+
+void AcStrategy::ComputeNaturalCoeffOrder(coeff_order_t* order) const {
+  CoeffOrderAndLut</*is_lut=*/false>(*this, order);
+}
+void AcStrategy::ComputeNaturalCoeffOrderLut(coeff_order_t* lut) const {
+  CoeffOrderAndLut</*is_lut=*/true>(*this, lut);
+}
+
+// These definitions are needed before C++17.
+constexpr size_t AcStrategy::kMaxCoeffBlocks;
+constexpr size_t AcStrategy::kMaxBlockDim;
+constexpr size_t AcStrategy::kMaxCoeffArea;
+
+AcStrategyImage::AcStrategyImage(size_t xsize, size_t ysize)
+    : layers_(xsize, ysize) {
+  row_ = layers_.Row(0);
+  stride_ = layers_.PixelsPerRow();
+}
+
+size_t AcStrategyImage::CountBlocks(AcStrategy::Type type) const {
+  size_t ret = 0;
+  for (size_t y = 0; y < layers_.ysize(); y++) {
+    const uint8_t* JXL_RESTRICT row = layers_.ConstRow(y);
+    for (size_t x = 0; x < layers_.xsize(); x++) {
+      if (row[x] == ((static_cast<uint8_t>(type) << 1) | 1)) ret++;
+    }
+  }
+  return ret;
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/ac_strategy.h b/third_party/jpeg-xl/lib/jxl/ac_strategy.h
new file mode 100644
index 0000000000..7d21167e6e
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/ac_strategy.h
@@ -0,0 +1,261 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_AC_STRATEGY_H_
+#define LIB_JXL_AC_STRATEGY_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <hwy/base.h>  // kMaxVectorSize
+
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/coeff_order_fwd.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/image_ops.h"
+
+// Defines the different kinds of transforms, and heuristics to choose between
+// them.
+// `AcStrategy` represents what transform should be used, and which sub-block of
+// that transform we are currently in. Note that DCT4x4 is applied on all four
+// 4x4 sub-blocks of an 8x8 block.
+// `AcStrategyImage` defines which strategy should be used for each 8x8 block
+// of the image. The highest 4 bits represent the strategy to be used, the
+// lowest 4 represent the index of the block inside that strategy.
+
+namespace jxl {
+
+class AcStrategy {
+ public:
+  // Extremal values for the number of blocks/coefficients of a single strategy.
+  static constexpr size_t kMaxCoeffBlocks = 32;
+  static constexpr size_t kMaxBlockDim = kBlockDim * kMaxCoeffBlocks;
+  // Maximum number of coefficients in a block. Guaranteed to be a multiple of
+  // the vector size.
+  static constexpr size_t kMaxCoeffArea = kMaxBlockDim * kMaxBlockDim;
+  static_assert((kMaxCoeffArea * sizeof(float)) % hwy::kMaxVectorSize == 0,
+                "Coefficient area is not a multiple of vector size");
+
+  // Raw strategy types.
+  enum Type : uint32_t {
+    // Regular block size DCT
+    DCT = 0,
+    // Encode pixels without transforming
+    IDENTITY = 1,
+    // Use 2-by-2 DCT
+    DCT2X2 = 2,
+    // Use 4-by-4 DCT
+    DCT4X4 = 3,
+    // Use 16-by-16 DCT
+    DCT16X16 = 4,
+    // Use 32-by-32 DCT
+    DCT32X32 = 5,
+    // Use 16-by-8 DCT
+    DCT16X8 = 6,
+    // Use 8-by-16 DCT
+    DCT8X16 = 7,
+    // Use 32-by-8 DCT
+    DCT32X8 = 8,
+    // Use 8-by-32 DCT
+    DCT8X32 = 9,
+    // Use 32-by-16 DCT
+    DCT32X16 = 10,
+    // Use 16-by-32 DCT
+    DCT16X32 = 11,
+    // 4x8 and 8x4 DCT
+    DCT4X8 = 12,
+    DCT8X4 = 13,
+    // Corner-DCT.
+    AFV0 = 14,
+    AFV1 = 15,
+    AFV2 = 16,
+    AFV3 = 17,
+    // Larger DCTs
+    DCT64X64 = 18,
+    DCT64X32 = 19,
+    DCT32X64 = 20,
+    DCT128X128 = 21,
+    DCT128X64 = 22,
+    DCT64X128 = 23,
+    DCT256X256 = 24,
+    DCT256X128 = 25,
+    DCT128X256 = 26,
+    // Marker for num of valid strategies.
+    kNumValidStrategies
+  };
+
+  static constexpr uint32_t TypeBit(const Type type) {
+    return 1u << static_cast<uint32_t>(type);
+  }
+
+  // Returns true if this block is the first 8x8 block (i.e. top-left) of a
+  // possibly multi-block strategy.
+  JXL_INLINE bool IsFirstBlock() const { return is_first_; }
+
+  JXL_INLINE bool IsMultiblock() const {
+    constexpr uint32_t bits =
+        TypeBit(Type::DCT16X16) | TypeBit(Type::DCT32X32) |
+        TypeBit(Type::DCT16X8) | TypeBit(Type::DCT8X16) |
+        TypeBit(Type::DCT32X8) | TypeBit(Type::DCT8X32) |
+        TypeBit(Type::DCT16X32) | TypeBit(Type::DCT32X16) |
+        TypeBit(Type::DCT32X64) | TypeBit(Type::DCT64X32) |
+        TypeBit(Type::DCT64X64) | TypeBit(DCT64X128) | TypeBit(DCT128X64) |
+        TypeBit(DCT128X128) | TypeBit(DCT128X256) | TypeBit(DCT256X128) |
+        TypeBit(DCT256X256);
+    JXL_DASSERT(Strategy() < kNumValidStrategies);
+    return ((1u << static_cast<uint32_t>(Strategy())) & bits) != 0;
+  }
+
+  // Returns the raw strategy value. Should only be used for tokenization.
+  JXL_INLINE uint8_t RawStrategy() const {
+    return static_cast<uint8_t>(strategy_);
+  }
+
+  JXL_INLINE Type Strategy() const { return strategy_; }
+
+  // Inverse check
+  static JXL_INLINE constexpr bool IsRawStrategyValid(int raw_strategy) {
+    return raw_strategy < static_cast<int32_t>(kNumValidStrategies) &&
+           raw_strategy >= 0;
+  }
+  static JXL_INLINE AcStrategy FromRawStrategy(uint8_t raw_strategy) {
+    return FromRawStrategy(static_cast<Type>(raw_strategy));
+  }
+  static JXL_INLINE AcStrategy FromRawStrategy(Type raw_strategy) {
+    JXL_DASSERT(IsRawStrategyValid(static_cast<uint32_t>(raw_strategy)));
+    return AcStrategy(raw_strategy, /*is_first=*/true);
+  }
+
+  // "Natural order" means the order of increasing of "anisotropic" frequency of
+  // continuous version of DCT basis.
+  // Round-trip, for any given strategy s:
+  //  X = NaturalCoeffOrder(s)[NaturalCoeffOrderLutN(s)[X]]
+  //  X = NaturalCoeffOrderLut(s)[NaturalCoeffOrderN(s)[X]]
+  void ComputeNaturalCoeffOrder(coeff_order_t* order) const;
+  void ComputeNaturalCoeffOrderLut(coeff_order_t* lut) const;
+
+  // Number of 8x8 blocks that this strategy will cover. 0 for non-top-left
+  // blocks inside a multi-block transform.
+  JXL_INLINE size_t covered_blocks_x() const {
+    static constexpr uint8_t kLut[] = {1, 1, 1, 1,  2, 4,  1,  2,  1,
+                                       4, 2, 4, 1,  1, 1,  1,  1,  1,
+                                       8, 4, 8, 16, 8, 16, 32, 16, 32};
+    static_assert(sizeof(kLut) / sizeof(*kLut) == kNumValidStrategies,
+                  "Update LUT");
+    return kLut[size_t(strategy_)];
+  }
+
+  JXL_INLINE size_t covered_blocks_y() const {
+    static constexpr uint8_t kLut[] = {1, 1, 1, 1,  2,  4, 2,  1,  4,
+                                       1, 4, 2, 1,  1,  1, 1,  1,  1,
+                                       8, 8, 4, 16, 16, 8, 32, 32, 16};
+    static_assert(sizeof(kLut) / sizeof(*kLut) == kNumValidStrategies,
+                  "Update LUT");
+    return kLut[size_t(strategy_)];
+  }
+
+  JXL_INLINE size_t log2_covered_blocks() const {
+    static constexpr uint8_t kLut[] = {0, 0, 0, 0, 2, 4, 1,  1, 2,
+                                       2, 3, 3, 0, 0, 0, 0,  0, 0,
+                                       6, 5, 5, 8, 7, 7, 10, 9, 9};
+    static_assert(sizeof(kLut) / sizeof(*kLut) == kNumValidStrategies,
+                  "Update LUT");
+    return kLut[size_t(strategy_)];
+  }
+
+ private:
+  friend class AcStrategyRow;
+  JXL_INLINE AcStrategy(Type strategy, bool is_first)
+      : strategy_(strategy), is_first_(is_first) {
+    JXL_DASSERT(IsMultiblock() || is_first == true);
+  }
+
+  Type strategy_;
+  bool is_first_;
+};
+
+// Class to use a certain row of the AC strategy.
+class AcStrategyRow {
+ public:
+  explicit AcStrategyRow(const uint8_t* row) : row_(row) {}
+  AcStrategy operator[](size_t x) const {
+    return AcStrategy(static_cast<AcStrategy::Type>(row_[x] >> 1), row_[x] & 1);
+  }
+
+ private:
+  const uint8_t* JXL_RESTRICT row_;
+};
+
+class AcStrategyImage {
+ public:
+  AcStrategyImage() = default;
+  AcStrategyImage(size_t xsize, size_t ysize);
+  AcStrategyImage(AcStrategyImage&&) = default;
+  AcStrategyImage& operator=(AcStrategyImage&&) = default;
+
+  void FillDCT8(const Rect& rect) {
+    FillPlane<uint8_t>((static_cast<uint8_t>(AcStrategy::Type::DCT) << 1) | 1,
+                       &layers_, rect);
+  }
+  void FillDCT8() { FillDCT8(Rect(layers_)); }
+
+  void FillInvalid() { FillImage(INVALID, &layers_); }
+
+  void Set(size_t x, size_t y, AcStrategy::Type type) {
+#if JXL_ENABLE_ASSERT
+    AcStrategy acs = AcStrategy::FromRawStrategy(type);
+#endif  // JXL_ENABLE_ASSERT
+    JXL_ASSERT(y + acs.covered_blocks_y() <= layers_.ysize());
+    JXL_ASSERT(x + acs.covered_blocks_x() <= layers_.xsize());
+    JXL_CHECK(SetNoBoundsCheck(x, y, type, /*check=*/false));
+  }
+
+  Status SetNoBoundsCheck(size_t x, size_t y, AcStrategy::Type type,
+                          bool check = true) {
+    AcStrategy acs = AcStrategy::FromRawStrategy(type);
+    for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) {
+      for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) {
+        size_t pos = (y + iy) * stride_ + x + ix;
+        if (check && row_[pos] != INVALID) {
+          return JXL_FAILURE("Invalid AC strategy: block overlap");
+        }
+        row_[pos] =
+            (static_cast<uint8_t>(type) << 1) | ((iy | ix) == 0 ? 1 : 0);
+      }
+    }
+    return true;
+  }
+
+  bool IsValid(size_t x, size_t y) { return row_[y * stride_ + x] != INVALID; }
+
+  AcStrategyRow ConstRow(size_t y, size_t x_prefix = 0) const {
+    return AcStrategyRow(layers_.ConstRow(y) + x_prefix);
+  }
+
+  AcStrategyRow ConstRow(const Rect& rect, size_t y) const {
+    return ConstRow(rect.y0() + y, rect.x0());
+  }
+
+  size_t PixelsPerRow() const { return layers_.PixelsPerRow(); }
+
+  size_t xsize() const { return layers_.xsize(); }
+  size_t ysize() const { return layers_.ysize(); }
+
+  // Count the number of blocks of a given type.
+  size_t CountBlocks(AcStrategy::Type type) const;
+
+ private:
+  ImageB layers_;
+  uint8_t* JXL_RESTRICT row_;
+  size_t stride_;
+
+  // A value that does not represent a valid combined AC strategy
+  // value. Used as a sentinel.
+  static constexpr uint8_t INVALID = 0xFF;
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_AC_STRATEGY_H_
diff --git a/third_party/jpeg-xl/lib/jxl/ac_strategy_test.cc b/third_party/jpeg-xl/lib/jxl/ac_strategy_test.cc
new file mode 100644
index 0000000000..d366aa3f82
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/ac_strategy_test.cc
@@ -0,0 +1,237 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/ac_strategy.h"
+
+#include <string.h>
+
+#include <cmath>
+#include <hwy/aligned_allocator.h>
+#include <hwy/base.h>  // HWY_ALIGN_MAX
+#include <hwy/tests/test_util-inl.h>
+#include <utility>
+
+#include "lib/jxl/base/random.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/dct_scales.h"
+#include "lib/jxl/dec_transforms_testonly.h"
+#include "lib/jxl/enc_transforms.h"
+
+namespace jxl {
+namespace {
+
+// Test that DCT -> IDCT is a noop.
+class AcStrategyRoundtrip : public ::hwy::TestWithParamTargetAndT<int> {
+ protected:
+  void Run() {
+    const AcStrategy::Type type = static_cast<AcStrategy::Type>(GetParam());
+    const AcStrategy acs = AcStrategy::FromRawStrategy(type);
+
+    auto mem = hwy::AllocateAligned<float>(4 * AcStrategy::kMaxCoeffArea);
+    float* scratch_space = mem.get();
+    float* coeffs = scratch_space + AcStrategy::kMaxCoeffArea;
+    float* idct = coeffs + AcStrategy::kMaxCoeffArea;
+    Rng rng(type * 65537 + 13);
+
+    for (size_t j = 0; j < 64; j++) {
+      size_t i = (acs.log2_covered_blocks()
+                      ? rng.UniformU(0, 64u << acs.log2_covered_blocks())
+                      : j);
+      float* input = idct + AcStrategy::kMaxCoeffArea;
+      std::fill_n(input, AcStrategy::kMaxCoeffArea, 0);
+      input[i] = 0.2f;
+      TransformFromPixels(type, input, acs.covered_blocks_x() * 8, coeffs,
+                          scratch_space);
+      ASSERT_NEAR(coeffs[0], 0.2 / (64 << acs.log2_covered_blocks()), 1e-6)
+          << " i = " << i;
+      TransformToPixels(type, coeffs, idct, acs.covered_blocks_x() * 8,
+                        scratch_space);
+      for (size_t j = 0; j < 64u << acs.log2_covered_blocks(); j++) {
+        ASSERT_NEAR(idct[j], j == i ? 0.2f : 0, 2e-6)
+            << "j = " << j << " i = " << i << " acs " << type;
+      }
+    }
+    // Test DC.
+    std::fill_n(idct, AcStrategy::kMaxCoeffArea, 0);
+    for (size_t y = 0; y < acs.covered_blocks_y(); y++) {
+      for (size_t x = 0; x < acs.covered_blocks_x(); x++) {
+        float* dc = idct + AcStrategy::kMaxCoeffArea;
+        std::fill_n(dc, AcStrategy::kMaxCoeffArea, 0);
+        dc[y * acs.covered_blocks_x() * 8 + x] = 0.2;
+        LowestFrequenciesFromDC(type, dc, acs.covered_blocks_x() * 8, coeffs);
+        DCFromLowestFrequencies(type, coeffs, idct, acs.covered_blocks_x() * 8);
+        std::fill_n(dc, AcStrategy::kMaxCoeffArea, 0);
+        dc[y * acs.covered_blocks_x() * 8 + x] = 0.2;
+        for (size_t j = 0; j < 64u << acs.log2_covered_blocks(); j++) {
+          ASSERT_NEAR(idct[j], dc[j], 1e-6)
+              << "j = " << j << " x = " << x << " y = " << y << " acs " << type;
+        }
+      }
+    }
+  }
+};
+
+HWY_TARGET_INSTANTIATE_TEST_SUITE_P_T(
+    AcStrategyRoundtrip,
+    ::testing::Range(0, int(AcStrategy::Type::kNumValidStrategies)));
+
+TEST_P(AcStrategyRoundtrip, Test) { Run(); }
+
+// Test that DC(2x2) -> DCT coefficients -> IDCT -> downsampled IDCT is a noop.
+class AcStrategyRoundtripDownsample
+    : public ::hwy::TestWithParamTargetAndT<int> {
+ protected:
+  void Run() {
+    const AcStrategy::Type type = static_cast<AcStrategy::Type>(GetParam());
+    const AcStrategy acs = AcStrategy::FromRawStrategy(type);
+
+    auto mem = hwy::AllocateAligned<float>(4 * AcStrategy::kMaxCoeffArea);
+    float* scratch_space = mem.get();
+    float* coeffs = scratch_space + AcStrategy::kMaxCoeffArea;
+    std::fill_n(coeffs, AcStrategy::kMaxCoeffArea, 0.0f);
+    float* idct = coeffs + AcStrategy::kMaxCoeffArea;
+    Rng rng(type * 65537 + 13);
+
+    for (size_t y = 0; y < acs.covered_blocks_y(); y++) {
+      for (size_t x = 0; x < acs.covered_blocks_x(); x++) {
+        if (x > 4 || y > 4) {
+          if (rng.Bernoulli(0.9f)) continue;
+        }
+        float* dc = idct + AcStrategy::kMaxCoeffArea;
+        std::fill_n(dc, AcStrategy::kMaxCoeffArea, 0);
+        dc[y * acs.covered_blocks_x() * 8 + x] = 0.2f;
+        LowestFrequenciesFromDC(type, dc, acs.covered_blocks_x() * 8, coeffs);
+        TransformToPixels(type, coeffs, idct, acs.covered_blocks_x() * 8,
+                          scratch_space);
+        std::fill_n(coeffs, AcStrategy::kMaxCoeffArea, 0.0f);
+        std::fill_n(dc, AcStrategy::kMaxCoeffArea, 0);
+        dc[y * acs.covered_blocks_x() * 8 + x] = 0.2f;
+        // Downsample
+        for (size_t dy = 0; dy < acs.covered_blocks_y(); dy++) {
+          for (size_t dx = 0; dx < acs.covered_blocks_x(); dx++) {
+            float sum = 0;
+            for (size_t iy = 0; iy < 8; iy++) {
+              for (size_t ix = 0; ix < 8; ix++) {
+                sum += idct[(dy * 8 + iy) * 8 * acs.covered_blocks_x() +
+                            dx * 8 + ix];
+              }
+            }
+            sum /= 64.0f;
+            ASSERT_NEAR(sum, dc[dy * 8 * acs.covered_blocks_x() + dx], 1e-6)
+                << "acs " << type;
+          }
+        }
+      }
+    }
+  }
+};
+
+HWY_TARGET_INSTANTIATE_TEST_SUITE_P_T(
+    AcStrategyRoundtripDownsample,
+    ::testing::Range(0, int(AcStrategy::Type::kNumValidStrategies)));
+
+TEST_P(AcStrategyRoundtripDownsample, Test) { Run(); }
+
+// Test that IDCT(block with zeros in the non-topleft corner) -> downsampled
+// IDCT is the same as IDCT -> DC(2x2) of the same block.
+class AcStrategyDownsample : public ::hwy::TestWithParamTargetAndT<int> {
+ protected:
+  void Run() {
+    const AcStrategy::Type type = static_cast<AcStrategy::Type>(GetParam());
+    const AcStrategy acs = AcStrategy::FromRawStrategy(type);
+    size_t cx = acs.covered_blocks_y();
+    size_t cy = acs.covered_blocks_x();
+    CoefficientLayout(&cy, &cx);
+
+    auto mem = hwy::AllocateAligned<float>(4 * AcStrategy::kMaxCoeffArea);
+    float* scratch_space = mem.get();
+    float* idct = scratch_space + AcStrategy::kMaxCoeffArea;
+    float* idct_acs_downsampled = idct + AcStrategy::kMaxCoeffArea;
+    Rng rng(type * 65537 + 13);
+
+    for (size_t y = 0; y < cy; y++) {
+      for (size_t x = 0; x < cx; x++) {
+        if (x > 4 || y > 4) {
+          if (rng.Bernoulli(0.9f)) continue;
+        }
+        float* coeffs = idct + AcStrategy::kMaxCoeffArea;
+        std::fill_n(coeffs, AcStrategy::kMaxCoeffArea, 0);
+        coeffs[y * cx * 8 + x] = 0.2f;
+        TransformToPixels(type, coeffs, idct, acs.covered_blocks_x() * 8,
+                          scratch_space);
+        std::fill_n(coeffs, AcStrategy::kMaxCoeffArea, 0);
+        coeffs[y * cx * 8 + x] = 0.2f;
+        DCFromLowestFrequencies(type, coeffs, idct_acs_downsampled,
+                                acs.covered_blocks_x() * 8);
+        // Downsample
+        for (size_t dy = 0; dy < acs.covered_blocks_y(); dy++) {
+          for (size_t dx = 0; dx < acs.covered_blocks_x(); dx++) {
+            float sum = 0;
+            for (size_t iy = 0; iy < 8; iy++) {
+              for (size_t ix = 0; ix < 8; ix++) {
+                sum += idct[(dy * 8 + iy) * 8 * acs.covered_blocks_x() +
+                            dx * 8 + ix];
+              }
+            }
+            sum /= 64;
+            ASSERT_NEAR(
+                sum, idct_acs_downsampled[dy * 8 * acs.covered_blocks_x() + dx],
+                1e-6)
+                << " acs " << type;
+          }
+        }
+      }
+    }
+  }
+};
+
+HWY_TARGET_INSTANTIATE_TEST_SUITE_P_T(
+    AcStrategyDownsample,
+    ::testing::Range(0, int(AcStrategy::Type::kNumValidStrategies)));
+
+TEST_P(AcStrategyDownsample, Test) { Run(); }
+
+class AcStrategyTargetTest : public ::hwy::TestWithParamTarget {};
+HWY_TARGET_INSTANTIATE_TEST_SUITE_P(AcStrategyTargetTest);
+
+TEST_P(AcStrategyTargetTest, RoundtripAFVDCT) {
+  HWY_ALIGN_MAX float idct[16];
+  for (size_t i = 0; i < 16; i++) {
+    HWY_ALIGN_MAX float pixels[16] = {};
+    pixels[i] = 1;
+    HWY_ALIGN_MAX float coeffs[16] = {};
+
+    AFVDCT4x4(pixels, coeffs);
+    AFVIDCT4x4(coeffs, idct);
+    for (size_t j = 0; j < 16; j++) {
+      EXPECT_NEAR(idct[j], pixels[j], 1e-6);
+    }
+  }
+}
+
+TEST_P(AcStrategyTargetTest, BenchmarkAFV) {
+  const AcStrategy::Type type = AcStrategy::Type::AFV0;
+  HWY_ALIGN_MAX float pixels[64] = {1};
+  HWY_ALIGN_MAX float coeffs[64] = {};
+  HWY_ALIGN_MAX float scratch_space[64] = {};
+  for (size_t i = 0; i < 1 << 14; i++) {
+    TransformToPixels(type, coeffs, pixels, 8, scratch_space);
+    TransformFromPixels(type, pixels, 8, coeffs, scratch_space);
+  }
+  EXPECT_NEAR(pixels[0], 0.0, 1E-6);
+}
+
+TEST_P(AcStrategyTargetTest, BenchmarkAFVDCT) {
+  HWY_ALIGN_MAX float pixels[64] = {1};
+  HWY_ALIGN_MAX float coeffs[64] = {};
+  for (size_t i = 0; i < 1 << 14; i++) {
+    AFVDCT4x4(pixels, coeffs);
+    AFVIDCT4x4(coeffs, pixels);
+  }
+  EXPECT_NEAR(pixels[0], 1.0, 1E-6);
+}
+
+}  // namespace
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/alpha.cc b/third_party/jpeg-xl/lib/jxl/alpha.cc
new file mode 100644
index 0000000000..48d7e7ee92
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/alpha.cc
@@ -0,0 +1,115 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/alpha.h"
+
+#include <string.h>
+
+#include <algorithm>
+
+namespace jxl {
+
+static float Clamp(float x) { return std::max(std::min(1.0f, x), 0.0f); }
+
+void PerformAlphaBlending(const AlphaBlendingInputLayer& bg,
+                          const AlphaBlendingInputLayer& fg,
+                          const AlphaBlendingOutput& out, size_t num_pixels,
+                          bool alpha_is_premultiplied, bool clamp) {
+  if (alpha_is_premultiplied) {
+    for (size_t x = 0; x < num_pixels; ++x) {
+      float fga = clamp ? Clamp(fg.a[x]) : fg.a[x];
+      out.r[x] = (fg.r[x] + bg.r[x] * (1.f - fga));
+      out.g[x] = (fg.g[x] + bg.g[x] * (1.f - fga));
+      out.b[x] = (fg.b[x] + bg.b[x] * (1.f - fga));
+      out.a[x] = (1.f - (1.f - fga) * (1.f - bg.a[x]));
+    }
+  } else {
+    for (size_t x = 0; x < num_pixels; ++x) {
+      float fga = clamp ? Clamp(fg.a[x]) : fg.a[x];
+      const float new_a = 1.f - (1.f - fga) * (1.f - bg.a[x]);
+      const float rnew_a = (new_a > 0 ? 1.f / new_a : 0.f);
+      out.r[x] = (fg.r[x] * fga + bg.r[x] * bg.a[x] * (1.f - fga)) * rnew_a;
+      out.g[x] = (fg.g[x] * fga + bg.g[x] * bg.a[x] * (1.f - fga)) * rnew_a;
+      out.b[x] = (fg.b[x] * fga + bg.b[x] * bg.a[x] * (1.f - fga)) * rnew_a;
+      out.a[x] = new_a;
+    }
+  }
+}
+void PerformAlphaBlending(const float* bg, const float* bga, const float* fg,
+                          const float* fga, float* out, size_t num_pixels,
+                          bool alpha_is_premultiplied, bool clamp) {
+  if (bg == bga && fg == fga) {
+    for (size_t x = 0; x < num_pixels; ++x) {
+      float fa = clamp ? fga[x] : Clamp(fga[x]);
+      out[x] = (1.f - (1.f - fa) * (1.f - bga[x]));
+    }
+  } else {
+    if (alpha_is_premultiplied) {
+      for (size_t x = 0; x < num_pixels; ++x) {
+        float fa = clamp ? fga[x] : Clamp(fga[x]);
+        out[x] = (fg[x] + bg[x] * (1.f - fa));
+      }
+    } else {
+      for (size_t x = 0; x < num_pixels; ++x) {
+        float fa = clamp ? fga[x] : Clamp(fga[x]);
+        const float new_a = 1.f - (1.f - fa) * (1.f - bga[x]);
+        const float rnew_a = (new_a > 0 ? 1.f / new_a : 0.f);
+        out[x] = (fg[x] * fa + bg[x] * bga[x] * (1.f - fa)) * rnew_a;
+      }
+    }
+  }
+}
+
+void PerformAlphaWeightedAdd(const float* bg, const float* fg, const float* fga,
+                             float* out, size_t num_pixels, bool clamp) {
+  if (fg == fga) {
+    memcpy(out, bg, num_pixels * sizeof(*out));
+  } else if (clamp) {
+    for (size_t x = 0; x < num_pixels; ++x) {
+      out[x] = bg[x] + fg[x] * Clamp(fga[x]);
+    }
+  } else {
+    for (size_t x = 0; x < num_pixels; ++x) {
+      out[x] = bg[x] + fg[x] * fga[x];
+    }
+  }
+}
+
+void PerformMulBlending(const float* bg, const float* fg, float* out,
+                        size_t num_pixels, bool clamp) {
+  if (clamp) {
+    for (size_t x = 0; x < num_pixels; ++x) {
+      out[x] = bg[x] * Clamp(fg[x]);
+    }
+  } else {
+    for (size_t x = 0; x < num_pixels; ++x) {
+      out[x] = bg[x] * fg[x];
+    }
+  }
+}
+
+void PremultiplyAlpha(float* JXL_RESTRICT r, float* JXL_RESTRICT g,
+                      float* JXL_RESTRICT b, const float* JXL_RESTRICT a,
+                      size_t num_pixels) {
+  for (size_t x = 0; x < num_pixels; ++x) {
+    const float multiplier = std::max(kSmallAlpha, a[x]);
+    r[x] *= multiplier;
+    g[x] *= multiplier;
+    b[x] *= multiplier;
+  }
+}
+
+void UnpremultiplyAlpha(float* JXL_RESTRICT r, float* JXL_RESTRICT g,
+                        float* JXL_RESTRICT b, const float* JXL_RESTRICT a,
+                        size_t num_pixels) {
+  for (size_t x = 0; x < num_pixels; ++x) {
+    const float multiplier = 1.f / std::max(kSmallAlpha, a[x]);
+    r[x] *= multiplier;
+    g[x] *= multiplier;
+    b[x] *= multiplier;
+  }
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/alpha.h b/third_party/jpeg-xl/lib/jxl/alpha.h
new file mode 100644
index 0000000000..efb76c800f
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/alpha.h
@@ -0,0 +1,66 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ALPHA_H_
+#define LIB_JXL_ALPHA_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <limits>
+
+#include "lib/jxl/base/compiler_specific.h"
+
+namespace jxl {
+
+// A very small value to avoid divisions by zero when converting to
+// unpremultiplied alpha. Page 21 of the technical introduction to OpenEXR
+// (https://www.openexr.com/documentation/TechnicalIntroduction.pdf) recommends
+// "a power of two" that is "less than half of the smallest positive 16-bit
+// floating-point value". That smallest value happens to be the denormal number
+// 2^-24, so 2^-26 should be a good choice.
+static constexpr float kSmallAlpha = 1.f / (1u << 26u);
+
+struct AlphaBlendingInputLayer {
+  const float* r;
+  const float* g;
+  const float* b;
+  const float* a;
+};
+
+struct AlphaBlendingOutput {
+  float* r;
+  float* g;
+  float* b;
+  float* a;
+};
+
+// Note: The pointers in `out` are allowed to alias those in `bg` or `fg`.
+// No pointer shall be null.
+void PerformAlphaBlending(const AlphaBlendingInputLayer& bg,
+                          const AlphaBlendingInputLayer& fg,
+                          const AlphaBlendingOutput& out, size_t num_pixels,
+                          bool alpha_is_premultiplied, bool clamp);
+// Single plane alpha blending
+void PerformAlphaBlending(const float* bg, const float* bga, const float* fg,
+                          const float* fga, float* out, size_t num_pixels,
+                          bool alpha_is_premultiplied, bool clamp);
+
+void PerformAlphaWeightedAdd(const float* bg, const float* fg, const float* fga,
+                             float* out, size_t num_pixels, bool clamp);
+
+void PerformMulBlending(const float* bg, const float* fg, float* out,
+                        size_t num_pixels, bool clamp);
+
+void PremultiplyAlpha(float* JXL_RESTRICT r, float* JXL_RESTRICT g,
+                      float* JXL_RESTRICT b, const float* JXL_RESTRICT a,
+                      size_t num_pixels);
+void UnpremultiplyAlpha(float* JXL_RESTRICT r, float* JXL_RESTRICT g,
+                        float* JXL_RESTRICT b, const float* JXL_RESTRICT a,
+                        size_t num_pixels);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ALPHA_H_
diff --git a/third_party/jpeg-xl/lib/jxl/alpha_test.cc b/third_party/jpeg-xl/lib/jxl/alpha_test.cc
new file mode 100644
index 0000000000..ddafd829ec
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/alpha_test.cc
@@ -0,0 +1,134 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/alpha.h"
+
+#include "lib/jxl/test_utils.h"
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+namespace {
+
+using ::testing::_;
+using ::testing::ElementsAre;
+using ::testing::FloatNear;
+
+TEST(AlphaTest, BlendingWithNonPremultiplied) {
+  const float bg_rgb[3] = {100, 110, 120};
+  const float bg_a = 180.f / 255;
+  const float fg_rgb[3] = {25, 21, 23};
+  const float fg_a = 15420.f / 65535;
+  const float fg_a2 = 2.0f;
+  float out_rgb[3];
+  float out_a;
+  PerformAlphaBlending(
+      /*bg=*/{&bg_rgb[0], &bg_rgb[1], &bg_rgb[2], &bg_a},
+      /*fg=*/{&fg_rgb[0], &fg_rgb[1], &fg_rgb[2], &fg_a},
+      /*out=*/{&out_rgb[0], &out_rgb[1], &out_rgb[2], &out_a}, 1,
+      /*alpha_is_premultiplied=*/false, /*clamp=*/false);
+  EXPECT_THAT(out_rgb,
+              ElementsAre(FloatNear(77.2f, .05f), FloatNear(83.0f, .05f),
+                          FloatNear(90.6f, .05f)));
+  EXPECT_NEAR(out_a, 3174.f / 4095, 1e-5);
+  PerformAlphaBlending(
+      /*bg=*/{&bg_rgb[0], &bg_rgb[1], &bg_rgb[2], &bg_a},
+      /*fg=*/{&fg_rgb[0], &fg_rgb[1], &fg_rgb[2], &fg_a2},
+      /*out=*/{&out_rgb[0], &out_rgb[1], &out_rgb[2], &out_a}, 1,
+      /*alpha_is_premultiplied=*/false, /*clamp=*/true);
+  EXPECT_THAT(out_rgb, ElementsAre(FloatNear(fg_rgb[0], .05f),
+                                   FloatNear(fg_rgb[1], .05f),
+                                   FloatNear(fg_rgb[2], .05f)));
+  EXPECT_NEAR(out_a, 1.0f, 1e-5);
+}
+
+TEST(AlphaTest, BlendingWithPremultiplied) {
+  const float bg_rgb[3] = {100, 110, 120};
+  const float bg_a = 180.f / 255;
+  const float fg_rgb[3] = {25, 21, 23};
+  const float fg_a = 15420.f / 65535;
+  const float fg_a2 = 2.0f;
+  float out_rgb[3];
+  float out_a;
+  PerformAlphaBlending(
+      /*bg=*/{&bg_rgb[0], &bg_rgb[1], &bg_rgb[2], &bg_a},
+      /*fg=*/{&fg_rgb[0], &fg_rgb[1], &fg_rgb[2], &fg_a},
+      /*out=*/{&out_rgb[0], &out_rgb[1], &out_rgb[2], &out_a}, 1,
+      /*alpha_is_premultiplied=*/true, /*clamp=*/false);
+  EXPECT_THAT(out_rgb,
+              ElementsAre(FloatNear(101.5f, .05f), FloatNear(105.1f, .05f),
+                          FloatNear(114.8f, .05f)));
+  EXPECT_NEAR(out_a, 3174.f / 4095, 1e-5);
+  PerformAlphaBlending(
+      /*bg=*/{&bg_rgb[0], &bg_rgb[1], &bg_rgb[2], &bg_a},
+      /*fg=*/{&fg_rgb[0], &fg_rgb[1], &fg_rgb[2], &fg_a2},
+      /*out=*/{&out_rgb[0], &out_rgb[1], &out_rgb[2], &out_a}, 1,
+      /*alpha_is_premultiplied=*/true, /*clamp=*/true);
+  EXPECT_THAT(out_rgb, ElementsAre(FloatNear(fg_rgb[0], .05f),
+                                   FloatNear(fg_rgb[1], .05f),
+                                   FloatNear(fg_rgb[2], .05f)));
+  EXPECT_NEAR(out_a, 1.0f, 1e-5);
+}
+
+TEST(AlphaTest, Mul) {
+  const float bg = 100;
+  const float fg = 25;
+  float out;
+  PerformMulBlending(&bg, &fg, &out, 1, /*clamp=*/false);
+  EXPECT_THAT(out, FloatNear(fg * bg, .05f));
+  PerformMulBlending(&bg, &fg, &out, 1, /*clamp=*/true);
+  EXPECT_THAT(out, FloatNear(bg, .05f));
+}
+
+TEST(AlphaTest, PremultiplyAndUnpremultiply) {
+  const float alpha[] = {0.f, 63.f / 255, 127.f / 255, 1.f};
+  float r[] = {120, 130, 140, 150};
+  float g[] = {124, 134, 144, 154};
+  float b[] = {127, 137, 147, 157};
+
+  PremultiplyAlpha(r, g, b, alpha, 4);
+  EXPECT_THAT(
+      r, ElementsAre(FloatNear(0.f, 1e-5f), FloatNear(130 * 63.f / 255, 1e-5f),
+                     FloatNear(140 * 127.f / 255, 1e-5f), 150));
+  EXPECT_THAT(
+      g, ElementsAre(FloatNear(0.f, 1e-5f), FloatNear(134 * 63.f / 255, 1e-5f),
+                     FloatNear(144 * 127.f / 255, 1e-5f), 154));
+  EXPECT_THAT(
+      b, ElementsAre(FloatNear(0.f, 1e-5f), FloatNear(137 * 63.f / 255, 1e-5f),
+                     FloatNear(147 * 127.f / 255, 1e-5f), 157));
+
+  UnpremultiplyAlpha(r, g, b, alpha, 4);
+  EXPECT_THAT(r, ElementsAre(FloatNear(120, 1e-4f), FloatNear(130, 1e-4f),
+                             FloatNear(140, 1e-4f), FloatNear(150, 1e-4f)));
+  EXPECT_THAT(g, ElementsAre(FloatNear(124, 1e-4f), FloatNear(134, 1e-4f),
+                             FloatNear(144, 1e-4f), FloatNear(154, 1e-4f)));
+  EXPECT_THAT(b, ElementsAre(FloatNear(127, 1e-4f), FloatNear(137, 1e-4f),
+                             FloatNear(147, 1e-4f), FloatNear(157, 1e-4f)));
+}
+
+TEST(AlphaTest, UnpremultiplyAndPremultiply) {
+  const float alpha[] = {0.f, 63.f / 255, 127.f / 255, 1.f};
+  float r[] = {50, 60, 70, 80};
+  float g[] = {54, 64, 74, 84};
+  float b[] = {57, 67, 77, 87};
+
+  UnpremultiplyAlpha(r, g, b, alpha, 4);
+  EXPECT_THAT(r, ElementsAre(_, FloatNear(60 * 255.f / 63, 1e-4f),
+                             FloatNear(70 * 255.f / 127, 1e-4f), 80));
+  EXPECT_THAT(g, ElementsAre(_, FloatNear(64 * 255.f / 63, 1e-4f),
+                             FloatNear(74 * 255.f / 127, 1e-4f), 84));
+  EXPECT_THAT(b, ElementsAre(_, FloatNear(67 * 255.f / 63, 1e-4f),
+                             FloatNear(77 * 255.f / 127, 1e-4f), 87));
+
+  PremultiplyAlpha(r, g, b, alpha, 4);
+  EXPECT_THAT(r, ElementsAre(FloatNear(50, 1e-4f), FloatNear(60, 1e-4f),
+                             FloatNear(70, 1e-4f), FloatNear(80, 1e-4f)));
+  EXPECT_THAT(g, ElementsAre(FloatNear(54, 1e-4f), FloatNear(64, 1e-4f),
+                             FloatNear(74, 1e-4f), FloatNear(84, 1e-4f)));
+  EXPECT_THAT(b, ElementsAre(FloatNear(57, 1e-4f), FloatNear(67, 1e-4f),
+                             FloatNear(77, 1e-4f), FloatNear(87, 1e-4f)));
+}
+
+}  // namespace
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/ans_common.cc b/third_party/jpeg-xl/lib/jxl/ans_common.cc
new file mode 100644
index 0000000000..d2cf897ec4
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/ans_common.cc
@@ -0,0 +1,148 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/ans_common.h"
+
+#include <numeric>
+
+#include "lib/jxl/ans_params.h"
+#include "lib/jxl/base/status.h"
+
+namespace jxl {
+
+std::vector<int32_t> CreateFlatHistogram(int length, int total_count) {
+  JXL_ASSERT(length > 0);
+  JXL_ASSERT(length <= total_count);
+  const int count = total_count / length;
+  std::vector<int32_t> result(length, count);
+  const int rem_counts = total_count % length;
+  for (int i = 0; i < rem_counts; ++i) {
+    ++result[i];
+  }
+  return result;
+}
+
+// First, all trailing non-occurring symbols are removed from the distribution;
+// if this leaves the distribution empty, a dummy symbol with max weight is
+// added. This ensures that the resulting distribution sums to total table size.
+// Then, `entry_size` is chosen to be the largest power of two so that
+// `table_size` = ANS_TAB_SIZE/`entry_size` is at least as big as the
+// distribution size.
+// Note that each entry will only ever contain two different symbols, and
+// consecutive ranges of offsets, which allows us to use a compact
+// representation.
+// Each entry is initialized with only the (symbol=i, offset) pairs; then
+// positions for which the entry overflows (i.e. distribution[i] > entry_size)
+// or is not full are computed, and put into a stack in increasing order.
+// Missing symbols in the distribution are padded with 0 (because `table_size`
+// >= number of symbols). The `cutoff` value for each entry is initialized to
+// the number of occupied slots in that entry (i.e. `distributions[i]`). While
+// the overflowing-symbol stack is not empty (which implies that the
+// underflowing-symbol stack also is not), the top overfull and underfull
+// positions are popped from the stack; the empty slots in the underfull entry
+// are then filled with as many slots as needed from the overfull entry; such
+// slots are placed after the slots in the overfull entry, and `offsets[1]` is
+// computed accordingly. The formerly underfull entry is thus now neither
+// underfull nor overfull, and represents exactly two symbols. The overfull
+// entry might be either overfull or underfull, and is pushed into the
+// corresponding stack.
+void InitAliasTable(std::vector<int32_t> distribution, uint32_t range,
+                    size_t log_alpha_size, AliasTable::Entry* JXL_RESTRICT a) {
+  while (!distribution.empty() && distribution.back() == 0) {
+    distribution.pop_back();
+  }
+  // Ensure that a valid table is always returned, even for an empty
+  // alphabet. Otherwise, a specially-crafted stream might crash the
+  // decoder.
+  if (distribution.empty()) {
+    distribution.emplace_back(range);
+  }
+  const size_t table_size = 1 << log_alpha_size;
+#if JXL_ENABLE_ASSERT
+  int sum = std::accumulate(distribution.begin(), distribution.end(), 0);
+#endif  // JXL_ENABLE_ASSERT
+  JXL_ASSERT(static_cast<uint32_t>(sum) == range);
+  // range must be a power of two
+  JXL_ASSERT((range & (range - 1)) == 0);
+  JXL_ASSERT(distribution.size() <= table_size);
+  JXL_ASSERT(table_size <= range);
+  const uint32_t entry_size = range >> log_alpha_size;  // this is exact
+  // Special case for single-symbol distributions, that ensures that the state
+  // does not change when decoding from such a distribution. Note that, since we
+  // hardcode offset0 == 0, it is not straightforward (if at all possible) to
+  // fix the general case to produce this result.
+  for (size_t sym = 0; sym < distribution.size(); sym++) {
+    if (distribution[sym] == ANS_TAB_SIZE) {
+      for (size_t i = 0; i < table_size; i++) {
+        a[i].right_value = sym;
+        a[i].cutoff = 0;
+        a[i].offsets1 = entry_size * i;
+        a[i].freq0 = 0;
+        a[i].freq1_xor_freq0 = ANS_TAB_SIZE;
+      }
+      return;
+    }
+  }
+  std::vector<uint32_t> underfull_posn;
+  std::vector<uint32_t> overfull_posn;
+  std::vector<uint32_t> cutoffs(1 << log_alpha_size);
+  // Initialize entries.
+  for (size_t i = 0; i < distribution.size(); i++) {
+    cutoffs[i] = distribution[i];
+    if (cutoffs[i] > entry_size) {
+      overfull_posn.push_back(i);
+    } else if (cutoffs[i] < entry_size) {
+      underfull_posn.push_back(i);
+    }
+  }
+  for (uint32_t i = distribution.size(); i < table_size; i++) {
+    cutoffs[i] = 0;
+    underfull_posn.push_back(i);
+  }
+  // Reassign overflow/underflow values.
+  while (!overfull_posn.empty()) {
+    uint32_t overfull_i = overfull_posn.back();
+    overfull_posn.pop_back();
+    JXL_ASSERT(!underfull_posn.empty());
+    uint32_t underfull_i = underfull_posn.back();
+    underfull_posn.pop_back();
+    uint32_t underfull_by = entry_size - cutoffs[underfull_i];
+    cutoffs[overfull_i] -= underfull_by;
+    // overfull positions have their original symbols
+    a[underfull_i].right_value = overfull_i;
+    a[underfull_i].offsets1 = cutoffs[overfull_i];
+    // Slots in the right part of entry underfull_i were taken from the end
+    // of the symbols in entry overfull_i.
+    if (cutoffs[overfull_i] < entry_size) {
+      underfull_posn.push_back(overfull_i);
+    } else if (cutoffs[overfull_i] > entry_size) {
+      overfull_posn.push_back(overfull_i);
+    }
+  }
+  for (uint32_t i = 0; i < table_size; i++) {
+    // cutoffs[i] is properly initialized but the clang-analyzer doesn't infer
+    // it since it is partially initialized across two for-loops.
+    // NOLINTNEXTLINE(clang-analyzer-core.UndefinedBinaryOperatorResult)
+    if (cutoffs[i] == entry_size) {
+      a[i].right_value = i;
+      a[i].offsets1 = 0;
+      a[i].cutoff = 0;
+    } else {
+      // Note that, if cutoff is not equal to entry_size,
+      // a[i].offsets1 was initialized with (overfull cutoff) -
+      // (entry_size - a[i].cutoff). Thus, subtracting
+      // a[i].cutoff cannot make it negative.
+      a[i].offsets1 -= cutoffs[i];
+      a[i].cutoff = cutoffs[i];
+    }
+    const size_t freq0 = i < distribution.size() ? distribution[i] : 0;
+    const size_t i1 = a[i].right_value;
+    const size_t freq1 = i1 < distribution.size() ? distribution[i1] : 0;
+    a[i].freq0 = static_cast<uint16_t>(freq0);
+    a[i].freq1_xor_freq0 = static_cast<uint16_t>(freq1 ^ freq0);
+  }
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/ans_common.h b/third_party/jpeg-xl/lib/jxl/ans_common.h
new file mode 100644
index 0000000000..fb5058e310
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/ans_common.h
@@ -0,0 +1,143 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ANS_COMMON_H_
+#define LIB_JXL_ANS_COMMON_H_
+
+#include <stdint.h>
+
+#include <algorithm>
+#include <hwy/cache_control.h>  // Prefetch
+#include <vector>
+
+#include "lib/jxl/ans_params.h"
+#include "lib/jxl/base/byte_order.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/status.h"
+
+namespace jxl {
+
+// Returns the precision (number of bits) that should be used to store
+// a histogram count such that Log2Floor(count) == logcount.
+static JXL_INLINE uint32_t GetPopulationCountPrecision(uint32_t logcount,
+                                                       uint32_t shift) {
+  int32_t r = std::min<int>(
+      logcount, int(shift) - int((ANS_LOG_TAB_SIZE - logcount) >> 1));
+  if (r < 0) return 0;
+  return r;
+}
+
+// Returns a histogram where the counts are positive, differ by at most 1,
+// and add up to total_count. The bigger counts (if any) are at the beginning
+// of the histogram.
+std::vector<int32_t> CreateFlatHistogram(int length, int total_count);
+
+// An alias table implements a mapping from the [0, ANS_TAB_SIZE) range into
+// the [0, ANS_MAX_ALPHABET_SIZE) range, satisfying the following conditions:
+// - each symbol occurs as many times as specified by any valid distribution
+//   of frequencies of the symbols. A valid distribution here is an array of
+//   ANS_MAX_ALPHABET_SIZE that contains numbers in the range [0, ANS_TAB_SIZE],
+//   and whose sum is ANS_TAB_SIZE.
+// - lookups can be done in constant time, and also return how many smaller
+//   input values map into the same symbol, according to some well-defined order
+//   of input values.
+// - the space used by the alias table is given by a small constant times the
+//   index of the largest symbol with nonzero probability in the distribution.
+// Each of the entries in the table covers a range of `entry_size` values in the
+// [0, ANS_TAB_SIZE) range; consecutive entries represent consecutive
+// sub-ranges. In the range covered by entry `i`, the first `cutoff` values map
+// to symbol `i`, while the others map to symbol `right_value`.
+//
+// TODO(veluca): consider making the order used for computing offsets easier to
+// define - it is currently defined by the algorithm to compute the alias table.
+// Beware of breaking the implicit assumption that symbols that come after the
+// cutoff value should have an offset at least as big as the cutoff.
+
+struct AliasTable {
+  struct Symbol {
+    size_t value;
+    size_t offset;
+    size_t freq;
+  };
+
+// Working set size matters here (~64 tables x 256 entries).
+// offsets0 is always zero (beginning of [0] side among the same symbol).
+// offsets1 is an offset of (pos >= cutoff) side decremented by cutoff.
+#pragma pack(push, 1)
+  struct Entry {
+    uint8_t cutoff;       // < kEntrySizeMinus1 when used by ANS.
+    uint8_t right_value;  // < alphabet size.
+    uint16_t freq0;
+
+    // Only used if `greater` (see Lookup)
+    uint16_t offsets1;         // <= ANS_TAB_SIZE
+    uint16_t freq1_xor_freq0;  // for branchless ternary in Lookup
+  };
+#pragma pack(pop)
+
+  // Dividing `value` by `entry_size` determines `i`, the entry which is
+  // responsible for the input. If the remainder is below `cutoff`, then the
+  // mapped symbol is `i`; since `offsets[0]` stores the number of occurrences
+  // of `i` "before" the start of this entry, the offset of the input will be
+  // `offsets[0] + remainder`. If the remainder is above cutoff, the mapped
+  // symbol is `right_value`; since `offsets[1]` stores the number of
+  // occurrences of `right_value` "before" this entry, minus the `cutoff` value,
+  // the input offset is then `remainder + offsets[1]`.
+  static JXL_INLINE Symbol Lookup(const Entry* JXL_RESTRICT table, size_t value,
+                                  size_t log_entry_size,
+                                  size_t entry_size_minus_1) {
+    const size_t i = value >> log_entry_size;
+    const size_t pos = value & entry_size_minus_1;
+
+#if JXL_BYTE_ORDER_LITTLE
+    uint64_t entry;
+    memcpy(&entry, &table[i].cutoff, sizeof(entry));
+    const size_t cutoff = entry & 0xFF;              // = MOVZX
+    const size_t right_value = (entry >> 8) & 0xFF;  // = MOVZX
+    const size_t freq0 = (entry >> 16) & 0xFFFF;
+#else
+    // Generates multiple loads with complex addressing.
+    const size_t cutoff = table[i].cutoff;
+    const size_t right_value = table[i].right_value;
+    const size_t freq0 = table[i].freq0;
+#endif
+
+    const bool greater = pos >= cutoff;
+
+#if JXL_BYTE_ORDER_LITTLE
+    const uint64_t conditional = greater ? entry : 0;  // = CMOV
+    const size_t offsets1_or_0 = (conditional >> 32) & 0xFFFF;
+    const size_t freq1_xor_freq0_or_0 = conditional >> 48;
+#else
+    const size_t offsets1_or_0 = greater ? table[i].offsets1 : 0;
+    const size_t freq1_xor_freq0_or_0 = greater ? table[i].freq1_xor_freq0 : 0;
+#endif
+
+    // WARNING: moving this code may interfere with CMOV heuristics.
+    Symbol s;
+    s.value = greater ? right_value : i;
+    s.offset = offsets1_or_0 + pos;
+    s.freq = freq0 ^ freq1_xor_freq0_or_0;  // = greater ? freq1 : freq0
+    // XOR avoids implementation-defined conversion from unsigned to signed.
+    // Alternatives considered: BEXTR is 2 cycles on HSW, SET+shift causes
+    // spills, simple ternary has a long dependency chain.
+
+    return s;
+  }
+
+  static HWY_INLINE void Prefetch(const Entry* JXL_RESTRICT table, size_t value,
+                                  size_t log_entry_size) {
+    const size_t i = value >> log_entry_size;
+    hwy::Prefetch(table + i);
+  }
+};
+
+// Computes an alias table for a given distribution.
+void InitAliasTable(std::vector<int32_t> distribution, uint32_t range,
+                    size_t log_alpha_size, AliasTable::Entry* JXL_RESTRICT a);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ANS_COMMON_H_
diff --git a/third_party/jpeg-xl/lib/jxl/ans_common_test.cc b/third_party/jpeg-xl/lib/jxl/ans_common_test.cc
new file mode 100644
index 0000000000..487b6cf5bd
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/ans_common_test.cc
@@ -0,0 +1,43 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/ans_common.h"
+
+#include <vector>
+
+#include "lib/jxl/ans_params.h"
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+namespace {
+
+void VerifyAliasDistribution(const std::vector<int>& distribution,
+                             uint32_t range) {
+  constexpr size_t log_alpha_size = 8;
+  AliasTable::Entry table[1 << log_alpha_size];
+  InitAliasTable(distribution, range, log_alpha_size, table);
+  std::vector<std::vector<uint32_t>> offsets(distribution.size());
+  for (uint32_t i = 0; i < range; i++) {
+    AliasTable::Symbol s = AliasTable::Lookup(
+        table, i, ANS_LOG_TAB_SIZE - 8, (1 << (ANS_LOG_TAB_SIZE - 8)) - 1);
+    offsets[s.value].push_back(s.offset);
+  }
+  for (uint32_t i = 0; i < distribution.size(); i++) {
+    ASSERT_EQ(static_cast<size_t>(distribution[i]), offsets[i].size());
+    std::sort(offsets[i].begin(), offsets[i].end());
+    for (uint32_t j = 0; j < offsets[i].size(); j++) {
+      ASSERT_EQ(offsets[i][j], j);
+    }
+  }
+}
+
+TEST(ANSCommonTest, AliasDistributionSmoke) {
+  VerifyAliasDistribution({ANS_TAB_SIZE / 2, ANS_TAB_SIZE / 2}, ANS_TAB_SIZE);
+  VerifyAliasDistribution({ANS_TAB_SIZE}, ANS_TAB_SIZE);
+  VerifyAliasDistribution({0, 0, 0, ANS_TAB_SIZE, 0}, ANS_TAB_SIZE);
+}
+
+}  // namespace
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/ans_params.h b/third_party/jpeg-xl/lib/jxl/ans_params.h
new file mode 100644
index 0000000000..4bbc284c0b
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/ans_params.h
@@ -0,0 +1,36 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ANS_PARAMS_H_
+#define LIB_JXL_ANS_PARAMS_H_
+
+// Common parameters that are needed for both the ANS entropy encoding and
+// decoding methods.
+
+#include <stdint.h>
+#include <stdlib.h>
+
+namespace jxl {
+
+// TODO(veluca): decide if 12 is the best constant here (valid range is up to
+// 16). This requires recomputing the Huffman tables in {enc,dec}_ans.cc
+// 14 gives a 0.2% improvement at d1 and makes d8 slightly worse. This is
+// likely not worth the increase in encoder complexity.
+#define ANS_LOG_TAB_SIZE 12u
+#define ANS_TAB_SIZE (1 << ANS_LOG_TAB_SIZE)
+#define ANS_TAB_MASK (ANS_TAB_SIZE - 1)
+
+// Largest possible symbol to be encoded by either ANS or prefix coding.
+#define PREFIX_MAX_ALPHABET_SIZE 4096
+#define ANS_MAX_ALPHABET_SIZE 256
+
+// Max number of bits for prefix coding.
+#define PREFIX_MAX_BITS 15
+
+#define ANS_SIGNATURE 0x13  // Initial state, used as CRC.
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ANS_PARAMS_H_
diff --git a/third_party/jpeg-xl/lib/jxl/ans_test.cc b/third_party/jpeg-xl/lib/jxl/ans_test.cc
new file mode 100644
index 0000000000..06bc46477f
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/ans_test.cc
@@ -0,0 +1,278 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <vector>
+
+#include "lib/jxl/ans_params.h"
+#include "lib/jxl/base/random.h"
+#include "lib/jxl/base/span.h"
+#include "lib/jxl/dec_ans.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/enc_ans.h"
+#include "lib/jxl/enc_aux_out.h"
+#include "lib/jxl/enc_bit_writer.h"
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+namespace {
+
+void RoundtripTestcase(int n_histograms, int alphabet_size,
+                       const std::vector<Token>& input_values) {
+  constexpr uint16_t kMagic1 = 0x9e33;
+  constexpr uint16_t kMagic2 = 0x8b04;
+
+  BitWriter writer;
+  // Space for magic bytes.
+  BitWriter::Allotment allotment_magic1(&writer, 16);
+  writer.Write(16, kMagic1);
+  allotment_magic1.ReclaimAndCharge(&writer, 0, nullptr);
+
+  std::vector<uint8_t> context_map;
+  EntropyEncodingData codes;
+  std::vector<std::vector<Token>> input_values_vec;
+  input_values_vec.push_back(input_values);
+
+  BuildAndEncodeHistograms(HistogramParams(), n_histograms, input_values_vec,
+                           &codes, &context_map, &writer, 0, nullptr);
+  WriteTokens(input_values_vec[0], codes, context_map, &writer, 0, nullptr);
+
+  // Magic bytes + padding
+  BitWriter::Allotment allotment_magic2(&writer, 24);
+  writer.Write(16, kMagic2);
+  writer.ZeroPadToByte();
+  allotment_magic2.ReclaimAndCharge(&writer, 0, nullptr);
+
+  // We do not truncate the output. Reading past the end reads out zeroes
+  // anyway.
+  BitReader br(writer.GetSpan());
+
+  ASSERT_EQ(br.ReadBits(16), kMagic1);
+
+  std::vector<uint8_t> dec_context_map;
+  ANSCode decoded_codes;
+  ASSERT_TRUE(
+      DecodeHistograms(&br, n_histograms, &decoded_codes, &dec_context_map));
+  ASSERT_EQ(dec_context_map, context_map);
+  ANSSymbolReader reader(&decoded_codes, &br);
+
+  for (const Token& symbol : input_values) {
+    uint32_t read_symbol =
+        reader.ReadHybridUint(symbol.context, &br, dec_context_map);
+    ASSERT_EQ(read_symbol, symbol.value);
+  }
+  ASSERT_TRUE(reader.CheckANSFinalState());
+
+  ASSERT_EQ(br.ReadBits(16), kMagic2);
+  EXPECT_TRUE(br.Close());
+}
+
+TEST(ANSTest, EmptyRoundtrip) {
+  RoundtripTestcase(2, ANS_MAX_ALPHABET_SIZE, std::vector<Token>());
+}
+
+TEST(ANSTest, SingleSymbolRoundtrip) {
+  for (uint32_t i = 0; i < ANS_MAX_ALPHABET_SIZE; i++) {
+    RoundtripTestcase(2, ANS_MAX_ALPHABET_SIZE, {{0, i}});
+  }
+  for (uint32_t i = 0; i < ANS_MAX_ALPHABET_SIZE; i++) {
+    RoundtripTestcase(2, ANS_MAX_ALPHABET_SIZE,
+                      std::vector<Token>(1024, {0, i}));
+  }
+}
+
+#if defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER) || \
+    defined(THREAD_SANITIZER)
+constexpr size_t kReps = 3;
+#else
+constexpr size_t kReps = 10;
+#endif
+
+void RoundtripRandomStream(int alphabet_size, size_t reps = kReps,
+                           size_t num = 1 << 18) {
+  constexpr int kNumHistograms = 3;
+  Rng rng(0);
+  for (size_t i = 0; i < reps; i++) {
+    std::vector<Token> symbols;
+    for (size_t j = 0; j < num; j++) {
+      int context = rng.UniformI(0, kNumHistograms);
+      int value = rng.UniformU(0, alphabet_size);
+      symbols.emplace_back(context, value);
+    }
+    RoundtripTestcase(kNumHistograms, alphabet_size, symbols);
+  }
+}
+
+void RoundtripRandomUnbalancedStream(int alphabet_size) {
+  constexpr int kNumHistograms = 3;
+  constexpr int kPrecision = 1 << 10;
+  Rng rng(0);
+  for (size_t i = 0; i < kReps; i++) {
+    std::vector<int> distributions[kNumHistograms] = {};
+    for (int j = 0; j < kNumHistograms; j++) {
+      distributions[j].resize(kPrecision);
+      int symbol = 0;
+      int remaining = 1;
+      for (int k = 0; k < kPrecision; k++) {
+        if (remaining == 0) {
+          if (symbol < alphabet_size - 1) symbol++;
+          // There is no meaning behind this distribution: it's anything that
+          // will create a nonuniform distribution and won't have too few
+          // symbols usually. Also we want different distributions we get to be
+          // sufficiently dissimilar.
+          remaining = rng.UniformU(0, kPrecision - k + 1);
+        }
+        distributions[j][k] = symbol;
+        remaining--;
+      }
+    }
+    std::vector<Token> symbols;
+    for (int j = 0; j < 1 << 18; j++) {
+      int context = rng.UniformI(0, kNumHistograms);
+      int value = rng.UniformU(0, kPrecision);
+      symbols.emplace_back(context, value);
+    }
+    RoundtripTestcase(kNumHistograms + 1, alphabet_size, symbols);
+  }
+}
+
+TEST(ANSTest, RandomStreamRoundtrip3Small) { RoundtripRandomStream(3, 1, 16); }
+
+TEST(ANSTest, RandomStreamRoundtrip3) { RoundtripRandomStream(3); }
+
+TEST(ANSTest, RandomStreamRoundtripBig) {
+  RoundtripRandomStream(ANS_MAX_ALPHABET_SIZE);
+}
+
+TEST(ANSTest, RandomUnbalancedStreamRoundtrip3) {
+  RoundtripRandomUnbalancedStream(3);
+}
+
+TEST(ANSTest, RandomUnbalancedStreamRoundtripBig) {
+  RoundtripRandomUnbalancedStream(ANS_MAX_ALPHABET_SIZE);
+}
+
+TEST(ANSTest, UintConfigRoundtrip) {
+  for (size_t log_alpha_size = 5; log_alpha_size <= 8; log_alpha_size++) {
+    std::vector<HybridUintConfig> uint_config, uint_config_dec;
+    for (size_t i = 0; i < log_alpha_size; i++) {
+      for (size_t j = 0; j <= i; j++) {
+        for (size_t k = 0; k <= i - j; k++) {
+          uint_config.emplace_back(i, j, k);
+        }
+      }
+    }
+    uint_config.emplace_back(log_alpha_size, 0, 0);
+    uint_config_dec.resize(uint_config.size());
+    BitWriter writer;
+    BitWriter::Allotment allotment(&writer, 10 * uint_config.size());
+    EncodeUintConfigs(uint_config, &writer, log_alpha_size);
+    allotment.ReclaimAndCharge(&writer, 0, nullptr);
+    writer.ZeroPadToByte();
+    BitReader br(writer.GetSpan());
+    EXPECT_TRUE(DecodeUintConfigs(log_alpha_size, &uint_config_dec, &br));
+    EXPECT_TRUE(br.Close());
+    for (size_t i = 0; i < uint_config.size(); i++) {
+      EXPECT_EQ(uint_config[i].split_token, uint_config_dec[i].split_token);
+      EXPECT_EQ(uint_config[i].msb_in_token, uint_config_dec[i].msb_in_token);
+      EXPECT_EQ(uint_config[i].lsb_in_token, uint_config_dec[i].lsb_in_token);
+    }
+  }
+}
+
+void TestCheckpointing(bool ans, bool lz77) {
+  std::vector<std::vector<Token>> input_values(1);
+  for (size_t i = 0; i < 1024; i++) {
+    input_values[0].push_back(Token(0, i % 4));
+  }
+  // up to lz77 window size.
+  for (size_t i = 0; i < (1 << 20) - 1022; i++) {
+    input_values[0].push_back(Token(0, (i % 5) + 4));
+  }
+  // Ensure that when the window wraps around, new values are different.
+  input_values[0].push_back(Token(0, 0));
+  for (size_t i = 0; i < 1024; i++) {
+    input_values[0].push_back(Token(0, i % 4));
+  }
+
+  std::vector<uint8_t> context_map;
+  EntropyEncodingData codes;
+  HistogramParams params;
+  params.lz77_method = lz77 ? HistogramParams::LZ77Method::kLZ77
+                            : HistogramParams::LZ77Method::kNone;
+  params.force_huffman = !ans;
+
+  BitWriter writer;
+  {
+    auto input_values_copy = input_values;
+    BuildAndEncodeHistograms(params, 1, input_values_copy, &codes, &context_map,
+                             &writer, 0, nullptr);
+    WriteTokens(input_values_copy[0], codes, context_map, &writer, 0, nullptr);
+    writer.ZeroPadToByte();
+  }
+
+  // We do not truncate the output. Reading past the end reads out zeroes
+  // anyway.
+  BitReader br(writer.GetSpan());
+  Status status = true;
+  {
+    BitReaderScopedCloser bc(&br, &status);
+
+    std::vector<uint8_t> dec_context_map;
+    ANSCode decoded_codes;
+    ASSERT_TRUE(DecodeHistograms(&br, 1, &decoded_codes, &dec_context_map));
+    ASSERT_EQ(dec_context_map, context_map);
+    ANSSymbolReader reader(&decoded_codes, &br);
+
+    ANSSymbolReader::Checkpoint checkpoint;
+    size_t br_pos = 0;
+    constexpr size_t kInterval = ANSSymbolReader::kMaxCheckpointInterval - 2;
+    for (size_t i = 0; i < input_values[0].size(); i++) {
+      if (i % kInterval == 0 && i > 0) {
+        reader.Restore(checkpoint);
+        ASSERT_TRUE(br.Close());
+        br = BitReader(writer.GetSpan());
+        br.SkipBits(br_pos);
+        for (size_t j = i - kInterval; j < i; j++) {
+          Token symbol = input_values[0][j];
+          uint32_t read_symbol =
+              reader.ReadHybridUint(symbol.context, &br, dec_context_map);
+          ASSERT_EQ(read_symbol, symbol.value) << "j = " << j;
+        }
+      }
+      if (i % kInterval == 0) {
+        reader.Save(&checkpoint);
+        br_pos = br.TotalBitsConsumed();
+      }
+      Token symbol = input_values[0][i];
+      uint32_t read_symbol =
+          reader.ReadHybridUint(symbol.context, &br, dec_context_map);
+      ASSERT_EQ(read_symbol, symbol.value) << "i = " << i;
+    }
+    ASSERT_TRUE(reader.CheckANSFinalState());
+  }
+  EXPECT_TRUE(status);
+}
+
+TEST(ANSTest, TestCheckpointingANS) {
+  TestCheckpointing(/*ans=*/true, /*lz77=*/false);
+}
+
+TEST(ANSTest, TestCheckpointingPrefix) {
+  TestCheckpointing(/*ans=*/false, /*lz77=*/false);
+}
+
+TEST(ANSTest, TestCheckpointingANSLZ77) {
+  TestCheckpointing(/*ans=*/true, /*lz77=*/true);
+}
+
+TEST(ANSTest, TestCheckpointingPrefixLZ77) {
+  TestCheckpointing(/*ans=*/false, /*lz77=*/true);
+}
+
+}  // namespace
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/base/arch_macros.h b/third_party/jpeg-xl/lib/jxl/base/arch_macros.h
new file mode 100644
index 0000000000..a98301915e
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/base/arch_macros.h
@@ -0,0 +1,33 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_BASE_ARCH_MACROS_H_
+#define LIB_JXL_BASE_ARCH_MACROS_H_
+
+// Defines the JXL_ARCH_* macros.
+
+namespace jxl {
+
+#if defined(__x86_64__) || defined(_M_X64)
+#define JXL_ARCH_X64 1
+#else
+#define JXL_ARCH_X64 0
+#endif
+
+#if defined(__powerpc64__) || defined(_M_PPC)
+#define JXL_ARCH_PPC 1
+#else
+#define JXL_ARCH_PPC 0
+#endif
+
+#if defined(__aarch64__) || defined(__arm__)
+#define JXL_ARCH_ARM 1
+#else
+#define JXL_ARCH_ARM 0
+#endif
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_BASE_ARCH_MACROS_H_
diff --git a/third_party/jpeg-xl/lib/jxl/base/bits.h b/third_party/jpeg-xl/lib/jxl/base/bits.h
new file mode 100644
index 0000000000..9f86118e72
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/base/bits.h
@@ -0,0 +1,147 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_BASE_BITS_H_
+#define LIB_JXL_BASE_BITS_H_
+
+// Specialized instructions for processing register-sized bit arrays.
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/status.h"
+
+#if JXL_COMPILER_MSVC
+#include <intrin.h>
+#endif
+
+#include <stddef.h>
+#include <stdint.h>
+
+namespace jxl {
+
+// Empty struct used as a size tag type.
+template <size_t N>
+struct SizeTag {};
+
+template <typename T>
+constexpr bool IsSigned() {
+  return T(0) > T(-1);
+}
+
+// Undefined results for x == 0.
+static JXL_INLINE JXL_MAYBE_UNUSED size_t
+Num0BitsAboveMS1Bit_Nonzero(SizeTag<4> /* tag */, const uint32_t x) {
+  JXL_DASSERT(x != 0);
+#if JXL_COMPILER_MSVC
+  unsigned long index;
+  _BitScanReverse(&index, x);
+  return 31 - index;
+#else
+  return static_cast<size_t>(__builtin_clz(x));
+#endif
+}
+static JXL_INLINE JXL_MAYBE_UNUSED size_t
+Num0BitsAboveMS1Bit_Nonzero(SizeTag<8> /* tag */, const uint64_t x) {
+  JXL_DASSERT(x != 0);
+#if JXL_COMPILER_MSVC
+#if JXL_ARCH_X64
+  unsigned long index;
+  _BitScanReverse64(&index, x);
+  return 63 - index;
+#else   // JXL_ARCH_X64
+  // _BitScanReverse64 not available
+  uint32_t msb = static_cast<uint32_t>(x >> 32u);
+  unsigned long index;
+  if (msb == 0) {
+    uint32_t lsb = static_cast<uint32_t>(x & 0xFFFFFFFF);
+    _BitScanReverse(&index, lsb);
+    return 63 - index;
+  } else {
+    _BitScanReverse(&index, msb);
+    return 31 - index;
+  }
+#endif  // JXL_ARCH_X64
+#else
+  return static_cast<size_t>(__builtin_clzll(x));
+#endif
+}
+template <typename T>
+static JXL_INLINE JXL_MAYBE_UNUSED size_t
+Num0BitsAboveMS1Bit_Nonzero(const T x) {
+  static_assert(!IsSigned<T>(), "Num0BitsAboveMS1Bit_Nonzero: use unsigned");
+  return Num0BitsAboveMS1Bit_Nonzero(SizeTag<sizeof(T)>(), x);
+}
+
+// Undefined results for x == 0.
+static JXL_INLINE JXL_MAYBE_UNUSED size_t
+Num0BitsBelowLS1Bit_Nonzero(SizeTag<4> /* tag */, const uint32_t x) {
+  JXL_DASSERT(x != 0);
+#if JXL_COMPILER_MSVC
+  unsigned long index;
+  _BitScanForward(&index, x);
+  return index;
+#else
+  return static_cast<size_t>(__builtin_ctz(x));
+#endif
+}
+static JXL_INLINE JXL_MAYBE_UNUSED size_t
+Num0BitsBelowLS1Bit_Nonzero(SizeTag<8> /* tag */, const uint64_t x) {
+  JXL_DASSERT(x != 0);
+#if JXL_COMPILER_MSVC
+#if JXL_ARCH_X64
+  unsigned long index;
+  _BitScanForward64(&index, x);
+  return index;
+#else   // JXL_ARCH_64
+  // _BitScanForward64 not available
+  uint32_t lsb = static_cast<uint32_t>(x & 0xFFFFFFFF);
+  unsigned long index;
+  if (lsb == 0) {
+    uint32_t msb = static_cast<uint32_t>(x >> 32u);
+    _BitScanForward(&index, msb);
+    return 32 + index;
+  } else {
+    _BitScanForward(&index, lsb);
+    return index;
+  }
+#endif  // JXL_ARCH_X64
+#else
+  return static_cast<size_t>(__builtin_ctzll(x));
+#endif
+}
+template <typename T>
+static JXL_INLINE JXL_MAYBE_UNUSED size_t Num0BitsBelowLS1Bit_Nonzero(T x) {
+  static_assert(!IsSigned<T>(), "Num0BitsBelowLS1Bit_Nonzero: use unsigned");
+  return Num0BitsBelowLS1Bit_Nonzero(SizeTag<sizeof(T)>(), x);
+}
+
+// Returns bit width for x == 0.
+template <typename T>
+static JXL_INLINE JXL_MAYBE_UNUSED size_t Num0BitsAboveMS1Bit(const T x) {
+  return (x == 0) ? sizeof(T) * 8 : Num0BitsAboveMS1Bit_Nonzero(x);
+}
+
+// Returns bit width for x == 0.
+template <typename T>
+static JXL_INLINE JXL_MAYBE_UNUSED size_t Num0BitsBelowLS1Bit(const T x) {
+  return (x == 0) ? sizeof(T) * 8 : Num0BitsBelowLS1Bit_Nonzero(x);
+}
+
+// Returns base-2 logarithm, rounded down.
+template <typename T>
+static JXL_INLINE JXL_MAYBE_UNUSED size_t FloorLog2Nonzero(const T x) {
+  return (sizeof(T) * 8 - 1) ^ Num0BitsAboveMS1Bit_Nonzero(x);
+}
+
+// Returns base-2 logarithm, rounded up.
+template <typename T>
+static JXL_INLINE JXL_MAYBE_UNUSED size_t CeilLog2Nonzero(const T x) {
+  const size_t floor_log2 = FloorLog2Nonzero(x);
+  if ((x & (x - 1)) == 0) return floor_log2;  // power of two
+  return floor_log2 + 1;
+}
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_BASE_BITS_H_
diff --git a/third_party/jpeg-xl/lib/jxl/base/byte_order.h b/third_party/jpeg-xl/lib/jxl/base/byte_order.h
new file mode 100644
index 0000000000..8966834e08
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/base/byte_order.h
@@ -0,0 +1,274 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_BASE_BYTE_ORDER_H_
+#define LIB_JXL_BASE_BYTE_ORDER_H_
+
+#include <jxl/types.h>
+#include <stdint.h>
+#include <string.h>  // memcpy
+
+#include "lib/jxl/base/compiler_specific.h"
+
+#if JXL_COMPILER_MSVC
+#include <intrin.h>  // _byteswap_*
+#endif
+
+#if (defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__))
+#define JXL_BYTE_ORDER_LITTLE 1
+#else
+// This means that we don't know that the byte order is little endian, in
+// this case we use endian-neutral code that works for both little- and
+// big-endian.
+#define JXL_BYTE_ORDER_LITTLE 0
+#endif
+
+// Returns whether the system is little-endian (least-significant byte first).
+#if JXL_BYTE_ORDER_LITTLE
+static constexpr bool IsLittleEndian() { return true; }
+#else
+static inline bool IsLittleEndian() {
+  const uint32_t multibyte = 1;
+  uint8_t byte;
+  memcpy(&byte, &multibyte, 1);
+  return byte == 1;
+}
+#endif
+
+static inline bool SwapEndianness(JxlEndianness endianness) {
+  return ((endianness == JXL_BIG_ENDIAN && IsLittleEndian()) ||
+          (endianness == JXL_LITTLE_ENDIAN && !IsLittleEndian()));
+}
+
+#if JXL_COMPILER_MSVC
+#define JXL_BSWAP16(x) _byteswap_ushort(x)
+#define JXL_BSWAP32(x) _byteswap_ulong(x)
+#define JXL_BSWAP64(x) _byteswap_uint64(x)
+#else
+#define JXL_BSWAP16(x) __builtin_bswap16(x)
+#define JXL_BSWAP32(x) __builtin_bswap32(x)
+#define JXL_BSWAP64(x) __builtin_bswap64(x)
+#endif
+
+static JXL_INLINE uint32_t LoadBE16(const uint8_t* p) {
+  const uint32_t byte1 = p[0];
+  const uint32_t byte0 = p[1];
+  return (byte1 << 8) | byte0;
+}
+
+static JXL_INLINE uint32_t LoadLE16(const uint8_t* p) {
+  const uint32_t byte0 = p[0];
+  const uint32_t byte1 = p[1];
+  return (byte1 << 8) | byte0;
+}
+
+static JXL_INLINE uint32_t LoadBE32(const uint8_t* p) {
+#if JXL_BYTE_ORDER_LITTLE
+  uint32_t big;
+  memcpy(&big, p, 4);
+  return JXL_BSWAP32(big);
+#else
+  // Byte-order-independent - can't assume this machine is big endian.
+  const uint32_t byte3 = p[0];
+  const uint32_t byte2 = p[1];
+  const uint32_t byte1 = p[2];
+  const uint32_t byte0 = p[3];
+  return (byte3 << 24) | (byte2 << 16) | (byte1 << 8) | byte0;
+#endif
+}
+
+static JXL_INLINE uint64_t LoadBE64(const uint8_t* p) {
+#if JXL_BYTE_ORDER_LITTLE
+  uint64_t big;
+  memcpy(&big, p, 8);
+  return JXL_BSWAP64(big);
+#else
+  // Byte-order-independent - can't assume this machine is big endian.
+  const uint64_t byte7 = p[0];
+  const uint64_t byte6 = p[1];
+  const uint64_t byte5 = p[2];
+  const uint64_t byte4 = p[3];
+  const uint64_t byte3 = p[4];
+  const uint64_t byte2 = p[5];
+  const uint64_t byte1 = p[6];
+  const uint64_t byte0 = p[7];
+  return (byte7 << 56ull) | (byte6 << 48ull) | (byte5 << 40ull) |
+         (byte4 << 32ull) | (byte3 << 24ull) | (byte2 << 16ull) |
+         (byte1 << 8ull) | byte0;
+#endif
+}
+
+static JXL_INLINE uint32_t LoadLE32(const uint8_t* p) {
+#if JXL_BYTE_ORDER_LITTLE
+  uint32_t little;
+  memcpy(&little, p, 4);
+  return little;
+#else
+  // Byte-order-independent - can't assume this machine is big endian.
+  const uint32_t byte0 = p[0];
+  const uint32_t byte1 = p[1];
+  const uint32_t byte2 = p[2];
+  const uint32_t byte3 = p[3];
+  return (byte3 << 24) | (byte2 << 16) | (byte1 << 8) | byte0;
+#endif
+}
+
+static JXL_INLINE uint64_t LoadLE64(const uint8_t* p) {
+#if JXL_BYTE_ORDER_LITTLE
+  uint64_t little;
+  memcpy(&little, p, 8);
+  return little;
+#else
+  // Byte-order-independent - can't assume this machine is big endian.
+  const uint64_t byte0 = p[0];
+  const uint64_t byte1 = p[1];
+  const uint64_t byte2 = p[2];
+  const uint64_t byte3 = p[3];
+  const uint64_t byte4 = p[4];
+  const uint64_t byte5 = p[5];
+  const uint64_t byte6 = p[6];
+  const uint64_t byte7 = p[7];
+  return (byte7 << 56) | (byte6 << 48) | (byte5 << 40) | (byte4 << 32) |
+         (byte3 << 24) | (byte2 << 16) | (byte1 << 8) | byte0;
+#endif
+}
+
+// Loads a Big-Endian float
+static JXL_INLINE float LoadBEFloat(const uint8_t* p) {
+  uint32_t u = LoadBE32(p);
+  float result;
+  memcpy(&result, &u, 4);
+  return result;
+}
+
+// Loads a Little-Endian float
+static JXL_INLINE float LoadLEFloat(const uint8_t* p) {
+  uint32_t u = LoadLE32(p);
+  float result;
+  memcpy(&result, &u, 4);
+  return result;
+}
+
+static JXL_INLINE void StoreBE16(const uint32_t native, uint8_t* p) {
+  p[0] = (native >> 8) & 0xFF;
+  p[1] = native & 0xFF;
+}
+
+static JXL_INLINE void StoreLE16(const uint32_t native, uint8_t* p) {
+  p[1] = (native >> 8) & 0xFF;
+  p[0] = native & 0xFF;
+}
+
+static JXL_INLINE void StoreBE32(const uint32_t native, uint8_t* p) {
+#if JXL_BYTE_ORDER_LITTLE
+  const uint32_t big = JXL_BSWAP32(native);
+  memcpy(p, &big, 4);
+#else
+  // Byte-order-independent - can't assume this machine is big endian.
+  p[0] = native >> 24;
+  p[1] = (native >> 16) & 0xFF;
+  p[2] = (native >> 8) & 0xFF;
+  p[3] = native & 0xFF;
+#endif
+}
+
+static JXL_INLINE void StoreBE64(const uint64_t native, uint8_t* p) {
+#if JXL_BYTE_ORDER_LITTLE
+  const uint64_t big = JXL_BSWAP64(native);
+  memcpy(p, &big, 8);
+#else
+  // Byte-order-independent - can't assume this machine is big endian.
+  p[0] = native >> 56ull;
+  p[1] = (native >> 48ull) & 0xFF;
+  p[2] = (native >> 40ull) & 0xFF;
+  p[3] = (native >> 32ull) & 0xFF;
+  p[4] = (native >> 24ull) & 0xFF;
+  p[5] = (native >> 16ull) & 0xFF;
+  p[6] = (native >> 8ull) & 0xFF;
+  p[7] = native & 0xFF;
+#endif
+}
+
+static JXL_INLINE void StoreLE32(const uint32_t native, uint8_t* p) {
+#if JXL_BYTE_ORDER_LITTLE
+  const uint32_t little = native;
+  memcpy(p, &little, 4);
+#else
+  // Byte-order-independent - can't assume this machine is big endian.
+  p[3] = native >> 24;
+  p[2] = (native >> 16) & 0xFF;
+  p[1] = (native >> 8) & 0xFF;
+  p[0] = native & 0xFF;
+#endif
+}
+
+static JXL_INLINE void StoreLE64(const uint64_t native, uint8_t* p) {
+#if JXL_BYTE_ORDER_LITTLE
+  const uint64_t little = native;
+  memcpy(p, &little, 8);
+#else
+  // Byte-order-independent - can't assume this machine is big endian.
+  p[7] = native >> 56;
+  p[6] = (native >> 48) & 0xFF;
+  p[5] = (native >> 40) & 0xFF;
+  p[4] = (native >> 32) & 0xFF;
+  p[3] = (native >> 24) & 0xFF;
+  p[2] = (native >> 16) & 0xFF;
+  p[1] = (native >> 8) & 0xFF;
+  p[0] = native & 0xFF;
+#endif
+}
+
+static JXL_INLINE float BSwapFloat(float x) {
+  uint32_t u;
+  memcpy(&u, &x, 4);
+  uint32_t uswap = JXL_BSWAP32(u);
+  float xswap;
+  memcpy(&xswap, &uswap, 4);
+  return xswap;
+}
+
+// Big/Little Endian order.
+struct OrderBE {};
+struct OrderLE {};
+
+// Wrappers for calling from generic code.
+static JXL_INLINE void Store16(OrderBE /*tag*/, const uint32_t native,
+                               uint8_t* p) {
+  return StoreBE16(native, p);
+}
+
+static JXL_INLINE void Store16(OrderLE /*tag*/, const uint32_t native,
+                               uint8_t* p) {
+  return StoreLE16(native, p);
+}
+
+static JXL_INLINE void Store32(OrderBE /*tag*/, const uint32_t native,
+                               uint8_t* p) {
+  return StoreBE32(native, p);
+}
+
+static JXL_INLINE void Store32(OrderLE /*tag*/, const uint32_t native,
+                               uint8_t* p) {
+  return StoreLE32(native, p);
+}
+
+static JXL_INLINE uint32_t Load16(OrderBE /*tag*/, const uint8_t* p) {
+  return LoadBE16(p);
+}
+
+static JXL_INLINE uint32_t Load16(OrderLE /*tag*/, const uint8_t* p) {
+  return LoadLE16(p);
+}
+
+static JXL_INLINE uint32_t Load32(OrderBE /*tag*/, const uint8_t* p) {
+  return LoadBE32(p);
+}
+
+static JXL_INLINE uint32_t Load32(OrderLE /*tag*/, const uint8_t* p) {
+  return LoadLE32(p);
+}
+
+#endif  // LIB_JXL_BASE_BYTE_ORDER_H_
diff --git a/third_party/jpeg-xl/lib/jxl/base/cache_aligned.cc b/third_party/jpeg-xl/lib/jxl/base/cache_aligned.cc
new file mode 100644
index 0000000000..9a9cc585a1
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/base/cache_aligned.cc
@@ -0,0 +1,157 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/base/cache_aligned.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+
+// Disabled: slower than malloc + alignment.
+#define JXL_USE_MMAP 0
+
+#if JXL_USE_MMAP
+#include <sys/mman.h>
+#endif
+
+#include <algorithm>  // std::max
+#include <atomic>
+#include <hwy/base.h>  // kMaxVectorSize
+#include <limits>
+
+#include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/base/status.h"
+
+namespace jxl {
+namespace {
+
+#pragma pack(push, 1)
+struct AllocationHeader {
+  void* allocated;
+  size_t allocated_size;
+  uint8_t left_padding[hwy::kMaxVectorSize];
+};
+#pragma pack(pop)
+
+std::atomic<uint64_t> num_allocations{0};
+std::atomic<uint64_t> bytes_in_use{0};
+std::atomic<uint64_t> max_bytes_in_use{0};
+
+}  // namespace
+
+// Avoids linker errors in pre-C++17 builds.
+constexpr size_t CacheAligned::kPointerSize;
+constexpr size_t CacheAligned::kCacheLineSize;
+constexpr size_t CacheAligned::kAlignment;
+constexpr size_t CacheAligned::kAlias;
+
+void CacheAligned::PrintStats() {
+  fprintf(
+      stderr, "Allocations: %" PRIuS " (max bytes in use: %E)\n",
+      static_cast<size_t>(num_allocations.load(std::memory_order_relaxed)),
+      static_cast<double>(max_bytes_in_use.load(std::memory_order_relaxed)));
+}
+
+size_t CacheAligned::NextOffset() {
+  static std::atomic<uint32_t> next{0};
+  constexpr uint32_t kGroups = CacheAligned::kAlias / CacheAligned::kAlignment;
+  const uint32_t group = next.fetch_add(1, std::memory_order_relaxed) % kGroups;
+  return CacheAligned::kAlignment * group;
+}
+
+void* CacheAligned::Allocate(const size_t payload_size, size_t offset) {
+  JXL_ASSERT(payload_size <= std::numeric_limits<size_t>::max() / 2);
+  JXL_ASSERT((offset % kAlignment == 0) && offset <= kAlias);
+
+  // What: | misalign | unused | AllocationHeader |payload
+  // Size: |<= kAlias | offset |                  |payload_size
+  //       ^allocated.^aligned.^header............^payload
+  // The header must immediately precede payload, which must remain aligned.
+  // To avoid wasting space, the header resides at the end of `unused`,
+  // which therefore cannot be empty (offset == 0).
+  if (offset == 0) {
+    // SVE/RVV vectors can be large, so we cannot rely on them (including the
+    // padding at the end of AllocationHeader) to fit in kAlignment.
+    offset = hwy::RoundUpTo(sizeof(AllocationHeader), kAlignment);
+  }
+
+#if JXL_USE_MMAP
+  const size_t allocated_size = offset + payload_size;
+  const int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE;
+  void* allocated =
+      mmap(nullptr, allocated_size, PROT_READ | PROT_WRITE, flags, -1, 0);
+  if (allocated == MAP_FAILED) return nullptr;
+  const uintptr_t aligned = reinterpret_cast<uintptr_t>(allocated);
+#else
+  const size_t allocated_size = kAlias + offset + payload_size;
+  void* allocated = malloc(allocated_size);
+  if (allocated == nullptr) return nullptr;
+  // Always round up even if already aligned - we already asked for kAlias
+  // extra bytes and there's no way to give them back.
+  uintptr_t aligned = reinterpret_cast<uintptr_t>(allocated) + kAlias;
+  static_assert((kAlias & (kAlias - 1)) == 0, "kAlias must be a power of 2");
+  static_assert(kAlias >= kAlignment, "Cannot align to more than kAlias");
+  aligned &= ~(kAlias - 1);
+#endif
+
+#if 0
+  // No effect.
+  uintptr_t page_aligned = reinterpret_cast<uintptr_t>(allocated);
+  page_aligned &= ~(4096 - 1);
+  if (madvise(reinterpret_cast<void*>(page_aligned), allocated_size,
+              MADV_WILLNEED) != 0) {
+    JXL_NOTIFY_ERROR("madvise failed");
+  }
+#elif 0
+  // INCREASES both first and subsequent decode times.
+  if (mlock(allocated, allocated_size) != 0) {
+    JXL_NOTIFY_ERROR("mlock failed");
+  }
+#endif
+
+  // Update statistics (#allocations and max bytes in use)
+  num_allocations.fetch_add(1, std::memory_order_relaxed);
+  const uint64_t prev_bytes =
+      bytes_in_use.fetch_add(allocated_size, std::memory_order_acq_rel);
+  uint64_t expected_max = max_bytes_in_use.load(std::memory_order_acquire);
+  for (;;) {
+    const uint64_t desired =
+        std::max(expected_max, prev_bytes + allocated_size);
+    if (max_bytes_in_use.compare_exchange_strong(expected_max, desired,
+                                                 std::memory_order_acq_rel)) {
+      break;
+    }
+  }
+
+  const uintptr_t payload = aligned + offset;  // still aligned
+
+  // Stash `allocated` and payload_size inside header for use by Free().
+  AllocationHeader* header = reinterpret_cast<AllocationHeader*>(payload) - 1;
+  header->allocated = allocated;
+  header->allocated_size = allocated_size;
+
+  return JXL_ASSUME_ALIGNED(reinterpret_cast<void*>(payload), 64);
+}
+
+void CacheAligned::Free(const void* aligned_pointer) {
+  if (aligned_pointer == nullptr) {
+    return;
+  }
+  const uintptr_t payload = reinterpret_cast<uintptr_t>(aligned_pointer);
+  JXL_ASSERT(payload % kAlignment == 0);
+  const AllocationHeader* header =
+      reinterpret_cast<const AllocationHeader*>(payload) - 1;
+
+  // Subtract (2's complement negation).
+  bytes_in_use.fetch_add(~header->allocated_size + 1,
+                         std::memory_order_acq_rel);
+
+#if JXL_USE_MMAP
+  munmap(header->allocated, header->allocated_size);
+#else
+  free(header->allocated);
+#endif
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/base/cache_aligned.h b/third_party/jpeg-xl/lib/jxl/base/cache_aligned.h
new file mode 100644
index 0000000000..e57df14837
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/base/cache_aligned.h
@@ -0,0 +1,74 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_BASE_CACHE_ALIGNED_H_
+#define LIB_JXL_BASE_CACHE_ALIGNED_H_
+
+// Memory allocator with support for alignment + misalignment.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <memory>
+
+#include "lib/jxl/base/compiler_specific.h"
+
+namespace jxl {
+
+// Functions that depend on the cache line size.
+class CacheAligned {
+ public:
+  static void PrintStats();
+
+  static constexpr size_t kPointerSize = sizeof(void*);
+  static constexpr size_t kCacheLineSize = 64;
+  // To avoid RFOs, match L2 fill size (pairs of lines).
+  static constexpr size_t kAlignment = 2 * kCacheLineSize;
+  // Minimum multiple for which cache set conflicts and/or loads blocked by
+  // preceding stores can occur.
+  static constexpr size_t kAlias = 2048;
+
+  // Returns a 'random' (cyclical) offset suitable for Allocate.
+  static size_t NextOffset();
+
+  // Returns null or memory whose address is congruent to `offset` (mod kAlias).
+  // This reduces cache conflicts and load/store stalls, especially with large
+  // allocations that would otherwise have similar alignments. At least
+  // `payload_size` (which can be zero) bytes will be accessible.
+  static void* Allocate(size_t payload_size, size_t offset);
+
+  static void* Allocate(const size_t payload_size) {
+    return Allocate(payload_size, NextOffset());
+  }
+
+  static void Free(const void* aligned_pointer);
+};
+
+// Avoids the need for a function pointer (deleter) in CacheAlignedUniquePtr.
+struct CacheAlignedDeleter {
+  void operator()(uint8_t* aligned_pointer) const {
+    return CacheAligned::Free(aligned_pointer);
+  }
+};
+
+using CacheAlignedUniquePtr = std::unique_ptr<uint8_t[], CacheAlignedDeleter>;
+
+// Does not invoke constructors.
+static inline CacheAlignedUniquePtr AllocateArray(const size_t bytes) {
+  return CacheAlignedUniquePtr(
+      static_cast<uint8_t*>(CacheAligned::Allocate(bytes)),
+      CacheAlignedDeleter());
+}
+
+static inline CacheAlignedUniquePtr AllocateArray(const size_t bytes,
+                                                  const size_t offset) {
+  return CacheAlignedUniquePtr(
+      static_cast<uint8_t*>(CacheAligned::Allocate(bytes, offset)),
+      CacheAlignedDeleter());
+}
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_BASE_CACHE_ALIGNED_H_
diff --git a/third_party/jpeg-xl/lib/jxl/base/compiler_specific.h b/third_party/jpeg-xl/lib/jxl/base/compiler_specific.h
new file mode 100644
index 0000000000..abe1261f48
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/base/compiler_specific.h
@@ -0,0 +1,157 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_BASE_COMPILER_SPECIFIC_H_
+#define LIB_JXL_BASE_COMPILER_SPECIFIC_H_
+
+// Macros for compiler version + nonstandard keywords, e.g. __builtin_expect.
+
+#include <stdint.h>
+#include <sys/types.h>
+
+#include "lib/jxl/base/sanitizer_definitions.h"
+
+// #if is shorter and safer than #ifdef. *_VERSION are zero if not detected,
+// otherwise 100 * major + minor version. Note that other packages check for
+// #ifdef COMPILER_MSVC, so we cannot use that same name.
+
+#ifdef _MSC_VER
+#define JXL_COMPILER_MSVC _MSC_VER
+#else
+#define JXL_COMPILER_MSVC 0
+#endif
+
+#ifdef __GNUC__
+#define JXL_COMPILER_GCC (__GNUC__ * 100 + __GNUC_MINOR__)
+#else
+#define JXL_COMPILER_GCC 0
+#endif
+
+#ifdef __clang__
+#define JXL_COMPILER_CLANG (__clang_major__ * 100 + __clang_minor__)
+// Clang pretends to be GCC for compatibility.
+#undef JXL_COMPILER_GCC
+#define JXL_COMPILER_GCC 0
+#else
+#define JXL_COMPILER_CLANG 0
+#endif
+
+#if JXL_COMPILER_MSVC
+#define JXL_RESTRICT __restrict
+#elif JXL_COMPILER_GCC || JXL_COMPILER_CLANG
+#define JXL_RESTRICT __restrict__
+#else
+#define JXL_RESTRICT
+#endif
+
+#if JXL_COMPILER_MSVC
+#define JXL_INLINE __forceinline
+#define JXL_NOINLINE __declspec(noinline)
+#else
+#define JXL_INLINE inline __attribute__((always_inline))
+#define JXL_NOINLINE __attribute__((noinline))
+#endif
+
+#if JXL_COMPILER_MSVC
+#define JXL_NORETURN __declspec(noreturn)
+#elif JXL_COMPILER_GCC || JXL_COMPILER_CLANG
+#define JXL_NORETURN __attribute__((noreturn))
+#else
+#define JXL_NORETURN
+#endif
+
+#if JXL_COMPILER_MSVC
+#define JXL_UNREACHABLE __assume(false)
+#elif JXL_COMPILER_CLANG || JXL_COMPILER_GCC >= 405
+#define JXL_UNREACHABLE __builtin_unreachable()
+#else
+#define JXL_UNREACHABLE
+#endif
+
+#if JXL_COMPILER_MSVC
+#define JXL_MAYBE_UNUSED
+#else
+// Encountered "attribute list cannot appear here" when using the C++17
+// [[maybe_unused]], so only use the old style attribute for now.
+#define JXL_MAYBE_UNUSED __attribute__((unused))
+#endif
+
+// MSAN execution won't hurt if some code it not inlined, but this can greatly
+// improve compilation time. Unfortunately this macro can not be used just
+// everywhere - inside header files it leads to "multiple definition" error;
+// though it would be better not to have JXL_INLINE in header overall.
+#if JXL_MEMORY_SANITIZER || JXL_ADDRESS_SANITIZER || JXL_THREAD_SANITIZER
+#define JXL_MAYBE_INLINE JXL_MAYBE_UNUSED
+#else
+#define JXL_MAYBE_INLINE JXL_INLINE
+#endif
+
+#if JXL_COMPILER_MSVC
+// Unsupported, __assume is not the same.
+#define JXL_LIKELY(expr) expr
+#define JXL_UNLIKELY(expr) expr
+#else
+#define JXL_LIKELY(expr) __builtin_expect(!!(expr), 1)
+#define JXL_UNLIKELY(expr) __builtin_expect(!!(expr), 0)
+#endif
+
+// Returns a void* pointer which the compiler then assumes is N-byte aligned.
+// Example: float* JXL_RESTRICT aligned = (float*)JXL_ASSUME_ALIGNED(in, 32);
+//
+// The assignment semantics are required by GCC/Clang. ICC provides an in-place
+// __assume_aligned, whereas MSVC's __assume appears unsuitable.
+#if JXL_COMPILER_CLANG
+// Early versions of Clang did not support __builtin_assume_aligned.
+#define JXL_HAS_ASSUME_ALIGNED __has_builtin(__builtin_assume_aligned)
+#elif JXL_COMPILER_GCC
+#define JXL_HAS_ASSUME_ALIGNED 1
+#else
+#define JXL_HAS_ASSUME_ALIGNED 0
+#endif
+
+#if JXL_HAS_ASSUME_ALIGNED
+#define JXL_ASSUME_ALIGNED(ptr, align) __builtin_assume_aligned((ptr), (align))
+#else
+#define JXL_ASSUME_ALIGNED(ptr, align) (ptr) /* not supported */
+#endif
+
+#ifdef __has_attribute
+#define JXL_HAVE_ATTRIBUTE(x) __has_attribute(x)
+#else
+#define JXL_HAVE_ATTRIBUTE(x) 0
+#endif
+
+// Raises warnings if the function return value is unused. Should appear as the
+// first part of a function definition/declaration.
+#if JXL_HAVE_ATTRIBUTE(nodiscard)
+#define JXL_MUST_USE_RESULT [[nodiscard]]
+#elif JXL_COMPILER_CLANG && JXL_HAVE_ATTRIBUTE(warn_unused_result)
+#define JXL_MUST_USE_RESULT __attribute__((warn_unused_result))
+#else
+#define JXL_MUST_USE_RESULT
+#endif
+
+// Disable certain -fsanitize flags for functions that are expected to include
+// things like unsigned integer overflow. For example use in the function
+// declaration JXL_NO_SANITIZE("unsigned-integer-overflow") to silence unsigned
+// integer overflow ubsan messages.
+#if JXL_COMPILER_CLANG && JXL_HAVE_ATTRIBUTE(no_sanitize)
+#define JXL_NO_SANITIZE(X) __attribute__((no_sanitize(X)))
+#else
+#define JXL_NO_SANITIZE(X)
+#endif
+
+#if JXL_HAVE_ATTRIBUTE(__format__)
+#define JXL_FORMAT(idx_fmt, idx_arg) \
+  __attribute__((__format__(__printf__, idx_fmt, idx_arg)))
+#else
+#define JXL_FORMAT(idx_fmt, idx_arg)
+#endif
+
+#if JXL_COMPILER_MSVC
+using ssize_t = intptr_t;
+#endif
+
+#endif  // LIB_JXL_BASE_COMPILER_SPECIFIC_H_
diff --git a/third_party/jpeg-xl/lib/jxl/base/data_parallel.cc b/third_party/jpeg-xl/lib/jxl/base/data_parallel.cc
new file mode 100644
index 0000000000..20a911255c
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/base/data_parallel.cc
@@ -0,0 +1,23 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/base/data_parallel.h"
+
+namespace jxl {
+
+// static
+JxlParallelRetCode ThreadPool::SequentialRunnerStatic(
+    void* runner_opaque, void* jpegxl_opaque, JxlParallelRunInit init,
+    JxlParallelRunFunction func, uint32_t start_range, uint32_t end_range) {
+  JxlParallelRetCode init_ret = (*init)(jpegxl_opaque, 1);
+  if (init_ret != 0) return init_ret;
+
+  for (uint32_t i = start_range; i < end_range; i++) {
+    (*func)(jpegxl_opaque, i, 0);
+  }
+  return 0;
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/base/data_parallel.h b/third_party/jpeg-xl/lib/jxl/base/data_parallel.h
new file mode 100644
index 0000000000..ba7e7adfad
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/base/data_parallel.h
@@ -0,0 +1,120 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_BASE_DATA_PARALLEL_H_
+#define LIB_JXL_BASE_DATA_PARALLEL_H_
+
+// Portable, low-overhead C++11 ThreadPool alternative to OpenMP for
+// data-parallel computations.
+
+#include <jxl/parallel_runner.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include "lib/jxl/base/bits.h"
+#include "lib/jxl/base/status.h"
+#if JXL_COMPILER_MSVC
+// suppress warnings about the const & applied to function types
+#pragma warning(disable : 4180)
+#endif
+
+namespace jxl {
+
+class ThreadPool {
+ public:
+  ThreadPool(JxlParallelRunner runner, void* runner_opaque)
+      : runner_(runner ? runner : &ThreadPool::SequentialRunnerStatic),
+        runner_opaque_(runner ? runner_opaque : static_cast<void*>(this)) {}
+
+  ThreadPool(const ThreadPool&) = delete;
+  ThreadPool& operator&(const ThreadPool&) = delete;
+
+  JxlParallelRunner runner() const { return runner_; }
+  void* runner_opaque() const { return runner_opaque_; }
+
+  // Runs init_func(num_threads) followed by data_func(task, thread) on worker
+  // thread(s) for every task in [begin, end). init_func() must return a Status
+  // indicating whether the initialization succeeded.
+  // "thread" is an integer smaller than num_threads.
+  // Not thread-safe - no two calls to Run may overlap.
+  // Subsequent calls will reuse the same threads.
+  //
+  // Precondition: begin <= end.
+  template <class InitFunc, class DataFunc>
+  Status Run(uint32_t begin, uint32_t end, const InitFunc& init_func,
+             const DataFunc& data_func, const char* caller = "") {
+    JXL_ASSERT(begin <= end);
+    if (begin == end) return true;
+    RunCallState<InitFunc, DataFunc> call_state(init_func, data_func);
+    // The runner_ uses the C convention and returns 0 in case of error, so we
+    // convert it to a Status.
+    return (*runner_)(runner_opaque_, static_cast<void*>(&call_state),
+                      &call_state.CallInitFunc, &call_state.CallDataFunc, begin,
+                      end) == 0;
+  }
+
+  // Use this as init_func when no initialization is needed.
+  static Status NoInit(size_t num_threads) { return true; }
+
+ private:
+  // class holding the state of a Run() call to pass to the runner_ as an
+  // opaque_jpegxl pointer.
+  template <class InitFunc, class DataFunc>
+  class RunCallState final {
+   public:
+    RunCallState(const InitFunc& init_func, const DataFunc& data_func)
+        : init_func_(init_func), data_func_(data_func) {}
+
+    // JxlParallelRunInit interface.
+    static int CallInitFunc(void* jpegxl_opaque, size_t num_threads) {
+      const auto* self =
+          static_cast<RunCallState<InitFunc, DataFunc>*>(jpegxl_opaque);
+      // Returns -1 when the internal init function returns false Status to
+      // indicate an error.
+      return self->init_func_(num_threads) ? 0 : -1;
+    }
+
+    // JxlParallelRunFunction interface.
+    static void CallDataFunc(void* jpegxl_opaque, uint32_t value,
+                             size_t thread_id) {
+      const auto* self =
+          static_cast<RunCallState<InitFunc, DataFunc>*>(jpegxl_opaque);
+      return self->data_func_(value, thread_id);
+    }
+
+   private:
+    const InitFunc& init_func_;
+    const DataFunc& data_func_;
+  };
+
+  // Default JxlParallelRunner used when no runner is provided by the
+  // caller. This runner doesn't use any threading and thread_id is always 0.
+  static JxlParallelRetCode SequentialRunnerStatic(
+      void* runner_opaque, void* jpegxl_opaque, JxlParallelRunInit init,
+      JxlParallelRunFunction func, uint32_t start_range, uint32_t end_range);
+
+  // The caller supplied runner function and its opaque void*.
+  const JxlParallelRunner runner_;
+  void* const runner_opaque_;
+};
+
+template <class InitFunc, class DataFunc>
+Status RunOnPool(ThreadPool* pool, const uint32_t begin, const uint32_t end,
+                 const InitFunc& init_func, const DataFunc& data_func,
+                 const char* caller) {
+  if (pool == nullptr) {
+    ThreadPool default_pool(nullptr, nullptr);
+    return default_pool.Run(begin, end, init_func, data_func, caller);
+  } else {
+    return pool->Run(begin, end, init_func, data_func, caller);
+  }
+}
+
+}  // namespace jxl
+#if JXL_COMPILER_MSVC
+#pragma warning(default : 4180)
+#endif
+
+#endif  // LIB_JXL_BASE_DATA_PARALLEL_H_
diff --git a/third_party/jpeg-xl/lib/jxl/base/file_io.h b/third_party/jpeg-xl/lib/jxl/base/file_io.h
new file mode 100644
index 0000000000..64d5860915
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/base/file_io.h
@@ -0,0 +1,153 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_BASE_FILE_IO_H_
+#define LIB_JXL_BASE_FILE_IO_H_
+
+// Helper functions for reading/writing files.
+
+#include <stdio.h>
+#include <sys/stat.h>
+
+#include <list>
+#include <string>
+#include <vector>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/padded_bytes.h"
+#include "lib/jxl/base/status.h"
+
+namespace jxl {
+
+// Returns extension including the dot, or empty string if none. Assumes
+// filename is not a hidden file (e.g. ".bashrc"). May be called with a pathname
+// if the filename contains a dot and/or no other path component does.
+static inline std::string Extension(const std::string& filename) {
+  const size_t pos = filename.rfind('.');
+  if (pos == std::string::npos) return std::string();
+  return filename.substr(pos);
+}
+
+// RAII, ensures files are closed even when returning early.
+class FileWrapper {
+ public:
+  FileWrapper(const FileWrapper& other) = delete;
+  FileWrapper& operator=(const FileWrapper& other) = delete;
+
+  explicit FileWrapper(const std::string& pathname, const char* mode)
+      : file_(pathname == "-" ? (mode[0] == 'r' ? stdin : stdout)
+                              : fopen(pathname.c_str(), mode)),
+        close_on_delete_(pathname != "-") {
+#ifdef _WIN32
+    struct __stat64 s = {};
+    const int err = _stat64(pathname.c_str(), &s);
+    const bool is_file = (s.st_mode & S_IFREG) != 0;
+#else
+    struct stat s = {};
+    const int err = stat(pathname.c_str(), &s);
+    const bool is_file = S_ISREG(s.st_mode);
+#endif
+    if (err == 0 && is_file) {
+      size_ = s.st_size;
+    }
+  }
+
+  ~FileWrapper() {
+    if (file_ != nullptr && close_on_delete_) {
+      const int err = fclose(file_);
+      JXL_CHECK(err == 0);
+    }
+  }
+
+  // We intend to use FileWrapper as a replacement of FILE.
+  // NOLINTNEXTLINE(google-explicit-constructor)
+  operator FILE*() const { return file_; }
+
+  int64_t size() { return size_; }
+
+ private:
+  FILE* const file_;
+  bool close_on_delete_ = true;
+  int64_t size_ = -1;
+};
+
+template <typename ContainerType>
+static inline Status ReadFile(const std::string& pathname,
+                              ContainerType* JXL_RESTRICT bytes) {
+  FileWrapper f(pathname, "rb");
+  if (f == nullptr)
+    return JXL_FAILURE("Failed to open file for reading: %s", pathname.c_str());
+
+  // Get size of file in bytes
+  const int64_t size = f.size();
+  if (size < 0) {
+    // Size is unknown, loop reading chunks until EOF.
+    bytes->clear();
+    std::list<std::vector<uint8_t>> chunks;
+
+    size_t total_size = 0;
+    while (true) {
+      std::vector<uint8_t> chunk(16 * 1024);
+      const size_t bytes_read = fread(chunk.data(), 1, chunk.size(), f);
+      if (ferror(f) || bytes_read > chunk.size()) {
+        return JXL_FAILURE("Error reading %s", pathname.c_str());
+      }
+
+      chunk.resize(bytes_read);
+      total_size += bytes_read;
+      if (bytes_read != 0) {
+        chunks.emplace_back(std::move(chunk));
+      }
+      if (feof(f)) {
+        break;
+      }
+    }
+    bytes->resize(total_size);
+    size_t pos = 0;
+    for (const auto& chunk : chunks) {
+      // Needed in case ContainerType is std::string, whose data() is const.
+      char* bytes_writable = reinterpret_cast<char*>(&(*bytes)[0]);
+      memcpy(bytes_writable + pos, chunk.data(), chunk.size());
+      pos += chunk.size();
+    }
+  } else {
+    // Size is known, read the file directly.
+    bytes->resize(static_cast<size_t>(size));
+    size_t pos = 0;
+    while (pos < bytes->size()) {
+      // Needed in case ContainerType is std::string, whose data() is const.
+      char* bytes_writable = reinterpret_cast<char*>(&(*bytes)[0]);
+      const size_t bytes_read =
+          fread(bytes_writable + pos, 1, bytes->size() - pos, f);
+      if (bytes_read == 0) return JXL_FAILURE("Failed to read");
+      pos += bytes_read;
+    }
+    JXL_ASSERT(pos == bytes->size());
+  }
+  return true;
+}
+
+template <typename ContainerType>
+static inline Status WriteFile(const ContainerType& bytes,
+                               const std::string& pathname) {
+  FileWrapper f(pathname, "wb");
+  if (f == nullptr)
+    return JXL_FAILURE("Failed to open file for writing: %s", pathname.c_str());
+
+  size_t pos = 0;
+  while (pos < bytes.size()) {
+    const size_t bytes_written =
+        fwrite(bytes.data() + pos, 1, bytes.size() - pos, f);
+    if (bytes_written == 0) return JXL_FAILURE("Failed to write");
+    pos += bytes_written;
+  }
+  JXL_ASSERT(pos == bytes.size());
+
+  return true;
+}
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_BASE_FILE_IO_H_
diff --git a/third_party/jpeg-xl/lib/jxl/base/float.h b/third_party/jpeg-xl/lib/jxl/base/float.h
new file mode 100644
index 0000000000..90bdeedf54
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/base/float.h
@@ -0,0 +1,98 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_BASE_FLOAT_H_
+#define LIB_JXL_BASE_FLOAT_H_
+
+#include <jxl/types.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "lib/jxl/base/byte_order.h"
+#include "lib/jxl/base/status.h"
+
+namespace jxl {
+
+namespace {
+// Based on highway scalar implementation, for testing
+float LoadFloat16(uint16_t bits16) {
+  const uint32_t sign = bits16 >> 15;
+  const uint32_t biased_exp = (bits16 >> 10) & 0x1F;
+  const uint32_t mantissa = bits16 & 0x3FF;
+
+  // Subnormal or zero
+  if (biased_exp == 0) {
+    const float subnormal = (1.0f / 16384) * (mantissa * (1.0f / 1024));
+    return sign ? -subnormal : subnormal;
+  }
+
+  // Normalized: convert the representation directly (faster than ldexp/tables).
+  const uint32_t biased_exp32 = biased_exp + (127 - 15);
+  const uint32_t mantissa32 = mantissa << (23 - 10);
+  const uint32_t bits32 = (sign << 31) | (biased_exp32 << 23) | mantissa32;
+
+  float result;
+  memcpy(&result, &bits32, 4);
+  return result;
+}
+}  // namespace
+
+template <typename SaveFloatAtFn>
+static Status JXL_INLINE LoadFloatRow(const uint8_t* src, size_t count,
+                                      size_t stride, JxlDataType type,
+                                      bool little_endian, float scale,
+                                      SaveFloatAtFn callback) {
+  switch (type) {
+    case JXL_TYPE_FLOAT:
+      if (little_endian) {
+        for (size_t i = 0; i < count; ++i) {
+          callback(i, LoadLEFloat(src + stride * i));
+        }
+      } else {
+        for (size_t i = 0; i < count; ++i) {
+          callback(i, LoadBEFloat(src + stride * i));
+        }
+      }
+      return true;
+
+    case JXL_TYPE_UINT8:
+      for (size_t i = 0; i < count; ++i) {
+        callback(i, src[stride * i] * scale);
+      }
+      return true;
+
+    case JXL_TYPE_UINT16:
+      if (little_endian) {
+        for (size_t i = 0; i < count; ++i) {
+          callback(i, LoadLE16(src + stride * i) * scale);
+        }
+      } else {
+        for (size_t i = 0; i < count; ++i) {
+          callback(i, LoadBE16(src + stride * i) * scale);
+        }
+      }
+      return true;
+
+    case JXL_TYPE_FLOAT16:
+      if (little_endian) {
+        for (size_t i = 0; i < count; ++i) {
+          callback(i, LoadFloat16(LoadLE16(src + stride * i)));
+        }
+      } else {
+        for (size_t i = 0; i < count; ++i) {
+          callback(i, LoadFloat16(LoadBE16(src + stride * i)));
+        }
+      }
+      return true;
+
+    default:
+      return JXL_FAILURE("Unsupported sample format");
+  }
+}
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_BASE_FLOAT_H_
diff --git a/third_party/jpeg-xl/lib/jxl/base/iaca.h b/third_party/jpeg-xl/lib/jxl/base/iaca.h
new file mode 100644
index 0000000000..e5732dae5c
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/base/iaca.h
@@ -0,0 +1,65 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_BASE_IACA_H_
+#define LIB_JXL_BASE_IACA_H_
+
+#include "lib/jxl/base/compiler_specific.h"
+
+// IACA (Intel's Code Analyzer) analyzes instruction latencies, but only for
+// code between special markers. These functions embed such markers in an
+// executable, but only for reading via IACA - they deliberately trigger a
+// crash if executed to ensure they are removed in normal builds.
+
+#ifndef JXL_IACA_ENABLED
+#define JXL_IACA_ENABLED 0
+#endif
+
+namespace jxl {
+
+// Call before the region of interest.
+static JXL_INLINE void BeginIACA() {
+#if JXL_IACA_ENABLED && (JXL_COMPILER_GCC || JXL_COMPILER_CLANG)
+  asm volatile(
+      // UD2 "instruction" raises an invalid opcode exception.
+      ".byte 0x0F, 0x0B\n\t"
+      // Magic sequence recognized by IACA (MOV + addr32 fs:NOP). This actually
+      // clobbers EBX, but we don't care because the code won't be run, and we
+      // want IACA to observe the same code the compiler would have generated
+      // without this marker.
+      "movl $111, %%ebx\n\t"
+      ".byte 0x64, 0x67, 0x90\n\t"
+      :
+      :
+      // (Allegedly) clobbering memory may prevent reordering.
+      : "memory");
+#endif
+}
+
+// Call after the region of interest.
+static JXL_INLINE void EndIACA() {
+#if JXL_IACA_ENABLED && (JXL_COMPILER_GCC || JXL_COMPILER_CLANG)
+  asm volatile(
+      // See above.
+      "movl $222, %%ebx\n\t"
+      ".byte 0x64, 0x67, 0x90\n\t"
+      // UD2
+      ".byte 0x0F, 0x0B\n\t"
+      :
+      :
+      // (Allegedly) clobbering memory may prevent reordering.
+      : "memory");
+#endif
+}
+
+// Add to a scope to mark a region.
+struct ScopeIACA {
+  JXL_INLINE ScopeIACA() { BeginIACA(); }
+  JXL_INLINE ~ScopeIACA() { EndIACA(); }
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_BASE_IACA_H_
diff --git a/third_party/jpeg-xl/lib/jxl/base/os_macros.h b/third_party/jpeg-xl/lib/jxl/base/os_macros.h
new file mode 100644
index 0000000000..84d0b82bf5
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/base/os_macros.h
@@ -0,0 +1,50 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_BASE_OS_MACROS_H_
+#define LIB_JXL_BASE_OS_MACROS_H_
+
+// Defines the JXL_OS_* macros.
+
+#if defined(_WIN32) || defined(_WIN64)
+#define JXL_OS_WIN 1
+#else
+#define JXL_OS_WIN 0
+#endif
+
+#ifdef __linux__
+#define JXL_OS_LINUX 1
+#else
+#define JXL_OS_LINUX 0
+#endif
+
+#ifdef __APPLE__
+#define JXL_OS_MAC 1
+#else
+#define JXL_OS_MAC 0
+#endif
+
+#define JXL_OS_IOS 0
+#ifdef __APPLE__
+#include <TargetConditionals.h>
+#if TARGET_OS_IPHONE
+#undef JXL_OS_IOS
+#define JXL_OS_IOS 1
+#endif
+#endif
+
+#ifdef __FreeBSD__
+#define JXL_OS_FREEBSD 1
+#else
+#define JXL_OS_FREEBSD 0
+#endif
+
+#ifdef __HAIKU__
+#define JXL_OS_HAIKU 1
+#else
+#define JXL_OS_HAIKU 0
+#endif
+
+#endif  // LIB_JXL_BASE_OS_MACROS_H_
diff --git a/third_party/jpeg-xl/lib/jxl/base/override.h b/third_party/jpeg-xl/lib/jxl/base/override.h
new file mode 100644
index 0000000000..1f8b657974
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/base/override.h
@@ -0,0 +1,29 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_BASE_OVERRIDE_H_
+#define LIB_JXL_BASE_OVERRIDE_H_
+
+// 'Trool' for command line arguments: force enable/disable, or use default.
+
+namespace jxl {
+
+// No effect if kDefault, otherwise forces a feature (typically a FrameHeader
+// flag) on or off.
+enum class Override : int { kOn = 1, kOff = 0, kDefault = -1 };
+
+static inline Override OverrideFromBool(bool flag) {
+  return flag ? Override::kOn : Override::kOff;
+}
+
+static inline bool ApplyOverride(Override o, bool default_condition) {
+  if (o == Override::kOn) return true;
+  if (o == Override::kOff) return false;
+  return default_condition;
+}
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_BASE_OVERRIDE_H_
diff --git a/third_party/jpeg-xl/lib/jxl/base/padded_bytes.cc b/third_party/jpeg-xl/lib/jxl/base/padded_bytes.cc
new file mode 100644
index 0000000000..11e4bff6fe
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/base/padded_bytes.cc
@@ -0,0 +1,63 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/base/padded_bytes.h"
+
+namespace jxl {
+
+void PaddedBytes::IncreaseCapacityTo(size_t capacity) {
+  JXL_ASSERT(capacity > capacity_);
+
+  size_t new_capacity = std::max(capacity, 3 * capacity_ / 2);
+  new_capacity = std::max<size_t>(64, new_capacity);
+
+  // BitWriter writes up to 7 bytes past the end.
+  CacheAlignedUniquePtr new_data = AllocateArray(new_capacity + 8);
+  if (new_data == nullptr) {
+    // Allocation failed, discard all data to ensure this is noticed.
+    size_ = capacity_ = 0;
+    return;
+  }
+
+  if (data_ == nullptr) {
+    // First allocation: ensure first byte is initialized (won't be copied).
+    new_data[0] = 0;
+  } else {
+    // Subsequent resize: copy existing data to new location.
+    memcpy(new_data.get(), data_.get(), size_);
+    // Ensure that the first new byte is initialized, to allow write_bits to
+    // safely append to the newly-resized PaddedBytes.
+    new_data[size_] = 0;
+  }
+
+  capacity_ = new_capacity;
+  std::swap(new_data, data_);
+}
+
+void PaddedBytes::assign(const uint8_t* new_begin, const uint8_t* new_end) {
+  JXL_DASSERT(new_begin <= new_end);
+  const size_t new_size = static_cast<size_t>(new_end - new_begin);
+
+  // memcpy requires non-overlapping ranges, and resizing might invalidate the
+  // new range. Neither happens if the new range is completely to the left or
+  // right of the _allocated_ range (irrespective of size_).
+  const uint8_t* allocated_end = begin() + capacity_;
+  const bool outside = new_end <= begin() || new_begin >= allocated_end;
+  if (outside) {
+    resize(new_size);  // grow or shrink
+    memcpy(data(), new_begin, new_size);
+    return;
+  }
+
+  // There is overlap. The new size cannot be larger because we own the memory
+  // and the new range cannot include anything outside the allocated range.
+  JXL_ASSERT(new_size <= capacity_);
+
+  // memmove allows overlap and capacity_ is sufficient.
+  memmove(data(), new_begin, new_size);
+  size_ = new_size;  // shrink
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/base/padded_bytes.h b/third_party/jpeg-xl/lib/jxl/base/padded_bytes.h
new file mode 100644
index 0000000000..4534ddf863
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/base/padded_bytes.h
@@ -0,0 +1,197 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_BASE_PADDED_BYTES_H_
+#define LIB_JXL_BASE_PADDED_BYTES_H_
+
+// std::vector replacement with padding to reduce bounds checks in WriteBits
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>  // memcpy
+
+#include <algorithm>  // max
+#include <initializer_list>
+#include <utility>  // swap
+
+#include "lib/jxl/base/cache_aligned.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/status.h"
+
+namespace jxl {
+
+// Provides a subset of the std::vector interface with some differences:
+// - allows BitWriter to write 64 bits at a time without bounds checking;
+// - ONLY zero-initializes the first byte (required by BitWriter);
+// - ensures cache-line alignment.
+class PaddedBytes {
+ public:
+  // Required for output params.
+  PaddedBytes() : size_(0), capacity_(0) {}
+
+  explicit PaddedBytes(size_t size) : size_(size), capacity_(0) {
+    if (size != 0) IncreaseCapacityTo(size);
+  }
+
+  PaddedBytes(size_t size, uint8_t value) : size_(size), capacity_(0) {
+    if (size != 0) {
+      IncreaseCapacityTo(size);
+    }
+    if (size_ != 0) {
+      memset(data(), value, size);
+    }
+  }
+
+  PaddedBytes(const PaddedBytes& other) : size_(other.size_), capacity_(0) {
+    if (size_ != 0) IncreaseCapacityTo(size_);
+    if (data() != nullptr) memcpy(data(), other.data(), size_);
+  }
+  PaddedBytes& operator=(const PaddedBytes& other) {
+    // Self-assignment is safe.
+    resize(other.size());
+    if (data() != nullptr) memmove(data(), other.data(), size_);
+    return *this;
+  }
+
+  // default is not OK - need to set other.size_ to 0!
+  PaddedBytes(PaddedBytes&& other) noexcept
+      : size_(other.size_),
+        capacity_(other.capacity_),
+        data_(std::move(other.data_)) {
+    other.size_ = other.capacity_ = 0;
+  }
+  PaddedBytes& operator=(PaddedBytes&& other) noexcept {
+    size_ = other.size_;
+    capacity_ = other.capacity_;
+    data_ = std::move(other.data_);
+
+    if (&other != this) {
+      other.size_ = other.capacity_ = 0;
+    }
+    return *this;
+  }
+
+  void swap(PaddedBytes& other) {
+    std::swap(size_, other.size_);
+    std::swap(capacity_, other.capacity_);
+    std::swap(data_, other.data_);
+  }
+
+  void reserve(size_t capacity) {
+    if (capacity > capacity_) IncreaseCapacityTo(capacity);
+  }
+
+  // NOTE: unlike vector, this does not initialize the new data!
+  // However, we guarantee that write_bits can safely append after
+  // the resize, as we zero-initialize the first new byte of data.
+  // If size < capacity(), does not invalidate the memory.
+  void resize(size_t size) {
+    if (size > capacity_) IncreaseCapacityTo(size);
+    size_ = (data() == nullptr) ? 0 : size;
+  }
+
+  // resize(size) plus explicit initialization of the new data with `value`.
+  void resize(size_t size, uint8_t value) {
+    size_t old_size = size_;
+    resize(size);
+    if (size_ > old_size) {
+      memset(data() + old_size, value, size_ - old_size);
+    }
+  }
+
+  // Amortized constant complexity due to exponential growth.
+  void push_back(uint8_t x) {
+    if (size_ == capacity_) {
+      IncreaseCapacityTo(capacity_ + 1);
+      if (data() == nullptr) return;
+    }
+
+    data_[size_++] = x;
+  }
+
+  size_t size() const { return size_; }
+  size_t capacity() const { return capacity_; }
+
+  uint8_t* data() { return data_.get(); }
+  const uint8_t* data() const { return data_.get(); }
+
+  // std::vector operations implemented in terms of the public interface above.
+
+  void clear() { resize(0); }
+  bool empty() const { return size() == 0; }
+
+  void assign(std::initializer_list<uint8_t> il) {
+    resize(il.size());
+    memcpy(data(), il.begin(), il.size());
+  }
+
+  // Replaces data() with [new_begin, new_end); potentially reallocates.
+  void assign(const uint8_t* new_begin, const uint8_t* new_end);
+
+  uint8_t* begin() { return data(); }
+  const uint8_t* begin() const { return data(); }
+  uint8_t* end() { return begin() + size(); }
+  const uint8_t* end() const { return begin() + size(); }
+
+  uint8_t& operator[](const size_t i) {
+    BoundsCheck(i);
+    return data()[i];
+  }
+  const uint8_t& operator[](const size_t i) const {
+    BoundsCheck(i);
+    return data()[i];
+  }
+
+  uint8_t& back() {
+    JXL_ASSERT(size() != 0);
+    return data()[size() - 1];
+  }
+  const uint8_t& back() const {
+    JXL_ASSERT(size() != 0);
+    return data()[size() - 1];
+  }
+
+  template <typename T>
+  void append(const T& other) {
+    append(reinterpret_cast<const uint8_t*>(other.data()),
+           reinterpret_cast<const uint8_t*>(other.data()) + other.size());
+  }
+
+  void append(const uint8_t* begin, const uint8_t* end) {
+    if (end - begin > 0) {
+      size_t old_size = size();
+      resize(size() + (end - begin));
+      memcpy(data() + old_size, begin, end - begin);
+    }
+  }
+
+ private:
+  void BoundsCheck(size_t i) const {
+    // <= is safe due to padding and required by BitWriter.
+    JXL_ASSERT(i <= size());
+  }
+
+  // Copies existing data to newly allocated "data_". If allocation fails,
+  // data() == nullptr and size_ = capacity_ = 0.
+  // The new capacity will be at least 1.5 times the old capacity. This ensures
+  // that we avoid quadratic behaviour.
+  void IncreaseCapacityTo(size_t capacity);
+
+  size_t size_;
+  size_t capacity_;
+  CacheAlignedUniquePtr data_;
+};
+
+template <typename T>
+static inline void Append(const T& s, PaddedBytes* out,
+                          size_t* JXL_RESTRICT byte_pos) {
+  memcpy(out->data() + *byte_pos, s.data(), s.size());
+  *byte_pos += s.size();
+  JXL_CHECK(*byte_pos <= out->size());
+}
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_BASE_PADDED_BYTES_H_
diff --git a/third_party/jpeg-xl/lib/jxl/base/printf_macros.h b/third_party/jpeg-xl/lib/jxl/base/printf_macros.h
new file mode 100644
index 0000000000..3215052afd
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/base/printf_macros.h
@@ -0,0 +1,34 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_BASE_PRINTF_MACROS_H_
+#define LIB_JXL_BASE_PRINTF_MACROS_H_
+
+// Format string macros. These should be included after any other system
+// library since those may unconditionally define these, depending on the
+// platform.
+
+// PRIuS and PRIdS macros to print size_t and ssize_t respectively.
+#if !defined(PRIdS)
+#if defined(_WIN64)
+#define PRIdS "lld"
+#elif defined(_WIN32)
+#define PRIdS "d"
+#else
+#define PRIdS "zd"
+#endif
+#endif  // PRIdS
+
+#if !defined(PRIuS)
+#if defined(_WIN64)
+#define PRIuS "llu"
+#elif defined(_WIN32)
+#define PRIuS "u"
+#else
+#define PRIuS "zu"
+#endif
+#endif  // PRIuS
+
+#endif  // LIB_JXL_BASE_PRINTF_MACROS_H_
diff --git a/third_party/jpeg-xl/lib/jxl/base/profiler.cc b/third_party/jpeg-xl/lib/jxl/base/profiler.cc
new file mode 100644
index 0000000000..a38d9b82b7
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/base/profiler.cc
@@ -0,0 +1,540 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/base/profiler.h"
+
+#if JXL_PROFILER_ENABLED
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>  // memcpy
+
+#include <algorithm>  // sort
+#include <atomic>
+#include <cinttypes>  // PRIu64
+#include <hwy/cache_control.h>
+#include <limits>
+#include <new>
+
+// Optionally use SIMD in StreamCacheLine if available.
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/base/profiler.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace profiler {
+namespace HWY_NAMESPACE {
+
+// Overwrites `to` without loading it into cache (read-for-ownership).
+// Copies 64 bytes from/to naturally aligned addresses.
+void StreamCacheLine(const Packet* HWY_RESTRICT from, Packet* HWY_RESTRICT to) {
+#if HWY_TARGET == HWY_SCALAR
+  hwy::CopyBytes<64>(from, to);
+#else
+  const HWY_CAPPED(uint64_t, 2) d;
+  HWY_FENCE;
+  const uint64_t* HWY_RESTRICT from64 = reinterpret_cast<const uint64_t*>(from);
+  const auto v0 = Load(d, from64 + 0);
+  const auto v1 = Load(d, from64 + 2);
+  const auto v2 = Load(d, from64 + 4);
+  const auto v3 = Load(d, from64 + 6);
+  // Fences prevent the compiler from reordering loads/stores, which may
+  // interfere with write-combining.
+  HWY_FENCE;
+  uint64_t* HWY_RESTRICT to64 = reinterpret_cast<uint64_t*>(to);
+  Stream(v0, d, to64 + 0);
+  Stream(v1, d, to64 + 2);
+  Stream(v2, d, to64 + 4);
+  Stream(v3, d, to64 + 6);
+  HWY_FENCE;
+#endif
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace profiler
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+namespace profiler {
+
+HWY_EXPORT(StreamCacheLine);
+
+namespace {
+
+// How many mebibytes to allocate (if JXL_PROFILER_ENABLED) per thread that
+// enters at least one zone. Once this buffer is full, the thread will analyze
+// packets (two per zone), which introduces observer overhead.
+#ifndef PROFILER_THREAD_STORAGE
+#define PROFILER_THREAD_STORAGE 32ULL
+#endif
+
+#define PROFILER_PRINT_OVERHEAD 0
+
+// Upper bounds for fixed-size data structures (guarded via HWY_ASSERT):
+constexpr size_t kMaxDepth = 64;   // Maximum nesting of zones.
+constexpr size_t kMaxZones = 256;  // Total number of zones.
+
+// Stack of active (entered but not exited) zones. POD, uninitialized.
+// Used to deduct child duration from the parent's self time.
+struct ActiveZone {
+  const char* name;
+  uint64_t entry_timestamp;
+  uint64_t child_total;
+};
+
+// Totals for all Zones with the same name. POD, must be zero-initialized.
+struct ZoneTotals {
+  uint64_t total_duration;
+  const char* name;
+  uint64_t num_calls;
+};
+
+template <typename T>
+inline T ClampedSubtract(const T minuend, const T subtrahend) {
+  if (subtrahend > minuend) {
+    return 0;
+  }
+  return minuend - subtrahend;
+}
+
+}  // namespace
+
+// Per-thread call graph (stack) and ZoneTotals for each zone.
+class Results {
+ public:
+  Results() {
+    // Zero-initialize all accumulators (avoids a check for num_zones_ == 0).
+    memset(zones_, 0, sizeof(zones_));
+  }
+
+  // Used for computing overhead when this thread encounters its first Zone.
+  // This has no observable effect apart from increasing "analyze_elapsed_".
+  uint64_t ZoneDuration(const Packet* packets) {
+    HWY_ASSERT(depth_ == 0);
+    HWY_ASSERT(num_zones_ == 0);
+    AnalyzePackets(packets, 2);
+    const uint64_t duration = zones_[0].total_duration;
+    zones_[0].num_calls = 0;
+    zones_[0].total_duration = 0;
+    HWY_ASSERT(depth_ == 0);
+    num_zones_ = 0;
+    return duration;
+  }
+
+  void SetSelfOverhead(const uint64_t self_overhead) {
+    self_overhead_ = self_overhead;
+  }
+
+  void SetChildOverhead(const uint64_t child_overhead) {
+    child_overhead_ = child_overhead;
+  }
+
+  // Draw all required information from the packets, which can be discarded
+  // afterwards. Called whenever this thread's storage is full.
+  void AnalyzePackets(const Packet* HWY_RESTRICT packets,
+                      const size_t num_packets) {
+    // Ensures prior weakly-ordered streaming stores are globally visible.
+    hwy::FlushStream();
+
+    const uint64_t t0 = TicksBefore();
+
+    for (size_t i = 0; i < num_packets; ++i) {
+      const uint64_t timestamp = packets[i].timestamp;
+      // Entering a zone
+      if (packets[i].name != nullptr) {
+        HWY_ASSERT(depth_ < kMaxDepth);
+        zone_stack_[depth_].name = packets[i].name;
+        zone_stack_[depth_].entry_timestamp = timestamp;
+        zone_stack_[depth_].child_total = 0;
+        ++depth_;
+        continue;
+      }
+
+      HWY_ASSERT(depth_ != 0);
+      const ActiveZone& active = zone_stack_[depth_ - 1];
+      const uint64_t duration = timestamp - active.entry_timestamp;
+      const uint64_t self_duration = ClampedSubtract(
+          duration, self_overhead_ + child_overhead_ + active.child_total);
+
+      UpdateOrAdd(active.name, 1, self_duration);
+      --depth_;
+
+      // "Deduct" the nested time from its parent's self_duration.
+      if (depth_ != 0) {
+        zone_stack_[depth_ - 1].child_total += duration + child_overhead_;
+      }
+    }
+
+    const uint64_t t1 = TicksAfter();
+    analyze_elapsed_ += t1 - t0;
+  }
+
+  // Incorporates results from another thread. Call after all threads have
+  // exited any zones.
+  void Assimilate(const Results& other) {
+    const uint64_t t0 = TicksBefore();
+    HWY_ASSERT(depth_ == 0);
+    HWY_ASSERT(other.depth_ == 0);
+
+    for (size_t i = 0; i < other.num_zones_; ++i) {
+      const ZoneTotals& zone = other.zones_[i];
+      UpdateOrAdd(zone.name, zone.num_calls, zone.total_duration);
+    }
+    const uint64_t t1 = TicksAfter();
+    analyze_elapsed_ += t1 - t0 + other.analyze_elapsed_;
+  }
+
+  // Single-threaded.
+  void Print() {
+    const uint64_t t0 = TicksBefore();
+    MergeDuplicates();
+
+    // Sort by decreasing total (self) cost.
+    std::sort(zones_, zones_ + num_zones_,
+              [](const ZoneTotals& r1, const ZoneTotals& r2) {
+                return r1.total_duration > r2.total_duration;
+              });
+
+    uint64_t total_visible_duration = 0;
+    for (size_t i = 0; i < num_zones_; ++i) {
+      const ZoneTotals& r = zones_[i];
+      if (r.name[0] != '@') {
+        total_visible_duration += r.total_duration;
+        printf("%-40s: %10" PRIu64 " x %15" PRIu64 "= %15" PRIu64 "\n", r.name,
+               r.num_calls, r.total_duration / r.num_calls, r.total_duration);
+      }
+    }
+
+    const uint64_t t1 = TicksAfter();
+    analyze_elapsed_ += t1 - t0;
+    printf("Total clocks during analysis: %" PRIu64 "\n", analyze_elapsed_);
+    printf("Total clocks measured: %" PRIu64 "\n", total_visible_duration);
+  }
+
+  // Single-threaded. Clears all results as if no zones had been recorded.
+  void Reset() {
+    analyze_elapsed_ = 0;
+    HWY_ASSERT(depth_ == 0);
+    num_zones_ = 0;
+    memset(zone_stack_, 0, sizeof(zone_stack_));
+    memset(zones_, 0, sizeof(zones_));
+  }
+
+ private:
+  // Updates ZoneTotals of the same name, or inserts a new one if this thread
+  // has not yet seen that name. Uses a self-organizing list data structure,
+  // which avoids dynamic memory allocations and is faster than unordered_map.
+  void UpdateOrAdd(const char* name, const uint64_t num_calls,
+                   const uint64_t duration) {
+    // Special case for first zone: (maybe) update, without swapping.
+    if (zones_[0].name == name) {
+      zones_[0].total_duration += duration;
+      zones_[0].num_calls += num_calls;
+      return;
+    }
+
+    // Look for a zone with the same name.
+    for (size_t i = 1; i < num_zones_; ++i) {
+      if (zones_[i].name == name) {
+        zones_[i].total_duration += duration;
+        zones_[i].num_calls += num_calls;
+        // Swap with predecessor (more conservative than move to front,
+        // but at least as successful).
+        std::swap(zones_[i - 1], zones_[i]);
+        return;
+      }
+    }
+
+    // Not found; create a new ZoneTotals.
+    HWY_ASSERT(num_zones_ < kMaxZones);
+    ZoneTotals* HWY_RESTRICT zone = zones_ + num_zones_;
+    zone->name = name;
+    zone->num_calls = num_calls;
+    zone->total_duration = duration;
+    ++num_zones_;
+  }
+
+  // Each instantiation of a function template seems to get its own copy of
+  // __func__ and GCC doesn't merge them. An N^2 search for duplicates is
+  // acceptable because we only expect a few dozen zones.
+  void MergeDuplicates() {
+    for (size_t i = 0; i < num_zones_; ++i) {
+      // Add any subsequent duplicates to num_calls and total_duration.
+      for (size_t j = i + 1; j < num_zones_;) {
+        if (!strcmp(zones_[i].name, zones_[j].name)) {
+          zones_[i].num_calls += zones_[j].num_calls;
+          zones_[i].total_duration += zones_[j].total_duration;
+          // Fill hole with last item.
+          zones_[j] = zones_[--num_zones_];
+        } else {  // Name differed, try next ZoneTotals.
+          ++j;
+        }
+      }
+    }
+  }
+
+  uint64_t analyze_elapsed_ = 0;
+  uint64_t self_overhead_ = 0;
+  uint64_t child_overhead_ = 0;
+
+  size_t depth_ = 0;      // Number of active zones <= kMaxDepth.
+  size_t num_zones_ = 0;  // Number of unique zones <= kMaxZones.
+
+  // After other members to avoid large pointer offsets.
+  alignas(64) ActiveZone zone_stack_[kMaxDepth];  // Last = newest
+  alignas(64) ZoneTotals zones_[kMaxZones];       // Self-organizing list
+};
+
+ThreadSpecific::ThreadSpecific()
+    : max_packets_(PROFILER_THREAD_STORAGE << 16),  // MiB / sizeof(Packet)
+      packets_(hwy::AllocateAligned<Packet>(max_packets_)),
+      num_packets_(0),
+      results_(hwy::MakeUniqueAligned<Results>()) {}
+
+ThreadSpecific::~ThreadSpecific() {}
+
+void ThreadSpecific::FlushBuffer() {
+  if (num_packets_ + kBufferCapacity > max_packets_) {
+    results_->AnalyzePackets(packets_.get(), num_packets_);
+    num_packets_ = 0;
+  }
+  // This buffering halves observer overhead and decreases the overall
+  // runtime by about 3%.
+  HWY_DYNAMIC_DISPATCH(StreamCacheLine)
+  (buffer_, packets_.get() + num_packets_);
+  num_packets_ += kBufferCapacity;
+  buffer_size_ = 0;
+}
+
+void ThreadSpecific::AnalyzeRemainingPackets() {
+  // Storage full => empty it.
+  if (num_packets_ + buffer_size_ > max_packets_) {
+    results_->AnalyzePackets(packets_.get(), num_packets_);
+    num_packets_ = 0;
+  }
+
+  // Move buffer to storage
+  memcpy(packets_.get() + num_packets_, buffer_, buffer_size_ * sizeof(Packet));
+  num_packets_ += buffer_size_;
+  buffer_size_ = 0;
+
+  results_->AnalyzePackets(packets_.get(), num_packets_);
+  num_packets_ = 0;
+}
+
+namespace {
+
+class HalfSampleMode {
+ public:
+  // Returns mode. "sorted" must be in ascending order.
+  template <typename T>
+  T operator()(const T* const HWY_RESTRICT sorted,
+               const size_t num_values) const {
+    int64_t center = num_values / 2;
+    int64_t width = num_values;
+
+    // Zoom in on modal intervals of decreasing width. Stop before we reach
+    // width=1, i.e. single values, for which there is no "slope".
+    while (width > 2) {
+      // Round up so we can still reach the outer edges of odd widths.
+      width = (width + 1) / 2;
+
+      center = CenterOfIntervalWithMinSlope(sorted, num_values, center, width);
+    }
+
+    return sorted[center];  // mode := middle value in modal interval.
+  }
+
+ private:
+  // Returns center of the densest region [c-radius, c+radius].
+  template <typename T>
+  static HWY_INLINE int64_t CenterOfIntervalWithMinSlope(
+      const T* HWY_RESTRICT sorted, const int64_t total_values,
+      const int64_t center, const int64_t width) {
+    const int64_t radius = (width + 1) / 2;
+
+    auto compute_slope = [radius, total_values, sorted](
+                             int64_t c, int64_t* actual_center = nullptr) {
+      // For symmetry, check 2*radius+1 values, i.e. [min, max].
+      const int64_t min = std::max(c - radius, int64_t(0));
+      const int64_t max = std::min(c + radius, total_values - 1);
+      HWY_ASSERT(min < max);
+      HWY_ASSERT(sorted[min] <=
+                 sorted[max] + std::numeric_limits<float>::epsilon());
+      const float dx = max - min + 1;
+      const float slope = (sorted[max] - sorted[min]) / dx;
+
+      if (actual_center != nullptr) {
+        // c may be out of bounds, so return center of the clamped bounds.
+        *actual_center = (min + max + 1) / 2;
+      }
+      return slope;
+    };
+
+    // First find min_slope for all centers.
+    float min_slope = std::numeric_limits<float>::max();
+    for (int64_t c = center - radius; c <= center + radius; ++c) {
+      min_slope = std::min(min_slope, compute_slope(c));
+    }
+
+    // Candidates := centers with slope ~= min_slope.
+    std::vector<int64_t> candidates;
+    for (int64_t c = center - radius; c <= center + radius; ++c) {
+      int64_t actual_center;
+      const float slope = compute_slope(c, &actual_center);
+      if (slope <= min_slope * 1.001f) {
+        candidates.push_back(actual_center);
+      }
+    }
+
+    // Keep the median.
+    HWY_ASSERT(!candidates.empty());
+    if (candidates.size() == 1) return candidates[0];
+    std::nth_element(candidates.begin(),
+                     candidates.begin() + candidates.size() / 2,
+                     candidates.end());
+    return candidates[candidates.size() / 2];
+  }
+};
+
+}  // namespace
+
+void ThreadSpecific::ComputeOverhead() {
+  // Delay after capturing timestamps before/after the actual zone runs. Even
+  // with frequency throttling disabled, this has a multimodal distribution,
+  // including 32, 34, 48, 52, 59, 62.
+  uint64_t self_overhead;
+  {
+    const size_t kNumSamples = 32;
+    uint32_t samples[kNumSamples];
+    for (size_t idx_sample = 0; idx_sample < kNumSamples; ++idx_sample) {
+      const size_t kNumDurations = 1024;
+      uint32_t durations[kNumDurations];
+
+      for (size_t idx_duration = 0; idx_duration < kNumDurations;
+           ++idx_duration) {
+        {  //
+          PROFILER_ZONE("Dummy Zone (never shown)");
+        }
+        const uint64_t duration = results_->ZoneDuration(buffer_);
+        buffer_size_ = 0;
+        durations[idx_duration] = static_cast<uint32_t>(duration);
+        HWY_ASSERT(num_packets_ == 0);
+      }
+      std::sort(durations, durations + kNumDurations);
+      samples[idx_sample] = HalfSampleMode()(durations, kNumDurations);
+    }
+    // Median.
+    std::sort(samples, samples + kNumSamples);
+    self_overhead = samples[kNumSamples / 2];
+#if PROFILER_PRINT_OVERHEAD
+    printf("Overhead: %" PRIu64 "\n", static_cast<uint64_t>(self_overhead));
+#endif
+    results_->SetSelfOverhead(self_overhead);
+  }
+
+  // Delay before capturing start timestamp / after end timestamp.
+  const size_t kNumSamples = 32;
+  uint32_t samples[kNumSamples];
+  for (size_t idx_sample = 0; idx_sample < kNumSamples; ++idx_sample) {
+    const size_t kNumDurations = 16;
+    uint32_t durations[kNumDurations];
+    for (size_t idx_duration = 0; idx_duration < kNumDurations;
+         ++idx_duration) {
+      const size_t kReps = 10000;
+      // Analysis time should not be included => must fit within buffer.
+      HWY_ASSERT(kReps * 2 < max_packets_);
+      hwy::FlushStream();
+      const uint64_t t0 = TicksBefore();
+      for (size_t i = 0; i < kReps; ++i) {
+        PROFILER_ZONE("Dummy");
+      }
+      hwy::FlushStream();
+      const uint64_t t1 = TicksAfter();
+      HWY_ASSERT(num_packets_ + buffer_size_ == kReps * 2);
+      buffer_size_ = 0;
+      num_packets_ = 0;
+      const uint64_t avg_duration = (t1 - t0 + kReps / 2) / kReps;
+      durations[idx_duration] =
+          static_cast<uint32_t>(ClampedSubtract(avg_duration, self_overhead));
+    }
+    std::sort(durations, durations + kNumDurations);
+    samples[idx_sample] = HalfSampleMode()(durations, kNumDurations);
+  }
+  std::sort(samples, samples + kNumSamples);
+  const uint64_t child_overhead = samples[9 * kNumSamples / 10];
+#if PROFILER_PRINT_OVERHEAD
+  printf("Child overhead: %" PRIu64 "\n",
+         static_cast<uint64_t>(child_overhead));
+#endif
+  results_->SetChildOverhead(child_overhead);
+}
+
+namespace {
+
+// Could be a static member of Zone, but that would expose <atomic> in header.
+std::atomic<ThreadSpecific*>& GetHead() {
+  static std::atomic<ThreadSpecific*> head_{nullptr};  // Owning
+  return head_;
+}
+
+}  // namespace
+
+// Thread-safe.
+ThreadSpecific* Zone::InitThreadSpecific() {
+  ThreadSpecific* thread_specific =
+      hwy::MakeUniqueAligned<ThreadSpecific>().release();
+
+  // Insert into unordered list
+  std::atomic<ThreadSpecific*>& head = GetHead();
+  ThreadSpecific* old_head = head.load(std::memory_order_relaxed);
+  thread_specific->SetNext(old_head);
+  while (!head.compare_exchange_weak(old_head, thread_specific,
+                                     std::memory_order_release,
+                                     std::memory_order_relaxed)) {
+    thread_specific->SetNext(old_head);
+    // TODO(janwas): pause
+  }
+
+  // ComputeOverhead also creates a Zone, so this needs to be set before that
+  // to prevent infinite recursion.
+  GetThreadSpecific() = thread_specific;
+
+  thread_specific->ComputeOverhead();
+  return thread_specific;
+}
+
+// Single-threaded.
+/*static*/ void Zone::PrintResults() {
+  ThreadSpecific* head = GetHead().load(std::memory_order_relaxed);
+  ThreadSpecific* p = head;
+  while (p) {
+    p->AnalyzeRemainingPackets();
+
+    // Combine all threads into a single Result.
+    if (p != head) {
+      head->GetResults().Assimilate(p->GetResults());
+      p->GetResults().Reset();
+    }
+
+    p = p->GetNext();
+  }
+
+  if (head != nullptr) {
+    head->GetResults().Print();
+    head->GetResults().Reset();
+  }
+}
+
+}  // namespace profiler
+}  // namespace jxl
+
+#endif  // HWY_ONCE
+#endif  // JXL_PROFILER_ENABLED
diff --git a/third_party/jpeg-xl/lib/jxl/base/profiler.h b/third_party/jpeg-xl/lib/jxl/base/profiler.h
new file mode 100644
index 0000000000..4c0efa4b3a
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/base/profiler.h
@@ -0,0 +1,170 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_BASE_PROFILER_H_
+#define LIB_JXL_BASE_PROFILER_H_
+
+// High precision, low overhead time measurements. Returns exact call counts and
+// total elapsed time for user-defined 'zones' (code regions, i.e. C++ scopes).
+//
+// To use the profiler you must set the JPEGXL_ENABLE_PROFILER CMake flag, which
+// defines JXL_PROFILER_ENABLED.
+//
+// Usage: instrument regions of interest: { PROFILER_ZONE("name"); /*code*/ } or
+// void FuncToMeasure() { PROFILER_FUNC; /*code*/ }.
+// After all threads have exited any zones, invoke PROFILER_PRINT_RESULTS() to
+// print call counts and average durations [CPU cycles] to stdout, sorted in
+// descending order of total duration.
+
+// If zero, this file has no effect and no measurements will be recorded.
+#ifndef JXL_PROFILER_ENABLED
+#define JXL_PROFILER_ENABLED 0
+#endif
+#if JXL_PROFILER_ENABLED
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <hwy/aligned_allocator.h>
+#include <hwy/base.h>
+
+#include "lib/jxl/base/tsc_timer.h"
+
+#if HWY_COMPILER_MSVC
+#define PROFILER_PUBLIC
+#else
+#define PROFILER_PUBLIC __attribute__((visibility("default")))
+#endif
+
+namespace jxl {
+namespace profiler {
+
+// Represents zone entry/exit events. POD.
+#pragma pack(push, 1)
+struct Packet {
+  // Computing a hash or string table is likely too expensive, and offsets
+  // from other libraries' string literals can be too large to combine them and
+  // a full-resolution timestamp into 64 bits.
+  uint64_t timestamp;
+  const char* name;  // nullptr for exit packets
+#if UINTPTR_MAX <= 0xFFFFFFFFu
+  uint32_t padding;
+#endif
+};
+#pragma pack(pop)
+static_assert(sizeof(Packet) == 16, "Wrong Packet size");
+
+class Results;  // pImpl
+
+// Per-thread packet storage, dynamically allocated and aligned.
+class ThreadSpecific {
+  static constexpr size_t kBufferCapacity = 64 / sizeof(Packet);
+
+ public:
+  PROFILER_PUBLIC explicit ThreadSpecific();
+  PROFILER_PUBLIC ~ThreadSpecific();
+
+  // Depends on Zone => defined out of line.
+  PROFILER_PUBLIC void ComputeOverhead();
+
+  HWY_INLINE void WriteEntry(const char* name) { Write(name, TicksBefore()); }
+  HWY_INLINE void WriteExit() { Write(nullptr, TicksAfter()); }
+
+  PROFILER_PUBLIC void AnalyzeRemainingPackets();
+
+  // Accessors instead of public member for well-defined data layout.
+  void SetNext(ThreadSpecific* next) { next_ = next; }
+  ThreadSpecific* GetNext() const { return next_; }
+
+  Results& GetResults() { return *results_; }
+
+ private:
+  PROFILER_PUBLIC void FlushBuffer();
+
+  // Write packet to buffer/storage, emptying them as needed.
+  void Write(const char* name, const uint64_t timestamp) {
+    if (buffer_size_ == kBufferCapacity) {  // Full
+      FlushBuffer();
+    }
+    buffer_[buffer_size_].name = name;
+    buffer_[buffer_size_].timestamp = timestamp;
+    ++buffer_size_;
+  }
+
+  // Write-combining buffer to avoid cache pollution. Must be the first
+  // non-static member to ensure cache-line alignment.
+  Packet buffer_[kBufferCapacity];
+  size_t buffer_size_ = 0;
+
+  // Contiguous storage for zone enter/exit packets.
+  const size_t max_packets_;
+  hwy::AlignedFreeUniquePtr<Packet[]> packets_;
+  size_t num_packets_;
+
+  // Linked list of all threads.
+  ThreadSpecific* next_ = nullptr;  // Owned, never released.
+
+  hwy::AlignedUniquePtr<Results> results_;
+};
+
+// RAII zone enter/exit recorder constructed by PROFILER_ZONE; also
+// responsible for initializing ThreadSpecific.
+class Zone {
+ public:
+  HWY_NOINLINE explicit Zone(const char* name) {
+    HWY_FENCE;
+    ThreadSpecific* HWY_RESTRICT thread_specific = GetThreadSpecific();
+    if (HWY_UNLIKELY(thread_specific == nullptr)) {
+      thread_specific = InitThreadSpecific();
+    }
+
+    thread_specific->WriteEntry(name);
+  }
+
+  HWY_NOINLINE ~Zone() { GetThreadSpecific()->WriteExit(); }
+
+  // Call exactly once after all threads have exited all zones.
+  PROFILER_PUBLIC static void PrintResults();
+
+ private:
+  // Returns reference to the thread's ThreadSpecific pointer (initially null).
+  // Function-local static avoids needing a separate definition.
+  static ThreadSpecific*& GetThreadSpecific() {
+    static thread_local ThreadSpecific* thread_specific;
+    return thread_specific;
+  }
+
+  // Non time-critical.
+  PROFILER_PUBLIC ThreadSpecific* InitThreadSpecific();
+};
+
+// Creates a zone starting from here until the end of the current scope.
+// Timestamps will be recorded when entering and exiting the zone.
+// To ensure the name pointer remains valid, we require it to be a string
+// literal (by merging with ""). We also compare strings by address.
+#define PROFILER_ZONE(name)                  \
+  HWY_FENCE;                                 \
+  const ::jxl::profiler::Zone zone("" name); \
+  HWY_FENCE
+
+// Creates a zone for an entire function (when placed at its beginning).
+// Shorter/more convenient than ZONE.
+#define PROFILER_FUNC                         \
+  HWY_FENCE;                                  \
+  const ::jxl::profiler::Zone zone(__func__); \
+  HWY_FENCE
+
+#define PROFILER_PRINT_RESULTS ::jxl::profiler::Zone::PrintResults
+
+}  // namespace profiler
+}  // namespace jxl
+
+#else  // !JXL_PROFILER_ENABLED
+#define PROFILER_ZONE(name)
+#define PROFILER_FUNC
+#define PROFILER_PRINT_RESULTS()
+#endif
+
+#endif  // LIB_JXL_BASE_PROFILER_H_
diff --git a/third_party/jpeg-xl/lib/jxl/base/random.cc b/third_party/jpeg-xl/lib/jxl/base/random.cc
new file mode 100644
index 0000000000..c99f88921c
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/base/random.cc
@@ -0,0 +1,21 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/base/random.h"
+
+#include <cmath>
+
+namespace jxl {
+
+Rng::GeometricDistribution::GeometricDistribution(float p)
+    : inv_log_1mp(1.0 / std::log(1 - p)) {}
+
+uint32_t Rng::Geometric(const GeometricDistribution& dist) {
+  float f = UniformF(0, 1);
+  float log = std::log(1 - f) * dist.inv_log_1mp;
+  return static_cast<uint32_t>(log);
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/base/random.h b/third_party/jpeg-xl/lib/jxl/base/random.h
new file mode 100644
index 0000000000..663b88c95d
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/base/random.h
@@ -0,0 +1,95 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_BASE_RANDOM_
+#define LIB_JXL_BASE_RANDOM_
+
+// Random number generator + distributions.
+// We don't use <random> because the implementation (and thus results) differs
+// between libstdc++ and libc++.
+
+#include <stdint.h>
+#include <string.h>
+
+#include <algorithm>
+
+#include "lib/jxl/base/status.h"
+
+namespace jxl {
+struct Rng {
+  explicit Rng(size_t seed)
+      : s{static_cast<uint64_t>(0x94D049BB133111EBull),
+          static_cast<uint64_t>(0xBF58476D1CE4E5B9ull) + seed} {}
+
+  // Xorshift128+ adapted from xorshift128+-inl.h
+  uint64_t operator()() {
+    uint64_t s1 = s[0];
+    const uint64_t s0 = s[1];
+    const uint64_t bits = s1 + s0;  // b, c
+    s[0] = s0;
+    s1 ^= s1 << 23;
+    s1 ^= s0 ^ (s1 >> 18) ^ (s0 >> 5);
+    s[1] = s1;
+    return bits;
+  }
+
+  // Uniformly distributed int64_t in [begin, end), under the assumption that
+  // `end-begin` is significantly smaller than 1<<64, otherwise there is some
+  // bias.
+  int64_t UniformI(int64_t begin, int64_t end) {
+    JXL_DASSERT(end > begin);
+    return static_cast<int64_t>((*this)() %
+                                static_cast<uint64_t>(end - begin)) +
+           begin;
+  }
+
+  // Same as UniformI, but for uint64_t.
+  uint64_t UniformU(uint64_t begin, uint64_t end) {
+    JXL_DASSERT(end > begin);
+    return (*this)() % (end - begin) + begin;
+  }
+
+  // Uniformly distributed float in [begin, end) range. Note: only 23 bits of
+  // randomness.
+  float UniformF(float begin, float end) {
+    float f;
+    // Bits of a random [1, 2) float.
+    uint32_t u = ((*this)() >> (64 - 23)) | 0x3F800000;
+    static_assert(sizeof(f) == sizeof(u),
+                  "Float and U32 must have the same size");
+    memcpy(&f, &u, sizeof(f));
+    // Note: (end-begin) * f + (2*begin-end) may fail to return a number >=
+    // begin.
+    return (end - begin) * (f - 1.0f) + begin;
+  }
+
+  // Bernoulli trial
+  bool Bernoulli(float p) { return UniformF(0, 1) < p; }
+
+  // State for geometric distributions.
+  struct GeometricDistribution {
+    explicit GeometricDistribution(float p);
+
+   private:
+    float inv_log_1mp;
+    friend struct Rng;
+  };
+
+  uint32_t Geometric(const GeometricDistribution& dist);
+
+  template <typename T>
+  void Shuffle(T* t, size_t n) {
+    for (size_t i = 0; i + 1 < n; i++) {
+      size_t a = UniformU(i, n);
+      std::swap(t[a], t[i]);
+    }
+  }
+
+ private:
+  uint64_t s[2];
+};
+
+}  // namespace jxl
+#endif  // LIB_JXL_BASE_RANDOM_
diff --git a/third_party/jpeg-xl/lib/jxl/base/sanitizer_definitions.h b/third_party/jpeg-xl/lib/jxl/base/sanitizer_definitions.h
new file mode 100644
index 0000000000..315f3bd003
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/base/sanitizer_definitions.h
@@ -0,0 +1,44 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_BASE_SANITIZER_DEFINITIONS_H_
+#define LIB_JXL_BASE_SANITIZER_DEFINITIONS_H_
+
+#ifdef MEMORY_SANITIZER
+#define JXL_MEMORY_SANITIZER 1
+#elif defined(__has_feature)
+#if __has_feature(memory_sanitizer)
+#define JXL_MEMORY_SANITIZER 1
+#else
+#define JXL_MEMORY_SANITIZER 0
+#endif
+#else
+#define JXL_MEMORY_SANITIZER 0
+#endif
+
+#ifdef ADDRESS_SANITIZER
+#define JXL_ADDRESS_SANITIZER 1
+#elif defined(__has_feature)
+#if __has_feature(address_sanitizer)
+#define JXL_ADDRESS_SANITIZER 1
+#else
+#define JXL_ADDRESS_SANITIZER 0
+#endif
+#else
+#define JXL_ADDRESS_SANITIZER 0
+#endif
+
+#ifdef THREAD_SANITIZER
+#define JXL_THREAD_SANITIZER 1
+#elif defined(__has_feature)
+#if __has_feature(thread_sanitizer)
+#define JXL_THREAD_SANITIZER 1
+#else
+#define JXL_THREAD_SANITIZER 0
+#endif
+#else
+#define JXL_THREAD_SANITIZER 0
+#endif
+#endif  // LIB_JXL_BASE_SANITIZER_DEFINITIONS_H
diff --git a/third_party/jpeg-xl/lib/jxl/base/scope_guard.h b/third_party/jpeg-xl/lib/jxl/base/scope_guard.h
new file mode 100644
index 0000000000..a18a44cb79
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/base/scope_guard.h
@@ -0,0 +1,48 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_BASE_SCOPE_GUARD_H_
+#define LIB_JXL_BASE_SCOPE_GUARD_H_
+
+#include <utility>
+
+namespace jxl {
+
+template <typename Callback>
+class ScopeGuard {
+ public:
+  // Discourage unnecessary moves / copies.
+  ScopeGuard(const ScopeGuard &) = delete;
+  ScopeGuard &operator=(const ScopeGuard &) = delete;
+  ScopeGuard &operator=(ScopeGuard &&) = delete;
+
+  // Pre-C++17 does not guarantee RVO -> require move constructor.
+  ScopeGuard(ScopeGuard &&other) : callback_(std::move(other.callback_)) {
+    other.armed_ = false;
+  }
+
+  template <typename CallbackParam>
+  explicit ScopeGuard(CallbackParam &&callback)
+      : callback_(std::forward<CallbackParam>(callback)), armed_(true) {}
+
+  ~ScopeGuard() {
+    if (armed_) callback_();
+  }
+
+  void Disarm() { armed_ = false; }
+
+ private:
+  Callback callback_;
+  bool armed_;
+};
+
+template <typename Callback>
+ScopeGuard<Callback> MakeScopeGuard(Callback &&callback) {
+  return ScopeGuard<Callback>{std::forward<Callback>(callback)};
+}
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_BASE_SCOPE_GUARD_H_
diff --git a/third_party/jpeg-xl/lib/jxl/base/span.h b/third_party/jpeg-xl/lib/jxl/base/span.h
new file mode 100644
index 0000000000..41c3623a4b
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/base/span.h
@@ -0,0 +1,60 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_BASE_SPAN_H_
+#define LIB_JXL_BASE_SPAN_H_
+
+// Span (array view) is a non-owning container that provides cheap "cut"
+// operations and could be used as "ArrayLike" data source for PaddedBytes.
+
+#include <stddef.h>
+
+#include "lib/jxl/base/status.h"
+
+namespace jxl {
+
+template <typename T>
+class Span {
+ public:
+  constexpr Span() noexcept : Span(nullptr, 0) {}
+
+  constexpr Span(T* array, size_t length) noexcept
+      : ptr_(array), len_(length) {}
+
+  template <size_t N>
+  explicit constexpr Span(T (&a)[N]) noexcept : Span(a, N) {}
+
+  template <typename ArrayLike>
+  explicit constexpr Span(const ArrayLike& other) noexcept
+      : Span(reinterpret_cast<T*>(other.data()), other.size()) {
+    static_assert(sizeof(*other.data()) == sizeof(T),
+                  "Incompatible type of source.");
+  }
+
+  constexpr T* data() const noexcept { return ptr_; }
+
+  constexpr size_t size() const noexcept { return len_; }
+
+  constexpr bool empty() const noexcept { return len_ == 0; }
+
+  constexpr T& operator[](size_t i) const noexcept {
+    // MSVC 2015 accepts this as constexpr, but not ptr_[i]
+    return *(data() + i);
+  }
+
+  void remove_prefix(size_t n) noexcept {
+    JXL_ASSERT(size() >= n);
+    ptr_ += n;
+    len_ -= n;
+  }
+
+ private:
+  T* ptr_;
+  size_t len_;
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_BASE_SPAN_H_
diff --git a/third_party/jpeg-xl/lib/jxl/base/status.h b/third_party/jpeg-xl/lib/jxl/base/status.h
new file mode 100644
index 0000000000..f40be0c434
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/base/status.h
@@ -0,0 +1,326 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_BASE_STATUS_H_
+#define LIB_JXL_BASE_STATUS_H_
+
+// Error handling: Status return type + helper macros.
+
+#include <stdarg.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/sanitizer_definitions.h"
+
+#if JXL_ADDRESS_SANITIZER || JXL_MEMORY_SANITIZER || JXL_THREAD_SANITIZER
+#include "sanitizer/common_interface_defs.h"  // __sanitizer_print_stack_trace
+#endif                                        // defined(*_SANITIZER)
+
+namespace jxl {
+
+// Uncomment to abort when JXL_FAILURE or JXL_STATUS with a fatal error is
+// reached:
+// #define JXL_CRASH_ON_ERROR
+
+#ifndef JXL_ENABLE_ASSERT
+#define JXL_ENABLE_ASSERT 1
+#endif
+
+#ifndef JXL_ENABLE_CHECK
+#define JXL_ENABLE_CHECK 1
+#endif
+
+// Pass -DJXL_DEBUG_ON_ERROR at compile time to print debug messages when a
+// function returns JXL_FAILURE or calls JXL_NOTIFY_ERROR. Note that this is
+// irrelevant if you also pass -DJXL_CRASH_ON_ERROR.
+#if defined(JXL_DEBUG_ON_ERROR) || defined(JXL_CRASH_ON_ERROR)
+#undef JXL_DEBUG_ON_ERROR
+#define JXL_DEBUG_ON_ERROR 1
+#else  // JXL_DEBUG_ON_ERROR || JXL_CRASH_ON_ERROR
+#ifdef NDEBUG
+#define JXL_DEBUG_ON_ERROR 0
+#else  // NDEBUG
+#define JXL_DEBUG_ON_ERROR 1
+#endif  // NDEBUG
+#endif  // JXL_DEBUG_ON_ERROR || JXL_CRASH_ON_ERROR
+
+// Pass -DJXL_DEBUG_ON_ALL_ERROR at compile time to print debug messages on
+// all error (fatal and non-fatal) status. This implies JXL_DEBUG_ON_ERROR.
+#if defined(JXL_DEBUG_ON_ALL_ERROR)
+#undef JXL_DEBUG_ON_ALL_ERROR
+#define JXL_DEBUG_ON_ALL_ERROR 1
+// JXL_DEBUG_ON_ALL_ERROR implies JXL_DEBUG_ON_ERROR too.
+#undef JXL_DEBUG_ON_ERROR
+#define JXL_DEBUG_ON_ERROR 1
+#else  // JXL_DEBUG_ON_ALL_ERROR
+#define JXL_DEBUG_ON_ALL_ERROR 0
+#endif  // JXL_DEBUG_ON_ALL_ERROR
+
+// The Verbose level for the library
+#ifndef JXL_DEBUG_V_LEVEL
+#define JXL_DEBUG_V_LEVEL 0
+#endif  // JXL_DEBUG_V_LEVEL
+
+// Pass -DJXL_DEBUG_ON_ABORT=0 to disable the debug messages on JXL_ASSERT,
+// JXL_CHECK and JXL_ABORT.
+#ifndef JXL_DEBUG_ON_ABORT
+#define JXL_DEBUG_ON_ABORT 1
+#endif  // JXL_DEBUG_ON_ABORT
+
+// Print a debug message on standard error. You should use the JXL_DEBUG macro
+// instead of calling Debug directly. This function returns false, so it can be
+// used as a return value in JXL_FAILURE.
+JXL_FORMAT(1, 2)
+inline JXL_NOINLINE bool Debug(const char* format, ...) {
+  va_list args;
+  va_start(args, format);
+  vfprintf(stderr, format, args);
+  va_end(args);
+  return false;
+}
+
+// Print a debug message on standard error if "enabled" is true. "enabled" is
+// normally a macro that evaluates to 0 or 1 at compile time, so the Debug
+// function is never called and optimized out in release builds. Note that the
+// arguments are compiled but not evaluated when enabled is false. The format
+// string must be a explicit string in the call, for example:
+//   JXL_DEBUG(JXL_DEBUG_MYMODULE, "my module message: %d", some_var);
+// Add a header at the top of your module's .cc or .h file (depending on whether
+// you have JXL_DEBUG calls from the .h as well) like this:
+//   #ifndef JXL_DEBUG_MYMODULE
+//   #define JXL_DEBUG_MYMODULE 0
+//   #endif JXL_DEBUG_MYMODULE
+#define JXL_DEBUG_TMP(format, ...) \
+  ::jxl::Debug(("%s:%d: " format "\n"), __FILE__, __LINE__, ##__VA_ARGS__)
+
+#define JXL_DEBUG(enabled, format, ...)     \
+  do {                                      \
+    if (enabled) {                          \
+      JXL_DEBUG_TMP(format, ##__VA_ARGS__); \
+    }                                       \
+  } while (0)
+
+// JXL_DEBUG version that prints the debug message if the global verbose level
+// defined at compile time by JXL_DEBUG_V_LEVEL is greater or equal than the
+// passed level.
+#define JXL_DEBUG_V(level, format, ...) \
+  JXL_DEBUG(level <= JXL_DEBUG_V_LEVEL, format, ##__VA_ARGS__)
+
+// Warnings (via JXL_WARNING) are enabled by default in debug builds (opt and
+// debug).
+#ifdef JXL_DEBUG_WARNING
+#undef JXL_DEBUG_WARNING
+#define JXL_DEBUG_WARNING 1
+#else  // JXL_DEBUG_WARNING
+#ifdef NDEBUG
+#define JXL_DEBUG_WARNING 0
+#else  // JXL_DEBUG_WARNING
+#define JXL_DEBUG_WARNING 1
+#endif  // NDEBUG
+#endif  // JXL_DEBUG_WARNING
+#define JXL_WARNING(format, ...) \
+  JXL_DEBUG(JXL_DEBUG_WARNING, format, ##__VA_ARGS__)
+
+// Exits the program after printing a stack trace when possible.
+JXL_NORETURN inline JXL_NOINLINE bool Abort() {
+#if JXL_ADDRESS_SANITIZER || JXL_MEMORY_SANITIZER || JXL_THREAD_SANITIZER
+  // If compiled with any sanitizer print a stack trace. This call doesn't crash
+  // the program, instead the trap below will crash it also allowing gdb to
+  // break there.
+  __sanitizer_print_stack_trace();
+#endif  // *_SANITIZER)
+
+#if JXL_COMPILER_MSVC
+  __debugbreak();
+  abort();
+#else
+  __builtin_trap();
+#endif
+}
+
+// Exits the program after printing file/line plus a formatted string.
+#define JXL_ABORT(format, ...)                                              \
+  ((JXL_DEBUG_ON_ABORT) && ::jxl::Debug(("%s:%d: JXL_ABORT: " format "\n"), \
+                                        __FILE__, __LINE__, ##__VA_ARGS__), \
+   ::jxl::Abort())
+
+// Does not guarantee running the code, use only for debug mode checks.
+#if JXL_ENABLE_ASSERT
+#define JXL_ASSERT(condition)                                      \
+  do {                                                             \
+    if (!(condition)) {                                            \
+      JXL_DEBUG(JXL_DEBUG_ON_ABORT, "JXL_ASSERT: %s", #condition); \
+      ::jxl::Abort();                                              \
+    }                                                              \
+  } while (0)
+#else
+#define JXL_ASSERT(condition) \
+  do {                        \
+  } while (0)
+#endif
+
+// Define JXL_IS_DEBUG_BUILD that denotes asan, msan and other debug builds,
+// but not opt or release.
+#ifndef JXL_IS_DEBUG_BUILD
+#if !defined(NDEBUG) || defined(ADDRESS_SANITIZER) ||         \
+    defined(MEMORY_SANITIZER) || defined(THREAD_SANITIZER) || \
+    defined(__clang_analyzer__)
+#define JXL_IS_DEBUG_BUILD 1
+#else
+#define JXL_IS_DEBUG_BUILD 0
+#endif
+#endif  //  JXL_IS_DEBUG_BUILD
+
+// Same as above, but only runs in debug builds (builds where NDEBUG is not
+// defined). This is useful for slower asserts that we want to run more rarely
+// than usual. These will run on asan, msan and other debug builds, but not in
+// opt or release.
+#if JXL_IS_DEBUG_BUILD
+#define JXL_DASSERT(condition)                                      \
+  do {                                                              \
+    if (!(condition)) {                                             \
+      JXL_DEBUG(JXL_DEBUG_ON_ABORT, "JXL_DASSERT: %s", #condition); \
+      ::jxl::Abort();                                               \
+    }                                                               \
+  } while (0)
+#else
+#define JXL_DASSERT(condition) \
+  do {                         \
+  } while (0)
+#endif
+
+// Always runs the condition, so can be used for non-debug calls.
+#if JXL_ENABLE_CHECK
+#define JXL_CHECK(condition)                                      \
+  do {                                                            \
+    if (!(condition)) {                                           \
+      JXL_DEBUG(JXL_DEBUG_ON_ABORT, "JXL_CHECK: %s", #condition); \
+      ::jxl::Abort();                                             \
+    }                                                             \
+  } while (0)
+#else
+#define JXL_CHECK(condition) \
+  do {                       \
+    (void)(condition);       \
+  } while (0)
+#endif
+
+// A jxl::Status value from a StatusCode or Status which prints a debug message
+// when enabled.
+#define JXL_STATUS(status, format, ...)                                        \
+  ::jxl::StatusMessage(::jxl::Status(status), "%s:%d: " format "\n", __FILE__, \
+                       __LINE__, ##__VA_ARGS__)
+
+// Notify of an error but discard the resulting Status value. This is only
+// useful for debug builds or when building with JXL_CRASH_ON_ERROR.
+#define JXL_NOTIFY_ERROR(format, ...)                                      \
+  (void)JXL_STATUS(::jxl::StatusCode::kGenericError, "JXL_ERROR: " format, \
+                   ##__VA_ARGS__)
+
+// An error Status with a message. The JXL_STATUS() macro will return a Status
+// object with a kGenericError code, but the comma operator helps with
+// clang-tidy inference and potentially with optimizations.
+#define JXL_FAILURE(format, ...)                                              \
+  ((void)JXL_STATUS(::jxl::StatusCode::kGenericError, "JXL_FAILURE: " format, \
+                    ##__VA_ARGS__),                                           \
+   ::jxl::Status(::jxl::StatusCode::kGenericError))
+
+// Always evaluates the status exactly once, so can be used for non-debug calls.
+// Returns from the current context if the passed Status expression is an error
+// (fatal or non-fatal). The return value is the passed Status.
+#define JXL_RETURN_IF_ERROR(status)                                       \
+  do {                                                                    \
+    ::jxl::Status jxl_return_if_error_status = (status);                  \
+    if (!jxl_return_if_error_status) {                                    \
+      (void)::jxl::StatusMessage(                                         \
+          jxl_return_if_error_status,                                     \
+          "%s:%d: JXL_RETURN_IF_ERROR code=%d: %s\n", __FILE__, __LINE__, \
+          static_cast<int>(jxl_return_if_error_status.code()), #status);  \
+      return jxl_return_if_error_status;                                  \
+    }                                                                     \
+  } while (0)
+
+// As above, but without calling StatusMessage. Intended for bundles (see
+// fields.h), which have numerous call sites (-> relevant for code size) and do
+// not want to generate excessive messages when decoding partial headers.
+#define JXL_QUIET_RETURN_IF_ERROR(status)                \
+  do {                                                   \
+    ::jxl::Status jxl_return_if_error_status = (status); \
+    if (!jxl_return_if_error_status) {                   \
+      return jxl_return_if_error_status;                 \
+    }                                                    \
+  } while (0)
+
+enum class StatusCode : int32_t {
+  // Non-fatal errors (negative values).
+  kNotEnoughBytes = -1,
+
+  // The only non-error status code.
+  kOk = 0,
+
+  // Fatal-errors (positive values)
+  kGenericError = 1,
+};
+
+// Drop-in replacement for bool that raises compiler warnings if not used
+// after being returned from a function. Example:
+// Status LoadFile(...) { return true; } is more compact than
+// bool JXL_MUST_USE_RESULT LoadFile(...) { return true; }
+// In case of error, the status can carry an extra error code in its value which
+// is split between fatal and non-fatal error codes.
+class JXL_MUST_USE_RESULT Status {
+ public:
+  // We want implicit constructor from bool to allow returning "true" or "false"
+  // on a function when using Status. "true" means kOk while "false" means a
+  // generic fatal error.
+  // NOLINTNEXTLINE(google-explicit-constructor)
+  constexpr Status(bool ok)
+      : code_(ok ? StatusCode::kOk : StatusCode::kGenericError) {}
+
+  // NOLINTNEXTLINE(google-explicit-constructor)
+  constexpr Status(StatusCode code) : code_(code) {}
+
+  // We also want implicit cast to bool to check for return values of functions.
+  // NOLINTNEXTLINE(google-explicit-constructor)
+  constexpr operator bool() const { return code_ == StatusCode::kOk; }
+
+  constexpr StatusCode code() const { return code_; }
+
+  // Returns whether the status code is a fatal error.
+  constexpr bool IsFatalError() const {
+    return static_cast<int32_t>(code_) > 0;
+  }
+
+ private:
+  StatusCode code_;
+};
+
+// Helper function to create a Status and print the debug message or abort when
+// needed.
+inline JXL_FORMAT(2, 3) Status
+    StatusMessage(const Status status, const char* format, ...) {
+  // This block will be optimized out when JXL_DEBUG_ON_ERROR and
+  // JXL_DEBUG_ON_ALL_ERROR are both disabled.
+  if ((JXL_DEBUG_ON_ERROR && status.IsFatalError()) ||
+      (JXL_DEBUG_ON_ALL_ERROR && !status)) {
+    va_list args;
+    va_start(args, format);
+    vfprintf(stderr, format, args);
+    va_end(args);
+  }
+#ifdef JXL_CRASH_ON_ERROR
+  // JXL_CRASH_ON_ERROR means to Abort() only on non-fatal errors.
+  if (status.IsFatalError()) {
+    Abort();
+  }
+#endif  // JXL_CRASH_ON_ERROR
+  return status;
+}
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_BASE_STATUS_H_
diff --git a/third_party/jpeg-xl/lib/jxl/base/tsc_timer.h b/third_party/jpeg-xl/lib/jxl/base/tsc_timer.h
new file mode 100644
index 0000000000..74d51f72d1
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/base/tsc_timer.h
@@ -0,0 +1,172 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_BASE_TSC_TIMER_H_
+#define LIB_JXL_BASE_TSC_TIMER_H_
+
+// High-resolution (~10 ns) timestamps, using fences to prevent reordering and
+// ensure exactly the desired regions are measured.
+
+#include <stdint.h>
+#include <time.h>  // clock_gettime
+
+#if defined(_WIN32) || defined(_WIN64)
+#ifndef WIN32_LEAN_AND_MEAN
+#define WIN32_LEAN_AND_MEAN
+#endif  // WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
+#define NOMINMAX
+#endif  // NOMINMAX
+#ifndef NOGDI
+#define NOGDI
+#endif  // NOGDI
+#include <windows.h>
+// Undef macros to avoid collisions
+#undef LoadFence
+#endif
+
+#if defined(__APPLE__)
+#include <mach/mach.h>
+#include <mach/mach_time.h>
+#endif
+
+#if defined(__HAIKU__)
+#include <OS.h>
+#endif
+
+#include <ctime>
+#include <hwy/base.h>
+#include <hwy/cache_control.h>  // LoadFence
+
+namespace jxl {
+namespace profiler {
+
+// Ticks := platform-specific timer values (CPU cycles on x86). Must be
+// unsigned to guarantee wraparound on overflow.
+using Ticks = uint64_t;
+
+// TicksBefore/After return absolute timestamps and must be placed immediately
+// before and after the region to measure. We provide separate Before/After
+// functions because they use different fences.
+//
+// Background: RDTSC is not 'serializing'; earlier instructions may complete
+// after it, and/or later instructions may complete before it. 'Fences' ensure
+// regions' elapsed times are independent of such reordering. The only
+// documented unprivileged serializing instruction is CPUID, which acts as a
+// full fence (no reordering across it in either direction). Unfortunately
+// the latency of CPUID varies wildly (perhaps made worse by not initializing
+// its EAX input). Because it cannot reliably be deducted from the region's
+// elapsed time, it must not be included in the region to measure (i.e.
+// between the two RDTSC).
+//
+// The newer RDTSCP is sometimes described as serializing, but it actually
+// only serves as a half-fence with release semantics. Although all
+// instructions in the region will complete before the final timestamp is
+// captured, subsequent instructions may leak into the region and increase the
+// elapsed time. Inserting another fence after the final RDTSCP would prevent
+// such reordering without affecting the measured region.
+//
+// Fortunately, such a fence exists. The LFENCE instruction is only documented
+// to delay later loads until earlier loads are visible. However, Intel's
+// reference manual says it acts as a full fence (waiting until all earlier
+// instructions have completed, and delaying later instructions until it
+// completes). AMD assigns the same behavior to MFENCE.
+//
+// We need a fence before the initial RDTSC to prevent earlier instructions
+// from leaking into the region, and arguably another after RDTSC to avoid
+// region instructions from completing before the timestamp is recorded.
+// When surrounded by fences, the additional RDTSCP half-fence provides no
+// benefit, so the initial timestamp can be recorded via RDTSC, which has
+// lower overhead than RDTSCP because it does not read TSC_AUX. In summary,
+// we define Before = LFENCE/RDTSC/LFENCE; After = RDTSCP/LFENCE.
+//
+// Using Before+Before leads to higher variance and overhead than After+After.
+// However, After+After includes an LFENCE in the region measurements, which
+// adds a delay dependent on earlier loads. The combination of Before+After
+// is faster than Before+Before and more consistent than After+After because
+// the first LFENCE already delayed subsequent loads before the measured
+// region. This combination seems not to have been considered in prior work:
+// http://akaros.cs.berkeley.edu/lxr/akaros/kern/arch/x86/rdtsc_test.c
+//
+// Note: performance counters can measure 'exact' instructions-retired or
+// (unhalted) cycle counts. The RDPMC instruction is not serializing and also
+// requires fences. Unfortunately, it is not accessible on all OSes and we
+// prefer to avoid kernel-mode drivers. Performance counters are also affected
+// by several under/over-count errata, so we use the TSC instead.
+
+// Returns a 64-bit timestamp in unit of 'ticks'; to convert to seconds,
+// divide by InvariantTicksPerSecond.
+static HWY_INLINE HWY_MAYBE_UNUSED Ticks TicksBefore() {
+  Ticks t;
+#if HWY_ARCH_PPC && defined(__GLIBC__)
+  asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268));
+#elif HWY_ARCH_X86 && HWY_COMPILER_MSVC
+  hwy::LoadFence();
+  HWY_FENCE;
+  t = __rdtsc();
+  hwy::LoadFence();
+  HWY_FENCE;
+#elif HWY_ARCH_X86_64
+  asm volatile(
+      "lfence\n\t"
+      "rdtsc\n\t"
+      "shl $32, %%rdx\n\t"
+      "or %%rdx, %0\n\t"
+      "lfence"
+      : "=a"(t)
+      :
+      // "memory" avoids reordering. rdx = TSC >> 32.
+      // "cc" = flags modified by SHL.
+      : "rdx", "memory", "cc");
+#elif HWY_ARCH_RVV
+  asm volatile("rdcycle %0" : "=r"(t));
+#elif defined(_WIN32) || defined(_WIN64)
+  LARGE_INTEGER counter;
+  (void)QueryPerformanceCounter(&counter);
+  t = counter.QuadPart;
+#elif defined(__APPLE__)
+  t = mach_absolute_time();
+#elif defined(__HAIKU__)
+  t = system_time_nsecs();  // since boot
+#else  // POSIX
+  timespec ts;
+  clock_gettime(CLOCK_MONOTONIC, &ts);
+  t = static_cast<Ticks>(ts.tv_sec * 1000000000LL + ts.tv_nsec);
+#endif
+  return t;
+}
+
+static HWY_INLINE HWY_MAYBE_UNUSED Ticks TicksAfter() {
+  Ticks t;
+#if HWY_ARCH_PPC && defined(__GLIBC__)
+  asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268));
+#elif HWY_ARCH_X86 && HWY_COMPILER_MSVC
+  HWY_FENCE;
+  unsigned aux;
+  t = __rdtscp(&aux);
+  hwy::LoadFence();
+  HWY_FENCE;
+#elif HWY_ARCH_X86_64
+  // Use inline asm because __rdtscp generates code to store TSC_AUX (ecx).
+  asm volatile(
+      "rdtscp\n\t"
+      "shl $32, %%rdx\n\t"
+      "or %%rdx, %0\n\t"
+      "lfence"
+      : "=a"(t)
+      :
+      // "memory" avoids reordering. rcx = TSC_AUX. rdx = TSC >> 32.
+      // "cc" = flags modified by SHL.
+      : "rcx", "rdx", "memory", "cc");
+#else
+  t = TicksBefore();  // no difference on other platforms.
+#endif
+  return t;
+}
+
+}  // namespace profiler
+}  // namespace jxl
+
+#endif  // LIB_JXL_BASE_TSC_TIMER_H_
diff --git a/third_party/jpeg-xl/lib/jxl/bit_reader_test.cc b/third_party/jpeg-xl/lib/jxl/bit_reader_test.cc
new file mode 100644
index 0000000000..24cc9b64e8
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/bit_reader_test.cc
@@ -0,0 +1,262 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <array>
+#include <vector>
+
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/random.h"
+#include "lib/jxl/base/span.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/enc_aux_out.h"
+#include "lib/jxl/enc_bit_writer.h"
+#include "lib/jxl/test_utils.h"
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+namespace {
+
+TEST(BitReaderTest, ExtendsWithZeroes) {
+  for (size_t size = 4; size < 32; ++size) {
+    std::vector<uint8_t> data(size, 0xff);
+
+    for (size_t n_bytes = 0; n_bytes < size; n_bytes++) {
+      BitReader br(Span<const uint8_t>(data.data(), n_bytes));
+      // Read all the bits
+      for (size_t i = 0; i < n_bytes * kBitsPerByte; i++) {
+        ASSERT_EQ(br.ReadBits(1), 1u) << "n_bytes=" << n_bytes << " i=" << i;
+      }
+
+      // PEEK more than the declared size - all will be zero. Cannot consume.
+      for (size_t i = 0; i < BitReader::kMaxBitsPerCall; i++) {
+        ASSERT_EQ(br.PeekBits(i), 0u)
+            << "size=" << size << "n_bytes=" << n_bytes << " i=" << i;
+      }
+
+      EXPECT_TRUE(br.Close());
+    }
+  }
+}
+
+struct Symbol {
+  uint32_t num_bits;
+  uint32_t value;
+};
+
+// Reading from output gives the same values.
+TEST(BitReaderTest, TestRoundTrip) {
+  test::ThreadPoolForTests pool(8);
+  EXPECT_TRUE(RunOnPool(
+      &pool, 0, 1000, ThreadPool::NoInit,
+      [](const uint32_t task, size_t /* thread */) {
+        constexpr size_t kMaxBits = 8000;
+        BitWriter writer;
+        BitWriter::Allotment allotment(&writer, kMaxBits);
+
+        std::vector<Symbol> symbols;
+        symbols.reserve(1000);
+
+        Rng rng(55537 + 129 * task);
+
+        for (;;) {
+          const uint32_t num_bits = rng.UniformU(1, 33);
+          if (writer.BitsWritten() + num_bits > kMaxBits) break;
+          const uint32_t value = rng.UniformU(0, 1ULL << num_bits);
+          symbols.push_back({num_bits, value});
+          writer.Write(num_bits, value);
+        }
+
+        writer.ZeroPadToByte();
+        allotment.ReclaimAndCharge(&writer, 0, nullptr);
+        BitReader reader(writer.GetSpan());
+        for (const Symbol& s : symbols) {
+          EXPECT_EQ(s.value, reader.ReadBits(s.num_bits));
+        }
+        EXPECT_TRUE(reader.Close());
+      },
+      "TestTBitReaderRoundTrip"));
+}
+
+// SkipBits is the same as reading that many bits.
+TEST(BitReaderTest, TestSkip) {
+  test::ThreadPoolForTests pool(8);
+  EXPECT_TRUE(RunOnPool(
+      &pool, 0, 96, ThreadPool::NoInit,
+      [](const uint32_t task, size_t /* thread */) {
+        constexpr size_t kSize = 100;
+
+        for (size_t skip = 0; skip < 128; ++skip) {
+          BitWriter writer;
+          BitWriter::Allotment allotment(&writer, kSize * kBitsPerByte);
+          // Start with "task" 1-bits.
+          for (size_t i = 0; i < task; ++i) {
+            writer.Write(1, 1);
+          }
+
+          // Write 0-bits that we will skip over
+          for (size_t i = 0; i < skip; ++i) {
+            writer.Write(1, 0);
+          }
+
+          // Write terminator bits '101'
+          writer.Write(3, 5);
+          EXPECT_EQ(task + skip + 3, writer.BitsWritten());
+          writer.ZeroPadToByte();
+          AuxOut aux_out;
+          allotment.ReclaimAndCharge(&writer, 0, &aux_out);
+          EXPECT_LT(aux_out.layers[0].total_bits, kSize * 8);
+
+          BitReader reader1(writer.GetSpan());
+          BitReader reader2(writer.GetSpan());
+          // Verify initial 1-bits
+          for (size_t i = 0; i < task; ++i) {
+            EXPECT_EQ(1u, reader1.ReadBits(1));
+            EXPECT_EQ(1u, reader2.ReadBits(1));
+          }
+
+          // SkipBits or manually read "skip" bits
+          reader1.SkipBits(skip);
+          for (size_t i = 0; i < skip; ++i) {
+            EXPECT_EQ(0u, reader2.ReadBits(1))
+                << " skip=" << skip << " i=" << i;
+          }
+          EXPECT_EQ(reader1.TotalBitsConsumed(), reader2.TotalBitsConsumed());
+
+          // Ensure both readers see the terminator bits.
+          EXPECT_EQ(5u, reader1.ReadBits(3));
+          EXPECT_EQ(5u, reader2.ReadBits(3));
+
+          EXPECT_TRUE(reader1.Close());
+          EXPECT_TRUE(reader2.Close());
+        }
+      },
+      "TestSkip"));
+}
+
+// Verifies byte order and different groupings of bits.
+TEST(BitReaderTest, TestOrder) {
+  constexpr size_t kMaxBits = 16;
+
+  // u(1) - bits written into LSBs of first byte
+  {
+    BitWriter writer;
+    BitWriter::Allotment allotment(&writer, kMaxBits);
+    for (size_t i = 0; i < 5; ++i) {
+      writer.Write(1, 1);
+    }
+    for (size_t i = 0; i < 5; ++i) {
+      writer.Write(1, 0);
+    }
+    for (size_t i = 0; i < 6; ++i) {
+      writer.Write(1, 1);
+    }
+
+    writer.ZeroPadToByte();
+    allotment.ReclaimAndCharge(&writer, 0, nullptr);
+    BitReader reader(writer.GetSpan());
+    EXPECT_EQ(0x1Fu, reader.ReadFixedBits<8>());
+    EXPECT_EQ(0xFCu, reader.ReadFixedBits<8>());
+    EXPECT_TRUE(reader.Close());
+  }
+
+  // u(8) - get bytes in the same order
+  {
+    BitWriter writer;
+    BitWriter::Allotment allotment(&writer, kMaxBits);
+    writer.Write(8, 0xF8);
+    writer.Write(8, 0x3F);
+
+    writer.ZeroPadToByte();
+    allotment.ReclaimAndCharge(&writer, 0, nullptr);
+    BitReader reader(writer.GetSpan());
+    EXPECT_EQ(0xF8u, reader.ReadFixedBits<8>());
+    EXPECT_EQ(0x3Fu, reader.ReadFixedBits<8>());
+    EXPECT_TRUE(reader.Close());
+  }
+
+  // u(16) - little-endian bytes
+  {
+    BitWriter writer;
+    BitWriter::Allotment allotment(&writer, kMaxBits);
+    writer.Write(16, 0xF83F);
+
+    writer.ZeroPadToByte();
+    allotment.ReclaimAndCharge(&writer, 0, nullptr);
+    BitReader reader(writer.GetSpan());
+    EXPECT_EQ(0x3Fu, reader.ReadFixedBits<8>());
+    EXPECT_EQ(0xF8u, reader.ReadFixedBits<8>());
+    EXPECT_TRUE(reader.Close());
+  }
+
+  // Non-byte-aligned, mixed sizes
+  {
+    BitWriter writer;
+    BitWriter::Allotment allotment(&writer, kMaxBits);
+    writer.Write(1, 1);
+    writer.Write(3, 6);
+    writer.Write(8, 0xDB);
+    writer.Write(4, 8);
+
+    writer.ZeroPadToByte();
+    allotment.ReclaimAndCharge(&writer, 0, nullptr);
+    BitReader reader(writer.GetSpan());
+    EXPECT_EQ(0xBDu, reader.ReadFixedBits<8>());
+    EXPECT_EQ(0x8Du, reader.ReadFixedBits<8>());
+    EXPECT_TRUE(reader.Close());
+  }
+}
+
+TEST(BitReaderTest, TotalCountersTest) {
+  uint8_t buf[8] = {1, 2, 3, 4};
+  BitReader reader(Span<const uint8_t>(buf, sizeof(buf)));
+
+  EXPECT_EQ(sizeof(buf), reader.TotalBytes());
+  EXPECT_EQ(0u, reader.TotalBitsConsumed());
+  reader.ReadFixedBits<1>();
+  EXPECT_EQ(1u, reader.TotalBitsConsumed());
+
+  reader.ReadFixedBits<10>();
+  EXPECT_EQ(11u, reader.TotalBitsConsumed());
+
+  reader.ReadFixedBits<4>();
+  EXPECT_EQ(15u, reader.TotalBitsConsumed());
+
+  reader.ReadFixedBits<1>();
+  EXPECT_EQ(16u, reader.TotalBitsConsumed());
+
+  reader.ReadFixedBits<16>();
+  EXPECT_EQ(32u, reader.TotalBitsConsumed());
+
+  EXPECT_TRUE(reader.Close());
+}
+
+TEST(BitReaderTest, MoveTest) {
+  uint8_t buf[8] = {1, 2, 3, 4};
+  BitReader reader2;
+  {
+    BitReader reader1(Span<const uint8_t>(buf, sizeof(buf)));
+
+    EXPECT_EQ(0u, reader1.TotalBitsConsumed());
+    reader1.ReadFixedBits<16>();
+    EXPECT_EQ(16u, reader1.TotalBitsConsumed());
+
+    reader2 = std::move(reader1);
+    // From this point reader1 is invalid, but can continue to access reader2
+    // and we don't need to call Close() on reader1.
+  }
+
+  EXPECT_EQ(16u, reader2.TotalBitsConsumed());
+  EXPECT_EQ(3U, reader2.ReadFixedBits<8>());
+  EXPECT_EQ(24u, reader2.TotalBitsConsumed());
+
+  EXPECT_TRUE(reader2.Close());
+}
+
+}  // namespace
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/bits_test.cc b/third_party/jpeg-xl/lib/jxl/bits_test.cc
new file mode 100644
index 0000000000..bd7aa548c8
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/bits_test.cc
@@ -0,0 +1,87 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/base/bits.h"
+
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+namespace {
+
+TEST(BitsTest, TestNumZeroBits) {
+  // Zero input is well-defined.
+  EXPECT_EQ(32u, Num0BitsAboveMS1Bit(0u));
+  EXPECT_EQ(64u, Num0BitsAboveMS1Bit(0ull));
+  EXPECT_EQ(32u, Num0BitsBelowLS1Bit(0u));
+  EXPECT_EQ(64u, Num0BitsBelowLS1Bit(0ull));
+
+  EXPECT_EQ(31u, Num0BitsAboveMS1Bit(1u));
+  EXPECT_EQ(30u, Num0BitsAboveMS1Bit(2u));
+  EXPECT_EQ(63u, Num0BitsAboveMS1Bit(1ull));
+  EXPECT_EQ(62u, Num0BitsAboveMS1Bit(2ull));
+
+  EXPECT_EQ(0u, Num0BitsBelowLS1Bit(1u));
+  EXPECT_EQ(0u, Num0BitsBelowLS1Bit(1ull));
+  EXPECT_EQ(1u, Num0BitsBelowLS1Bit(2u));
+  EXPECT_EQ(1u, Num0BitsBelowLS1Bit(2ull));
+
+  EXPECT_EQ(0u, Num0BitsAboveMS1Bit(0x80000000u));
+  EXPECT_EQ(0u, Num0BitsAboveMS1Bit(0x8000000000000000ull));
+  EXPECT_EQ(31u, Num0BitsBelowLS1Bit(0x80000000u));
+  EXPECT_EQ(63u, Num0BitsBelowLS1Bit(0x8000000000000000ull));
+}
+
+TEST(BitsTest, TestFloorLog2) {
+  // for input = [1, 7]
+  const size_t expected[7] = {0, 1, 1, 2, 2, 2, 2};
+  for (uint32_t i = 1; i <= 7; ++i) {
+    EXPECT_EQ(expected[i - 1], FloorLog2Nonzero(i)) << " " << i;
+    EXPECT_EQ(expected[i - 1], FloorLog2Nonzero(uint64_t(i))) << " " << i;
+  }
+
+  EXPECT_EQ(11u, FloorLog2Nonzero(0x00000fffu));  // 4095
+  EXPECT_EQ(12u, FloorLog2Nonzero(0x00001000u));  // 4096
+  EXPECT_EQ(12u, FloorLog2Nonzero(0x00001001u));  // 4097
+
+  EXPECT_EQ(31u, FloorLog2Nonzero(0x80000000u));
+  EXPECT_EQ(31u, FloorLog2Nonzero(0x80000001u));
+  EXPECT_EQ(31u, FloorLog2Nonzero(0xFFFFFFFFu));
+
+  EXPECT_EQ(31u, FloorLog2Nonzero(0x80000000ull));
+  EXPECT_EQ(31u, FloorLog2Nonzero(0x80000001ull));
+  EXPECT_EQ(31u, FloorLog2Nonzero(0xFFFFFFFFull));
+
+  EXPECT_EQ(63u, FloorLog2Nonzero(0x8000000000000000ull));
+  EXPECT_EQ(63u, FloorLog2Nonzero(0x8000000000000001ull));
+  EXPECT_EQ(63u, FloorLog2Nonzero(0xFFFFFFFFFFFFFFFFull));
+}
+
+TEST(BitsTest, TestCeilLog2) {
+  // for input = [1, 7]
+  const size_t expected[7] = {0, 1, 2, 2, 3, 3, 3};
+  for (uint32_t i = 1; i <= 7; ++i) {
+    EXPECT_EQ(expected[i - 1], CeilLog2Nonzero(i)) << " " << i;
+    EXPECT_EQ(expected[i - 1], CeilLog2Nonzero(uint64_t(i))) << " " << i;
+  }
+
+  EXPECT_EQ(12u, CeilLog2Nonzero(0x00000fffu));  // 4095
+  EXPECT_EQ(12u, CeilLog2Nonzero(0x00001000u));  // 4096
+  EXPECT_EQ(13u, CeilLog2Nonzero(0x00001001u));  // 4097
+
+  EXPECT_EQ(31u, CeilLog2Nonzero(0x80000000u));
+  EXPECT_EQ(32u, CeilLog2Nonzero(0x80000001u));
+  EXPECT_EQ(32u, CeilLog2Nonzero(0xFFFFFFFFu));
+
+  EXPECT_EQ(31u, CeilLog2Nonzero(0x80000000ull));
+  EXPECT_EQ(32u, CeilLog2Nonzero(0x80000001ull));
+  EXPECT_EQ(32u, CeilLog2Nonzero(0xFFFFFFFFull));
+
+  EXPECT_EQ(63u, CeilLog2Nonzero(0x8000000000000000ull));
+  EXPECT_EQ(64u, CeilLog2Nonzero(0x8000000000000001ull));
+  EXPECT_EQ(64u, CeilLog2Nonzero(0xFFFFFFFFFFFFFFFFull));
+}
+
+}  // namespace
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/blending.cc b/third_party/jpeg-xl/lib/jxl/blending.cc
new file mode 100644
index 0000000000..ab37fdabb5
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/blending.cc
@@ -0,0 +1,152 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/blending.h"
+
+#include "lib/jxl/alpha.h"
+#include "lib/jxl/image_ops.h"
+
+namespace jxl {
+
+bool NeedsBlending(PassesDecoderState* dec_state) {
+  const PassesSharedState& state = *dec_state->shared;
+  if (!(state.frame_header.frame_type == FrameType::kRegularFrame ||
+        state.frame_header.frame_type == FrameType::kSkipProgressive)) {
+    return false;
+  }
+  const auto& info = state.frame_header.blending_info;
+  bool replace_all = (info.mode == BlendMode::kReplace);
+  for (const auto& ec_i : state.frame_header.extra_channel_blending_info) {
+    if (ec_i.mode != BlendMode::kReplace) {
+      replace_all = false;
+    }
+  }
+  // Replace the full frame: nothing to do.
+  if (!state.frame_header.custom_size_or_origin && replace_all) {
+    return false;
+  }
+  return true;
+}
+
+void PerformBlending(const float* const* bg, const float* const* fg,
+                     float* const* out, size_t x0, size_t xsize,
+                     const PatchBlending& color_blending,
+                     const PatchBlending* ec_blending,
+                     const std::vector<ExtraChannelInfo>& extra_channel_info) {
+  bool has_alpha = false;
+  size_t num_ec = extra_channel_info.size();
+  for (size_t i = 0; i < num_ec; i++) {
+    if (extra_channel_info[i].type == jxl::ExtraChannel::kAlpha) {
+      has_alpha = true;
+      break;
+    }
+  }
+  ImageF tmp(xsize, 3 + num_ec);
+  // Blend extra channels first so that we use the pre-blending alpha.
+  for (size_t i = 0; i < num_ec; i++) {
+    if (ec_blending[i].mode == PatchBlendMode::kAdd) {
+      for (size_t x = 0; x < xsize; x++) {
+        tmp.Row(3 + i)[x] = bg[3 + i][x + x0] + fg[3 + i][x + x0];
+      }
+    } else if (ec_blending[i].mode == PatchBlendMode::kBlendAbove) {
+      size_t alpha = ec_blending[i].alpha_channel;
+      bool is_premultiplied = extra_channel_info[alpha].alpha_associated;
+      PerformAlphaBlending(bg[3 + i] + x0, bg[3 + alpha] + x0, fg[3 + i] + x0,
+                           fg[3 + alpha] + x0, tmp.Row(3 + i), xsize,
+                           is_premultiplied, ec_blending[i].clamp);
+    } else if (ec_blending[i].mode == PatchBlendMode::kBlendBelow) {
+      size_t alpha = ec_blending[i].alpha_channel;
+      bool is_premultiplied = extra_channel_info[alpha].alpha_associated;
+      PerformAlphaBlending(fg[3 + i] + x0, fg[3 + alpha] + x0, bg[3 + i] + x0,
+                           bg[3 + alpha] + x0, tmp.Row(3 + i), xsize,
+                           is_premultiplied, ec_blending[i].clamp);
+    } else if (ec_blending[i].mode == PatchBlendMode::kAlphaWeightedAddAbove) {
+      size_t alpha = ec_blending[i].alpha_channel;
+      PerformAlphaWeightedAdd(bg[3 + i] + x0, fg[3 + i] + x0,
+                              fg[3 + alpha] + x0, tmp.Row(3 + i), xsize,
+                              ec_blending[i].clamp);
+    } else if (ec_blending[i].mode == PatchBlendMode::kAlphaWeightedAddBelow) {
+      size_t alpha = ec_blending[i].alpha_channel;
+      PerformAlphaWeightedAdd(fg[3 + i] + x0, bg[3 + i] + x0,
+                              bg[3 + alpha] + x0, tmp.Row(3 + i), xsize,
+                              ec_blending[i].clamp);
+    } else if (ec_blending[i].mode == PatchBlendMode::kMul) {
+      PerformMulBlending(bg[3 + i] + x0, fg[3 + i] + x0, tmp.Row(3 + i), xsize,
+                         ec_blending[i].clamp);
+    } else if (ec_blending[i].mode == PatchBlendMode::kReplace) {
+      memcpy(tmp.Row(3 + i), fg[3 + i] + x0, xsize * sizeof(**fg));
+    } else if (ec_blending[i].mode == PatchBlendMode::kNone) {
+      if (xsize) memcpy(tmp.Row(3 + i), bg[3 + i] + x0, xsize * sizeof(**fg));
+    } else {
+      JXL_ABORT("Unreachable");
+    }
+  }
+  size_t alpha = color_blending.alpha_channel;
+
+  if (color_blending.mode == PatchBlendMode::kAdd ||
+      (color_blending.mode == PatchBlendMode::kAlphaWeightedAddAbove &&
+       !has_alpha) ||
+      (color_blending.mode == PatchBlendMode::kAlphaWeightedAddBelow &&
+       !has_alpha)) {
+    for (int p = 0; p < 3; p++) {
+      float* out = tmp.Row(p);
+      for (size_t x = 0; x < xsize; x++) {
+        out[x] = bg[p][x + x0] + fg[p][x + x0];
+      }
+    }
+  } else if (color_blending.mode == PatchBlendMode::kBlendAbove
+             // blend without alpha is just replace
+             && has_alpha) {
+    bool is_premultiplied = extra_channel_info[alpha].alpha_associated;
+    PerformAlphaBlending(
+        {bg[0] + x0, bg[1] + x0, bg[2] + x0, bg[3 + alpha] + x0},
+        {fg[0] + x0, fg[1] + x0, fg[2] + x0, fg[3 + alpha] + x0},
+        {tmp.Row(0), tmp.Row(1), tmp.Row(2), tmp.Row(3 + alpha)}, xsize,
+        is_premultiplied, color_blending.clamp);
+  } else if (color_blending.mode == PatchBlendMode::kBlendBelow
+             // blend without alpha is just replace
+             && has_alpha) {
+    bool is_premultiplied = extra_channel_info[alpha].alpha_associated;
+    PerformAlphaBlending(
+        {fg[0] + x0, fg[1] + x0, fg[2] + x0, fg[3 + alpha] + x0},
+        {bg[0] + x0, bg[1] + x0, bg[2] + x0, bg[3 + alpha] + x0},
+        {tmp.Row(0), tmp.Row(1), tmp.Row(2), tmp.Row(3 + alpha)}, xsize,
+        is_premultiplied, color_blending.clamp);
+  } else if (color_blending.mode == PatchBlendMode::kAlphaWeightedAddAbove) {
+    JXL_DASSERT(has_alpha);
+    for (size_t c = 0; c < 3; c++) {
+      PerformAlphaWeightedAdd(bg[c] + x0, fg[c] + x0, fg[3 + alpha] + x0,
+                              tmp.Row(c), xsize, color_blending.clamp);
+    }
+  } else if (color_blending.mode == PatchBlendMode::kAlphaWeightedAddBelow) {
+    JXL_DASSERT(has_alpha);
+    for (size_t c = 0; c < 3; c++) {
+      PerformAlphaWeightedAdd(fg[c] + x0, bg[c] + x0, bg[3 + alpha] + x0,
+                              tmp.Row(c), xsize, color_blending.clamp);
+    }
+  } else if (color_blending.mode == PatchBlendMode::kMul) {
+    for (int p = 0; p < 3; p++) {
+      PerformMulBlending(bg[p] + x0, fg[p] + x0, tmp.Row(p), xsize,
+                         color_blending.clamp);
+    }
+  } else if (color_blending.mode == PatchBlendMode::kReplace ||
+             color_blending.mode == PatchBlendMode::kBlendAbove ||
+             color_blending.mode == PatchBlendMode::kBlendBelow) {  // kReplace
+    for (size_t p = 0; p < 3; p++) {
+      memcpy(tmp.Row(p), fg[p] + x0, xsize * sizeof(**fg));
+    }
+  } else if (color_blending.mode == PatchBlendMode::kNone) {
+    for (size_t p = 0; p < 3; p++) {
+      memcpy(tmp.Row(p), bg[p] + x0, xsize * sizeof(**fg));
+    }
+  } else {
+    JXL_ABORT("Unreachable");
+  }
+  for (size_t i = 0; i < 3 + num_ec; i++) {
+    if (xsize != 0) memcpy(out[i] + x0, tmp.Row(i), xsize * sizeof(**out));
+  }
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/blending.h b/third_party/jpeg-xl/lib/jxl/blending.h
new file mode 100644
index 0000000000..7eab7d50cd
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/blending.h
@@ -0,0 +1,24 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_BLENDING_H_
+#define LIB_JXL_BLENDING_H_
+#include "lib/jxl/dec_cache.h"
+#include "lib/jxl/dec_patch_dictionary.h"
+#include "lib/jxl/image_bundle.h"
+
+namespace jxl {
+
+bool NeedsBlending(PassesDecoderState* dec_state);
+
+void PerformBlending(const float* const* bg, const float* const* fg,
+                     float* const* out, size_t x0, size_t xsize,
+                     const PatchBlending& color_blending,
+                     const PatchBlending* ec_blending,
+                     const std::vector<ExtraChannelInfo>& extra_channel_info);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_BLENDING_H_
diff --git a/third_party/jpeg-xl/lib/jxl/blending_test.cc b/third_party/jpeg-xl/lib/jxl/blending_test.cc
new file mode 100644
index 0000000000..ff4c46c529
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/blending_test.cc
@@ -0,0 +1,37 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/extras/codec.h"
+#include "lib/jxl/image_test_utils.h"
+#include "lib/jxl/test_utils.h"
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+namespace {
+
+using ::testing::SizeIs;
+
+TEST(BlendingTest, Crops) {
+  const PaddedBytes compressed =
+      jxl::test::ReadTestData("jxl/blending/cropped_traffic_light.jxl");
+  CodecInOut decoded;
+  ASSERT_TRUE(test::DecodeFile({}, Span<const uint8_t>(compressed), &decoded));
+  ASSERT_THAT(decoded.frames, SizeIs(4));
+
+  int i = 0;
+  for (const ImageBundle& ib : decoded.frames) {
+    std::ostringstream filename;
+    filename << "jxl/blending/cropped_traffic_light_frame-" << i << ".png";
+    const PaddedBytes compressed_frame =
+        jxl::test::ReadTestData(filename.str());
+    CodecInOut frame;
+    ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(compressed_frame), &frame));
+    JXL_EXPECT_OK(SamePixels(ib.color(), *frame.Main().color(), _));
+    ++i;
+  }
+}
+
+}  // namespace
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/box_content_decoder.cc b/third_party/jpeg-xl/lib/jxl/box_content_decoder.cc
new file mode 100644
index 0000000000..c4cba3a31a
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/box_content_decoder.cc
@@ -0,0 +1,101 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/box_content_decoder.h"
+
+#include "lib/jxl/sanitizers.h"
+
+namespace jxl {
+
+JxlBoxContentDecoder::JxlBoxContentDecoder() {}
+
+JxlBoxContentDecoder::~JxlBoxContentDecoder() {
+  if (brotli_dec) {
+    BrotliDecoderDestroyInstance(brotli_dec);
+  }
+}
+
+void JxlBoxContentDecoder::StartBox(bool brob_decode, bool box_until_eof,
+                                    size_t contents_size) {
+  if (brotli_dec) {
+    BrotliDecoderDestroyInstance(brotli_dec);
+    brotli_dec = nullptr;
+  }
+  header_done_ = false;
+  brob_decode_ = brob_decode;
+  box_until_eof_ = box_until_eof;
+  remaining_ = box_until_eof ? 0 : contents_size;
+  pos_ = 0;
+}
+
+JxlDecoderStatus JxlBoxContentDecoder::Process(const uint8_t* next_in,
+                                               size_t avail_in, size_t box_pos,
+                                               uint8_t** next_out,
+                                               size_t* avail_out) {
+  next_in += pos_ - box_pos;
+  avail_in -= pos_ - box_pos;
+
+  if (brob_decode_) {
+    if (!header_done_) {
+      if (avail_in < 4) return JXL_DEC_NEED_MORE_INPUT;
+      if (!box_until_eof_) {
+        if (remaining_ < 4) return JXL_DEC_ERROR;
+        remaining_ -= 4;
+      }
+      next_in += 4;
+      avail_in -= 4;
+      pos_ += 4;
+      header_done_ = true;
+    }
+
+    if (!brotli_dec) {
+      brotli_dec = BrotliDecoderCreateInstance(nullptr, nullptr, nullptr);
+    }
+
+    const uint8_t* next_in_before = next_in;
+    uint8_t* next_out_before = *next_out;
+    msan::MemoryIsInitialized(next_in, avail_in);
+    BrotliDecoderResult res = BrotliDecoderDecompressStream(
+        brotli_dec, &avail_in, &next_in, avail_out, next_out, nullptr);
+    size_t consumed = next_in - next_in_before;
+    size_t produced = *next_out - next_out_before;
+    if (res == BROTLI_DECODER_RESULT_ERROR) {
+      return JXL_DEC_ERROR;
+    }
+    msan::UnpoisonMemory(next_out_before, produced);
+    pos_ += consumed;
+    if (!box_until_eof_) remaining_ -= consumed;
+    if (res == BROTLI_DECODER_RESULT_NEEDS_MORE_INPUT) {
+      return JXL_DEC_NEED_MORE_INPUT;
+    }
+    if (res == BROTLI_DECODER_RESULT_NEEDS_MORE_OUTPUT) {
+      return JXL_DEC_BOX_NEED_MORE_OUTPUT;
+    }
+    if (res == BROTLI_DECODER_RESULT_SUCCESS) {
+      return JXL_DEC_SUCCESS;
+    }
+    // unknown Brotli result
+    return JXL_DEC_ERROR;
+  } else {
+    // remaining box bytes as seen from dec->file_pos
+    size_t can_read = avail_in;
+    if (!box_until_eof_) can_read = std::min<size_t>(can_read, remaining_);
+    size_t to_write = std::min<size_t>(can_read, *avail_out);
+    memcpy(*next_out, next_in, to_write);
+
+    *next_out += to_write;
+    *avail_out -= to_write;
+    if (!box_until_eof_) remaining_ -= to_write;
+    pos_ += to_write;
+
+    if (to_write < can_read) return JXL_DEC_BOX_NEED_MORE_OUTPUT;
+
+    if (!box_until_eof_ && remaining_ > 0) return JXL_DEC_NEED_MORE_INPUT;
+
+    return JXL_DEC_SUCCESS;
+  }
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/box_content_decoder.h b/third_party/jpeg-xl/lib/jxl/box_content_decoder.h
new file mode 100644
index 0000000000..6153360a8e
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/box_content_decoder.h
@@ -0,0 +1,49 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_BOX_CONTENT_DECODER_H_
+#define LIB_JXL_BOX_CONTENT_DECODER_H_
+
+#include <brotli/decode.h>
+#include <jxl/decode.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+#include <memory>
+#include <vector>
+
+namespace jxl {
+
+/** Outputs the contents of a box in a streaming fashion, either directly, or
+ * optionally decoding with Brotli, in case of a brob box. The input must be
+ * the contents of a box, excluding the box header.
+ */
+class JxlBoxContentDecoder {
+ public:
+  JxlBoxContentDecoder();
+  ~JxlBoxContentDecoder();
+
+  void StartBox(bool brob_decode, bool box_until_eof, size_t contents_size);
+
+  // Outputs decoded bytes from the box, decoding with brotli if needed.
+  // box_pos is the position in the box content which next_in points to.
+  // Returns success, whether more input or output bytes are needed, or error.
+  JxlDecoderStatus Process(const uint8_t* next_in, size_t avail_in,
+                           size_t box_pos, uint8_t** next_out,
+                           size_t* avail_out);
+
+ private:
+  BrotliDecoderState* brotli_dec;
+
+  bool header_done_;
+  bool brob_decode_;
+  bool box_until_eof_;
+  size_t remaining_;
+  size_t pos_;
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_BOX_CONTENT_DECODER_H_
diff --git a/third_party/jpeg-xl/lib/jxl/butteraugli/butteraugli.cc b/third_party/jpeg-xl/lib/jxl/butteraugli/butteraugli.cc
new file mode 100644
index 0000000000..a412becd0d
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/butteraugli/butteraugli.cc
@@ -0,0 +1,1988 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+//
+// Author: Jyrki Alakuijala (jyrki.alakuijala@gmail.com)
+//
+// The physical architecture of butteraugli is based on the following naming
+// convention:
+//   * Opsin - dynamics of the photosensitive chemicals in the retina
+//             with their immediate electrical processing
+//   * Xyb - hybrid opponent/trichromatic color space
+//     x is roughly red-subtract-green.
+//     y is yellow.
+//     b is blue.
+//     Xyb values are computed from Opsin mixing, not directly from rgb.
+//   * Mask - for visual masking
+//   * Hf - color modeling for spatially high-frequency features
+//   * Lf - color modeling for spatially low-frequency features
+//   * Diffmap - to cluster and build an image of error between the images
+//   * Blur - to hold the smoothing code
+
+#include "lib/jxl/butteraugli/butteraugli.h"
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <algorithm>
+#include <array>
+#include <cmath>
+#include <new>
+#include <vector>
+
+#if JXL_PROFILER_ENABLED
+#include <chrono>
+#endif  // JXL_PROFILER_ENABLED
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/butteraugli/butteraugli.cc"
+#include <hwy/foreach_target.h>
+
+#include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/base/profiler.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/convolve.h"
+#include "lib/jxl/fast_math-inl.h"
+#include "lib/jxl/gauss_blur.h"
+#include "lib/jxl/image_ops.h"
+
+#ifndef JXL_BUTTERAUGLI_ONCE
+#define JXL_BUTTERAUGLI_ONCE
+
+namespace jxl {
+
+std::vector<float> ComputeKernel(float sigma) {
+  const float m = 2.25;  // Accuracy increases when m is increased.
+  const double scaler = -1.0 / (2.0 * sigma * sigma);
+  const int diff = std::max<int>(1, m * std::fabs(sigma));
+  std::vector<float> kernel(2 * diff + 1);
+  for (int i = -diff; i <= diff; ++i) {
+    kernel[i + diff] = std::exp(scaler * i * i);
+  }
+  return kernel;
+}
+
+void ConvolveBorderColumn(const ImageF& in, const std::vector<float>& kernel,
+                          const size_t x, float* BUTTERAUGLI_RESTRICT row_out) {
+  const size_t offset = kernel.size() / 2;
+  int minx = x < offset ? 0 : x - offset;
+  int maxx = std::min<int>(in.xsize() - 1, x + offset);
+  float weight = 0.0f;
+  for (int j = minx; j <= maxx; ++j) {
+    weight += kernel[j - x + offset];
+  }
+  float scale = 1.0f / weight;
+  for (size_t y = 0; y < in.ysize(); ++y) {
+    const float* BUTTERAUGLI_RESTRICT row_in = in.Row(y);
+    float sum = 0.0f;
+    for (int j = minx; j <= maxx; ++j) {
+      sum += row_in[j] * kernel[j - x + offset];
+    }
+    row_out[y] = sum * scale;
+  }
+}
+
+// Computes a horizontal convolution and transposes the result.
+void ConvolutionWithTranspose(const ImageF& in,
+                              const std::vector<float>& kernel,
+                              ImageF* BUTTERAUGLI_RESTRICT out) {
+  PROFILER_FUNC;
+  JXL_CHECK(out->xsize() == in.ysize());
+  JXL_CHECK(out->ysize() == in.xsize());
+  const size_t len = kernel.size();
+  const size_t offset = len / 2;
+  float weight_no_border = 0.0f;
+  for (size_t j = 0; j < len; ++j) {
+    weight_no_border += kernel[j];
+  }
+  const float scale_no_border = 1.0f / weight_no_border;
+  const size_t border1 = std::min(in.xsize(), offset);
+  const size_t border2 = in.xsize() > offset ? in.xsize() - offset : 0;
+  std::vector<float> scaled_kernel(len / 2 + 1);
+  for (size_t i = 0; i <= len / 2; ++i) {
+    scaled_kernel[i] = kernel[i] * scale_no_border;
+  }
+
+  // middle
+  switch (len) {
+    case 7: {
+      PROFILER_ZONE("conv7");
+      const float sk0 = scaled_kernel[0];
+      const float sk1 = scaled_kernel[1];
+      const float sk2 = scaled_kernel[2];
+      const float sk3 = scaled_kernel[3];
+      for (size_t y = 0; y < in.ysize(); ++y) {
+        const float* BUTTERAUGLI_RESTRICT row_in = in.Row(y) + border1 - offset;
+        for (size_t x = border1; x < border2; ++x, ++row_in) {
+          const float sum0 = (row_in[0] + row_in[6]) * sk0;
+          const float sum1 = (row_in[1] + row_in[5]) * sk1;
+          const float sum2 = (row_in[2] + row_in[4]) * sk2;
+          const float sum = (row_in[3]) * sk3 + sum0 + sum1 + sum2;
+          float* BUTTERAUGLI_RESTRICT row_out = out->Row(x);
+          row_out[y] = sum;
+        }
+      }
+    } break;
+    case 13: {
+      PROFILER_ZONE("conv15");
+      for (size_t y = 0; y < in.ysize(); ++y) {
+        const float* BUTTERAUGLI_RESTRICT row_in = in.Row(y) + border1 - offset;
+        for (size_t x = border1; x < border2; ++x, ++row_in) {
+          float sum0 = (row_in[0] + row_in[12]) * scaled_kernel[0];
+          float sum1 = (row_in[1] + row_in[11]) * scaled_kernel[1];
+          float sum2 = (row_in[2] + row_in[10]) * scaled_kernel[2];
+          float sum3 = (row_in[3] + row_in[9]) * scaled_kernel[3];
+          sum0 += (row_in[4] + row_in[8]) * scaled_kernel[4];
+          sum1 += (row_in[5] + row_in[7]) * scaled_kernel[5];
+          const float sum = (row_in[6]) * scaled_kernel[6];
+          float* BUTTERAUGLI_RESTRICT row_out = out->Row(x);
+          row_out[y] = sum + sum0 + sum1 + sum2 + sum3;
+        }
+      }
+      break;
+    }
+    case 15: {
+      PROFILER_ZONE("conv15");
+      for (size_t y = 0; y < in.ysize(); ++y) {
+        const float* BUTTERAUGLI_RESTRICT row_in = in.Row(y) + border1 - offset;
+        for (size_t x = border1; x < border2; ++x, ++row_in) {
+          float sum0 = (row_in[0] + row_in[14]) * scaled_kernel[0];
+          float sum1 = (row_in[1] + row_in[13]) * scaled_kernel[1];
+          float sum2 = (row_in[2] + row_in[12]) * scaled_kernel[2];
+          float sum3 = (row_in[3] + row_in[11]) * scaled_kernel[3];
+          sum0 += (row_in[4] + row_in[10]) * scaled_kernel[4];
+          sum1 += (row_in[5] + row_in[9]) * scaled_kernel[5];
+          sum2 += (row_in[6] + row_in[8]) * scaled_kernel[6];
+          const float sum = (row_in[7]) * scaled_kernel[7];
+          float* BUTTERAUGLI_RESTRICT row_out = out->Row(x);
+          row_out[y] = sum + sum0 + sum1 + sum2 + sum3;
+        }
+      }
+      break;
+    }
+    case 33: {
+      PROFILER_ZONE("conv33");
+      for (size_t y = 0; y < in.ysize(); ++y) {
+        const float* BUTTERAUGLI_RESTRICT row_in = in.Row(y) + border1 - offset;
+        for (size_t x = border1; x < border2; ++x, ++row_in) {
+          float sum0 = (row_in[0] + row_in[32]) * scaled_kernel[0];
+          float sum1 = (row_in[1] + row_in[31]) * scaled_kernel[1];
+          float sum2 = (row_in[2] + row_in[30]) * scaled_kernel[2];
+          float sum3 = (row_in[3] + row_in[29]) * scaled_kernel[3];
+          sum0 += (row_in[4] + row_in[28]) * scaled_kernel[4];
+          sum1 += (row_in[5] + row_in[27]) * scaled_kernel[5];
+          sum2 += (row_in[6] + row_in[26]) * scaled_kernel[6];
+          sum3 += (row_in[7] + row_in[25]) * scaled_kernel[7];
+          sum0 += (row_in[8] + row_in[24]) * scaled_kernel[8];
+          sum1 += (row_in[9] + row_in[23]) * scaled_kernel[9];
+          sum2 += (row_in[10] + row_in[22]) * scaled_kernel[10];
+          sum3 += (row_in[11] + row_in[21]) * scaled_kernel[11];
+          sum0 += (row_in[12] + row_in[20]) * scaled_kernel[12];
+          sum1 += (row_in[13] + row_in[19]) * scaled_kernel[13];
+          sum2 += (row_in[14] + row_in[18]) * scaled_kernel[14];
+          sum3 += (row_in[15] + row_in[17]) * scaled_kernel[15];
+          const float sum = (row_in[16]) * scaled_kernel[16];
+          float* BUTTERAUGLI_RESTRICT row_out = out->Row(x);
+          row_out[y] = sum + sum0 + sum1 + sum2 + sum3;
+        }
+      }
+      break;
+    }
+    default:
+      printf("Warning: Unexpected kernel size! %" PRIuS "\n", len);
+      for (size_t y = 0; y < in.ysize(); ++y) {
+        const float* BUTTERAUGLI_RESTRICT row_in = in.Row(y);
+        for (size_t x = border1; x < border2; ++x) {
+          const int d = x - offset;
+          float* BUTTERAUGLI_RESTRICT row_out = out->Row(x);
+          float sum = 0.0f;
+          size_t j;
+          for (j = 0; j <= len / 2; ++j) {
+            sum += row_in[d + j] * scaled_kernel[j];
+          }
+          for (; j < len; ++j) {
+            sum += row_in[d + j] * scaled_kernel[len - 1 - j];
+          }
+          row_out[y] = sum;
+        }
+      }
+  }
+  // left border
+  for (size_t x = 0; x < border1; ++x) {
+    ConvolveBorderColumn(in, kernel, x, out->Row(x));
+  }
+
+  // right border
+  for (size_t x = border2; x < in.xsize(); ++x) {
+    ConvolveBorderColumn(in, kernel, x, out->Row(x));
+  }
+}
+
+// A blur somewhat similar to a 2D Gaussian blur.
+// See: https://en.wikipedia.org/wiki/Gaussian_blur
+//
+// This is a bottleneck because the sigma can be quite large (>7). We can use
+// gauss_blur.cc (runtime independent of sigma, closer to a 4*sigma truncated
+// Gaussian and our 2.25 in ComputeKernel), but its boundary conditions are
+// zero-valued. This leads to noticeable differences at the edges of diffmaps.
+// We retain a special case for 5x5 kernels (even faster than gauss_blur),
+// optionally use gauss_blur followed by fixup of the borders for large images,
+// or fall back to the previous truncated FIR followed by a transpose.
+void Blur(const ImageF& in, float sigma, const ButteraugliParams& params,
+          BlurTemp* temp, ImageF* out) {
+  std::vector<float> kernel = ComputeKernel(sigma);
+  // Separable5 does an in-place convolution, so this fast path is not safe if
+  // in aliases out.
+  if (kernel.size() == 5 && &in != out) {
+    float sum_weights = 0.0f;
+    for (const float w : kernel) {
+      sum_weights += w;
+    }
+    const float scale = 1.0f / sum_weights;
+    const float w0 = kernel[2] * scale;
+    const float w1 = kernel[1] * scale;
+    const float w2 = kernel[0] * scale;
+    const WeightsSeparable5 weights = {
+        {HWY_REP4(w0), HWY_REP4(w1), HWY_REP4(w2)},
+        {HWY_REP4(w0), HWY_REP4(w1), HWY_REP4(w2)},
+    };
+    Separable5(in, Rect(in), weights, /*pool=*/nullptr, out);
+    return;
+  }
+
+  ImageF* JXL_RESTRICT temp_t = temp->GetTransposed(in);
+  ConvolutionWithTranspose(in, kernel, temp_t);
+  ConvolutionWithTranspose(*temp_t, kernel, out);
+}
+
+// Allows PaddedMaltaUnit to call either function via overloading.
+struct MaltaTagLF {};
+struct MaltaTag {};
+
+}  // namespace jxl
+
+#endif  // JXL_BUTTERAUGLI_ONCE
+
+#include <hwy/highway.h>
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Abs;
+using hwy::HWY_NAMESPACE::Div;
+using hwy::HWY_NAMESPACE::Gt;
+using hwy::HWY_NAMESPACE::IfThenElse;
+using hwy::HWY_NAMESPACE::IfThenElseZero;
+using hwy::HWY_NAMESPACE::Lt;
+using hwy::HWY_NAMESPACE::Max;
+using hwy::HWY_NAMESPACE::Mul;
+using hwy::HWY_NAMESPACE::MulAdd;
+using hwy::HWY_NAMESPACE::MulSub;
+using hwy::HWY_NAMESPACE::Neg;
+using hwy::HWY_NAMESPACE::Sub;
+using hwy::HWY_NAMESPACE::Vec;
+using hwy::HWY_NAMESPACE::ZeroIfNegative;
+
+template <class D, class V>
+HWY_INLINE V MaximumClamp(D d, V v, double kMaxVal) {
+  static const double kMul = 0.724216145665;
+  const V mul = Set(d, kMul);
+  const V maxval = Set(d, kMaxVal);
+  // If greater than maxval or less than -maxval, replace with if_*.
+  const V if_pos = MulAdd(Sub(v, maxval), mul, maxval);
+  const V if_neg = MulSub(Add(v, maxval), mul, maxval);
+  const V pos_or_v = IfThenElse(Ge(v, maxval), if_pos, v);
+  return IfThenElse(Lt(v, Neg(maxval)), if_neg, pos_or_v);
+}
+
+// Make area around zero less important (remove it).
+template <class D, class V>
+HWY_INLINE V RemoveRangeAroundZero(const D d, const double kw, const V x) {
+  const auto w = Set(d, kw);
+  return IfThenElse(Gt(x, w), Sub(x, w),
+                    IfThenElseZero(Lt(x, Neg(w)), Add(x, w)));
+}
+
+// Make area around zero more important (2x it until the limit).
+template <class D, class V>
+HWY_INLINE V AmplifyRangeAroundZero(const D d, const double kw, const V x) {
+  const auto w = Set(d, kw);
+  return IfThenElse(Gt(x, w), Add(x, w),
+                    IfThenElse(Lt(x, Neg(w)), Sub(x, w), Add(x, x)));
+}
+
+// XybLowFreqToVals converts from low-frequency XYB space to the 'vals' space.
+// Vals space can be converted to L2-norm space (Euclidean and normalized)
+// through visual masking.
+template <class D, class V>
+HWY_INLINE void XybLowFreqToVals(const D d, const V& x, const V& y,
+                                 const V& b_arg, V* HWY_RESTRICT valx,
+                                 V* HWY_RESTRICT valy, V* HWY_RESTRICT valb) {
+  static const double xmul_scalar = 33.832837186260;
+  static const double ymul_scalar = 14.458268100570;
+  static const double bmul_scalar = 49.87984651440;
+  static const double y_to_b_mul_scalar = -0.362267051518;
+  const V xmul = Set(d, xmul_scalar);
+  const V ymul = Set(d, ymul_scalar);
+  const V bmul = Set(d, bmul_scalar);
+  const V y_to_b_mul = Set(d, y_to_b_mul_scalar);
+  const V b = MulAdd(y_to_b_mul, y, b_arg);
+  *valb = Mul(b, bmul);
+  *valx = Mul(x, xmul);
+  *valy = Mul(y, ymul);
+}
+
+void SuppressXByY(const ImageF& in_x, const ImageF& in_y, const double yw,
+                  ImageF* HWY_RESTRICT out) {
+  JXL_DASSERT(SameSize(in_x, in_y) && SameSize(in_x, *out));
+  const size_t xsize = in_x.xsize();
+  const size_t ysize = in_x.ysize();
+
+  const HWY_FULL(float) d;
+  static const double s = 0.653020556257;
+  const auto sv = Set(d, s);
+  const auto one_minus_s = Set(d, 1.0 - s);
+  const auto ywv = Set(d, yw);
+
+  for (size_t y = 0; y < ysize; ++y) {
+    const float* HWY_RESTRICT row_x = in_x.ConstRow(y);
+    const float* HWY_RESTRICT row_y = in_y.ConstRow(y);
+    float* HWY_RESTRICT row_out = out->Row(y);
+
+    for (size_t x = 0; x < xsize; x += Lanes(d)) {
+      const auto vx = Load(d, row_x + x);
+      const auto vy = Load(d, row_y + x);
+      const auto scaler =
+          MulAdd(Div(ywv, MulAdd(vy, vy, ywv)), one_minus_s, sv);
+      Store(Mul(scaler, vx), d, row_out + x);
+    }
+  }
+}
+
+static void SeparateFrequencies(size_t xsize, size_t ysize,
+                                const ButteraugliParams& params,
+                                BlurTemp* blur_temp, const Image3F& xyb,
+                                PsychoImage& ps) {
+  PROFILER_FUNC;
+  const HWY_FULL(float) d;
+
+  // Extract lf ...
+  static const double kSigmaLf = 7.15593339443;
+  static const double kSigmaHf = 3.22489901262;
+  static const double kSigmaUhf = 1.56416327805;
+  ps.mf = Image3F(xsize, ysize);
+  ps.hf[0] = ImageF(xsize, ysize);
+  ps.hf[1] = ImageF(xsize, ysize);
+  ps.lf = Image3F(xyb.xsize(), xyb.ysize());
+  ps.mf = Image3F(xyb.xsize(), xyb.ysize());
+  for (int i = 0; i < 3; ++i) {
+    Blur(xyb.Plane(i), kSigmaLf, params, blur_temp, &ps.lf.Plane(i));
+
+    // ... and keep everything else in mf.
+    for (size_t y = 0; y < ysize; ++y) {
+      const float* BUTTERAUGLI_RESTRICT row_xyb = xyb.PlaneRow(i, y);
+      const float* BUTTERAUGLI_RESTRICT row_lf = ps.lf.ConstPlaneRow(i, y);
+      float* BUTTERAUGLI_RESTRICT row_mf = ps.mf.PlaneRow(i, y);
+      for (size_t x = 0; x < xsize; x += Lanes(d)) {
+        const auto mf = Sub(Load(d, row_xyb + x), Load(d, row_lf + x));
+        Store(mf, d, row_mf + x);
+      }
+    }
+    if (i == 2) {
+      Blur(ps.mf.Plane(i), kSigmaHf, params, blur_temp, &ps.mf.Plane(i));
+      break;
+    }
+    // Divide mf into mf and hf.
+    for (size_t y = 0; y < ysize; ++y) {
+      float* BUTTERAUGLI_RESTRICT row_mf = ps.mf.PlaneRow(i, y);
+      float* BUTTERAUGLI_RESTRICT row_hf = ps.hf[i].Row(y);
+      for (size_t x = 0; x < xsize; x += Lanes(d)) {
+        Store(Load(d, row_mf + x), d, row_hf + x);
+      }
+    }
+    Blur(ps.mf.Plane(i), kSigmaHf, params, blur_temp, &ps.mf.Plane(i));
+    static const double kRemoveMfRange = 0.29;
+    static const double kAddMfRange = 0.1;
+    if (i == 0) {
+      for (size_t y = 0; y < ysize; ++y) {
+        float* BUTTERAUGLI_RESTRICT row_mf = ps.mf.PlaneRow(0, y);
+        float* BUTTERAUGLI_RESTRICT row_hf = ps.hf[0].Row(y);
+        for (size_t x = 0; x < xsize; x += Lanes(d)) {
+          auto mf = Load(d, row_mf + x);
+          auto hf = Sub(Load(d, row_hf + x), mf);
+          mf = RemoveRangeAroundZero(d, kRemoveMfRange, mf);
+          Store(mf, d, row_mf + x);
+          Store(hf, d, row_hf + x);
+        }
+      }
+    } else {
+      for (size_t y = 0; y < ysize; ++y) {
+        float* BUTTERAUGLI_RESTRICT row_mf = ps.mf.PlaneRow(1, y);
+        float* BUTTERAUGLI_RESTRICT row_hf = ps.hf[1].Row(y);
+        for (size_t x = 0; x < xsize; x += Lanes(d)) {
+          auto mf = Load(d, row_mf + x);
+          auto hf = Sub(Load(d, row_hf + x), mf);
+
+          mf = AmplifyRangeAroundZero(d, kAddMfRange, mf);
+          Store(mf, d, row_mf + x);
+          Store(hf, d, row_hf + x);
+        }
+      }
+    }
+  }
+
+  // Temporarily used as output of SuppressXByY
+  ps.uhf[0] = ImageF(xsize, ysize);
+  ps.uhf[1] = ImageF(xsize, ysize);
+
+  // Suppress red-green by intensity change in the high freq channels.
+  static const double suppress = 46.0;
+  SuppressXByY(ps.hf[0], ps.hf[1], suppress, &ps.uhf[0]);
+  // hf is the SuppressXByY output, uhf will be written below.
+  ps.hf[0].Swap(ps.uhf[0]);
+
+  for (int i = 0; i < 2; ++i) {
+    // Divide hf into hf and uhf.
+    for (size_t y = 0; y < ysize; ++y) {
+      float* BUTTERAUGLI_RESTRICT row_uhf = ps.uhf[i].Row(y);
+      float* BUTTERAUGLI_RESTRICT row_hf = ps.hf[i].Row(y);
+      for (size_t x = 0; x < xsize; ++x) {
+        row_uhf[x] = row_hf[x];
+      }
+    }
+    Blur(ps.hf[i], kSigmaUhf, params, blur_temp, &ps.hf[i]);
+    static const double kRemoveHfRange = 1.5;
+    static const double kAddHfRange = 0.132;
+    static const double kRemoveUhfRange = 0.04;
+    static const double kMaxclampHf = 28.4691806922;
+    static const double kMaxclampUhf = 5.19175294647;
+    static double kMulYHf = 2.155;
+    static double kMulYUhf = 2.69313763794;
+    if (i == 0) {
+      for (size_t y = 0; y < ysize; ++y) {
+        float* BUTTERAUGLI_RESTRICT row_uhf = ps.uhf[0].Row(y);
+        float* BUTTERAUGLI_RESTRICT row_hf = ps.hf[0].Row(y);
+        for (size_t x = 0; x < xsize; x += Lanes(d)) {
+          auto hf = Load(d, row_hf + x);
+          auto uhf = Sub(Load(d, row_uhf + x), hf);
+          hf = RemoveRangeAroundZero(d, kRemoveHfRange, hf);
+          uhf = RemoveRangeAroundZero(d, kRemoveUhfRange, uhf);
+          Store(hf, d, row_hf + x);
+          Store(uhf, d, row_uhf + x);
+        }
+      }
+    } else {
+      for (size_t y = 0; y < ysize; ++y) {
+        float* BUTTERAUGLI_RESTRICT row_uhf = ps.uhf[1].Row(y);
+        float* BUTTERAUGLI_RESTRICT row_hf = ps.hf[1].Row(y);
+        for (size_t x = 0; x < xsize; x += Lanes(d)) {
+          auto hf = Load(d, row_hf + x);
+          hf = MaximumClamp(d, hf, kMaxclampHf);
+
+          auto uhf = Sub(Load(d, row_uhf + x), hf);
+          uhf = MaximumClamp(d, uhf, kMaxclampUhf);
+          uhf = Mul(uhf, Set(d, kMulYUhf));
+          Store(uhf, d, row_uhf + x);
+
+          hf = Mul(hf, Set(d, kMulYHf));
+          hf = AmplifyRangeAroundZero(d, kAddHfRange, hf);
+          Store(hf, d, row_hf + x);
+        }
+      }
+    }
+  }
+  // Modify range around zero code only concerns the high frequency
+  // planes and only the X and Y channels.
+  // Convert low freq xyb to vals space so that we can do a simple squared sum
+  // diff on the low frequencies later.
+  for (size_t y = 0; y < ysize; ++y) {
+    float* BUTTERAUGLI_RESTRICT row_x = ps.lf.PlaneRow(0, y);
+    float* BUTTERAUGLI_RESTRICT row_y = ps.lf.PlaneRow(1, y);
+    float* BUTTERAUGLI_RESTRICT row_b = ps.lf.PlaneRow(2, y);
+    for (size_t x = 0; x < xsize; x += Lanes(d)) {
+      auto valx = Undefined(d);
+      auto valy = Undefined(d);
+      auto valb = Undefined(d);
+      XybLowFreqToVals(d, Load(d, row_x + x), Load(d, row_y + x),
+                       Load(d, row_b + x), &valx, &valy, &valb);
+      Store(valx, d, row_x + x);
+      Store(valy, d, row_y + x);
+      Store(valb, d, row_b + x);
+    }
+  }
+}
+
+namespace {
+template <typename V>
+BUTTERAUGLI_INLINE V Sum(V a, V b, V c, V d) {
+  return Add(Add(a, b), Add(c, d));
+}
+template <typename V>
+BUTTERAUGLI_INLINE V Sum(V a, V b, V c, V d, V e) {
+  return Sum(a, b, c, Add(d, e));
+}
+template <typename V>
+BUTTERAUGLI_INLINE V Sum(V a, V b, V c, V d, V e, V f, V g) {
+  return Sum(a, b, c, Sum(d, e, f, g));
+}
+template <typename V>
+BUTTERAUGLI_INLINE V Sum(V a, V b, V c, V d, V e, V f, V g, V h, V i) {
+  return Add(Add(Sum(a, b, c, d), Sum(e, f, g, h)), i);
+}
+}  // namespace
+
+template <class D>
+Vec<D> MaltaUnit(MaltaTagLF /*tag*/, const D df,
+                 const float* BUTTERAUGLI_RESTRICT d, const intptr_t xs) {
+  const intptr_t xs3 = 3 * xs;
+
+  const auto center = LoadU(df, d);
+
+  // x grows, y constant
+  const auto sum_yconst = Sum(LoadU(df, d - 4), LoadU(df, d - 2), center,
+                              LoadU(df, d + 2), LoadU(df, d + 4));
+  // Will return this, sum of all line kernels
+  auto retval = Mul(sum_yconst, sum_yconst);
+  {
+    // y grows, x constant
+    auto sum = Sum(LoadU(df, d - xs3 - xs), LoadU(df, d - xs - xs), center,
+                   LoadU(df, d + xs + xs), LoadU(df, d + xs3 + xs));
+    retval = MulAdd(sum, sum, retval);
+  }
+  {
+    // both grow
+    auto sum = Sum(LoadU(df, d - xs3 - 3), LoadU(df, d - xs - xs - 2), center,
+                   LoadU(df, d + xs + xs + 2), LoadU(df, d + xs3 + 3));
+    retval = MulAdd(sum, sum, retval);
+  }
+  {
+    // y grows, x shrinks
+    auto sum = Sum(LoadU(df, d - xs3 + 3), LoadU(df, d - xs - xs + 2), center,
+                   LoadU(df, d + xs + xs - 2), LoadU(df, d + xs3 - 3));
+    retval = MulAdd(sum, sum, retval);
+  }
+  {
+    // y grows -4 to 4, x shrinks 1 -> -1
+    auto sum =
+        Sum(LoadU(df, d - xs3 - xs + 1), LoadU(df, d - xs - xs + 1), center,
+            LoadU(df, d + xs + xs - 1), LoadU(df, d + xs3 + xs - 1));
+    retval = MulAdd(sum, sum, retval);
+  }
+  {
+    //  y grows -4 to 4, x grows -1 -> 1
+    auto sum =
+        Sum(LoadU(df, d - xs3 - xs - 1), LoadU(df, d - xs - xs - 1), center,
+            LoadU(df, d + xs + xs + 1), LoadU(df, d + xs3 + xs + 1));
+    retval = MulAdd(sum, sum, retval);
+  }
+  {
+    // x grows -4 to 4, y grows -1 to 1
+    auto sum = Sum(LoadU(df, d - 4 - xs), LoadU(df, d - 2 - xs), center,
+                   LoadU(df, d + 2 + xs), LoadU(df, d + 4 + xs));
+    retval = MulAdd(sum, sum, retval);
+  }
+  {
+    // x grows -4 to 4, y shrinks 1 to -1
+    auto sum = Sum(LoadU(df, d - 4 + xs), LoadU(df, d - 2 + xs), center,
+                   LoadU(df, d + 2 - xs), LoadU(df, d + 4 - xs));
+    retval = MulAdd(sum, sum, retval);
+  }
+  {
+    /* 0_________
+       1__*______
+       2___*_____
+       3_________
+       4____0____
+       5_________
+       6_____*___
+       7______*__
+       8_________ */
+    auto sum = Sum(LoadU(df, d - xs3 - 2), LoadU(df, d - xs - xs - 1), center,
+                   LoadU(df, d + xs + xs + 1), LoadU(df, d + xs3 + 2));
+    retval = MulAdd(sum, sum, retval);
+  }
+  {
+    /* 0_________
+       1______*__
+       2_____*___
+       3_________
+       4____0____
+       5_________
+       6___*_____
+       7__*______
+       8_________ */
+    auto sum = Sum(LoadU(df, d - xs3 + 2), LoadU(df, d - xs - xs + 1), center,
+                   LoadU(df, d + xs + xs - 1), LoadU(df, d + xs3 - 2));
+    retval = MulAdd(sum, sum, retval);
+  }
+  {
+    /* 0_________
+       1_________
+       2_*_______
+       3__*______
+       4____0____
+       5______*__
+       6_______*_
+       7_________
+       8_________ */
+    auto sum = Sum(LoadU(df, d - xs - xs - 3), LoadU(df, d - xs - 2), center,
+                   LoadU(df, d + xs + 2), LoadU(df, d + xs + xs + 3));
+    retval = MulAdd(sum, sum, retval);
+  }
+  {
+    /* 0_________
+       1_________
+       2_______*_
+       3______*__
+       4____0____
+       5__*______
+       6_*_______
+       7_________
+       8_________ */
+    auto sum = Sum(LoadU(df, d - xs - xs + 3), LoadU(df, d - xs + 2), center,
+                   LoadU(df, d + xs - 2), LoadU(df, d + xs + xs - 3));
+    retval = MulAdd(sum, sum, retval);
+  }
+  {
+    /* 0_________
+       1_________
+       2________*
+       3______*__
+       4____0____
+       5__*______
+       6*________
+       7_________
+       8_________ */
+
+    auto sum = Sum(LoadU(df, d + xs + xs - 4), LoadU(df, d + xs - 2), center,
+                   LoadU(df, d - xs + 2), LoadU(df, d - xs - xs + 4));
+    retval = MulAdd(sum, sum, retval);
+  }
+  {
+    /* 0_________
+       1_________
+       2*________
+       3__*______
+       4____0____
+       5______*__
+       6________*
+       7_________
+       8_________ */
+    auto sum = Sum(LoadU(df, d - xs - xs - 4), LoadU(df, d - xs - 2), center,
+                   LoadU(df, d + xs + 2), LoadU(df, d + xs + xs + 4));
+    retval = MulAdd(sum, sum, retval);
+  }
+  {
+    /* 0__*______
+       1_________
+       2___*_____
+       3_________
+       4____0____
+       5_________
+       6_____*___
+       7_________
+       8______*__ */
+    auto sum =
+        Sum(LoadU(df, d - xs3 - xs - 2), LoadU(df, d - xs - xs - 1), center,
+            LoadU(df, d + xs + xs + 1), LoadU(df, d + xs3 + xs + 2));
+    retval = MulAdd(sum, sum, retval);
+  }
+  {
+    /* 0______*__
+       1_________
+       2_____*___
+       3_________
+       4____0____
+       5_________
+       6___*_____
+       7_________
+       8__*______ */
+    auto sum =
+        Sum(LoadU(df, d - xs3 - xs + 2), LoadU(df, d - xs - xs + 1), center,
+            LoadU(df, d + xs + xs - 1), LoadU(df, d + xs3 + xs - 2));
+    retval = MulAdd(sum, sum, retval);
+  }
+  return retval;
+}
+
+template <class D>
+Vec<D> MaltaUnit(MaltaTag /*tag*/, const D df,
+                 const float* BUTTERAUGLI_RESTRICT d, const intptr_t xs) {
+  const intptr_t xs3 = 3 * xs;
+
+  const auto center = LoadU(df, d);
+
+  // x grows, y constant
+  const auto sum_yconst =
+      Sum(LoadU(df, d - 4), LoadU(df, d - 3), LoadU(df, d - 2),
+          LoadU(df, d - 1), center, LoadU(df, d + 1), LoadU(df, d + 2),
+          LoadU(df, d + 3), LoadU(df, d + 4));
+  // Will return this, sum of all line kernels
+  auto retval = Mul(sum_yconst, sum_yconst);
+
+  {
+    // y grows, x constant
+    auto sum = Sum(LoadU(df, d - xs3 - xs), LoadU(df, d - xs3),
+                   LoadU(df, d - xs - xs), LoadU(df, d - xs), center,
+                   LoadU(df, d + xs), LoadU(df, d + xs + xs),
+                   LoadU(df, d + xs3), LoadU(df, d + xs3 + xs));
+    retval = MulAdd(sum, sum, retval);
+  }
+  {
+    // both grow
+    auto sum = Sum(LoadU(df, d - xs3 - 3), LoadU(df, d - xs - xs - 2),
+                   LoadU(df, d - xs - 1), center, LoadU(df, d + xs + 1),
+                   LoadU(df, d + xs + xs + 2), LoadU(df, d + xs3 + 3));
+    retval = MulAdd(sum, sum, retval);
+  }
+  {
+    // y grows, x shrinks
+    auto sum = Sum(LoadU(df, d - xs3 + 3), LoadU(df, d - xs - xs + 2),
+                   LoadU(df, d - xs + 1), center, LoadU(df, d + xs - 1),
+                   LoadU(df, d + xs + xs - 2), LoadU(df, d + xs3 - 3));
+    retval = MulAdd(sum, sum, retval);
+  }
+  {
+    // y grows -4 to 4, x shrinks 1 -> -1
+    auto sum = Sum(LoadU(df, d - xs3 - xs + 1), LoadU(df, d - xs3 + 1),
+                   LoadU(df, d - xs - xs + 1), LoadU(df, d - xs), center,
+                   LoadU(df, d + xs), LoadU(df, d + xs + xs - 1),
+                   LoadU(df, d + xs3 - 1), LoadU(df, d + xs3 + xs - 1));
+    retval = MulAdd(sum, sum, retval);
+  }
+  {
+    //  y grows -4 to 4, x grows -1 -> 1
+    auto sum = Sum(LoadU(df, d - xs3 - xs - 1), LoadU(df, d - xs3 - 1),
+                   LoadU(df, d - xs - xs - 1), LoadU(df, d - xs), center,
+                   LoadU(df, d + xs), LoadU(df, d + xs + xs + 1),
+                   LoadU(df, d + xs3 + 1), LoadU(df, d + xs3 + xs + 1));
+    retval = MulAdd(sum, sum, retval);
+  }
+  {
+    // x grows -4 to 4, y grows -1 to 1
+    auto sum =
+        Sum(LoadU(df, d - 4 - xs), LoadU(df, d - 3 - xs), LoadU(df, d - 2 - xs),
+            LoadU(df, d - 1), center, LoadU(df, d + 1), LoadU(df, d + 2 + xs),
+            LoadU(df, d + 3 + xs), LoadU(df, d + 4 + xs));
+    retval = MulAdd(sum, sum, retval);
+  }
+  {
+    // x grows -4 to 4, y shrinks 1 to -1
+    auto sum =
+        Sum(LoadU(df, d - 4 + xs), LoadU(df, d - 3 + xs), LoadU(df, d - 2 + xs),
+            LoadU(df, d - 1), center, LoadU(df, d + 1), LoadU(df, d + 2 - xs),
+            LoadU(df, d + 3 - xs), LoadU(df, d + 4 - xs));
+    retval = MulAdd(sum, sum, retval);
+  }
+  {
+    /* 0_________
+       1__*______
+       2___*_____
+       3___*_____
+       4____0____
+       5_____*___
+       6_____*___
+       7______*__
+       8_________ */
+    auto sum = Sum(LoadU(df, d - xs3 - 2), LoadU(df, d - xs - xs - 1),
+                   LoadU(df, d - xs - 1), center, LoadU(df, d + xs + 1),
+                   LoadU(df, d + xs + xs + 1), LoadU(df, d + xs3 + 2));
+    retval = MulAdd(sum, sum, retval);
+  }
+  {
+    /* 0_________
+       1______*__
+       2_____*___
+       3_____*___
+       4____0____
+       5___*_____
+       6___*_____
+       7__*______
+       8_________ */
+    auto sum = Sum(LoadU(df, d - xs3 + 2), LoadU(df, d - xs - xs + 1),
+                   LoadU(df, d - xs + 1), center, LoadU(df, d + xs - 1),
+                   LoadU(df, d + xs + xs - 1), LoadU(df, d + xs3 - 2));
+    retval = MulAdd(sum, sum, retval);
+  }
+  {
+    /* 0_________
+       1_________
+       2_*_______
+       3__**_____
+       4____0____
+       5_____**__
+       6_______*_
+       7_________
+       8_________ */
+    auto sum = Sum(LoadU(df, d - xs - xs - 3), LoadU(df, d - xs - 2),
+                   LoadU(df, d - xs - 1), center, LoadU(df, d + xs + 1),
+                   LoadU(df, d + xs + 2), LoadU(df, d + xs + xs + 3));
+    retval = MulAdd(sum, sum, retval);
+  }
+  {
+    /* 0_________
+       1_________
+       2_______*_
+       3_____**__
+       4____0____
+       5__**_____
+       6_*_______
+       7_________
+       8_________ */
+    auto sum = Sum(LoadU(df, d - xs - xs + 3), LoadU(df, d - xs + 2),
+                   LoadU(df, d - xs + 1), center, LoadU(df, d + xs - 1),
+                   LoadU(df, d + xs - 2), LoadU(df, d + xs + xs - 3));
+    retval = MulAdd(sum, sum, retval);
+  }
+  {
+    /* 0_________
+       1_________
+       2_________
+       3______***
+       4___*0*___
+       5***______
+       6_________
+       7_________
+       8_________ */
+
+    auto sum =
+        Sum(LoadU(df, d + xs - 4), LoadU(df, d + xs - 3), LoadU(df, d + xs - 2),
+            LoadU(df, d - 1), center, LoadU(df, d + 1), LoadU(df, d - xs + 2),
+            LoadU(df, d - xs + 3), LoadU(df, d - xs + 4));
+    retval = MulAdd(sum, sum, retval);
+  }
+  {
+    /* 0_________
+       1_________
+       2_________
+       3***______
+       4___*0*___
+       5______***
+       6_________
+       7_________
+       8_________ */
+    auto sum =
+        Sum(LoadU(df, d - xs - 4), LoadU(df, d - xs - 3), LoadU(df, d - xs - 2),
+            LoadU(df, d - 1), center, LoadU(df, d + 1), LoadU(df, d + xs + 2),
+            LoadU(df, d + xs + 3), LoadU(df, d + xs + 4));
+    retval = MulAdd(sum, sum, retval);
+  }
+  {
+    /* 0___*_____
+       1___*_____
+       2___*_____
+       3____*____
+       4____0____
+       5____*____
+       6_____*___
+       7_____*___
+       8_____*___ */
+    auto sum = Sum(LoadU(df, d - xs3 - xs - 1), LoadU(df, d - xs3 - 1),
+                   LoadU(df, d - xs - xs - 1), LoadU(df, d - xs), center,
+                   LoadU(df, d + xs), LoadU(df, d + xs + xs + 1),
+                   LoadU(df, d + xs3 + 1), LoadU(df, d + xs3 + xs + 1));
+    retval = MulAdd(sum, sum, retval);
+  }
+  {
+    /* 0_____*___
+       1_____*___
+       2____ *___
+       3____*____
+       4____0____
+       5____*____
+       6___*_____
+       7___*_____
+       8___*_____ */
+    auto sum = Sum(LoadU(df, d - xs3 - xs + 1), LoadU(df, d - xs3 + 1),
+                   LoadU(df, d - xs - xs + 1), LoadU(df, d - xs), center,
+                   LoadU(df, d + xs), LoadU(df, d + xs + xs - 1),
+                   LoadU(df, d + xs3 - 1), LoadU(df, d + xs3 + xs - 1));
+    retval = MulAdd(sum, sum, retval);
+  }
+  return retval;
+}
+
+// Returns MaltaUnit. Avoids bounds-checks when x0 and y0 are known
+// to be far enough from the image borders. "diffs" is a packed image.
+template <class Tag>
+static BUTTERAUGLI_INLINE float PaddedMaltaUnit(const ImageF& diffs,
+                                                const size_t x0,
+                                                const size_t y0) {
+  const float* BUTTERAUGLI_RESTRICT d = diffs.ConstRow(y0) + x0;
+  const HWY_CAPPED(float, 1) df;
+  if ((x0 >= 4 && y0 >= 4 && x0 < (diffs.xsize() - 4) &&
+       y0 < (diffs.ysize() - 4))) {
+    return GetLane(MaltaUnit(Tag(), df, d, diffs.PixelsPerRow()));
+  }
+
+  PROFILER_ZONE("Padded Malta");
+  float borderimage[12 * 9];  // round up to 4
+  for (int dy = 0; dy < 9; ++dy) {
+    int y = y0 + dy - 4;
+    if (y < 0 || static_cast<size_t>(y) >= diffs.ysize()) {
+      for (int dx = 0; dx < 12; ++dx) {
+        borderimage[dy * 12 + dx] = 0.0f;
+      }
+      continue;
+    }
+
+    const float* row_diffs = diffs.ConstRow(y);
+    for (int dx = 0; dx < 9; ++dx) {
+      int x = x0 + dx - 4;
+      if (x < 0 || static_cast<size_t>(x) >= diffs.xsize()) {
+        borderimage[dy * 12 + dx] = 0.0f;
+      } else {
+        borderimage[dy * 12 + dx] = row_diffs[x];
+      }
+    }
+    std::fill(borderimage + dy * 12 + 9, borderimage + dy * 12 + 12, 0.0f);
+  }
+  return GetLane(MaltaUnit(Tag(), df, &borderimage[4 * 12 + 4], 12));
+}
+
+template <class Tag>
+static void MaltaDiffMapT(const Tag tag, const ImageF& lum0, const ImageF& lum1,
+                          const double w_0gt1, const double w_0lt1,
+                          const double norm1, const double len,
+                          const double mulli, ImageF* HWY_RESTRICT diffs,
+                          Image3F* HWY_RESTRICT block_diff_ac, size_t c) {
+  JXL_DASSERT(SameSize(lum0, lum1) && SameSize(lum0, *diffs));
+  const size_t xsize_ = lum0.xsize();
+  const size_t ysize_ = lum0.ysize();
+
+  const float kWeight0 = 0.5;
+  const float kWeight1 = 0.33;
+
+  const double w_pre0gt1 = mulli * std::sqrt(kWeight0 * w_0gt1) / (len * 2 + 1);
+  const double w_pre0lt1 = mulli * std::sqrt(kWeight1 * w_0lt1) / (len * 2 + 1);
+  const float norm2_0gt1 = w_pre0gt1 * norm1;
+  const float norm2_0lt1 = w_pre0lt1 * norm1;
+
+  for (size_t y = 0; y < ysize_; ++y) {
+    const float* HWY_RESTRICT row0 = lum0.ConstRow(y);
+    const float* HWY_RESTRICT row1 = lum1.ConstRow(y);
+    float* HWY_RESTRICT row_diffs = diffs->Row(y);
+    for (size_t x = 0; x < xsize_; ++x) {
+      const float absval = 0.5f * (std::abs(row0[x]) + std::abs(row1[x]));
+      const float diff = row0[x] - row1[x];
+      const float scaler = norm2_0gt1 / (static_cast<float>(norm1) + absval);
+
+      // Primary symmetric quadratic objective.
+      row_diffs[x] = scaler * diff;
+
+      const float scaler2 = norm2_0lt1 / (static_cast<float>(norm1) + absval);
+      const double fabs0 = std::fabs(row0[x]);
+
+      // Secondary half-open quadratic objectives.
+      const double too_small = 0.55 * fabs0;
+      const double too_big = 1.05 * fabs0;
+
+      if (row0[x] < 0) {
+        if (row1[x] > -too_small) {
+          double impact = scaler2 * (row1[x] + too_small);
+          row_diffs[x] -= impact;
+        } else if (row1[x] < -too_big) {
+          double impact = scaler2 * (-row1[x] - too_big);
+          row_diffs[x] += impact;
+        }
+      } else {
+        if (row1[x] < too_small) {
+          double impact = scaler2 * (too_small - row1[x]);
+          row_diffs[x] += impact;
+        } else if (row1[x] > too_big) {
+          double impact = scaler2 * (row1[x] - too_big);
+          row_diffs[x] -= impact;
+        }
+      }
+    }
+  }
+
+  size_t y0 = 0;
+  // Top
+  for (; y0 < 4; ++y0) {
+    float* BUTTERAUGLI_RESTRICT row_diff = block_diff_ac->PlaneRow(c, y0);
+    for (size_t x0 = 0; x0 < xsize_; ++x0) {
+      row_diff[x0] += PaddedMaltaUnit<Tag>(*diffs, x0, y0);
+    }
+  }
+
+  const HWY_FULL(float) df;
+  const size_t aligned_x = std::max(size_t(4), Lanes(df));
+  const intptr_t stride = diffs->PixelsPerRow();
+
+  // Middle
+  for (; y0 < ysize_ - 4; ++y0) {
+    const float* BUTTERAUGLI_RESTRICT row_in = diffs->ConstRow(y0);
+    float* BUTTERAUGLI_RESTRICT row_diff = block_diff_ac->PlaneRow(c, y0);
+    size_t x0 = 0;
+    for (; x0 < aligned_x; ++x0) {
+      row_diff[x0] += PaddedMaltaUnit<Tag>(*diffs, x0, y0);
+    }
+    for (; x0 + Lanes(df) + 4 <= xsize_; x0 += Lanes(df)) {
+      auto diff = Load(df, row_diff + x0);
+      diff = Add(diff, MaltaUnit(Tag(), df, row_in + x0, stride));
+      Store(diff, df, row_diff + x0);
+    }
+
+    for (; x0 < xsize_; ++x0) {
+      row_diff[x0] += PaddedMaltaUnit<Tag>(*diffs, x0, y0);
+    }
+  }
+
+  // Bottom
+  for (; y0 < ysize_; ++y0) {
+    float* BUTTERAUGLI_RESTRICT row_diff = block_diff_ac->PlaneRow(c, y0);
+    for (size_t x0 = 0; x0 < xsize_; ++x0) {
+      row_diff[x0] += PaddedMaltaUnit<Tag>(*diffs, x0, y0);
+    }
+  }
+}
+
+// Need non-template wrapper functions for HWY_EXPORT.
+void MaltaDiffMap(const ImageF& lum0, const ImageF& lum1, const double w_0gt1,
+                  const double w_0lt1, const double norm1, const double len,
+                  const double mulli, ImageF* HWY_RESTRICT diffs,
+                  Image3F* HWY_RESTRICT block_diff_ac, size_t c) {
+  MaltaDiffMapT(MaltaTag(), lum0, lum1, w_0gt1, w_0lt1, norm1, len, mulli,
+                diffs, block_diff_ac, c);
+}
+
+void MaltaDiffMapLF(const ImageF& lum0, const ImageF& lum1, const double w_0gt1,
+                    const double w_0lt1, const double norm1, const double len,
+                    const double mulli, ImageF* HWY_RESTRICT diffs,
+                    Image3F* HWY_RESTRICT block_diff_ac, size_t c) {
+  MaltaDiffMapT(MaltaTagLF(), lum0, lum1, w_0gt1, w_0lt1, norm1, len, mulli,
+                diffs, block_diff_ac, c);
+}
+
+void DiffPrecompute(const ImageF& xyb, float mul, float bias_arg, ImageF* out) {
+  PROFILER_FUNC;
+  const size_t xsize = xyb.xsize();
+  const size_t ysize = xyb.ysize();
+  const float bias = mul * bias_arg;
+  const float sqrt_bias = sqrt(bias);
+  for (size_t y = 0; y < ysize; ++y) {
+    const float* BUTTERAUGLI_RESTRICT row_in = xyb.Row(y);
+    float* BUTTERAUGLI_RESTRICT row_out = out->Row(y);
+    for (size_t x = 0; x < xsize; ++x) {
+      // kBias makes sqrt behave more linearly.
+      row_out[x] = sqrt(mul * std::abs(row_in[x]) + bias) - sqrt_bias;
+    }
+  }
+}
+
+// std::log(80.0) / std::log(255.0);
+constexpr float kIntensityTargetNormalizationHack = 0.79079917404f;
+static const float kInternalGoodQualityThreshold =
+    17.83f * kIntensityTargetNormalizationHack;
+static const float kGlobalScale = 1.0 / kInternalGoodQualityThreshold;
+
+void StoreMin3(const float v, float& min0, float& min1, float& min2) {
+  if (v < min2) {
+    if (v < min0) {
+      min2 = min1;
+      min1 = min0;
+      min0 = v;
+    } else if (v < min1) {
+      min2 = min1;
+      min1 = v;
+    } else {
+      min2 = v;
+    }
+  }
+}
+
+// Look for smooth areas near the area of degradation.
+// If the areas area generally smooth, don't do masking.
+void FuzzyErosion(const ImageF& from, ImageF* to) {
+  const size_t xsize = from.xsize();
+  const size_t ysize = from.ysize();
+  static const int kStep = 3;
+  for (size_t y = 0; y < ysize; ++y) {
+    for (size_t x = 0; x < xsize; ++x) {
+      float min0 = from.Row(y)[x];
+      float min1 = 2 * min0;
+      float min2 = min1;
+      if (x >= kStep) {
+        float v = from.Row(y)[x - kStep];
+        StoreMin3(v, min0, min1, min2);
+        if (y >= kStep) {
+          float v = from.Row(y - kStep)[x - kStep];
+          StoreMin3(v, min0, min1, min2);
+        }
+        if (y < ysize - kStep) {
+          float v = from.Row(y + kStep)[x - kStep];
+          StoreMin3(v, min0, min1, min2);
+        }
+      }
+      if (x < xsize - kStep) {
+        float v = from.Row(y)[x + kStep];
+        StoreMin3(v, min0, min1, min2);
+        if (y >= kStep) {
+          float v = from.Row(y - kStep)[x + kStep];
+          StoreMin3(v, min0, min1, min2);
+        }
+        if (y < ysize - kStep) {
+          float v = from.Row(y + kStep)[x + kStep];
+          StoreMin3(v, min0, min1, min2);
+        }
+      }
+      if (y >= kStep) {
+        float v = from.Row(y - kStep)[x];
+        StoreMin3(v, min0, min1, min2);
+      }
+      if (y < ysize - kStep) {
+        float v = from.Row(y + kStep)[x];
+        StoreMin3(v, min0, min1, min2);
+      }
+      to->Row(y)[x] = (0.45f * min0 + 0.3f * min1 + 0.25f * min2);
+    }
+  }
+}
+
+// Compute values of local frequency and dc masking based on the activity
+// in the two images. img_diff_ac may be null.
+void Mask(const ImageF& mask0, const ImageF& mask1,
+          const ButteraugliParams& params, BlurTemp* blur_temp,
+          ImageF* BUTTERAUGLI_RESTRICT mask,
+          ImageF* BUTTERAUGLI_RESTRICT diff_ac) {
+  // Only X and Y components are involved in masking. B's influence
+  // is considered less important in the high frequency area, and we
+  // don't model masking from lower frequency signals.
+  PROFILER_FUNC;
+  const size_t xsize = mask0.xsize();
+  const size_t ysize = mask0.ysize();
+  *mask = ImageF(xsize, ysize);
+  static const float kMul = 6.19424080439;
+  static const float kBias = 12.61050594197;
+  static const float kRadius = 2.7;
+  ImageF diff0(xsize, ysize);
+  ImageF diff1(xsize, ysize);
+  ImageF blurred0(xsize, ysize);
+  ImageF blurred1(xsize, ysize);
+  DiffPrecompute(mask0, kMul, kBias, &diff0);
+  DiffPrecompute(mask1, kMul, kBias, &diff1);
+  Blur(diff0, kRadius, params, blur_temp, &blurred0);
+  FuzzyErosion(blurred0, &diff0);
+  Blur(diff1, kRadius, params, blur_temp, &blurred1);
+  FuzzyErosion(blurred1, &diff1);
+  for (size_t y = 0; y < ysize; ++y) {
+    for (size_t x = 0; x < xsize; ++x) {
+      mask->Row(y)[x] = diff0.Row(y)[x];
+      if (diff_ac != nullptr) {
+        static const float kMaskToErrorMul = 10.0;
+        float diff = blurred0.Row(y)[x] - blurred1.Row(y)[x];
+        diff_ac->Row(y)[x] += kMaskToErrorMul * diff * diff;
+      }
+    }
+  }
+}
+
+// `diff_ac` may be null.
+void MaskPsychoImage(const PsychoImage& pi0, const PsychoImage& pi1,
+                     const size_t xsize, const size_t ysize,
+                     const ButteraugliParams& params, Image3F* temp,
+                     BlurTemp* blur_temp, ImageF* BUTTERAUGLI_RESTRICT mask,
+                     ImageF* BUTTERAUGLI_RESTRICT diff_ac) {
+  ImageF mask0(xsize, ysize);
+  ImageF mask1(xsize, ysize);
+  static const float muls[3] = {
+      2.5f,
+      0.4f,
+      0.4f,
+  };
+  // Silly and unoptimized approach here. TODO(jyrki): rework this.
+  for (size_t y = 0; y < ysize; ++y) {
+    const float* BUTTERAUGLI_RESTRICT row_y_hf0 = pi0.hf[1].Row(y);
+    const float* BUTTERAUGLI_RESTRICT row_y_hf1 = pi1.hf[1].Row(y);
+    const float* BUTTERAUGLI_RESTRICT row_y_uhf0 = pi0.uhf[1].Row(y);
+    const float* BUTTERAUGLI_RESTRICT row_y_uhf1 = pi1.uhf[1].Row(y);
+    const float* BUTTERAUGLI_RESTRICT row_x_hf0 = pi0.hf[0].Row(y);
+    const float* BUTTERAUGLI_RESTRICT row_x_hf1 = pi1.hf[0].Row(y);
+    const float* BUTTERAUGLI_RESTRICT row_x_uhf0 = pi0.uhf[0].Row(y);
+    const float* BUTTERAUGLI_RESTRICT row_x_uhf1 = pi1.uhf[0].Row(y);
+    float* BUTTERAUGLI_RESTRICT row0 = mask0.Row(y);
+    float* BUTTERAUGLI_RESTRICT row1 = mask1.Row(y);
+    for (size_t x = 0; x < xsize; ++x) {
+      float xdiff0 = (row_x_uhf0[x] + row_x_hf0[x]) * muls[0];
+      float xdiff1 = (row_x_uhf1[x] + row_x_hf1[x]) * muls[0];
+      float ydiff0 = row_y_uhf0[x] * muls[1] + row_y_hf0[x] * muls[2];
+      float ydiff1 = row_y_uhf1[x] * muls[1] + row_y_hf1[x] * muls[2];
+      row0[x] = xdiff0 * xdiff0 + ydiff0 * ydiff0;
+      row0[x] = sqrt(row0[x]);
+      row1[x] = xdiff1 * xdiff1 + ydiff1 * ydiff1;
+      row1[x] = sqrt(row1[x]);
+    }
+  }
+  Mask(mask0, mask1, params, blur_temp, mask, diff_ac);
+}
+
+double MaskY(double delta) {
+  static const double offset = 0.829591754942;
+  static const double scaler = 0.451936922203;
+  static const double mul = 2.5485944793;
+  const double c = mul / ((scaler * delta) + offset);
+  const double retval = kGlobalScale * (1.0 + c);
+  return retval * retval;
+}
+
+double MaskDcY(double delta) {
+  static const double offset = 0.20025578522;
+  static const double scaler = 3.87449418804;
+  static const double mul = 0.505054525019;
+  const double c = mul / ((scaler * delta) + offset);
+  const double retval = kGlobalScale * (1.0 + c);
+  return retval * retval;
+}
+
+inline float MaskColor(const float color[3], const float mask) {
+  return color[0] * mask + color[1] * mask + color[2] * mask;
+}
+
+// Diffmap := sqrt of sum{diff images by multiplied by X and Y/B masks}
+void CombineChannelsToDiffmap(const ImageF& mask, const Image3F& block_diff_dc,
+                              const Image3F& block_diff_ac, float xmul,
+                              ImageF* result) {
+  PROFILER_FUNC;
+  JXL_CHECK(SameSize(mask, *result));
+  size_t xsize = mask.xsize();
+  size_t ysize = mask.ysize();
+  for (size_t y = 0; y < ysize; ++y) {
+    float* BUTTERAUGLI_RESTRICT row_out = result->Row(y);
+    for (size_t x = 0; x < xsize; ++x) {
+      float val = mask.Row(y)[x];
+      float maskval = MaskY(val);
+      float dc_maskval = MaskDcY(val);
+      float diff_dc[3];
+      float diff_ac[3];
+      for (int i = 0; i < 3; ++i) {
+        diff_dc[i] = block_diff_dc.PlaneRow(i, y)[x];
+        diff_ac[i] = block_diff_ac.PlaneRow(i, y)[x];
+      }
+      diff_ac[0] *= xmul;
+      diff_dc[0] *= xmul;
+      row_out[x] =
+          sqrt(MaskColor(diff_dc, dc_maskval) + MaskColor(diff_ac, maskval));
+    }
+  }
+}
+
+// Adds weighted L2 difference between i0 and i1 to diffmap.
+static void L2Diff(const ImageF& i0, const ImageF& i1, const float w,
+                   Image3F* BUTTERAUGLI_RESTRICT diffmap, size_t c) {
+  if (w == 0) return;
+
+  const HWY_FULL(float) d;
+  const auto weight = Set(d, w);
+
+  for (size_t y = 0; y < i0.ysize(); ++y) {
+    const float* BUTTERAUGLI_RESTRICT row0 = i0.ConstRow(y);
+    const float* BUTTERAUGLI_RESTRICT row1 = i1.ConstRow(y);
+    float* BUTTERAUGLI_RESTRICT row_diff = diffmap->PlaneRow(c, y);
+
+    for (size_t x = 0; x < i0.xsize(); x += Lanes(d)) {
+      const auto diff = Sub(Load(d, row0 + x), Load(d, row1 + x));
+      const auto diff2 = Mul(diff, diff);
+      const auto prev = Load(d, row_diff + x);
+      Store(MulAdd(diff2, weight, prev), d, row_diff + x);
+    }
+  }
+}
+
+// Initializes diffmap to the weighted L2 difference between i0 and i1.
+static void SetL2Diff(const ImageF& i0, const ImageF& i1, const float w,
+                      Image3F* BUTTERAUGLI_RESTRICT diffmap, size_t c) {
+  if (w == 0) return;
+
+  const HWY_FULL(float) d;
+  const auto weight = Set(d, w);
+
+  for (size_t y = 0; y < i0.ysize(); ++y) {
+    const float* BUTTERAUGLI_RESTRICT row0 = i0.ConstRow(y);
+    const float* BUTTERAUGLI_RESTRICT row1 = i1.ConstRow(y);
+    float* BUTTERAUGLI_RESTRICT row_diff = diffmap->PlaneRow(c, y);
+
+    for (size_t x = 0; x < i0.xsize(); x += Lanes(d)) {
+      const auto diff = Sub(Load(d, row0 + x), Load(d, row1 + x));
+      const auto diff2 = Mul(diff, diff);
+      Store(Mul(diff2, weight), d, row_diff + x);
+    }
+  }
+}
+
+// i0 is the original image.
+// i1 is the deformed copy.
+static void L2DiffAsymmetric(const ImageF& i0, const ImageF& i1, float w_0gt1,
+                             float w_0lt1,
+                             Image3F* BUTTERAUGLI_RESTRICT diffmap, size_t c) {
+  if (w_0gt1 == 0 && w_0lt1 == 0) {
+    return;
+  }
+
+  const HWY_FULL(float) d;
+  const auto vw_0gt1 = Set(d, w_0gt1 * 0.8);
+  const auto vw_0lt1 = Set(d, w_0lt1 * 0.8);
+
+  for (size_t y = 0; y < i0.ysize(); ++y) {
+    const float* BUTTERAUGLI_RESTRICT row0 = i0.Row(y);
+    const float* BUTTERAUGLI_RESTRICT row1 = i1.Row(y);
+    float* BUTTERAUGLI_RESTRICT row_diff = diffmap->PlaneRow(c, y);
+
+    for (size_t x = 0; x < i0.xsize(); x += Lanes(d)) {
+      const auto val0 = Load(d, row0 + x);
+      const auto val1 = Load(d, row1 + x);
+
+      // Primary symmetric quadratic objective.
+      const auto diff = Sub(val0, val1);
+      auto total = MulAdd(Mul(diff, diff), vw_0gt1, Load(d, row_diff + x));
+
+      // Secondary half-open quadratic objectives.
+      const auto fabs0 = Abs(val0);
+      const auto too_small = Mul(Set(d, 0.4), fabs0);
+      const auto too_big = fabs0;
+
+      const auto if_neg = IfThenElse(
+          Gt(val1, Neg(too_small)), Add(val1, too_small),
+          IfThenElseZero(Lt(val1, Neg(too_big)), Sub(Neg(val1), too_big)));
+      const auto if_pos =
+          IfThenElse(Lt(val1, too_small), Sub(too_small, val1),
+                     IfThenElseZero(Gt(val1, too_big), Sub(val1, too_big)));
+      const auto v = IfThenElse(Lt(val0, Zero(d)), if_neg, if_pos);
+      total = MulAdd(vw_0lt1, Mul(v, v), total);
+      Store(total, d, row_diff + x);
+    }
+  }
+}
+
+// A simple HDR compatible gamma function.
+template <class DF, class V>
+V Gamma(const DF df, V v) {
+  // ln(2) constant folded in because we want std::log but have FastLog2f.
+  const auto kRetMul = Set(df, 19.245013259874995f * 0.693147180559945f);
+  const auto kRetAdd = Set(df, -23.16046239805755);
+  // This should happen rarely, but may lead to a NaN in log, which is
+  // undesirable. Since negative photons don't exist we solve the NaNs by
+  // clamping here.
+  v = ZeroIfNegative(v);
+
+  const auto biased = Add(v, Set(df, 9.9710635769299145));
+  const auto log = FastLog2f(df, biased);
+  // We could fold this into a custom Log2 polynomial, but there would be
+  // relatively little gain.
+  return MulAdd(kRetMul, log, kRetAdd);
+}
+
+template <bool Clamp, class DF, class V>
+BUTTERAUGLI_INLINE void OpsinAbsorbance(const DF df, const V& in0, const V& in1,
+                                        const V& in2, V* JXL_RESTRICT out0,
+                                        V* JXL_RESTRICT out1,
+                                        V* JXL_RESTRICT out2) {
+  // https://en.wikipedia.org/wiki/Photopsin absorbance modeling.
+  static const double mixi0 = 0.29956550340058319;
+  static const double mixi1 = 0.63373087833825936;
+  static const double mixi2 = 0.077705617820981968;
+  static const double mixi3 = 1.7557483643287353;
+  static const double mixi4 = 0.22158691104574774;
+  static const double mixi5 = 0.69391388044116142;
+  static const double mixi6 = 0.0987313588422;
+  static const double mixi7 = 1.7557483643287353;
+  static const double mixi8 = 0.02;
+  static const double mixi9 = 0.02;
+  static const double mixi10 = 0.20480129041026129;
+  static const double mixi11 = 12.226454707163354;
+
+  const V mix0 = Set(df, mixi0);
+  const V mix1 = Set(df, mixi1);
+  const V mix2 = Set(df, mixi2);
+  const V mix3 = Set(df, mixi3);
+  const V mix4 = Set(df, mixi4);
+  const V mix5 = Set(df, mixi5);
+  const V mix6 = Set(df, mixi6);
+  const V mix7 = Set(df, mixi7);
+  const V mix8 = Set(df, mixi8);
+  const V mix9 = Set(df, mixi9);
+  const V mix10 = Set(df, mixi10);
+  const V mix11 = Set(df, mixi11);
+
+  *out0 = MulAdd(mix0, in0, MulAdd(mix1, in1, MulAdd(mix2, in2, mix3)));
+  *out1 = MulAdd(mix4, in0, MulAdd(mix5, in1, MulAdd(mix6, in2, mix7)));
+  *out2 = MulAdd(mix8, in0, MulAdd(mix9, in1, MulAdd(mix10, in2, mix11)));
+
+  if (Clamp) {
+    *out0 = Max(*out0, mix3);
+    *out1 = Max(*out1, mix7);
+    *out2 = Max(*out2, mix11);
+  }
+}
+
+// `blurred` is a temporary image used inside this function and not returned.
+Image3F OpsinDynamicsImage(const Image3F& rgb, const ButteraugliParams& params,
+                           Image3F* blurred, BlurTemp* blur_temp) {
+  PROFILER_FUNC;
+  Image3F xyb(rgb.xsize(), rgb.ysize());
+  const double kSigma = 1.2;
+  Blur(rgb.Plane(0), kSigma, params, blur_temp, &blurred->Plane(0));
+  Blur(rgb.Plane(1), kSigma, params, blur_temp, &blurred->Plane(1));
+  Blur(rgb.Plane(2), kSigma, params, blur_temp, &blurred->Plane(2));
+  const HWY_FULL(float) df;
+  const auto intensity_target_multiplier = Set(df, params.intensity_target);
+  for (size_t y = 0; y < rgb.ysize(); ++y) {
+    const float* BUTTERAUGLI_RESTRICT row_r = rgb.ConstPlaneRow(0, y);
+    const float* BUTTERAUGLI_RESTRICT row_g = rgb.ConstPlaneRow(1, y);
+    const float* BUTTERAUGLI_RESTRICT row_b = rgb.ConstPlaneRow(2, y);
+    const float* BUTTERAUGLI_RESTRICT row_blurred_r =
+        blurred->ConstPlaneRow(0, y);
+    const float* BUTTERAUGLI_RESTRICT row_blurred_g =
+        blurred->ConstPlaneRow(1, y);
+    const float* BUTTERAUGLI_RESTRICT row_blurred_b =
+        blurred->ConstPlaneRow(2, y);
+    float* BUTTERAUGLI_RESTRICT row_out_x = xyb.PlaneRow(0, y);
+    float* BUTTERAUGLI_RESTRICT row_out_y = xyb.PlaneRow(1, y);
+    float* BUTTERAUGLI_RESTRICT row_out_b = xyb.PlaneRow(2, y);
+    const auto min = Set(df, 1e-4f);
+    for (size_t x = 0; x < rgb.xsize(); x += Lanes(df)) {
+      auto sensitivity0 = Undefined(df);
+      auto sensitivity1 = Undefined(df);
+      auto sensitivity2 = Undefined(df);
+      {
+        // Calculate sensitivity based on the smoothed image gamma derivative.
+        auto pre_mixed0 = Undefined(df);
+        auto pre_mixed1 = Undefined(df);
+        auto pre_mixed2 = Undefined(df);
+        OpsinAbsorbance<true>(
+            df, Mul(Load(df, row_blurred_r + x), intensity_target_multiplier),
+            Mul(Load(df, row_blurred_g + x), intensity_target_multiplier),
+            Mul(Load(df, row_blurred_b + x), intensity_target_multiplier),
+            &pre_mixed0, &pre_mixed1, &pre_mixed2);
+        pre_mixed0 = Max(pre_mixed0, min);
+        pre_mixed1 = Max(pre_mixed1, min);
+        pre_mixed2 = Max(pre_mixed2, min);
+        sensitivity0 = Div(Gamma(df, pre_mixed0), pre_mixed0);
+        sensitivity1 = Div(Gamma(df, pre_mixed1), pre_mixed1);
+        sensitivity2 = Div(Gamma(df, pre_mixed2), pre_mixed2);
+        sensitivity0 = Max(sensitivity0, min);
+        sensitivity1 = Max(sensitivity1, min);
+        sensitivity2 = Max(sensitivity2, min);
+      }
+      auto cur_mixed0 = Undefined(df);
+      auto cur_mixed1 = Undefined(df);
+      auto cur_mixed2 = Undefined(df);
+      OpsinAbsorbance<false>(
+          df, Mul(Load(df, row_r + x), intensity_target_multiplier),
+          Mul(Load(df, row_g + x), intensity_target_multiplier),
+          Mul(Load(df, row_b + x), intensity_target_multiplier), &cur_mixed0,
+          &cur_mixed1, &cur_mixed2);
+      cur_mixed0 = Mul(cur_mixed0, sensitivity0);
+      cur_mixed1 = Mul(cur_mixed1, sensitivity1);
+      cur_mixed2 = Mul(cur_mixed2, sensitivity2);
+      // This is a kludge. The negative values should be zeroed away before
+      // blurring. Ideally there would be no negative values in the first place.
+      const auto min01 = Set(df, 1.7557483643287353f);
+      const auto min2 = Set(df, 12.226454707163354f);
+      cur_mixed0 = Max(cur_mixed0, min01);
+      cur_mixed1 = Max(cur_mixed1, min01);
+      cur_mixed2 = Max(cur_mixed2, min2);
+
+      Store(Sub(cur_mixed0, cur_mixed1), df, row_out_x + x);
+      Store(Add(cur_mixed0, cur_mixed1), df, row_out_y + x);
+      Store(cur_mixed2, df, row_out_b + x);
+    }
+  }
+  return xyb;
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+HWY_EXPORT(SeparateFrequencies);       // Local function.
+HWY_EXPORT(MaskPsychoImage);           // Local function.
+HWY_EXPORT(L2DiffAsymmetric);          // Local function.
+HWY_EXPORT(L2Diff);                    // Local function.
+HWY_EXPORT(SetL2Diff);                 // Local function.
+HWY_EXPORT(CombineChannelsToDiffmap);  // Local function.
+HWY_EXPORT(MaltaDiffMap);              // Local function.
+HWY_EXPORT(MaltaDiffMapLF);            // Local function.
+HWY_EXPORT(OpsinDynamicsImage);        // Local function.
+
+#if BUTTERAUGLI_ENABLE_CHECKS
+
+static inline bool IsNan(const float x) {
+  uint32_t bits;
+  memcpy(&bits, &x, sizeof(bits));
+  const uint32_t bitmask_exp = 0x7F800000;
+  return (bits & bitmask_exp) == bitmask_exp && (bits & 0x7FFFFF);
+}
+
+static inline bool IsNan(const double x) {
+  uint64_t bits;
+  memcpy(&bits, &x, sizeof(bits));
+  return (0x7ff0000000000001ULL <= bits && bits <= 0x7fffffffffffffffULL) ||
+         (0xfff0000000000001ULL <= bits && bits <= 0xffffffffffffffffULL);
+}
+
+static inline void CheckImage(const ImageF& image, const char* name) {
+  PROFILER_FUNC;
+  for (size_t y = 0; y < image.ysize(); ++y) {
+    const float* BUTTERAUGLI_RESTRICT row = image.Row(y);
+    for (size_t x = 0; x < image.xsize(); ++x) {
+      if (IsNan(row[x])) {
+        printf("NAN: Image %s @ %" PRIuS ",%" PRIuS " (of %" PRIuS ",%" PRIuS
+               ")\n",
+               name, x, y, image.xsize(), image.ysize());
+        exit(1);
+      }
+    }
+  }
+}
+
+#define CHECK_NAN(x, str)                \
+  do {                                   \
+    if (IsNan(x)) {                      \
+      printf("%d: %s\n", __LINE__, str); \
+      abort();                           \
+    }                                    \
+  } while (0)
+
+#define CHECK_IMAGE(image, name) CheckImage(image, name)
+
+#else  // BUTTERAUGLI_ENABLE_CHECKS
+
+#define CHECK_NAN(x, str)
+#define CHECK_IMAGE(image, name)
+
+#endif  // BUTTERAUGLI_ENABLE_CHECKS
+
+// Calculate a 2x2 subsampled image for purposes of recursive butteraugli at
+// multiresolution.
+static Image3F SubSample2x(const Image3F& in) {
+  size_t xs = (in.xsize() + 1) / 2;
+  size_t ys = (in.ysize() + 1) / 2;
+  Image3F retval(xs, ys);
+  for (size_t c = 0; c < 3; ++c) {
+    for (size_t y = 0; y < ys; ++y) {
+      for (size_t x = 0; x < xs; ++x) {
+        retval.PlaneRow(c, y)[x] = 0;
+      }
+    }
+  }
+  for (size_t c = 0; c < 3; ++c) {
+    for (size_t y = 0; y < in.ysize(); ++y) {
+      for (size_t x = 0; x < in.xsize(); ++x) {
+        retval.PlaneRow(c, y / 2)[x / 2] += 0.25f * in.PlaneRow(c, y)[x];
+      }
+    }
+    if ((in.xsize() & 1) != 0) {
+      for (size_t y = 0; y < retval.ysize(); ++y) {
+        size_t last_column = retval.xsize() - 1;
+        retval.PlaneRow(c, y)[last_column] *= 2.0f;
+      }
+    }
+    if ((in.ysize() & 1) != 0) {
+      for (size_t x = 0; x < retval.xsize(); ++x) {
+        size_t last_row = retval.ysize() - 1;
+        retval.PlaneRow(c, last_row)[x] *= 2.0f;
+      }
+    }
+  }
+  return retval;
+}
+
+// Supersample src by 2x and add it to dest.
+static void AddSupersampled2x(const ImageF& src, float w, ImageF& dest) {
+  for (size_t y = 0; y < dest.ysize(); ++y) {
+    for (size_t x = 0; x < dest.xsize(); ++x) {
+      // There will be less errors from the more averaged images.
+      // We take it into account to some extent using a scaler.
+      static const double kHeuristicMixingValue = 0.3;
+      dest.Row(y)[x] *= 1.0 - kHeuristicMixingValue * w;
+      dest.Row(y)[x] += w * src.Row(y / 2)[x / 2];
+    }
+  }
+}
+
+Image3F* ButteraugliComparator::Temp() const {
+  bool was_in_use = temp_in_use_.test_and_set(std::memory_order_acq_rel);
+  JXL_ASSERT(!was_in_use);
+  (void)was_in_use;
+  return &temp_;
+}
+
+void ButteraugliComparator::ReleaseTemp() const { temp_in_use_.clear(); }
+
+ButteraugliComparator::ButteraugliComparator(const Image3F& rgb0,
+                                             const ButteraugliParams& params)
+    : xsize_(rgb0.xsize()),
+      ysize_(rgb0.ysize()),
+      params_(params),
+      temp_(xsize_, ysize_) {
+  if (xsize_ < 8 || ysize_ < 8) {
+    return;
+  }
+
+  Image3F xyb0 = HWY_DYNAMIC_DISPATCH(OpsinDynamicsImage)(rgb0, params, Temp(),
+                                                          &blur_temp_);
+  ReleaseTemp();
+  HWY_DYNAMIC_DISPATCH(SeparateFrequencies)
+  (xsize_, ysize_, params_, &blur_temp_, xyb0, pi0_);
+
+  // Awful recursive construction of samples of different resolution.
+  // This is an after-thought and possibly somewhat parallel in
+  // functionality with the PsychoImage multi-resolution approach.
+  sub_.reset(new ButteraugliComparator(SubSample2x(rgb0), params));
+}
+
+void ButteraugliComparator::Mask(ImageF* BUTTERAUGLI_RESTRICT mask) const {
+  HWY_DYNAMIC_DISPATCH(MaskPsychoImage)
+  (pi0_, pi0_, xsize_, ysize_, params_, Temp(), &blur_temp_, mask, nullptr);
+  ReleaseTemp();
+}
+
+void ButteraugliComparator::Diffmap(const Image3F& rgb1, ImageF& result) const {
+  PROFILER_FUNC;
+  if (xsize_ < 8 || ysize_ < 8) {
+    ZeroFillImage(&result);
+    return;
+  }
+  const Image3F xyb1 = HWY_DYNAMIC_DISPATCH(OpsinDynamicsImage)(
+      rgb1, params_, Temp(), &blur_temp_);
+  ReleaseTemp();
+  DiffmapOpsinDynamicsImage(xyb1, result);
+  if (sub_) {
+    if (sub_->xsize_ < 8 || sub_->ysize_ < 8) {
+      return;
+    }
+    const Image3F sub_xyb = HWY_DYNAMIC_DISPATCH(OpsinDynamicsImage)(
+        SubSample2x(rgb1), params_, sub_->Temp(), &sub_->blur_temp_);
+    sub_->ReleaseTemp();
+    ImageF subresult;
+    sub_->DiffmapOpsinDynamicsImage(sub_xyb, subresult);
+    AddSupersampled2x(subresult, 0.5, result);
+  }
+}
+
+void ButteraugliComparator::DiffmapOpsinDynamicsImage(const Image3F& xyb1,
+                                                      ImageF& result) const {
+  PROFILER_FUNC;
+  if (xsize_ < 8 || ysize_ < 8) {
+    ZeroFillImage(&result);
+    return;
+  }
+  PsychoImage pi1;
+  HWY_DYNAMIC_DISPATCH(SeparateFrequencies)
+  (xsize_, ysize_, params_, &blur_temp_, xyb1, pi1);
+  result = ImageF(xsize_, ysize_);
+  DiffmapPsychoImage(pi1, result);
+}
+
+namespace {
+
+void MaltaDiffMap(const ImageF& lum0, const ImageF& lum1, const double w_0gt1,
+                  const double w_0lt1, const double norm1,
+                  ImageF* HWY_RESTRICT diffs,
+                  Image3F* HWY_RESTRICT block_diff_ac, size_t c) {
+  PROFILER_FUNC;
+  const double len = 3.75;
+  static const double mulli = 0.39905817637;
+  HWY_DYNAMIC_DISPATCH(MaltaDiffMap)
+  (lum0, lum1, w_0gt1, w_0lt1, norm1, len, mulli, diffs, block_diff_ac, c);
+}
+
+void MaltaDiffMapLF(const ImageF& lum0, const ImageF& lum1, const double w_0gt1,
+                    const double w_0lt1, const double norm1,
+                    ImageF* HWY_RESTRICT diffs,
+                    Image3F* HWY_RESTRICT block_diff_ac, size_t c) {
+  PROFILER_FUNC;
+  const double len = 3.75;
+  static const double mulli = 0.611612573796;
+  HWY_DYNAMIC_DISPATCH(MaltaDiffMapLF)
+  (lum0, lum1, w_0gt1, w_0lt1, norm1, len, mulli, diffs, block_diff_ac, c);
+}
+
+}  // namespace
+
+void ButteraugliComparator::DiffmapPsychoImage(const PsychoImage& pi1,
+                                               ImageF& diffmap) const {
+  PROFILER_FUNC;
+  if (xsize_ < 8 || ysize_ < 8) {
+    ZeroFillImage(&diffmap);
+    return;
+  }
+
+  const float hf_asymmetry_ = params_.hf_asymmetry;
+  const float xmul_ = params_.xmul;
+
+  ImageF diffs(xsize_, ysize_);
+  Image3F block_diff_ac(xsize_, ysize_);
+  ZeroFillImage(&block_diff_ac);
+  static const double wUhfMalta = 1.10039032555;
+  static const double norm1Uhf = 71.7800275169;
+  MaltaDiffMap(pi0_.uhf[1], pi1.uhf[1], wUhfMalta * hf_asymmetry_,
+               wUhfMalta / hf_asymmetry_, norm1Uhf, &diffs, &block_diff_ac, 1);
+
+  static const double wUhfMaltaX = 173.5;
+  static const double norm1UhfX = 5.0;
+  MaltaDiffMap(pi0_.uhf[0], pi1.uhf[0], wUhfMaltaX * hf_asymmetry_,
+               wUhfMaltaX / hf_asymmetry_, norm1UhfX, &diffs, &block_diff_ac,
+               0);
+
+  static const double wHfMalta = 18.7237414387;
+  static const double norm1Hf = 4498534.45232;
+  MaltaDiffMapLF(pi0_.hf[1], pi1.hf[1], wHfMalta * std::sqrt(hf_asymmetry_),
+                 wHfMalta / std::sqrt(hf_asymmetry_), norm1Hf, &diffs,
+                 &block_diff_ac, 1);
+
+  static const double wHfMaltaX = 6923.99476109;
+  static const double norm1HfX = 8051.15833247;
+  MaltaDiffMapLF(pi0_.hf[0], pi1.hf[0], wHfMaltaX * std::sqrt(hf_asymmetry_),
+                 wHfMaltaX / std::sqrt(hf_asymmetry_), norm1HfX, &diffs,
+                 &block_diff_ac, 0);
+
+  static const double wMfMalta = 37.0819870399;
+  static const double norm1Mf = 130262059.556;
+  MaltaDiffMapLF(pi0_.mf.Plane(1), pi1.mf.Plane(1), wMfMalta, wMfMalta, norm1Mf,
+                 &diffs, &block_diff_ac, 1);
+
+  static const double wMfMaltaX = 8246.75321353;
+  static const double norm1MfX = 1009002.70582;
+  MaltaDiffMapLF(pi0_.mf.Plane(0), pi1.mf.Plane(0), wMfMaltaX, wMfMaltaX,
+                 norm1MfX, &diffs, &block_diff_ac, 0);
+
+  static const double wmul[9] = {
+      400.0,         1.50815703118,  0,
+      2150.0,        10.6195433239,  16.2176043152,
+      29.2353797994, 0.844626970982, 0.703646627719,
+  };
+  Image3F block_diff_dc(xsize_, ysize_);
+  for (size_t c = 0; c < 3; ++c) {
+    if (c < 2) {  // No blue channel error accumulated at HF.
+      HWY_DYNAMIC_DISPATCH(L2DiffAsymmetric)
+      (pi0_.hf[c], pi1.hf[c], wmul[c] * hf_asymmetry_, wmul[c] / hf_asymmetry_,
+       &block_diff_ac, c);
+    }
+    HWY_DYNAMIC_DISPATCH(L2Diff)
+    (pi0_.mf.Plane(c), pi1.mf.Plane(c), wmul[3 + c], &block_diff_ac, c);
+    HWY_DYNAMIC_DISPATCH(SetL2Diff)
+    (pi0_.lf.Plane(c), pi1.lf.Plane(c), wmul[6 + c], &block_diff_dc, c);
+  }
+
+  ImageF mask;
+  HWY_DYNAMIC_DISPATCH(MaskPsychoImage)
+  (pi0_, pi1, xsize_, ysize_, params_, Temp(), &blur_temp_, &mask,
+   &block_diff_ac.Plane(1));
+  ReleaseTemp();
+
+  HWY_DYNAMIC_DISPATCH(CombineChannelsToDiffmap)
+  (mask, block_diff_dc, block_diff_ac, xmul_, &diffmap);
+}
+
+double ButteraugliScoreFromDiffmap(const ImageF& diffmap,
+                                   const ButteraugliParams* params) {
+  PROFILER_FUNC;
+  float retval = 0.0f;
+  for (size_t y = 0; y < diffmap.ysize(); ++y) {
+    const float* BUTTERAUGLI_RESTRICT row = diffmap.ConstRow(y);
+    for (size_t x = 0; x < diffmap.xsize(); ++x) {
+      retval = std::max(retval, row[x]);
+    }
+  }
+  return retval;
+}
+
+bool ButteraugliDiffmap(const Image3F& rgb0, const Image3F& rgb1,
+                        double hf_asymmetry, double xmul, ImageF& diffmap) {
+  ButteraugliParams params;
+  params.hf_asymmetry = hf_asymmetry;
+  params.xmul = xmul;
+  return ButteraugliDiffmap(rgb0, rgb1, params, diffmap);
+}
+
+bool ButteraugliDiffmap(const Image3F& rgb0, const Image3F& rgb1,
+                        const ButteraugliParams& params, ImageF& diffmap) {
+  PROFILER_FUNC;
+  const size_t xsize = rgb0.xsize();
+  const size_t ysize = rgb0.ysize();
+  if (xsize < 1 || ysize < 1) {
+    return JXL_FAILURE("Zero-sized image");
+  }
+  if (!SameSize(rgb0, rgb1)) {
+    return JXL_FAILURE("Size mismatch");
+  }
+  static const int kMax = 8;
+  if (xsize < kMax || ysize < kMax) {
+    // Butteraugli values for small (where xsize or ysize is smaller
+    // than 8 pixels) images are non-sensical, but most likely it is
+    // less disruptive to try to compute something than just give up.
+    // Temporarily extend the borders of the image to fit 8 x 8 size.
+    size_t xborder = xsize < kMax ? (kMax - xsize) / 2 : 0;
+    size_t yborder = ysize < kMax ? (kMax - ysize) / 2 : 0;
+    size_t xscaled = std::max<size_t>(kMax, xsize);
+    size_t yscaled = std::max<size_t>(kMax, ysize);
+    Image3F scaled0(xscaled, yscaled);
+    Image3F scaled1(xscaled, yscaled);
+    for (int i = 0; i < 3; ++i) {
+      for (size_t y = 0; y < yscaled; ++y) {
+        for (size_t x = 0; x < xscaled; ++x) {
+          size_t x2 =
+              std::min<size_t>(xsize - 1, x > xborder ? x - xborder : 0);
+          size_t y2 =
+              std::min<size_t>(ysize - 1, y > yborder ? y - yborder : 0);
+          scaled0.PlaneRow(i, y)[x] = rgb0.PlaneRow(i, y2)[x2];
+          scaled1.PlaneRow(i, y)[x] = rgb1.PlaneRow(i, y2)[x2];
+        }
+      }
+    }
+    ImageF diffmap_scaled;
+    const bool ok =
+        ButteraugliDiffmap(scaled0, scaled1, params, diffmap_scaled);
+    diffmap = ImageF(xsize, ysize);
+    for (size_t y = 0; y < ysize; ++y) {
+      for (size_t x = 0; x < xsize; ++x) {
+        diffmap.Row(y)[x] = diffmap_scaled.Row(y + yborder)[x + xborder];
+      }
+    }
+    return ok;
+  }
+  ButteraugliComparator butteraugli(rgb0, params);
+  butteraugli.Diffmap(rgb1, diffmap);
+  return true;
+}
+
+bool ButteraugliInterface(const Image3F& rgb0, const Image3F& rgb1,
+                          float hf_asymmetry, float xmul, ImageF& diffmap,
+                          double& diffvalue) {
+  ButteraugliParams params;
+  params.hf_asymmetry = hf_asymmetry;
+  params.xmul = xmul;
+  return ButteraugliInterface(rgb0, rgb1, params, diffmap, diffvalue);
+}
+
+bool ButteraugliInterface(const Image3F& rgb0, const Image3F& rgb1,
+                          const ButteraugliParams& params, ImageF& diffmap,
+                          double& diffvalue) {
+#if JXL_PROFILER_ENABLED
+  auto trace_start = std::chrono::steady_clock::now();
+#endif
+  if (!ButteraugliDiffmap(rgb0, rgb1, params, diffmap)) {
+    return false;
+  }
+#if JXL_PROFILER_ENABLED
+  auto trace_end = std::chrono::steady_clock::now();
+  std::chrono::duration<double> elapsed = trace_end - trace_start;
+  const size_t mp = rgb0.xsize() * rgb0.ysize();
+  printf("diff MP/s %f\n", mp / elapsed.count() * 1E-6);
+#endif
+  diffvalue = ButteraugliScoreFromDiffmap(diffmap, &params);
+  return true;
+}
+
+double ButteraugliFuzzyClass(double score) {
+  static const double fuzzy_width_up = 4.8;
+  static const double fuzzy_width_down = 4.8;
+  static const double m0 = 2.0;
+  static const double scaler = 0.7777;
+  double val;
+  if (score < 1.0) {
+    // val in [scaler .. 2.0]
+    val = m0 / (1.0 + exp((score - 1.0) * fuzzy_width_down));
+    val -= 1.0;           // from [1 .. 2] to [0 .. 1]
+    val *= 2.0 - scaler;  // from [0 .. 1] to [0 .. 2.0 - scaler]
+    val += scaler;        // from [0 .. 2.0 - scaler] to [scaler .. 2.0]
+  } else {
+    // val in [0 .. scaler]
+    val = m0 / (1.0 + exp((score - 1.0) * fuzzy_width_up));
+    val *= scaler;
+  }
+  return val;
+}
+
+// #define PRINT_OUT_NORMALIZATION
+
+double ButteraugliFuzzyInverse(double seek) {
+  double pos = 0;
+  // NOLINTNEXTLINE(clang-analyzer-security.FloatLoopCounter)
+  for (double range = 1.0; range >= 1e-10; range *= 0.5) {
+    double cur = ButteraugliFuzzyClass(pos);
+    if (cur < seek) {
+      pos -= range;
+    } else {
+      pos += range;
+    }
+  }
+#ifdef PRINT_OUT_NORMALIZATION
+  if (seek == 1.0) {
+    fprintf(stderr, "Fuzzy inverse %g\n", pos);
+  }
+#endif
+  return pos;
+}
+
+#ifdef PRINT_OUT_NORMALIZATION
+static double print_out_normalization = ButteraugliFuzzyInverse(1.0);
+#endif
+
+namespace {
+
+void ScoreToRgb(double score, double good_threshold, double bad_threshold,
+                float rgb[3]) {
+  double heatmap[12][3] = {
+      {0, 0, 0},       {0, 0, 1},
+      {0, 1, 1},       {0, 1, 0},  // Good level
+      {1, 1, 0},       {1, 0, 0},  // Bad level
+      {1, 0, 1},       {0.5, 0.5, 1.0},
+      {1.0, 0.5, 0.5},  // Pastel colors for the very bad quality range.
+      {1.0, 1.0, 0.5}, {1, 1, 1},
+      {1, 1, 1},  // Last color repeated to have a solid range of white.
+  };
+  if (score < good_threshold) {
+    score = (score / good_threshold) * 0.3;
+  } else if (score < bad_threshold) {
+    score = 0.3 +
+            (score - good_threshold) / (bad_threshold - good_threshold) * 0.15;
+  } else {
+    score = 0.45 + (score - bad_threshold) / (bad_threshold * 12) * 0.5;
+  }
+  static const int kTableSize = sizeof(heatmap) / sizeof(heatmap[0]);
+  score = std::min<double>(std::max<double>(score * (kTableSize - 1), 0.0),
+                           kTableSize - 2);
+  int ix = static_cast<int>(score);
+  ix = std::min(std::max(0, ix), kTableSize - 2);  // Handle NaN
+  double mix = score - ix;
+  for (int i = 0; i < 3; ++i) {
+    double v = mix * heatmap[ix + 1][i] + (1 - mix) * heatmap[ix][i];
+    rgb[i] = pow(v, 0.5);
+  }
+}
+
+}  // namespace
+
+Image3F CreateHeatMapImage(const ImageF& distmap, double good_threshold,
+                           double bad_threshold) {
+  Image3F heatmap(distmap.xsize(), distmap.ysize());
+  for (size_t y = 0; y < distmap.ysize(); ++y) {
+    const float* BUTTERAUGLI_RESTRICT row_distmap = distmap.ConstRow(y);
+    float* BUTTERAUGLI_RESTRICT row_h0 = heatmap.PlaneRow(0, y);
+    float* BUTTERAUGLI_RESTRICT row_h1 = heatmap.PlaneRow(1, y);
+    float* BUTTERAUGLI_RESTRICT row_h2 = heatmap.PlaneRow(2, y);
+    for (size_t x = 0; x < distmap.xsize(); ++x) {
+      const float d = row_distmap[x];
+      float rgb[3];
+      ScoreToRgb(d, good_threshold, bad_threshold, rgb);
+      row_h0[x] = rgb[0];
+      row_h1[x] = rgb[1];
+      row_h2[x] = rgb[2];
+    }
+  }
+  return heatmap;
+}
+
+}  // namespace jxl
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jxl/butteraugli/butteraugli.h b/third_party/jpeg-xl/lib/jxl/butteraugli/butteraugli.h
new file mode 100644
index 0000000000..652b9528c4
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/butteraugli/butteraugli.h
@@ -0,0 +1,209 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+//
+// Author: Jyrki Alakuijala (jyrki.alakuijala@gmail.com)
+
+#ifndef LIB_JXL_BUTTERAUGLI_BUTTERAUGLI_H_
+#define LIB_JXL_BUTTERAUGLI_BUTTERAUGLI_H_
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <atomic>
+#include <cmath>
+#include <memory>
+#include <vector>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/image_ops.h"
+
+#define BUTTERAUGLI_ENABLE_CHECKS 0
+#define BUTTERAUGLI_RESTRICT JXL_RESTRICT
+
+// This is the main interface to butteraugli image similarity
+// analysis function.
+
+namespace jxl {
+
+struct ButteraugliParams {
+  // Multiplier for penalizing new HF artifacts more than blurring away
+  // features. 1.0=neutral.
+  float hf_asymmetry = 1.0f;
+
+  // Multiplier for the psychovisual difference in the X channel.
+  float xmul = 1.0f;
+
+  // Number of nits that correspond to 1.0f input values.
+  float intensity_target = 80.0f;
+};
+
+// ButteraugliInterface defines the public interface for butteraugli.
+//
+// It calculates the difference between rgb0 and rgb1.
+//
+// rgb0 and rgb1 contain the images. rgb0[c][px] and rgb1[c][px] contains
+// the red image for c == 0, green for c == 1, blue for c == 2. Location index
+// px is calculated as y * xsize + x.
+//
+// Value of pixels of images rgb0 and rgb1 need to be represented as raw
+// intensity. Most image formats store gamma corrected intensity in pixel
+// values. This gamma correction has to be removed, by applying the following
+// function to values in the 0-1 range:
+// butteraugli_val = pow(input_val, gamma);
+// A typical value of gamma is 2.2. It is usually stored in the image header.
+// Take care not to confuse that value with its inverse. The gamma value should
+// be always greater than one.
+// Butteraugli does not work as intended if the caller does not perform
+// gamma correction.
+//
+// hf_asymmetry is a multiplier for penalizing new HF artifacts more than
+// blurring away features (1.0 -> neutral).
+//
+// diffmap will contain an image of the size xsize * ysize, containing
+// localized differences for values px (indexed with the px the same as rgb0
+// and rgb1). diffvalue will give a global score of similarity.
+//
+// A diffvalue smaller than kButteraugliGood indicates that images can be
+// observed as the same image.
+// diffvalue larger than kButteraugliBad indicates that a difference between
+// the images can be observed.
+// A diffvalue between kButteraugliGood and kButteraugliBad indicates that
+// a subtle difference can be observed between the images.
+//
+// Returns true on success.
+bool ButteraugliInterface(const Image3F &rgb0, const Image3F &rgb1,
+                          const ButteraugliParams &params, ImageF &diffmap,
+                          double &diffvalue);
+
+// Deprecated (calls the previous function)
+bool ButteraugliInterface(const Image3F &rgb0, const Image3F &rgb1,
+                          float hf_asymmetry, float xmul, ImageF &diffmap,
+                          double &diffvalue);
+
+// Converts the butteraugli score into fuzzy class values that are continuous
+// at the class boundary. The class boundary location is based on human
+// raters, but the slope is arbitrary. Particularly, it does not reflect
+// the expectation value of probabilities of the human raters. It is just
+// expected that a smoother class boundary will allow for higher-level
+// optimization algorithms to work faster.
+//
+// Returns 2.0 for a perfect match, and 1.0 for 'ok', 0.0 for bad. Because the
+// scoring is fuzzy, a butteraugli score of 0.96 would return a class of
+// around 1.9.
+double ButteraugliFuzzyClass(double score);
+
+// Input values should be in range 0 (bad) to 2 (good). Use
+// kButteraugliNormalization as normalization.
+double ButteraugliFuzzyInverse(double seek);
+
+// Implementation details, don't use anything below or your code will
+// break in the future.
+
+#ifdef _MSC_VER
+#define BUTTERAUGLI_INLINE __forceinline
+#else
+#define BUTTERAUGLI_INLINE inline
+#endif
+
+#ifdef __clang__
+// Early versions of Clang did not support __builtin_assume_aligned.
+#define BUTTERAUGLI_HAS_ASSUME_ALIGNED __has_builtin(__builtin_assume_aligned)
+#elif defined(__GNUC__)
+#define BUTTERAUGLI_HAS_ASSUME_ALIGNED 1
+#else
+#define BUTTERAUGLI_HAS_ASSUME_ALIGNED 0
+#endif
+
+// Returns a void* pointer which the compiler then assumes is N-byte aligned.
+// Example: float* JXL_RESTRICT aligned = (float*)JXL_ASSUME_ALIGNED(in, 32);
+//
+// The assignment semantics are required by GCC/Clang. ICC provides an in-place
+// __assume_aligned, whereas MSVC's __assume appears unsuitable.
+#if BUTTERAUGLI_HAS_ASSUME_ALIGNED
+#define BUTTERAUGLI_ASSUME_ALIGNED(ptr, align) \
+  __builtin_assume_aligned((ptr), (align))
+#else
+#define BUTTERAUGLI_ASSUME_ALIGNED(ptr, align) (ptr)
+#endif  // BUTTERAUGLI_HAS_ASSUME_ALIGNED
+
+struct PsychoImage {
+  ImageF uhf[2];  // XY
+  ImageF hf[2];   // XY
+  Image3F mf;     // XYB
+  Image3F lf;     // XYB
+};
+
+// Blur needs a transposed image.
+// Hold it here and only allocate on demand to reduce memory usage.
+struct BlurTemp {
+  ImageF *GetTransposed(const ImageF &in) {
+    if (transposed_temp.xsize() == 0) {
+      transposed_temp = ImageF(in.ysize(), in.xsize());
+    }
+    return &transposed_temp;
+  }
+
+  ImageF transposed_temp;
+};
+
+class ButteraugliComparator {
+ public:
+  // Butteraugli is calibrated at xmul = 1.0. We add a multiplier here so that
+  // we can test the hypothesis that a higher weighing of the X channel would
+  // improve results at higher Butteraugli values.
+  ButteraugliComparator(const Image3F &rgb0, const ButteraugliParams &params);
+  virtual ~ButteraugliComparator() = default;
+
+  // Computes the butteraugli map between the original image given in the
+  // constructor and the distorted image give here.
+  void Diffmap(const Image3F &rgb1, ImageF &result) const;
+
+  // Same as above, but OpsinDynamicsImage() was already applied.
+  void DiffmapOpsinDynamicsImage(const Image3F &xyb1, ImageF &result) const;
+
+  // Same as above, but the frequency decomposition was already applied.
+  void DiffmapPsychoImage(const PsychoImage &pi1, ImageF &diffmap) const;
+
+  void Mask(ImageF *BUTTERAUGLI_RESTRICT mask) const;
+
+ private:
+  Image3F *Temp() const;
+  void ReleaseTemp() const;
+
+  const size_t xsize_;
+  const size_t ysize_;
+  ButteraugliParams params_;
+  PsychoImage pi0_;
+
+  // Shared temporary image storage to reduce the number of allocations;
+  // obtained via Temp(), must call ReleaseTemp when no longer needed.
+  mutable Image3F temp_;
+  mutable std::atomic_flag temp_in_use_ = ATOMIC_FLAG_INIT;
+
+  mutable BlurTemp blur_temp_;
+  std::unique_ptr<ButteraugliComparator> sub_;
+};
+
+// Deprecated.
+bool ButteraugliDiffmap(const Image3F &rgb0, const Image3F &rgb1,
+                        double hf_asymmetry, double xmul, ImageF &diffmap);
+
+bool ButteraugliDiffmap(const Image3F &rgb0, const Image3F &rgb1,
+                        const ButteraugliParams &params, ImageF &diffmap);
+
+double ButteraugliScoreFromDiffmap(const ImageF &diffmap,
+                                   const ButteraugliParams *params = nullptr);
+
+// Generate rgb-representation of the distance between two images.
+Image3F CreateHeatMapImage(const ImageF &distmap, double good_threshold,
+                           double bad_threshold);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_BUTTERAUGLI_BUTTERAUGLI_H_
diff --git a/third_party/jpeg-xl/lib/jxl/butteraugli_test.cc b/third_party/jpeg-xl/lib/jxl/butteraugli_test.cc
new file mode 100644
index 0000000000..3fdec09725
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/butteraugli_test.cc
@@ -0,0 +1,103 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <jxl/butteraugli.h>
+#include <jxl/butteraugli_cxx.h>
+
+#include "lib/jxl/test_image.h"
+#include "lib/jxl/test_utils.h"
+#include "lib/jxl/testing.h"
+
+TEST(ButteraugliTest, Lossless) {
+  uint32_t xsize = 171;
+  uint32_t ysize = 219;
+  std::vector<uint8_t> pixels = jxl::test::GetSomeTestImage(xsize, ysize, 4, 0);
+  JxlPixelFormat pixel_format = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+
+  JxlButteraugliApiPtr api(JxlButteraugliApiCreate(nullptr));
+  JxlButteraugliResultPtr result(JxlButteraugliCompute(
+      api.get(), xsize, ysize, &pixel_format, pixels.data(), pixels.size(),
+      &pixel_format, pixels.data(), pixels.size()));
+  EXPECT_EQ(0.0, JxlButteraugliResultGetDistance(result.get(), 8.0));
+}
+
+TEST(ButteraugliTest, Distmap) {
+  uint32_t xsize = 171;
+  uint32_t ysize = 219;
+  std::vector<uint8_t> pixels = jxl::test::GetSomeTestImage(xsize, ysize, 4, 0);
+  JxlPixelFormat pixel_format = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+
+  JxlButteraugliApiPtr api(JxlButteraugliApiCreate(nullptr));
+  JxlButteraugliResultPtr result(JxlButteraugliCompute(
+      api.get(), xsize, ysize, &pixel_format, pixels.data(), pixels.size(),
+      &pixel_format, pixels.data(), pixels.size()));
+  EXPECT_EQ(0.0, JxlButteraugliResultGetDistance(result.get(), 8.0));
+  const float* distmap;
+  uint32_t row_stride;
+  JxlButteraugliResultGetDistmap(result.get(), &distmap, &row_stride);
+  for (uint32_t y = 0; y < ysize; y++) {
+    for (uint32_t x = 0; x < xsize; x++) {
+      EXPECT_EQ(0.0, distmap[y * row_stride + x]);
+    }
+  }
+}
+
+TEST(ButteraugliTest, Distorted) {
+  uint32_t xsize = 171;
+  uint32_t ysize = 219;
+  std::vector<uint8_t> orig_pixels =
+      jxl::test::GetSomeTestImage(xsize, ysize, 4, 0);
+  std::vector<uint8_t> dist_pixels =
+      jxl::test::GetSomeTestImage(xsize, ysize, 4, 0);
+  dist_pixels[0] += 128;
+
+  JxlPixelFormat pixel_format = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+
+  JxlButteraugliApiPtr api(JxlButteraugliApiCreate(nullptr));
+  JxlButteraugliResultPtr result(JxlButteraugliCompute(
+      api.get(), xsize, ysize, &pixel_format, orig_pixels.data(),
+      orig_pixels.size(), &pixel_format, dist_pixels.data(),
+      dist_pixels.size()));
+  EXPECT_NE(0.0, JxlButteraugliResultGetDistance(result.get(), 8.0));
+}
+
+TEST(ButteraugliTest, Api) {
+  uint32_t xsize = 171;
+  uint32_t ysize = 219;
+  std::vector<uint8_t> orig_pixels =
+      jxl::test::GetSomeTestImage(xsize, ysize, 4, 0);
+  std::vector<uint8_t> dist_pixels =
+      jxl::test::GetSomeTestImage(xsize, ysize, 4, 0);
+  dist_pixels[0] += 128;
+
+  JxlPixelFormat pixel_format = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+
+  JxlButteraugliApiPtr api(JxlButteraugliApiCreate(nullptr));
+  JxlButteraugliApiSetHFAsymmetry(api.get(), 1.0f);
+  JxlButteraugliApiSetIntensityTarget(api.get(), 250.0f);
+  JxlButteraugliResultPtr result(JxlButteraugliCompute(
+      api.get(), xsize, ysize, &pixel_format, orig_pixels.data(),
+      orig_pixels.size(), &pixel_format, dist_pixels.data(),
+      dist_pixels.size()));
+  double distance0 = JxlButteraugliResultGetDistance(result.get(), 8.0);
+
+  JxlButteraugliApiSetHFAsymmetry(api.get(), 2.0f);
+  result.reset(JxlButteraugliCompute(api.get(), xsize, ysize, &pixel_format,
+                                     orig_pixels.data(), orig_pixels.size(),
+                                     &pixel_format, dist_pixels.data(),
+                                     dist_pixels.size()));
+  double distance1 = JxlButteraugliResultGetDistance(result.get(), 8.0);
+
+  EXPECT_NE(distance0, distance1);
+
+  JxlButteraugliApiSetIntensityTarget(api.get(), 80.0f);
+  result.reset(JxlButteraugliCompute(api.get(), xsize, ysize, &pixel_format,
+                                     orig_pixels.data(), orig_pixels.size(),
+                                     &pixel_format, dist_pixels.data(),
+                                     dist_pixels.size()));
+  double distance2 = JxlButteraugliResultGetDistance(result.get(), 8.0);
+
+  EXPECT_NE(distance1, distance2);
+}
diff --git a/third_party/jpeg-xl/lib/jxl/butteraugli_wrapper.cc b/third_party/jpeg-xl/lib/jxl/butteraugli_wrapper.cc
new file mode 100644
index 0000000000..c5a1a8e506
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/butteraugli_wrapper.cc
@@ -0,0 +1,203 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <jxl/butteraugli.h>
+#include <jxl/parallel_runner.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <atomic>
+
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/profiler.h"
+#include "lib/jxl/butteraugli/butteraugli.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/enc_butteraugli_comparator.h"
+#include "lib/jxl/enc_butteraugli_pnorm.h"
+#include "lib/jxl/enc_color_management.h"
+#include "lib/jxl/enc_external_image.h"
+#include "lib/jxl/image_bundle.h"
+#include "lib/jxl/memory_manager_internal.h"
+
+namespace {
+
+void SetMetadataFromPixelFormat(const JxlPixelFormat* pixel_format,
+                                jxl::ImageMetadata* metadata) {
+  uint32_t potential_alpha_bits = 0;
+  switch (pixel_format->data_type) {
+    case JXL_TYPE_FLOAT:
+      metadata->SetFloat32Samples();
+      potential_alpha_bits = 16;
+      break;
+    case JXL_TYPE_FLOAT16:
+      metadata->SetFloat16Samples();
+      potential_alpha_bits = 16;
+      break;
+    case JXL_TYPE_UINT16:
+      metadata->SetUintSamples(16);
+      potential_alpha_bits = 16;
+      break;
+    case JXL_TYPE_UINT8:
+      metadata->SetUintSamples(8);
+      potential_alpha_bits = 8;
+      break;
+    default:
+      JXL_ABORT("Unhandled JxlDataType");
+  }
+  if (pixel_format->num_channels == 2 || pixel_format->num_channels == 4) {
+    metadata->SetAlphaBits(potential_alpha_bits);
+  }
+}
+
+}  // namespace
+
+struct JxlButteraugliResultStruct {
+  JxlMemoryManager memory_manager;
+
+  jxl::ImageF distmap;
+  jxl::ButteraugliParams params;
+};
+
+struct JxlButteraugliApiStruct {
+  // Multiplier for penalizing new HF artifacts more than blurring away
+  // features. 1.0=neutral.
+  float hf_asymmetry = 1.0f;
+
+  // Multiplier for the psychovisual difference in the X channel.
+  float xmul = 1.0f;
+
+  // Number of nits that correspond to 1.0f input values.
+  float intensity_target = jxl::kDefaultIntensityTarget;
+
+  JxlCmsInterface cms;
+  JxlMemoryManager memory_manager;
+  std::unique_ptr<jxl::ThreadPool> thread_pool{nullptr};
+};
+
+JxlButteraugliApi* JxlButteraugliApiCreate(
+    const JxlMemoryManager* memory_manager) {
+  JxlMemoryManager local_memory_manager;
+  if (!jxl::MemoryManagerInit(&local_memory_manager, memory_manager))
+    return nullptr;
+
+  void* alloc =
+      jxl::MemoryManagerAlloc(&local_memory_manager, sizeof(JxlButteraugliApi));
+  if (!alloc) return nullptr;
+  // Placement new constructor on allocated memory
+  JxlButteraugliApi* ret = new (alloc) JxlButteraugliApi();
+  ret->cms = jxl::GetJxlCms();
+  ret->memory_manager = local_memory_manager;
+  return ret;
+}
+
+void JxlButteraugliApiSetParallelRunner(JxlButteraugliApi* api,
+                                        JxlParallelRunner parallel_runner,
+                                        void* parallel_runner_opaque) {
+  api->thread_pool = jxl::make_unique<jxl::ThreadPool>(parallel_runner,
+                                                       parallel_runner_opaque);
+}
+
+void JxlButteraugliApiSetHFAsymmetry(JxlButteraugliApi* api, float v) {
+  api->hf_asymmetry = v;
+}
+
+void JxlButteraugliApiSetIntensityTarget(JxlButteraugliApi* api, float v) {
+  api->intensity_target = v;
+}
+
+void JxlButteraugliApiDestroy(JxlButteraugliApi* api) {
+  if (api) {
+    JxlMemoryManager local_memory_manager = api->memory_manager;
+    // Call destructor directly since custom free function is used.
+    api->~JxlButteraugliApi();
+    jxl::MemoryManagerFree(&local_memory_manager, api);
+  }
+}
+
+JxlButteraugliResult* JxlButteraugliCompute(
+    const JxlButteraugliApi* api, uint32_t xsize, uint32_t ysize,
+    const JxlPixelFormat* pixel_format_orig, const void* buffer_orig,
+    size_t size_orig, const JxlPixelFormat* pixel_format_dist,
+    const void* buffer_dist, size_t size_dist) {
+  jxl::ImageMetadata orig_metadata;
+  SetMetadataFromPixelFormat(pixel_format_orig, &orig_metadata);
+  jxl::ImageBundle orig_ib(&orig_metadata);
+  jxl::ColorEncoding c_current;
+  if (pixel_format_orig->data_type == JXL_TYPE_FLOAT) {
+    c_current =
+        jxl::ColorEncoding::LinearSRGB(pixel_format_orig->num_channels < 3);
+  } else {
+    c_current = jxl::ColorEncoding::SRGB(pixel_format_orig->num_channels < 3);
+  }
+  if (!jxl::BufferToImageBundle(*pixel_format_orig, xsize, ysize, buffer_orig,
+                                size_orig, api->thread_pool.get(), c_current,
+                                &orig_ib)) {
+    return nullptr;
+  }
+
+  jxl::ImageMetadata dist_metadata;
+  SetMetadataFromPixelFormat(pixel_format_dist, &dist_metadata);
+  jxl::ImageBundle dist_ib(&dist_metadata);
+  if (pixel_format_dist->data_type == JXL_TYPE_FLOAT) {
+    c_current =
+        jxl::ColorEncoding::LinearSRGB(pixel_format_dist->num_channels < 3);
+  } else {
+    c_current = jxl::ColorEncoding::SRGB(pixel_format_dist->num_channels < 3);
+  }
+  if (!jxl::BufferToImageBundle(*pixel_format_dist, xsize, ysize, buffer_dist,
+                                size_dist, api->thread_pool.get(), c_current,
+                                &dist_ib)) {
+    return nullptr;
+  }
+
+  void* alloc = jxl::MemoryManagerAlloc(&api->memory_manager,
+                                        sizeof(JxlButteraugliResult));
+  if (!alloc) return nullptr;
+  // Placement new constructor on allocated memory
+  JxlButteraugliResult* result = new (alloc) JxlButteraugliResult();
+  result->memory_manager = api->memory_manager;
+  result->params.hf_asymmetry = api->hf_asymmetry;
+  result->params.xmul = api->xmul;
+  result->params.intensity_target = api->intensity_target;
+  jxl::ButteraugliDistance(orig_ib, dist_ib, result->params, api->cms,
+                           &result->distmap, api->thread_pool.get());
+
+  return result;
+}
+
+float JxlButteraugliResultGetDistance(const JxlButteraugliResult* result,
+                                      float pnorm) {
+  return static_cast<float>(
+      jxl::ComputeDistanceP(result->distmap, result->params, pnorm));
+}
+
+void JxlButteraugliResultGetDistmap(const JxlButteraugliResult* result,
+                                    const float** buffer,
+                                    uint32_t* row_stride) {
+  *buffer = result->distmap.Row(0);
+  *row_stride = result->distmap.PixelsPerRow();
+}
+
+float JxlButteraugliResultGetMaxDistance(const JxlButteraugliResult* result) {
+  float max_distance = 0.0;
+  for (uint32_t y = 0; y < result->distmap.ysize(); y++) {
+    for (uint32_t x = 0; x < result->distmap.xsize(); x++) {
+      if (result->distmap.ConstRow(y)[x] > max_distance) {
+        max_distance = result->distmap.ConstRow(y)[x];
+      }
+    }
+  }
+  return max_distance;
+}
+
+void JxlButteraugliResultDestroy(JxlButteraugliResult* result) {
+  if (result) {
+    JxlMemoryManager local_memory_manager = result->memory_manager;
+    // Call destructor directly since custom free function is used.
+    result->~JxlButteraugliResult();
+    jxl::MemoryManagerFree(&local_memory_manager, result);
+  }
+}
diff --git a/third_party/jpeg-xl/lib/jxl/byte_order_test.cc b/third_party/jpeg-xl/lib/jxl/byte_order_test.cc
new file mode 100644
index 0000000000..17d7ef6643
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/byte_order_test.cc
@@ -0,0 +1,53 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/base/byte_order.h"
+
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+namespace {
+
+TEST(ByteOrderTest, TestRoundTripBE16) {
+  const uint32_t in = 0x1234;
+  uint8_t buf[2];
+  StoreBE16(in, buf);
+  EXPECT_EQ(in, LoadBE16(buf));
+  EXPECT_NE(in, LoadLE16(buf));
+}
+
+TEST(ByteOrderTest, TestRoundTripLE16) {
+  const uint32_t in = 0x1234;
+  uint8_t buf[2];
+  StoreLE16(in, buf);
+  EXPECT_EQ(in, LoadLE16(buf));
+  EXPECT_NE(in, LoadBE16(buf));
+}
+
+TEST(ByteOrderTest, TestRoundTripBE32) {
+  const uint32_t in = 0xFEDCBA98u;
+  uint8_t buf[4];
+  StoreBE32(in, buf);
+  EXPECT_EQ(in, LoadBE32(buf));
+  EXPECT_NE(in, LoadLE32(buf));
+}
+
+TEST(ByteOrderTest, TestRoundTripLE32) {
+  const uint32_t in = 0xFEDCBA98u;
+  uint8_t buf[4];
+  StoreLE32(in, buf);
+  EXPECT_EQ(in, LoadLE32(buf));
+  EXPECT_NE(in, LoadBE32(buf));
+}
+
+TEST(ByteOrderTest, TestRoundTripLE64) {
+  const uint64_t in = 0xFEDCBA9876543210ull;
+  uint8_t buf[8];
+  StoreLE64(in, buf);
+  EXPECT_EQ(in, LoadLE64(buf));
+}
+
+}  // namespace
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/chroma_from_luma.cc b/third_party/jpeg-xl/lib/jxl/chroma_from_luma.cc
new file mode 100644
index 0000000000..63d21cbb4b
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/chroma_from_luma.cc
@@ -0,0 +1,21 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/chroma_from_luma.h"
+
+namespace jxl {
+
+ColorCorrelationMap::ColorCorrelationMap(size_t xsize, size_t ysize, bool XYB)
+    : ytox_map(DivCeil(xsize, kColorTileDim), DivCeil(ysize, kColorTileDim)),
+      ytob_map(DivCeil(xsize, kColorTileDim), DivCeil(ysize, kColorTileDim)) {
+  ZeroFillImage(&ytox_map);
+  ZeroFillImage(&ytob_map);
+  if (!XYB) {
+    base_correlation_b_ = 0;
+  }
+  RecomputeDCFactors();
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/chroma_from_luma.h b/third_party/jpeg-xl/lib/jxl/chroma_from_luma.h
new file mode 100644
index 0000000000..9a7f3d45bc
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/chroma_from_luma.h
@@ -0,0 +1,147 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_CHROMA_FROM_LUMA_H_
+#define LIB_JXL_CHROMA_FROM_LUMA_H_
+
+// Chroma-from-luma, computed using heuristics to determine the best linear
+// model for the X and B channels from the Y channel.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <vector>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/entropy_coder.h"
+#include "lib/jxl/field_encodings.h"
+#include "lib/jxl/fields.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/opsin_params.h"
+#include "lib/jxl/quant_weights.h"
+
+namespace jxl {
+
+// Tile is the rectangular grid of blocks that share color correlation
+// parameters ("factor_x/b" such that residual_b = blue - Y * factor_b).
+static constexpr size_t kColorTileDim = 64;
+
+static_assert(kColorTileDim % kBlockDim == 0,
+              "Color tile dim should be divisible by block dim");
+static constexpr size_t kColorTileDimInBlocks = kColorTileDim / kBlockDim;
+
+static_assert(kGroupDimInBlocks % kColorTileDimInBlocks == 0,
+              "Group dim should be divisible by color tile dim");
+
+static constexpr uint8_t kDefaultColorFactor = 84;
+
+// JPEG DCT coefficients are at most 1024. CfL constants are at most 127, and
+// the ratio of two entries in a JPEG quantization table is at most 255. Thus,
+// since the CfL denominator is 84, this leaves 12 bits of mantissa to be used.
+// For extra caution, we use 11.
+static constexpr uint8_t kCFLFixedPointPrecision = 11;
+
+static constexpr U32Enc kColorFactorDist(Val(kDefaultColorFactor), Val(256),
+                                         BitsOffset(8, 2), BitsOffset(16, 258));
+
+struct ColorCorrelationMap {
+  ColorCorrelationMap() = default;
+  // xsize/ysize are in pixels
+  // set XYB=false to do something close to no-op cmap (needed for now since
+  // cmap is mandatory)
+  ColorCorrelationMap(size_t xsize, size_t ysize, bool XYB = true);
+
+  float YtoXRatio(int32_t x_factor) const {
+    return base_correlation_x_ + x_factor * color_scale_;
+  }
+
+  float YtoBRatio(int32_t b_factor) const {
+    return base_correlation_b_ + b_factor * color_scale_;
+  }
+
+  Status DecodeDC(BitReader* br) {
+    if (br->ReadFixedBits<1>() == 1) {
+      // All default.
+      return true;
+    }
+    SetColorFactor(U32Coder::Read(kColorFactorDist, br));
+    JXL_RETURN_IF_ERROR(F16Coder::Read(br, &base_correlation_x_));
+    if (std::abs(base_correlation_x_) > 4.0f) {
+      return JXL_FAILURE("Base X correlation is out of range");
+    }
+    JXL_RETURN_IF_ERROR(F16Coder::Read(br, &base_correlation_b_));
+    if (std::abs(base_correlation_b_) > 4.0f) {
+      return JXL_FAILURE("Base B correlation is out of range");
+    }
+    ytox_dc_ = static_cast<int>(br->ReadFixedBits<kBitsPerByte>()) +
+               std::numeric_limits<int8_t>::min();
+    ytob_dc_ = static_cast<int>(br->ReadFixedBits<kBitsPerByte>()) +
+               std::numeric_limits<int8_t>::min();
+    RecomputeDCFactors();
+    return true;
+  }
+
+  // We consider a CfL map to be JPEG-reconstruction-compatible if base
+  // correlation is 0, no DC correlation is used, and we use the default color
+  // factor.
+  bool IsJPEGCompatible() const {
+    return base_correlation_x_ == 0 && base_correlation_b_ == 0 &&
+           ytob_dc_ == 0 && ytox_dc_ == 0 &&
+           color_factor_ == kDefaultColorFactor;
+  }
+
+  int32_t RatioJPEG(int32_t factor) const {
+    return factor * (1 << kCFLFixedPointPrecision) / kDefaultColorFactor;
+  }
+
+  void SetColorFactor(uint32_t factor) {
+    color_factor_ = factor;
+    color_scale_ = 1.0f / color_factor_;
+    RecomputeDCFactors();
+  }
+
+  void SetYToBDC(int32_t ytob_dc) {
+    ytob_dc_ = ytob_dc;
+    RecomputeDCFactors();
+  }
+  void SetYToXDC(int32_t ytox_dc) {
+    ytox_dc_ = ytox_dc;
+    RecomputeDCFactors();
+  }
+
+  int32_t GetYToXDC() const { return ytox_dc_; }
+  int32_t GetYToBDC() const { return ytob_dc_; }
+  float GetColorFactor() const { return color_factor_; }
+  float GetBaseCorrelationX() const { return base_correlation_x_; }
+  float GetBaseCorrelationB() const { return base_correlation_b_; }
+
+  const float* DCFactors() const { return dc_factors_; }
+
+  void RecomputeDCFactors() {
+    dc_factors_[0] = YtoXRatio(ytox_dc_);
+    dc_factors_[2] = YtoBRatio(ytob_dc_);
+  }
+
+  ImageSB ytox_map;
+  ImageSB ytob_map;
+
+ private:
+  float dc_factors_[4] = {};
+  // range of factor: -1.51 to +1.52
+  uint32_t color_factor_ = kDefaultColorFactor;
+  float color_scale_ = 1.0f / color_factor_;
+  float base_correlation_x_ = 0.0f;
+  float base_correlation_b_ = kYToBRatio;
+  int32_t ytox_dc_ = 0;
+  int32_t ytob_dc_ = 0;
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_CHROMA_FROM_LUMA_H_
diff --git a/third_party/jpeg-xl/lib/jxl/codec_in_out.h b/third_party/jpeg-xl/lib/jxl/codec_in_out.h
new file mode 100644
index 0000000000..9e48b5e937
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/codec_in_out.h
@@ -0,0 +1,116 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_CODEC_IN_OUT_H_
+#define LIB_JXL_CODEC_IN_OUT_H_
+
+// Holds inputs/outputs for decoding/encoding images.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/frame_header.h"
+#include "lib/jxl/headers.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/image_bundle.h"
+#include "lib/jxl/luminance.h"
+
+namespace jxl {
+
+// Optional text/EXIF metadata.
+struct Blobs {
+  std::vector<uint8_t> exif;
+  std::vector<uint8_t> iptc;
+  std::vector<uint8_t> jumbf;
+  std::vector<uint8_t> xmp;
+};
+
+// Holds a preview, a main image or one or more frames, plus the inputs/outputs
+// to/from decoding/encoding.
+class CodecInOut {
+ public:
+  CodecInOut() : preview_frame(&metadata.m) {
+    frames.reserve(1);
+    frames.emplace_back(&metadata.m);
+  }
+
+  // Move-only.
+  CodecInOut(CodecInOut&&) = default;
+  CodecInOut& operator=(CodecInOut&&) = default;
+
+  size_t LastStillFrame() const {
+    JXL_DASSERT(!frames.empty());
+    size_t last = 0;
+    for (size_t i = 0; i < frames.size(); i++) {
+      last = i;
+      if (frames[i].duration > 0) break;
+    }
+    return last;
+  }
+
+  ImageBundle& Main() { return frames[LastStillFrame()]; }
+  const ImageBundle& Main() const { return frames[LastStillFrame()]; }
+
+  // If c_current.IsGray(), all planes must be identical.
+  void SetFromImage(Image3F&& color, const ColorEncoding& c_current) {
+    Main().SetFromImage(std::move(color), c_current);
+    SetIntensityTarget(&this->metadata.m);
+    SetSize(Main().xsize(), Main().ysize());
+  }
+
+  void SetSize(size_t xsize, size_t ysize) {
+    JXL_CHECK(metadata.size.Set(xsize, ysize));
+  }
+
+  void CheckMetadata() const {
+    JXL_CHECK(metadata.m.bit_depth.bits_per_sample != 0);
+    JXL_CHECK(!metadata.m.color_encoding.ICC().empty());
+
+    if (preview_frame.xsize() != 0) preview_frame.VerifyMetadata();
+    JXL_CHECK(preview_frame.metadata() == &metadata.m);
+
+    for (const ImageBundle& ib : frames) {
+      ib.VerifyMetadata();
+      JXL_CHECK(ib.metadata() == &metadata.m);
+    }
+  }
+
+  size_t xsize() const { return metadata.size.xsize(); }
+  size_t ysize() const { return metadata.size.ysize(); }
+  void ShrinkTo(size_t xsize, size_t ysize) {
+    // preview is unaffected.
+    for (ImageBundle& ib : frames) {
+      ib.ShrinkTo(xsize, ysize);
+    }
+    SetSize(xsize, ysize);
+  }
+
+  // -- DECODER OUTPUT, ENCODER INPUT:
+
+  // Metadata stored into / retrieved from bitstreams.
+
+  Blobs blobs;
+
+  CodecMetadata metadata;  // applies to preview and all frames
+
+  // If metadata.have_preview:
+  ImageBundle preview_frame;
+
+  std::vector<ImageBundle> frames;  // size=1 if !metadata.have_animation
+
+  // If the image should be written to a JPEG, use this quality for encoding.
+  size_t jpeg_quality;
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_CODEC_IN_OUT_H_
diff --git a/third_party/jpeg-xl/lib/jxl/coeff_order.cc b/third_party/jpeg-xl/lib/jxl/coeff_order.cc
new file mode 100644
index 0000000000..43adafd82a
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/coeff_order.cc
@@ -0,0 +1,153 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/coeff_order.h"
+
+#include <stdint.h>
+
+#include <algorithm>
+#include <vector>
+
+#include "lib/jxl/ans_params.h"
+#include "lib/jxl/base/padded_bytes.h"
+#include "lib/jxl/base/profiler.h"
+#include "lib/jxl/base/span.h"
+#include "lib/jxl/coeff_order_fwd.h"
+#include "lib/jxl/dec_ans.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/entropy_coder.h"
+#include "lib/jxl/lehmer_code.h"
+#include "lib/jxl/modular/encoding/encoding.h"
+#include "lib/jxl/modular/modular_image.h"
+
+namespace jxl {
+
+uint32_t CoeffOrderContext(uint32_t val) {
+  uint32_t token, nbits, bits;
+  HybridUintConfig(0, 0, 0).Encode(val, &token, &nbits, &bits);
+  return std::min(token, kPermutationContexts - 1);
+}
+
+namespace {
+Status ReadPermutation(size_t skip, size_t size, coeff_order_t* order,
+                       BitReader* br, ANSSymbolReader* reader,
+                       const std::vector<uint8_t>& context_map) {
+  std::vector<LehmerT> lehmer(size);
+  // temp space needs to be as large as the next power of 2, so doubling the
+  // allocated size is enough.
+  std::vector<uint32_t> temp(size * 2);
+  uint32_t end =
+      reader->ReadHybridUint(CoeffOrderContext(size), br, context_map) + skip;
+  if (end > size) {
+    return JXL_FAILURE("Invalid permutation size");
+  }
+  uint32_t last = 0;
+  for (size_t i = skip; i < end; ++i) {
+    lehmer[i] =
+        reader->ReadHybridUint(CoeffOrderContext(last), br, context_map);
+    last = lehmer[i];
+    if (lehmer[i] + i >= size) {
+      return JXL_FAILURE("Invalid lehmer code");
+    }
+  }
+  if (order == nullptr) return true;
+  DecodeLehmerCode(lehmer.data(), temp.data(), size, order);
+  return true;
+}
+
+}  // namespace
+
+Status DecodePermutation(size_t skip, size_t size, coeff_order_t* order,
+                         BitReader* br) {
+  std::vector<uint8_t> context_map;
+  ANSCode code;
+  JXL_RETURN_IF_ERROR(
+      DecodeHistograms(br, kPermutationContexts, &code, &context_map));
+  ANSSymbolReader reader(&code, br);
+  JXL_RETURN_IF_ERROR(
+      ReadPermutation(skip, size, order, br, &reader, context_map));
+  if (!reader.CheckANSFinalState()) {
+    return JXL_FAILURE("Invalid ANS stream");
+  }
+  return true;
+}
+
+namespace {
+
+Status DecodeCoeffOrder(AcStrategy acs, coeff_order_t* order, BitReader* br,
+                        ANSSymbolReader* reader,
+                        std::vector<coeff_order_t>& natural_order,
+                        const std::vector<uint8_t>& context_map) {
+  PROFILER_FUNC;
+  const size_t llf = acs.covered_blocks_x() * acs.covered_blocks_y();
+  const size_t size = kDCTBlockSize * llf;
+
+  JXL_RETURN_IF_ERROR(
+      ReadPermutation(llf, size, order, br, reader, context_map));
+  if (order == nullptr) return true;
+  for (size_t k = 0; k < size; ++k) {
+    order[k] = natural_order[order[k]];
+  }
+  return true;
+}
+
+}  // namespace
+
+Status DecodeCoeffOrders(uint16_t used_orders, uint32_t used_acs,
+                         coeff_order_t* order, BitReader* br) {
+  uint16_t computed = 0;
+  std::vector<uint8_t> context_map;
+  ANSCode code;
+  std::unique_ptr<ANSSymbolReader> reader;
+  std::vector<coeff_order_t> natural_order;
+  // Bitstream does not have histograms if no coefficient order is used.
+  if (used_orders != 0) {
+    JXL_RETURN_IF_ERROR(
+        DecodeHistograms(br, kPermutationContexts, &code, &context_map));
+    reader = make_unique<ANSSymbolReader>(&code, br);
+  }
+  uint32_t acs_mask = 0;
+  for (uint8_t o = 0; o < AcStrategy::kNumValidStrategies; ++o) {
+    if ((used_acs & (1 << o)) == 0) continue;
+    acs_mask |= 1 << kStrategyOrder[o];
+  }
+  for (uint8_t o = 0; o < AcStrategy::kNumValidStrategies; ++o) {
+    uint8_t ord = kStrategyOrder[o];
+    if (computed & (1 << ord)) continue;
+    computed |= 1 << ord;
+    AcStrategy acs = AcStrategy::FromRawStrategy(o);
+    bool used = (acs_mask & (1 << ord)) != 0;
+
+    const size_t llf = acs.covered_blocks_x() * acs.covered_blocks_y();
+    const size_t size = kDCTBlockSize * llf;
+
+    if (used || (used_orders & (1 << ord))) {
+      if (natural_order.size() < size) natural_order.resize(size);
+      acs.ComputeNaturalCoeffOrder(natural_order.data());
+    }
+
+    if ((used_orders & (1 << ord)) == 0) {
+      // No need to set the default order if no ACS uses this order.
+      if (used) {
+        for (size_t c = 0; c < 3; c++) {
+          memcpy(&order[CoeffOrderOffset(ord, c)], natural_order.data(),
+                 size * sizeof(*order));
+        }
+      }
+    } else {
+      for (size_t c = 0; c < 3; c++) {
+        coeff_order_t* dest = used ? &order[CoeffOrderOffset(ord, c)] : nullptr;
+        JXL_RETURN_IF_ERROR(DecodeCoeffOrder(acs, dest, br, reader.get(),
+                                             natural_order, context_map));
+      }
+    }
+  }
+  if (used_orders && !reader->CheckANSFinalState()) {
+    return JXL_FAILURE("Invalid ANS stream");
+  }
+  return true;
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/coeff_order.h b/third_party/jpeg-xl/lib/jxl/coeff_order.h
new file mode 100644
index 0000000000..fb32499f2f
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/coeff_order.h
@@ -0,0 +1,64 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_COEFF_ORDER_H_
+#define LIB_JXL_COEFF_ORDER_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "lib/jxl/ac_strategy.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/coeff_order_fwd.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/dct_util.h"
+
+namespace jxl {
+
+class BitReader;
+
+// Those offsets get multiplied by kDCTBlockSize.
+static constexpr size_t kCoeffOrderOffset[] = {
+    0,    1,    2,    3,    4,    5,    6,    10,   14,   18,
+    34,   50,   66,   68,   70,   72,   76,   80,   84,   92,
+    100,  108,  172,  236,  300,  332,  364,  396,  652,  908,
+    1164, 1292, 1420, 1548, 2572, 3596, 4620, 5132, 5644, 6156,
+};
+static_assert(3 * kNumOrders + 1 ==
+                  sizeof(kCoeffOrderOffset) / sizeof(*kCoeffOrderOffset),
+              "Update this array when adding or removing order types.");
+
+static constexpr size_t CoeffOrderOffset(size_t order, size_t c) {
+  return kCoeffOrderOffset[3 * order + c] * kDCTBlockSize;
+}
+
+static constexpr size_t kCoeffOrderMaxSize =
+    kCoeffOrderOffset[3 * kNumOrders] * kDCTBlockSize;
+
+// Mapping from AC strategy to order bucket. Strategies with different natural
+// orders must have different buckets.
+constexpr uint8_t kStrategyOrder[] = {
+    0, 1, 1, 1, 2, 3, 4, 4, 5,  5,  6,  6,  1,  1,
+    1, 1, 1, 1, 7, 8, 8, 9, 10, 10, 11, 12, 12,
+};
+
+static_assert(AcStrategy::kNumValidStrategies ==
+                  sizeof(kStrategyOrder) / sizeof(*kStrategyOrder),
+              "Update this array when adding or removing AC strategies.");
+
+constexpr uint32_t kPermutationContexts = 8;
+
+uint32_t CoeffOrderContext(uint32_t val);
+
+Status DecodeCoeffOrders(uint16_t used_orders, uint32_t used_acs,
+                         coeff_order_t* order, BitReader* br);
+
+Status DecodePermutation(size_t skip, size_t size, coeff_order_t* order,
+                         BitReader* br);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_COEFF_ORDER_H_
diff --git a/third_party/jpeg-xl/lib/jxl/coeff_order_fwd.h b/third_party/jpeg-xl/lib/jxl/coeff_order_fwd.h
new file mode 100644
index 0000000000..26306575c1
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/coeff_order_fwd.h
@@ -0,0 +1,47 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_COEFF_ORDER_FWD_H_
+#define LIB_JXL_COEFF_ORDER_FWD_H_
+
+// Breaks circular dependency between ac_strategy and coeff_order.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "lib/jxl/base/compiler_specific.h"
+
+namespace jxl {
+
+// Needs at least 16 bits. A 32-bit type speeds up DecodeAC by 2% at the cost of
+// more memory.
+using coeff_order_t = uint32_t;
+
+// Maximum number of orders to be used. Note that this needs to be multiplied by
+// the number of channels. One per "size class" (plus one extra for DCT8),
+// shared between transforms of size XxY and of size YxX.
+constexpr uint8_t kNumOrders = 13;
+
+// DCT coefficients are laid out in such a way that the number of rows of
+// coefficients is always the smaller coordinate.
+JXL_INLINE constexpr size_t CoefficientRows(size_t rows, size_t columns) {
+  return rows < columns ? rows : columns;
+}
+
+JXL_INLINE constexpr size_t CoefficientColumns(size_t rows, size_t columns) {
+  return rows < columns ? columns : rows;
+}
+
+JXL_INLINE void CoefficientLayout(size_t* JXL_RESTRICT rows,
+                                  size_t* JXL_RESTRICT columns) {
+  size_t r = *rows;
+  size_t c = *columns;
+  *rows = CoefficientRows(r, c);
+  *columns = CoefficientColumns(r, c);
+}
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_COEFF_ORDER_FWD_H_
diff --git a/third_party/jpeg-xl/lib/jxl/coeff_order_test.cc b/third_party/jpeg-xl/lib/jxl/coeff_order_test.cc
new file mode 100644
index 0000000000..6fa0775697
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/coeff_order_test.cc
@@ -0,0 +1,97 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/coeff_order.h"
+
+#include <stdio.h>
+
+#include <algorithm>
+#include <numeric>  // iota
+#include <utility>
+#include <vector>
+
+#include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/base/random.h"
+#include "lib/jxl/base/span.h"
+#include "lib/jxl/coeff_order_fwd.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/enc_coeff_order.h"
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+namespace {
+
+void RoundtripPermutation(coeff_order_t* perm, coeff_order_t* out, size_t len,
+                          size_t* size) {
+  BitWriter writer;
+  EncodePermutation(perm, 0, len, &writer, 0, nullptr);
+  writer.ZeroPadToByte();
+  Status status = true;
+  {
+    BitReader reader(writer.GetSpan());
+    BitReaderScopedCloser closer(&reader, &status);
+    ASSERT_TRUE(DecodePermutation(0, len, out, &reader));
+  }
+  ASSERT_TRUE(status);
+  *size = writer.GetSpan().size();
+}
+
+enum Permutation { kIdentity, kFewSwaps, kFewSlides, kRandom };
+
+constexpr size_t kSwaps = 32;
+
+void TestPermutation(Permutation kind, size_t len) {
+  std::vector<coeff_order_t> perm(len);
+  std::iota(perm.begin(), perm.end(), 0);
+  Rng rng(0);
+  if (kind == kFewSwaps) {
+    for (size_t i = 0; i < kSwaps; i++) {
+      size_t a = rng.UniformU(0, len - 1);
+      size_t b = rng.UniformU(0, len - 1);
+      std::swap(perm[a], perm[b]);
+    }
+  }
+  if (kind == kFewSlides) {
+    for (size_t i = 0; i < kSwaps; i++) {
+      size_t a = rng.UniformU(0, len - 1);
+      size_t b = rng.UniformU(0, len - 1);
+      size_t from = std::min(a, b);
+      size_t to = std::max(a, b);
+      size_t start = perm[from];
+      for (size_t j = from; j < to; j++) {
+        perm[j] = perm[j + 1];
+      }
+      perm[to] = start;
+    }
+  }
+  if (kind == kRandom) {
+    rng.Shuffle(perm.data(), perm.size());
+  }
+  std::vector<coeff_order_t> out(len);
+  size_t size = 0;
+  RoundtripPermutation(perm.data(), out.data(), len, &size);
+  for (size_t idx = 0; idx < len; idx++) {
+    EXPECT_EQ(perm[idx], out[idx]);
+  }
+  printf("Encoded size: %" PRIuS "\n", size);
+}
+
+TEST(CoeffOrderTest, IdentitySmall) { TestPermutation(kIdentity, 256); }
+TEST(CoeffOrderTest, FewSlidesSmall) { TestPermutation(kFewSlides, 256); }
+TEST(CoeffOrderTest, FewSwapsSmall) { TestPermutation(kFewSwaps, 256); }
+TEST(CoeffOrderTest, RandomSmall) { TestPermutation(kRandom, 256); }
+
+TEST(CoeffOrderTest, IdentityMedium) { TestPermutation(kIdentity, 1 << 12); }
+TEST(CoeffOrderTest, FewSlidesMedium) { TestPermutation(kFewSlides, 1 << 12); }
+TEST(CoeffOrderTest, FewSwapsMedium) { TestPermutation(kFewSwaps, 1 << 12); }
+TEST(CoeffOrderTest, RandomMedium) { TestPermutation(kRandom, 1 << 12); }
+
+TEST(CoeffOrderTest, IdentityBig) { TestPermutation(kIdentity, 1 << 16); }
+TEST(CoeffOrderTest, FewSlidesBig) { TestPermutation(kFewSlides, 1 << 16); }
+TEST(CoeffOrderTest, FewSwapsBig) { TestPermutation(kFewSwaps, 1 << 16); }
+TEST(CoeffOrderTest, RandomBig) { TestPermutation(kRandom, 1 << 16); }
+
+}  // namespace
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/color_encoding_internal.cc b/third_party/jpeg-xl/lib/jxl/color_encoding_internal.cc
new file mode 100644
index 0000000000..e496accfed
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/color_encoding_internal.cc
@@ -0,0 +1,753 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/color_encoding_internal.h"
+
+#include <errno.h>
+
+#include <array>
+#include <cmath>
+
+#include "lib/jxl/color_management.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/fields.h"
+#include "lib/jxl/matrix_ops.h"
+
+namespace jxl {
+namespace {
+
+// Highest reasonable value for the gamma of a transfer curve.
+constexpr uint32_t kMaxGamma = 8192;
+
+// These strings are baked into Description - do not change.
+
+std::string ToString(ColorSpace color_space) {
+  switch (color_space) {
+    case ColorSpace::kRGB:
+      return "RGB";
+    case ColorSpace::kGray:
+      return "Gra";
+    case ColorSpace::kXYB:
+      return "XYB";
+    case ColorSpace::kUnknown:
+      return "CS?";
+  }
+  // Should not happen - visitor fails if enum is invalid.
+  JXL_ABORT("Invalid ColorSpace %u", static_cast<uint32_t>(color_space));
+}
+
+std::string ToString(WhitePoint white_point) {
+  switch (white_point) {
+    case WhitePoint::kD65:
+      return "D65";
+    case WhitePoint::kCustom:
+      return "Cst";
+    case WhitePoint::kE:
+      return "EER";
+    case WhitePoint::kDCI:
+      return "DCI";
+  }
+  // Should not happen - visitor fails if enum is invalid.
+  JXL_ABORT("Invalid WhitePoint %u", static_cast<uint32_t>(white_point));
+}
+
+std::string ToString(Primaries primaries) {
+  switch (primaries) {
+    case Primaries::kSRGB:
+      return "SRG";
+    case Primaries::k2100:
+      return "202";
+    case Primaries::kP3:
+      return "DCI";
+    case Primaries::kCustom:
+      return "Cst";
+  }
+  // Should not happen - visitor fails if enum is invalid.
+  JXL_ABORT("Invalid Primaries %u", static_cast<uint32_t>(primaries));
+}
+
+std::string ToString(TransferFunction transfer_function) {
+  switch (transfer_function) {
+    case TransferFunction::kSRGB:
+      return "SRG";
+    case TransferFunction::kLinear:
+      return "Lin";
+    case TransferFunction::k709:
+      return "709";
+    case TransferFunction::kPQ:
+      return "PeQ";
+    case TransferFunction::kHLG:
+      return "HLG";
+    case TransferFunction::kDCI:
+      return "DCI";
+    case TransferFunction::kUnknown:
+      return "TF?";
+  }
+  // Should not happen - visitor fails if enum is invalid.
+  JXL_ABORT("Invalid TransferFunction %u",
+            static_cast<uint32_t>(transfer_function));
+}
+
+std::string ToString(RenderingIntent rendering_intent) {
+  switch (rendering_intent) {
+    case RenderingIntent::kPerceptual:
+      return "Per";
+    case RenderingIntent::kRelative:
+      return "Rel";
+    case RenderingIntent::kSaturation:
+      return "Sat";
+    case RenderingIntent::kAbsolute:
+      return "Abs";
+  }
+  // Should not happen - visitor fails if enum is invalid.
+  JXL_ABORT("Invalid RenderingIntent %u",
+            static_cast<uint32_t>(rendering_intent));
+}
+
+static double F64FromCustomxyI32(const int32_t i) { return i * 1E-6; }
+static Status F64ToCustomxyI32(const double f, int32_t* JXL_RESTRICT i) {
+  if (!(-4 <= f && f <= 4)) {
+    return JXL_FAILURE("F64 out of bounds for CustomxyI32");
+  }
+  *i = static_cast<int32_t>(roundf(f * 1E6));
+  return true;
+}
+
+Status ConvertExternalToInternalWhitePoint(const JxlWhitePoint external,
+                                           WhitePoint* internal) {
+  switch (external) {
+    case JXL_WHITE_POINT_D65:
+      *internal = WhitePoint::kD65;
+      return true;
+    case JXL_WHITE_POINT_CUSTOM:
+      *internal = WhitePoint::kCustom;
+      return true;
+    case JXL_WHITE_POINT_E:
+      *internal = WhitePoint::kE;
+      return true;
+    case JXL_WHITE_POINT_DCI:
+      *internal = WhitePoint::kDCI;
+      return true;
+  }
+  return JXL_FAILURE("Invalid WhitePoint enum value");
+}
+
+Status ConvertExternalToInternalPrimaries(const JxlPrimaries external,
+                                          Primaries* internal) {
+  switch (external) {
+    case JXL_PRIMARIES_SRGB:
+      *internal = Primaries::kSRGB;
+      return true;
+    case JXL_PRIMARIES_CUSTOM:
+      *internal = Primaries::kCustom;
+      return true;
+    case JXL_PRIMARIES_2100:
+      *internal = Primaries::k2100;
+      return true;
+    case JXL_PRIMARIES_P3:
+      *internal = Primaries::kP3;
+      return true;
+  }
+  return JXL_FAILURE("Invalid Primaries enum value");
+}
+
+Status ConvertExternalToInternalTransferFunction(
+    const JxlTransferFunction external, TransferFunction* internal) {
+  switch (external) {
+    case JXL_TRANSFER_FUNCTION_709:
+      *internal = TransferFunction::k709;
+      return true;
+    case JXL_TRANSFER_FUNCTION_UNKNOWN:
+      *internal = TransferFunction::kUnknown;
+      return true;
+    case JXL_TRANSFER_FUNCTION_LINEAR:
+      *internal = TransferFunction::kLinear;
+      return true;
+    case JXL_TRANSFER_FUNCTION_SRGB:
+      *internal = TransferFunction::kSRGB;
+      return true;
+    case JXL_TRANSFER_FUNCTION_PQ:
+      *internal = TransferFunction::kPQ;
+      return true;
+    case JXL_TRANSFER_FUNCTION_DCI:
+      *internal = TransferFunction::kDCI;
+      return true;
+    case JXL_TRANSFER_FUNCTION_HLG:
+      *internal = TransferFunction::kHLG;
+      return true;
+    case JXL_TRANSFER_FUNCTION_GAMMA:
+      return JXL_FAILURE("Gamma should be handled separately");
+  }
+  return JXL_FAILURE("Invalid TransferFunction enum value");
+}
+
+Status ConvertExternalToInternalRenderingIntent(
+    const JxlRenderingIntent external, RenderingIntent* internal) {
+  switch (external) {
+    case JXL_RENDERING_INTENT_PERCEPTUAL:
+      *internal = RenderingIntent::kPerceptual;
+      return true;
+    case JXL_RENDERING_INTENT_RELATIVE:
+      *internal = RenderingIntent::kRelative;
+      return true;
+    case JXL_RENDERING_INTENT_SATURATION:
+      *internal = RenderingIntent::kSaturation;
+      return true;
+    case JXL_RENDERING_INTENT_ABSOLUTE:
+      *internal = RenderingIntent::kAbsolute;
+      return true;
+  }
+  return JXL_FAILURE("Invalid RenderingIntent enum value");
+}
+
+}  // namespace
+
+CIExy Customxy::Get() const {
+  CIExy xy;
+  xy.x = F64FromCustomxyI32(x);
+  xy.y = F64FromCustomxyI32(y);
+  return xy;
+}
+
+Status Customxy::Set(const CIExy& xy) {
+  JXL_RETURN_IF_ERROR(F64ToCustomxyI32(xy.x, &x));
+  JXL_RETURN_IF_ERROR(F64ToCustomxyI32(xy.y, &y));
+  size_t extension_bits, total_bits;
+  if (!Bundle::CanEncode(*this, &extension_bits, &total_bits)) {
+    return JXL_FAILURE("Unable to encode XY %f %f", xy.x, xy.y);
+  }
+  return true;
+}
+
+bool CustomTransferFunction::SetImplicit() {
+  if (nonserialized_color_space == ColorSpace::kXYB) {
+    if (!SetGamma(1.0 / 3)) JXL_ASSERT(false);
+    return true;
+  }
+  return false;
+}
+
+Status CustomTransferFunction::SetGamma(double gamma) {
+  if (gamma < (1.0f / kMaxGamma) || gamma > 1.0) {
+    return JXL_FAILURE("Invalid gamma %f", gamma);
+  }
+
+  have_gamma_ = false;
+  if (ApproxEq(gamma, 1.0)) {
+    transfer_function_ = TransferFunction::kLinear;
+    return true;
+  }
+  if (ApproxEq(gamma, 1.0 / 2.6)) {
+    transfer_function_ = TransferFunction::kDCI;
+    return true;
+  }
+  // Don't translate 0.45.. to kSRGB nor k709 - that might change pixel
+  // values because those curves also have a linear part.
+
+  have_gamma_ = true;
+  gamma_ = roundf(gamma * kGammaMul);
+  transfer_function_ = TransferFunction::kUnknown;
+  return true;
+}
+
+namespace {
+
+std::array<ColorEncoding, 2> CreateC2(const Primaries pr,
+                                      const TransferFunction tf) {
+  std::array<ColorEncoding, 2> c2;
+
+  {
+    ColorEncoding* c_rgb = c2.data() + 0;
+    c_rgb->SetColorSpace(ColorSpace::kRGB);
+    c_rgb->white_point = WhitePoint::kD65;
+    c_rgb->primaries = pr;
+    c_rgb->tf.SetTransferFunction(tf);
+    JXL_CHECK(c_rgb->CreateICC());
+  }
+
+  {
+    ColorEncoding* c_gray = c2.data() + 1;
+    c_gray->SetColorSpace(ColorSpace::kGray);
+    c_gray->white_point = WhitePoint::kD65;
+    c_gray->primaries = pr;
+    c_gray->tf.SetTransferFunction(tf);
+    JXL_CHECK(c_gray->CreateICC());
+  }
+
+  return c2;
+}
+
+}  // namespace
+
+const ColorEncoding& ColorEncoding::SRGB(bool is_gray) {
+  static std::array<ColorEncoding, 2> c2 =
+      CreateC2(Primaries::kSRGB, TransferFunction::kSRGB);
+  return c2[is_gray];
+}
+const ColorEncoding& ColorEncoding::LinearSRGB(bool is_gray) {
+  static std::array<ColorEncoding, 2> c2 =
+      CreateC2(Primaries::kSRGB, TransferFunction::kLinear);
+  return c2[is_gray];
+}
+
+CIExy ColorEncoding::GetWhitePoint() const {
+  JXL_DASSERT(have_fields_);
+  CIExy xy;
+  switch (white_point) {
+    case WhitePoint::kCustom:
+      return white_.Get();
+
+    case WhitePoint::kD65:
+      xy.x = 0.3127;
+      xy.y = 0.3290;
+      return xy;
+
+    case WhitePoint::kDCI:
+      // From https://ieeexplore.ieee.org/document/7290729 C.2 page 11
+      xy.x = 0.314;
+      xy.y = 0.351;
+      return xy;
+
+    case WhitePoint::kE:
+      xy.x = xy.y = 1.0 / 3;
+      return xy;
+  }
+  JXL_ABORT("Invalid WhitePoint %u", static_cast<uint32_t>(white_point));
+}
+
+Status ColorEncoding::SetWhitePoint(const CIExy& xy) {
+  JXL_DASSERT(have_fields_);
+  if (xy.x == 0.0 || xy.y == 0.0) {
+    return JXL_FAILURE("Invalid white point %f %f", xy.x, xy.y);
+  }
+  if (ApproxEq(xy.x, 0.3127) && ApproxEq(xy.y, 0.3290)) {
+    white_point = WhitePoint::kD65;
+    return true;
+  }
+  if (ApproxEq(xy.x, 1.0 / 3) && ApproxEq(xy.y, 1.0 / 3)) {
+    white_point = WhitePoint::kE;
+    return true;
+  }
+  if (ApproxEq(xy.x, 0.314) && ApproxEq(xy.y, 0.351)) {
+    white_point = WhitePoint::kDCI;
+    return true;
+  }
+  white_point = WhitePoint::kCustom;
+  return white_.Set(xy);
+}
+
+PrimariesCIExy ColorEncoding::GetPrimaries() const {
+  JXL_DASSERT(have_fields_);
+  JXL_ASSERT(HasPrimaries());
+  PrimariesCIExy xy;
+  switch (primaries) {
+    case Primaries::kCustom:
+      xy.r = red_.Get();
+      xy.g = green_.Get();
+      xy.b = blue_.Get();
+      return xy;
+
+    case Primaries::kSRGB:
+      xy.r.x = 0.639998686;
+      xy.r.y = 0.330010138;
+      xy.g.x = 0.300003784;
+      xy.g.y = 0.600003357;
+      xy.b.x = 0.150002046;
+      xy.b.y = 0.059997204;
+      return xy;
+
+    case Primaries::k2100:
+      xy.r.x = 0.708;
+      xy.r.y = 0.292;
+      xy.g.x = 0.170;
+      xy.g.y = 0.797;
+      xy.b.x = 0.131;
+      xy.b.y = 0.046;
+      return xy;
+
+    case Primaries::kP3:
+      xy.r.x = 0.680;
+      xy.r.y = 0.320;
+      xy.g.x = 0.265;
+      xy.g.y = 0.690;
+      xy.b.x = 0.150;
+      xy.b.y = 0.060;
+      return xy;
+  }
+  JXL_ABORT("Invalid Primaries %u", static_cast<uint32_t>(primaries));
+}
+
+Status ColorEncoding::SetPrimaries(const PrimariesCIExy& xy) {
+  JXL_DASSERT(have_fields_);
+  JXL_ASSERT(HasPrimaries());
+  if (xy.r.x == 0.0 || xy.r.y == 0.0 || xy.g.x == 0.0 || xy.g.y == 0.0 ||
+      xy.b.x == 0.0 || xy.b.y == 0.0) {
+    return JXL_FAILURE("Invalid primaries %f %f %f %f %f %f", xy.r.x, xy.r.y,
+                       xy.g.x, xy.g.y, xy.b.x, xy.b.y);
+  }
+
+  if (ApproxEq(xy.r.x, 0.64) && ApproxEq(xy.r.y, 0.33) &&
+      ApproxEq(xy.g.x, 0.30) && ApproxEq(xy.g.y, 0.60) &&
+      ApproxEq(xy.b.x, 0.15) && ApproxEq(xy.b.y, 0.06)) {
+    primaries = Primaries::kSRGB;
+    return true;
+  }
+
+  if (ApproxEq(xy.r.x, 0.708) && ApproxEq(xy.r.y, 0.292) &&
+      ApproxEq(xy.g.x, 0.170) && ApproxEq(xy.g.y, 0.797) &&
+      ApproxEq(xy.b.x, 0.131) && ApproxEq(xy.b.y, 0.046)) {
+    primaries = Primaries::k2100;
+    return true;
+  }
+  if (ApproxEq(xy.r.x, 0.680) && ApproxEq(xy.r.y, 0.320) &&
+      ApproxEq(xy.g.x, 0.265) && ApproxEq(xy.g.y, 0.690) &&
+      ApproxEq(xy.b.x, 0.150) && ApproxEq(xy.b.y, 0.060)) {
+    primaries = Primaries::kP3;
+    return true;
+  }
+
+  primaries = Primaries::kCustom;
+  JXL_RETURN_IF_ERROR(red_.Set(xy.r));
+  JXL_RETURN_IF_ERROR(green_.Set(xy.g));
+  JXL_RETURN_IF_ERROR(blue_.Set(xy.b));
+  return true;
+}
+
+Status ColorEncoding::CreateICC() {
+  InternalRemoveICC();
+  if (!MaybeCreateProfile(*this, &icc_)) {
+    return JXL_FAILURE("Failed to create profile from fields");
+  }
+  return true;
+}
+
+std::string Description(const ColorEncoding& c_in) {
+  // Copy required for Implicit*
+  ColorEncoding c = c_in;
+
+  std::string d = ToString(c.GetColorSpace());
+
+  if (!c.ImplicitWhitePoint()) {
+    d += '_';
+    if (c.white_point == WhitePoint::kCustom) {
+      const CIExy wp = c.GetWhitePoint();
+      d += ToString(wp.x) + ';';
+      d += ToString(wp.y);
+    } else {
+      d += ToString(c.white_point);
+    }
+  }
+
+  if (c.HasPrimaries()) {
+    d += '_';
+    if (c.primaries == Primaries::kCustom) {
+      const PrimariesCIExy pr = c.GetPrimaries();
+      d += ToString(pr.r.x) + ';';
+      d += ToString(pr.r.y) + ';';
+      d += ToString(pr.g.x) + ';';
+      d += ToString(pr.g.y) + ';';
+      d += ToString(pr.b.x) + ';';
+      d += ToString(pr.b.y);
+    } else {
+      d += ToString(c.primaries);
+    }
+  }
+
+  d += '_';
+  d += ToString(c.rendering_intent);
+
+  if (!c.tf.SetImplicit()) {
+    d += '_';
+    if (c.tf.IsGamma()) {
+      d += 'g';
+      d += ToString(c.tf.GetGamma());
+    } else {
+      d += ToString(c.tf.GetTransferFunction());
+    }
+  }
+
+  return d;
+}
+
+Customxy::Customxy() { Bundle::Init(this); }
+Status Customxy::VisitFields(Visitor* JXL_RESTRICT visitor) {
+  uint32_t ux = PackSigned(x);
+  JXL_QUIET_RETURN_IF_ERROR(visitor->U32(Bits(19), BitsOffset(19, 524288),
+                                         BitsOffset(20, 1048576),
+                                         BitsOffset(21, 2097152), 0, &ux));
+  x = UnpackSigned(ux);
+  uint32_t uy = PackSigned(y);
+  JXL_QUIET_RETURN_IF_ERROR(visitor->U32(Bits(19), BitsOffset(19, 524288),
+                                         BitsOffset(20, 1048576),
+                                         BitsOffset(21, 2097152), 0, &uy));
+  y = UnpackSigned(uy);
+  return true;
+}
+
+CustomTransferFunction::CustomTransferFunction() { Bundle::Init(this); }
+Status CustomTransferFunction::VisitFields(Visitor* JXL_RESTRICT visitor) {
+  if (visitor->Conditional(!SetImplicit())) {
+    JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &have_gamma_));
+
+    if (visitor->Conditional(have_gamma_)) {
+      // Gamma is represented as a 24-bit int, the exponent used is
+      // gamma_ / 1e7. Valid values are (0, 1]. On the low end side, we also
+      // limit it to kMaxGamma/1e7.
+      JXL_QUIET_RETURN_IF_ERROR(visitor->Bits(24, kGammaMul, &gamma_));
+      if (gamma_ > kGammaMul ||
+          static_cast<uint64_t>(gamma_) * kMaxGamma < kGammaMul) {
+        return JXL_FAILURE("Invalid gamma %u", gamma_);
+      }
+    }
+
+    if (visitor->Conditional(!have_gamma_)) {
+      JXL_QUIET_RETURN_IF_ERROR(
+          visitor->Enum(TransferFunction::kSRGB, &transfer_function_));
+    }
+  }
+
+  return true;
+}
+
+ColorEncoding::ColorEncoding() { Bundle::Init(this); }
+Status ColorEncoding::VisitFields(Visitor* JXL_RESTRICT visitor) {
+  if (visitor->AllDefault(*this, &all_default)) {
+    // Overwrite all serialized fields, but not any nonserialized_*.
+    visitor->SetDefault(this);
+    return true;
+  }
+
+  JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &want_icc_));
+
+  // Always send even if want_icc_ because this affects decoding.
+  // We can skip the white point/primaries because they do not.
+  JXL_QUIET_RETURN_IF_ERROR(visitor->Enum(ColorSpace::kRGB, &color_space_));
+
+  if (visitor->Conditional(!WantICC())) {
+    // Serialize enums. NOTE: we set the defaults to the most common values so
+    // ImageMetadata.all_default is true in the common case.
+
+    if (visitor->Conditional(!ImplicitWhitePoint())) {
+      JXL_QUIET_RETURN_IF_ERROR(visitor->Enum(WhitePoint::kD65, &white_point));
+      if (visitor->Conditional(white_point == WhitePoint::kCustom)) {
+        JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&white_));
+      }
+    }
+
+    if (visitor->Conditional(HasPrimaries())) {
+      JXL_QUIET_RETURN_IF_ERROR(visitor->Enum(Primaries::kSRGB, &primaries));
+      if (visitor->Conditional(primaries == Primaries::kCustom)) {
+        JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&red_));
+        JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&green_));
+        JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&blue_));
+      }
+    }
+
+    JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&tf));
+
+    JXL_QUIET_RETURN_IF_ERROR(
+        visitor->Enum(RenderingIntent::kRelative, &rendering_intent));
+
+    // We didn't have ICC, so all fields should be known.
+    if (color_space_ == ColorSpace::kUnknown || tf.IsUnknown()) {
+      return JXL_FAILURE(
+          "No ICC but cs %u and tf %u%s",
+          static_cast<unsigned int>(color_space_),
+          tf.IsGamma() ? 0
+                       : static_cast<unsigned int>(tf.GetTransferFunction()),
+          tf.IsGamma() ? "(gamma)" : "");
+    }
+
+    JXL_RETURN_IF_ERROR(CreateICC());
+  }
+
+  if (WantICC() && visitor->IsReading()) {
+    // Haven't called SetICC() yet, do nothing.
+  } else {
+    if (ICC().empty()) return JXL_FAILURE("Empty ICC");
+  }
+
+  return true;
+}
+
+void ConvertInternalToExternalColorEncoding(const ColorEncoding& internal,
+                                            JxlColorEncoding* external) {
+  external->color_space = static_cast<JxlColorSpace>(internal.GetColorSpace());
+
+  external->white_point = static_cast<JxlWhitePoint>(internal.white_point);
+
+  jxl::CIExy whitepoint = internal.GetWhitePoint();
+  external->white_point_xy[0] = whitepoint.x;
+  external->white_point_xy[1] = whitepoint.y;
+
+  if (external->color_space == JXL_COLOR_SPACE_RGB ||
+      external->color_space == JXL_COLOR_SPACE_UNKNOWN) {
+    external->primaries = static_cast<JxlPrimaries>(internal.primaries);
+    jxl::PrimariesCIExy primaries = internal.GetPrimaries();
+    external->primaries_red_xy[0] = primaries.r.x;
+    external->primaries_red_xy[1] = primaries.r.y;
+    external->primaries_green_xy[0] = primaries.g.x;
+    external->primaries_green_xy[1] = primaries.g.y;
+    external->primaries_blue_xy[0] = primaries.b.x;
+    external->primaries_blue_xy[1] = primaries.b.y;
+  }
+
+  if (internal.tf.IsGamma()) {
+    external->transfer_function = JXL_TRANSFER_FUNCTION_GAMMA;
+    external->gamma = internal.tf.GetGamma();
+  } else {
+    external->transfer_function =
+        static_cast<JxlTransferFunction>(internal.tf.GetTransferFunction());
+    external->gamma = 0;
+  }
+
+  external->rendering_intent =
+      static_cast<JxlRenderingIntent>(internal.rendering_intent);
+}
+
+Status ConvertExternalToInternalColorEncoding(const JxlColorEncoding& external,
+                                              ColorEncoding* internal) {
+  internal->SetColorSpace(static_cast<ColorSpace>(external.color_space));
+
+  JXL_RETURN_IF_ERROR(ConvertExternalToInternalWhitePoint(
+      external.white_point, &internal->white_point));
+  if (external.white_point == JXL_WHITE_POINT_CUSTOM) {
+    CIExy wp;
+    wp.x = external.white_point_xy[0];
+    wp.y = external.white_point_xy[1];
+    JXL_RETURN_IF_ERROR(internal->SetWhitePoint(wp));
+  }
+
+  if (external.color_space == JXL_COLOR_SPACE_RGB ||
+      external.color_space == JXL_COLOR_SPACE_UNKNOWN) {
+    JXL_RETURN_IF_ERROR(ConvertExternalToInternalPrimaries(
+        external.primaries, &internal->primaries));
+    if (external.primaries == JXL_PRIMARIES_CUSTOM) {
+      PrimariesCIExy primaries;
+      primaries.r.x = external.primaries_red_xy[0];
+      primaries.r.y = external.primaries_red_xy[1];
+      primaries.g.x = external.primaries_green_xy[0];
+      primaries.g.y = external.primaries_green_xy[1];
+      primaries.b.x = external.primaries_blue_xy[0];
+      primaries.b.y = external.primaries_blue_xy[1];
+      JXL_RETURN_IF_ERROR(internal->SetPrimaries(primaries));
+    }
+  }
+  CustomTransferFunction tf;
+  tf.nonserialized_color_space = internal->GetColorSpace();
+  if (external.transfer_function == JXL_TRANSFER_FUNCTION_GAMMA) {
+    JXL_RETURN_IF_ERROR(tf.SetGamma(external.gamma));
+  } else {
+    TransferFunction tf_enum;
+    // JXL_TRANSFER_FUNCTION_GAMMA is not handled by this function since there's
+    // no internal enum value for it.
+    JXL_RETURN_IF_ERROR(ConvertExternalToInternalTransferFunction(
+        external.transfer_function, &tf_enum));
+    tf.SetTransferFunction(tf_enum);
+  }
+  internal->tf = tf;
+
+  JXL_RETURN_IF_ERROR(ConvertExternalToInternalRenderingIntent(
+      external.rendering_intent, &internal->rendering_intent));
+
+  // The ColorEncoding caches an ICC profile it created earlier that may no
+  // longer match the profile with the changed fields, so re-create it.
+  if (!(internal->CreateICC())) {
+    // This is not an error: for example, it doesn't have ICC profile creation
+    // implemented for XYB. This should not be returned as error, since
+    // ConvertExternalToInternalColorEncoding still worked correctly, and what
+    // matters is that internal->ICC() will not return the wrong profile.
+  }
+
+  return true;
+}
+
+/* Chromatic adaptation matrices*/
+static const float kBradford[9] = {
+    0.8951f, 0.2664f, -0.1614f, -0.7502f, 1.7135f,
+    0.0367f, 0.0389f, -0.0685f, 1.0296f,
+};
+
+static const float kBradfordInv[9] = {
+    0.9869929f, -0.1470543f, 0.1599627f, 0.4323053f, 0.5183603f,
+    0.0492912f, -0.0085287f, 0.0400428f, 0.9684867f,
+};
+
+// Adapts whitepoint x, y to D50
+Status AdaptToXYZD50(float wx, float wy, float matrix[9]) {
+  if (wx < 0 || wx > 1 || wy <= 0 || wy > 1) {
+    // Out of range values can cause division through zero
+    // further down with the bradford adaptation too.
+    return JXL_FAILURE("Invalid white point");
+  }
+  float w[3] = {wx / wy, 1.0f, (1.0f - wx - wy) / wy};
+  // 1 / tiny float can still overflow
+  JXL_RETURN_IF_ERROR(std::isfinite(w[0]) && std::isfinite(w[2]));
+  float w50[3] = {0.96422f, 1.0f, 0.82521f};
+
+  float lms[3];
+  float lms50[3];
+
+  Mul3x3Vector(kBradford, w, lms);
+  Mul3x3Vector(kBradford, w50, lms50);
+
+  if (lms[0] == 0 || lms[1] == 0 || lms[2] == 0) {
+    return JXL_FAILURE("Invalid white point");
+  }
+  float a[9] = {
+      //       /----> 0, 1, 2, 3,          /----> 4, 5, 6, 7,          /----> 8,
+      lms50[0] / lms[0], 0, 0, 0, lms50[1] / lms[1], 0, 0, 0, lms50[2] / lms[2],
+  };
+  if (!std::isfinite(a[0]) || !std::isfinite(a[4]) || !std::isfinite(a[8])) {
+    return JXL_FAILURE("Invalid white point");
+  }
+
+  float b[9];
+  Mul3x3Matrix(a, kBradford, b);
+  Mul3x3Matrix(kBradfordInv, b, matrix);
+
+  return true;
+}
+
+Status PrimariesToXYZ(float rx, float ry, float gx, float gy, float bx,
+                      float by, float wx, float wy, float matrix[9]) {
+  if (wx < 0 || wx > 1 || wy <= 0 || wy > 1) {
+    return JXL_FAILURE("Invalid white point");
+  }
+  // TODO(lode): also require rx, ry, gx, gy, bx, to be in range 0-1? ICC
+  // profiles in theory forbid negative XYZ values, but in practice the ACES P0
+  // color space uses a negative y for the blue primary.
+  float primaries[9] = {
+      rx, gx, bx, ry, gy, by, 1.0f - rx - ry, 1.0f - gx - gy, 1.0f - bx - by};
+  float primaries_inv[9];
+  memcpy(primaries_inv, primaries, sizeof(float) * 9);
+  JXL_RETURN_IF_ERROR(Inv3x3Matrix(primaries_inv));
+
+  float w[3] = {wx / wy, 1.0f, (1.0f - wx - wy) / wy};
+  // 1 / tiny float can still overflow
+  JXL_RETURN_IF_ERROR(std::isfinite(w[0]) && std::isfinite(w[2]));
+  float xyz[3];
+  Mul3x3Vector(primaries_inv, w, xyz);
+
+  float a[9] = {
+      xyz[0], 0, 0, 0, xyz[1], 0, 0, 0, xyz[2],
+  };
+
+  Mul3x3Matrix(primaries, a, matrix);
+  return true;
+}
+
+Status PrimariesToXYZD50(float rx, float ry, float gx, float gy, float bx,
+                         float by, float wx, float wy, float matrix[9]) {
+  float toXYZ[9];
+  JXL_RETURN_IF_ERROR(PrimariesToXYZ(rx, ry, gx, gy, bx, by, wx, wy, toXYZ));
+  float d50[9];
+  JXL_RETURN_IF_ERROR(AdaptToXYZD50(wx, wy, d50));
+
+  Mul3x3Matrix(d50, toXYZ, matrix);
+  return true;
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/color_encoding_internal.h b/third_party/jpeg-xl/lib/jxl/color_encoding_internal.h
new file mode 100644
index 0000000000..713f216538
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/color_encoding_internal.h
@@ -0,0 +1,463 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_COLOR_ENCODING_INTERNAL_H_
+#define LIB_JXL_COLOR_ENCODING_INTERNAL_H_
+
+// Metadata for color space conversions.
+
+#include <jxl/color_encoding.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+#include <cmath>  // std::abs
+#include <ostream>
+#include <string>
+#include <vector>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/padded_bytes.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/field_encodings.h"
+
+namespace jxl {
+
+// (All CIE units are for the standard 1931 2 degree observer)
+
+// Color space the color pixel data is encoded in. The color pixel data is
+// 3-channel in all cases except in case of kGray, where it uses only 1 channel.
+// This also determines the amount of channels used in modular encoding.
+enum class ColorSpace : uint32_t {
+  // Trichromatic color data. This also includes CMYK if a kBlack
+  // ExtraChannelInfo is present. This implies, if there is an ICC profile, that
+  // the ICC profile uses a 3-channel color space if no kBlack extra channel is
+  // present, or uses color space 'CMYK' if a kBlack extra channel is present.
+  kRGB,
+  // Single-channel data. This implies, if there is an ICC profile, that the ICC
+  // profile also represents single-channel data and has the appropriate color
+  // space ('GRAY').
+  kGray,
+  // Like kRGB, but implies fixed values for primaries etc.
+  kXYB,
+  // For non-RGB/gray data, e.g. from non-electro-optical sensors. Otherwise
+  // the same conditions as kRGB apply.
+  kUnknown
+};
+
+static inline const char* EnumName(ColorSpace /*unused*/) {
+  return "ColorSpace";
+}
+static inline constexpr uint64_t EnumBits(ColorSpace /*unused*/) {
+  using CS = ColorSpace;
+  return MakeBit(CS::kRGB) | MakeBit(CS::kGray) | MakeBit(CS::kXYB) |
+         MakeBit(CS::kUnknown);
+}
+
+// Values from CICP ColourPrimaries.
+enum class WhitePoint : uint32_t {
+  kD65 = 1,     // sRGB/BT.709/Display P3/BT.2020
+  kCustom = 2,  // Actual values encoded in separate fields
+  kE = 10,      // XYZ
+  kDCI = 11,    // DCI-P3
+};
+
+static inline const char* EnumName(WhitePoint /*unused*/) {
+  return "WhitePoint";
+}
+static inline constexpr uint64_t EnumBits(WhitePoint /*unused*/) {
+  return MakeBit(WhitePoint::kD65) | MakeBit(WhitePoint::kCustom) |
+         MakeBit(WhitePoint::kE) | MakeBit(WhitePoint::kDCI);
+}
+
+// Values from CICP ColourPrimaries
+enum class Primaries : uint32_t {
+  kSRGB = 1,    // Same as BT.709
+  kCustom = 2,  // Actual values encoded in separate fields
+  k2100 = 9,    // Same as BT.2020
+  kP3 = 11,
+};
+
+static inline const char* EnumName(Primaries /*unused*/) { return "Primaries"; }
+static inline constexpr uint64_t EnumBits(Primaries /*unused*/) {
+  using Pr = Primaries;
+  return MakeBit(Pr::kSRGB) | MakeBit(Pr::kCustom) | MakeBit(Pr::k2100) |
+         MakeBit(Pr::kP3);
+}
+
+// Values from CICP TransferCharacteristics
+enum class TransferFunction : uint32_t {
+  k709 = 1,
+  kUnknown = 2,
+  kLinear = 8,
+  kSRGB = 13,
+  kPQ = 16,   // from BT.2100
+  kDCI = 17,  // from SMPTE RP 431-2 reference projector
+  kHLG = 18,  // from BT.2100
+};
+
+static inline const char* EnumName(TransferFunction /*unused*/) {
+  return "TransferFunction";
+}
+static inline constexpr uint64_t EnumBits(TransferFunction /*unused*/) {
+  using TF = TransferFunction;
+  return MakeBit(TF::k709) | MakeBit(TF::kLinear) | MakeBit(TF::kSRGB) |
+         MakeBit(TF::kPQ) | MakeBit(TF::kDCI) | MakeBit(TF::kHLG) |
+         MakeBit(TF::kUnknown);
+}
+
+enum class RenderingIntent : uint32_t {
+  // Values match ICC sRGB encodings.
+  kPerceptual = 0,  // good for photos, requires a profile with LUT.
+  kRelative,        // good for logos.
+  kSaturation,      // perhaps useful for CG with fully saturated colors.
+  kAbsolute,        // leaves white point unchanged; good for proofing.
+};
+
+static inline const char* EnumName(RenderingIntent /*unused*/) {
+  return "RenderingIntent";
+}
+static inline constexpr uint64_t EnumBits(RenderingIntent /*unused*/) {
+  using RI = RenderingIntent;
+  return MakeBit(RI::kPerceptual) | MakeBit(RI::kRelative) |
+         MakeBit(RI::kSaturation) | MakeBit(RI::kAbsolute);
+}
+
+// Chromaticity (Y is omitted because it is 1 for primaries/white points)
+struct CIExy {
+  double x = 0.0;
+  double y = 0.0;
+};
+
+struct PrimariesCIExy {
+  CIExy r;
+  CIExy g;
+  CIExy b;
+};
+
+// Serializable form of CIExy.
+struct Customxy : public Fields {
+  Customxy();
+  JXL_FIELDS_NAME(Customxy)
+
+  Status VisitFields(Visitor* JXL_RESTRICT visitor) override;
+
+  CIExy Get() const;
+  // Returns false if x or y do not fit in the encoding.
+  Status Set(const CIExy& xy);
+
+  int32_t x;
+  int32_t y;
+};
+
+struct CustomTransferFunction : public Fields {
+  CustomTransferFunction();
+  JXL_FIELDS_NAME(CustomTransferFunction)
+
+  // Sets fields and returns true if nonserialized_color_space has an implicit
+  // transfer function, otherwise leaves fields unchanged and returns false.
+  bool SetImplicit();
+
+  // Gamma: only used for PNG inputs
+  bool IsGamma() const { return have_gamma_; }
+  double GetGamma() const {
+    JXL_ASSERT(IsGamma());
+    return gamma_ * 1E-7;  // (0, 1)
+  }
+  Status SetGamma(double gamma);
+
+  TransferFunction GetTransferFunction() const {
+    JXL_ASSERT(!IsGamma());
+    return transfer_function_;
+  }
+  void SetTransferFunction(const TransferFunction tf) {
+    have_gamma_ = false;
+    transfer_function_ = tf;
+  }
+
+  bool IsUnknown() const {
+    return !have_gamma_ && (transfer_function_ == TransferFunction::kUnknown);
+  }
+  bool IsSRGB() const {
+    return !have_gamma_ && (transfer_function_ == TransferFunction::kSRGB);
+  }
+  bool IsLinear() const {
+    return !have_gamma_ && (transfer_function_ == TransferFunction::kLinear);
+  }
+  bool IsPQ() const {
+    return !have_gamma_ && (transfer_function_ == TransferFunction::kPQ);
+  }
+  bool IsHLG() const {
+    return !have_gamma_ && (transfer_function_ == TransferFunction::kHLG);
+  }
+  bool Is709() const {
+    return !have_gamma_ && (transfer_function_ == TransferFunction::k709);
+  }
+  bool IsDCI() const {
+    return !have_gamma_ && (transfer_function_ == TransferFunction::kDCI);
+  }
+  bool IsSame(const CustomTransferFunction& other) const {
+    if (have_gamma_ != other.have_gamma_) return false;
+    if (have_gamma_) {
+      if (gamma_ != other.gamma_) return false;
+    } else {
+      if (transfer_function_ != other.transfer_function_) return false;
+    }
+    return true;
+  }
+
+  Status VisitFields(Visitor* JXL_RESTRICT visitor) override;
+
+  // Must be set before calling VisitFields!
+  ColorSpace nonserialized_color_space = ColorSpace::kRGB;
+
+ private:
+  static constexpr uint32_t kGammaMul = 10000000;
+
+  bool have_gamma_;
+
+  // OETF exponent to go from linear to gamma-compressed.
+  uint32_t gamma_;  // Only used if have_gamma_.
+
+  // Can be kUnknown.
+  TransferFunction transfer_function_;  // Only used if !have_gamma_.
+};
+
+// Compact encoding of data required to interpret and translate pixels to a
+// known color space. Stored in Metadata. Thread-compatible.
+struct ColorEncoding : public Fields {
+  ColorEncoding();
+  JXL_FIELDS_NAME(ColorEncoding)
+
+  // Returns ready-to-use color encodings (initialized on-demand).
+  static const ColorEncoding& SRGB(bool is_gray = false);
+  static const ColorEncoding& LinearSRGB(bool is_gray = false);
+
+  // Returns true if an ICC profile was successfully created from fields.
+  // Must be called after modifying fields. Defined in color_management.cc.
+  Status CreateICC();
+
+  // Returns non-empty and valid ICC profile, unless:
+  // - between calling InternalRemoveICC() and CreateICC() in tests;
+  // - WantICC() == true and SetICC() was not yet called;
+  // - after a failed call to SetSRGB(), SetICC(), or CreateICC().
+  const PaddedBytes& ICC() const { return icc_; }
+
+  // Internal only, do not call except from tests.
+  void InternalRemoveICC() { icc_.clear(); }
+
+  // Returns true if `icc` is assigned and decoded successfully. If so,
+  // subsequent WantICC() will return true until DecideIfWantICC() changes it.
+  // Returning false indicates data has been lost.
+  Status SetICC(PaddedBytes&& icc) {
+    if (icc.empty()) return false;
+    icc_ = std::move(icc);
+
+    if (!SetFieldsFromICC()) {
+      InternalRemoveICC();
+      return false;
+    }
+
+    want_icc_ = true;
+    return true;
+  }
+
+  // Sets the raw ICC profile bytes, without parsing the ICC, and without
+  // updating the direct fields such as whitepoint, primaries and color
+  // space. Functions to get and set fields, such as SetWhitePoint, cannot be
+  // used anymore after this and functions such as IsSRGB return false no matter
+  // what the contents of the icc profile.
+  Status SetICCRaw(PaddedBytes&& icc) {
+    if (icc.empty()) return false;
+    icc_ = std::move(icc);
+
+    want_icc_ = true;
+    have_fields_ = false;
+    return true;
+  }
+
+  // Returns whether to send the ICC profile in the codestream.
+  bool WantICC() const { return want_icc_; }
+
+  // Return whether the direct fields are set, if false but ICC is set, only
+  // raw ICC bytes are known.
+  bool HaveFields() const { return have_fields_; }
+
+  // Causes WantICC() to return false if ICC() can be reconstructed from fields.
+  // Defined in color_management.cc.
+  void DecideIfWantICC();
+
+  bool IsGray() const { return color_space_ == ColorSpace::kGray; }
+  bool IsCMYK() const { return cmyk_; }
+  size_t Channels() const { return IsGray() ? 1 : 3; }
+
+  // Returns false if the field is invalid and unusable.
+  bool HasPrimaries() const {
+    return !IsGray() && color_space_ != ColorSpace::kXYB;
+  }
+
+  // Returns true after setting the field to a value defined by color_space,
+  // otherwise false and leaves the field unchanged.
+  bool ImplicitWhitePoint() {
+    if (color_space_ == ColorSpace::kXYB) {
+      white_point = WhitePoint::kD65;
+      return true;
+    }
+    return false;
+  }
+
+  // Returns whether the color space is known to be sRGB. If a raw unparsed ICC
+  // profile is set without the fields being set, this returns false, even if
+  // the content of the ICC profile would match sRGB.
+  bool IsSRGB() const {
+    if (!have_fields_) return false;
+    if (!IsGray() && color_space_ != ColorSpace::kRGB) return false;
+    if (white_point != WhitePoint::kD65) return false;
+    if (primaries != Primaries::kSRGB) return false;
+    if (!tf.IsSRGB()) return false;
+    return true;
+  }
+
+  // Returns whether the color space is known to be linear sRGB. If a raw
+  // unparsed ICC profile is set without the fields being set, this returns
+  // false, even if the content of the ICC profile would match linear sRGB.
+  bool IsLinearSRGB() const {
+    if (!have_fields_) return false;
+    if (!IsGray() && color_space_ != ColorSpace::kRGB) return false;
+    if (white_point != WhitePoint::kD65) return false;
+    if (primaries != Primaries::kSRGB) return false;
+    if (!tf.IsLinear()) return false;
+    return true;
+  }
+
+  Status SetSRGB(const ColorSpace cs,
+                 const RenderingIntent ri = RenderingIntent::kRelative) {
+    InternalRemoveICC();
+    JXL_ASSERT(cs == ColorSpace::kGray || cs == ColorSpace::kRGB);
+    color_space_ = cs;
+    white_point = WhitePoint::kD65;
+    primaries = Primaries::kSRGB;
+    tf.SetTransferFunction(TransferFunction::kSRGB);
+    rendering_intent = ri;
+    return CreateICC();
+  }
+
+  Status VisitFields(Visitor* JXL_RESTRICT visitor) override;
+
+  // Accessors ensure tf.nonserialized_color_space is updated at the same time.
+  ColorSpace GetColorSpace() const { return color_space_; }
+  void SetColorSpace(const ColorSpace cs) {
+    color_space_ = cs;
+    tf.nonserialized_color_space = cs;
+  }
+
+  CIExy GetWhitePoint() const;
+  Status SetWhitePoint(const CIExy& xy);
+
+  PrimariesCIExy GetPrimaries() const;
+  Status SetPrimaries(const PrimariesCIExy& xy);
+
+  // Checks if the color spaces (including white point / primaries) are the
+  // same, but ignores the transfer function, rendering intent and ICC bytes.
+  bool SameColorSpace(const ColorEncoding& other) const {
+    if (color_space_ != other.color_space_) return false;
+
+    if (white_point != other.white_point) return false;
+    if (white_point == WhitePoint::kCustom) {
+      if (white_.x != other.white_.x || white_.y != other.white_.y)
+        return false;
+    }
+
+    if (HasPrimaries() != other.HasPrimaries()) return false;
+    if (HasPrimaries()) {
+      if (primaries != other.primaries) return false;
+      if (primaries == Primaries::kCustom) {
+        if (red_.x != other.red_.x || red_.y != other.red_.y) return false;
+        if (green_.x != other.green_.x || green_.y != other.green_.y)
+          return false;
+        if (blue_.x != other.blue_.x || blue_.y != other.blue_.y) return false;
+      }
+    }
+    return true;
+  }
+
+  // Checks if the color space and transfer function are the same, ignoring
+  // rendering intent and ICC bytes
+  bool SameColorEncoding(const ColorEncoding& other) const {
+    return SameColorSpace(other) && tf.IsSame(other.tf);
+  }
+
+  mutable bool all_default;
+
+  // Only valid if HaveFields()
+  WhitePoint white_point;
+  Primaries primaries;  // Only valid if HasPrimaries()
+  CustomTransferFunction tf;
+  RenderingIntent rendering_intent;
+
+ private:
+  // Returns true if all fields have been initialized (possibly to kUnknown).
+  // Returns false if the ICC profile is invalid or decoding it fails.
+  // Defined in enc_color_management.cc.
+  Status SetFieldsFromICC();
+
+  // If true, the codestream contains an ICC profile and we do not serialize
+  // fields. Otherwise, fields are serialized and we create an ICC profile.
+  bool want_icc_;
+
+  // When false, fields such as white_point and tf are invalid and must not be
+  // used. This occurs after setting a raw bytes-only ICC profile, only the
+  // ICC bytes may be used. The color_space_ field is still valid.
+  bool have_fields_ = true;
+
+  PaddedBytes icc_;  // Valid ICC profile
+
+  ColorSpace color_space_;  // Can be kUnknown
+  bool cmyk_ = false;
+
+  // Only used if white_point == kCustom.
+  Customxy white_;
+
+  // Only used if primaries == kCustom.
+  Customxy red_;
+  Customxy green_;
+  Customxy blue_;
+};
+
+// Returns whether the two inputs are approximately equal.
+static inline bool ApproxEq(const double a, const double b,
+#if JPEGXL_ENABLE_SKCMS
+                            double max_l1 = 1E-3) {
+#else
+                            double max_l1 = 8E-5) {
+#endif
+  // Threshold should be sufficient for ICC's 15-bit fixed-point numbers.
+  // We have seen differences of 7.1E-5 with lcms2 and 1E-3 with skcms.
+  return std::abs(a - b) <= max_l1;
+}
+
+// Returns a representation of the ColorEncoding fields (not icc).
+// Example description: "RGB_D65_SRG_Rel_Lin"
+std::string Description(const ColorEncoding& c);
+static inline std::ostream& operator<<(std::ostream& os,
+                                       const ColorEncoding& c) {
+  return os << Description(c);
+}
+
+void ConvertInternalToExternalColorEncoding(const jxl::ColorEncoding& internal,
+                                            JxlColorEncoding* external);
+
+Status ConvertExternalToInternalColorEncoding(const JxlColorEncoding& external,
+                                              jxl::ColorEncoding* internal);
+
+Status PrimariesToXYZ(float rx, float ry, float gx, float gy, float bx,
+                      float by, float wx, float wy, float matrix[9]);
+Status PrimariesToXYZD50(float rx, float ry, float gx, float gy, float bx,
+                         float by, float wx, float wy, float matrix[9]);
+Status AdaptToXYZD50(float wx, float wy, float matrix[9]);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_COLOR_ENCODING_INTERNAL_H_
diff --git a/third_party/jpeg-xl/lib/jxl/color_encoding_internal_test.cc b/third_party/jpeg-xl/lib/jxl/color_encoding_internal_test.cc
new file mode 100644
index 0000000000..6ad47e1923
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/color_encoding_internal_test.cc
@@ -0,0 +1,157 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/color_encoding_internal.h"
+
+#include <stdio.h>
+
+#include "lib/jxl/encode_internal.h"
+#include "lib/jxl/test_utils.h"
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+namespace {
+
+TEST(ColorEncodingTest, RoundTripAll) {
+  for (const test::ColorEncodingDescriptor& cdesc : test::AllEncodings()) {
+    const ColorEncoding c_original = test::ColorEncodingFromDescriptor(cdesc);
+    // Verify Set(Get) yields the same white point/primaries/gamma.
+    {
+      ColorEncoding c;
+      EXPECT_TRUE(c.SetWhitePoint(c_original.GetWhitePoint()));
+      EXPECT_EQ(c_original.white_point, c.white_point);
+    }
+    {
+      ColorEncoding c;
+      EXPECT_TRUE(c.SetPrimaries(c_original.GetPrimaries()));
+      EXPECT_EQ(c_original.primaries, c.primaries);
+    }
+    if (c_original.tf.IsGamma()) {
+      ColorEncoding c;
+      EXPECT_TRUE(c.tf.SetGamma(c_original.tf.GetGamma()));
+      EXPECT_TRUE(c_original.tf.IsSame(c.tf));
+    }
+  }
+}
+
+TEST(ColorEncodingTest, CustomWhitePoint) {
+  ColorEncoding c;
+  // Nonsensical values
+  CIExy xy_in;
+  xy_in.x = 0.8;
+  xy_in.y = 0.01;
+  EXPECT_TRUE(c.SetWhitePoint(xy_in));
+  const CIExy xy = c.GetWhitePoint();
+
+  ColorEncoding c2;
+  EXPECT_TRUE(c2.SetWhitePoint(xy));
+  EXPECT_TRUE(c.SameColorSpace(c2));
+}
+
+TEST(ColorEncodingTest, CustomPrimaries) {
+  ColorEncoding c;
+  PrimariesCIExy xy_in;
+  // Nonsensical values
+  xy_in.r.x = -0.01;
+  xy_in.r.y = 0.2;
+  xy_in.g.x = 0.4;
+  xy_in.g.y = 0.401;
+  xy_in.b.x = 1.1;
+  xy_in.b.y = -1.2;
+  EXPECT_TRUE(c.SetPrimaries(xy_in));
+  const PrimariesCIExy xy = c.GetPrimaries();
+
+  ColorEncoding c2;
+  EXPECT_TRUE(c2.SetPrimaries(xy));
+  EXPECT_TRUE(c.SameColorSpace(c2));
+}
+
+TEST(ColorEncodingTest, CustomGamma) {
+  ColorEncoding c;
+#ifndef JXL_CRASH_ON_ERROR
+  EXPECT_FALSE(c.tf.SetGamma(0.0));
+  EXPECT_FALSE(c.tf.SetGamma(-1E-6));
+  EXPECT_FALSE(c.tf.SetGamma(1.001));
+#endif
+  EXPECT_TRUE(c.tf.SetGamma(1.0));
+  EXPECT_FALSE(c.tf.IsGamma());
+  EXPECT_TRUE(c.tf.IsLinear());
+
+  EXPECT_TRUE(c.tf.SetGamma(0.123));
+  EXPECT_TRUE(c.tf.IsGamma());
+  const double gamma = c.tf.GetGamma();
+
+  ColorEncoding c2;
+  EXPECT_TRUE(c2.tf.SetGamma(gamma));
+  EXPECT_TRUE(c.SameColorEncoding(c2));
+  EXPECT_TRUE(c2.tf.IsGamma());
+}
+
+TEST(ColorEncodingTest, InternalExternalConversion) {
+  ColorEncoding source_internal;
+  JxlColorEncoding external;
+  ColorEncoding destination_internal;
+
+  for (int i = 0; i < 100; i++) {
+    source_internal.SetColorSpace(static_cast<ColorSpace>(rand() % 4));
+    CIExy wp;
+    wp.x = (float(rand()) / float((RAND_MAX)) * 0.5) + 0.25;
+    wp.y = (float(rand()) / float((RAND_MAX)) * 0.5) + 0.25;
+    EXPECT_TRUE(source_internal.SetWhitePoint(wp));
+    if (source_internal.HasPrimaries()) {
+      PrimariesCIExy primaries;
+      primaries.r.x = (float(rand()) / float((RAND_MAX)) * 0.5) + 0.25;
+      primaries.r.y = (float(rand()) / float((RAND_MAX)) * 0.5) + 0.25;
+      primaries.g.x = (float(rand()) / float((RAND_MAX)) * 0.5) + 0.25;
+      primaries.g.y = (float(rand()) / float((RAND_MAX)) * 0.5) + 0.25;
+      primaries.b.x = (float(rand()) / float((RAND_MAX)) * 0.5) + 0.25;
+      primaries.b.y = (float(rand()) / float((RAND_MAX)) * 0.5) + 0.25;
+      EXPECT_TRUE(source_internal.SetPrimaries(primaries));
+    }
+    CustomTransferFunction tf;
+    EXPECT_TRUE(tf.SetGamma((float(rand()) / float((RAND_MAX)) * 0.5) + 0.25));
+    source_internal.tf = tf;
+    source_internal.rendering_intent = static_cast<RenderingIntent>(rand() % 4);
+
+    ConvertInternalToExternalColorEncoding(source_internal, &external);
+    EXPECT_TRUE(ConvertExternalToInternalColorEncoding(external,
+                                                       &destination_internal));
+
+    EXPECT_EQ(source_internal.GetColorSpace(),
+              destination_internal.GetColorSpace());
+    EXPECT_EQ(source_internal.white_point, destination_internal.white_point);
+    EXPECT_EQ(source_internal.GetWhitePoint().x,
+              destination_internal.GetWhitePoint().x);
+    EXPECT_EQ(source_internal.GetWhitePoint().y,
+              destination_internal.GetWhitePoint().y);
+    if (source_internal.HasPrimaries()) {
+      EXPECT_EQ(source_internal.GetPrimaries().r.x,
+                destination_internal.GetPrimaries().r.x);
+      EXPECT_EQ(source_internal.GetPrimaries().r.y,
+                destination_internal.GetPrimaries().r.y);
+      EXPECT_EQ(source_internal.GetPrimaries().g.x,
+                destination_internal.GetPrimaries().g.x);
+      EXPECT_EQ(source_internal.GetPrimaries().g.y,
+                destination_internal.GetPrimaries().g.y);
+      EXPECT_EQ(source_internal.GetPrimaries().b.x,
+                destination_internal.GetPrimaries().b.x);
+      EXPECT_EQ(source_internal.GetPrimaries().b.y,
+                destination_internal.GetPrimaries().b.y);
+    }
+    EXPECT_EQ(source_internal.tf.IsGamma(), destination_internal.tf.IsGamma());
+    if (source_internal.tf.IsGamma()) {
+      EXPECT_EQ(source_internal.tf.GetGamma(),
+                destination_internal.tf.GetGamma());
+    } else {
+      EXPECT_EQ(source_internal.tf.GetTransferFunction(),
+                destination_internal.tf.GetTransferFunction());
+    }
+    EXPECT_EQ(source_internal.rendering_intent,
+              destination_internal.rendering_intent);
+  }
+}
+
+}  // namespace
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/color_management.cc b/third_party/jpeg-xl/lib/jxl/color_management.cc
new file mode 100644
index 0000000000..d656888a8b
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/color_management.cc
@@ -0,0 +1,682 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/color_management.h"
+
+#include <math.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <algorithm>
+#include <array>
+#include <atomic>
+#include <memory>
+#include <string>
+#include <utility>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/color_management.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/field_encodings.h"
+#include "lib/jxl/opsin_params.h"
+#include "lib/jxl/transfer_functions-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// NOTE: this is only used to provide a reasonable ICC profile that other
+// software can read. Our own transforms use ExtraTF instead because that is
+// more precise and supports unbounded mode.
+std::vector<uint16_t> CreateTableCurve(uint32_t N, const ExtraTF tf) {
+  JXL_ASSERT(N <= 4096);  // ICC MFT2 only allows 4K entries
+  JXL_ASSERT(tf == ExtraTF::kPQ || tf == ExtraTF::kHLG);
+  // No point using float - LCMS converts to 16-bit for A2B/MFT.
+  std::vector<uint16_t> table(N);
+  for (uint32_t i = 0; i < N; ++i) {
+    const float x = static_cast<float>(i) / (N - 1);  // 1.0 at index N - 1.
+    const double dx = static_cast<double>(x);
+    // LCMS requires EOTF (e.g. 2.4 exponent).
+    double y = (tf == ExtraTF::kHLG) ? TF_HLG().DisplayFromEncoded(dx)
+                                     : TF_PQ().DisplayFromEncoded(dx);
+    JXL_ASSERT(y >= 0.0);
+    // Clamp to table range - necessary for HLG.
+    if (y > 1.0) y = 1.0;
+    // 1.0 corresponds to table value 0xFFFF.
+    table[i] = static_cast<uint16_t>(roundf(y * 65535.0));
+  }
+  return table;
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+HWY_EXPORT(CreateTableCurve);  // Local function.
+
+Status CIEXYZFromWhiteCIExy(const CIExy& xy, float XYZ[3]) {
+  // Target Y = 1.
+  if (std::abs(xy.y) < 1e-12) return JXL_FAILURE("Y value is too small");
+  const float factor = 1 / xy.y;
+  XYZ[0] = xy.x * factor;
+  XYZ[1] = 1;
+  XYZ[2] = (1 - xy.x - xy.y) * factor;
+  return true;
+}
+
+namespace {
+
+// NOTE: this is only used to provide a reasonable ICC profile that other
+// software can read. Our own transforms use ExtraTF instead because that is
+// more precise and supports unbounded mode.
+template <class Func>
+std::vector<uint16_t> CreateTableCurve(uint32_t N, const Func& func) {
+  JXL_ASSERT(N <= 4096);  // ICC MFT2 only allows 4K entries
+  // No point using float - LCMS converts to 16-bit for A2B/MFT.
+  std::vector<uint16_t> table(N);
+  for (uint32_t i = 0; i < N; ++i) {
+    const float x = static_cast<float>(i) / (N - 1);  // 1.0 at index N - 1.
+    // LCMS requires EOTF (e.g. 2.4 exponent).
+    double y = func.DisplayFromEncoded(static_cast<double>(x));
+    JXL_ASSERT(y >= 0.0);
+    // Clamp to table range - necessary for HLG.
+    if (y > 1.0) y = 1.0;
+    // 1.0 corresponds to table value 0xFFFF.
+    table[i] = static_cast<uint16_t>(roundf(y * 65535.0));
+  }
+  return table;
+}
+
+void ICCComputeMD5(const PaddedBytes& data, uint8_t sum[16])
+    JXL_NO_SANITIZE("unsigned-integer-overflow") {
+  PaddedBytes data64 = data;
+  data64.push_back(128);
+  // Add bytes such that ((size + 8) & 63) == 0.
+  size_t extra = ((64 - ((data64.size() + 8) & 63)) & 63);
+  data64.resize(data64.size() + extra, 0);
+  for (uint64_t i = 0; i < 64; i += 8) {
+    data64.push_back(static_cast<uint64_t>(data.size() << 3u) >> i);
+  }
+
+  static const uint32_t sineparts[64] = {
+      0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee, 0xf57c0faf, 0x4787c62a,
+      0xa8304613, 0xfd469501, 0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be,
+      0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821, 0xf61e2562, 0xc040b340,
+      0x265e5a51, 0xe9b6c7aa, 0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8,
+      0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed, 0xa9e3e905, 0xfcefa3f8,
+      0x676f02d9, 0x8d2a4c8a, 0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c,
+      0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70, 0x289b7ec6, 0xeaa127fa,
+      0xd4ef3085, 0x04881d05, 0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665,
+      0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039, 0x655b59c3, 0x8f0ccc92,
+      0xffeff47d, 0x85845dd1, 0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1,
+      0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391,
+  };
+  static const uint32_t shift[64] = {
+      7, 12, 17, 22, 7, 12, 17, 22, 7, 12, 17, 22, 7, 12, 17, 22,
+      5, 9,  14, 20, 5, 9,  14, 20, 5, 9,  14, 20, 5, 9,  14, 20,
+      4, 11, 16, 23, 4, 11, 16, 23, 4, 11, 16, 23, 4, 11, 16, 23,
+      6, 10, 15, 21, 6, 10, 15, 21, 6, 10, 15, 21, 6, 10, 15, 21,
+  };
+
+  uint32_t a0 = 0x67452301, b0 = 0xefcdab89, c0 = 0x98badcfe, d0 = 0x10325476;
+
+  for (size_t i = 0; i < data64.size(); i += 64) {
+    uint32_t a = a0, b = b0, c = c0, d = d0, f, g;
+    for (size_t j = 0; j < 64; j++) {
+      if (j < 16) {
+        f = (b & c) | ((~b) & d);
+        g = j;
+      } else if (j < 32) {
+        f = (d & b) | ((~d) & c);
+        g = (5 * j + 1) & 0xf;
+      } else if (j < 48) {
+        f = b ^ c ^ d;
+        g = (3 * j + 5) & 0xf;
+      } else {
+        f = c ^ (b | (~d));
+        g = (7 * j) & 0xf;
+      }
+      uint32_t dg0 = data64[i + g * 4 + 0], dg1 = data64[i + g * 4 + 1],
+               dg2 = data64[i + g * 4 + 2], dg3 = data64[i + g * 4 + 3];
+      uint32_t u = dg0 | (dg1 << 8u) | (dg2 << 16u) | (dg3 << 24u);
+      f += a + sineparts[j] + u;
+      a = d;
+      d = c;
+      c = b;
+      b += (f << shift[j]) | (f >> (32u - shift[j]));
+    }
+    a0 += a;
+    b0 += b;
+    c0 += c;
+    d0 += d;
+  }
+  sum[0] = a0;
+  sum[1] = a0 >> 8u;
+  sum[2] = a0 >> 16u;
+  sum[3] = a0 >> 24u;
+  sum[4] = b0;
+  sum[5] = b0 >> 8u;
+  sum[6] = b0 >> 16u;
+  sum[7] = b0 >> 24u;
+  sum[8] = c0;
+  sum[9] = c0 >> 8u;
+  sum[10] = c0 >> 16u;
+  sum[11] = c0 >> 24u;
+  sum[12] = d0;
+  sum[13] = d0 >> 8u;
+  sum[14] = d0 >> 16u;
+  sum[15] = d0 >> 24u;
+}
+
+Status CreateICCChadMatrix(CIExy w, float result[9]) {
+  float m[9];
+  if (w.y == 0) {  // WhitePoint can not be pitch-black.
+    return JXL_FAILURE("Invalid WhitePoint");
+  }
+  JXL_RETURN_IF_ERROR(AdaptToXYZD50(w.x, w.y, m));
+  memcpy(result, m, sizeof(float) * 9);
+  return true;
+}
+
+// Creates RGB to XYZ matrix given RGB primaries and whitepoint in xy.
+Status CreateICCRGBMatrix(CIExy r, CIExy g, CIExy b, CIExy w, float result[9]) {
+  float m[9];
+  JXL_RETURN_IF_ERROR(
+      PrimariesToXYZD50(r.x, r.y, g.x, g.y, b.x, b.y, w.x, w.y, m));
+  memcpy(result, m, sizeof(float) * 9);
+  return true;
+}
+
+void WriteICCUint32(uint32_t value, size_t pos, PaddedBytes* JXL_RESTRICT icc) {
+  if (icc->size() < pos + 4) icc->resize(pos + 4);
+  (*icc)[pos + 0] = (value >> 24u) & 255;
+  (*icc)[pos + 1] = (value >> 16u) & 255;
+  (*icc)[pos + 2] = (value >> 8u) & 255;
+  (*icc)[pos + 3] = value & 255;
+}
+
+void WriteICCUint16(uint16_t value, size_t pos, PaddedBytes* JXL_RESTRICT icc) {
+  if (icc->size() < pos + 2) icc->resize(pos + 2);
+  (*icc)[pos + 0] = (value >> 8u) & 255;
+  (*icc)[pos + 1] = value & 255;
+}
+
+void WriteICCUint8(uint8_t value, size_t pos, PaddedBytes* JXL_RESTRICT icc) {
+  if (icc->size() < pos + 1) icc->resize(pos + 1);
+  (*icc)[pos] = value;
+}
+
+// Writes a 4-character tag
+void WriteICCTag(const char* value, size_t pos, PaddedBytes* JXL_RESTRICT icc) {
+  if (icc->size() < pos + 4) icc->resize(pos + 4);
+  memcpy(icc->data() + pos, value, 4);
+}
+
+Status WriteICCS15Fixed16(float value, size_t pos,
+                          PaddedBytes* JXL_RESTRICT icc) {
+  // "nextafterf" for 32768.0f towards zero are:
+  // 32767.998046875, 32767.99609375, 32767.994140625
+  // Even the first value works well,...
+  bool ok = (-32767.995f <= value) && (value <= 32767.995f);
+  if (!ok) return JXL_FAILURE("ICC value is out of range / NaN");
+  int32_t i = value * 65536.0f + 0.5f;
+  // Use two's complement
+  uint32_t u = static_cast<uint32_t>(i);
+  WriteICCUint32(u, pos, icc);
+  return true;
+}
+
+Status CreateICCHeader(const ColorEncoding& c,
+                       PaddedBytes* JXL_RESTRICT header) {
+  // TODO(lode): choose color management engine name, e.g. "skia" if
+  // integrated in skia.
+  static const char* kCmm = "jxl ";
+
+  header->resize(128, 0);
+
+  WriteICCUint32(0, 0, header);  // size, correct value filled in at end
+  WriteICCTag(kCmm, 4, header);
+  WriteICCUint32(0x04400000u, 8, header);
+  const char* profile_type =
+      c.GetColorSpace() == ColorSpace::kXYB ? "scnr" : "mntr";
+  WriteICCTag(profile_type, 12, header);
+  WriteICCTag(c.IsGray() ? "GRAY" : "RGB ", 16, header);
+  WriteICCTag("XYZ ", 20, header);
+
+  // Three uint32_t's date/time encoding.
+  // TODO(lode): encode actual date and time, this is a placeholder
+  uint32_t year = 2019, month = 12, day = 1;
+  uint32_t hour = 0, minute = 0, second = 0;
+  WriteICCUint16(year, 24, header);
+  WriteICCUint16(month, 26, header);
+  WriteICCUint16(day, 28, header);
+  WriteICCUint16(hour, 30, header);
+  WriteICCUint16(minute, 32, header);
+  WriteICCUint16(second, 34, header);
+
+  WriteICCTag("acsp", 36, header);
+  WriteICCTag("APPL", 40, header);
+  WriteICCUint32(0, 44, header);  // flags
+  WriteICCUint32(0, 48, header);  // device manufacturer
+  WriteICCUint32(0, 52, header);  // device model
+  WriteICCUint32(0, 56, header);  // device attributes
+  WriteICCUint32(0, 60, header);  // device attributes
+  WriteICCUint32(static_cast<uint32_t>(c.rendering_intent), 64, header);
+
+  // Mandatory D50 white point of profile connection space
+  WriteICCUint32(0x0000f6d6, 68, header);
+  WriteICCUint32(0x00010000, 72, header);
+  WriteICCUint32(0x0000d32d, 76, header);
+
+  WriteICCTag(kCmm, 80, header);
+
+  return true;
+}
+
+void AddToICCTagTable(const char* tag, size_t offset, size_t size,
+                      PaddedBytes* JXL_RESTRICT tagtable,
+                      std::vector<size_t>* offsets) {
+  WriteICCTag(tag, tagtable->size(), tagtable);
+  // writing true offset deferred to later
+  WriteICCUint32(0, tagtable->size(), tagtable);
+  offsets->push_back(offset);
+  WriteICCUint32(size, tagtable->size(), tagtable);
+}
+
+void FinalizeICCTag(PaddedBytes* JXL_RESTRICT tags, size_t* offset,
+                    size_t* size) {
+  while ((tags->size() & 3) != 0) {
+    tags->push_back(0);
+  }
+  *offset += *size;
+  *size = tags->size() - *offset;
+}
+
+// The input text must be ASCII, writing other characters to UTF-16 is not
+// implemented.
+void CreateICCMlucTag(const std::string& text, PaddedBytes* JXL_RESTRICT tags) {
+  WriteICCTag("mluc", tags->size(), tags);
+  WriteICCUint32(0, tags->size(), tags);
+  WriteICCUint32(1, tags->size(), tags);
+  WriteICCUint32(12, tags->size(), tags);
+  WriteICCTag("enUS", tags->size(), tags);
+  WriteICCUint32(text.size() * 2, tags->size(), tags);
+  WriteICCUint32(28, tags->size(), tags);
+  for (size_t i = 0; i < text.size(); i++) {
+    tags->push_back(0);  // prepend 0 for UTF-16
+    tags->push_back(text[i]);
+  }
+}
+
+Status CreateICCXYZTag(float xyz[3], PaddedBytes* JXL_RESTRICT tags) {
+  WriteICCTag("XYZ ", tags->size(), tags);
+  WriteICCUint32(0, tags->size(), tags);
+  for (size_t i = 0; i < 3; ++i) {
+    JXL_RETURN_IF_ERROR(WriteICCS15Fixed16(xyz[i], tags->size(), tags));
+  }
+  return true;
+}
+
+Status CreateICCChadTag(float chad[9], PaddedBytes* JXL_RESTRICT tags) {
+  WriteICCTag("sf32", tags->size(), tags);
+  WriteICCUint32(0, tags->size(), tags);
+  for (size_t i = 0; i < 9; i++) {
+    JXL_RETURN_IF_ERROR(WriteICCS15Fixed16(chad[i], tags->size(), tags));
+  }
+  return true;
+}
+
+void MaybeCreateICCCICPTag(const ColorEncoding& c,
+                           PaddedBytes* JXL_RESTRICT tags, size_t* offset,
+                           size_t* size, PaddedBytes* JXL_RESTRICT tagtable,
+                           std::vector<size_t>* offsets) {
+  if (c.GetColorSpace() != ColorSpace::kRGB) {
+    return;
+  }
+  uint8_t primaries = 0;
+  if (c.primaries == Primaries::kP3) {
+    if (c.white_point == WhitePoint::kD65) {
+      primaries = 12;
+    } else if (c.white_point == WhitePoint::kDCI) {
+      primaries = 11;
+    } else {
+      return;
+    }
+  } else if (c.primaries != Primaries::kCustom &&
+             c.white_point == WhitePoint::kD65) {
+    primaries = static_cast<uint8_t>(c.primaries);
+  } else {
+    return;
+  }
+  if (c.tf.IsUnknown() || c.tf.IsGamma()) {
+    return;
+  }
+  WriteICCTag("cicp", tags->size(), tags);
+  WriteICCUint32(0, tags->size(), tags);
+  WriteICCUint8(primaries, tags->size(), tags);
+  WriteICCUint8(static_cast<uint8_t>(c.tf.GetTransferFunction()), tags->size(),
+                tags);
+  // Matrix
+  WriteICCUint8(0, tags->size(), tags);
+  // Full range
+  WriteICCUint8(1, tags->size(), tags);
+  FinalizeICCTag(tags, offset, size);
+  AddToICCTagTable("cicp", *offset, *size, tagtable, offsets);
+}
+
+void CreateICCCurvCurvTag(const std::vector<uint16_t>& curve,
+                          PaddedBytes* JXL_RESTRICT tags) {
+  size_t pos = tags->size();
+  tags->resize(tags->size() + 12 + curve.size() * 2, 0);
+  WriteICCTag("curv", pos, tags);
+  WriteICCUint32(0, pos + 4, tags);
+  WriteICCUint32(curve.size(), pos + 8, tags);
+  for (size_t i = 0; i < curve.size(); i++) {
+    WriteICCUint16(curve[i], pos + 12 + i * 2, tags);
+  }
+}
+
+// Writes 12 + 4*params.size() bytes
+Status CreateICCCurvParaTag(std::vector<float> params, size_t curve_type,
+                            PaddedBytes* JXL_RESTRICT tags) {
+  WriteICCTag("para", tags->size(), tags);
+  WriteICCUint32(0, tags->size(), tags);
+  WriteICCUint16(curve_type, tags->size(), tags);
+  WriteICCUint16(0, tags->size(), tags);
+  for (size_t i = 0; i < params.size(); i++) {
+    JXL_RETURN_IF_ERROR(WriteICCS15Fixed16(params[i], tags->size(), tags));
+  }
+  return true;
+}
+
+Status CreateICCLutAtoBTagForXYB(PaddedBytes* JXL_RESTRICT tags) {
+  WriteICCTag("mAB ", tags->size(), tags);
+  // 4 reserved bytes set to 0
+  WriteICCUint32(0, tags->size(), tags);
+  // number of input channels
+  WriteICCUint8(3, tags->size(), tags);
+  // number of output channels
+  WriteICCUint8(3, tags->size(), tags);
+  // 2 reserved bytes for padding
+  WriteICCUint16(0, tags->size(), tags);
+  // offset to first B curve
+  WriteICCUint32(32, tags->size(), tags);
+  // offset to matrix
+  WriteICCUint32(244, tags->size(), tags);
+  // offset to first M curve
+  WriteICCUint32(148, tags->size(), tags);
+  // offset to CLUT
+  WriteICCUint32(80, tags->size(), tags);
+  // offset to first A curve
+  // (reuse linear B curves)
+  WriteICCUint32(32, tags->size(), tags);
+
+  // offset = 32
+  // no-op curves
+  JXL_RETURN_IF_ERROR(CreateICCCurvParaTag({1.0f}, 0, tags));
+  JXL_RETURN_IF_ERROR(CreateICCCurvParaTag({1.0f}, 0, tags));
+  JXL_RETURN_IF_ERROR(CreateICCCurvParaTag({1.0f}, 0, tags));
+  // offset = 80
+  // number of grid points for each input channel
+  for (int i = 0; i < 16; ++i) {
+    WriteICCUint8(i < 3 ? 2 : 0, tags->size(), tags);
+  }
+  // precision = 2
+  WriteICCUint8(2, tags->size(), tags);
+  // 3 bytes of padding
+  WriteICCUint8(0, tags->size(), tags);
+  WriteICCUint16(0, tags->size(), tags);
+  const float kOffsets[3] = {
+      kScaledXYBOffset[0] + kScaledXYBOffset[1],
+      kScaledXYBOffset[1] - kScaledXYBOffset[0] + 1.0f / kScaledXYBScale[0],
+      kScaledXYBOffset[1] + kScaledXYBOffset[2]};
+  const float kScaling[3] = {
+      1.0f / (1.0f / kScaledXYBScale[0] + 1.0f / kScaledXYBScale[1]),
+      1.0f / (1.0f / kScaledXYBScale[0] + 1.0f / kScaledXYBScale[1]),
+      1.0f / (1.0f / kScaledXYBScale[1] + 1.0f / kScaledXYBScale[2])};
+  // 2*2*2*3 entries of 2 bytes each = 48 bytes
+  for (size_t ix = 0; ix < 2; ++ix) {
+    for (size_t iy = 0; iy < 2; ++iy) {
+      for (size_t ib = 0; ib < 2; ++ib) {
+        float in_f[3] = {ix * 1.0f, iy * 1.0f, ib * 1.0f};
+        for (size_t c = 0; c < 3; ++c) {
+          in_f[c] /= kScaledXYBScale[c];
+          in_f[c] -= kScaledXYBOffset[c];
+        }
+        float out_f[3];
+        out_f[0] = in_f[1] + in_f[0];
+        out_f[1] = in_f[1] - in_f[0];
+        out_f[2] = in_f[2] + in_f[1];
+        for (int i = 0; i < 3; ++i) {
+          out_f[i] += kOffsets[i];
+          out_f[i] *= kScaling[i];
+        }
+        for (int i = 0; i < 3; ++i) {
+          JXL_RETURN_IF_ERROR(out_f[i] >= 0.f && out_f[i] <= 1.f);
+          uint16_t val = static_cast<uint16_t>(
+              0.5f + 65535 * std::max(0.f, std::min(1.f, out_f[i])));
+          WriteICCUint16(val, tags->size(), tags);
+        }
+      }
+    }
+  }
+  // offset = 148
+  // 3 curves with 5 parameters = 3 * (12 + 5 * 4) = 96 bytes
+  for (size_t i = 0; i < 3; ++i) {
+    const float b =
+        -kOffsets[i] - std::cbrt(jxl::kNegOpsinAbsorbanceBiasRGB[i]);
+    std::vector<float> params = {
+        3,
+        1.0f / kScaling[i],
+        b,
+        0,                                // unused
+        std::max(0.f, -b * kScaling[i]),  // make skcms happy
+    };
+    JXL_RETURN_IF_ERROR(CreateICCCurvParaTag(params, 3, tags));
+  }
+  // offset = 244
+  const double matrix[] = {1.5170095, -1.1065225, 0.071623,
+                           -0.050022, 0.5683655,  -0.018344,
+                           -1.387676, 1.1145555,  0.6857255};
+  // 12 * 4 = 48 bytes
+  for (size_t i = 0; i < 9; ++i) {
+    JXL_RETURN_IF_ERROR(WriteICCS15Fixed16(matrix[i], tags->size(), tags));
+  }
+  for (size_t i = 0; i < 3; ++i) {
+    float intercept = 0;
+    for (size_t j = 0; j < 3; ++j) {
+      intercept += matrix[i * 3 + j] * jxl::kNegOpsinAbsorbanceBiasRGB[j];
+    }
+    JXL_RETURN_IF_ERROR(WriteICCS15Fixed16(intercept, tags->size(), tags));
+  }
+  return true;
+}
+}  // namespace
+
+Status MaybeCreateProfile(const ColorEncoding& c,
+                          PaddedBytes* JXL_RESTRICT icc) {
+  PaddedBytes header, tagtable, tags;
+
+  if (c.GetColorSpace() == ColorSpace::kUnknown || c.tf.IsUnknown()) {
+    return false;  // Not an error
+  }
+
+  switch (c.GetColorSpace()) {
+    case ColorSpace::kRGB:
+    case ColorSpace::kGray:
+    case ColorSpace::kXYB:
+      break;  // OK
+    default:
+      return JXL_FAILURE("Invalid CS %u",
+                         static_cast<unsigned int>(c.GetColorSpace()));
+  }
+
+  if (c.GetColorSpace() == ColorSpace::kXYB &&
+      c.rendering_intent != RenderingIntent::kPerceptual) {
+    return JXL_FAILURE(
+        "Only perceptual rendering intent implemented for XYB "
+        "ICC profile.");
+  }
+
+  JXL_RETURN_IF_ERROR(CreateICCHeader(c, &header));
+
+  std::vector<size_t> offsets;
+  // tag count, deferred to later
+  WriteICCUint32(0, tagtable.size(), &tagtable);
+
+  size_t tag_offset = 0, tag_size = 0;
+
+  CreateICCMlucTag(Description(c), &tags);
+  FinalizeICCTag(&tags, &tag_offset, &tag_size);
+  AddToICCTagTable("desc", tag_offset, tag_size, &tagtable, &offsets);
+
+  const std::string copyright = "CC0";
+  CreateICCMlucTag(copyright, &tags);
+  FinalizeICCTag(&tags, &tag_offset, &tag_size);
+  AddToICCTagTable("cprt", tag_offset, tag_size, &tagtable, &offsets);
+
+  // TODO(eustas): isn't it the other way round: gray image has d50 WhitePoint?
+  if (c.IsGray()) {
+    float wtpt[3];
+    JXL_RETURN_IF_ERROR(CIEXYZFromWhiteCIExy(c.GetWhitePoint(), wtpt));
+    JXL_RETURN_IF_ERROR(CreateICCXYZTag(wtpt, &tags));
+  } else {
+    float d50[3] = {0.964203, 1.0, 0.824905};
+    JXL_RETURN_IF_ERROR(CreateICCXYZTag(d50, &tags));
+  }
+  FinalizeICCTag(&tags, &tag_offset, &tag_size);
+  AddToICCTagTable("wtpt", tag_offset, tag_size, &tagtable, &offsets);
+
+  if (!c.IsGray()) {
+    // Chromatic adaptation matrix
+    float chad[9];
+    JXL_RETURN_IF_ERROR(CreateICCChadMatrix(c.GetWhitePoint(), chad));
+
+    JXL_RETURN_IF_ERROR(CreateICCChadTag(chad, &tags));
+    FinalizeICCTag(&tags, &tag_offset, &tag_size);
+    AddToICCTagTable("chad", tag_offset, tag_size, &tagtable, &offsets);
+  }
+
+  if (c.GetColorSpace() == ColorSpace::kRGB) {
+    MaybeCreateICCCICPTag(c, &tags, &tag_offset, &tag_size, &tagtable,
+                          &offsets);
+
+    const PrimariesCIExy primaries = c.GetPrimaries();
+    float m[9];
+    JXL_RETURN_IF_ERROR(CreateICCRGBMatrix(primaries.r, primaries.g,
+                                           primaries.b, c.GetWhitePoint(), m));
+    float r[3] = {m[0], m[3], m[6]};
+    float g[3] = {m[1], m[4], m[7]};
+    float b[3] = {m[2], m[5], m[8]};
+
+    JXL_RETURN_IF_ERROR(CreateICCXYZTag(r, &tags));
+    FinalizeICCTag(&tags, &tag_offset, &tag_size);
+    AddToICCTagTable("rXYZ", tag_offset, tag_size, &tagtable, &offsets);
+
+    JXL_RETURN_IF_ERROR(CreateICCXYZTag(g, &tags));
+    FinalizeICCTag(&tags, &tag_offset, &tag_size);
+    AddToICCTagTable("gXYZ", tag_offset, tag_size, &tagtable, &offsets);
+
+    JXL_RETURN_IF_ERROR(CreateICCXYZTag(b, &tags));
+    FinalizeICCTag(&tags, &tag_offset, &tag_size);
+    AddToICCTagTable("bXYZ", tag_offset, tag_size, &tagtable, &offsets);
+  }
+
+  if (c.GetColorSpace() == ColorSpace::kXYB) {
+    JXL_RETURN_IF_ERROR(CreateICCLutAtoBTagForXYB(&tags));
+    FinalizeICCTag(&tags, &tag_offset, &tag_size);
+    AddToICCTagTable("A2B0", tag_offset, tag_size, &tagtable, &offsets);
+  } else {
+    if (c.tf.IsGamma()) {
+      float gamma = 1.0 / c.tf.GetGamma();
+      JXL_RETURN_IF_ERROR(CreateICCCurvParaTag({gamma}, 0, &tags));
+    } else if (c.GetColorSpace() != ColorSpace::kXYB) {
+      switch (c.tf.GetTransferFunction()) {
+        case TransferFunction::kHLG:
+          CreateICCCurvCurvTag(
+              HWY_DYNAMIC_DISPATCH(CreateTableCurve)(4096, ExtraTF::kHLG),
+              &tags);
+          break;
+        case TransferFunction::kPQ:
+          CreateICCCurvCurvTag(
+              HWY_DYNAMIC_DISPATCH(CreateTableCurve)(4096, ExtraTF::kPQ),
+              &tags);
+          break;
+        case TransferFunction::kSRGB:
+          JXL_RETURN_IF_ERROR(CreateICCCurvParaTag(
+              {2.4, 1.0 / 1.055, 0.055 / 1.055, 1.0 / 12.92, 0.04045}, 3,
+              &tags));
+          break;
+        case TransferFunction::k709:
+          JXL_RETURN_IF_ERROR(CreateICCCurvParaTag(
+              {1.0 / 0.45, 1.0 / 1.099, 0.099 / 1.099, 1.0 / 4.5, 0.081}, 3,
+              &tags));
+          break;
+        case TransferFunction::kLinear:
+          JXL_RETURN_IF_ERROR(
+              CreateICCCurvParaTag({1.0, 1.0, 0.0, 1.0, 0.0}, 3, &tags));
+          break;
+        case TransferFunction::kDCI:
+          JXL_RETURN_IF_ERROR(
+              CreateICCCurvParaTag({2.6, 1.0, 0.0, 1.0, 0.0}, 3, &tags));
+          break;
+        default:
+          JXL_ABORT("Unknown TF %u",
+                    static_cast<unsigned int>(c.tf.GetTransferFunction()));
+      }
+    }
+    FinalizeICCTag(&tags, &tag_offset, &tag_size);
+    if (c.IsGray()) {
+      AddToICCTagTable("kTRC", tag_offset, tag_size, &tagtable, &offsets);
+    } else {
+      AddToICCTagTable("rTRC", tag_offset, tag_size, &tagtable, &offsets);
+      AddToICCTagTable("gTRC", tag_offset, tag_size, &tagtable, &offsets);
+      AddToICCTagTable("bTRC", tag_offset, tag_size, &tagtable, &offsets);
+    }
+  }
+
+  // Tag count
+  WriteICCUint32(offsets.size(), 0, &tagtable);
+  for (size_t i = 0; i < offsets.size(); i++) {
+    WriteICCUint32(offsets[i] + header.size() + tagtable.size(), 4 + 12 * i + 4,
+                   &tagtable);
+  }
+
+  // ICC profile size
+  WriteICCUint32(header.size() + tagtable.size() + tags.size(), 0, &header);
+
+  *icc = header;
+  icc->append(tagtable);
+  icc->append(tags);
+
+  // The MD5 checksum must be computed on the profile with profile flags,
+  // rendering intent, and region of the checksum itself, set to 0.
+  // TODO(lode): manually verify with a reliable tool that this creates correct
+  // signature (profile id) for ICC profiles.
+  PaddedBytes icc_sum = *icc;
+  if (icc_sum.size() >= 64 + 4) {
+    memset(icc_sum.data() + 44, 0, 4);
+    memset(icc_sum.data() + 64, 0, 4);
+  }
+  uint8_t checksum[16];
+  ICCComputeMD5(icc_sum, checksum);
+
+  memcpy(icc->data() + 84, checksum, sizeof(checksum));
+
+  return true;
+}
+
+}  // namespace jxl
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jxl/color_management.h b/third_party/jpeg-xl/lib/jxl/color_management.h
new file mode 100644
index 0000000000..f623aa1c90
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/color_management.h
@@ -0,0 +1,40 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_COLOR_MANAGEMENT_H_
+#define LIB_JXL_COLOR_MANAGEMENT_H_
+
+// ICC profiles and color space conversions.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <vector>
+
+#include "lib/jxl/base/padded_bytes.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/color_encoding_internal.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/image.h"
+
+namespace jxl {
+
+enum class ExtraTF {
+  kNone,
+  kPQ,
+  kHLG,
+  kSRGB,
+};
+
+// NOTE: for XYB colorspace, the created profile can be used to transform a
+// *scaled* XYB image (created by ScaleXYB()) to another colorspace.
+Status MaybeCreateProfile(const ColorEncoding& c,
+                          PaddedBytes* JXL_RESTRICT icc);
+
+Status CIEXYZFromWhiteCIExy(const CIExy& xy, float XYZ[3]);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_COLOR_MANAGEMENT_H_
diff --git a/third_party/jpeg-xl/lib/jxl/color_management_test.cc b/third_party/jpeg-xl/lib/jxl/color_management_test.cc
new file mode 100644
index 0000000000..fc7a1c57ff
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/color_management_test.cc
@@ -0,0 +1,405 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/color_management.h"
+
+#include <stdint.h>
+#include <stdio.h>
+
+#include <algorithm>
+#include <new>
+#include <string>
+#include <utility>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/file_io.h"
+#include "lib/jxl/base/random.h"
+#include "lib/jxl/enc_color_management.h"
+#include "lib/jxl/enc_xyb.h"
+#include "lib/jxl/image_test_utils.h"
+#include "lib/jxl/test_utils.h"
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+
+std::ostream& operator<<(std::ostream& os, const CIExy& xy) {
+  return os << "{x=" << xy.x << ", y=" << xy.y << "}";
+}
+
+std::ostream& operator<<(std::ostream& os, const PrimariesCIExy& primaries) {
+  return os << "{r=" << primaries.r << ", g=" << primaries.g
+            << ", b=" << primaries.b << "}";
+}
+
+namespace {
+
+using ::testing::ElementsAre;
+using ::testing::FloatNear;
+
+// Small enough to be fast. If changed, must update Generate*.
+static constexpr size_t kWidth = 16;
+
+static constexpr size_t kNumThreads = 1;  // only have a single row.
+
+struct Globals {
+  // TODO(deymo): Make this a const.
+  static Globals* GetInstance() {
+    static Globals ret;
+    return &ret;
+  }
+
+ private:
+  Globals() {
+    in_gray = GenerateGray();
+    in_color = GenerateColor();
+    out_gray = ImageF(kWidth, 1);
+    out_color = ImageF(kWidth * 3, 1);
+
+    c_native = ColorEncoding::LinearSRGB(/*is_gray=*/false);
+    c_gray = ColorEncoding::LinearSRGB(/*is_gray=*/true);
+  }
+
+  static ImageF GenerateGray() {
+    ImageF gray(kWidth, 1);
+    float* JXL_RESTRICT row = gray.Row(0);
+    // Increasing left to right
+    for (uint32_t x = 0; x < kWidth; ++x) {
+      row[x] = x * 1.0f / (kWidth - 1);  // [0, 1]
+    }
+    return gray;
+  }
+
+  static ImageF GenerateColor() {
+    ImageF image(kWidth * 3, 1);
+    float* JXL_RESTRICT interleaved = image.Row(0);
+    std::fill(interleaved, interleaved + kWidth * 3, 0.0f);
+
+    // [0, 4): neutral
+    for (int32_t x = 0; x < 4; ++x) {
+      interleaved[3 * x + 0] = x * 1.0f / 3;  // [0, 1]
+      interleaved[3 * x + 2] = interleaved[3 * x + 1] = interleaved[3 * x + 0];
+    }
+
+    // [4, 13): pure RGB with low/medium/high saturation
+    for (int32_t c = 0; c < 3; ++c) {
+      interleaved[3 * (4 + c) + c] = 0.08f + c * 0.01f;
+      interleaved[3 * (7 + c) + c] = 0.75f + c * 0.01f;
+      interleaved[3 * (10 + c) + c] = 1.0f;
+    }
+
+    // [13, 16): impure, not quite saturated RGB
+    interleaved[3 * 13 + 0] = 0.86f;
+    interleaved[3 * 13 + 2] = interleaved[3 * 13 + 1] = 0.16f;
+    interleaved[3 * 14 + 1] = 0.87f;
+    interleaved[3 * 14 + 2] = interleaved[3 * 14 + 0] = 0.16f;
+    interleaved[3 * 15 + 2] = 0.88f;
+    interleaved[3 * 15 + 1] = interleaved[3 * 15 + 0] = 0.16f;
+
+    return image;
+  }
+
+ public:
+  // ImageF so we can use VerifyRelativeError; all are interleaved RGB.
+  ImageF in_gray;
+  ImageF in_color;
+  ImageF out_gray;
+  ImageF out_color;
+  ColorEncoding c_native;
+  ColorEncoding c_gray;
+};
+
+class ColorManagementTest
+    : public ::testing::TestWithParam<test::ColorEncodingDescriptor> {
+ public:
+  static void VerifySameFields(const ColorEncoding& c,
+                               const ColorEncoding& c2) {
+    ASSERT_EQ(c.rendering_intent, c2.rendering_intent);
+    ASSERT_EQ(c.GetColorSpace(), c2.GetColorSpace());
+    ASSERT_EQ(c.white_point, c2.white_point);
+    if (c.HasPrimaries()) {
+      ASSERT_EQ(c.primaries, c2.primaries);
+    }
+    ASSERT_TRUE(c.tf.IsSame(c2.tf));
+  }
+
+  // "Same" pixels after converting g->c_native -> c -> g->c_native.
+  static void VerifyPixelRoundTrip(const ColorEncoding& c) {
+    Globals* g = Globals::GetInstance();
+    const ColorEncoding& c_native = c.IsGray() ? g->c_gray : g->c_native;
+    const JxlCmsInterface& cms = GetJxlCms();
+    ColorSpaceTransform xform_fwd(cms);
+    ColorSpaceTransform xform_rev(cms);
+    const float intensity_target =
+        c.tf.IsHLG() ? 1000 : kDefaultIntensityTarget;
+    ASSERT_TRUE(
+        xform_fwd.Init(c_native, c, intensity_target, kWidth, kNumThreads));
+    ASSERT_TRUE(
+        xform_rev.Init(c, c_native, intensity_target, kWidth, kNumThreads));
+
+    const size_t thread = 0;
+    const ImageF& in = c.IsGray() ? g->in_gray : g->in_color;
+    ImageF* JXL_RESTRICT out = c.IsGray() ? &g->out_gray : &g->out_color;
+    ASSERT_TRUE(xform_fwd.Run(thread, in.Row(0), xform_fwd.BufDst(thread)));
+    ASSERT_TRUE(xform_rev.Run(thread, xform_fwd.BufDst(thread), out->Row(0)));
+
+#if JPEGXL_ENABLE_SKCMS
+    double max_l1 = 7E-4;
+    double max_rel = 4E-7;
+#else
+    double max_l1 = 5E-5;
+    // Most are lower; reached 3E-7 with D60 AP0.
+    double max_rel = 4E-7;
+#endif
+    if (c.IsGray()) max_rel = 2E-5;
+    JXL_ASSERT_OK(VerifyRelativeError(in, *out, max_l1, max_rel, _));
+  }
+};
+JXL_GTEST_INSTANTIATE_TEST_SUITE_P(ColorManagementTestInstantiation,
+                                   ColorManagementTest,
+                                   ::testing::ValuesIn(test::AllEncodings()));
+
+// Exercises the ColorManagement interface for ALL ColorEncoding synthesizable
+// via enums.
+TEST_P(ColorManagementTest, VerifyAllProfiles) {
+  ColorEncoding c = ColorEncodingFromDescriptor(GetParam());
+  printf("%s\n", Description(c).c_str());
+
+  // Can create profile.
+  ASSERT_TRUE(c.CreateICC());
+
+  // Can set an equivalent ColorEncoding from the generated ICC profile.
+  ColorEncoding c3;
+  ASSERT_TRUE(c3.SetICC(PaddedBytes(c.ICC())));
+  VerifySameFields(c, c3);
+
+  VerifyPixelRoundTrip(c);
+}
+
+testing::Matcher<CIExy> CIExyIs(const double x, const double y) {
+  static constexpr double kMaxError = 1e-4;
+  return testing::AllOf(
+      testing::Field(&CIExy::x, testing::DoubleNear(x, kMaxError)),
+      testing::Field(&CIExy::y, testing::DoubleNear(y, kMaxError)));
+}
+
+testing::Matcher<PrimariesCIExy> PrimariesAre(
+    const testing::Matcher<CIExy>& r, const testing::Matcher<CIExy>& g,
+    const testing::Matcher<CIExy>& b) {
+  return testing::AllOf(testing::Field(&PrimariesCIExy::r, r),
+                        testing::Field(&PrimariesCIExy::g, g),
+                        testing::Field(&PrimariesCIExy::b, b));
+}
+
+TEST_F(ColorManagementTest, sRGBChromaticity) {
+  const ColorEncoding sRGB = ColorEncoding::SRGB();
+  EXPECT_THAT(sRGB.GetWhitePoint(), CIExyIs(0.3127, 0.3290));
+  EXPECT_THAT(sRGB.GetPrimaries(),
+              PrimariesAre(CIExyIs(0.64, 0.33), CIExyIs(0.30, 0.60),
+                           CIExyIs(0.15, 0.06)));
+}
+
+TEST_F(ColorManagementTest, D2700Chromaticity) {
+  PaddedBytes icc =
+      jxl::test::ReadTestData("jxl/color_management/sRGB-D2700.icc");
+  ColorEncoding sRGB_D2700;
+  ASSERT_TRUE(sRGB_D2700.SetICC(std::move(icc)));
+
+  EXPECT_THAT(sRGB_D2700.GetWhitePoint(), CIExyIs(0.45986, 0.41060));
+  // The illuminant-relative chromaticities of this profile's primaries are the
+  // same as for sRGB. It is the PCS-relative chromaticities that would be
+  // different.
+  EXPECT_THAT(sRGB_D2700.GetPrimaries(),
+              PrimariesAre(CIExyIs(0.64, 0.33), CIExyIs(0.30, 0.60),
+                           CIExyIs(0.15, 0.06)));
+}
+
+TEST_F(ColorManagementTest, D2700ToSRGB) {
+  PaddedBytes icc =
+      jxl::test::ReadTestData("jxl/color_management/sRGB-D2700.icc");
+  ColorEncoding sRGB_D2700;
+  ASSERT_TRUE(sRGB_D2700.SetICC(std::move(icc)));
+
+  ColorSpaceTransform transform(GetJxlCms());
+  ASSERT_TRUE(transform.Init(sRGB_D2700, ColorEncoding::SRGB(),
+                             kDefaultIntensityTarget, 1, 1));
+  const float sRGB_D2700_values[3] = {0.863, 0.737, 0.490};
+  float sRGB_values[3];
+  ASSERT_TRUE(transform.Run(0, sRGB_D2700_values, sRGB_values));
+  EXPECT_THAT(sRGB_values,
+              ElementsAre(FloatNear(0.914, 1e-3), FloatNear(0.745, 1e-3),
+                          FloatNear(0.601, 1e-3)));
+}
+
+TEST_F(ColorManagementTest, P3HlgTo2020Hlg) {
+  ColorEncoding p3_hlg;
+  p3_hlg.SetColorSpace(ColorSpace::kRGB);
+  p3_hlg.white_point = WhitePoint::kD65;
+  p3_hlg.primaries = Primaries::kP3;
+  p3_hlg.tf.SetTransferFunction(TransferFunction::kHLG);
+  ASSERT_TRUE(p3_hlg.CreateICC());
+
+  ColorEncoding rec2020_hlg = p3_hlg;
+  rec2020_hlg.primaries = Primaries::k2100;
+  ASSERT_TRUE(rec2020_hlg.CreateICC());
+
+  ColorSpaceTransform transform(GetJxlCms());
+  ASSERT_TRUE(transform.Init(p3_hlg, rec2020_hlg, 1000, 1, 1));
+  const float p3_hlg_values[3] = {0., 0.75, 0.};
+  float rec2020_hlg_values[3];
+  ASSERT_TRUE(transform.Run(0, p3_hlg_values, rec2020_hlg_values));
+  EXPECT_THAT(rec2020_hlg_values,
+              ElementsAre(FloatNear(0.3973, 1e-4), FloatNear(0.7382, 1e-4),
+                          FloatNear(0.1183, 1e-4)));
+}
+
+TEST_F(ColorManagementTest, HlgOotf) {
+  ColorEncoding p3_hlg;
+  p3_hlg.SetColorSpace(ColorSpace::kRGB);
+  p3_hlg.white_point = WhitePoint::kD65;
+  p3_hlg.primaries = Primaries::kP3;
+  p3_hlg.tf.SetTransferFunction(TransferFunction::kHLG);
+  ASSERT_TRUE(p3_hlg.CreateICC());
+
+  ColorSpaceTransform transform_to_1000(GetJxlCms());
+  ASSERT_TRUE(
+      transform_to_1000.Init(p3_hlg, ColorEncoding::LinearSRGB(), 1000, 1, 1));
+  // HDR reference white: https://www.itu.int/pub/R-REP-BT.2408-4-2021
+  float p3_hlg_values[3] = {0.75, 0.75, 0.75};
+  float linear_srgb_values[3];
+  ASSERT_TRUE(transform_to_1000.Run(0, p3_hlg_values, linear_srgb_values));
+  // On a 1000-nit display, HDR reference white should be 203 cd/m² which is
+  // 0.203 times the maximum.
+  EXPECT_THAT(linear_srgb_values,
+              ElementsAre(FloatNear(0.203, 1e-3), FloatNear(0.203, 1e-3),
+                          FloatNear(0.203, 1e-3)));
+
+  ColorSpaceTransform transform_to_400(GetJxlCms());
+  ASSERT_TRUE(
+      transform_to_400.Init(p3_hlg, ColorEncoding::LinearSRGB(), 400, 1, 1));
+  ASSERT_TRUE(transform_to_400.Run(0, p3_hlg_values, linear_srgb_values));
+  // On a 400-nit display, it should be 100 cd/m².
+  EXPECT_THAT(linear_srgb_values,
+              ElementsAre(FloatNear(0.250, 1e-3), FloatNear(0.250, 1e-3),
+                          FloatNear(0.250, 1e-3)));
+
+  p3_hlg_values[2] = 0.50;
+  ASSERT_TRUE(transform_to_1000.Run(0, p3_hlg_values, linear_srgb_values));
+  EXPECT_THAT(linear_srgb_values,
+              ElementsAre(FloatNear(0.201, 1e-3), FloatNear(0.201, 1e-3),
+                          FloatNear(0.050, 1e-3)));
+
+  ColorSpaceTransform transform_from_400(GetJxlCms());
+  ASSERT_TRUE(
+      transform_from_400.Init(ColorEncoding::LinearSRGB(), p3_hlg, 400, 1, 1));
+  linear_srgb_values[0] = linear_srgb_values[1] = linear_srgb_values[2] = 0.250;
+  ASSERT_TRUE(transform_from_400.Run(0, linear_srgb_values, p3_hlg_values));
+  EXPECT_THAT(p3_hlg_values,
+              ElementsAre(FloatNear(0.75, 1e-3), FloatNear(0.75, 1e-3),
+                          FloatNear(0.75, 1e-3)));
+
+  ColorEncoding grayscale_hlg;
+  grayscale_hlg.SetColorSpace(ColorSpace::kGray);
+  grayscale_hlg.white_point = WhitePoint::kD65;
+  grayscale_hlg.tf.SetTransferFunction(TransferFunction::kHLG);
+  ASSERT_TRUE(grayscale_hlg.CreateICC());
+
+  ColorSpaceTransform grayscale_transform(GetJxlCms());
+  ASSERT_TRUE(grayscale_transform.Init(
+      grayscale_hlg, ColorEncoding::LinearSRGB(/*is_gray=*/true), 1000, 1, 1));
+  const float grayscale_hlg_value = 0.75;
+  float linear_grayscale_value;
+  ASSERT_TRUE(grayscale_transform.Run(0, &grayscale_hlg_value,
+                                      &linear_grayscale_value));
+  EXPECT_THAT(linear_grayscale_value, FloatNear(0.203, 1e-3));
+}
+
+TEST_F(ColorManagementTest, XYBProfile) {
+  ColorEncoding c_xyb;
+  c_xyb.SetColorSpace(ColorSpace::kXYB);
+  c_xyb.rendering_intent = RenderingIntent::kPerceptual;
+  ASSERT_TRUE(c_xyb.CreateICC());
+  ColorEncoding c_native = ColorEncoding::LinearSRGB(false);
+
+  static const size_t kGridDim = 17;
+  static const size_t kNumColors = kGridDim * kGridDim * kGridDim;
+  const JxlCmsInterface& cms = GetJxlCms();
+  ColorSpaceTransform xform(cms);
+  ASSERT_TRUE(
+      xform.Init(c_xyb, c_native, kDefaultIntensityTarget, kNumColors, 1));
+
+  ImageMetadata metadata;
+  metadata.color_encoding = c_native;
+  ImageBundle ib(&metadata);
+  Image3F native(kNumColors, 1);
+  float mul = 1.0f / (kGridDim - 1);
+  for (size_t ir = 0, x = 0; ir < kGridDim; ++ir) {
+    for (size_t ig = 0; ig < kGridDim; ++ig) {
+      for (size_t ib = 0; ib < kGridDim; ++ib, ++x) {
+        native.PlaneRow(0, 0)[x] = ir * mul;
+        native.PlaneRow(1, 0)[x] = ig * mul;
+        native.PlaneRow(2, 0)[x] = ib * mul;
+      }
+    }
+  }
+  ib.SetFromImage(std::move(native), c_native);
+  const Image3F& in = *ib.color();
+  Image3F opsin(kNumColors, 1);
+  ToXYB(ib, nullptr, &opsin, cms, nullptr);
+
+  Image3F opsin2 = CopyImage(opsin);
+  ScaleXYB(&opsin2);
+
+  float* src = xform.BufSrc(0);
+  for (size_t i = 0; i < kNumColors; ++i) {
+    for (size_t c = 0; c < 3; ++c) {
+      src[3 * i + c] = opsin2.PlaneRow(c, 0)[i];
+    }
+  }
+
+  float* dst = xform.BufDst(0);
+  ASSERT_TRUE(xform.Run(0, src, dst));
+
+  Image3F out(kNumColors, 1);
+  for (size_t i = 0; i < kNumColors; ++i) {
+    for (size_t c = 0; c < 3; ++c) {
+      out.PlaneRow(c, 0)[i] = dst[3 * i + c];
+    }
+  }
+
+  auto debug_print_color = [&](size_t i) {
+    printf(
+        "(%f, %f, %f) -> (%9.6f, %f, %f) -> (%f, %f, %f) -> "
+        "(%9.6f, %9.6f, %9.6f)",
+        in.PlaneRow(0, 0)[i], in.PlaneRow(1, 0)[i], in.PlaneRow(2, 0)[i],
+        opsin.PlaneRow(0, 0)[i], opsin.PlaneRow(1, 0)[i],
+        opsin.PlaneRow(2, 0)[i], opsin2.PlaneRow(0, 0)[i],
+        opsin2.PlaneRow(1, 0)[i], opsin2.PlaneRow(2, 0)[i],
+        out.PlaneRow(0, 0)[i], out.PlaneRow(1, 0)[i], out.PlaneRow(2, 0)[i]);
+  };
+
+  float max_err[3] = {};
+  size_t max_err_i[3] = {};
+  for (size_t i = 0; i < kNumColors; ++i) {
+    for (size_t c = 0; c < 3; ++c) {
+      // debug_print_color(i); printf("\n");
+      float err = std::abs(in.PlaneRow(c, 0)[i] - out.PlaneRow(c, 0)[i]);
+      if (err > max_err[c]) {
+        max_err[c] = err;
+        max_err_i[c] = i;
+      }
+    }
+  }
+  static float kMaxError[3] = {9e-4, 4e-4, 5e-4};
+  printf("Maximum errors:\n");
+  for (size_t c = 0; c < 3; ++c) {
+    debug_print_color(max_err_i[c]);
+    printf("    %f\n", max_err[c]);
+    EXPECT_LT(max_err[c], kMaxError[c]);
+  }
+}
+
+}  // namespace
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/common.h b/third_party/jpeg-xl/lib/jxl/common.h
new file mode 100644
index 0000000000..c2ebe029a8
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/common.h
@@ -0,0 +1,245 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_COMMON_H_
+#define LIB_JXL_COMMON_H_
+
+// Shared constants and helper functions.
+
+#include <inttypes.h>
+#include <stddef.h>
+#include <stdio.h>
+
+#include <limits>  // numeric_limits
+#include <memory>  // unique_ptr
+#include <string>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/padded_bytes.h"
+
+#ifndef JXL_HIGH_PRECISION
+#define JXL_HIGH_PRECISION 1
+#endif
+
+// Macro that defines whether support for decoding JXL files to JPEG is enabled.
+#ifndef JPEGXL_ENABLE_TRANSCODE_JPEG
+#define JPEGXL_ENABLE_TRANSCODE_JPEG 1
+#endif  // JPEGXL_ENABLE_TRANSCODE_JPEG
+
+// Macro that defines whether support for decoding boxes is enabled.
+#ifndef JPEGXL_ENABLE_BOXES
+#define JPEGXL_ENABLE_BOXES 1
+#endif  // JPEGXL_ENABLE_BOXES
+
+namespace jxl {
+// Some enums and typedefs used by more than one header file.
+
+constexpr size_t kBitsPerByte = 8;  // more clear than CHAR_BIT
+
+constexpr inline size_t RoundUpBitsToByteMultiple(size_t bits) {
+  return (bits + 7) & ~size_t(7);
+}
+
+constexpr inline size_t RoundUpToBlockDim(size_t dim) {
+  return (dim + 7) & ~size_t(7);
+}
+
+static inline bool JXL_MAYBE_UNUSED SafeAdd(const uint64_t a, const uint64_t b,
+                                            uint64_t& sum) {
+  sum = a + b;
+  return sum >= a;  // no need to check b - either sum >= both or < both.
+}
+
+template <typename T1, typename T2>
+constexpr inline T1 DivCeil(T1 a, T2 b) {
+  return (a + b - 1) / b;
+}
+
+// Works for any `align`; if a power of two, compiler emits ADD+AND.
+constexpr inline size_t RoundUpTo(size_t what, size_t align) {
+  return DivCeil(what, align) * align;
+}
+
+constexpr double kPi = 3.14159265358979323846264338327950288;
+
+// Reasonable default for sRGB, matches common monitors. We map white to this
+// many nits (cd/m^2) by default. Butteraugli was tuned for 250 nits, which is
+// very close.
+static constexpr float kDefaultIntensityTarget = 255;
+
+template <typename T>
+constexpr T Pi(T multiplier) {
+  return static_cast<T>(multiplier * kPi);
+}
+
+// Block is the square grid of pixels to which an "energy compaction"
+// transformation (e.g. DCT) is applied. Each block has its own AC quantizer.
+constexpr size_t kBlockDim = 8;
+
+constexpr size_t kDCTBlockSize = kBlockDim * kBlockDim;
+
+constexpr size_t kGroupDim = 256;
+static_assert(kGroupDim % kBlockDim == 0,
+              "Group dim should be divisible by block dim");
+constexpr size_t kGroupDimInBlocks = kGroupDim / kBlockDim;
+
+// Maximum number of passes in an image.
+constexpr size_t kMaxNumPasses = 11;
+
+// Maximum number of reference frames.
+constexpr size_t kMaxNumReferenceFrames = 4;
+
+// Dimensions of a frame, in pixels, and other derived dimensions.
+// Computed from FrameHeader.
+// TODO(veluca): add extra channels.
+struct FrameDimensions {
+  void Set(size_t xsize, size_t ysize, size_t group_size_shift,
+           size_t max_hshift, size_t max_vshift, bool modular_mode,
+           size_t upsampling) {
+    group_dim = (kGroupDim >> 1) << group_size_shift;
+    dc_group_dim = group_dim * kBlockDim;
+    xsize_upsampled = xsize;
+    ysize_upsampled = ysize;
+    this->xsize = DivCeil(xsize, upsampling);
+    this->ysize = DivCeil(ysize, upsampling);
+    xsize_blocks = DivCeil(this->xsize, kBlockDim << max_hshift) << max_hshift;
+    ysize_blocks = DivCeil(this->ysize, kBlockDim << max_vshift) << max_vshift;
+    xsize_padded = xsize_blocks * kBlockDim;
+    ysize_padded = ysize_blocks * kBlockDim;
+    if (modular_mode) {
+      // Modular mode doesn't have any padding.
+      xsize_padded = this->xsize;
+      ysize_padded = this->ysize;
+    }
+    xsize_upsampled_padded = xsize_padded * upsampling;
+    ysize_upsampled_padded = ysize_padded * upsampling;
+    xsize_groups = DivCeil(this->xsize, group_dim);
+    ysize_groups = DivCeil(this->ysize, group_dim);
+    xsize_dc_groups = DivCeil(xsize_blocks, group_dim);
+    ysize_dc_groups = DivCeil(ysize_blocks, group_dim);
+    num_groups = xsize_groups * ysize_groups;
+    num_dc_groups = xsize_dc_groups * ysize_dc_groups;
+  }
+
+  // Image size without any upsampling, i.e. original_size / upsampling.
+  size_t xsize;
+  size_t ysize;
+  // Original image size.
+  size_t xsize_upsampled;
+  size_t ysize_upsampled;
+  // Image size after upsampling the padded image.
+  size_t xsize_upsampled_padded;
+  size_t ysize_upsampled_padded;
+  // Image size after padding to a multiple of kBlockDim (if VarDCT mode).
+  size_t xsize_padded;
+  size_t ysize_padded;
+  // Image size in kBlockDim blocks.
+  size_t xsize_blocks;
+  size_t ysize_blocks;
+  // Image size in number of groups.
+  size_t xsize_groups;
+  size_t ysize_groups;
+  // Image size in number of DC groups.
+  size_t xsize_dc_groups;
+  size_t ysize_dc_groups;
+  // Number of AC or DC groups.
+  size_t num_groups;
+  size_t num_dc_groups;
+  // Size of a group.
+  size_t group_dim;
+  size_t dc_group_dim;
+};
+
+// Prior to C++14 (i.e. C++11): provide our own make_unique
+#if __cplusplus < 201402L
+template <typename T, typename... Args>
+std::unique_ptr<T> make_unique(Args&&... args) {
+  return std::unique_ptr<T>(new T(std::forward<Args>(args)...));
+}
+#else
+using std::make_unique;
+#endif
+
+template <typename T>
+JXL_INLINE T Clamp1(T val, T low, T hi) {
+  return val < low ? low : val > hi ? hi : val;
+}
+
+// Encodes non-negative (X) into (2 * X), negative (-X) into (2 * X - 1)
+constexpr uint32_t PackSigned(int32_t value)
+    JXL_NO_SANITIZE("unsigned-integer-overflow") {
+  return (static_cast<uint32_t>(value) << 1) ^
+         ((static_cast<uint32_t>(~value) >> 31) - 1);
+}
+
+// Reverse to PackSigned, i.e. UnpackSigned(PackSigned(X)) == X.
+// (((~value) & 1) - 1) is either 0 or 0xFF...FF and it will have an expected
+// unsigned-integer-overflow.
+constexpr intptr_t UnpackSigned(size_t value)
+    JXL_NO_SANITIZE("unsigned-integer-overflow") {
+  return static_cast<intptr_t>((value >> 1) ^ (((~value) & 1) - 1));
+}
+
+// conversion from integer to string.
+template <typename T>
+std::string ToString(T n) {
+  char data[32] = {};
+  if (T(0.1) != T(0)) {
+    // float
+    snprintf(data, sizeof(data), "%g", static_cast<double>(n));
+  } else if (T(-1) > T(0)) {
+    // unsigned
+    snprintf(data, sizeof(data), "%llu", static_cast<unsigned long long>(n));
+  } else {
+    // signed
+    snprintf(data, sizeof(data), "%lld", static_cast<long long>(n));
+  }
+  return data;
+}
+
+static inline JXL_MAYBE_UNUSED uint64_t DecodeVarInt(const uint8_t* input,
+                                                     size_t inputSize,
+                                                     size_t* pos) {
+  size_t i;
+  uint64_t ret = 0;
+  for (i = 0; *pos + i < inputSize && i < 10; ++i) {
+    ret |= uint64_t(input[*pos + i] & 127) << uint64_t(7 * i);
+    // If the next-byte flag is not set, stop
+    if ((input[*pos + i] & 128) == 0) break;
+  }
+  // TODO: Return a decoding error if i == 10.
+  *pos += i + 1;
+  return ret;
+}
+
+static inline JXL_MAYBE_UNUSED bool EncodeVarInt(uint64_t value,
+                                                 size_t output_size,
+                                                 size_t* output_pos,
+                                                 uint8_t* output) {
+  // While more than 7 bits of data are left,
+  // store 7 bits and set the next byte flag
+  while (value > 127) {
+    if (*output_pos > output_size) return false;
+    // |128: Set the next byte flag
+    output[(*output_pos)++] = ((uint8_t)(value & 127)) | 128;
+    // Remove the seven bits we just wrote
+    value >>= 7;
+  }
+  if (*output_pos > output_size) return false;
+  output[(*output_pos)++] = ((uint8_t)value) & 127;
+  return true;
+}
+
+static inline JXL_MAYBE_UNUSED void EncodeVarInt(uint64_t value,
+                                                 PaddedBytes* data) {
+  size_t pos = data->size();
+  data->resize(data->size() + 9);
+  JXL_CHECK(EncodeVarInt(value, data->size(), &pos, data->data()));
+  data->resize(pos);
+}
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_COMMON_H_
diff --git a/third_party/jpeg-xl/lib/jxl/compressed_dc.cc b/third_party/jpeg-xl/lib/jxl/compressed_dc.cc
new file mode 100644
index 0000000000..f9a8f149dd
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/compressed_dc.cc
@@ -0,0 +1,318 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/compressed_dc.h"
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <algorithm>
+#include <array>
+#include <memory>
+#include <utility>
+#include <vector>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/compressed_dc.cc"
+#include <hwy/aligned_allocator.h>
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/ac_strategy.h"
+#include "lib/jxl/ans_params.h"
+#include "lib/jxl/base/bits.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/padded_bytes.h"
+#include "lib/jxl/base/profiler.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/chroma_from_luma.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/dec_ans.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/dec_cache.h"
+#include "lib/jxl/entropy_coder.h"
+#include "lib/jxl/image.h"
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+using D = HWY_FULL(float);
+using DScalar = HWY_CAPPED(float, 1);
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Abs;
+using hwy::HWY_NAMESPACE::Add;
+using hwy::HWY_NAMESPACE::Div;
+using hwy::HWY_NAMESPACE::Max;
+using hwy::HWY_NAMESPACE::Mul;
+using hwy::HWY_NAMESPACE::MulAdd;
+using hwy::HWY_NAMESPACE::Rebind;
+using hwy::HWY_NAMESPACE::Sub;
+using hwy::HWY_NAMESPACE::Vec;
+using hwy::HWY_NAMESPACE::ZeroIfNegative;
+
+// TODO(veluca): optimize constants.
+const float w1 = 0.20345139757231578f;
+const float w2 = 0.0334829185968739f;
+const float w0 = 1.0f - 4.0f * (w1 + w2);
+
+template <class V>
+V MaxWorkaround(V a, V b) {
+#if (HWY_TARGET == HWY_AVX3) && HWY_COMPILER_CLANG <= 800
+  // Prevents "Do not know how to split the result of this operator" error
+  return IfThenElse(a > b, a, b);
+#else
+  return Max(a, b);
+#endif
+}
+
+template <typename D>
+JXL_INLINE void ComputePixelChannel(const D d, const float dc_factor,
+                                    const float* JXL_RESTRICT row_top,
+                                    const float* JXL_RESTRICT row,
+                                    const float* JXL_RESTRICT row_bottom,
+                                    Vec<D>* JXL_RESTRICT mc,
+                                    Vec<D>* JXL_RESTRICT sm,
+                                    Vec<D>* JXL_RESTRICT gap, size_t x) {
+  const auto tl = LoadU(d, row_top + x - 1);
+  const auto tc = Load(d, row_top + x);
+  const auto tr = LoadU(d, row_top + x + 1);
+
+  const auto ml = LoadU(d, row + x - 1);
+  *mc = Load(d, row + x);
+  const auto mr = LoadU(d, row + x + 1);
+
+  const auto bl = LoadU(d, row_bottom + x - 1);
+  const auto bc = Load(d, row_bottom + x);
+  const auto br = LoadU(d, row_bottom + x + 1);
+
+  const auto w_center = Set(d, w0);
+  const auto w_side = Set(d, w1);
+  const auto w_corner = Set(d, w2);
+
+  const auto corner = Add(Add(tl, tr), Add(bl, br));
+  const auto side = Add(Add(ml, mr), Add(tc, bc));
+  *sm = MulAdd(corner, w_corner, MulAdd(side, w_side, Mul(*mc, w_center)));
+
+  const auto dc_quant = Set(d, dc_factor);
+  *gap = MaxWorkaround(*gap, Abs(Div(Sub(*mc, *sm), dc_quant)));
+}
+
+template <typename D>
+JXL_INLINE void ComputePixel(
+    const float* JXL_RESTRICT dc_factors,
+    const float* JXL_RESTRICT* JXL_RESTRICT rows_top,
+    const float* JXL_RESTRICT* JXL_RESTRICT rows,
+    const float* JXL_RESTRICT* JXL_RESTRICT rows_bottom,
+    float* JXL_RESTRICT* JXL_RESTRICT out_rows, size_t x) {
+  const D d;
+  auto mc_x = Undefined(d);
+  auto mc_y = Undefined(d);
+  auto mc_b = Undefined(d);
+  auto sm_x = Undefined(d);
+  auto sm_y = Undefined(d);
+  auto sm_b = Undefined(d);
+  auto gap = Set(d, 0.5f);
+  ComputePixelChannel(d, dc_factors[0], rows_top[0], rows[0], rows_bottom[0],
+                      &mc_x, &sm_x, &gap, x);
+  ComputePixelChannel(d, dc_factors[1], rows_top[1], rows[1], rows_bottom[1],
+                      &mc_y, &sm_y, &gap, x);
+  ComputePixelChannel(d, dc_factors[2], rows_top[2], rows[2], rows_bottom[2],
+                      &mc_b, &sm_b, &gap, x);
+  auto factor = MulAdd(Set(d, -4.0f), gap, Set(d, 3.0f));
+  factor = ZeroIfNegative(factor);
+
+  auto out = MulAdd(Sub(sm_x, mc_x), factor, mc_x);
+  Store(out, d, out_rows[0] + x);
+  out = MulAdd(Sub(sm_y, mc_y), factor, mc_y);
+  Store(out, d, out_rows[1] + x);
+  out = MulAdd(Sub(sm_b, mc_b), factor, mc_b);
+  Store(out, d, out_rows[2] + x);
+}
+
+void AdaptiveDCSmoothing(const float* dc_factors, Image3F* dc,
+                         ThreadPool* pool) {
+  const size_t xsize = dc->xsize();
+  const size_t ysize = dc->ysize();
+  if (ysize <= 2 || xsize <= 2) return;
+
+  // TODO(veluca): use tile-based processing?
+  // TODO(veluca): decide if changes to the y channel should be propagated to
+  // the x and b channels through color correlation.
+  JXL_ASSERT(w1 + w2 < 0.25f);
+
+  PROFILER_FUNC;
+
+  Image3F smoothed(xsize, ysize);
+  // Fill in borders that the loop below will not. First and last are unused.
+  for (size_t c = 0; c < 3; c++) {
+    for (size_t y : {size_t(0), ysize - 1}) {
+      memcpy(smoothed.PlaneRow(c, y), dc->PlaneRow(c, y),
+             xsize * sizeof(float));
+    }
+  }
+  auto process_row = [&](const uint32_t y, size_t /*thread*/) {
+    const float* JXL_RESTRICT rows_top[3]{
+        dc->ConstPlaneRow(0, y - 1),
+        dc->ConstPlaneRow(1, y - 1),
+        dc->ConstPlaneRow(2, y - 1),
+    };
+    const float* JXL_RESTRICT rows[3] = {
+        dc->ConstPlaneRow(0, y),
+        dc->ConstPlaneRow(1, y),
+        dc->ConstPlaneRow(2, y),
+    };
+    const float* JXL_RESTRICT rows_bottom[3] = {
+        dc->ConstPlaneRow(0, y + 1),
+        dc->ConstPlaneRow(1, y + 1),
+        dc->ConstPlaneRow(2, y + 1),
+    };
+    float* JXL_RESTRICT rows_out[3] = {
+        smoothed.PlaneRow(0, y),
+        smoothed.PlaneRow(1, y),
+        smoothed.PlaneRow(2, y),
+    };
+    for (size_t x : {size_t(0), xsize - 1}) {
+      for (size_t c = 0; c < 3; c++) {
+        rows_out[c][x] = rows[c][x];
+      }
+    }
+
+    size_t x = 1;
+    // First pixels
+    const size_t N = Lanes(D());
+    for (; x < std::min(N, xsize - 1); x++) {
+      ComputePixel<DScalar>(dc_factors, rows_top, rows, rows_bottom, rows_out,
+                            x);
+    }
+    // Full vectors.
+    for (; x + N <= xsize - 1; x += N) {
+      ComputePixel<D>(dc_factors, rows_top, rows, rows_bottom, rows_out, x);
+    }
+    // Last pixels.
+    for (; x < xsize - 1; x++) {
+      ComputePixel<DScalar>(dc_factors, rows_top, rows, rows_bottom, rows_out,
+                            x);
+    }
+  };
+  JXL_CHECK(RunOnPool(pool, 1, ysize - 1, ThreadPool::NoInit, process_row,
+                      "DCSmoothingRow"));
+  dc->Swap(smoothed);
+}
+
+// DC dequantization.
+void DequantDC(const Rect& r, Image3F* dc, ImageB* quant_dc, const Image& in,
+               const float* dc_factors, float mul, const float* cfl_factors,
+               YCbCrChromaSubsampling chroma_subsampling,
+               const BlockCtxMap& bctx) {
+  const HWY_FULL(float) df;
+  const Rebind<pixel_type, HWY_FULL(float)> di;  // assumes pixel_type <= float
+  if (chroma_subsampling.Is444()) {
+    const auto fac_x = Set(df, dc_factors[0] * mul);
+    const auto fac_y = Set(df, dc_factors[1] * mul);
+    const auto fac_b = Set(df, dc_factors[2] * mul);
+    const auto cfl_fac_x = Set(df, cfl_factors[0]);
+    const auto cfl_fac_b = Set(df, cfl_factors[2]);
+    for (size_t y = 0; y < r.ysize(); y++) {
+      float* dec_row_x = r.PlaneRow(dc, 0, y);
+      float* dec_row_y = r.PlaneRow(dc, 1, y);
+      float* dec_row_b = r.PlaneRow(dc, 2, y);
+      const int32_t* quant_row_x = in.channel[1].plane.Row(y);
+      const int32_t* quant_row_y = in.channel[0].plane.Row(y);
+      const int32_t* quant_row_b = in.channel[2].plane.Row(y);
+      for (size_t x = 0; x < r.xsize(); x += Lanes(di)) {
+        const auto in_q_x = Load(di, quant_row_x + x);
+        const auto in_q_y = Load(di, quant_row_y + x);
+        const auto in_q_b = Load(di, quant_row_b + x);
+        const auto in_x = Mul(ConvertTo(df, in_q_x), fac_x);
+        const auto in_y = Mul(ConvertTo(df, in_q_y), fac_y);
+        const auto in_b = Mul(ConvertTo(df, in_q_b), fac_b);
+        Store(in_y, df, dec_row_y + x);
+        Store(MulAdd(in_y, cfl_fac_x, in_x), df, dec_row_x + x);
+        Store(MulAdd(in_y, cfl_fac_b, in_b), df, dec_row_b + x);
+      }
+    }
+  } else {
+    for (size_t c : {1, 0, 2}) {
+      Rect rect(r.x0() >> chroma_subsampling.HShift(c),
+                r.y0() >> chroma_subsampling.VShift(c),
+                r.xsize() >> chroma_subsampling.HShift(c),
+                r.ysize() >> chroma_subsampling.VShift(c));
+      const auto fac = Set(df, dc_factors[c] * mul);
+      const Channel& ch = in.channel[c < 2 ? c ^ 1 : c];
+      for (size_t y = 0; y < rect.ysize(); y++) {
+        const int32_t* quant_row = ch.plane.Row(y);
+        float* row = rect.PlaneRow(dc, c, y);
+        for (size_t x = 0; x < rect.xsize(); x += Lanes(di)) {
+          const auto in_q = Load(di, quant_row + x);
+          const auto in = Mul(ConvertTo(df, in_q), fac);
+          Store(in, df, row + x);
+        }
+      }
+    }
+  }
+  if (bctx.num_dc_ctxs <= 1) {
+    for (size_t y = 0; y < r.ysize(); y++) {
+      uint8_t* qdc_row = r.Row(quant_dc, y);
+      memset(qdc_row, 0, sizeof(*qdc_row) * r.xsize());
+    }
+  } else {
+    for (size_t y = 0; y < r.ysize(); y++) {
+      uint8_t* qdc_row_val = r.Row(quant_dc, y);
+      const int32_t* quant_row_x =
+          in.channel[1].plane.Row(y >> chroma_subsampling.VShift(0));
+      const int32_t* quant_row_y =
+          in.channel[0].plane.Row(y >> chroma_subsampling.VShift(1));
+      const int32_t* quant_row_b =
+          in.channel[2].plane.Row(y >> chroma_subsampling.VShift(2));
+      for (size_t x = 0; x < r.xsize(); x++) {
+        int bucket_x = 0, bucket_y = 0, bucket_b = 0;
+        for (int t : bctx.dc_thresholds[0]) {
+          if (quant_row_x[x >> chroma_subsampling.HShift(0)] > t) bucket_x++;
+        }
+        for (int t : bctx.dc_thresholds[1]) {
+          if (quant_row_y[x >> chroma_subsampling.HShift(1)] > t) bucket_y++;
+        }
+        for (int t : bctx.dc_thresholds[2]) {
+          if (quant_row_b[x >> chroma_subsampling.HShift(2)] > t) bucket_b++;
+        }
+        int bucket = bucket_x;
+        bucket *= bctx.dc_thresholds[2].size() + 1;
+        bucket += bucket_b;
+        bucket *= bctx.dc_thresholds[1].size() + 1;
+        bucket += bucket_y;
+        qdc_row_val[x] = bucket;
+      }
+    }
+  }
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+HWY_EXPORT(DequantDC);
+HWY_EXPORT(AdaptiveDCSmoothing);
+void AdaptiveDCSmoothing(const float* dc_factors, Image3F* dc,
+                         ThreadPool* pool) {
+  return HWY_DYNAMIC_DISPATCH(AdaptiveDCSmoothing)(dc_factors, dc, pool);
+}
+
+void DequantDC(const Rect& r, Image3F* dc, ImageB* quant_dc, const Image& in,
+               const float* dc_factors, float mul, const float* cfl_factors,
+               YCbCrChromaSubsampling chroma_subsampling,
+               const BlockCtxMap& bctx) {
+  return HWY_DYNAMIC_DISPATCH(DequantDC)(r, dc, quant_dc, in, dc_factors, mul,
+                                         cfl_factors, chroma_subsampling, bctx);
+}
+
+}  // namespace jxl
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jxl/compressed_dc.h b/third_party/jpeg-xl/lib/jxl/compressed_dc.h
new file mode 100644
index 0000000000..b06e5931f0
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/compressed_dc.h
@@ -0,0 +1,34 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_COMPRESSED_DC_H_
+#define LIB_JXL_COMPRESSED_DC_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "lib/jxl/ac_context.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/frame_header.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/modular/modular_image.h"
+
+// DC handling functions: encoding and decoding of DC to and from bitstream, and
+// related function to initialize the per-group decoder cache.
+
+namespace jxl {
+
+// Smooth DC in already-smooth areas, to counteract banding.
+void AdaptiveDCSmoothing(const float* dc_factors, Image3F* dc,
+                         ThreadPool* pool);
+
+void DequantDC(const Rect& r, Image3F* dc, ImageB* quant_dc, const Image& in,
+               const float* dc_factors, float mul, const float* cfl_factors,
+               YCbCrChromaSubsampling chroma_subsampling,
+               const BlockCtxMap& bctx);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_COMPRESSED_DC_H_
diff --git a/third_party/jpeg-xl/lib/jxl/convolve-inl.h b/third_party/jpeg-xl/lib/jxl/convolve-inl.h
new file mode 100644
index 0000000000..054c9c6f0d
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/convolve-inl.h
@@ -0,0 +1,297 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#if defined(LIB_JXL_CONVOLVE_INL_H_) == defined(HWY_TARGET_TOGGLE)
+#ifdef LIB_JXL_CONVOLVE_INL_H_
+#undef LIB_JXL_CONVOLVE_INL_H_
+#else
+#define LIB_JXL_CONVOLVE_INL_H_
+#endif
+
+#include <hwy/highway.h>
+
+#include "lib/jxl/base/profiler.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/image_ops.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+namespace {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Broadcast;
+#if HWY_TARGET != HWY_SCALAR
+using hwy::HWY_NAMESPACE::CombineShiftRightBytes;
+#endif
+using hwy::HWY_NAMESPACE::TableLookupLanes;
+using hwy::HWY_NAMESPACE::Vec;
+
+// Synthesizes left/right neighbors from a vector of center pixels.
+class Neighbors {
+ public:
+  using D = HWY_CAPPED(float, 16);
+  using V = Vec<D>;
+
+  // Returns l[i] == c[Mirror(i - 1)].
+  HWY_INLINE HWY_MAYBE_UNUSED static V FirstL1(const V c) {
+#if HWY_CAP_GE256
+    const D d;
+    HWY_ALIGN constexpr int32_t lanes[16] = {0, 0, 1, 2,  3,  4,  5,  6,
+                                             7, 8, 9, 10, 11, 12, 13, 14};
+    const auto indices = SetTableIndices(d, lanes);
+    // c = PONM'LKJI
+    return TableLookupLanes(c, indices);  // ONML'KJII
+#elif HWY_TARGET == HWY_SCALAR
+    return c;  // Same (the first mirrored value is the last valid one)
+#else  // 128 bit
+    // c = LKJI
+#if HWY_TARGET <= (1 << HWY_HIGHEST_TARGET_BIT_X86)
+    return V{_mm_shuffle_ps(c.raw, c.raw, _MM_SHUFFLE(2, 1, 0, 0))};  // KJII
+#else
+    const D d;
+    // TODO(deymo): Figure out if this can be optimized using a single vsri
+    // instruction to convert LKJI to KJII.
+    HWY_ALIGN constexpr int lanes[4] = {0, 0, 1, 2};  // KJII
+    const auto indices = SetTableIndices(d, lanes);
+    return TableLookupLanes(c, indices);
+#endif
+#endif
+  }
+
+  // Returns l[i] == c[Mirror(i - 2)].
+  HWY_INLINE HWY_MAYBE_UNUSED static V FirstL2(const V c) {
+#if HWY_CAP_GE256
+    const D d;
+    HWY_ALIGN constexpr int32_t lanes[16] = {1, 0, 0, 1, 2,  3,  4,  5,
+                                             6, 7, 8, 9, 10, 11, 12, 13};
+    const auto indices = SetTableIndices(d, lanes);
+    // c = PONM'LKJI
+    return TableLookupLanes(c, indices);  // NMLK'JIIJ
+#elif HWY_TARGET == HWY_SCALAR
+    const D d;
+    JXL_ASSERT(false);  // unsupported, avoid calling this.
+    return Zero(d);
+#else  // 128 bit
+    // c = LKJI
+#if HWY_TARGET <= (1 << HWY_HIGHEST_TARGET_BIT_X86)
+    return V{_mm_shuffle_ps(c.raw, c.raw, _MM_SHUFFLE(1, 0, 0, 1))};  // JIIJ
+#else
+    const D d;
+    HWY_ALIGN constexpr int lanes[4] = {1, 0, 0, 1};  // JIIJ
+    const auto indices = SetTableIndices(d, lanes);
+    return TableLookupLanes(c, indices);
+#endif
+#endif
+  }
+
+  // Returns l[i] == c[Mirror(i - 3)].
+  HWY_INLINE HWY_MAYBE_UNUSED static V FirstL3(const V c) {
+#if HWY_CAP_GE256
+    const D d;
+    HWY_ALIGN constexpr int32_t lanes[16] = {2, 1, 0, 0, 1, 2,  3,  4,
+                                             5, 6, 7, 8, 9, 10, 11, 12};
+    const auto indices = SetTableIndices(d, lanes);
+    // c = PONM'LKJI
+    return TableLookupLanes(c, indices);  // MLKJ'IIJK
+#elif HWY_TARGET == HWY_SCALAR
+    const D d;
+    JXL_ASSERT(false);  // unsupported, avoid calling this.
+    return Zero(d);
+#else  // 128 bit
+    // c = LKJI
+#if HWY_TARGET <= (1 << HWY_HIGHEST_TARGET_BIT_X86)
+    return V{_mm_shuffle_ps(c.raw, c.raw, _MM_SHUFFLE(0, 0, 1, 2))};  // IIJK
+#else
+    const D d;
+    HWY_ALIGN constexpr int lanes[4] = {2, 1, 0, 0};  // IIJK
+    const auto indices = SetTableIndices(d, lanes);
+    return TableLookupLanes(c, indices);
+#endif
+#endif
+  }
+};
+
+#if HWY_TARGET != HWY_SCALAR
+
+// Returns indices for SetTableIndices such that TableLookupLanes on the
+// rightmost unaligned vector (rightmost sample in its most-significant lane)
+// returns the mirrored values, with the mirror outside the last valid sample.
+static inline const int32_t* MirrorLanes(const size_t mod) {
+  const HWY_CAPPED(float, 16) d;
+  constexpr size_t kN = MaxLanes(d);
+
+  // For mod = `image width mod 16` 0..15:
+  // last full vec     mirrored (mem order)  loadedVec  mirrorVec  idxVec
+  // 0123456789abcdef| fedcba9876543210      fed..210   012..def   012..def
+  // 0123456789abcdef|0 0fedcba98765432      0fe..321   234..f00   123..eff
+  // 0123456789abcdef|01 10fedcba987654      10f..432   456..110   234..ffe
+  // 0123456789abcdef|012 210fedcba9876      210..543   67..2210   34..ffed
+  // 0123456789abcdef|0123 3210fedcba98      321..654   8..33210   4..ffedc
+  // 0123456789abcdef|01234 43210fedcba
+  // 0123456789abcdef|012345 543210fedc
+  // 0123456789abcdef|0123456 6543210fe
+  // 0123456789abcdef|01234567 76543210
+  // 0123456789abcdef|012345678 8765432
+  // 0123456789abcdef|0123456789 987654
+  // 0123456789abcdef|0123456789A A9876
+  // 0123456789abcdef|0123456789AB BA98
+  // 0123456789abcdef|0123456789ABC CBA
+  // 0123456789abcdef|0123456789ABCD DC
+  // 0123456789abcdef|0123456789ABCDE E      EDC..10f   EED..210   ffe..321
+#if HWY_CAP_GE512
+  HWY_ALIGN static constexpr int32_t idx_lanes[2 * kN - 1] = {
+      1,  2,  3,  4,  5,  6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15,  //
+      14, 13, 12, 11, 10, 9, 8, 7, 6, 5,  4,  3,  2,  1,  0};
+#elif HWY_CAP_GE256
+  HWY_ALIGN static constexpr int32_t idx_lanes[2 * kN - 1] = {
+      1, 2, 3, 4, 5, 6, 7, 7,  //
+      6, 5, 4, 3, 2, 1, 0};
+#else  // 128-bit
+  HWY_ALIGN static constexpr int32_t idx_lanes[2 * kN - 1] = {1, 2, 3, 3,  //
+                                                              2, 1, 0};
+#endif
+  return idx_lanes + kN - 1 - mod;
+}
+
+#endif  // HWY_TARGET != HWY_SCALAR
+
+// Single entry point for convolution.
+// "Strategy" (Direct*/Separable*) decides kernel size and how to evaluate it.
+template <class Strategy>
+class ConvolveT {
+  static constexpr int64_t kRadius = Strategy::kRadius;
+  using Simd = HWY_CAPPED(float, 16);
+
+ public:
+  static size_t MinWidth() {
+#if HWY_TARGET == HWY_SCALAR
+    // First/Last use mirrored loads of up to +/- kRadius.
+    return 2 * kRadius;
+#else
+    return Lanes(Simd()) + kRadius;
+#endif
+  }
+
+  // "Image" is ImageF or Image3F.
+  template <class Image, class Weights>
+  static void Run(const Image& in, const Rect& rect, const Weights& weights,
+                  ThreadPool* pool, Image* out) {
+    PROFILER_ZONE("ConvolveT::Run");
+    JXL_CHECK(SameSize(rect, *out));
+    JXL_CHECK(rect.xsize() >= MinWidth());
+
+    static_assert(int64_t(kRadius) <= 3,
+                  "Must handle [0, kRadius) and >= kRadius");
+    switch (rect.xsize() % Lanes(Simd())) {
+      case 0:
+        return RunRows<0>(in, rect, weights, pool, out);
+      case 1:
+        return RunRows<1>(in, rect, weights, pool, out);
+      case 2:
+        return RunRows<2>(in, rect, weights, pool, out);
+      default:
+        return RunRows<3>(in, rect, weights, pool, out);
+    }
+  }
+
+ private:
+  template <size_t kSizeModN, class WrapRow, class Weights>
+  static JXL_INLINE void RunRow(const float* JXL_RESTRICT in,
+                                const size_t xsize, const int64_t stride,
+                                const WrapRow& wrap_row, const Weights& weights,
+                                float* JXL_RESTRICT out) {
+    Strategy::template ConvolveRow<kSizeModN>(in, xsize, stride, wrap_row,
+                                              weights, out);
+  }
+
+  template <size_t kSizeModN, class Weights>
+  static JXL_INLINE void RunBorderRows(const ImageF& in, const Rect& rect,
+                                       const int64_t ybegin, const int64_t yend,
+                                       const Weights& weights, ImageF* out) {
+    const int64_t stride = in.PixelsPerRow();
+    const WrapRowMirror wrap_row(in, rect.ysize());
+    for (int64_t y = ybegin; y < yend; ++y) {
+      RunRow<kSizeModN>(rect.ConstRow(in, y), rect.xsize(), stride, wrap_row,
+                        weights, out->Row(y));
+    }
+  }
+
+  // Image3F.
+  template <size_t kSizeModN, class Weights>
+  static JXL_INLINE void RunBorderRows(const Image3F& in, const Rect& rect,
+                                       const int64_t ybegin, const int64_t yend,
+                                       const Weights& weights, Image3F* out) {
+    const int64_t stride = in.PixelsPerRow();
+    for (int64_t y = ybegin; y < yend; ++y) {
+      for (size_t c = 0; c < 3; ++c) {
+        const WrapRowMirror wrap_row(in.Plane(c), rect.ysize());
+        RunRow<kSizeModN>(rect.ConstPlaneRow(in, c, y), rect.xsize(), stride,
+                          wrap_row, weights, out->PlaneRow(c, y));
+      }
+    }
+  }
+
+  template <size_t kSizeModN, class Weights>
+  static JXL_INLINE void RunInteriorRows(const ImageF& in, const Rect& rect,
+                                         const int64_t ybegin,
+                                         const int64_t yend,
+                                         const Weights& weights,
+                                         ThreadPool* pool, ImageF* out) {
+    const int64_t stride = in.PixelsPerRow();
+    JXL_CHECK(RunOnPool(
+        pool, ybegin, yend, ThreadPool::NoInit,
+        [&](const uint32_t y, size_t /*thread*/) HWY_ATTR {
+          RunRow<kSizeModN>(rect.ConstRow(in, y), rect.xsize(), stride,
+                            WrapRowUnchanged(), weights, out->Row(y));
+        },
+        "Convolve"));
+  }
+
+  // Image3F.
+  template <size_t kSizeModN, class Weights>
+  static JXL_INLINE void RunInteriorRows(const Image3F& in, const Rect& rect,
+                                         const int64_t ybegin,
+                                         const int64_t yend,
+                                         const Weights& weights,
+                                         ThreadPool* pool, Image3F* out) {
+    const int64_t stride = in.PixelsPerRow();
+    JXL_CHECK(RunOnPool(
+        pool, ybegin, yend, ThreadPool::NoInit,
+        [&](const uint32_t y, size_t /*thread*/) HWY_ATTR {
+          for (size_t c = 0; c < 3; ++c) {
+            RunRow<kSizeModN>(rect.ConstPlaneRow(in, c, y), rect.xsize(),
+                              stride, WrapRowUnchanged(), weights,
+                              out->PlaneRow(c, y));
+          }
+        },
+        "Convolve3"));
+  }
+
+  template <size_t kSizeModN, class Image, class Weights>
+  static JXL_INLINE void RunRows(const Image& in, const Rect& rect,
+                                 const Weights& weights, ThreadPool* pool,
+                                 Image* out) {
+    const int64_t ysize = rect.ysize();
+    RunBorderRows<kSizeModN>(in, rect, 0, std::min(int64_t(kRadius), ysize),
+                             weights, out);
+    if (ysize > 2 * int64_t(kRadius)) {
+      RunInteriorRows<kSizeModN>(in, rect, int64_t(kRadius),
+                                 ysize - int64_t(kRadius), weights, pool, out);
+    }
+    if (ysize > int64_t(kRadius)) {
+      RunBorderRows<kSizeModN>(in, rect, ysize - int64_t(kRadius), ysize,
+                               weights, out);
+    }
+  }
+};
+
+}  // namespace
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#endif  // LIB_JXL_CONVOLVE_INL_H_
diff --git a/third_party/jpeg-xl/lib/jxl/convolve.h b/third_party/jpeg-xl/lib/jxl/convolve.h
new file mode 100644
index 0000000000..2fcd2d0980
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/convolve.h
@@ -0,0 +1,105 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_CONVOLVE_H_
+#define LIB_JXL_CONVOLVE_H_
+
+// 2D convolution.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/image.h"
+
+namespace jxl {
+
+// No valid values outside [0, xsize), but the strategy may still safely load
+// the preceding vector, and/or round xsize up to the vector lane count. This
+// avoids needing PadImage.
+// Requires xsize >= kConvolveLanes + kConvolveMaxRadius.
+static constexpr size_t kConvolveMaxRadius = 3;
+
+// Weights must already be normalized.
+
+struct WeightsSymmetric3 {
+  // d r d (each replicated 4x)
+  // r c r
+  // d r d
+  float c[4];
+  float r[4];
+  float d[4];
+};
+
+struct WeightsSymmetric5 {
+  // The lower-right quadrant is: c r R  (each replicated 4x)
+  //                              r d L
+  //                              R L D
+  float c[4];
+  float r[4];
+  float R[4];
+  float d[4];
+  float D[4];
+  float L[4];
+};
+
+// Weights for separable 5x5 filters (typically but not necessarily the same
+// values for horizontal and vertical directions). The kernel must already be
+// normalized, but note that values for negative offsets are omitted, so the
+// given values do not sum to 1.
+struct WeightsSeparable5 {
+  // Horizontal 1D, distances 0..2 (each replicated 4x)
+  float horz[3 * 4];
+  float vert[3 * 4];
+};
+
+// Weights for separable 7x7 filters (typically but not necessarily the same
+// values for horizontal and vertical directions). The kernel must already be
+// normalized, but note that values for negative offsets are omitted, so the
+// given values do not sum to 1.
+//
+// NOTE: for >= 7x7 Gaussian kernels, it is faster to use FastGaussian instead,
+// at least when images exceed the L1 cache size.
+struct WeightsSeparable7 {
+  // Horizontal 1D, distances 0..3 (each replicated 4x)
+  float horz[4 * 4];
+  float vert[4 * 4];
+};
+
+const WeightsSymmetric3& WeightsSymmetric3Lowpass();
+const WeightsSeparable5& WeightsSeparable5Lowpass();
+const WeightsSymmetric5& WeightsSymmetric5Lowpass();
+
+void SlowSymmetric3(const ImageF& in, const Rect& rect,
+                    const WeightsSymmetric3& weights, ThreadPool* pool,
+                    ImageF* JXL_RESTRICT out);
+
+void SlowSeparable5(const ImageF& in, const Rect& rect,
+                    const WeightsSeparable5& weights, ThreadPool* pool,
+                    ImageF* out);
+
+void SlowSeparable7(const ImageF& in, const Rect& rect,
+                    const WeightsSeparable7& weights, ThreadPool* pool,
+                    ImageF* out);
+
+void Symmetric3(const ImageF& in, const Rect& rect,
+                const WeightsSymmetric3& weights, ThreadPool* pool,
+                ImageF* out);
+
+void Symmetric5(const ImageF& in, const Rect& rect,
+                const WeightsSymmetric5& weights, ThreadPool* pool,
+                ImageF* JXL_RESTRICT out);
+
+void Separable5(const ImageF& in, const Rect& rect,
+                const WeightsSeparable5& weights, ThreadPool* pool,
+                ImageF* out);
+
+void Separable7(const ImageF& in, const Rect& rect,
+                const WeightsSeparable7& weights, ThreadPool* pool,
+                ImageF* out);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_CONVOLVE_H_
diff --git a/third_party/jpeg-xl/lib/jxl/convolve_separable5.cc b/third_party/jpeg-xl/lib/jxl/convolve_separable5.cc
new file mode 100644
index 0000000000..b26ff54bbc
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/convolve_separable5.cc
@@ -0,0 +1,261 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/convolve.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/convolve_separable5.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/convolve-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Add;
+using hwy::HWY_NAMESPACE::Mul;
+using hwy::HWY_NAMESPACE::MulAdd;
+using hwy::HWY_NAMESPACE::Vec;
+
+// 5x5 convolution by separable kernel with a single scan through the input.
+// This is more cache-efficient than separate horizontal/vertical passes, and
+// possibly faster (given enough registers) than tiling and/or transposing.
+//
+// Overview: imagine a 5x5 window around a central pixel. First convolve the
+// rows by multiplying the pixels with the corresponding weights from
+// WeightsSeparable5.horz[abs(x_offset) * 4]. Then multiply each of these
+// intermediate results by the corresponding vertical weight, i.e.
+// vert[abs(y_offset) * 4]. Finally, store the sum of these values as the
+// convolution result at the position of the central pixel in the output.
+//
+// Each of these operations uses SIMD vectors. The central pixel and most
+// importantly the output are aligned, so neighnoring pixels (e.g. x_offset=1)
+// require unaligned loads. Because weights are supplied in identical groups of
+// 4, we can use LoadDup128 to load them (slightly faster).
+//
+// Uses mirrored boundary handling. Until x >= kRadius, the horizontal
+// convolution uses Neighbors class to shuffle vectors as if each of its lanes
+// had been loaded from the mirrored offset. Similarly, the last full vector to
+// write uses mirroring. In the case of scalar vectors, Neighbors is not usable
+// and the value is loaded directly. Otherwise, the number of valid pixels
+// modulo the vector size enables a small optimization: for smaller offsets,
+// a non-mirrored load is sufficient.
+class Separable5Strategy {
+  using D = HWY_CAPPED(float, 16);
+  using V = Vec<D>;
+
+ public:
+  static constexpr int64_t kRadius = 2;
+
+  template <size_t kSizeModN, class WrapRow>
+  static JXL_MAYBE_INLINE void ConvolveRow(
+      const float* const JXL_RESTRICT row_m, const size_t xsize,
+      const int64_t stride, const WrapRow& wrap_row,
+      const WeightsSeparable5& weights, float* const JXL_RESTRICT row_out) {
+    const D d;
+    const int64_t neg_stride = -stride;  // allows LEA addressing.
+    const float* const JXL_RESTRICT row_t2 =
+        wrap_row(row_m + 2 * neg_stride, stride);
+    const float* const JXL_RESTRICT row_t1 =
+        wrap_row(row_m + 1 * neg_stride, stride);
+    const float* const JXL_RESTRICT row_b1 =
+        wrap_row(row_m + 1 * stride, stride);
+    const float* const JXL_RESTRICT row_b2 =
+        wrap_row(row_m + 2 * stride, stride);
+
+    const V wh0 = LoadDup128(d, weights.horz + 0 * 4);
+    const V wh1 = LoadDup128(d, weights.horz + 1 * 4);
+    const V wh2 = LoadDup128(d, weights.horz + 2 * 4);
+    const V wv0 = LoadDup128(d, weights.vert + 0 * 4);
+    const V wv1 = LoadDup128(d, weights.vert + 1 * 4);
+    const V wv2 = LoadDup128(d, weights.vert + 2 * 4);
+
+    size_t x = 0;
+
+    // More than one iteration for scalars.
+    for (; x < kRadius; x += Lanes(d)) {
+      const V conv0 =
+          Mul(HorzConvolveFirst(row_m, x, xsize, wh0, wh1, wh2), wv0);
+
+      const V conv1t = HorzConvolveFirst(row_t1, x, xsize, wh0, wh1, wh2);
+      const V conv1b = HorzConvolveFirst(row_b1, x, xsize, wh0, wh1, wh2);
+      const V conv1 = MulAdd(Add(conv1t, conv1b), wv1, conv0);
+
+      const V conv2t = HorzConvolveFirst(row_t2, x, xsize, wh0, wh1, wh2);
+      const V conv2b = HorzConvolveFirst(row_b2, x, xsize, wh0, wh1, wh2);
+      const V conv2 = MulAdd(Add(conv2t, conv2b), wv2, conv1);
+      Store(conv2, d, row_out + x);
+    }
+
+    // Main loop: load inputs without padding
+    for (; x + Lanes(d) + kRadius <= xsize; x += Lanes(d)) {
+      const V conv0 = Mul(HorzConvolve(row_m + x, wh0, wh1, wh2), wv0);
+
+      const V conv1t = HorzConvolve(row_t1 + x, wh0, wh1, wh2);
+      const V conv1b = HorzConvolve(row_b1 + x, wh0, wh1, wh2);
+      const V conv1 = MulAdd(Add(conv1t, conv1b), wv1, conv0);
+
+      const V conv2t = HorzConvolve(row_t2 + x, wh0, wh1, wh2);
+      const V conv2b = HorzConvolve(row_b2 + x, wh0, wh1, wh2);
+      const V conv2 = MulAdd(Add(conv2t, conv2b), wv2, conv1);
+      Store(conv2, d, row_out + x);
+    }
+
+    // Last full vector to write (the above loop handled mod >= kRadius)
+#if HWY_TARGET == HWY_SCALAR
+    while (x < xsize) {
+#else
+    if (kSizeModN < kRadius) {
+#endif
+      const V conv0 =
+          Mul(HorzConvolveLast<kSizeModN>(row_m, x, xsize, wh0, wh1, wh2), wv0);
+
+      const V conv1t =
+          HorzConvolveLast<kSizeModN>(row_t1, x, xsize, wh0, wh1, wh2);
+      const V conv1b =
+          HorzConvolveLast<kSizeModN>(row_b1, x, xsize, wh0, wh1, wh2);
+      const V conv1 = MulAdd(Add(conv1t, conv1b), wv1, conv0);
+
+      const V conv2t =
+          HorzConvolveLast<kSizeModN>(row_t2, x, xsize, wh0, wh1, wh2);
+      const V conv2b =
+          HorzConvolveLast<kSizeModN>(row_b2, x, xsize, wh0, wh1, wh2);
+      const V conv2 = MulAdd(Add(conv2t, conv2b), wv2, conv1);
+      Store(conv2, d, row_out + x);
+      x += Lanes(d);
+    }
+
+    // If mod = 0, the above vector was the last.
+    if (kSizeModN != 0) {
+      for (; x < xsize; ++x) {
+        float mul = 0.0f;
+        for (int64_t dy = -kRadius; dy <= kRadius; ++dy) {
+          const float wy = weights.vert[std::abs(dy) * 4];
+          const float* clamped_row = wrap_row(row_m + dy * stride, stride);
+          for (int64_t dx = -kRadius; dx <= kRadius; ++dx) {
+            const float wx = weights.horz[std::abs(dx) * 4];
+            const int64_t clamped_x = Mirror(x + dx, xsize);
+            mul += clamped_row[clamped_x] * wx * wy;
+          }
+        }
+        row_out[x] = mul;
+      }
+    }
+  }
+
+ private:
+  // Same as HorzConvolve for the first/last vector in a row.
+  static JXL_MAYBE_INLINE V HorzConvolveFirst(
+      const float* const JXL_RESTRICT row, const int64_t x, const int64_t xsize,
+      const V wh0, const V wh1, const V wh2) {
+    const D d;
+    const V c = LoadU(d, row + x);
+    const V mul0 = Mul(c, wh0);
+
+#if HWY_TARGET == HWY_SCALAR
+    const V l1 = LoadU(d, row + Mirror(x - 1, xsize));
+    const V l2 = LoadU(d, row + Mirror(x - 2, xsize));
+#else
+    (void)xsize;
+    const V l1 = Neighbors::FirstL1(c);
+    const V l2 = Neighbors::FirstL2(c);
+#endif
+
+    const V r1 = LoadU(d, row + x + 1);
+    const V r2 = LoadU(d, row + x + 2);
+
+    const V mul1 = MulAdd(Add(l1, r1), wh1, mul0);
+    const V mul2 = MulAdd(Add(l2, r2), wh2, mul1);
+    return mul2;
+  }
+
+  template <size_t kSizeModN>
+  static JXL_MAYBE_INLINE V
+  HorzConvolveLast(const float* const JXL_RESTRICT row, const int64_t x,
+                   const int64_t xsize, const V wh0, const V wh1, const V wh2) {
+    const D d;
+    const V c = LoadU(d, row + x);
+    const V mul0 = Mul(c, wh0);
+
+    const V l1 = LoadU(d, row + x - 1);
+    const V l2 = LoadU(d, row + x - 2);
+
+    V r1, r2;
+#if HWY_TARGET == HWY_SCALAR
+    r1 = LoadU(d, row + Mirror(x + 1, xsize));
+    r2 = LoadU(d, row + Mirror(x + 2, xsize));
+#else
+    const size_t N = Lanes(d);
+    if (kSizeModN == 0) {
+      r2 = TableLookupLanes(c, SetTableIndices(d, MirrorLanes(N - 2)));
+      r1 = TableLookupLanes(c, SetTableIndices(d, MirrorLanes(N - 1)));
+    } else {  // == 1
+      const auto last = LoadU(d, row + xsize - N);
+      r2 = TableLookupLanes(last, SetTableIndices(d, MirrorLanes(N - 1)));
+      r1 = last;
+    }
+#endif
+
+    // Sum of pixels with Manhattan distance i, multiplied by weights[i].
+    const V sum1 = Add(l1, r1);
+    const V mul1 = MulAdd(sum1, wh1, mul0);
+    const V sum2 = Add(l2, r2);
+    const V mul2 = MulAdd(sum2, wh2, mul1);
+    return mul2;
+  }
+
+  // Requires kRadius valid pixels before/after pos.
+  static JXL_MAYBE_INLINE V HorzConvolve(const float* const JXL_RESTRICT pos,
+                                         const V wh0, const V wh1,
+                                         const V wh2) {
+    const D d;
+    const V c = LoadU(d, pos);
+    const V mul0 = Mul(c, wh0);
+
+    // Loading anew is faster than combining vectors.
+    const V l1 = LoadU(d, pos - 1);
+    const V r1 = LoadU(d, pos + 1);
+    const V l2 = LoadU(d, pos - 2);
+    const V r2 = LoadU(d, pos + 2);
+    // Sum of pixels with Manhattan distance i, multiplied by weights[i].
+    const V sum1 = Add(l1, r1);
+    const V mul1 = MulAdd(sum1, wh1, mul0);
+    const V sum2 = Add(l2, r2);
+    const V mul2 = MulAdd(sum2, wh2, mul1);
+    return mul2;
+  }
+};
+
+void Separable5(const ImageF& in, const Rect& rect,
+                const WeightsSeparable5& weights, ThreadPool* pool,
+                ImageF* out) {
+  using Conv = ConvolveT<Separable5Strategy>;
+  if (rect.xsize() >= Conv::MinWidth()) {
+    return Conv::Run(in, rect, weights, pool, out);
+  }
+
+  return SlowSeparable5(in, rect, weights, pool, out);
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+HWY_EXPORT(Separable5);
+void Separable5(const ImageF& in, const Rect& rect,
+                const WeightsSeparable5& weights, ThreadPool* pool,
+                ImageF* out) {
+  return HWY_DYNAMIC_DISPATCH(Separable5)(in, rect, weights, pool, out);
+}
+
+}  // namespace jxl
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jxl/convolve_separable7.cc b/third_party/jpeg-xl/lib/jxl/convolve_separable7.cc
new file mode 100644
index 0000000000..086dfd22b5
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/convolve_separable7.cc
@@ -0,0 +1,285 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/convolve.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/convolve_separable7.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/convolve-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Add;
+using hwy::HWY_NAMESPACE::Mul;
+using hwy::HWY_NAMESPACE::MulAdd;
+using hwy::HWY_NAMESPACE::Vec;
+
+// 7x7 convolution by separable kernel with a single scan through the input.
+// Extended version of Separable5, see documentation there.
+class Separable7Strategy {
+  using D = HWY_CAPPED(float, 16);
+  using V = Vec<D>;
+
+ public:
+  static constexpr int64_t kRadius = 3;
+
+  template <size_t kSizeModN, class WrapRow>
+  static JXL_MAYBE_INLINE void ConvolveRow(
+      const float* const JXL_RESTRICT row_m, const size_t xsize,
+      const int64_t stride, const WrapRow& wrap_row,
+      const WeightsSeparable7& weights, float* const JXL_RESTRICT row_out) {
+    const D d;
+    const int64_t neg_stride = -stride;  // allows LEA addressing.
+    const float* const JXL_RESTRICT row_t3 =
+        wrap_row(row_m + 3 * neg_stride, stride);
+    const float* const JXL_RESTRICT row_t2 =
+        wrap_row(row_m + 2 * neg_stride, stride);
+    const float* const JXL_RESTRICT row_t1 =
+        wrap_row(row_m + 1 * neg_stride, stride);
+    const float* const JXL_RESTRICT row_b1 =
+        wrap_row(row_m + 1 * stride, stride);
+    const float* const JXL_RESTRICT row_b2 =
+        wrap_row(row_m + 2 * stride, stride);
+    const float* const JXL_RESTRICT row_b3 =
+        wrap_row(row_m + 3 * stride, stride);
+
+    const V wh0 = LoadDup128(d, weights.horz + 0 * 4);
+    const V wh1 = LoadDup128(d, weights.horz + 1 * 4);
+    const V wh2 = LoadDup128(d, weights.horz + 2 * 4);
+    const V wh3 = LoadDup128(d, weights.horz + 3 * 4);
+    const V wv0 = LoadDup128(d, weights.vert + 0 * 4);
+    const V wv1 = LoadDup128(d, weights.vert + 1 * 4);
+    const V wv2 = LoadDup128(d, weights.vert + 2 * 4);
+    const V wv3 = LoadDup128(d, weights.vert + 3 * 4);
+
+    size_t x = 0;
+
+    // More than one iteration for scalars.
+    for (; x < kRadius; x += Lanes(d)) {
+      const V conv0 =
+          Mul(HorzConvolveFirst(row_m, x, xsize, wh0, wh1, wh2, wh3), wv0);
+
+      const V conv1t = HorzConvolveFirst(row_t1, x, xsize, wh0, wh1, wh2, wh3);
+      const V conv1b = HorzConvolveFirst(row_b1, x, xsize, wh0, wh1, wh2, wh3);
+      const V conv1 = MulAdd(Add(conv1t, conv1b), wv1, conv0);
+
+      const V conv2t = HorzConvolveFirst(row_t2, x, xsize, wh0, wh1, wh2, wh3);
+      const V conv2b = HorzConvolveFirst(row_b2, x, xsize, wh0, wh1, wh2, wh3);
+      const V conv2 = MulAdd(Add(conv2t, conv2b), wv2, conv1);
+
+      const V conv3t = HorzConvolveFirst(row_t3, x, xsize, wh0, wh1, wh2, wh3);
+      const V conv3b = HorzConvolveFirst(row_b3, x, xsize, wh0, wh1, wh2, wh3);
+      const V conv3 = MulAdd(Add(conv3t, conv3b), wv3, conv2);
+
+      Store(conv3, d, row_out + x);
+    }
+
+    // Main loop: load inputs without padding
+    for (; x + Lanes(d) + kRadius <= xsize; x += Lanes(d)) {
+      const V conv0 = Mul(HorzConvolve(row_m + x, wh0, wh1, wh2, wh3), wv0);
+
+      const V conv1t = HorzConvolve(row_t1 + x, wh0, wh1, wh2, wh3);
+      const V conv1b = HorzConvolve(row_b1 + x, wh0, wh1, wh2, wh3);
+      const V conv1 = MulAdd(Add(conv1t, conv1b), wv1, conv0);
+
+      const V conv2t = HorzConvolve(row_t2 + x, wh0, wh1, wh2, wh3);
+      const V conv2b = HorzConvolve(row_b2 + x, wh0, wh1, wh2, wh3);
+      const V conv2 = MulAdd(Add(conv2t, conv2b), wv2, conv1);
+
+      const V conv3t = HorzConvolve(row_t3 + x, wh0, wh1, wh2, wh3);
+      const V conv3b = HorzConvolve(row_b3 + x, wh0, wh1, wh2, wh3);
+      const V conv3 = MulAdd(Add(conv3t, conv3b), wv3, conv2);
+
+      Store(conv3, d, row_out + x);
+    }
+
+    // Last full vector to write (the above loop handled mod >= kRadius)
+#if HWY_TARGET == HWY_SCALAR
+    while (x < xsize) {
+#else
+    if (kSizeModN < kRadius) {
+#endif
+      const V conv0 =
+          Mul(HorzConvolveLast<kSizeModN>(row_m, x, xsize, wh0, wh1, wh2, wh3),
+              wv0);
+
+      const V conv1t =
+          HorzConvolveLast<kSizeModN>(row_t1, x, xsize, wh0, wh1, wh2, wh3);
+      const V conv1b =
+          HorzConvolveLast<kSizeModN>(row_b1, x, xsize, wh0, wh1, wh2, wh3);
+      const V conv1 = MulAdd(Add(conv1t, conv1b), wv1, conv0);
+
+      const V conv2t =
+          HorzConvolveLast<kSizeModN>(row_t2, x, xsize, wh0, wh1, wh2, wh3);
+      const V conv2b =
+          HorzConvolveLast<kSizeModN>(row_b2, x, xsize, wh0, wh1, wh2, wh3);
+      const V conv2 = MulAdd(Add(conv2t, conv2b), wv2, conv1);
+
+      const V conv3t =
+          HorzConvolveLast<kSizeModN>(row_t3, x, xsize, wh0, wh1, wh2, wh3);
+      const V conv3b =
+          HorzConvolveLast<kSizeModN>(row_b3, x, xsize, wh0, wh1, wh2, wh3);
+      const V conv3 = MulAdd(Add(conv3t, conv3b), wv3, conv2);
+
+      Store(conv3, d, row_out + x);
+      x += Lanes(d);
+    }
+
+    // If mod = 0, the above vector was the last.
+    if (kSizeModN != 0) {
+      for (; x < xsize; ++x) {
+        float mul = 0.0f;
+        for (int64_t dy = -kRadius; dy <= kRadius; ++dy) {
+          const float wy = weights.vert[std::abs(dy) * 4];
+          const float* clamped_row = wrap_row(row_m + dy * stride, stride);
+          for (int64_t dx = -kRadius; dx <= kRadius; ++dx) {
+            const float wx = weights.horz[std::abs(dx) * 4];
+            const int64_t clamped_x = Mirror(x + dx, xsize);
+            mul += clamped_row[clamped_x] * wx * wy;
+          }
+        }
+        row_out[x] = mul;
+      }
+    }
+  }
+
+ private:
+  // Same as HorzConvolve for the first/last vector in a row.
+  static JXL_MAYBE_INLINE V HorzConvolveFirst(
+      const float* const JXL_RESTRICT row, const int64_t x, const int64_t xsize,
+      const V wh0, const V wh1, const V wh2, const V wh3) {
+    const D d;
+    const V c = LoadU(d, row + x);
+    const V mul0 = Mul(c, wh0);
+
+#if HWY_TARGET == HWY_SCALAR
+    const V l1 = LoadU(d, row + Mirror(x - 1, xsize));
+    const V l2 = LoadU(d, row + Mirror(x - 2, xsize));
+    const V l3 = LoadU(d, row + Mirror(x - 3, xsize));
+#else
+    (void)xsize;
+    const V l1 = Neighbors::FirstL1(c);
+    const V l2 = Neighbors::FirstL2(c);
+    const V l3 = Neighbors::FirstL3(c);
+#endif
+
+    const V r1 = LoadU(d, row + x + 1);
+    const V r2 = LoadU(d, row + x + 2);
+    const V r3 = LoadU(d, row + x + 3);
+
+    const V mul1 = MulAdd(Add(l1, r1), wh1, mul0);
+    const V mul2 = MulAdd(Add(l2, r2), wh2, mul1);
+    const V mul3 = MulAdd(Add(l3, r3), wh3, mul2);
+    return mul3;
+  }
+
+  template <size_t kSizeModN>
+  static JXL_MAYBE_INLINE V HorzConvolveLast(
+      const float* const JXL_RESTRICT row, const int64_t x, const int64_t xsize,
+      const V wh0, const V wh1, const V wh2, const V wh3) {
+    const D d;
+    const V c = LoadU(d, row + x);
+    const V mul0 = Mul(c, wh0);
+
+    const V l1 = LoadU(d, row + x - 1);
+    const V l2 = LoadU(d, row + x - 2);
+    const V l3 = LoadU(d, row + x - 3);
+
+    V r1, r2, r3;
+#if HWY_TARGET == HWY_SCALAR
+    r1 = LoadU(d, row + Mirror(x + 1, xsize));
+    r2 = LoadU(d, row + Mirror(x + 2, xsize));
+    r3 = LoadU(d, row + Mirror(x + 3, xsize));
+#else
+    const size_t N = Lanes(d);
+    if (kSizeModN == 0) {
+      r3 = TableLookupLanes(c, SetTableIndices(d, MirrorLanes(N - 3)));
+      r2 = TableLookupLanes(c, SetTableIndices(d, MirrorLanes(N - 2)));
+      r1 = TableLookupLanes(c, SetTableIndices(d, MirrorLanes(N - 1)));
+    } else if (kSizeModN == 1) {
+      const auto last = LoadU(d, row + xsize - N);
+      r3 = TableLookupLanes(last, SetTableIndices(d, MirrorLanes(N - 2)));
+      r2 = TableLookupLanes(last, SetTableIndices(d, MirrorLanes(N - 1)));
+      r1 = last;
+    } else /* kSizeModN >= 2 */ {
+      const auto last = LoadU(d, row + xsize - N);
+      r3 = TableLookupLanes(last, SetTableIndices(d, MirrorLanes(N - 1)));
+      r2 = last;
+      r1 = LoadU(d, row + x + 1);
+    }
+#endif
+
+    // Sum of pixels with Manhattan distance i, multiplied by weights[i].
+    const V sum1 = Add(l1, r1);
+    const V mul1 = MulAdd(sum1, wh1, mul0);
+    const V sum2 = Add(l2, r2);
+    const V mul2 = MulAdd(sum2, wh2, mul1);
+    const V sum3 = Add(l3, r3);
+    const V mul3 = MulAdd(sum3, wh3, mul2);
+    return mul3;
+  }
+
+  // Returns one vector of horizontal convolution results; lane i is the result
+  // for pixel pos + i. This is the fast path for interior pixels, i.e. kRadius
+  // valid pixels before/after pos.
+  static JXL_MAYBE_INLINE V HorzConvolve(const float* const JXL_RESTRICT pos,
+                                         const V wh0, const V wh1, const V wh2,
+                                         const V wh3) {
+    const D d;
+    const V c = LoadU(d, pos);
+    const V mul0 = Mul(c, wh0);
+
+    // TODO(janwas): better to Combine
+    const V l1 = LoadU(d, pos - 1);
+    const V r1 = LoadU(d, pos + 1);
+    const V l2 = LoadU(d, pos - 2);
+    const V r2 = LoadU(d, pos + 2);
+    const V l3 = LoadU(d, pos - 3);
+    const V r3 = LoadU(d, pos + 3);
+    // Sum of pixels with Manhattan distance i, multiplied by weights[i].
+    const V sum1 = Add(l1, r1);
+    const V mul1 = MulAdd(sum1, wh1, mul0);
+    const V sum2 = Add(l2, r2);
+    const V mul2 = MulAdd(sum2, wh2, mul1);
+    const V sum3 = Add(l3, r3);
+    const V mul3 = MulAdd(sum3, wh3, mul2);
+    return mul3;
+  }
+};
+
+void Separable7(const ImageF& in, const Rect& rect,
+                const WeightsSeparable7& weights, ThreadPool* pool,
+                ImageF* out) {
+  using Conv = ConvolveT<Separable7Strategy>;
+  if (rect.xsize() >= Conv::MinWidth()) {
+    return Conv::Run(in, rect, weights, pool, out);
+  }
+
+  return SlowSeparable7(in, rect, weights, pool, out);
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+HWY_EXPORT(Separable7);
+void Separable7(const ImageF& in, const Rect& rect,
+                const WeightsSeparable7& weights, ThreadPool* pool,
+                ImageF* out) {
+  return HWY_DYNAMIC_DISPATCH(Separable7)(in, rect, weights, pool, out);
+}
+
+}  // namespace jxl
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jxl/convolve_slow.cc b/third_party/jpeg-xl/lib/jxl/convolve_slow.cc
new file mode 100644
index 0000000000..fffe5f74c8
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/convolve_slow.cc
@@ -0,0 +1,212 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/convolve.h"
+
+#include "lib/jxl/convolve-inl.h"
+
+namespace jxl {
+
+//------------------------------------------------------------------------------
+// Kernels
+
+// 4 instances of a given literal value, useful as input to LoadDup128.
+#define JXL_REP4(literal) literal, literal, literal, literal
+
+// Concentrates energy in low-frequency components (e.g. for antialiasing).
+const WeightsSymmetric3& WeightsSymmetric3Lowpass() {
+  // Computed by research/convolve_weights.py's cubic spline approximations of
+  // prolate spheroidal wave functions.
+  constexpr float w0 = 0.36208932f;
+  constexpr float w1 = 0.12820096f;
+  constexpr float w2 = 0.03127668f;
+  static constexpr WeightsSymmetric3 weights = {
+      {JXL_REP4(w0)}, {JXL_REP4(w1)}, {JXL_REP4(w2)}};
+  return weights;
+}
+
+const WeightsSeparable5& WeightsSeparable5Lowpass() {
+  constexpr float w0 = 0.41714928f;
+  constexpr float w1 = 0.25539268f;
+  constexpr float w2 = 0.03603267f;
+  static constexpr WeightsSeparable5 weights = {
+      {JXL_REP4(w0), JXL_REP4(w1), JXL_REP4(w2)},
+      {JXL_REP4(w0), JXL_REP4(w1), JXL_REP4(w2)}};
+  return weights;
+}
+
+const WeightsSymmetric5& WeightsSymmetric5Lowpass() {
+  static constexpr WeightsSymmetric5 weights = {
+      {JXL_REP4(0.1740135f)}, {JXL_REP4(0.1065369f)}, {JXL_REP4(0.0150310f)},
+      {JXL_REP4(0.0652254f)}, {JXL_REP4(0.0012984f)}, {JXL_REP4(0.0092025f)}};
+  return weights;
+}
+
+const WeightsSeparable5& WeightsSeparable5Gaussian1() {
+  constexpr float w0 = 0.38774f;
+  constexpr float w1 = 0.24477f;
+  constexpr float w2 = 0.06136f;
+  static constexpr WeightsSeparable5 weights = {
+      {JXL_REP4(w0), JXL_REP4(w1), JXL_REP4(w2)},
+      {JXL_REP4(w0), JXL_REP4(w1), JXL_REP4(w2)}};
+  return weights;
+}
+
+const WeightsSeparable5& WeightsSeparable5Gaussian2() {
+  constexpr float w0 = 0.250301f;
+  constexpr float w1 = 0.221461f;
+  constexpr float w2 = 0.153388f;
+  static constexpr WeightsSeparable5 weights = {
+      {JXL_REP4(w0), JXL_REP4(w1), JXL_REP4(w2)},
+      {JXL_REP4(w0), JXL_REP4(w1), JXL_REP4(w2)}};
+  return weights;
+}
+
+#undef JXL_REP4
+
+//------------------------------------------------------------------------------
+// Slow
+
+namespace {
+
+template <class WrapX, class WrapY>
+float SlowSymmetric3Pixel(const ImageF& in, const int64_t ix, const int64_t iy,
+                          const int64_t xsize, const int64_t ysize,
+                          const WeightsSymmetric3& weights) {
+  float sum = 0.0f;
+
+  // ix: image; kx: kernel
+  for (int64_t ky = -1; ky <= 1; ky++) {
+    const int64_t y = WrapY()(iy + ky, ysize);
+    const float* JXL_RESTRICT row_in = in.ConstRow(static_cast<size_t>(y));
+
+    const float wc = ky == 0 ? weights.c[0] : weights.r[0];
+    const float wlr = ky == 0 ? weights.r[0] : weights.d[0];
+
+    const int64_t xm1 = WrapX()(ix - 1, xsize);
+    const int64_t xp1 = WrapX()(ix + 1, xsize);
+    sum += row_in[ix] * wc + (row_in[xm1] + row_in[xp1]) * wlr;
+  }
+  return sum;
+}
+
+template <class WrapY>
+void SlowSymmetric3Row(const ImageF& in, const int64_t iy, const int64_t xsize,
+                       const int64_t ysize, const WeightsSymmetric3& weights,
+                       float* JXL_RESTRICT row_out) {
+  row_out[0] =
+      SlowSymmetric3Pixel<WrapMirror, WrapY>(in, 0, iy, xsize, ysize, weights);
+  for (int64_t ix = 1; ix < xsize - 1; ix++) {
+    row_out[ix] = SlowSymmetric3Pixel<WrapUnchanged, WrapY>(in, ix, iy, xsize,
+                                                            ysize, weights);
+  }
+  {
+    const int64_t ix = xsize - 1;
+    row_out[ix] = SlowSymmetric3Pixel<WrapMirror, WrapY>(in, ix, iy, xsize,
+                                                         ysize, weights);
+  }
+}
+
+}  // namespace
+
+void SlowSymmetric3(const ImageF& in, const Rect& rect,
+                    const WeightsSymmetric3& weights, ThreadPool* pool,
+                    ImageF* JXL_RESTRICT out) {
+  PROFILER_FUNC;
+
+  const int64_t xsize = static_cast<int64_t>(rect.xsize());
+  const int64_t ysize = static_cast<int64_t>(rect.ysize());
+  const int64_t kRadius = 1;
+
+  JXL_CHECK(RunOnPool(
+      pool, 0, static_cast<uint32_t>(ysize), ThreadPool::NoInit,
+      [&](const uint32_t task, size_t /*thread*/) {
+        const int64_t iy = task;
+        float* JXL_RESTRICT out_row = out->Row(static_cast<size_t>(iy));
+
+        if (iy < kRadius || iy >= ysize - kRadius) {
+          SlowSymmetric3Row<WrapMirror>(in, iy, xsize, ysize, weights, out_row);
+        } else {
+          SlowSymmetric3Row<WrapUnchanged>(in, iy, xsize, ysize, weights,
+                                           out_row);
+        }
+      },
+      "SlowSymmetric3"));
+}
+
+namespace {
+
+// Separable kernels, any radius.
+float SlowSeparablePixel(const ImageF& in, const Rect& rect, const int64_t x,
+                         const int64_t y, const int64_t radius,
+                         const float* JXL_RESTRICT horz_weights,
+                         const float* JXL_RESTRICT vert_weights) {
+  const size_t xsize = rect.xsize();
+  const size_t ysize = rect.ysize();
+  const WrapMirror wrap;
+
+  float mul = 0.0f;
+  for (int dy = -radius; dy <= radius; ++dy) {
+    const float wy = vert_weights[std::abs(dy) * 4];
+    const size_t sy = wrap(y + dy, ysize);
+    JXL_CHECK(sy < ysize);
+    const float* const JXL_RESTRICT row = rect.ConstRow(in, sy);
+    for (int dx = -radius; dx <= radius; ++dx) {
+      const float wx = horz_weights[std::abs(dx) * 4];
+      const size_t sx = wrap(x + dx, xsize);
+      JXL_CHECK(sx < xsize);
+      mul += row[sx] * wx * wy;
+    }
+  }
+  return mul;
+}
+
+}  // namespace
+
+void SlowSeparable5(const ImageF& in, const Rect& rect,
+                    const WeightsSeparable5& weights, ThreadPool* pool,
+                    ImageF* out) {
+  PROFILER_FUNC;
+  const float* horz_weights = &weights.horz[0];
+  const float* vert_weights = &weights.vert[0];
+
+  const size_t ysize = rect.ysize();
+  JXL_CHECK(RunOnPool(
+      pool, 0, static_cast<uint32_t>(ysize), ThreadPool::NoInit,
+      [&](const uint32_t task, size_t /*thread*/) {
+        const int64_t y = task;
+
+        float* const JXL_RESTRICT row_out = out->Row(y);
+        for (size_t x = 0; x < rect.xsize(); ++x) {
+          row_out[x] = SlowSeparablePixel(in, rect, x, y, /*radius=*/2,
+                                          horz_weights, vert_weights);
+        }
+      },
+      "SlowSeparable5"));
+}
+
+void SlowSeparable7(const ImageF& in, const Rect& rect,
+                    const WeightsSeparable7& weights, ThreadPool* pool,
+                    ImageF* out) {
+  PROFILER_FUNC;
+  const float* horz_weights = &weights.horz[0];
+  const float* vert_weights = &weights.vert[0];
+
+  const size_t ysize = rect.ysize();
+  JXL_CHECK(RunOnPool(
+      pool, 0, static_cast<uint32_t>(ysize), ThreadPool::NoInit,
+      [&](const uint32_t task, size_t /*thread*/) {
+        const int64_t y = task;
+
+        float* const JXL_RESTRICT row_out = out->Row(y);
+        for (size_t x = 0; x < rect.xsize(); ++x) {
+          row_out[x] = SlowSeparablePixel(in, rect, x, y, /*radius=*/3,
+                                          horz_weights, vert_weights);
+        }
+      },
+      "SlowSeparable7"));
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/convolve_symmetric3.cc b/third_party/jpeg-xl/lib/jxl/convolve_symmetric3.cc
new file mode 100644
index 0000000000..06b59dfb60
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/convolve_symmetric3.cc
@@ -0,0 +1,194 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/convolve.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/convolve_symmetric3.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/convolve-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Add;
+using hwy::HWY_NAMESPACE::Mul;
+using hwy::HWY_NAMESPACE::MulAdd;
+using hwy::HWY_NAMESPACE::Vec;
+
+template <class WrapY, class V>
+static V WeightedSum(const ImageF& in, const WrapY wrap_y, const size_t ix,
+                     const int64_t iy, const size_t ysize, const V wx0,
+                     const V wx1, const V wx2) {
+  const HWY_FULL(float) d;
+  const float* JXL_RESTRICT center = in.ConstRow(wrap_y(iy, ysize)) + ix;
+  const auto in_m2 = LoadU(d, center - 2);
+  const auto in_p2 = LoadU(d, center + 2);
+  const auto in_m1 = LoadU(d, center - 1);
+  const auto in_p1 = LoadU(d, center + 1);
+  const auto in_00 = Load(d, center);
+  const auto sum_2 = Mul(wx2, Add(in_m2, in_p2));
+  const auto sum_1 = Mul(wx1, Add(in_m1, in_p1));
+  const auto sum_0 = Mul(wx0, in_00);
+  return Add(sum_2, Add(sum_1, sum_0));
+}
+
+// 3x3 convolution by symmetric kernel with a single scan through the input.
+class Symmetric3Strategy {
+  using D = HWY_CAPPED(float, 16);
+  using V = Vec<D>;
+
+ public:
+  static constexpr int64_t kRadius = 1;
+
+  // Only accesses pixels in [0, xsize).
+  template <size_t kSizeModN, class WrapRow>
+  static JXL_MAYBE_INLINE void ConvolveRow(
+      const float* const JXL_RESTRICT row_m, const size_t xsize,
+      const int64_t stride, const WrapRow& wrap_row,
+      const WeightsSymmetric3& weights, float* const JXL_RESTRICT row_out) {
+    const D d;
+    // t, m, b = top, middle, bottom row;
+    const float* const JXL_RESTRICT row_t = wrap_row(row_m - stride, stride);
+    const float* const JXL_RESTRICT row_b = wrap_row(row_m + stride, stride);
+
+    // Must load in advance - compiler doesn't understand LoadDup128 and
+    // schedules them too late.
+    const V w0 = LoadDup128(d, weights.c);
+    const V w1 = LoadDup128(d, weights.r);
+    const V w2 = LoadDup128(d, weights.d);
+
+    // l, c, r = left, center, right. Leftmost vector: need FirstL1.
+    {
+      const V tc = LoadU(d, row_t + 0);
+      const V mc = LoadU(d, row_m + 0);
+      const V bc = LoadU(d, row_b + 0);
+      const V tl = Neighbors::FirstL1(tc);
+      const V tr = LoadU(d, row_t + 0 + 1);
+      const V ml = Neighbors::FirstL1(mc);
+      const V mr = LoadU(d, row_m + 0 + 1);
+      const V bl = Neighbors::FirstL1(bc);
+      const V br = LoadU(d, row_b + 0 + 1);
+      const V conv =
+          WeightedSum(tl, tc, tr, ml, mc, mr, bl, bc, br, w0, w1, w2);
+      Store(conv, d, row_out + 0);
+    }
+
+    // Loop as long as we can load enough new values:
+    const size_t N = Lanes(d);
+    size_t x = N;
+    for (; x + N + kRadius <= xsize; x += N) {
+      const auto conv = ConvolveValid(row_t, row_m, row_b, x, w0, w1, w2);
+      Store(conv, d, row_out + x);
+    }
+
+    // For final (partial) vector:
+    const V tc = LoadU(d, row_t + x);
+    const V mc = LoadU(d, row_m + x);
+    const V bc = LoadU(d, row_b + x);
+
+    V tr, mr, br;
+#if HWY_TARGET == HWY_SCALAR
+    tr = tc;  // Single-lane => mirrored right neighbor = center value.
+    mr = mc;
+    br = bc;
+#else
+    if (kSizeModN == 0) {
+      // The above loop didn't handle the last vector because it needs an
+      // additional right neighbor (generated via mirroring).
+      auto mirror = SetTableIndices(d, MirrorLanes(N - 1));
+      tr = TableLookupLanes(tc, mirror);
+      mr = TableLookupLanes(mc, mirror);
+      br = TableLookupLanes(bc, mirror);
+    } else {
+      auto mirror = SetTableIndices(d, MirrorLanes((xsize % N) - 1));
+      // Loads last valid value into uppermost lane and mirrors.
+      tr = TableLookupLanes(LoadU(d, row_t + xsize - N), mirror);
+      mr = TableLookupLanes(LoadU(d, row_m + xsize - N), mirror);
+      br = TableLookupLanes(LoadU(d, row_b + xsize - N), mirror);
+    }
+#endif
+
+    const V tl = LoadU(d, row_t + x - 1);
+    const V ml = LoadU(d, row_m + x - 1);
+    const V bl = LoadU(d, row_b + x - 1);
+    const V conv = WeightedSum(tl, tc, tr, ml, mc, mr, bl, bc, br, w0, w1, w2);
+    Store(conv, d, row_out + x);
+  }
+
+ private:
+  // Returns sum{x_i * w_i}.
+  template <class V>
+  static JXL_MAYBE_INLINE V WeightedSum(const V tl, const V tc, const V tr,
+                                        const V ml, const V mc, const V mr,
+                                        const V bl, const V bc, const V br,
+                                        const V w0, const V w1, const V w2) {
+    const V sum_tb = Add(tc, bc);
+
+    // Faster than 5 mul + 4 FMA.
+    const V mul0 = Mul(mc, w0);
+    const V sum_lr = Add(ml, mr);
+
+    const V x1 = Add(sum_tb, sum_lr);
+    const V mul1 = MulAdd(x1, w1, mul0);
+
+    const V sum_t2 = Add(tl, tr);
+    const V sum_b2 = Add(bl, br);
+    const V x2 = Add(sum_t2, sum_b2);
+    const V mul2 = MulAdd(x2, w2, mul1);
+    return mul2;
+  }
+
+  static JXL_MAYBE_INLINE V ConvolveValid(const float* JXL_RESTRICT row_t,
+                                          const float* JXL_RESTRICT row_m,
+                                          const float* JXL_RESTRICT row_b,
+                                          const int64_t x, const V w0,
+                                          const V w1, const V w2) {
+    const D d;
+    const V tc = LoadU(d, row_t + x);
+    const V mc = LoadU(d, row_m + x);
+    const V bc = LoadU(d, row_b + x);
+    const V tl = LoadU(d, row_t + x - 1);
+    const V tr = LoadU(d, row_t + x + 1);
+    const V ml = LoadU(d, row_m + x - 1);
+    const V mr = LoadU(d, row_m + x + 1);
+    const V bl = LoadU(d, row_b + x - 1);
+    const V br = LoadU(d, row_b + x + 1);
+    return WeightedSum(tl, tc, tr, ml, mc, mr, bl, bc, br, w0, w1, w2);
+  }
+};
+
+void Symmetric3(const ImageF& in, const Rect& rect,
+                const WeightsSymmetric3& weights, ThreadPool* pool,
+                ImageF* out) {
+  using Conv = ConvolveT<Symmetric3Strategy>;
+  if (rect.xsize() >= Conv::MinWidth()) {
+    return Conv::Run(in, rect, weights, pool, out);
+  }
+
+  return SlowSymmetric3(in, rect, weights, pool, out);
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+HWY_EXPORT(Symmetric3);
+void Symmetric3(const ImageF& in, const Rect& rect,
+                const WeightsSymmetric3& weights, ThreadPool* pool,
+                ImageF* out) {
+  return HWY_DYNAMIC_DISPATCH(Symmetric3)(in, rect, weights, pool, out);
+}
+
+}  // namespace jxl
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jxl/convolve_symmetric5.cc b/third_party/jpeg-xl/lib/jxl/convolve_symmetric5.cc
new file mode 100644
index 0000000000..55a16899c3
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/convolve_symmetric5.cc
@@ -0,0 +1,185 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/convolve.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/convolve_symmetric5.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/common.h"  // RoundUpTo
+#include "lib/jxl/convolve-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Add;
+using hwy::HWY_NAMESPACE::Mul;
+using hwy::HWY_NAMESPACE::Vec;
+
+// Weighted sum of 1x5 pixels around ix, iy with [wx2 wx1 wx0 wx1 wx2].
+template <class WrapY>
+static float WeightedSumBorder(const ImageF& in, const WrapY wrap_y,
+                               const int64_t ix, const int64_t iy,
+                               const size_t xsize, const size_t ysize,
+                               const float wx0, const float wx1,
+                               const float wx2) {
+  const WrapMirror wrap_x;
+  const float* JXL_RESTRICT row = in.ConstRow(wrap_y(iy, ysize));
+  const float in_m2 = row[wrap_x(ix - 2, xsize)];
+  const float in_p2 = row[wrap_x(ix + 2, xsize)];
+  const float in_m1 = row[wrap_x(ix - 1, xsize)];
+  const float in_p1 = row[wrap_x(ix + 1, xsize)];
+  const float in_00 = row[ix];
+  const float sum_2 = wx2 * (in_m2 + in_p2);
+  const float sum_1 = wx1 * (in_m1 + in_p1);
+  const float sum_0 = wx0 * in_00;
+  return sum_2 + sum_1 + sum_0;
+}
+
+template <class WrapY, class V>
+static V WeightedSum(const ImageF& in, const WrapY wrap_y, const size_t ix,
+                     const int64_t iy, const size_t ysize, const V wx0,
+                     const V wx1, const V wx2) {
+  const HWY_FULL(float) d;
+  const float* JXL_RESTRICT center = in.ConstRow(wrap_y(iy, ysize)) + ix;
+  const auto in_m2 = LoadU(d, center - 2);
+  const auto in_p2 = LoadU(d, center + 2);
+  const auto in_m1 = LoadU(d, center - 1);
+  const auto in_p1 = LoadU(d, center + 1);
+  const auto in_00 = Load(d, center);
+  const auto sum_2 = Mul(wx2, Add(in_m2, in_p2));
+  const auto sum_1 = Mul(wx1, Add(in_m1, in_p1));
+  const auto sum_0 = Mul(wx0, in_00);
+  return Add(sum_2, Add(sum_1, sum_0));
+}
+
+// Produces result for one pixel
+template <class WrapY>
+float Symmetric5Border(const ImageF& in, const Rect& rect, const int64_t ix,
+                       const int64_t iy, const WeightsSymmetric5& weights) {
+  const float w0 = weights.c[0];
+  const float w1 = weights.r[0];
+  const float w2 = weights.R[0];
+  const float w4 = weights.d[0];
+  const float w5 = weights.L[0];
+  const float w8 = weights.D[0];
+
+  const size_t xsize = rect.xsize();
+  const size_t ysize = rect.ysize();
+  const WrapY wrap_y;
+  // Unrolled loop over all 5 rows of the kernel.
+  float sum0 = WeightedSumBorder(in, wrap_y, ix, iy, xsize, ysize, w0, w1, w2);
+
+  sum0 += WeightedSumBorder(in, wrap_y, ix, iy - 2, xsize, ysize, w2, w5, w8);
+  float sum1 =
+      WeightedSumBorder(in, wrap_y, ix, iy + 2, xsize, ysize, w2, w5, w8);
+
+  sum0 += WeightedSumBorder(in, wrap_y, ix, iy - 1, xsize, ysize, w1, w4, w5);
+  sum1 += WeightedSumBorder(in, wrap_y, ix, iy + 1, xsize, ysize, w1, w4, w5);
+
+  return sum0 + sum1;
+}
+
+// Produces result for one vector's worth of pixels
+template <class WrapY>
+static void Symmetric5Interior(const ImageF& in, const Rect& rect,
+                               const int64_t ix, const int64_t iy,
+                               const WeightsSymmetric5& weights,
+                               float* JXL_RESTRICT row_out) {
+  const HWY_FULL(float) d;
+
+  const auto w0 = LoadDup128(d, weights.c);
+  const auto w1 = LoadDup128(d, weights.r);
+  const auto w2 = LoadDup128(d, weights.R);
+  const auto w4 = LoadDup128(d, weights.d);
+  const auto w5 = LoadDup128(d, weights.L);
+  const auto w8 = LoadDup128(d, weights.D);
+
+  const size_t ysize = rect.ysize();
+  const WrapY wrap_y;
+  // Unrolled loop over all 5 rows of the kernel.
+  auto sum0 = WeightedSum(in, wrap_y, ix, iy, ysize, w0, w1, w2);
+
+  sum0 = Add(sum0, WeightedSum(in, wrap_y, ix, iy - 2, ysize, w2, w5, w8));
+  auto sum1 = WeightedSum(in, wrap_y, ix, iy + 2, ysize, w2, w5, w8);
+
+  sum0 = Add(sum0, WeightedSum(in, wrap_y, ix, iy - 1, ysize, w1, w4, w5));
+  sum1 = Add(sum1, WeightedSum(in, wrap_y, ix, iy + 1, ysize, w1, w4, w5));
+
+  Store(Add(sum0, sum1), d, row_out + ix);
+}
+
+template <class WrapY>
+static void Symmetric5Row(const ImageF& in, const Rect& rect, const int64_t iy,
+                          const WeightsSymmetric5& weights,
+                          float* JXL_RESTRICT row_out) {
+  const int64_t kRadius = 2;
+  const size_t xsize = rect.xsize();
+
+  size_t ix = 0;
+  const HWY_FULL(float) d;
+  const size_t N = Lanes(d);
+  const size_t aligned_x = RoundUpTo(kRadius, N);
+  for (; ix < std::min(aligned_x, xsize); ++ix) {
+    row_out[ix] = Symmetric5Border<WrapY>(in, rect, ix, iy, weights);
+  }
+  for (; ix + N + kRadius <= xsize; ix += N) {
+    Symmetric5Interior<WrapY>(in, rect, ix, iy, weights, row_out);
+  }
+  for (; ix < xsize; ++ix) {
+    row_out[ix] = Symmetric5Border<WrapY>(in, rect, ix, iy, weights);
+  }
+}
+
+static JXL_NOINLINE void Symmetric5BorderRow(const ImageF& in, const Rect& rect,
+                                             const int64_t iy,
+                                             const WeightsSymmetric5& weights,
+                                             float* JXL_RESTRICT row_out) {
+  return Symmetric5Row<WrapMirror>(in, rect, iy, weights, row_out);
+}
+
+// Semi-vectorized (interior pixels Fonly); called directly like slow::, unlike
+// the fully vectorized strategies below.
+void Symmetric5(const ImageF& in, const Rect& rect,
+                const WeightsSymmetric5& weights, ThreadPool* pool,
+                ImageF* JXL_RESTRICT out) {
+  PROFILER_FUNC;
+
+  const size_t ysize = rect.ysize();
+  JXL_CHECK(RunOnPool(
+      pool, 0, static_cast<uint32_t>(ysize), ThreadPool::NoInit,
+      [&](const uint32_t task, size_t /*thread*/) {
+        const int64_t iy = task;
+
+        if (iy < 2 || iy >= static_cast<ssize_t>(ysize) - 2) {
+          Symmetric5BorderRow(in, rect, iy, weights, out->Row(iy));
+        } else {
+          Symmetric5Row<WrapUnchanged>(in, rect, iy, weights, out->Row(iy));
+        }
+      },
+      "Symmetric5x5Convolution"));
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+HWY_EXPORT(Symmetric5);
+void Symmetric5(const ImageF& in, const Rect& rect,
+                const WeightsSymmetric5& weights, ThreadPool* pool,
+                ImageF* JXL_RESTRICT out) {
+  return HWY_DYNAMIC_DISPATCH(Symmetric5)(in, rect, weights, pool, out);
+}
+
+}  // namespace jxl
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jxl/convolve_test.cc b/third_party/jpeg-xl/lib/jxl/convolve_test.cc
new file mode 100644
index 0000000000..e86d637114
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/convolve_test.cc
@@ -0,0 +1,252 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/convolve.h"
+
+#include <time.h>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/convolve_test.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+#include <hwy/nanobenchmark.h>
+#include <hwy/tests/test_util-inl.h>
+#include <vector>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/base/random.h"
+#include "lib/jxl/image_ops.h"
+#include "lib/jxl/image_test_utils.h"
+#include "lib/jxl/test_utils.h"
+#include "lib/jxl/testing.h"
+
+#ifndef JXL_DEBUG_CONVOLVE
+#define JXL_DEBUG_CONVOLVE 0
+#endif
+
+#include "lib/jxl/convolve-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+void TestNeighbors() {
+  const Neighbors::D d;
+  const Neighbors::V v = Iota(d, 0);
+  HWY_ALIGN float actual[hwy::kTestMaxVectorSize / sizeof(float)] = {0};
+
+  HWY_ALIGN float first_l1[hwy::kTestMaxVectorSize / sizeof(float)] = {
+      0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14};
+  Store(Neighbors::FirstL1(v), d, actual);
+  const size_t N = Lanes(d);
+  EXPECT_EQ(std::vector<float>(first_l1, first_l1 + N),
+            std::vector<float>(actual, actual + N));
+
+#if HWY_TARGET != HWY_SCALAR
+  HWY_ALIGN float first_l2[hwy::kTestMaxVectorSize / sizeof(float)] = {
+      1, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13};
+  Store(Neighbors::FirstL2(v), d, actual);
+  EXPECT_EQ(std::vector<float>(first_l2, first_l2 + N),
+            std::vector<float>(actual, actual + N));
+
+  HWY_ALIGN float first_l3[hwy::kTestMaxVectorSize / sizeof(float)] = {
+      2, 1, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12};
+  Store(Neighbors::FirstL3(v), d, actual);
+  EXPECT_EQ(std::vector<float>(first_l3, first_l3 + N),
+            std::vector<float>(actual, actual + N));
+#endif  // HWY_TARGET != HWY_SCALAR
+}
+
+void VerifySymmetric3(const size_t xsize, const size_t ysize, ThreadPool* pool,
+                      Rng* rng) {
+  const Rect rect(0, 0, xsize, ysize);
+
+  ImageF in(xsize, ysize);
+  GenerateImage(*rng, &in, 0.0f, 1.0f);
+
+  ImageF out_expected(xsize, ysize);
+  ImageF out_actual(xsize, ysize);
+
+  const WeightsSymmetric3& weights = WeightsSymmetric3Lowpass();
+  Symmetric3(in, rect, weights, pool, &out_expected);
+  SlowSymmetric3(in, rect, weights, pool, &out_actual);
+
+  JXL_ASSERT_OK(VerifyRelativeError(out_expected, out_actual, 1E-5f, 1E-5f, _));
+}
+
+// Ensures Symmetric and Separable give the same result.
+void VerifySymmetric5(const size_t xsize, const size_t ysize, ThreadPool* pool,
+                      Rng* rng) {
+  const Rect rect(0, 0, xsize, ysize);
+
+  ImageF in(xsize, ysize);
+  GenerateImage(*rng, &in, 0.0f, 1.0f);
+
+  ImageF out_expected(xsize, ysize);
+  ImageF out_actual(xsize, ysize);
+
+  Separable5(in, Rect(in), WeightsSeparable5Lowpass(), pool, &out_expected);
+  Symmetric5(in, rect, WeightsSymmetric5Lowpass(), pool, &out_actual);
+
+  JXL_ASSERT_OK(VerifyRelativeError(out_expected, out_actual, 1E-5f, 1E-5f, _));
+}
+
+void VerifySeparable5(const size_t xsize, const size_t ysize, ThreadPool* pool,
+                      Rng* rng) {
+  const Rect rect(0, 0, xsize, ysize);
+
+  ImageF in(xsize, ysize);
+  GenerateImage(*rng, &in, 0.0f, 1.0f);
+
+  ImageF out_expected(xsize, ysize);
+  ImageF out_actual(xsize, ysize);
+
+  const WeightsSeparable5& weights = WeightsSeparable5Lowpass();
+  Separable5(in, Rect(in), weights, pool, &out_expected);
+  SlowSeparable5(in, rect, weights, pool, &out_actual);
+
+  JXL_ASSERT_OK(VerifyRelativeError(out_expected, out_actual, 1E-5f, 1E-5f, _));
+}
+
+void VerifySeparable7(const size_t xsize, const size_t ysize, ThreadPool* pool,
+                      Rng* rng) {
+  const Rect rect(0, 0, xsize, ysize);
+
+  ImageF in(xsize, ysize);
+  GenerateImage(*rng, &in, 0.0f, 1.0f);
+
+  ImageF out_expected(xsize, ysize);
+  ImageF out_actual(xsize, ysize);
+
+  // Gaussian sigma 1.0
+  const WeightsSeparable7 weights = {{HWY_REP4(0.383103f), HWY_REP4(0.241843f),
+                                      HWY_REP4(0.060626f), HWY_REP4(0.00598f)},
+                                     {HWY_REP4(0.383103f), HWY_REP4(0.241843f),
+                                      HWY_REP4(0.060626f), HWY_REP4(0.00598f)}};
+
+  SlowSeparable7(in, rect, weights, pool, &out_expected);
+  Separable7(in, Rect(in), weights, pool, &out_actual);
+
+  JXL_ASSERT_OK(VerifyRelativeError(out_expected, out_actual, 1E-5f, 1E-5f, _));
+}
+
+// For all xsize/ysize and kernels:
+void TestConvolve() {
+  TestNeighbors();
+
+  test::ThreadPoolForTests pool(4);
+  EXPECT_EQ(true,
+            RunOnPool(
+                &pool, kConvolveMaxRadius, 40, ThreadPool::NoInit,
+                [](const uint32_t task, size_t /*thread*/) {
+                  const size_t xsize = task;
+                  Rng rng(129 + 13 * xsize);
+
+                  ThreadPool* null_pool = nullptr;
+                  test::ThreadPoolForTests pool3(3);
+                  for (size_t ysize = kConvolveMaxRadius; ysize < 16; ++ysize) {
+                    JXL_DEBUG(JXL_DEBUG_CONVOLVE,
+                              "%" PRIuS " x %" PRIuS " (target %" PRIx64
+                              ")===============================",
+                              xsize, ysize, static_cast<int64_t>(HWY_TARGET));
+
+                    JXL_DEBUG(JXL_DEBUG_CONVOLVE, "Sym3------------------");
+                    VerifySymmetric3(xsize, ysize, null_pool, &rng);
+                    VerifySymmetric3(xsize, ysize, &pool3, &rng);
+
+                    JXL_DEBUG(JXL_DEBUG_CONVOLVE, "Sym5------------------");
+                    VerifySymmetric5(xsize, ysize, null_pool, &rng);
+                    VerifySymmetric5(xsize, ysize, &pool3, &rng);
+
+                    JXL_DEBUG(JXL_DEBUG_CONVOLVE, "Sep5------------------");
+                    VerifySeparable5(xsize, ysize, null_pool, &rng);
+                    VerifySeparable5(xsize, ysize, &pool3, &rng);
+
+                    JXL_DEBUG(JXL_DEBUG_CONVOLVE, "Sep7------------------");
+                    VerifySeparable7(xsize, ysize, null_pool, &rng);
+                    VerifySeparable7(xsize, ysize, &pool3, &rng);
+                  }
+                },
+                "TestConvolve"));
+}
+
+// Measures durations, verifies results, prints timings. `unpredictable1`
+// must have value 1 (unknown to the compiler to prevent elision).
+template <class Conv>
+void BenchmarkConv(const char* caption, const Conv& conv,
+                   const hwy::FuncInput unpredictable1) {
+  const size_t kNumInputs = 1;
+  const hwy::FuncInput inputs[kNumInputs] = {unpredictable1};
+  hwy::Result results[kNumInputs];
+
+  const size_t kDim = 160;  // in+out fit in L2
+  ImageF in(kDim, kDim);
+  ZeroFillImage(&in);
+  in.Row(kDim / 2)[kDim / 2] = unpredictable1;
+  ImageF out(kDim, kDim);
+
+  hwy::Params p;
+  p.verbose = false;
+  p.max_evals = 7;
+  p.target_rel_mad = 0.002;
+  const size_t num_results = MeasureClosure(
+      [&in, &conv, &out](const hwy::FuncInput input) {
+        conv(in, &out);
+        return out.Row(input)[0];
+      },
+      inputs, kNumInputs, results, p);
+  if (num_results != kNumInputs) {
+    fprintf(stderr, "MeasureClosure failed.\n");
+  }
+  for (size_t i = 0; i < num_results; ++i) {
+    const double seconds = static_cast<double>(results[i].ticks) /
+                           hwy::platform::InvariantTicksPerSecond();
+    printf("%12s: %7.2f MP/s (MAD=%4.2f%%)\n", caption,
+           kDim * kDim * 1E-6 / seconds,
+           static_cast<double>(results[i].variability) * 100.0);
+  }
+}
+
+struct ConvSymmetric3 {
+  void operator()(const ImageF& in, ImageF* JXL_RESTRICT out) const {
+    ThreadPool* null_pool = nullptr;
+    Symmetric3(in, Rect(in), WeightsSymmetric3Lowpass(), null_pool, out);
+  }
+};
+
+struct ConvSeparable5 {
+  void operator()(const ImageF& in, ImageF* JXL_RESTRICT out) const {
+    ThreadPool* null_pool = nullptr;
+    Separable5(in, Rect(in), WeightsSeparable5Lowpass(), null_pool, out);
+  }
+};
+
+void BenchmarkAll() {
+#if 0  // disabled to avoid test timeouts, run manually on demand
+  const hwy::FuncInput unpredictable1 = time(nullptr) != 1234;
+  BenchmarkConv("Symmetric3", ConvSymmetric3(), unpredictable1);
+  BenchmarkConv("Separable5", ConvSeparable5(), unpredictable1);
+#endif
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+class ConvolveTest : public hwy::TestWithParamTarget {};
+HWY_TARGET_INSTANTIATE_TEST_SUITE_P(ConvolveTest);
+
+HWY_EXPORT_AND_TEST_P(ConvolveTest, TestConvolve);
+
+HWY_EXPORT_AND_TEST_P(ConvolveTest, BenchmarkAll);
+
+}  // namespace jxl
+#endif
diff --git a/third_party/jpeg-xl/lib/jxl/data_parallel_test.cc b/third_party/jpeg-xl/lib/jxl/data_parallel_test.cc
new file mode 100644
index 0000000000..ee2a97f93a
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/data_parallel_test.cc
@@ -0,0 +1,87 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/base/data_parallel.h"
+
+#include "lib/jxl/test_utils.h"
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+namespace {
+
+class DataParallelTest : public ::testing::Test {
+ protected:
+  // A fake class to verify that DataParallel is properly calling the
+  // client-provided runner functions.
+  static int FakeRunner(void* runner_opaque, void* jpegxl_opaque,
+                        JxlParallelRunInit init, JxlParallelRunFunction func,
+                        uint32_t start_range, uint32_t end_range) {
+    DataParallelTest* self = static_cast<DataParallelTest*>(runner_opaque);
+    self->runner_called_++;
+    self->jpegxl_opaque_ = jpegxl_opaque;
+    self->init_ = init;
+    self->func_ = func;
+    self->start_range_ = start_range;
+    self->end_range_ = end_range;
+    return self->runner_return_;
+  }
+
+  ThreadPool pool_{&DataParallelTest::FakeRunner, this};
+
+  // Number of times FakeRunner() was called.
+  int runner_called_ = 0;
+
+  // Parameters passed to FakeRunner.
+  void* jpegxl_opaque_ = nullptr;
+  JxlParallelRunInit init_ = nullptr;
+  JxlParallelRunFunction func_ = nullptr;
+  uint32_t start_range_ = -1;
+  uint32_t end_range_ = -1;
+
+  // Return value that FakeRunner will return.
+  int runner_return_ = 0;
+};
+
+// JxlParallelRunInit interface.
+typedef int (*JxlParallelRunInit)();
+
+}  // namespace
+
+TEST_F(DataParallelTest, RunnerCalledParameters) {
+  EXPECT_TRUE(pool_.Run(
+      1234, 5678, [](size_t /* num_threads */) { return true; },
+      [](uint32_t /* task */, size_t /* thread */) { return; }));
+  EXPECT_EQ(1, runner_called_);
+  EXPECT_NE(nullptr, init_);
+  EXPECT_NE(nullptr, func_);
+  EXPECT_NE(nullptr, jpegxl_opaque_);
+  EXPECT_EQ(1234u, start_range_);
+  EXPECT_EQ(5678u, end_range_);
+}
+
+TEST_F(DataParallelTest, RunnerFailurePropagates) {
+  runner_return_ = -1;  // FakeRunner return value.
+  EXPECT_FALSE(pool_.Run(
+      1234, 5678, [](size_t /* num_threads */) { return false; },
+      [](uint32_t /* task */, size_t /* thread */) { return; }));
+  EXPECT_FALSE(RunOnPool(
+      nullptr, 1234, 5678, [](size_t /* num_threads */) { return false; },
+      [](uint32_t /* task */, size_t /* thread */) { return; }, "Test"));
+}
+
+TEST_F(DataParallelTest, RunnerNotCalledOnEmptyRange) {
+  runner_return_ = -1;  // FakeRunner return value.
+  EXPECT_TRUE(pool_.Run(
+      123, 123, [](size_t /* num_threads */) { return false; },
+      [](uint32_t /* task */, size_t /* thread */) { return; }));
+  EXPECT_TRUE(RunOnPool(
+      nullptr, 123, 123, [](size_t /* num_threads */) { return false; },
+      [](uint32_t /* task */, size_t /* thread */) { return; }, "Test"));
+  // We don't call the external runner when the range is empty. We don't even
+  // need to call the init function.
+  EXPECT_EQ(0, runner_called_);
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/dct-inl.h b/third_party/jpeg-xl/lib/jxl/dct-inl.h
new file mode 100644
index 0000000000..532606075e
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/dct-inl.h
@@ -0,0 +1,334 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Fast SIMD floating-point (I)DCT, any power of two.
+
+#if defined(LIB_JXL_DCT_INL_H_) == defined(HWY_TARGET_TOGGLE)
+#ifdef LIB_JXL_DCT_INL_H_
+#undef LIB_JXL_DCT_INL_H_
+#else
+#define LIB_JXL_DCT_INL_H_
+#endif
+
+#include <stddef.h>
+
+#include <hwy/highway.h>
+
+#include "lib/jxl/dct_block-inl.h"
+#include "lib/jxl/dct_scales.h"
+#include "lib/jxl/transpose-inl.h"
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+namespace {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Add;
+using hwy::HWY_NAMESPACE::Mul;
+using hwy::HWY_NAMESPACE::MulAdd;
+using hwy::HWY_NAMESPACE::NegMulAdd;
+using hwy::HWY_NAMESPACE::Sub;
+
+template <size_t SZ>
+struct FVImpl {
+  using type = HWY_CAPPED(float, SZ);
+};
+
+template <>
+struct FVImpl<0> {
+  using type = HWY_FULL(float);
+};
+
+template <size_t SZ>
+using FV = typename FVImpl<SZ>::type;
+
+// Implementation of Lowest Complexity Self Recursive Radix-2 DCT II/III
+// Algorithms, by Siriani M. Perera and Jianhua Liu.
+
+template <size_t N, size_t SZ>
+struct CoeffBundle {
+  static void AddReverse(const float* JXL_RESTRICT ain1,
+                         const float* JXL_RESTRICT ain2,
+                         float* JXL_RESTRICT aout) {
+    for (size_t i = 0; i < N; i++) {
+      auto in1 = Load(FV<SZ>(), ain1 + i * SZ);
+      auto in2 = Load(FV<SZ>(), ain2 + (N - i - 1) * SZ);
+      Store(Add(in1, in2), FV<SZ>(), aout + i * SZ);
+    }
+  }
+  static void SubReverse(const float* JXL_RESTRICT ain1,
+                         const float* JXL_RESTRICT ain2,
+                         float* JXL_RESTRICT aout) {
+    for (size_t i = 0; i < N; i++) {
+      auto in1 = Load(FV<SZ>(), ain1 + i * SZ);
+      auto in2 = Load(FV<SZ>(), ain2 + (N - i - 1) * SZ);
+      Store(Sub(in1, in2), FV<SZ>(), aout + i * SZ);
+    }
+  }
+  static void B(float* JXL_RESTRICT coeff) {
+    auto sqrt2 = Set(FV<SZ>(), kSqrt2);
+    auto in1 = Load(FV<SZ>(), coeff);
+    auto in2 = Load(FV<SZ>(), coeff + SZ);
+    Store(MulAdd(in1, sqrt2, in2), FV<SZ>(), coeff);
+    for (size_t i = 1; i + 1 < N; i++) {
+      auto in1 = Load(FV<SZ>(), coeff + i * SZ);
+      auto in2 = Load(FV<SZ>(), coeff + (i + 1) * SZ);
+      Store(Add(in1, in2), FV<SZ>(), coeff + i * SZ);
+    }
+  }
+  static void BTranspose(float* JXL_RESTRICT coeff) {
+    for (size_t i = N - 1; i > 0; i--) {
+      auto in1 = Load(FV<SZ>(), coeff + i * SZ);
+      auto in2 = Load(FV<SZ>(), coeff + (i - 1) * SZ);
+      Store(Add(in1, in2), FV<SZ>(), coeff + i * SZ);
+    }
+    auto sqrt2 = Set(FV<SZ>(), kSqrt2);
+    auto in1 = Load(FV<SZ>(), coeff);
+    Store(Mul(in1, sqrt2), FV<SZ>(), coeff);
+  }
+  // Ideally optimized away by compiler (except the multiply).
+  static void InverseEvenOdd(const float* JXL_RESTRICT ain,
+                             float* JXL_RESTRICT aout) {
+    for (size_t i = 0; i < N / 2; i++) {
+      auto in1 = Load(FV<SZ>(), ain + i * SZ);
+      Store(in1, FV<SZ>(), aout + 2 * i * SZ);
+    }
+    for (size_t i = N / 2; i < N; i++) {
+      auto in1 = Load(FV<SZ>(), ain + i * SZ);
+      Store(in1, FV<SZ>(), aout + (2 * (i - N / 2) + 1) * SZ);
+    }
+  }
+  // Ideally optimized away by compiler.
+  static void ForwardEvenOdd(const float* JXL_RESTRICT ain, size_t ain_stride,
+                             float* JXL_RESTRICT aout) {
+    for (size_t i = 0; i < N / 2; i++) {
+      auto in1 = LoadU(FV<SZ>(), ain + 2 * i * ain_stride);
+      Store(in1, FV<SZ>(), aout + i * SZ);
+    }
+    for (size_t i = N / 2; i < N; i++) {
+      auto in1 = LoadU(FV<SZ>(), ain + (2 * (i - N / 2) + 1) * ain_stride);
+      Store(in1, FV<SZ>(), aout + i * SZ);
+    }
+  }
+  // Invoked on full vector.
+  static void Multiply(float* JXL_RESTRICT coeff) {
+    for (size_t i = 0; i < N / 2; i++) {
+      auto in1 = Load(FV<SZ>(), coeff + (N / 2 + i) * SZ);
+      auto mul = Set(FV<SZ>(), WcMultipliers<N>::kMultipliers[i]);
+      Store(Mul(in1, mul), FV<SZ>(), coeff + (N / 2 + i) * SZ);
+    }
+  }
+  static void MultiplyAndAdd(const float* JXL_RESTRICT coeff,
+                             float* JXL_RESTRICT out, size_t out_stride) {
+    for (size_t i = 0; i < N / 2; i++) {
+      auto mul = Set(FV<SZ>(), WcMultipliers<N>::kMultipliers[i]);
+      auto in1 = Load(FV<SZ>(), coeff + i * SZ);
+      auto in2 = Load(FV<SZ>(), coeff + (N / 2 + i) * SZ);
+      auto out1 = MulAdd(mul, in2, in1);
+      auto out2 = NegMulAdd(mul, in2, in1);
+      StoreU(out1, FV<SZ>(), out + i * out_stride);
+      StoreU(out2, FV<SZ>(), out + (N - i - 1) * out_stride);
+    }
+  }
+  template <typename Block>
+  static void LoadFromBlock(const Block& in, size_t off,
+                            float* JXL_RESTRICT coeff) {
+    for (size_t i = 0; i < N; i++) {
+      Store(in.LoadPart(FV<SZ>(), i, off), FV<SZ>(), coeff + i * SZ);
+    }
+  }
+  template <typename Block>
+  static void StoreToBlockAndScale(const float* JXL_RESTRICT coeff,
+                                   const Block& out, size_t off) {
+    auto mul = Set(FV<SZ>(), 1.0f / N);
+    for (size_t i = 0; i < N; i++) {
+      out.StorePart(FV<SZ>(), Mul(mul, Load(FV<SZ>(), coeff + i * SZ)), i, off);
+    }
+  }
+};
+
+template <size_t N, size_t SZ>
+struct DCT1DImpl;
+
+template <size_t SZ>
+struct DCT1DImpl<1, SZ> {
+  JXL_INLINE void operator()(float* JXL_RESTRICT mem) {}
+};
+
+template <size_t SZ>
+struct DCT1DImpl<2, SZ> {
+  JXL_INLINE void operator()(float* JXL_RESTRICT mem) {
+    auto in1 = Load(FV<SZ>(), mem);
+    auto in2 = Load(FV<SZ>(), mem + SZ);
+    Store(Add(in1, in2), FV<SZ>(), mem);
+    Store(Sub(in1, in2), FV<SZ>(), mem + SZ);
+  }
+};
+
+template <size_t N, size_t SZ>
+struct DCT1DImpl {
+  void operator()(float* JXL_RESTRICT mem) {
+    // This is relatively small (4kB with 64-DCT and AVX-512)
+    HWY_ALIGN float tmp[N * SZ];
+    CoeffBundle<N / 2, SZ>::AddReverse(mem, mem + N / 2 * SZ, tmp);
+    DCT1DImpl<N / 2, SZ>()(tmp);
+    CoeffBundle<N / 2, SZ>::SubReverse(mem, mem + N / 2 * SZ, tmp + N / 2 * SZ);
+    CoeffBundle<N, SZ>::Multiply(tmp);
+    DCT1DImpl<N / 2, SZ>()(tmp + N / 2 * SZ);
+    CoeffBundle<N / 2, SZ>::B(tmp + N / 2 * SZ);
+    CoeffBundle<N, SZ>::InverseEvenOdd(tmp, mem);
+  }
+};
+
+template <size_t N, size_t SZ>
+struct IDCT1DImpl;
+
+template <size_t SZ>
+struct IDCT1DImpl<1, SZ> {
+  JXL_INLINE void operator()(const float* from, size_t from_stride, float* to,
+                             size_t to_stride) {
+    StoreU(LoadU(FV<SZ>(), from), FV<SZ>(), to);
+  }
+};
+
+template <size_t SZ>
+struct IDCT1DImpl<2, SZ> {
+  JXL_INLINE void operator()(const float* from, size_t from_stride, float* to,
+                             size_t to_stride) {
+    JXL_DASSERT(from_stride >= SZ);
+    JXL_DASSERT(to_stride >= SZ);
+    auto in1 = LoadU(FV<SZ>(), from);
+    auto in2 = LoadU(FV<SZ>(), from + from_stride);
+    StoreU(Add(in1, in2), FV<SZ>(), to);
+    StoreU(Sub(in1, in2), FV<SZ>(), to + to_stride);
+  }
+};
+
+template <size_t N, size_t SZ>
+struct IDCT1DImpl {
+  void operator()(const float* from, size_t from_stride, float* to,
+                  size_t to_stride) {
+    JXL_DASSERT(from_stride >= SZ);
+    JXL_DASSERT(to_stride >= SZ);
+    // This is relatively small (4kB with 64-DCT and AVX-512)
+    HWY_ALIGN float tmp[N * SZ];
+    CoeffBundle<N, SZ>::ForwardEvenOdd(from, from_stride, tmp);
+    IDCT1DImpl<N / 2, SZ>()(tmp, SZ, tmp, SZ);
+    CoeffBundle<N / 2, SZ>::BTranspose(tmp + N / 2 * SZ);
+    IDCT1DImpl<N / 2, SZ>()(tmp + N / 2 * SZ, SZ, tmp + N / 2 * SZ, SZ);
+    CoeffBundle<N, SZ>::MultiplyAndAdd(tmp, to, to_stride);
+  }
+};
+
+template <size_t N, size_t M_or_0, typename FromBlock, typename ToBlock>
+void DCT1DWrapper(const FromBlock& from, const ToBlock& to, size_t Mp) {
+  size_t M = M_or_0 != 0 ? M_or_0 : Mp;
+  constexpr size_t SZ = MaxLanes(FV<M_or_0>());
+  HWY_ALIGN float tmp[N * SZ];
+  for (size_t i = 0; i < M; i += Lanes(FV<M_or_0>())) {
+    // TODO(veluca): consider removing the temporary memory here (as is done in
+    // IDCT), if it turns out that some compilers don't optimize away the loads
+    // and this is performance-critical.
+    CoeffBundle<N, SZ>::LoadFromBlock(from, i, tmp);
+    DCT1DImpl<N, SZ>()(tmp);
+    CoeffBundle<N, SZ>::StoreToBlockAndScale(tmp, to, i);
+  }
+}
+
+template <size_t N, size_t M_or_0, typename FromBlock, typename ToBlock>
+void IDCT1DWrapper(const FromBlock& from, const ToBlock& to, size_t Mp) {
+  size_t M = M_or_0 != 0 ? M_or_0 : Mp;
+  constexpr size_t SZ = MaxLanes(FV<M_or_0>());
+  for (size_t i = 0; i < M; i += Lanes(FV<M_or_0>())) {
+    IDCT1DImpl<N, SZ>()(from.Address(0, i), from.Stride(), to.Address(0, i),
+                        to.Stride());
+  }
+}
+
+template <size_t N, size_t M, typename = void>
+struct DCT1D {
+  template <typename FromBlock, typename ToBlock>
+  void operator()(const FromBlock& from, const ToBlock& to) {
+    return DCT1DWrapper<N, M>(from, to, M);
+  }
+};
+
+template <size_t N, size_t M>
+struct DCT1D<N, M, typename std::enable_if<(M > MaxLanes(FV<0>()))>::type> {
+  template <typename FromBlock, typename ToBlock>
+  void operator()(const FromBlock& from, const ToBlock& to) {
+    return NoInlineWrapper(DCT1DWrapper<N, 0, FromBlock, ToBlock>, from, to, M);
+  }
+};
+
+template <size_t N, size_t M, typename = void>
+struct IDCT1D {
+  template <typename FromBlock, typename ToBlock>
+  void operator()(const FromBlock& from, const ToBlock& to) {
+    return IDCT1DWrapper<N, M>(from, to, M);
+  }
+};
+
+template <size_t N, size_t M>
+struct IDCT1D<N, M, typename std::enable_if<(M > MaxLanes(FV<0>()))>::type> {
+  template <typename FromBlock, typename ToBlock>
+  void operator()(const FromBlock& from, const ToBlock& to) {
+    return NoInlineWrapper(IDCT1DWrapper<N, 0, FromBlock, ToBlock>, from, to,
+                           M);
+  }
+};
+
+// Computes the maybe-transposed, scaled DCT of a block, that needs to be
+// HWY_ALIGN'ed.
+template <size_t ROWS, size_t COLS>
+struct ComputeScaledDCT {
+  // scratch_space must be aligned, and should have space for ROWS*COLS
+  // floats.
+  template <class From>
+  HWY_MAYBE_UNUSED void operator()(const From& from, float* to,
+                                   float* JXL_RESTRICT scratch_space) {
+    float* JXL_RESTRICT block = scratch_space;
+    if (ROWS < COLS) {
+      DCT1D<ROWS, COLS>()(from, DCTTo(block, COLS));
+      Transpose<ROWS, COLS>::Run(DCTFrom(block, COLS), DCTTo(to, ROWS));
+      DCT1D<COLS, ROWS>()(DCTFrom(to, ROWS), DCTTo(block, ROWS));
+      Transpose<COLS, ROWS>::Run(DCTFrom(block, ROWS), DCTTo(to, COLS));
+    } else {
+      DCT1D<ROWS, COLS>()(from, DCTTo(to, COLS));
+      Transpose<ROWS, COLS>::Run(DCTFrom(to, COLS), DCTTo(block, ROWS));
+      DCT1D<COLS, ROWS>()(DCTFrom(block, ROWS), DCTTo(to, ROWS));
+    }
+  }
+};
+// Computes the maybe-transposed, scaled IDCT of a block, that needs to be
+// HWY_ALIGN'ed.
+template <size_t ROWS, size_t COLS>
+struct ComputeScaledIDCT {
+  // scratch_space must be aligned, and should have space for ROWS*COLS
+  // floats.
+  template <class To>
+  HWY_MAYBE_UNUSED void operator()(float* JXL_RESTRICT from, const To& to,
+                                   float* JXL_RESTRICT scratch_space) {
+    float* JXL_RESTRICT block = scratch_space;
+    // Reverse the steps done in ComputeScaledDCT.
+    if (ROWS < COLS) {
+      Transpose<ROWS, COLS>::Run(DCTFrom(from, COLS), DCTTo(block, ROWS));
+      IDCT1D<COLS, ROWS>()(DCTFrom(block, ROWS), DCTTo(from, ROWS));
+      Transpose<COLS, ROWS>::Run(DCTFrom(from, ROWS), DCTTo(block, COLS));
+      IDCT1D<ROWS, COLS>()(DCTFrom(block, COLS), to);
+    } else {
+      IDCT1D<COLS, ROWS>()(DCTFrom(from, ROWS), DCTTo(block, ROWS));
+      Transpose<COLS, ROWS>::Run(DCTFrom(block, ROWS), DCTTo(from, COLS));
+      IDCT1D<ROWS, COLS>()(DCTFrom(from, COLS), to);
+    }
+  }
+};
+
+}  // namespace
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+#endif  // LIB_JXL_DCT_INL_H_
diff --git a/third_party/jpeg-xl/lib/jxl/dct_block-inl.h b/third_party/jpeg-xl/lib/jxl/dct_block-inl.h
new file mode 100644
index 0000000000..50646a737f
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/dct_block-inl.h
@@ -0,0 +1,108 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Adapters for DCT input/output: from/to contiguous blocks or image rows.
+
+#if defined(LIB_JXL_DCT_BLOCK_INL_H_) == defined(HWY_TARGET_TOGGLE)
+#ifdef LIB_JXL_DCT_BLOCK_INL_H_
+#undef LIB_JXL_DCT_BLOCK_INL_H_
+#else
+#define LIB_JXL_DCT_BLOCK_INL_H_
+#endif
+
+#include <stddef.h>
+
+#include <hwy/highway.h>
+
+#include "lib/jxl/base/status.h"
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+namespace {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Vec;
+
+// Block: (x, y) <-> (N * y + x)
+// Lines: (x, y) <-> (stride * y + x)
+//
+// I.e. Block is a specialization of Lines with fixed stride.
+//
+// FromXXX should implement Read and Load (Read vector).
+// ToXXX should implement Write and Store (Write vector).
+
+template <size_t N>
+using BlockDesc = HWY_CAPPED(float, N);
+
+// Here and in the following, the SZ template parameter specifies the number of
+// values to load/store. Needed because we want to handle 4x4 sub-blocks of
+// 16x16 blocks.
+class DCTFrom {
+ public:
+  DCTFrom(const float* data, size_t stride) : stride_(stride), data_(data) {}
+
+  template <typename D>
+  HWY_INLINE Vec<D> LoadPart(D, const size_t row, size_t i) const {
+    JXL_DASSERT(Lanes(D()) <= stride_);
+    // Since these functions are used also for DC, no alignment at all is
+    // guaranteed in the case of floating blocks.
+    // TODO(veluca): consider using a different class for DC-to-LF and
+    // DC-from-LF, or copying DC values to/from a temporary aligned location.
+    return LoadU(D(), Address(row, i));
+  }
+
+  HWY_INLINE float Read(const size_t row, const size_t i) const {
+    return *Address(row, i);
+  }
+
+  constexpr HWY_INLINE const float* Address(const size_t row,
+                                            const size_t i) const {
+    return data_ + row * stride_ + i;
+  }
+
+  size_t Stride() const { return stride_; }
+
+ private:
+  size_t stride_;
+  const float* JXL_RESTRICT data_;
+};
+
+class DCTTo {
+ public:
+  DCTTo(float* data, size_t stride) : stride_(stride), data_(data) {}
+
+  template <typename D>
+  HWY_INLINE void StorePart(D, const Vec<D>& v, const size_t row,
+                            size_t i) const {
+    JXL_DASSERT(Lanes(D()) <= stride_);
+    // Since these functions are used also for DC, no alignment at all is
+    // guaranteed in the case of floating blocks.
+    // TODO(veluca): consider using a different class for DC-to-LF and
+    // DC-from-LF, or copying DC values to/from a temporary aligned location.
+    StoreU(v, D(), Address(row, i));
+  }
+
+  HWY_INLINE void Write(float v, const size_t row, const size_t i) const {
+    *Address(row, i) = v;
+  }
+
+  constexpr HWY_INLINE float* Address(const size_t row, const size_t i) const {
+    return data_ + row * stride_ + i;
+  }
+
+  size_t Stride() const { return stride_; }
+
+ private:
+  size_t stride_;
+  float* JXL_RESTRICT data_;
+};
+
+}  // namespace
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#endif  // LIB_JXL_DCT_BLOCK_INL_H_
diff --git a/third_party/jpeg-xl/lib/jxl/dct_for_test.h b/third_party/jpeg-xl/lib/jxl/dct_for_test.h
new file mode 100644
index 0000000000..8e32aa7eff
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/dct_for_test.h
@@ -0,0 +1,99 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_DCT_FOR_TEST_H_
+#define LIB_JXL_DCT_FOR_TEST_H_
+
+// Unoptimized DCT only for use in tests.
+
+#include <string.h>  // memcpy
+
+#include <cmath>
+#include <vector>
+
+#include "lib/jxl/common.h"  // Pi
+
+namespace jxl {
+
+namespace test {
+static inline double alpha(int u) { return u == 0 ? 0.7071067811865475 : 1.0; }
+
+// N-DCT on M columns, divided by sqrt(N). Matches the definition in the spec.
+template <size_t N, size_t M>
+void DCT1D(double block[N * M], double out[N * M]) {
+  std::vector<double> matrix(N * N);
+  const double scale = std::sqrt(2.0) / N;
+  for (size_t y = 0; y < N; y++) {
+    for (size_t u = 0; u < N; u++) {
+      matrix[N * u + y] = alpha(u) * cos((y + 0.5) * u * Pi(1.0 / N)) * scale;
+    }
+  }
+  for (size_t x = 0; x < M; x++) {
+    for (size_t u = 0; u < N; u++) {
+      out[M * u + x] = 0;
+      for (size_t y = 0; y < N; y++) {
+        out[M * u + x] += matrix[N * u + y] * block[M * y + x];
+      }
+    }
+  }
+}
+
+// N-IDCT on M columns, multiplied by sqrt(N). Matches the definition in the
+// spec.
+template <size_t N, size_t M>
+void IDCT1D(double block[N * M], double out[N * M]) {
+  std::vector<double> matrix(N * N);
+  const double scale = std::sqrt(2.0);
+  for (size_t y = 0; y < N; y++) {
+    for (size_t u = 0; u < N; u++) {
+      // Transpose of DCT matrix.
+      matrix[N * y + u] = alpha(u) * cos((y + 0.5) * u * Pi(1.0 / N)) * scale;
+    }
+  }
+  for (size_t x = 0; x < M; x++) {
+    for (size_t u = 0; u < N; u++) {
+      out[M * u + x] = 0;
+      for (size_t y = 0; y < N; y++) {
+        out[M * u + x] += matrix[N * u + y] * block[M * y + x];
+      }
+    }
+  }
+}
+
+template <size_t N, size_t M>
+void TransposeBlock(double in[N * M], double out[M * N]) {
+  for (size_t x = 0; x < N; x++) {
+    for (size_t y = 0; y < M; y++) {
+      out[y * N + x] = in[x * M + y];
+    }
+  }
+}
+}  // namespace test
+
+// Untransposed DCT.
+template <size_t N>
+void DCTSlow(double block[N * N]) {
+  constexpr size_t kBlockSize = N * N;
+  std::vector<double> g(kBlockSize);
+  test::DCT1D<N, N>(block, g.data());
+  test::TransposeBlock<N, N>(g.data(), block);
+  test::DCT1D<N, N>(block, g.data());
+  test::TransposeBlock<N, N>(g.data(), block);
+}
+
+// Untransposed IDCT.
+template <size_t N>
+void IDCTSlow(double block[N * N]) {
+  constexpr size_t kBlockSize = N * N;
+  std::vector<double> g(kBlockSize);
+  test::IDCT1D<N, N>(block, g.data());
+  test::TransposeBlock<N, N>(g.data(), block);
+  test::IDCT1D<N, N>(block, g.data());
+  test::TransposeBlock<N, N>(g.data(), block);
+}
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_DCT_FOR_TEST_H_
diff --git a/third_party/jpeg-xl/lib/jxl/dct_scales.cc b/third_party/jpeg-xl/lib/jxl/dct_scales.cc
new file mode 100644
index 0000000000..f9e89a6014
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/dct_scales.cc
@@ -0,0 +1,31 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/dct_scales.h"
+
+namespace jxl {
+
+// Definition of constexpr arrays.
+constexpr float DCTResampleScales<1, 8>::kScales[];
+constexpr float DCTResampleScales<2, 16>::kScales[];
+constexpr float DCTResampleScales<4, 32>::kScales[];
+constexpr float DCTResampleScales<8, 64>::kScales[];
+constexpr float DCTResampleScales<16, 128>::kScales[];
+constexpr float DCTResampleScales<32, 256>::kScales[];
+constexpr float DCTResampleScales<8, 1>::kScales[];
+constexpr float DCTResampleScales<16, 2>::kScales[];
+constexpr float DCTResampleScales<32, 4>::kScales[];
+constexpr float DCTResampleScales<64, 8>::kScales[];
+constexpr float DCTResampleScales<128, 16>::kScales[];
+constexpr float DCTResampleScales<256, 32>::kScales[];
+constexpr float WcMultipliers<4>::kMultipliers[];
+constexpr float WcMultipliers<8>::kMultipliers[];
+constexpr float WcMultipliers<16>::kMultipliers[];
+constexpr float WcMultipliers<32>::kMultipliers[];
+constexpr float WcMultipliers<64>::kMultipliers[];
+constexpr float WcMultipliers<128>::kMultipliers[];
+constexpr float WcMultipliers<256>::kMultipliers[];
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/dct_scales.h b/third_party/jpeg-xl/lib/jxl/dct_scales.h
new file mode 100644
index 0000000000..23af03d60f
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/dct_scales.h
@@ -0,0 +1,379 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_DCT_SCALES_H_
+#define LIB_JXL_DCT_SCALES_H_
+
+// Scaling factors.
+
+#include <stddef.h>
+
+namespace jxl {
+
+static constexpr float kSqrt2 = 1.41421356237f;
+static constexpr float kSqrt0_5 = 0.70710678118f;
+
+// For n != 0, the n-th basis function of a N-DCT, evaluated in pixel k, has a
+// value of cos((k+1/2) n/(2N) pi). When downsampling by 2x, we average
+// the values for pixel k and k+1 to get the value for pixel (k/2), thus we get
+//
+// [cos((k+1/2) n/N pi) + cos((k+3/2) n/N pi)]/2 =
+// cos(n/(2N) pi) cos((k+1) n/N pi) =
+// cos(n/(2N) pi) cos(((k/2)+1/2) n/(N/2) pi)
+//
+// which is exactly the same as the value of pixel k/2 of a N/2-sized DCT,
+// except for the cos(n/(2N) pi) scaling factor (which does *not*
+// depend on the pixel). Thus, when using the lower-frequency coefficients of a
+// DCT-N to compute a DCT-(N/2), they should be scaled by this constant. Scaling
+// factors for a DCT-(N/4) etc can then be obtained by successive
+// multiplications. The structs below contain the above-mentioned scaling
+// factors.
+//
+// Python code for the tables below:
+//
+// for i in range(N // 8):
+//    v = math.cos(i / (2 * N) * math.pi)
+//    v *= math.cos(i / (N) * math.pi)
+//    v *= math.cos(i / (N / 2) * math.pi)
+//    print(v, end=", ")
+
+template <size_t FROM, size_t TO>
+struct DCTResampleScales;
+
+template <>
+struct DCTResampleScales<8, 1> {
+  static constexpr float kScales[] = {
+      1.000000000000000000,
+  };
+};
+
+template <>
+struct DCTResampleScales<16, 2> {
+  static constexpr float kScales[] = {
+      1.000000000000000000,
+      0.901764195028874394,
+  };
+};
+
+template <>
+struct DCTResampleScales<32, 4> {
+  static constexpr float kScales[] = {
+      1.000000000000000000,
+      0.974886821136879522,
+      0.901764195028874394,
+      0.787054918159101335,
+  };
+};
+
+template <>
+struct DCTResampleScales<64, 8> {
+  static constexpr float kScales[] = {
+      1.0000000000000000, 0.9936866130906366, 0.9748868211368796,
+      0.9440180941651672, 0.9017641950288744, 0.8490574973847023,
+      0.7870549181591013, 0.7171081282466044,
+  };
+};
+
+template <>
+struct DCTResampleScales<128, 16> {
+  static constexpr float kScales[] = {
+      1.0,
+      0.9984194528776054,
+      0.9936866130906366,
+      0.9858278282666936,
+      0.9748868211368796,
+      0.9609244059440204,
+      0.9440180941651672,
+      0.9242615922757944,
+      0.9017641950288744,
+      0.8766500784429904,
+      0.8490574973847023,
+      0.8191378932865928,
+      0.7870549181591013,
+      0.7529833816270532,
+      0.7171081282466044,
+      0.6796228528314651,
+  };
+};
+
+template <>
+struct DCTResampleScales<256, 32> {
+  static constexpr float kScales[] = {
+      1.0,
+      0.9996047255830407,
+      0.9984194528776054,
+      0.9964458326264695,
+      0.9936866130906366,
+      0.9901456355893141,
+      0.9858278282666936,
+      0.9807391980963174,
+      0.9748868211368796,
+      0.9682788310563117,
+      0.9609244059440204,
+      0.9528337534340876,
+      0.9440180941651672,
+      0.9344896436056892,
+      0.9242615922757944,
+      0.913348084400198,
+      0.9017641950288744,
+      0.8895259056651056,
+      0.8766500784429904,
+      0.8631544288990163,
+      0.8490574973847023,
+      0.8343786191696513,
+      0.8191378932865928,
+      0.8033561501721485,
+      0.7870549181591013,
+      0.7702563888779096,
+      0.7529833816270532,
+      0.7352593067735488,
+      0.7171081282466044,
+      0.6985543251889097,
+      0.6796228528314651,
+      0.6603391026591464,
+  };
+};
+
+// Inverses of the above.
+template <>
+struct DCTResampleScales<1, 8> {
+  static constexpr float kScales[] = {
+      1.000000000000000000,
+  };
+};
+
+template <>
+struct DCTResampleScales<2, 16> {
+  static constexpr float kScales[] = {
+      1.000000000000000000,
+      1.108937353592731823,
+  };
+};
+
+template <>
+struct DCTResampleScales<4, 32> {
+  static constexpr float kScales[] = {
+      1.000000000000000000,
+      1.025760096781116015,
+      1.108937353592731823,
+      1.270559368765487251,
+  };
+};
+
+template <>
+struct DCTResampleScales<8, 64> {
+  static constexpr float kScales[] = {
+      1.0000000000000000, 1.0063534990068217, 1.0257600967811158,
+      1.0593017296817173, 1.1089373535927318, 1.1777765381970435,
+      1.2705593687654873, 1.3944898413647777,
+  };
+};
+
+template <>
+struct DCTResampleScales<16, 128> {
+  static constexpr float kScales[] = {
+      1.0,
+      1.0015830492062623,
+      1.0063534990068217,
+      1.0143759095928793,
+      1.0257600967811158,
+      1.0406645869480142,
+      1.0593017296817173,
+      1.0819447744633812,
+      1.1089373535927318,
+      1.1407059950032632,
+      1.1777765381970435,
+      1.2207956782315876,
+      1.2705593687654873,
+      1.3280505578213306,
+      1.3944898413647777,
+      1.4714043176061107,
+  };
+};
+
+template <>
+struct DCTResampleScales<32, 256> {
+  static constexpr float kScales[] = {
+      1.0,
+      1.0003954307206069,
+      1.0015830492062623,
+      1.0035668445360069,
+      1.0063534990068217,
+      1.009952439375063,
+      1.0143759095928793,
+      1.0196390660647288,
+      1.0257600967811158,
+      1.0327603660498115,
+      1.0406645869480142,
+      1.049501024072585,
+      1.0593017296817173,
+      1.0701028169146336,
+      1.0819447744633812,
+      1.0948728278734026,
+      1.1089373535927318,
+      1.124194353004584,
+      1.1407059950032632,
+      1.158541237256391,
+      1.1777765381970435,
+      1.1984966740820495,
+      1.2207956782315876,
+      1.244777922949508,
+      1.2705593687654873,
+      1.2982690107339132,
+      1.3280505578213306,
+      1.3600643892400104,
+      1.3944898413647777,
+      1.4315278911623237,
+      1.4714043176061107,
+      1.5143734423314616,
+  };
+};
+
+// Constants for DCT implementation. Generated by the following snippet:
+// for i in range(N // 2):
+//    print(1.0 / (2 * math.cos((i + 0.5) * math.pi / N)), end=", ")
+template <size_t N>
+struct WcMultipliers;
+
+template <>
+struct WcMultipliers<4> {
+  static constexpr float kMultipliers[] = {
+      0.541196100146197,
+      1.3065629648763764,
+  };
+};
+
+template <>
+struct WcMultipliers<8> {
+  static constexpr float kMultipliers[] = {
+      0.5097955791041592,
+      0.6013448869350453,
+      0.8999762231364156,
+      2.5629154477415055,
+  };
+};
+
+template <>
+struct WcMultipliers<16> {
+  static constexpr float kMultipliers[] = {
+      0.5024192861881557, 0.5224986149396889, 0.5669440348163577,
+      0.6468217833599901, 0.7881546234512502, 1.060677685990347,
+      1.7224470982383342, 5.101148618689155,
+  };
+};
+
+template <>
+struct WcMultipliers<32> {
+  static constexpr float kMultipliers[] = {
+      0.5006029982351963, 0.5054709598975436, 0.5154473099226246,
+      0.5310425910897841, 0.5531038960344445, 0.5829349682061339,
+      0.6225041230356648, 0.6748083414550057, 0.7445362710022986,
+      0.8393496454155268, 0.9725682378619608, 1.1694399334328847,
+      1.4841646163141662, 2.057781009953411,  3.407608418468719,
+      10.190008123548033,
+  };
+};
+template <>
+struct WcMultipliers<64> {
+  static constexpr float kMultipliers[] = {
+      0.500150636020651,  0.5013584524464084, 0.5037887256810443,
+      0.5074711720725553, 0.5124514794082247, 0.5187927131053328,
+      0.52657731515427,   0.535909816907992,  0.5469204379855088,
+      0.5597698129470802, 0.57465518403266,   0.5918185358574165,
+      0.6115573478825099, 0.6342389366884031, 0.6603198078137061,
+      0.6903721282002123, 0.7251205223771985, 0.7654941649730891,
+      0.8127020908144905, 0.8683447152233481, 0.9345835970364075,
+      1.0144082649970547, 1.1120716205797176, 1.233832737976571,
+      1.3892939586328277, 1.5939722833856311, 1.8746759800084078,
+      2.282050068005162,  2.924628428158216,  4.084611078129248,
+      6.796750711673633,  20.373878167231453,
+  };
+};
+template <>
+struct WcMultipliers<128> {
+  static constexpr float kMultipliers[] = {
+      0.5000376519155477, 0.5003390374428216, 0.5009427176380873,
+      0.5018505174842379, 0.5030651913013697, 0.5045904432216454,
+      0.5064309549285542, 0.5085924210498143, 0.5110815927066812,
+      0.5139063298475396, 0.5170756631334912, 0.5205998663018917,
+      0.524490540114724,  0.5287607092074876, 0.5334249333971333,
+      0.538499435291984,  0.5440022463817783, 0.549953374183236,
+      0.5563749934898856, 0.5632916653417023, 0.5707305880121454,
+      0.5787218851348208, 0.5872989370937893, 0.5964987630244563,
+      0.606362462272146,  0.6169357260050706, 0.6282694319707711,
+      0.6404203382416639, 0.6534518953751283, 0.6674352009263413,
+      0.6824501259764195, 0.6985866506472291, 0.7159464549705746,
+      0.7346448236478627, 0.7548129391165311, 0.776600658233963,
+      0.8001798956216941, 0.8257487738627852, 0.8535367510066064,
+      0.8838110045596234, 0.9168844461846523, 0.9531258743921193,
+      0.9929729612675466, 1.036949040910389,  1.0856850642580145,
+      1.1399486751015042, 1.2006832557294167, 1.2690611716991191,
+      1.346557628206286,  1.4350550884414341, 1.5369941008524954,
+      1.6555965242641195, 1.7952052190778898, 1.961817848571166,
+      2.163957818751979,  2.4141600002500763, 2.7316450287739396,
+      3.147462191781909,  3.7152427383269746, 4.5362909369693565,
+      5.827688377844654,  8.153848602466814,  13.58429025728446,
+      40.744688103351834,
+  };
+};
+
+template <>
+struct WcMultipliers<256> {
+  static constexpr float kMultipliers[128] = {
+      0.5000094125358878, 0.500084723455784,  0.5002354020255269,
+      0.5004615618093246, 0.5007633734146156, 0.5011410648064231,
+      0.5015949217281668, 0.502125288230386,  0.5027325673091954,
+      0.5034172216566842, 0.5041797745258774, 0.5050208107132756,
+      0.5059409776624396, 0.5069409866925212, 0.5080216143561264,
+      0.509183703931388,  0.5104281670536573, 0.5117559854927805,
+      0.5131682130825206, 0.5146659778093218, 0.516250484068288,
+      0.5179230150949777, 0.5196849355823947, 0.5215376944933958,
+      0.5234828280796439, 0.52552196311921,   0.5276568203859896,
+      0.5298892183652453, 0.5322210772308335, 0.5346544231010253,
+      0.537191392591309,  0.5398342376841637, 0.5425853309375497,
+      0.545447171055775,  0.5484223888484947, 0.551513753605893,
+      0.554724179920619,  0.5580567349898085, 0.5615146464335654,
+      0.5651013106696203, 0.5688203018875696, 0.5726753816701664,
+      0.5766705093136241, 0.5808098529038624, 0.5850978012111273,
+      0.58953897647151,   0.5941382481306648, 0.5989007476325463,
+      0.6038318843443582, 0.6089373627182432, 0.614223200800649,
+      0.6196957502119484, 0.6253617177319102, 0.6312281886412079,
+      0.6373026519855411, 0.6435930279473415, 0.6501076975307724,
+      0.6568555347890955, 0.6638459418498757, 0.6710888870233562,
+      0.6785949463131795, 0.6863753486870501, 0.6944420255086364,
+      0.7028076645818034, 0.7114857693151208, 0.7204907235796304,
+      0.7298378629074134, 0.7395435527641373, 0.749625274727372,
+      0.7601017215162176, 0.7709929019493761, 0.7823202570613161,
+      0.7941067887834509, 0.8063772028037925, 0.8191580674598145,
+      0.83247799080191,   0.8463678182968619, 0.860860854031955,
+      0.8759931087426972, 0.8918035785352535, 0.9083345588266809,
+      0.9256319988042384, 0.9437459026371479, 0.962730784794803,
+      0.9826461881778968, 1.0035572754078206, 1.0255355056139732,
+      1.048659411496106,  1.0730154944316674, 1.0986992590905857,
+      1.1258164135986009, 1.1544842669978943, 1.184833362908442,
+      1.217009397314603,  1.2511754798461228, 1.287514812536712,
+      1.326233878832723,  1.3675662599582539, 1.411777227500661,
+      1.459169302866857,  1.5100890297227016, 1.5649352798258847,
+      1.6241695131835794, 1.6883285509131505, 1.7580406092704062,
+      1.8340456094306077, 1.9172211551275689, 2.0086161135167564,
+      2.1094945286246385, 2.22139377701127,   2.346202662531156,
+      2.486267909203593,  2.644541877144861,  2.824791402350551,
+      3.0318994541759925, 3.2723115884254845, 3.5547153325075804,
+      3.891107790700307,  4.298537526449054,  4.802076008665048,
+      5.440166215091329,  6.274908408039339,  7.413566756422303,
+      9.058751453879703,  11.644627325175037, 16.300023088031555,
+      27.163977662448232, 81.48784219222516,
+  };
+};
+
+// Apply the DCT algorithm-intrinsic constants to DCTResampleScale.
+template <size_t FROM, size_t TO>
+constexpr float DCTTotalResampleScale(size_t x) {
+  return DCTResampleScales<FROM, TO>::kScales[x];
+}
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_DCT_SCALES_H_
diff --git a/third_party/jpeg-xl/lib/jxl/dct_test.cc b/third_party/jpeg-xl/lib/jxl/dct_test.cc
new file mode 100644
index 0000000000..9f5eff41e9
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/dct_test.cc
@@ -0,0 +1,389 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <string.h>
+
+#include <cmath>
+#include <numeric>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/dct_test.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+#include <hwy/tests/test_util-inl.h>
+
+#include "lib/jxl/common.h"
+#include "lib/jxl/dct-inl.h"
+#include "lib/jxl/dct_for_test.h"
+#include "lib/jxl/dct_scales.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/test_utils.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// Computes the in-place NxN DCT of block.
+// Requires that block is HWY_ALIGN'ed.
+//
+// Performs ComputeTransposedScaledDCT and then transposes and scales it to
+// obtain "vanilla" DCT.
+template <size_t N>
+void ComputeDCT(float block[N * N]) {
+  HWY_ALIGN float tmp_block[N * N];
+  HWY_ALIGN float scratch_space[N * N];
+  ComputeScaledDCT<N, N>()(DCTFrom(block, N), tmp_block, scratch_space);
+
+  // Untranspose.
+  Transpose<N, N>::Run(DCTFrom(tmp_block, N), DCTTo(block, N));
+}
+
+// Computes the in-place 8x8 iDCT of block.
+// Requires that block is HWY_ALIGN'ed.
+template <int N>
+void ComputeIDCT(float block[N * N]) {
+  HWY_ALIGN float tmp_block[N * N];
+  HWY_ALIGN float scratch_space[N * N];
+  // Untranspose.
+  Transpose<N, N>::Run(DCTFrom(block, N), DCTTo(tmp_block, N));
+
+  ComputeScaledIDCT<N, N>()(tmp_block, DCTTo(block, N), scratch_space);
+}
+
+template <size_t N>
+void TransposeTestT(float accuracy) {
+  constexpr size_t kBlockSize = N * N;
+  HWY_ALIGN float src[kBlockSize];
+  DCTTo to_src(src, N);
+  for (size_t y = 0; y < N; ++y) {
+    for (size_t x = 0; x < N; ++x) {
+      to_src.Write(y * N + x, y, x);
+    }
+  }
+  HWY_ALIGN float dst[kBlockSize];
+  Transpose<N, N>::Run(DCTFrom(src, N), DCTTo(dst, N));
+  DCTFrom from_dst(dst, N);
+  for (size_t y = 0; y < N; ++y) {
+    for (size_t x = 0; x < N; ++x) {
+      float expected = x * N + y;
+      float actual = from_dst.Read(y, x);
+      EXPECT_NEAR(expected, actual, accuracy) << "x = " << x << ", y = " << y;
+    }
+  }
+}
+
+void TransposeTest() {
+  TransposeTestT<8>(1e-7f);
+  TransposeTestT<16>(1e-7f);
+  TransposeTestT<32>(1e-7f);
+}
+
+template <size_t N>
+void ColumnDctRoundtripT(float accuracy) {
+  constexpr size_t kBlockSize = N * N;
+  // Though we are only interested in single column result, dct.h has built-in
+  // limit on minimal number of columns processed. So, to be safe, we do
+  // regular 8x8 block transformation. On the bright side - we could check all
+  // 8 basis vectors at once.
+  HWY_ALIGN float block[kBlockSize];
+  DCTTo to(block, N);
+  DCTFrom from(block, N);
+  for (size_t i = 0; i < N; ++i) {
+    for (size_t j = 0; j < N; ++j) {
+      to.Write((i == j) ? 1.0f : 0.0f, i, j);
+    }
+  }
+
+  // Running (I)DCT on the same memory block seems to trigger a compiler bug on
+  // ARMv7 with clang6.
+  HWY_ALIGN float tmp[kBlockSize];
+  DCTTo to_tmp(tmp, N);
+  DCTFrom from_tmp(tmp, N);
+
+  DCT1D<N, N>()(from, to_tmp);
+  IDCT1D<N, N>()(from_tmp, to);
+
+  for (size_t i = 0; i < N; ++i) {
+    for (size_t j = 0; j < N; ++j) {
+      float expected = (i == j) ? 1.0f : 0.0f;
+      float actual = from.Read(i, j);
+      EXPECT_NEAR(expected, actual, accuracy) << " i=" << i << ", j=" << j;
+    }
+  }
+}
+
+void ColumnDctRoundtrip() {
+  ColumnDctRoundtripT<8>(1e-6f);
+  ColumnDctRoundtripT<16>(1e-6f);
+  ColumnDctRoundtripT<32>(1e-6f);
+}
+
+template <size_t N>
+void TestDctAccuracy(float accuracy, size_t start = 0, size_t end = N * N) {
+  constexpr size_t kBlockSize = N * N;
+  for (size_t i = start; i < end; i++) {
+    HWY_ALIGN float fast[kBlockSize] = {0.0f};
+    double slow[kBlockSize] = {0.0};
+    fast[i] = 1.0;
+    slow[i] = 1.0;
+    DCTSlow<N>(slow);
+    ComputeDCT<N>(fast);
+    for (size_t k = 0; k < kBlockSize; ++k) {
+      EXPECT_NEAR(fast[k], slow[k], accuracy / N)
+          << "i = " << i << ", k = " << k << ", N = " << N;
+    }
+  }
+}
+
+template <size_t N>
+void TestIdctAccuracy(float accuracy, size_t start = 0, size_t end = N * N) {
+  constexpr size_t kBlockSize = N * N;
+  for (size_t i = start; i < end; i++) {
+    HWY_ALIGN float fast[kBlockSize] = {0.0f};
+    double slow[kBlockSize] = {0.0};
+    fast[i] = 1.0;
+    slow[i] = 1.0;
+    IDCTSlow<N>(slow);
+    ComputeIDCT<N>(fast);
+    for (size_t k = 0; k < kBlockSize; ++k) {
+      EXPECT_NEAR(fast[k], slow[k], accuracy * N)
+          << "i = " << i << ", k = " << k << ", N = " << N;
+    }
+  }
+}
+
+template <size_t N>
+void TestInverseT(float accuracy) {
+  test::ThreadPoolForTests pool(N < 32 ? 0 : 8);
+  enum { kBlockSize = N * N };
+  EXPECT_TRUE(RunOnPool(
+      &pool, 0, kBlockSize, ThreadPool::NoInit,
+      [accuracy](const uint32_t task, size_t /*thread*/) {
+        const size_t i = static_cast<size_t>(task);
+        HWY_ALIGN float x[kBlockSize] = {0.0f};
+        x[i] = 1.0;
+
+        ComputeIDCT<N>(x);
+        ComputeDCT<N>(x);
+
+        for (size_t k = 0; k < kBlockSize; ++k) {
+          EXPECT_NEAR(x[k], (k == i) ? 1.0f : 0.0f, accuracy)
+              << "i = " << i << ", k = " << k;
+        }
+      },
+      "TestInverse"));
+}
+
+void InverseTest() {
+  TestInverseT<8>(1e-6f);
+  TestInverseT<16>(1e-6f);
+  TestInverseT<32>(3e-6f);
+}
+
+template <size_t N>
+void TestDctTranspose(float accuracy, size_t start = 0, size_t end = N * N) {
+  constexpr size_t kBlockSize = N * N;
+  for (size_t i = start; i < end; i++) {
+    for (size_t j = 0; j < kBlockSize; ++j) {
+      // We check that <e_i, Me_j> = <M^\dagger{}e_i, e_j>.
+      // That means (Me_j)_i = (M^\dagger{}e_i)_j
+
+      // x := Me_j
+      HWY_ALIGN float x[kBlockSize] = {0.0f};
+      x[j] = 1.0;
+      ComputeIDCT<N>(x);
+      // y := M^\dagger{}e_i
+      HWY_ALIGN float y[kBlockSize] = {0.0f};
+      y[i] = 1.0;
+      ComputeDCT<N>(y);
+
+      EXPECT_NEAR(x[i] / N, y[j] * N, accuracy) << "i = " << i << ", j = " << j;
+    }
+  }
+}
+
+template <size_t N>
+void TestSlowInverse(float accuracy, size_t start = 0, size_t end = N * N) {
+  constexpr size_t kBlockSize = N * N;
+  for (size_t i = start; i < end; i++) {
+    double x[kBlockSize] = {0.0f};
+    x[i] = 1.0;
+
+    DCTSlow<N>(x);
+    IDCTSlow<N>(x);
+
+    for (size_t k = 0; k < kBlockSize; ++k) {
+      EXPECT_NEAR(x[k], (k == i) ? 1.0f : 0.0f, accuracy)
+          << "i = " << i << ", k = " << k;
+    }
+  }
+}
+
+template <size_t ROWS, size_t COLS>
+void TestRectInverseT(float accuracy) {
+  constexpr size_t kBlockSize = ROWS * COLS;
+  for (size_t i = 0; i < kBlockSize; ++i) {
+    HWY_ALIGN float x[kBlockSize] = {0.0f};
+    HWY_ALIGN float out[kBlockSize] = {0.0f};
+    x[i] = 1.0;
+    HWY_ALIGN float coeffs[kBlockSize] = {0.0f};
+    HWY_ALIGN float scratch_space[kBlockSize * 2];
+
+    ComputeScaledDCT<ROWS, COLS>()(DCTFrom(x, COLS), coeffs, scratch_space);
+    ComputeScaledIDCT<ROWS, COLS>()(coeffs, DCTTo(out, COLS), scratch_space);
+
+    for (size_t k = 0; k < kBlockSize; ++k) {
+      EXPECT_NEAR(out[k], (k == i) ? 1.0f : 0.0f, accuracy)
+          << "i = " << i << ", k = " << k << " ROWS = " << ROWS
+          << " COLS = " << COLS;
+    }
+  }
+}
+
+void TestRectInverse() {
+  TestRectInverseT<16, 32>(1e-6f);
+  TestRectInverseT<8, 32>(1e-6f);
+  TestRectInverseT<8, 16>(1e-6f);
+  TestRectInverseT<4, 8>(1e-6f);
+  TestRectInverseT<2, 4>(1e-6f);
+  TestRectInverseT<1, 4>(1e-6f);
+  TestRectInverseT<1, 2>(1e-6f);
+
+  TestRectInverseT<32, 16>(1e-6f);
+  TestRectInverseT<32, 8>(1e-6f);
+  TestRectInverseT<16, 8>(1e-6f);
+  TestRectInverseT<8, 4>(1e-6f);
+  TestRectInverseT<4, 2>(1e-6f);
+  TestRectInverseT<4, 1>(1e-6f);
+  TestRectInverseT<2, 1>(1e-6f);
+}
+
+template <size_t ROWS, size_t COLS>
+void TestRectTransposeT(float accuracy) {
+  constexpr size_t kBlockSize = ROWS * COLS;
+  HWY_ALIGN float scratch_space[kBlockSize * 2];
+  for (size_t px = 0; px < COLS; ++px) {
+    for (size_t py = 0; py < ROWS; ++py) {
+      HWY_ALIGN float x1[kBlockSize] = {0.0f};
+      HWY_ALIGN float x2[kBlockSize] = {0.0f};
+      HWY_ALIGN float coeffs1[kBlockSize] = {0.0f};
+      HWY_ALIGN float coeffs2[kBlockSize] = {0.0f};
+      x1[py * COLS + px] = 1;
+      x2[px * ROWS + py] = 1;
+
+      constexpr size_t OUT_ROWS = ROWS < COLS ? ROWS : COLS;
+      constexpr size_t OUT_COLS = ROWS < COLS ? COLS : ROWS;
+
+      ComputeScaledDCT<ROWS, COLS>()(DCTFrom(x1, COLS), coeffs1, scratch_space);
+      ComputeScaledDCT<COLS, ROWS>()(DCTFrom(x2, ROWS), coeffs2, scratch_space);
+
+      for (size_t x = 0; x < OUT_COLS; ++x) {
+        for (size_t y = 0; y < OUT_ROWS; ++y) {
+          EXPECT_NEAR(coeffs1[y * OUT_COLS + x], coeffs2[y * OUT_COLS + x],
+                      accuracy)
+              << " px = " << px << ", py = " << py << ", x = " << x
+              << ", y = " << y;
+        }
+      }
+    }
+  }
+}
+
+void TestRectTranspose() {
+  TestRectTransposeT<16, 32>(1e-6f);
+  TestRectTransposeT<8, 32>(1e-6f);
+  TestRectTransposeT<8, 16>(1e-6f);
+  TestRectTransposeT<4, 8>(1e-6f);
+  TestRectTransposeT<2, 4>(1e-6f);
+  TestRectTransposeT<1, 4>(1e-6f);
+  TestRectTransposeT<1, 2>(1e-6f);
+
+  // Identical to 8, 16
+  //  TestRectTranspose<16, 8>(1e-6f);
+}
+
+void TestDctAccuracyShard(size_t shard) {
+  if (shard == 0) {
+    TestDctAccuracy<1>(1.1E-7f);
+    TestDctAccuracy<2>(1.1E-7f);
+    TestDctAccuracy<4>(1.1E-7f);
+    TestDctAccuracy<8>(1.1E-7f);
+    TestDctAccuracy<16>(1.3E-7f);
+  }
+  TestDctAccuracy<32>(1.1E-7f, 32 * shard, 32 * (shard + 1));
+}
+
+void TestIdctAccuracyShard(size_t shard) {
+  if (shard == 0) {
+    TestIdctAccuracy<1>(1E-7f);
+    TestIdctAccuracy<2>(1E-7f);
+    TestIdctAccuracy<4>(1E-7f);
+    TestIdctAccuracy<8>(1E-7f);
+    TestIdctAccuracy<16>(1E-7f);
+  }
+  TestIdctAccuracy<32>(1E-7f, 32 * shard, 32 * (shard + 1));
+}
+
+void TestDctTransposeShard(size_t shard) {
+  if (shard == 0) {
+    TestDctTranspose<8>(1E-6f);
+    TestDctTranspose<16>(1E-6f);
+  }
+  TestDctTranspose<32>(3E-6f, 32 * shard, 32 * (shard + 1));
+}
+
+void TestSlowInverseShard(size_t shard) {
+  if (shard == 0) {
+    TestSlowInverse<1>(1E-5f);
+    TestSlowInverse<2>(1E-5f);
+    TestSlowInverse<4>(1E-5f);
+    TestSlowInverse<8>(1E-5f);
+    TestSlowInverse<16>(1E-5f);
+  }
+  TestSlowInverse<32>(1E-5f, 32 * shard, 32 * (shard + 1));
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+class TransposeTest : public hwy::TestWithParamTarget {};
+
+HWY_TARGET_INSTANTIATE_TEST_SUITE_P(TransposeTest);
+
+HWY_EXPORT_AND_TEST_P(TransposeTest, TransposeTest);
+HWY_EXPORT_AND_TEST_P(TransposeTest, InverseTest);
+HWY_EXPORT_AND_TEST_P(TransposeTest, ColumnDctRoundtrip);
+HWY_EXPORT_AND_TEST_P(TransposeTest, TestRectInverse);
+HWY_EXPORT_AND_TEST_P(TransposeTest, TestRectTranspose);
+
+// Tests in the DctShardedTest class are sharded for N=32.
+class DctShardedTest : public ::hwy::TestWithParamTargetAndT<uint32_t> {};
+
+std::vector<uint32_t> ShardRange(uint32_t n) {
+#ifdef JXL_DISABLE_SLOW_TESTS
+  JXL_ASSERT(n > 6);
+  std::vector<uint32_t> ret = {0, 1, 3, 5, n - 1};
+#else
+  std::vector<uint32_t> ret(n);
+  std::iota(ret.begin(), ret.end(), 0);
+#endif  // JXL_DISABLE_SLOW_TESTS
+  return ret;
+}
+
+HWY_TARGET_INSTANTIATE_TEST_SUITE_P_T(DctShardedTest,
+                                      ::testing::ValuesIn(ShardRange(32)));
+
+HWY_EXPORT_AND_TEST_P_T(DctShardedTest, TestDctAccuracyShard);
+HWY_EXPORT_AND_TEST_P_T(DctShardedTest, TestIdctAccuracyShard);
+HWY_EXPORT_AND_TEST_P_T(DctShardedTest, TestDctTransposeShard);
+HWY_EXPORT_AND_TEST_P_T(DctShardedTest, TestSlowInverseShard);
+
+}  // namespace jxl
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jxl/dct_util.h b/third_party/jpeg-xl/lib/jxl/dct_util.h
new file mode 100644
index 0000000000..fb6ce3b971
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/dct_util.h
@@ -0,0 +1,86 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_DCT_UTIL_H_
+#define LIB_JXL_DCT_UTIL_H_
+
+#include <stddef.h>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/image_ops.h"
+
+namespace jxl {
+
+union ACPtr {
+  int32_t* ptr32;
+  int16_t* ptr16;
+  ACPtr() = default;
+  explicit ACPtr(int16_t* p) : ptr16(p) {}
+  explicit ACPtr(int32_t* p) : ptr32(p) {}
+};
+
+union ConstACPtr {
+  const int32_t* ptr32;
+  const int16_t* ptr16;
+  ConstACPtr() = default;
+  explicit ConstACPtr(const int16_t* p) : ptr16(p) {}
+  explicit ConstACPtr(const int32_t* p) : ptr32(p) {}
+};
+
+enum class ACType { k16 = 0, k32 = 1 };
+
+class ACImage {
+ public:
+  virtual ~ACImage() = default;
+  virtual ACType Type() const = 0;
+  virtual ACPtr PlaneRow(size_t c, size_t y, size_t xbase) = 0;
+  virtual ConstACPtr PlaneRow(size_t c, size_t y, size_t xbase) const = 0;
+  virtual size_t PixelsPerRow() const = 0;
+  virtual void ZeroFill() = 0;
+  virtual void ZeroFillPlane(size_t c) = 0;
+  virtual bool IsEmpty() const = 0;
+};
+
+template <typename T>
+class ACImageT final : public ACImage {
+ public:
+  ACImageT() = default;
+  ACImageT(size_t xsize, size_t ysize) {
+    static_assert(
+        std::is_same<T, int16_t>::value || std::is_same<T, int32_t>::value,
+        "ACImage must be either 32- or 16- bit");
+    img_ = Image3<T>(xsize, ysize);
+  }
+  ACType Type() const override {
+    return sizeof(T) == 2 ? ACType::k16 : ACType::k32;
+  }
+  ACPtr PlaneRow(size_t c, size_t y, size_t xbase) override {
+    return ACPtr(img_.PlaneRow(c, y) + xbase);
+  }
+  ConstACPtr PlaneRow(size_t c, size_t y, size_t xbase) const override {
+    return ConstACPtr(img_.PlaneRow(c, y) + xbase);
+  }
+
+  size_t PixelsPerRow() const override { return img_.PixelsPerRow(); }
+
+  void ZeroFill() override { ZeroFillImage(&img_); }
+
+  void ZeroFillPlane(size_t c) override { ZeroFillImage(&img_.Plane(c)); }
+
+  bool IsEmpty() const override {
+    return img_.xsize() == 0 || img_.ysize() == 0;
+  }
+
+ private:
+  Image3<T> img_;
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_DCT_UTIL_H_
diff --git a/third_party/jpeg-xl/lib/jxl/dec_ans.cc b/third_party/jpeg-xl/lib/jxl/dec_ans.cc
new file mode 100644
index 0000000000..c9145472e0
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/dec_ans.cc
@@ -0,0 +1,374 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/dec_ans.h"
+
+#include <stdint.h>
+
+#include <vector>
+
+#include "lib/jxl/ans_common.h"
+#include "lib/jxl/ans_params.h"
+#include "lib/jxl/base/bits.h"
+#include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/base/profiler.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/dec_context_map.h"
+#include "lib/jxl/fields.h"
+
+namespace jxl {
+namespace {
+
+// Decodes a number in the range [0..255], by reading 1 - 11 bits.
+inline int DecodeVarLenUint8(BitReader* input) {
+  if (input->ReadFixedBits<1>()) {
+    int nbits = static_cast<int>(input->ReadFixedBits<3>());
+    if (nbits == 0) {
+      return 1;
+    } else {
+      return static_cast<int>(input->ReadBits(nbits)) + (1 << nbits);
+    }
+  }
+  return 0;
+}
+
+// Decodes a number in the range [0..65535], by reading 1 - 21 bits.
+inline int DecodeVarLenUint16(BitReader* input) {
+  if (input->ReadFixedBits<1>()) {
+    int nbits = static_cast<int>(input->ReadFixedBits<4>());
+    if (nbits == 0) {
+      return 1;
+    } else {
+      return static_cast<int>(input->ReadBits(nbits)) + (1 << nbits);
+    }
+  }
+  return 0;
+}
+
+Status ReadHistogram(int precision_bits, std::vector<int32_t>* counts,
+                     BitReader* input) {
+  int simple_code = input->ReadBits(1);
+  if (simple_code == 1) {
+    int i;
+    int symbols[2] = {0};
+    int max_symbol = 0;
+    const int num_symbols = input->ReadBits(1) + 1;
+    for (i = 0; i < num_symbols; ++i) {
+      symbols[i] = DecodeVarLenUint8(input);
+      if (symbols[i] > max_symbol) max_symbol = symbols[i];
+    }
+    counts->resize(max_symbol + 1);
+    if (num_symbols == 1) {
+      (*counts)[symbols[0]] = 1 << precision_bits;
+    } else {
+      if (symbols[0] == symbols[1]) {  // corrupt data
+        return false;
+      }
+      (*counts)[symbols[0]] = input->ReadBits(precision_bits);
+      (*counts)[symbols[1]] = (1 << precision_bits) - (*counts)[symbols[0]];
+    }
+  } else {
+    int is_flat = input->ReadBits(1);
+    if (is_flat == 1) {
+      int alphabet_size = DecodeVarLenUint8(input) + 1;
+      *counts = CreateFlatHistogram(alphabet_size, 1 << precision_bits);
+      return true;
+    }
+
+    uint32_t shift;
+    {
+      // TODO(veluca): speed up reading with table lookups.
+      int upper_bound_log = FloorLog2Nonzero(ANS_LOG_TAB_SIZE + 1);
+      int log = 0;
+      for (; log < upper_bound_log; log++) {
+        if (input->ReadFixedBits<1>() == 0) break;
+      }
+      shift = (input->ReadBits(log) | (1 << log)) - 1;
+      if (shift > ANS_LOG_TAB_SIZE + 1) {
+        return JXL_FAILURE("Invalid shift value");
+      }
+    }
+
+    int length = DecodeVarLenUint8(input) + 3;
+    counts->resize(length);
+    int total_count = 0;
+
+    static const uint8_t huff[128][2] = {
+        {3, 10}, {7, 12}, {3, 7}, {4, 3}, {3, 6}, {3, 8}, {3, 9}, {4, 5},
+        {3, 10}, {4, 4},  {3, 7}, {4, 1}, {3, 6}, {3, 8}, {3, 9}, {4, 2},
+        {3, 10}, {5, 0},  {3, 7}, {4, 3}, {3, 6}, {3, 8}, {3, 9}, {4, 5},
+        {3, 10}, {4, 4},  {3, 7}, {4, 1}, {3, 6}, {3, 8}, {3, 9}, {4, 2},
+        {3, 10}, {6, 11}, {3, 7}, {4, 3}, {3, 6}, {3, 8}, {3, 9}, {4, 5},
+        {3, 10}, {4, 4},  {3, 7}, {4, 1}, {3, 6}, {3, 8}, {3, 9}, {4, 2},
+        {3, 10}, {5, 0},  {3, 7}, {4, 3}, {3, 6}, {3, 8}, {3, 9}, {4, 5},
+        {3, 10}, {4, 4},  {3, 7}, {4, 1}, {3, 6}, {3, 8}, {3, 9}, {4, 2},
+        {3, 10}, {7, 13}, {3, 7}, {4, 3}, {3, 6}, {3, 8}, {3, 9}, {4, 5},
+        {3, 10}, {4, 4},  {3, 7}, {4, 1}, {3, 6}, {3, 8}, {3, 9}, {4, 2},
+        {3, 10}, {5, 0},  {3, 7}, {4, 3}, {3, 6}, {3, 8}, {3, 9}, {4, 5},
+        {3, 10}, {4, 4},  {3, 7}, {4, 1}, {3, 6}, {3, 8}, {3, 9}, {4, 2},
+        {3, 10}, {6, 11}, {3, 7}, {4, 3}, {3, 6}, {3, 8}, {3, 9}, {4, 5},
+        {3, 10}, {4, 4},  {3, 7}, {4, 1}, {3, 6}, {3, 8}, {3, 9}, {4, 2},
+        {3, 10}, {5, 0},  {3, 7}, {4, 3}, {3, 6}, {3, 8}, {3, 9}, {4, 5},
+        {3, 10}, {4, 4},  {3, 7}, {4, 1}, {3, 6}, {3, 8}, {3, 9}, {4, 2},
+    };
+
+    std::vector<int> logcounts(counts->size());
+    int omit_log = -1;
+    int omit_pos = -1;
+    // This array remembers which symbols have an RLE length.
+    std::vector<int> same(counts->size(), 0);
+    for (size_t i = 0; i < logcounts.size(); ++i) {
+      input->Refill();  // for PeekFixedBits + Advance
+      int idx = input->PeekFixedBits<7>();
+      input->Consume(huff[idx][0]);
+      logcounts[i] = huff[idx][1];
+      // The RLE symbol.
+      if (logcounts[i] == ANS_LOG_TAB_SIZE + 1) {
+        int rle_length = DecodeVarLenUint8(input);
+        same[i] = rle_length + 5;
+        i += rle_length + 3;
+        continue;
+      }
+      if (logcounts[i] > omit_log) {
+        omit_log = logcounts[i];
+        omit_pos = i;
+      }
+    }
+    // Invalid input, e.g. due to invalid usage of RLE.
+    if (omit_pos < 0) return JXL_FAILURE("Invalid histogram.");
+    if (static_cast<size_t>(omit_pos) + 1 < logcounts.size() &&
+        logcounts[omit_pos + 1] == ANS_TAB_SIZE + 1) {
+      return JXL_FAILURE("Invalid histogram.");
+    }
+    int prev = 0;
+    int numsame = 0;
+    for (size_t i = 0; i < logcounts.size(); ++i) {
+      if (same[i]) {
+        // RLE sequence, let this loop output the same count for the next
+        // iterations.
+        numsame = same[i] - 1;
+        prev = i > 0 ? (*counts)[i - 1] : 0;
+      }
+      if (numsame > 0) {
+        (*counts)[i] = prev;
+        numsame--;
+      } else {
+        int code = logcounts[i];
+        // omit_pos may not be negative at this point (checked before).
+        if (i == static_cast<size_t>(omit_pos)) {
+          continue;
+        } else if (code == 0) {
+          continue;
+        } else if (code == 1) {
+          (*counts)[i] = 1;
+        } else {
+          int bitcount = GetPopulationCountPrecision(code - 1, shift);
+          (*counts)[i] = (1 << (code - 1)) +
+                         (input->ReadBits(bitcount) << (code - 1 - bitcount));
+        }
+      }
+      total_count += (*counts)[i];
+    }
+    (*counts)[omit_pos] = (1 << precision_bits) - total_count;
+    if ((*counts)[omit_pos] <= 0) {
+      // The histogram we've read sums to more than total_count (including at
+      // least 1 for the omitted value).
+      return JXL_FAILURE("Invalid histogram count.");
+    }
+  }
+  return true;
+}
+
+}  // namespace
+
+Status DecodeANSCodes(const size_t num_histograms,
+                      const size_t max_alphabet_size, BitReader* in,
+                      ANSCode* result) {
+  result->degenerate_symbols.resize(num_histograms, -1);
+  if (result->use_prefix_code) {
+    JXL_ASSERT(max_alphabet_size <= 1 << PREFIX_MAX_BITS);
+    result->huffman_data.resize(num_histograms);
+    std::vector<uint16_t> alphabet_sizes(num_histograms);
+    for (size_t c = 0; c < num_histograms; c++) {
+      alphabet_sizes[c] = DecodeVarLenUint16(in) + 1;
+      if (alphabet_sizes[c] > max_alphabet_size) {
+        return JXL_FAILURE("Alphabet size is too long: %u", alphabet_sizes[c]);
+      }
+    }
+    for (size_t c = 0; c < num_histograms; c++) {
+      if (alphabet_sizes[c] > 1) {
+        if (!result->huffman_data[c].ReadFromBitStream(alphabet_sizes[c], in)) {
+          if (!in->AllReadsWithinBounds()) {
+            return JXL_STATUS(StatusCode::kNotEnoughBytes,
+                              "Not enough bytes for huffman code");
+          }
+          return JXL_FAILURE("Invalid huffman tree number %" PRIuS
+                             ", alphabet size %u",
+                             c, alphabet_sizes[c]);
+        }
+      } else {
+        // 0-bit codes does not require extension tables.
+        result->huffman_data[c].table_.clear();
+        result->huffman_data[c].table_.resize(1u << kHuffmanTableBits);
+      }
+      for (const auto& h : result->huffman_data[c].table_) {
+        if (h.bits <= kHuffmanTableBits) {
+          result->UpdateMaxNumBits(c, h.value);
+        }
+      }
+    }
+  } else {
+    JXL_ASSERT(max_alphabet_size <= ANS_MAX_ALPHABET_SIZE);
+    result->alias_tables =
+        AllocateArray(num_histograms * (1 << result->log_alpha_size) *
+                      sizeof(AliasTable::Entry));
+    AliasTable::Entry* alias_tables =
+        reinterpret_cast<AliasTable::Entry*>(result->alias_tables.get());
+    for (size_t c = 0; c < num_histograms; ++c) {
+      std::vector<int32_t> counts;
+      if (!ReadHistogram(ANS_LOG_TAB_SIZE, &counts, in)) {
+        return JXL_FAILURE("Invalid histogram bitstream.");
+      }
+      if (counts.size() > max_alphabet_size) {
+        return JXL_FAILURE("Alphabet size is too long: %" PRIuS, counts.size());
+      }
+      while (!counts.empty() && counts.back() == 0) {
+        counts.pop_back();
+      }
+      for (size_t s = 0; s < counts.size(); s++) {
+        if (counts[s] != 0) {
+          result->UpdateMaxNumBits(c, s);
+        }
+      }
+      // InitAliasTable "fixes" empty counts to contain degenerate "0" symbol.
+      int degenerate_symbol = counts.empty() ? 0 : (counts.size() - 1);
+      for (int s = 0; s < degenerate_symbol; ++s) {
+        if (counts[s] != 0) {
+          degenerate_symbol = -1;
+          break;
+        }
+      }
+      result->degenerate_symbols[c] = degenerate_symbol;
+      InitAliasTable(counts, ANS_TAB_SIZE, result->log_alpha_size,
+                     alias_tables + c * (1 << result->log_alpha_size));
+    }
+  }
+  return true;
+}
+Status DecodeUintConfig(size_t log_alpha_size, HybridUintConfig* uint_config,
+                        BitReader* br) {
+  br->Refill();
+  size_t split_exponent = br->ReadBits(CeilLog2Nonzero(log_alpha_size + 1));
+  size_t msb_in_token = 0, lsb_in_token = 0;
+  if (split_exponent != log_alpha_size) {
+    // otherwise, msb/lsb don't matter.
+    size_t nbits = CeilLog2Nonzero(split_exponent + 1);
+    msb_in_token = br->ReadBits(nbits);
+    if (msb_in_token > split_exponent) {
+      // This could be invalid here already and we need to check this before
+      // we use its value to read more bits.
+      return JXL_FAILURE("Invalid HybridUintConfig");
+    }
+    nbits = CeilLog2Nonzero(split_exponent - msb_in_token + 1);
+    lsb_in_token = br->ReadBits(nbits);
+  }
+  if (lsb_in_token + msb_in_token > split_exponent) {
+    return JXL_FAILURE("Invalid HybridUintConfig");
+  }
+  *uint_config = HybridUintConfig(split_exponent, msb_in_token, lsb_in_token);
+  return true;
+}
+
+Status DecodeUintConfigs(size_t log_alpha_size,
+                         std::vector<HybridUintConfig>* uint_config,
+                         BitReader* br) {
+  // TODO(veluca): RLE?
+  for (size_t i = 0; i < uint_config->size(); i++) {
+    JXL_RETURN_IF_ERROR(
+        DecodeUintConfig(log_alpha_size, &(*uint_config)[i], br));
+  }
+  return true;
+}
+
+LZ77Params::LZ77Params() { Bundle::Init(this); }
+Status LZ77Params::VisitFields(Visitor* JXL_RESTRICT visitor) {
+  JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &enabled));
+  if (!visitor->Conditional(enabled)) return true;
+  JXL_QUIET_RETURN_IF_ERROR(visitor->U32(Val(224), Val(512), Val(4096),
+                                         BitsOffset(15, 8), 224, &min_symbol));
+  JXL_QUIET_RETURN_IF_ERROR(visitor->U32(Val(3), Val(4), BitsOffset(2, 5),
+                                         BitsOffset(8, 9), 3, &min_length));
+  return true;
+}
+
+void ANSCode::UpdateMaxNumBits(size_t ctx, size_t symbol) {
+  HybridUintConfig* cfg = &uint_config[ctx];
+  // LZ77 symbols use a different uint config.
+  if (lz77.enabled && lz77.nonserialized_distance_context != ctx &&
+      symbol >= lz77.min_symbol) {
+    symbol -= lz77.min_symbol;
+    cfg = &lz77.length_uint_config;
+  }
+  size_t split_token = cfg->split_token;
+  size_t msb_in_token = cfg->msb_in_token;
+  size_t lsb_in_token = cfg->lsb_in_token;
+  size_t split_exponent = cfg->split_exponent;
+  if (symbol < split_token) {
+    max_num_bits = std::max(max_num_bits, split_exponent);
+    return;
+  }
+  uint32_t n_extra_bits =
+      split_exponent - (msb_in_token + lsb_in_token) +
+      ((symbol - split_token) >> (msb_in_token + lsb_in_token));
+  size_t total_bits = msb_in_token + lsb_in_token + n_extra_bits + 1;
+  max_num_bits = std::max(max_num_bits, total_bits);
+}
+
+Status DecodeHistograms(BitReader* br, size_t num_contexts, ANSCode* code,
+                        std::vector<uint8_t>* context_map, bool disallow_lz77) {
+  PROFILER_FUNC;
+  JXL_RETURN_IF_ERROR(Bundle::Read(br, &code->lz77));
+  if (code->lz77.enabled) {
+    num_contexts++;
+    JXL_RETURN_IF_ERROR(DecodeUintConfig(/*log_alpha_size=*/8,
+                                         &code->lz77.length_uint_config, br));
+  }
+  if (code->lz77.enabled && disallow_lz77) {
+    return JXL_FAILURE("Using LZ77 when explicitly disallowed");
+  }
+  size_t num_histograms = 1;
+  context_map->resize(num_contexts);
+  if (num_contexts > 1) {
+    JXL_RETURN_IF_ERROR(DecodeContextMap(context_map, &num_histograms, br));
+  }
+  code->lz77.nonserialized_distance_context = context_map->back();
+  code->use_prefix_code = br->ReadFixedBits<1>();
+  if (code->use_prefix_code) {
+    code->log_alpha_size = PREFIX_MAX_BITS;
+  } else {
+    code->log_alpha_size = br->ReadFixedBits<2>() + 5;
+  }
+  code->uint_config.resize(num_histograms);
+  JXL_RETURN_IF_ERROR(
+      DecodeUintConfigs(code->log_alpha_size, &code->uint_config, br));
+  const size_t max_alphabet_size = 1 << code->log_alpha_size;
+  JXL_RETURN_IF_ERROR(
+      DecodeANSCodes(num_histograms, max_alphabet_size, br, code));
+  // When using LZ77, flat codes might result in valid codestreams with
+  // histograms that potentially allow very large bit counts.
+  // TODO(veluca): in principle, a valid codestream might contain a histogram
+  // that could allow very large numbers of bits that is never used during ANS
+  // decoding. There's no benefit to doing that, though.
+  if (!code->lz77.enabled && code->max_num_bits > 32) {
+    // Just emit a warning as there are many opportunities for false positives.
+    JXL_WARNING("Histogram can represent numbers that are too large: %" PRIuS
+                "\n",
+                code->max_num_bits);
+  }
+  return true;
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/dec_ans.h b/third_party/jpeg-xl/lib/jxl/dec_ans.h
new file mode 100644
index 0000000000..0f4406745a
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/dec_ans.h
@@ -0,0 +1,462 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_DEC_ANS_H_
+#define LIB_JXL_DEC_ANS_H_
+
+// Library to decode the ANS population counts from the bit-stream and build a
+// decoding table from them.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <cstring>
+#include <vector>
+
+#include "lib/jxl/ans_common.h"
+#include "lib/jxl/ans_params.h"
+#include "lib/jxl/base/bits.h"
+#include "lib/jxl/base/byte_order.h"
+#include "lib/jxl/base/cache_aligned.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/dec_huffman.h"
+#include "lib/jxl/field_encodings.h"
+
+namespace jxl {
+
+class ANSSymbolReader;
+
+// Experiments show that best performance is typically achieved for a
+// split-exponent of 3 or 4. Trend seems to be that '4' is better
+// for large-ish pictures, and '3' better for rather small-ish pictures.
+// This is plausible - the more special symbols we have, the better
+// statistics we need to get a benefit out of them.
+
+// Our hybrid-encoding scheme has dedicated tokens for the smallest
+// (1 << split_exponents) numbers, and for the rest
+// encodes (number of bits) + (msb_in_token sub-leading binary digits) +
+// (lsb_in_token lowest binary digits) in the token, with the remaining bits
+// then being encoded as data.
+//
+// Example with split_exponent = 4, msb_in_token = 2, lsb_in_token = 0.
+//
+// Numbers N in [0 .. 15]:
+//   These get represented as (token=N, bits='').
+// Numbers N >= 16:
+//   If n is such that 2**n <= N < 2**(n+1),
+//   and m = N - 2**n is the 'mantissa',
+//   these get represented as:
+// (token=split_token +
+//        ((n - split_exponent) * 4) +
+//        (m >> (n - msb_in_token)),
+//  bits=m & (1 << (n - msb_in_token)) - 1)
+// Specifically, we would get:
+// N = 0 - 15:          (token=N, nbits=0, bits='')
+// N = 16 (10000):      (token=16, nbits=2, bits='00')
+// N = 17 (10001):      (token=16, nbits=2, bits='01')
+// N = 20 (10100):      (token=17, nbits=2, bits='00')
+// N = 24 (11000):      (token=18, nbits=2, bits='00')
+// N = 28 (11100):      (token=19, nbits=2, bits='00')
+// N = 32 (100000):     (token=20, nbits=3, bits='000')
+// N = 65535:           (token=63, nbits=13, bits='1111111111111')
+struct HybridUintConfig {
+  uint32_t split_exponent;
+  uint32_t split_token;
+  uint32_t msb_in_token;
+  uint32_t lsb_in_token;
+  JXL_INLINE void Encode(uint32_t value, uint32_t* JXL_RESTRICT token,
+                         uint32_t* JXL_RESTRICT nbits,
+                         uint32_t* JXL_RESTRICT bits) const {
+    if (value < split_token) {
+      *token = value;
+      *nbits = 0;
+      *bits = 0;
+    } else {
+      uint32_t n = FloorLog2Nonzero(value);
+      uint32_t m = value - (1 << n);
+      *token = split_token +
+               ((n - split_exponent) << (msb_in_token + lsb_in_token)) +
+               ((m >> (n - msb_in_token)) << lsb_in_token) +
+               (m & ((1 << lsb_in_token) - 1));
+      *nbits = n - msb_in_token - lsb_in_token;
+      *bits = (value >> lsb_in_token) & ((1UL << *nbits) - 1);
+    }
+  }
+
+  explicit HybridUintConfig(uint32_t split_exponent = 4,
+                            uint32_t msb_in_token = 2,
+                            uint32_t lsb_in_token = 0)
+      : split_exponent(split_exponent),
+        split_token(1 << split_exponent),
+        msb_in_token(msb_in_token),
+        lsb_in_token(lsb_in_token) {
+    JXL_DASSERT(split_exponent >= msb_in_token + lsb_in_token);
+  }
+};
+
+struct LZ77Params : public Fields {
+  LZ77Params();
+  JXL_FIELDS_NAME(LZ77Params)
+  Status VisitFields(Visitor* JXL_RESTRICT visitor) override;
+  bool enabled;
+
+  // Symbols above min_symbol use a special hybrid uint encoding and
+  // represent a length, to be added to min_length.
+  uint32_t min_symbol;
+  uint32_t min_length;
+
+  // Not serialized by VisitFields.
+  HybridUintConfig length_uint_config{0, 0, 0};
+
+  size_t nonserialized_distance_context;
+};
+
+static constexpr size_t kWindowSize = 1 << 20;
+static constexpr size_t kNumSpecialDistances = 120;
+// Table of special distance codes from WebP lossless.
+static constexpr int8_t kSpecialDistances[kNumSpecialDistances][2] = {
+    {0, 1},  {1, 0},  {1, 1},  {-1, 1}, {0, 2},  {2, 0},  {1, 2},  {-1, 2},
+    {2, 1},  {-2, 1}, {2, 2},  {-2, 2}, {0, 3},  {3, 0},  {1, 3},  {-1, 3},
+    {3, 1},  {-3, 1}, {2, 3},  {-2, 3}, {3, 2},  {-3, 2}, {0, 4},  {4, 0},
+    {1, 4},  {-1, 4}, {4, 1},  {-4, 1}, {3, 3},  {-3, 3}, {2, 4},  {-2, 4},
+    {4, 2},  {-4, 2}, {0, 5},  {3, 4},  {-3, 4}, {4, 3},  {-4, 3}, {5, 0},
+    {1, 5},  {-1, 5}, {5, 1},  {-5, 1}, {2, 5},  {-2, 5}, {5, 2},  {-5, 2},
+    {4, 4},  {-4, 4}, {3, 5},  {-3, 5}, {5, 3},  {-5, 3}, {0, 6},  {6, 0},
+    {1, 6},  {-1, 6}, {6, 1},  {-6, 1}, {2, 6},  {-2, 6}, {6, 2},  {-6, 2},
+    {4, 5},  {-4, 5}, {5, 4},  {-5, 4}, {3, 6},  {-3, 6}, {6, 3},  {-6, 3},
+    {0, 7},  {7, 0},  {1, 7},  {-1, 7}, {5, 5},  {-5, 5}, {7, 1},  {-7, 1},
+    {4, 6},  {-4, 6}, {6, 4},  {-6, 4}, {2, 7},  {-2, 7}, {7, 2},  {-7, 2},
+    {3, 7},  {-3, 7}, {7, 3},  {-7, 3}, {5, 6},  {-5, 6}, {6, 5},  {-6, 5},
+    {8, 0},  {4, 7},  {-4, 7}, {7, 4},  {-7, 4}, {8, 1},  {8, 2},  {6, 6},
+    {-6, 6}, {8, 3},  {5, 7},  {-5, 7}, {7, 5},  {-7, 5}, {8, 4},  {6, 7},
+    {-6, 7}, {7, 6},  {-7, 6}, {8, 5},  {7, 7},  {-7, 7}, {8, 6},  {8, 7}};
+
+struct ANSCode {
+  CacheAlignedUniquePtr alias_tables;
+  std::vector<HuffmanDecodingData> huffman_data;
+  std::vector<HybridUintConfig> uint_config;
+  std::vector<int> degenerate_symbols;
+  bool use_prefix_code;
+  uint8_t log_alpha_size;  // for ANS.
+  LZ77Params lz77;
+  // Maximum number of bits necessary to represent the result of a
+  // ReadHybridUint call done with this ANSCode.
+  size_t max_num_bits = 0;
+  void UpdateMaxNumBits(size_t ctx, size_t symbol);
+};
+
+class ANSSymbolReader {
+ public:
+  // Invalid symbol reader, to be overwritten.
+  ANSSymbolReader() = default;
+  ANSSymbolReader(const ANSCode* code, BitReader* JXL_RESTRICT br,
+                  size_t distance_multiplier = 0)
+      : alias_tables_(
+            reinterpret_cast<AliasTable::Entry*>(code->alias_tables.get())),
+        huffman_data_(code->huffman_data.data()),
+        use_prefix_code_(code->use_prefix_code),
+        configs(code->uint_config.data()) {
+    if (!use_prefix_code_) {
+      state_ = static_cast<uint32_t>(br->ReadFixedBits<32>());
+      log_alpha_size_ = code->log_alpha_size;
+      log_entry_size_ = ANS_LOG_TAB_SIZE - code->log_alpha_size;
+      entry_size_minus_1_ = (1 << log_entry_size_) - 1;
+    } else {
+      state_ = (ANS_SIGNATURE << 16u);
+    }
+    if (!code->lz77.enabled) return;
+    // a std::vector incurs unacceptable decoding speed loss because of
+    // initialization.
+    lz77_window_storage_ = AllocateArray(kWindowSize * sizeof(uint32_t));
+    lz77_window_ = reinterpret_cast<uint32_t*>(lz77_window_storage_.get());
+    lz77_ctx_ = code->lz77.nonserialized_distance_context;
+    lz77_length_uint_ = code->lz77.length_uint_config;
+    lz77_threshold_ = code->lz77.min_symbol;
+    lz77_min_length_ = code->lz77.min_length;
+    num_special_distances_ =
+        distance_multiplier == 0 ? 0 : kNumSpecialDistances;
+    for (size_t i = 0; i < num_special_distances_; i++) {
+      int dist = kSpecialDistances[i][0];
+      dist += static_cast<int>(distance_multiplier) * kSpecialDistances[i][1];
+      if (dist < 1) dist = 1;
+      special_distances_[i] = dist;
+    }
+  }
+
+  JXL_INLINE size_t ReadSymbolANSWithoutRefill(const size_t histo_idx,
+                                               BitReader* JXL_RESTRICT br) {
+    const uint32_t res = state_ & (ANS_TAB_SIZE - 1u);
+
+    const AliasTable::Entry* table =
+        &alias_tables_[histo_idx << log_alpha_size_];
+    const AliasTable::Symbol symbol =
+        AliasTable::Lookup(table, res, log_entry_size_, entry_size_minus_1_);
+    state_ = symbol.freq * (state_ >> ANS_LOG_TAB_SIZE) + symbol.offset;
+
+#if 1
+    // Branchless version is about equally fast on SKX.
+    const uint32_t new_state =
+        (state_ << 16u) | static_cast<uint32_t>(br->PeekFixedBits<16>());
+    const bool normalize = state_ < (1u << 16u);
+    state_ = normalize ? new_state : state_;
+    br->Consume(normalize ? 16 : 0);
+#else
+    if (JXL_UNLIKELY(state_ < (1u << 16u))) {
+      state_ = (state_ << 16u) | br->PeekFixedBits<16>();
+      br->Consume(16);
+    }
+#endif
+    const uint32_t next_res = state_ & (ANS_TAB_SIZE - 1u);
+    AliasTable::Prefetch(table, next_res, log_entry_size_);
+
+    return symbol.value;
+  }
+
+  JXL_INLINE size_t ReadSymbolHuffWithoutRefill(const size_t histo_idx,
+                                                BitReader* JXL_RESTRICT br) {
+    return huffman_data_[histo_idx].ReadSymbol(br);
+  }
+
+  JXL_INLINE size_t ReadSymbolWithoutRefill(const size_t histo_idx,
+                                            BitReader* JXL_RESTRICT br) {
+    // TODO(veluca): hoist if in hotter loops.
+    if (JXL_UNLIKELY(use_prefix_code_)) {
+      return ReadSymbolHuffWithoutRefill(histo_idx, br);
+    }
+    return ReadSymbolANSWithoutRefill(histo_idx, br);
+  }
+
+  JXL_INLINE size_t ReadSymbol(const size_t histo_idx,
+                               BitReader* JXL_RESTRICT br) {
+    br->Refill();
+    return ReadSymbolWithoutRefill(histo_idx, br);
+  }
+
+#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+  bool CheckANSFinalState() const { return true; }
+#else
+  bool CheckANSFinalState() const { return state_ == (ANS_SIGNATURE << 16u); }
+#endif
+
+  template <typename BitReader>
+  static JXL_INLINE uint32_t ReadHybridUintConfig(
+      const HybridUintConfig& config, size_t token, BitReader* br) {
+    size_t split_token = config.split_token;
+    size_t msb_in_token = config.msb_in_token;
+    size_t lsb_in_token = config.lsb_in_token;
+    size_t split_exponent = config.split_exponent;
+    // Fast-track version of hybrid integer decoding.
+    if (token < split_token) return token;
+    uint32_t nbits = split_exponent - (msb_in_token + lsb_in_token) +
+                     ((token - split_token) >> (msb_in_token + lsb_in_token));
+    // Max amount of bits for ReadBits is 32 and max valid left shift is 29
+    // bits. However, for speed no error is propagated here, instead limit the
+    // nbits size. If nbits > 29, the code stream is invalid, but no error is
+    // returned.
+    // Note that in most cases we will emit an error if the histogram allows
+    // representing numbers that would cause invalid shifts, but we need to
+    // keep this check as when LZ77 is enabled it might make sense to have an
+    // histogram that could in principle cause invalid shifts.
+    nbits &= 31u;
+    uint32_t low = token & ((1 << lsb_in_token) - 1);
+    token >>= lsb_in_token;
+    const size_t bits = br->PeekBits(nbits);
+    br->Consume(nbits);
+    size_t ret = (((((1 << msb_in_token) | (token & ((1 << msb_in_token) - 1)))
+                    << nbits) |
+                   bits)
+                  << lsb_in_token) |
+                 low;
+    // TODO(eustas): mark BitReader as unhealthy if nbits > 29 or ret does not
+    //               fit uint32_t
+    return static_cast<uint32_t>(ret);
+  }
+
+  // Takes a *clustered* idx. Can only use if HuffRleOnly() is true.
+  void ReadHybridUintClusteredHuffRleOnly(size_t ctx,
+                                          BitReader* JXL_RESTRICT br,
+                                          uint32_t* value, uint32_t* run) {
+    JXL_DASSERT(HuffRleOnly());
+    br->Refill();  // covers ReadSymbolWithoutRefill + PeekBits
+    size_t token = ReadSymbolHuffWithoutRefill(ctx, br);
+    if (JXL_UNLIKELY(token >= lz77_threshold_)) {
+      *run =
+          ReadHybridUintConfig(lz77_length_uint_, token - lz77_threshold_, br) +
+          lz77_min_length_ - 1;
+      return;
+    }
+    *value = ReadHybridUintConfig(configs[ctx], token, br);
+  }
+  bool HuffRleOnly() {
+    if (lz77_window_ == nullptr) return false;
+    if (!use_prefix_code_) return false;
+    for (size_t i = 0; i < kHuffmanTableBits; i++) {
+      if (huffman_data_[lz77_ctx_].table_[i].bits) return false;
+      if (huffman_data_[lz77_ctx_].table_[i].value != 1) return false;
+    }
+    if (configs[lz77_ctx_].split_token > 1) return false;
+    return true;
+  }
+
+  // Takes a *clustered* idx.
+  size_t ReadHybridUintClustered(size_t ctx, BitReader* JXL_RESTRICT br) {
+    if (JXL_UNLIKELY(num_to_copy_ > 0)) {
+      size_t ret = lz77_window_[(copy_pos_++) & kWindowMask];
+      num_to_copy_--;
+      lz77_window_[(num_decoded_++) & kWindowMask] = ret;
+      return ret;
+    }
+    br->Refill();  // covers ReadSymbolWithoutRefill + PeekBits
+    size_t token = ReadSymbolWithoutRefill(ctx, br);
+    if (JXL_UNLIKELY(token >= lz77_threshold_)) {
+      num_to_copy_ =
+          ReadHybridUintConfig(lz77_length_uint_, token - lz77_threshold_, br) +
+          lz77_min_length_;
+      br->Refill();  // covers ReadSymbolWithoutRefill + PeekBits
+      // Distance code.
+      size_t token = ReadSymbolWithoutRefill(lz77_ctx_, br);
+      size_t distance = ReadHybridUintConfig(configs[lz77_ctx_], token, br);
+      if (JXL_LIKELY(distance < num_special_distances_)) {
+        distance = special_distances_[distance];
+      } else {
+        distance = distance + 1 - num_special_distances_;
+      }
+      if (JXL_UNLIKELY(distance > num_decoded_)) {
+        distance = num_decoded_;
+      }
+      if (JXL_UNLIKELY(distance > kWindowSize)) {
+        distance = kWindowSize;
+      }
+      copy_pos_ = num_decoded_ - distance;
+      if (JXL_UNLIKELY(distance == 0)) {
+        JXL_DASSERT(lz77_window_ != nullptr);
+        // distance 0 -> num_decoded_ == copy_pos_ == 0
+        size_t to_fill = std::min<size_t>(num_to_copy_, kWindowSize);
+        memset(lz77_window_, 0, to_fill * sizeof(lz77_window_[0]));
+      }
+      // TODO(eustas): overflow; mark BitReader as unhealthy
+      if (num_to_copy_ < lz77_min_length_) return 0;
+      return ReadHybridUintClustered(ctx, br);  // will trigger a copy.
+    }
+    size_t ret = ReadHybridUintConfig(configs[ctx], token, br);
+    if (lz77_window_) lz77_window_[(num_decoded_++) & kWindowMask] = ret;
+    return ret;
+  }
+
+  JXL_INLINE size_t ReadHybridUint(size_t ctx, BitReader* JXL_RESTRICT br,
+                                   const std::vector<uint8_t>& context_map) {
+    return ReadHybridUintClustered(context_map[ctx], br);
+  }
+
+  // ctx is a *clustered* context!
+  // This function will modify the ANS state as if `count` symbols have been
+  // decoded.
+  bool IsSingleValueAndAdvance(size_t ctx, uint32_t* value, size_t count) {
+    // TODO(veluca): No optimization for Huffman mode yet.
+    if (use_prefix_code_) return false;
+    // TODO(eustas): propagate "degenerate_symbol" to simplify this method.
+    const uint32_t res = state_ & (ANS_TAB_SIZE - 1u);
+    const AliasTable::Entry* table = &alias_tables_[ctx << log_alpha_size_];
+    AliasTable::Symbol symbol =
+        AliasTable::Lookup(table, res, log_entry_size_, entry_size_minus_1_);
+    if (symbol.freq != ANS_TAB_SIZE) return false;
+    if (configs[ctx].split_token <= symbol.value) return false;
+    if (symbol.value >= lz77_threshold_) return false;
+    *value = symbol.value;
+    if (lz77_window_) {
+      for (size_t i = 0; i < count; i++) {
+        lz77_window_[(num_decoded_++) & kWindowMask] = symbol.value;
+      }
+    }
+    return true;
+  }
+
+  static constexpr size_t kMaxCheckpointInterval = 512;
+  struct Checkpoint {
+    uint32_t state;
+    uint32_t num_to_copy;
+    uint32_t copy_pos;
+    uint32_t num_decoded;
+    uint32_t lz77_window[kMaxCheckpointInterval];
+  };
+  void Save(Checkpoint* checkpoint) {
+    checkpoint->state = state_;
+    checkpoint->num_decoded = num_decoded_;
+    checkpoint->num_to_copy = num_to_copy_;
+    checkpoint->copy_pos = copy_pos_;
+    if (lz77_window_) {
+      size_t win_start = num_decoded_ & kWindowMask;
+      size_t win_end = (num_decoded_ + kMaxCheckpointInterval) & kWindowMask;
+      if (win_end > win_start) {
+        memcpy(checkpoint->lz77_window, lz77_window_ + win_start,
+               (win_end - win_start) * sizeof(*lz77_window_));
+      } else {
+        memcpy(checkpoint->lz77_window, lz77_window_ + win_start,
+               (kWindowSize - win_start) * sizeof(*lz77_window_));
+        memcpy(checkpoint->lz77_window + (kWindowSize - win_start),
+               lz77_window_, win_end * sizeof(*lz77_window_));
+      }
+    }
+  }
+  void Restore(const Checkpoint& checkpoint) {
+    state_ = checkpoint.state;
+    JXL_DASSERT(num_decoded_ <=
+                checkpoint.num_decoded + kMaxCheckpointInterval);
+    num_decoded_ = checkpoint.num_decoded;
+    num_to_copy_ = checkpoint.num_to_copy;
+    copy_pos_ = checkpoint.copy_pos;
+    if (lz77_window_) {
+      size_t win_start = num_decoded_ & kWindowMask;
+      size_t win_end = (num_decoded_ + kMaxCheckpointInterval) & kWindowMask;
+      if (win_end > win_start) {
+        memcpy(lz77_window_ + win_start, checkpoint.lz77_window,
+               (win_end - win_start) * sizeof(*lz77_window_));
+      } else {
+        memcpy(lz77_window_ + win_start, checkpoint.lz77_window,
+               (kWindowSize - win_start) * sizeof(*lz77_window_));
+        memcpy(lz77_window_, checkpoint.lz77_window + (kWindowSize - win_start),
+               win_end * sizeof(*lz77_window_));
+      }
+    }
+  }
+
+ private:
+  const AliasTable::Entry* JXL_RESTRICT alias_tables_;  // not owned
+  const HuffmanDecodingData* huffman_data_;
+  bool use_prefix_code_;
+  uint32_t state_ = ANS_SIGNATURE << 16u;
+  const HybridUintConfig* JXL_RESTRICT configs;
+  uint32_t log_alpha_size_{};
+  uint32_t log_entry_size_{};
+  uint32_t entry_size_minus_1_{};
+
+  // LZ77 structures and constants.
+  static constexpr size_t kWindowMask = kWindowSize - 1;
+  CacheAlignedUniquePtr lz77_window_storage_;
+  uint32_t* lz77_window_ = nullptr;
+  uint32_t num_decoded_ = 0;
+  uint32_t num_to_copy_ = 0;
+  uint32_t copy_pos_ = 0;
+  uint32_t lz77_ctx_ = 0;
+  uint32_t lz77_min_length_ = 0;
+  uint32_t lz77_threshold_ = 1 << 20;  // bigger than any symbol.
+  HybridUintConfig lz77_length_uint_;
+  uint32_t special_distances_[kNumSpecialDistances]{};
+  uint32_t num_special_distances_{};
+};
+
+Status DecodeHistograms(BitReader* br, size_t num_contexts, ANSCode* code,
+                        std::vector<uint8_t>* context_map,
+                        bool disallow_lz77 = false);
+
+// Exposed for tests.
+Status DecodeUintConfigs(size_t log_alpha_size,
+                         std::vector<HybridUintConfig>* uint_config,
+                         BitReader* br);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_DEC_ANS_H_
diff --git a/third_party/jpeg-xl/lib/jxl/dec_bit_reader.h b/third_party/jpeg-xl/lib/jxl/dec_bit_reader.h
new file mode 100644
index 0000000000..df70284e3b
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/dec_bit_reader.h
@@ -0,0 +1,354 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_DEC_BIT_READER_H_
+#define LIB_JXL_DEC_BIT_READER_H_
+
+// Bounds-checked bit reader; 64-bit buffer with support for deferred refills
+// and switching to reading byte-aligned words.
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>  // memcpy
+
+#ifdef __BMI2__
+#include <immintrin.h>
+#endif
+
+#include "lib/jxl/base/byte_order.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/profiler.h"
+#include "lib/jxl/base/span.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/common.h"
+
+namespace jxl {
+
+// Reads bits previously written to memory by BitWriter. Uses unaligned 8-byte
+// little-endian loads.
+class BitReader {
+ public:
+  static constexpr size_t kMaxBitsPerCall = 56;
+
+  // Constructs an invalid BitReader, to be overwritten before usage.
+  BitReader()
+      : buf_(0),
+        bits_in_buf_(0),
+        next_byte_{nullptr},
+        end_minus_8_{nullptr},
+        first_byte_(nullptr) {}
+  BitReader(const BitReader&) = delete;
+
+  // bytes need not be aligned nor padded!
+  template <class ArrayLike>
+  explicit BitReader(const ArrayLike& bytes)
+      : buf_(0),
+        bits_in_buf_(0),
+        next_byte_(bytes.data()),
+        // Assumes first_byte_ >= 8.
+        end_minus_8_(bytes.data() - 8 + bytes.size()),
+        first_byte_(bytes.data()) {
+    Refill();
+  }
+  ~BitReader() {
+    // Close() must be called before destroying an initialized bit reader.
+    // Invalid bit readers will have a nullptr in first_byte_.
+    JXL_ASSERT(close_called_ || !first_byte_);
+  }
+
+  // Move operator needs to invalidate the other BitReader such that it is
+  // irrelevant if we call Close() on it or not.
+  BitReader& operator=(BitReader&& other) noexcept {
+    // Ensure the current instance was already closed, before we overwrite it
+    // with other.
+    JXL_ASSERT(close_called_ || !first_byte_);
+
+    JXL_DASSERT(!other.close_called_);
+    buf_ = other.buf_;
+    bits_in_buf_ = other.bits_in_buf_;
+    next_byte_ = other.next_byte_;
+    end_minus_8_ = other.end_minus_8_;
+    first_byte_ = other.first_byte_;
+    overread_bytes_ = other.overread_bytes_;
+    close_called_ = other.close_called_;
+
+    other.first_byte_ = nullptr;
+    other.next_byte_ = nullptr;
+    return *this;
+  }
+  BitReader& operator=(const BitReader& other) = delete;
+
+  // For time-critical reads, refills can be shared by multiple reads.
+  // Based on variant 4 (plus bounds-checking), see
+  // fgiesen.wordpress.com/2018/02/20/reading-bits-in-far-too-many-ways-part-2/
+  JXL_INLINE void Refill() {
+    if (JXL_UNLIKELY(next_byte_ > end_minus_8_)) {
+      BoundsCheckedRefill();
+    } else {
+      // It's safe to load 64 bits; insert valid (possibly nonzero) bits above
+      // bits_in_buf_. The shift requires bits_in_buf_ < 64.
+      buf_ |= LoadLE64(next_byte_) << bits_in_buf_;
+
+      // Advance by bytes fully absorbed into the buffer.
+      next_byte_ += (63 - bits_in_buf_) >> 3;
+
+      // We absorbed a multiple of 8 bits, so the lower 3 bits of bits_in_buf_
+      // must remain unchanged, otherwise the next refill's shifted bits will
+      // not align with buf_. Set the three upper bits so the result >= 56.
+      bits_in_buf_ |= 56;
+      JXL_DASSERT(56 <= bits_in_buf_ && bits_in_buf_ < 64);
+    }
+  }
+
+  // Returns the bits that would be returned by Read without calling Advance().
+  // It is legal to PEEK at more bits than present in the bitstream (required
+  // by Huffman), and those bits will be zero.
+  template <size_t N>
+  JXL_INLINE uint64_t PeekFixedBits() const {
+    static_assert(N <= kMaxBitsPerCall, "Reading too many bits in one call.");
+    JXL_DASSERT(!close_called_);
+    return buf_ & ((1ULL << N) - 1);
+  }
+
+  JXL_INLINE uint64_t PeekBits(size_t nbits) const {
+    JXL_DASSERT(nbits <= kMaxBitsPerCall);
+    JXL_DASSERT(!close_called_);
+
+    // Slightly faster but requires BMI2. It is infeasible to make the many
+    // callers reside between begin/end_target, especially because only the
+    // callers in dec_ans are time-critical. Therefore only enabled if the
+    // entire binary is compiled for (and thus requires) BMI2.
+#if defined(__BMI2__) && defined(__x86_64__)
+    return _bzhi_u64(buf_, nbits);
+#else
+    const uint64_t mask = (1ULL << nbits) - 1;
+    return buf_ & mask;
+#endif
+  }
+
+  // Removes bits from the buffer. Need not match the previous Peek size, but
+  // the buffer must contain at least num_bits (this prevents consuming more
+  // than the total number of bits).
+  JXL_INLINE void Consume(size_t num_bits) {
+    JXL_DASSERT(!close_called_);
+    JXL_DASSERT(bits_in_buf_ >= num_bits);
+#ifdef JXL_CRASH_ON_ERROR
+    // When JXL_CRASH_ON_ERROR is defined, it is a fatal error to read more bits
+    // than available in the stream. A non-zero overread_bytes_ implies that
+    // next_byte_ is already at the end of the stream, so we don't need to
+    // check that.
+    JXL_ASSERT(bits_in_buf_ >= num_bits + overread_bytes_ * kBitsPerByte);
+#endif
+    bits_in_buf_ -= num_bits;
+    buf_ >>= num_bits;
+  }
+
+  JXL_INLINE uint64_t ReadBits(size_t nbits) {
+    JXL_DASSERT(!close_called_);
+    Refill();
+    const uint64_t bits = PeekBits(nbits);
+    Consume(nbits);
+    return bits;
+  }
+
+  template <size_t N>
+  JXL_INLINE uint64_t ReadFixedBits() {
+    JXL_DASSERT(!close_called_);
+    Refill();
+    const uint64_t bits = PeekFixedBits<N>();
+    Consume(N);
+    return bits;
+  }
+
+  // Equivalent to calling ReadFixedBits(1) `skip` times, but much faster.
+  // `skip` is typically large.
+  void SkipBits(size_t skip) {
+    JXL_DASSERT(!close_called_);
+    // Buffer is large enough - don't zero buf_ below.
+    if (JXL_UNLIKELY(skip <= bits_in_buf_)) {
+      Consume(skip);
+      return;
+    }
+
+    // First deduct what we can satisfy from the buffer
+    skip -= bits_in_buf_;
+    bits_in_buf_ = 0;
+    // Not enough to call Advance - that may leave some bits in the buffer
+    // which were previously ABOVE bits_in_buf.
+    buf_ = 0;
+
+    // Skip whole bytes
+    const size_t whole_bytes = skip / kBitsPerByte;
+    skip %= kBitsPerByte;
+    if (JXL_UNLIKELY(whole_bytes >
+                     static_cast<size_t>(end_minus_8_ + 8 - next_byte_))) {
+      // This is already an overflow condition (skipping past the end of the bit
+      // stream). However if we increase next_byte_ too much we risk overflowing
+      // that value and potentially making it valid again (next_byte_ < end).
+      // This will set next_byte_ to the end of the stream and still consume
+      // some bits in overread_bytes_, however the TotalBitsConsumed() will be
+      // incorrect (still larger than the TotalBytes()).
+      next_byte_ = end_minus_8_ + 8;
+      skip += kBitsPerByte;
+    } else {
+      next_byte_ += whole_bytes;
+    }
+
+    Refill();
+    Consume(skip);
+  }
+
+  size_t TotalBitsConsumed() const {
+    const size_t bytes_read = static_cast<size_t>(next_byte_ - first_byte_);
+    return (bytes_read + overread_bytes_) * kBitsPerByte - bits_in_buf_;
+  }
+
+  Status JumpToByteBoundary() {
+    const size_t remainder = TotalBitsConsumed() % kBitsPerByte;
+    if (remainder == 0) return true;
+    if (JXL_UNLIKELY(ReadBits(kBitsPerByte - remainder) != 0)) {
+      return JXL_FAILURE("Non-zero padding bits");
+    }
+    return true;
+  }
+
+  // For interoperability with other bitreaders (for resuming at
+  // non-byte-aligned positions).
+  const uint8_t* FirstByte() const { return first_byte_; }
+  size_t TotalBytes() const {
+    return static_cast<size_t>(end_minus_8_ + 8 - first_byte_);
+  }
+
+  // Returns span of the remaining (unconsumed) bytes, e.g. for passing to
+  // external decoders such as Brotli.
+  Span<const uint8_t> GetSpan() const {
+    JXL_DASSERT(first_byte_ != nullptr);
+    JXL_ASSERT(TotalBitsConsumed() % kBitsPerByte == 0);
+    const size_t offset = TotalBitsConsumed() / kBitsPerByte;  // no remainder
+    JXL_ASSERT(offset <= TotalBytes());
+    return Span<const uint8_t>(first_byte_ + offset, TotalBytes() - offset);
+  }
+
+  // Returns whether all the bits read so far have been within the input bounds.
+  // When reading past the EOF, the Read*() and Consume() functions return zeros
+  // but flag a failure when calling Close() without checking this function.
+  Status AllReadsWithinBounds() {
+    // Mark up to which point the user checked the out of bounds condition. If
+    // the user handles the condition at higher level (e.g. fetch more bytes
+    // from network, return a custom JXL_FAILURE, ...), Close() should not
+    // output a debug error (which would break tests with JXL_CRASH_ON_ERROR
+    // even when legitimately handling the situation at higher level). This is
+    // used by Bundle::CanRead.
+    checked_out_of_bounds_bits_ = TotalBitsConsumed();
+    if (TotalBitsConsumed() > TotalBytes() * kBitsPerByte) {
+      return false;
+    }
+    return true;
+  }
+
+  // Close the bit reader and return whether all the previous reads were
+  // successful. Close must be called once.
+  Status Close() {
+    JXL_DASSERT(!close_called_);
+    close_called_ = true;
+    if (!first_byte_) return true;
+    if (TotalBitsConsumed() > checked_out_of_bounds_bits_ &&
+        TotalBitsConsumed() > TotalBytes() * kBitsPerByte) {
+      return JXL_FAILURE("Read more bits than available in the bit_reader");
+    }
+    return true;
+  }
+
+ private:
+  // Separate function avoids inlining this relatively cold code into callers.
+  JXL_NOINLINE void BoundsCheckedRefill() {
+    PROFILER_FUNC;
+    const uint8_t* end = end_minus_8_ + 8;
+
+    // Read whole bytes until we have [56, 64) bits (same as LoadLE64)
+    for (; bits_in_buf_ < 64 - kBitsPerByte; bits_in_buf_ += kBitsPerByte) {
+      if (next_byte_ >= end) break;
+      buf_ |= static_cast<uint64_t>(*next_byte_++) << bits_in_buf_;
+    }
+    JXL_DASSERT(bits_in_buf_ < 64);
+
+    // Add extra bytes as 0 at the end of the stream in the bit_buffer_. If
+    // these bits are read, Close() will return a failure.
+    size_t extra_bytes = (63 - bits_in_buf_) / kBitsPerByte;
+    overread_bytes_ += extra_bytes;
+    bits_in_buf_ += extra_bytes * kBitsPerByte;
+
+    JXL_DASSERT(bits_in_buf_ < 64);
+    JXL_DASSERT(bits_in_buf_ >= 56);
+  }
+
+  JXL_NOINLINE uint32_t BoundsCheckedReadByteAlignedWord() {
+    if (next_byte_ + 1 < end_minus_8_ + 8) {
+      uint32_t ret = LoadLE16(next_byte_);
+      next_byte_ += 2;
+      return ret;
+    }
+    overread_bytes_ += 2;
+    return 0;
+  }
+
+  uint64_t buf_;
+  size_t bits_in_buf_;  // [0, 64)
+  const uint8_t* JXL_RESTRICT next_byte_;
+  const uint8_t* end_minus_8_;  // for refill bounds check
+  const uint8_t* first_byte_;   // for GetSpan
+
+  // Number of bytes past the end that were loaded into the buf_. These bytes
+  // are not read from memory, but instead assumed 0. It is an error (likely due
+  // to an invalid stream) to Consume() more bits than specified in the range
+  // passed to the constructor.
+  uint64_t overread_bytes_{0};
+  bool close_called_{false};
+
+  uint64_t checked_out_of_bounds_bits_{0};
+};
+
+// Closes a BitReader when the BitReaderScopedCloser goes out of scope. When
+// closing the bit reader, if the status result was failure it sets this failure
+// to the passed variable pointer. Typical usage.
+//
+// Status ret = true;
+// {
+//   BitReader reader(...);
+//   BitReaderScopedCloser reader_closer(&reader, &ret);
+//
+//   // ... code that can return errors here ...
+// }
+// // ... more code that doesn't use the BitReader.
+// return ret;
+
+class BitReaderScopedCloser {
+ public:
+  BitReaderScopedCloser(BitReader* reader, Status* status)
+      : reader_(reader), status_(status) {
+    JXL_DASSERT(reader_ != nullptr);
+    JXL_DASSERT(status_ != nullptr);
+  }
+  ~BitReaderScopedCloser() {
+    if (reader_ != nullptr) {
+      Status close_ret = reader_->Close();
+      if (!close_ret) *status_ = close_ret;
+    }
+  }
+  void CloseAndSuppressError() {
+    JXL_ASSERT(reader_ != nullptr);
+    (void)reader_->Close();
+    reader_ = nullptr;
+  }
+  BitReaderScopedCloser(const BitReaderScopedCloser&) = delete;
+
+ private:
+  BitReader* reader_;
+  Status* status_;
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_DEC_BIT_READER_H_
diff --git a/third_party/jpeg-xl/lib/jxl/dec_cache.cc b/third_party/jpeg-xl/lib/jxl/dec_cache.cc
new file mode 100644
index 0000000000..4db6f1d9a5
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/dec_cache.cc
@@ -0,0 +1,229 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/dec_cache.h"
+
+#include "lib/jxl/blending.h"
+#include "lib/jxl/render_pipeline/stage_blending.h"
+#include "lib/jxl/render_pipeline/stage_chroma_upsampling.h"
+#include "lib/jxl/render_pipeline/stage_epf.h"
+#include "lib/jxl/render_pipeline/stage_from_linear.h"
+#include "lib/jxl/render_pipeline/stage_gaborish.h"
+#include "lib/jxl/render_pipeline/stage_noise.h"
+#include "lib/jxl/render_pipeline/stage_patches.h"
+#include "lib/jxl/render_pipeline/stage_splines.h"
+#include "lib/jxl/render_pipeline/stage_spot.h"
+#include "lib/jxl/render_pipeline/stage_to_linear.h"
+#include "lib/jxl/render_pipeline/stage_tone_mapping.h"
+#include "lib/jxl/render_pipeline/stage_upsampling.h"
+#include "lib/jxl/render_pipeline/stage_write.h"
+#include "lib/jxl/render_pipeline/stage_xyb.h"
+#include "lib/jxl/render_pipeline/stage_ycbcr.h"
+
+namespace jxl {
+
+Status PassesDecoderState::PreparePipeline(ImageBundle* decoded,
+                                           PipelineOptions options) {
+  const FrameHeader& frame_header = shared->frame_header;
+  size_t num_c = 3 + frame_header.nonserialized_metadata->m.num_extra_channels;
+  if ((frame_header.flags & FrameHeader::kNoise) != 0) {
+    num_c += 3;
+  }
+
+  if (frame_header.CanBeReferenced()) {
+    // Necessary so that SetInputSizes() can allocate output buffers as needed.
+    frame_storage_for_referencing = ImageBundle(decoded->metadata());
+  }
+
+  RenderPipeline::Builder builder(num_c);
+
+  if (options.use_slow_render_pipeline) {
+    builder.UseSimpleImplementation();
+  }
+
+  if (!frame_header.chroma_subsampling.Is444()) {
+    for (size_t c = 0; c < 3; c++) {
+      if (frame_header.chroma_subsampling.HShift(c) != 0) {
+        builder.AddStage(GetChromaUpsamplingStage(c, /*horizontal=*/true));
+      }
+      if (frame_header.chroma_subsampling.VShift(c) != 0) {
+        builder.AddStage(GetChromaUpsamplingStage(c, /*horizontal=*/false));
+      }
+    }
+  }
+
+  if (frame_header.loop_filter.gab) {
+    builder.AddStage(GetGaborishStage(frame_header.loop_filter));
+  }
+
+  {
+    const LoopFilter& lf = frame_header.loop_filter;
+    if (lf.epf_iters >= 3) {
+      builder.AddStage(GetEPFStage(lf, sigma, 0));
+    }
+    if (lf.epf_iters >= 1) {
+      builder.AddStage(GetEPFStage(lf, sigma, 1));
+    }
+    if (lf.epf_iters >= 2) {
+      builder.AddStage(GetEPFStage(lf, sigma, 2));
+    }
+  }
+
+  bool late_ec_upsample = frame_header.upsampling != 1;
+  for (auto ecups : frame_header.extra_channel_upsampling) {
+    if (ecups != frame_header.upsampling) {
+      // If patches are applied, either frame_header.upsampling == 1 or
+      // late_ec_upsample is true.
+      late_ec_upsample = false;
+    }
+  }
+
+  if (!late_ec_upsample) {
+    for (size_t ec = 0; ec < frame_header.extra_channel_upsampling.size();
+         ec++) {
+      if (frame_header.extra_channel_upsampling[ec] != 1) {
+        builder.AddStage(GetUpsamplingStage(
+            frame_header.nonserialized_metadata->transform_data, 3 + ec,
+            CeilLog2Nonzero(frame_header.extra_channel_upsampling[ec])));
+      }
+    }
+  }
+
+  if ((frame_header.flags & FrameHeader::kPatches) != 0) {
+    builder.AddStage(
+        GetPatchesStage(&shared->image_features.patches,
+                        3 + shared->metadata->m.num_extra_channels));
+  }
+  if ((frame_header.flags & FrameHeader::kSplines) != 0) {
+    builder.AddStage(GetSplineStage(&shared->image_features.splines));
+  }
+
+  if (frame_header.upsampling != 1) {
+    size_t nb_channels =
+        3 +
+        (late_ec_upsample ? frame_header.extra_channel_upsampling.size() : 0);
+    for (size_t c = 0; c < nb_channels; c++) {
+      builder.AddStage(GetUpsamplingStage(
+          frame_header.nonserialized_metadata->transform_data, c,
+          CeilLog2Nonzero(frame_header.upsampling)));
+    }
+  }
+
+  if ((frame_header.flags & FrameHeader::kNoise) != 0) {
+    builder.AddStage(GetConvolveNoiseStage(num_c - 3));
+    builder.AddStage(GetAddNoiseStage(shared->image_features.noise_params,
+                                      shared->cmap, num_c - 3));
+  }
+  if (frame_header.dc_level != 0) {
+    builder.AddStage(GetWriteToImage3FStage(
+        &shared_storage.dc_frames[frame_header.dc_level - 1]));
+  }
+
+  if (frame_header.CanBeReferenced() &&
+      frame_header.save_before_color_transform) {
+    builder.AddStage(GetWriteToImageBundleStage(
+        &frame_storage_for_referencing, output_encoding_info.color_encoding));
+  }
+
+  bool has_alpha = false;
+  size_t alpha_c = 0;
+  for (size_t i = 0; i < decoded->metadata()->extra_channel_info.size(); i++) {
+    if (decoded->metadata()->extra_channel_info[i].type ==
+        ExtraChannel::kAlpha) {
+      has_alpha = true;
+      alpha_c = 3 + i;
+      break;
+    }
+  }
+
+  if (fast_xyb_srgb8_conversion) {
+    JXL_ASSERT(!NeedsBlending(this));
+    JXL_ASSERT(!frame_header.CanBeReferenced() ||
+               frame_header.save_before_color_transform);
+    JXL_ASSERT(!options.render_spotcolors ||
+               !decoded->metadata()->Find(ExtraChannel::kSpotColor));
+    bool is_rgba = (main_output.format.num_channels == 4);
+    uint8_t* rgb_output = reinterpret_cast<uint8_t*>(main_output.buffer);
+    builder.AddStage(GetFastXYBTosRGB8Stage(rgb_output, main_output.stride,
+                                            width, height, is_rgba, has_alpha,
+                                            alpha_c));
+  } else {
+    bool linear = false;
+    if (frame_header.color_transform == ColorTransform::kYCbCr) {
+      builder.AddStage(GetYCbCrStage());
+    } else if (frame_header.color_transform == ColorTransform::kXYB) {
+      builder.AddStage(GetXYBStage(output_encoding_info));
+      if (output_encoding_info.color_encoding.GetColorSpace() !=
+          ColorSpace::kXYB) {
+        linear = true;
+      }
+    }  // Nothing to do for kNone.
+
+    if (options.coalescing && NeedsBlending(this)) {
+      if (linear) {
+        builder.AddStage(GetFromLinearStage(output_encoding_info));
+        linear = false;
+      }
+      builder.AddStage(
+          GetBlendingStage(this, output_encoding_info.color_encoding));
+    }
+
+    if (options.coalescing && frame_header.CanBeReferenced() &&
+        !frame_header.save_before_color_transform) {
+      if (linear) {
+        builder.AddStage(GetFromLinearStage(output_encoding_info));
+        linear = false;
+      }
+      builder.AddStage(GetWriteToImageBundleStage(
+          &frame_storage_for_referencing, output_encoding_info.color_encoding));
+    }
+
+    if (options.render_spotcolors &&
+        frame_header.nonserialized_metadata->m.Find(ExtraChannel::kSpotColor)) {
+      for (size_t i = 0; i < decoded->metadata()->extra_channel_info.size();
+           i++) {
+        // Don't use Find() because there may be multiple spot color channels.
+        const ExtraChannelInfo& eci =
+            decoded->metadata()->extra_channel_info[i];
+        if (eci.type == ExtraChannel::kSpotColor) {
+          builder.AddStage(GetSpotColorStage(3 + i, eci.spot_color));
+        }
+      }
+    }
+
+    auto tone_mapping_stage = GetToneMappingStage(output_encoding_info);
+    if (tone_mapping_stage) {
+      if (!linear) {
+        auto to_linear_stage = GetToLinearStage(output_encoding_info);
+        if (!to_linear_stage) {
+          return JXL_FAILURE(
+              "attempting to perform tone mapping on colorspace not "
+              "convertible to linear");
+        }
+        builder.AddStage(std::move(to_linear_stage));
+        linear = true;
+      }
+      builder.AddStage(std::move(tone_mapping_stage));
+    }
+
+    if (linear) {
+      builder.AddStage(GetFromLinearStage(output_encoding_info));
+      linear = false;
+    }
+
+    if (main_output.callback.IsPresent() || main_output.buffer) {
+      builder.AddStage(GetWriteToOutputStage(main_output, width, height,
+                                             has_alpha, unpremul_alpha, alpha_c,
+                                             undo_orientation, extra_output));
+    } else {
+      builder.AddStage(GetWriteToImageBundleStage(
+          decoded, output_encoding_info.color_encoding));
+    }
+  }
+  render_pipeline = std::move(builder).Finalize(shared->frame_dim);
+  return render_pipeline->IsInitialized();
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/dec_cache.h b/third_party/jpeg-xl/lib/jxl/dec_cache.h
new file mode 100644
index 0000000000..7c9fe9a6c3
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/dec_cache.h
@@ -0,0 +1,261 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_DEC_CACHE_H_
+#define LIB_JXL_DEC_CACHE_H_
+
+#include <jxl/decode.h>
+#include <stdint.h>
+
+#include <atomic>
+#include <hwy/base.h>  // HWY_ALIGN_MAX
+
+#include "lib/jxl/ac_strategy.h"
+#include "lib/jxl/base/profiler.h"
+#include "lib/jxl/coeff_order.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/convolve.h"
+#include "lib/jxl/dec_group_border.h"
+#include "lib/jxl/dec_noise.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/passes_state.h"
+#include "lib/jxl/quant_weights.h"
+#include "lib/jxl/render_pipeline/render_pipeline.h"
+#include "lib/jxl/render_pipeline/stage_upsampling.h"
+#include "lib/jxl/sanitizers.h"
+
+namespace jxl {
+
+constexpr size_t kSigmaBorder = 1;
+constexpr size_t kSigmaPadding = 2;
+
+struct PixelCallback {
+  PixelCallback() = default;
+  PixelCallback(JxlImageOutInitCallback init, JxlImageOutRunCallback run,
+                JxlImageOutDestroyCallback destroy, void* init_opaque)
+      : init(init), run(run), destroy(destroy), init_opaque(init_opaque) {
+#if JXL_ENABLE_ASSERT
+    const bool has_init = init != nullptr;
+    const bool has_run = run != nullptr;
+    const bool has_destroy = destroy != nullptr;
+    JXL_ASSERT(has_init == has_run && has_run == has_destroy);
+#endif
+  }
+
+  bool IsPresent() const { return run != nullptr; }
+
+  void* Init(size_t num_threads, size_t num_pixels) const {
+    return init(init_opaque, num_threads, num_pixels);
+  }
+
+  JxlImageOutInitCallback init = nullptr;
+  JxlImageOutRunCallback run = nullptr;
+  JxlImageOutDestroyCallback destroy = nullptr;
+  void* init_opaque = nullptr;
+};
+
+struct ImageOutput {
+  // Pixel format of the output pixels, used for buffer and callback output.
+  JxlPixelFormat format;
+  // Output bit depth for unsigned data types, used for float to int conversion.
+  size_t bits_per_sample;
+  // Callback for line-by-line output.
+  PixelCallback callback;
+  // Pixel buffer for image output.
+  void* buffer;
+  size_t buffer_size;
+  // Length of a row of image_buffer in bytes (based on oriented width).
+  size_t stride;
+};
+
+// Per-frame decoder state. All the images here should be accessed through a
+// group rect (either with block units or pixel units).
+struct PassesDecoderState {
+  PassesSharedState shared_storage;
+  // Allows avoiding copies for encoder loop.
+  const PassesSharedState* JXL_RESTRICT shared = &shared_storage;
+
+  // 8x upsampling stage for DC.
+  std::unique_ptr<RenderPipelineStage> upsampler8x;
+
+  // For ANS decoding.
+  std::vector<ANSCode> code;
+  std::vector<std::vector<uint8_t>> context_map;
+
+  // Multiplier to be applied to the quant matrices of the x channel.
+  float x_dm_multiplier;
+  float b_dm_multiplier;
+
+  // Sigma values for EPF.
+  ImageF sigma;
+
+  // Image dimensions before applying undo_orientation.
+  size_t width;
+  size_t height;
+  ImageOutput main_output;
+  std::vector<ImageOutput> extra_output;
+
+  // Whether to use int16 float-XYB-to-uint8-srgb conversion.
+  bool fast_xyb_srgb8_conversion;
+
+  // If true, the RGBA output will be unpremultiplied before writing to the
+  // output.
+  bool unpremul_alpha;
+
+  // The render pipeline will apply this orientation to bring the image to the
+  // intended display orientation.
+  Orientation undo_orientation;
+
+  // Used for seeding noise.
+  size_t visible_frame_index = 0;
+  size_t nonvisible_frame_index = 0;
+
+  // Keep track of the transform types used.
+  std::atomic<uint32_t> used_acs{0};
+
+  // Storage for coefficients if in "accumulate" mode.
+  std::unique_ptr<ACImage> coefficients = make_unique<ACImageT<int32_t>>(0, 0);
+
+  // Rendering pipeline.
+  std::unique_ptr<RenderPipeline> render_pipeline;
+
+  // Storage for the current frame if it can be referenced by future frames.
+  ImageBundle frame_storage_for_referencing;
+
+  struct PipelineOptions {
+    bool use_slow_render_pipeline;
+    bool coalescing;
+    bool render_spotcolors;
+  };
+
+  Status PreparePipeline(ImageBundle* decoded, PipelineOptions options);
+
+  // Information for colour conversions.
+  OutputEncodingInfo output_encoding_info;
+
+  // Initializes decoder-specific structures using information from *shared.
+  Status Init() {
+    x_dm_multiplier =
+        std::pow(1 / (1.25f), shared->frame_header.x_qm_scale - 2.0f);
+    b_dm_multiplier =
+        std::pow(1 / (1.25f), shared->frame_header.b_qm_scale - 2.0f);
+
+    main_output.callback = PixelCallback();
+    main_output.buffer = nullptr;
+    extra_output.clear();
+
+    fast_xyb_srgb8_conversion = false;
+    unpremul_alpha = false;
+    undo_orientation = Orientation::kIdentity;
+
+    used_acs = 0;
+
+    upsampler8x = GetUpsamplingStage(shared->metadata->transform_data, 0, 3);
+    if (shared->frame_header.loop_filter.epf_iters > 0) {
+      sigma = ImageF(shared->frame_dim.xsize_blocks + 2 * kSigmaPadding,
+                     shared->frame_dim.ysize_blocks + 2 * kSigmaPadding);
+    }
+    return true;
+  }
+
+  // Initialize the decoder state after all of DC is decoded.
+  Status InitForAC(ThreadPool* pool) {
+    shared_storage.coeff_order_size = 0;
+    for (uint8_t o = 0; o < AcStrategy::kNumValidStrategies; ++o) {
+      if (((1 << o) & used_acs) == 0) continue;
+      uint8_t ord = kStrategyOrder[o];
+      shared_storage.coeff_order_size =
+          std::max(kCoeffOrderOffset[3 * (ord + 1)] * kDCTBlockSize,
+                   shared_storage.coeff_order_size);
+    }
+    size_t sz = shared_storage.frame_header.passes.num_passes *
+                shared_storage.coeff_order_size;
+    if (sz > shared_storage.coeff_orders.size()) {
+      shared_storage.coeff_orders.resize(sz);
+    }
+    return true;
+  }
+
+  // Fills the `state->filter_weights.sigma` image with the precomputed sigma
+  // values in the area inside `block_rect`. Accesses the AC strategy, quant
+  // field and epf_sharpness fields in the corresponding positions.
+  void ComputeSigma(const Rect& block_rect, PassesDecoderState* state);
+};
+
+// Temp images required for decoding a single group. Reduces memory allocations
+// for large images because we only initialize min(#threads, #groups) instances.
+struct GroupDecCache {
+  void InitOnce(size_t num_passes, size_t used_acs) {
+    PROFILER_FUNC;
+
+    for (size_t i = 0; i < num_passes; i++) {
+      if (num_nzeroes[i].xsize() == 0) {
+        // Allocate enough for a whole group - partial groups on the
+        // right/bottom border just use a subset. The valid size is passed via
+        // Rect.
+
+        num_nzeroes[i] = Image3I(kGroupDimInBlocks, kGroupDimInBlocks);
+      }
+    }
+    size_t max_block_area = 0;
+
+    for (uint8_t o = 0; o < AcStrategy::kNumValidStrategies; ++o) {
+      AcStrategy acs = AcStrategy::FromRawStrategy(o);
+      if ((used_acs & (1 << o)) == 0) continue;
+      size_t area =
+          acs.covered_blocks_x() * acs.covered_blocks_y() * kDCTBlockSize;
+      max_block_area = std::max(area, max_block_area);
+    }
+
+    if (max_block_area > max_block_area_) {
+      max_block_area_ = max_block_area;
+      // We need 3x float blocks for dequantized coefficients and 1x for scratch
+      // space for transforms.
+      float_memory_ = hwy::AllocateAligned<float>(max_block_area_ * 4);
+      // We need 3x int32 or int16 blocks for quantized coefficients.
+      int32_memory_ = hwy::AllocateAligned<int32_t>(max_block_area_ * 3);
+      int16_memory_ = hwy::AllocateAligned<int16_t>(max_block_area_ * 3);
+    }
+
+    dec_group_block = float_memory_.get();
+    scratch_space = dec_group_block + max_block_area_ * 3;
+    dec_group_qblock = int32_memory_.get();
+    dec_group_qblock16 = int16_memory_.get();
+  }
+
+  void InitDCBufferOnce() {
+    if (dc_buffer.xsize() == 0) {
+      dc_buffer = ImageF(kGroupDimInBlocks + kRenderPipelineXOffset * 2,
+                         kGroupDimInBlocks + 4);
+    }
+  }
+
+  // Scratch space used by DecGroupImpl().
+  float* dec_group_block;
+  int32_t* dec_group_qblock;
+  int16_t* dec_group_qblock16;
+
+  // For TransformToPixels.
+  float* scratch_space;
+  // Note that scratch_space is never used at the same time as dec_group_qblock.
+  // Moreover, only one of dec_group_qblock16 is ever used.
+  // TODO(veluca): figure out if we can save allocations.
+
+  // AC decoding
+  Image3I num_nzeroes[kMaxNumPasses];
+
+  // Buffer for DC upsampling.
+  ImageF dc_buffer;
+
+ private:
+  hwy::AlignedFreeUniquePtr<float[]> float_memory_;
+  hwy::AlignedFreeUniquePtr<int32_t[]> int32_memory_;
+  hwy::AlignedFreeUniquePtr<int16_t[]> int16_memory_;
+  size_t max_block_area_ = 0;
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_DEC_CACHE_H_
diff --git a/third_party/jpeg-xl/lib/jxl/dec_context_map.cc b/third_party/jpeg-xl/lib/jxl/dec_context_map.cc
new file mode 100644
index 0000000000..1b291650d7
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/dec_context_map.cc
@@ -0,0 +1,86 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/dec_context_map.h"
+
+#include <algorithm>
+#include <vector>
+
+#include "lib/jxl/ans_params.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/dec_ans.h"
+#include "lib/jxl/entropy_coder.h"
+#include "lib/jxl/inverse_mtf-inl.h"
+
+namespace jxl {
+
+namespace {
+
+Status VerifyContextMap(const std::vector<uint8_t>& context_map,
+                        const size_t num_htrees) {
+  std::vector<bool> have_htree(num_htrees);
+  size_t num_found = 0;
+  for (const uint8_t htree : context_map) {
+    if (htree >= num_htrees) {
+      return JXL_FAILURE("Invalid histogram index in context map.");
+    }
+    if (!have_htree[htree]) {
+      have_htree[htree] = true;
+      ++num_found;
+    }
+  }
+  if (num_found != num_htrees) {
+    return JXL_FAILURE("Incomplete context map.");
+  }
+  return true;
+}
+
+}  // namespace
+
+Status DecodeContextMap(std::vector<uint8_t>* context_map, size_t* num_htrees,
+                        BitReader* input) {
+  bool is_simple = input->ReadFixedBits<1>();
+  if (is_simple) {
+    int bits_per_entry = input->ReadFixedBits<2>();
+    if (bits_per_entry != 0) {
+      for (size_t i = 0; i < context_map->size(); i++) {
+        (*context_map)[i] = input->ReadBits(bits_per_entry);
+      }
+    } else {
+      std::fill(context_map->begin(), context_map->end(), 0);
+    }
+  } else {
+    bool use_mtf = input->ReadFixedBits<1>();
+    ANSCode code;
+    std::vector<uint8_t> dummy_ctx_map;
+    // Usage of LZ77 is disallowed if decoding only two symbols. This doesn't
+    // make sense in non-malicious bitstreams, and could cause a stack overflow
+    // in malicious bitstreams by making every context map require its own
+    // context map.
+    JXL_RETURN_IF_ERROR(
+        DecodeHistograms(input, 1, &code, &dummy_ctx_map,
+                         /*disallow_lz77=*/context_map->size() <= 2));
+    ANSSymbolReader reader(&code, input);
+    size_t i = 0;
+    while (i < context_map->size()) {
+      uint32_t sym = reader.ReadHybridUint(0, input, dummy_ctx_map);
+      if (sym >= kMaxClusters) {
+        return JXL_FAILURE("Invalid cluster ID");
+      }
+      (*context_map)[i] = sym;
+      i++;
+    }
+    if (!reader.CheckANSFinalState()) {
+      return JXL_FAILURE("Invalid context map");
+    }
+    if (use_mtf) {
+      InverseMoveToFrontTransform(context_map->data(), context_map->size());
+    }
+  }
+  *num_htrees = *std::max_element(context_map->begin(), context_map->end()) + 1;
+  return VerifyContextMap(*context_map, *num_htrees);
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/dec_context_map.h b/third_party/jpeg-xl/lib/jxl/dec_context_map.h
new file mode 100644
index 0000000000..95b8a0ca92
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/dec_context_map.h
@@ -0,0 +1,30 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_DEC_CONTEXT_MAP_H_
+#define LIB_JXL_DEC_CONTEXT_MAP_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <vector>
+
+#include "lib/jxl/dec_bit_reader.h"
+
+namespace jxl {
+
+// Context map uses uint8_t.
+constexpr size_t kMaxClusters = 256;
+
+// Reads the context map from the bit stream. On calling this function,
+// context_map->size() must be the number of possible context ids.
+// Sets *num_htrees to the number of different histogram ids in
+// *context_map.
+Status DecodeContextMap(std::vector<uint8_t>* context_map, size_t* num_htrees,
+                        BitReader* input);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_DEC_CONTEXT_MAP_H_
diff --git a/third_party/jpeg-xl/lib/jxl/dec_external_image.cc b/third_party/jpeg-xl/lib/jxl/dec_external_image.cc
new file mode 100644
index 0000000000..bbf457ba91
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/dec_external_image.cc
@@ -0,0 +1,493 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/dec_external_image.h"
+
+#include <string.h>
+
+#include <algorithm>
+#include <array>
+#include <functional>
+#include <utility>
+#include <vector>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/dec_external_image.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/alpha.h"
+#include "lib/jxl/base/byte_order.h"
+#include "lib/jxl/base/cache_aligned.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/color_management.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/sanitizers.h"
+#include "lib/jxl/transfer_functions-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Clamp;
+using hwy::HWY_NAMESPACE::NearestInt;
+
+// TODO(jon): check if this can be replaced by a FloatToU16 function
+void FloatToU32(const float* in, uint32_t* out, size_t num, float mul,
+                size_t bits_per_sample) {
+  const HWY_FULL(float) d;
+  const hwy::HWY_NAMESPACE::Rebind<uint32_t, decltype(d)> du;
+
+  // Unpoison accessing partially-uninitialized vectors with memory sanitizer.
+  // This is because we run NearestInt() on the vector, which triggers msan even
+  // it it safe to do so since the values are not mixed between lanes.
+  const size_t num_round_up = RoundUpTo(num, Lanes(d));
+  msan::UnpoisonMemory(in + num, sizeof(in[0]) * (num_round_up - num));
+
+  const auto one = Set(d, 1.0f);
+  const auto scale = Set(d, mul);
+  for (size_t x = 0; x < num; x += Lanes(d)) {
+    auto v = Load(d, in + x);
+    // Clamp turns NaN to 'min'.
+    v = Clamp(v, Zero(d), one);
+    auto i = NearestInt(Mul(v, scale));
+    Store(BitCast(du, i), du, out + x);
+  }
+
+  // Poison back the output.
+  msan::PoisonMemory(out + num, sizeof(out[0]) * (num_round_up - num));
+}
+
+void FloatToF16(const float* in, hwy::float16_t* out, size_t num) {
+  const HWY_FULL(float) d;
+  const hwy::HWY_NAMESPACE::Rebind<hwy::float16_t, decltype(d)> du;
+
+  // Unpoison accessing partially-uninitialized vectors with memory sanitizer.
+  // This is because we run DemoteTo() on the vector which triggers msan.
+  const size_t num_round_up = RoundUpTo(num, Lanes(d));
+  msan::UnpoisonMemory(in + num, sizeof(in[0]) * (num_round_up - num));
+
+  for (size_t x = 0; x < num; x += Lanes(d)) {
+    auto v = Load(d, in + x);
+    auto v16 = DemoteTo(du, v);
+    Store(v16, du, out + x);
+  }
+
+  // Poison back the output.
+  msan::PoisonMemory(out + num, sizeof(out[0]) * (num_round_up - num));
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace jxl {
+namespace {
+
+// Stores a float in big endian
+void StoreBEFloat(float value, uint8_t* p) {
+  uint32_t u;
+  memcpy(&u, &value, 4);
+  StoreBE32(u, p);
+}
+
+// Stores a float in little endian
+void StoreLEFloat(float value, uint8_t* p) {
+  uint32_t u;
+  memcpy(&u, &value, 4);
+  StoreLE32(u, p);
+}
+
+// The orientation may not be identity.
+// TODO(lode): SIMDify where possible
+template <typename T>
+Status UndoOrientation(jxl::Orientation undo_orientation, const Plane<T>& image,
+                       Plane<T>& out, jxl::ThreadPool* pool) {
+  const size_t xsize = image.xsize();
+  const size_t ysize = image.ysize();
+
+  if (undo_orientation == Orientation::kFlipHorizontal) {
+    out = Plane<T>(xsize, ysize);
+    JXL_RETURN_IF_ERROR(RunOnPool(
+        pool, 0, static_cast<uint32_t>(ysize), ThreadPool::NoInit,
+        [&](const uint32_t task, size_t /*thread*/) {
+          const int64_t y = task;
+          const T* JXL_RESTRICT row_in = image.Row(y);
+          T* JXL_RESTRICT row_out = out.Row(y);
+          for (size_t x = 0; x < xsize; ++x) {
+            row_out[xsize - x - 1] = row_in[x];
+          }
+        },
+        "UndoOrientation"));
+  } else if (undo_orientation == Orientation::kRotate180) {
+    out = Plane<T>(xsize, ysize);
+    JXL_RETURN_IF_ERROR(RunOnPool(
+        pool, 0, static_cast<uint32_t>(ysize), ThreadPool::NoInit,
+        [&](const uint32_t task, size_t /*thread*/) {
+          const int64_t y = task;
+          const T* JXL_RESTRICT row_in = image.Row(y);
+          T* JXL_RESTRICT row_out = out.Row(ysize - y - 1);
+          for (size_t x = 0; x < xsize; ++x) {
+            row_out[xsize - x - 1] = row_in[x];
+          }
+        },
+        "UndoOrientation"));
+  } else if (undo_orientation == Orientation::kFlipVertical) {
+    out = Plane<T>(xsize, ysize);
+    JXL_RETURN_IF_ERROR(RunOnPool(
+        pool, 0, static_cast<uint32_t>(ysize), ThreadPool::NoInit,
+        [&](const uint32_t task, size_t /*thread*/) {
+          const int64_t y = task;
+          const T* JXL_RESTRICT row_in = image.Row(y);
+          T* JXL_RESTRICT row_out = out.Row(ysize - y - 1);
+          for (size_t x = 0; x < xsize; ++x) {
+            row_out[x] = row_in[x];
+          }
+        },
+        "UndoOrientation"));
+  } else if (undo_orientation == Orientation::kTranspose) {
+    out = Plane<T>(ysize, xsize);
+    JXL_RETURN_IF_ERROR(RunOnPool(
+        pool, 0, static_cast<uint32_t>(ysize), ThreadPool::NoInit,
+        [&](const uint32_t task, size_t /*thread*/) {
+          const int64_t y = task;
+          const T* JXL_RESTRICT row_in = image.Row(y);
+          for (size_t x = 0; x < xsize; ++x) {
+            out.Row(x)[y] = row_in[x];
+          }
+        },
+        "UndoOrientation"));
+  } else if (undo_orientation == Orientation::kRotate90) {
+    out = Plane<T>(ysize, xsize);
+    JXL_RETURN_IF_ERROR(RunOnPool(
+        pool, 0, static_cast<uint32_t>(ysize), ThreadPool::NoInit,
+        [&](const uint32_t task, size_t /*thread*/) {
+          const int64_t y = task;
+          const T* JXL_RESTRICT row_in = image.Row(y);
+          for (size_t x = 0; x < xsize; ++x) {
+            out.Row(x)[ysize - y - 1] = row_in[x];
+          }
+        },
+        "UndoOrientation"));
+  } else if (undo_orientation == Orientation::kAntiTranspose) {
+    out = Plane<T>(ysize, xsize);
+    JXL_RETURN_IF_ERROR(RunOnPool(
+        pool, 0, static_cast<uint32_t>(ysize), ThreadPool::NoInit,
+        [&](const uint32_t task, size_t /*thread*/) {
+          const int64_t y = task;
+          const T* JXL_RESTRICT row_in = image.Row(y);
+          for (size_t x = 0; x < xsize; ++x) {
+            out.Row(xsize - x - 1)[ysize - y - 1] = row_in[x];
+          }
+        },
+        "UndoOrientation"));
+  } else if (undo_orientation == Orientation::kRotate270) {
+    out = Plane<T>(ysize, xsize);
+    JXL_RETURN_IF_ERROR(RunOnPool(
+        pool, 0, static_cast<uint32_t>(ysize), ThreadPool::NoInit,
+        [&](const uint32_t task, size_t /*thread*/) {
+          const int64_t y = task;
+          const T* JXL_RESTRICT row_in = image.Row(y);
+          for (size_t x = 0; x < xsize; ++x) {
+            out.Row(xsize - x - 1)[y] = row_in[x];
+          }
+        },
+        "UndoOrientation"));
+  }
+  return true;
+}
+}  // namespace
+
+HWY_EXPORT(FloatToU32);
+HWY_EXPORT(FloatToF16);
+
+namespace {
+
+using StoreFuncType = void(uint32_t value, uint8_t* dest);
+template <StoreFuncType StoreFunc>
+void StoreUintRow(uint32_t* JXL_RESTRICT* rows_u32, size_t num_channels,
+                  size_t xsize, size_t bytes_per_sample,
+                  uint8_t* JXL_RESTRICT out) {
+  for (size_t x = 0; x < xsize; ++x) {
+    for (size_t c = 0; c < num_channels; c++) {
+      StoreFunc(rows_u32[c][x],
+                out + (num_channels * x + c) * bytes_per_sample);
+    }
+  }
+}
+
+template <void(StoreFunc)(float, uint8_t*)>
+void StoreFloatRow(const float* JXL_RESTRICT* rows_in, size_t num_channels,
+                   size_t xsize, uint8_t* JXL_RESTRICT out) {
+  for (size_t x = 0; x < xsize; ++x) {
+    for (size_t c = 0; c < num_channels; c++) {
+      StoreFunc(rows_in[c][x], out + (num_channels * x + c) * sizeof(float));
+    }
+  }
+}
+
+void JXL_INLINE Store8(uint32_t value, uint8_t* dest) { *dest = value & 0xff; }
+
+// Maximum number of channels for the ConvertChannelsToExternal function.
+const size_t kConvertMaxChannels = 4;
+
+// Converts a list of channels to an interleaved image, applying transformations
+// when needed.
+// The input channels are given as a (non-const!) array of channel pointers and
+// interleaved in that order.
+//
+// Note: if a pointer in channels[] is nullptr, a 1.0 value will be used
+// instead. This is useful for handling when a user requests an alpha channel
+// from an image that doesn't have one. The first channel in the list may not
+// be nullptr, since it is used to determine the image size.
+Status ConvertChannelsToExternal(const ImageF* channels[], size_t num_channels,
+                                 size_t bits_per_sample, bool float_out,
+                                 JxlEndianness endianness, size_t stride,
+                                 jxl::ThreadPool* pool, void* out_image,
+                                 size_t out_size,
+                                 const PixelCallback& out_callback,
+                                 jxl::Orientation undo_orientation) {
+  JXL_DASSERT(num_channels != 0 && num_channels <= kConvertMaxChannels);
+  JXL_DASSERT(channels[0] != nullptr);
+  JXL_CHECK(float_out ? bits_per_sample == 16 || bits_per_sample == 32
+                      : bits_per_sample > 0 && bits_per_sample <= 16);
+  if (!!out_image == out_callback.IsPresent()) {
+    return JXL_FAILURE(
+        "Must provide either an out_image or an out_callback, but not both.");
+  }
+
+  const size_t bytes_per_channel = DivCeil(bits_per_sample, jxl::kBitsPerByte);
+  const size_t bytes_per_pixel = num_channels * bytes_per_channel;
+
+  std::vector<std::vector<uint8_t>> row_out_callback;
+  const auto FreeCallbackOpaque = [&out_callback](void* p) {
+    out_callback.destroy(p);
+  };
+  std::unique_ptr<void, decltype(FreeCallbackOpaque)> out_run_opaque(
+      nullptr, FreeCallbackOpaque);
+  auto InitOutCallback = [&](size_t num_threads) -> Status {
+    if (out_callback.IsPresent()) {
+      out_run_opaque.reset(out_callback.Init(num_threads, stride));
+      JXL_RETURN_IF_ERROR(out_run_opaque != nullptr);
+      row_out_callback.resize(num_threads);
+      for (size_t i = 0; i < num_threads; ++i) {
+        row_out_callback[i].resize(stride);
+      }
+    }
+    return true;
+  };
+
+  // Channels used to store the transformed original channels if needed.
+  ImageF temp_channels[kConvertMaxChannels];
+  if (undo_orientation != Orientation::kIdentity) {
+    for (size_t c = 0; c < num_channels; ++c) {
+      if (channels[c]) {
+        JXL_RETURN_IF_ERROR(UndoOrientation(undo_orientation, *channels[c],
+                                            temp_channels[c], pool));
+        channels[c] = &(temp_channels[c]);
+      }
+    }
+  }
+
+  // First channel may not be nullptr.
+  size_t xsize = channels[0]->xsize();
+  size_t ysize = channels[0]->ysize();
+  if (stride < bytes_per_pixel * xsize) {
+    return JXL_FAILURE("stride is smaller than scanline width in bytes: %" PRIuS
+                       " vs %" PRIuS,
+                       stride, bytes_per_pixel * xsize);
+  }
+  if (!out_callback.IsPresent() &&
+      out_size < (ysize - 1) * stride + bytes_per_pixel * xsize) {
+    return JXL_FAILURE("out_size is too small to store image");
+  }
+
+  const bool little_endian =
+      endianness == JXL_LITTLE_ENDIAN ||
+      (endianness == JXL_NATIVE_ENDIAN && IsLittleEndian());
+
+  // Handle the case where a channel is nullptr by creating a single row with
+  // ones to use instead.
+  ImageF ones;
+  for (size_t c = 0; c < num_channels; ++c) {
+    if (!channels[c]) {
+      ones = ImageF(xsize, 1);
+      FillImage(1.0f, &ones);
+      break;
+    }
+  }
+
+  if (float_out) {
+    if (bits_per_sample == 16) {
+      bool swap_endianness = little_endian != IsLittleEndian();
+      Plane<hwy::float16_t> f16_cache;
+      JXL_RETURN_IF_ERROR(RunOnPool(
+          pool, 0, static_cast<uint32_t>(ysize),
+          [&](size_t num_threads) {
+            f16_cache =
+                Plane<hwy::float16_t>(xsize, num_channels * num_threads);
+            return InitOutCallback(num_threads);
+          },
+          [&](const uint32_t task, const size_t thread) {
+            const int64_t y = task;
+            const float* JXL_RESTRICT row_in[kConvertMaxChannels];
+            for (size_t c = 0; c < num_channels; c++) {
+              row_in[c] = channels[c] ? channels[c]->Row(y) : ones.Row(0);
+            }
+            hwy::float16_t* JXL_RESTRICT row_f16[kConvertMaxChannels];
+            for (size_t c = 0; c < num_channels; c++) {
+              row_f16[c] = f16_cache.Row(c + thread * num_channels);
+              HWY_DYNAMIC_DISPATCH(FloatToF16)
+              (row_in[c], row_f16[c], xsize);
+            }
+            uint8_t* row_out =
+                out_callback.IsPresent()
+                    ? row_out_callback[thread].data()
+                    : &(reinterpret_cast<uint8_t*>(out_image))[stride * y];
+            // interleave the one scanline
+            hwy::float16_t* row_f16_out =
+                reinterpret_cast<hwy::float16_t*>(row_out);
+            for (size_t x = 0; x < xsize; x++) {
+              for (size_t c = 0; c < num_channels; c++) {
+                row_f16_out[x * num_channels + c] = row_f16[c][x];
+              }
+            }
+            if (swap_endianness) {
+              size_t size = xsize * num_channels * 2;
+              for (size_t i = 0; i < size; i += 2) {
+                std::swap(row_out[i + 0], row_out[i + 1]);
+              }
+            }
+            if (out_callback.IsPresent()) {
+              out_callback.run(out_run_opaque.get(), thread, 0, y, xsize,
+                               row_out);
+            }
+          },
+          "ConvertF16"));
+    } else if (bits_per_sample == 32) {
+      JXL_RETURN_IF_ERROR(RunOnPool(
+          pool, 0, static_cast<uint32_t>(ysize),
+          [&](size_t num_threads) { return InitOutCallback(num_threads); },
+          [&](const uint32_t task, const size_t thread) {
+            const int64_t y = task;
+            uint8_t* row_out =
+                out_callback.IsPresent()
+                    ? row_out_callback[thread].data()
+                    : &(reinterpret_cast<uint8_t*>(out_image))[stride * y];
+            const float* JXL_RESTRICT row_in[kConvertMaxChannels];
+            for (size_t c = 0; c < num_channels; c++) {
+              row_in[c] = channels[c] ? channels[c]->Row(y) : ones.Row(0);
+            }
+            if (little_endian) {
+              StoreFloatRow<StoreLEFloat>(row_in, num_channels, xsize, row_out);
+            } else {
+              StoreFloatRow<StoreBEFloat>(row_in, num_channels, xsize, row_out);
+            }
+            if (out_callback.IsPresent()) {
+              out_callback.run(out_run_opaque.get(), thread, 0, y, xsize,
+                               row_out);
+            }
+          },
+          "ConvertFloat"));
+    } else {
+      return JXL_FAILURE("float other than 16-bit and 32-bit not supported");
+    }
+  } else {
+    // Multiplier to convert from floating point 0-1 range to the integer
+    // range.
+    float mul = (1ull << bits_per_sample) - 1;
+    Plane<uint32_t> u32_cache;
+    JXL_RETURN_IF_ERROR(RunOnPool(
+        pool, 0, static_cast<uint32_t>(ysize),
+        [&](size_t num_threads) {
+          u32_cache = Plane<uint32_t>(xsize, num_channels * num_threads);
+          return InitOutCallback(num_threads);
+        },
+        [&](const uint32_t task, const size_t thread) {
+          const int64_t y = task;
+          uint8_t* row_out =
+              out_callback.IsPresent()
+                  ? row_out_callback[thread].data()
+                  : &(reinterpret_cast<uint8_t*>(out_image))[stride * y];
+          const float* JXL_RESTRICT row_in[kConvertMaxChannels];
+          for (size_t c = 0; c < num_channels; c++) {
+            row_in[c] = channels[c] ? channels[c]->Row(y) : ones.Row(0);
+          }
+          uint32_t* JXL_RESTRICT row_u32[kConvertMaxChannels];
+          for (size_t c = 0; c < num_channels; c++) {
+            row_u32[c] = u32_cache.Row(c + thread * num_channels);
+            // row_u32[] is a per-thread temporary row storage, this isn't
+            // intended to be initialized on a previous run.
+            msan::PoisonMemory(row_u32[c], xsize * sizeof(row_u32[c][0]));
+            HWY_DYNAMIC_DISPATCH(FloatToU32)
+            (row_in[c], row_u32[c], xsize, mul, bits_per_sample);
+          }
+          if (bits_per_sample <= 8) {
+            StoreUintRow<Store8>(row_u32, num_channels, xsize, 1, row_out);
+          } else {
+            if (little_endian) {
+              StoreUintRow<StoreLE16>(row_u32, num_channels, xsize, 2, row_out);
+            } else {
+              StoreUintRow<StoreBE16>(row_u32, num_channels, xsize, 2, row_out);
+            }
+          }
+          if (out_callback.IsPresent()) {
+            out_callback.run(out_run_opaque.get(), thread, 0, y, xsize,
+                             row_out);
+          }
+        },
+        "ConvertUint"));
+  }
+  return true;
+}
+
+}  // namespace
+
+Status ConvertToExternal(const jxl::ImageBundle& ib, size_t bits_per_sample,
+                         bool float_out, size_t num_channels,
+                         JxlEndianness endianness, size_t stride,
+                         jxl::ThreadPool* pool, void* out_image,
+                         size_t out_size, const PixelCallback& out_callback,
+                         jxl::Orientation undo_orientation,
+                         bool unpremul_alpha) {
+  bool want_alpha = num_channels == 2 || num_channels == 4;
+  size_t color_channels = num_channels <= 2 ? 1 : 3;
+
+  const Image3F* color = &ib.color();
+  // Undo premultiplied alpha.
+  Image3F unpremul;
+  if (ib.AlphaIsPremultiplied() && ib.HasAlpha() && unpremul_alpha) {
+    unpremul = Image3F(color->xsize(), color->ysize());
+    CopyImageTo(*color, &unpremul);
+    for (size_t y = 0; y < unpremul.ysize(); y++) {
+      UnpremultiplyAlpha(unpremul.PlaneRow(0, y), unpremul.PlaneRow(1, y),
+                         unpremul.PlaneRow(2, y), ib.alpha().Row(y),
+                         unpremul.xsize());
+    }
+    color = &unpremul;
+  }
+
+  const ImageF* channels[kConvertMaxChannels];
+  size_t c = 0;
+  for (; c < color_channels; c++) {
+    channels[c] = &color->Plane(c);
+  }
+  if (want_alpha) {
+    channels[c++] = ib.HasAlpha() ? &ib.alpha() : nullptr;
+  }
+  JXL_ASSERT(num_channels == c);
+
+  return ConvertChannelsToExternal(
+      channels, num_channels, bits_per_sample, float_out, endianness, stride,
+      pool, out_image, out_size, out_callback, undo_orientation);
+}
+
+}  // namespace jxl
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jxl/dec_external_image.h b/third_party/jpeg-xl/lib/jxl/dec_external_image.h
new file mode 100644
index 0000000000..6ca7abff62
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/dec_external_image.h
@@ -0,0 +1,46 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_DEC_EXTERNAL_IMAGE_H_
+#define LIB_JXL_DEC_EXTERNAL_IMAGE_H_
+
+// Interleaved image for color transforms and Codec.
+
+#include <jxl/decode.h>
+#include <jxl/types.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/color_encoding_internal.h"
+#include "lib/jxl/dec_cache.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/image_bundle.h"
+
+namespace jxl {
+
+// Converts ib to interleaved void* pixel buffer with the given format.
+// bits_per_sample: must be 16 or 32 if float_out is true, and at most 16
+// if it is false. No bit packing is done.
+// num_channels: must be 1, 2, 3 or 4 for gray, gray+alpha, RGB, RGB+alpha.
+// This supports the features needed for the C API and does not perform
+// color space conversion.
+// TODO(lode): support rectangle crop.
+// stride_out is output scanline size in bytes, must be >=
+// output_xsize * output_bytes_per_pixel.
+// undo_orientation is an EXIF orientation to undo. Depending on the
+// orientation, the output xsize and ysize are swapped compared to input
+// xsize and ysize.
+Status ConvertToExternal(const jxl::ImageBundle& ib, size_t bits_per_sample,
+                         bool float_out, size_t num_channels,
+                         JxlEndianness endianness, size_t stride_out,
+                         jxl::ThreadPool* thread_pool, void* out_image,
+                         size_t out_size, const PixelCallback& out_callback,
+                         jxl::Orientation undo_orientation,
+                         bool unpremul_alpha = false);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_DEC_EXTERNAL_IMAGE_H_
diff --git a/third_party/jpeg-xl/lib/jxl/dec_external_image_gbench.cc b/third_party/jpeg-xl/lib/jxl/dec_external_image_gbench.cc
new file mode 100644
index 0000000000..c87a4d5f36
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/dec_external_image_gbench.cc
@@ -0,0 +1,56 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "benchmark/benchmark.h"
+#include "lib/jxl/dec_external_image.h"
+#include "lib/jxl/image_ops.h"
+
+namespace jxl {
+namespace {
+
+// Decoder case, interleaves an internal float image.
+void BM_DecExternalImage_ConvertImageRGBA(benchmark::State& state) {
+  const size_t kNumIter = 5;
+  size_t xsize = state.range();
+  size_t ysize = state.range();
+  size_t num_channels = 4;
+
+  ImageMetadata im;
+  im.SetAlphaBits(8);
+  ImageBundle ib(&im);
+  Image3F color(xsize, ysize);
+  ZeroFillImage(&color);
+  ib.SetFromImage(std::move(color), ColorEncoding::SRGB());
+  ImageF alpha(xsize, ysize);
+  ZeroFillImage(&alpha);
+  ib.SetAlpha(std::move(alpha));
+
+  const size_t bytes_per_row = xsize * num_channels;
+  std::vector<uint8_t> interleaved(bytes_per_row * ysize);
+
+  for (auto _ : state) {
+    for (size_t i = 0; i < kNumIter; ++i) {
+      JXL_CHECK(ConvertToExternal(
+          ib,
+          /*bits_per_sample=*/8,
+          /*float_out=*/false, num_channels, JXL_NATIVE_ENDIAN,
+          /*stride*/ bytes_per_row,
+          /*thread_pool=*/nullptr, interleaved.data(), interleaved.size(),
+          /*out_callback=*/{},
+          /*undo_orientation=*/jxl::Orientation::kIdentity));
+    }
+  }
+
+  // Pixels per second.
+  state.SetItemsProcessed(kNumIter * state.iterations() * xsize * ysize);
+  state.SetBytesProcessed(kNumIter * state.iterations() * interleaved.size());
+}
+
+BENCHMARK(BM_DecExternalImage_ConvertImageRGBA)
+    ->RangeMultiplier(2)
+    ->Range(256, 2048);
+
+}  // namespace
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/dec_frame.cc b/third_party/jpeg-xl/lib/jxl/dec_frame.cc
new file mode 100644
index 0000000000..98508e431b
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/dec_frame.cc
@@ -0,0 +1,878 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/dec_frame.h"
+
+#include <jxl/types.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include <algorithm>
+#include <atomic>
+#include <hwy/aligned_allocator.h>
+#include <numeric>
+#include <utility>
+#include <vector>
+
+#include "lib/jxl/ac_context.h"
+#include "lib/jxl/ac_strategy.h"
+#include "lib/jxl/ans_params.h"
+#include "lib/jxl/base/bits.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/base/profiler.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/chroma_from_luma.h"
+#include "lib/jxl/coeff_order.h"
+#include "lib/jxl/coeff_order_fwd.h"
+#include "lib/jxl/color_management.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/compressed_dc.h"
+#include "lib/jxl/dec_ans.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/dec_cache.h"
+#include "lib/jxl/dec_group.h"
+#include "lib/jxl/dec_modular.h"
+#include "lib/jxl/dec_patch_dictionary.h"
+#include "lib/jxl/dec_xyb.h"
+#include "lib/jxl/epf.h"
+#include "lib/jxl/fields.h"
+#include "lib/jxl/frame_header.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/image_bundle.h"
+#include "lib/jxl/image_ops.h"
+#include "lib/jxl/jpeg/jpeg_data.h"
+#include "lib/jxl/loop_filter.h"
+#include "lib/jxl/passes_state.h"
+#include "lib/jxl/quant_weights.h"
+#include "lib/jxl/quantizer.h"
+#include "lib/jxl/sanitizers.h"
+#include "lib/jxl/splines.h"
+#include "lib/jxl/toc.h"
+
+namespace jxl {
+
+namespace {
+Status DecodeGlobalDCInfo(BitReader* reader, bool is_jpeg,
+                          PassesDecoderState* state, ThreadPool* pool) {
+  PROFILER_FUNC;
+  JXL_RETURN_IF_ERROR(state->shared_storage.quantizer.Decode(reader));
+
+  JXL_RETURN_IF_ERROR(
+      DecodeBlockCtxMap(reader, &state->shared_storage.block_ctx_map));
+
+  JXL_RETURN_IF_ERROR(state->shared_storage.cmap.DecodeDC(reader));
+
+  // Pre-compute info for decoding a group.
+  if (is_jpeg) {
+    state->shared_storage.quantizer.ClearDCMul();  // Don't dequant DC
+  }
+
+  state->shared_storage.ac_strategy.FillInvalid();
+  return true;
+}
+}  // namespace
+
+Status DecodeFrame(PassesDecoderState* dec_state, ThreadPool* JXL_RESTRICT pool,
+                   const uint8_t* next_in, size_t avail_in,
+                   ImageBundle* decoded, const CodecMetadata& metadata,
+                   bool use_slow_rendering_pipeline) {
+  FrameDecoder frame_decoder(dec_state, metadata, pool,
+                             use_slow_rendering_pipeline);
+
+  BitReader reader(Span<const uint8_t>(next_in, avail_in));
+  JXL_RETURN_IF_ERROR(frame_decoder.InitFrame(&reader, decoded,
+                                              /*is_preview=*/false));
+  JXL_RETURN_IF_ERROR(frame_decoder.InitFrameOutput());
+
+  JXL_RETURN_IF_ERROR(reader.AllReadsWithinBounds());
+  size_t header_bytes = reader.TotalBitsConsumed() / kBitsPerByte;
+  JXL_RETURN_IF_ERROR(reader.Close());
+
+  size_t processed_bytes = header_bytes;
+  Status close_ok = true;
+  std::vector<std::unique_ptr<BitReader>> section_readers;
+  {
+    std::vector<std::unique_ptr<BitReaderScopedCloser>> section_closers;
+    std::vector<FrameDecoder::SectionInfo> section_info;
+    std::vector<FrameDecoder::SectionStatus> section_status;
+    size_t pos = header_bytes;
+    size_t index = 0;
+    for (auto toc_entry : frame_decoder.Toc()) {
+      JXL_RETURN_IF_ERROR(pos + toc_entry.size <= avail_in);
+      auto br = make_unique<BitReader>(
+          Span<const uint8_t>(next_in + pos, toc_entry.size));
+      section_info.emplace_back(
+          FrameDecoder::SectionInfo{br.get(), toc_entry.id, index++});
+      section_closers.emplace_back(
+          make_unique<BitReaderScopedCloser>(br.get(), &close_ok));
+      section_readers.emplace_back(std::move(br));
+      pos += toc_entry.size;
+    }
+    section_status.resize(section_info.size());
+    JXL_RETURN_IF_ERROR(frame_decoder.ProcessSections(
+        section_info.data(), section_info.size(), section_status.data()));
+    for (size_t i = 0; i < section_status.size(); i++) {
+      JXL_RETURN_IF_ERROR(section_status[i] == FrameDecoder::kDone);
+      processed_bytes += frame_decoder.Toc()[i].size;
+    }
+  }
+  JXL_RETURN_IF_ERROR(close_ok);
+  JXL_RETURN_IF_ERROR(frame_decoder.FinalizeFrame());
+  decoded->SetDecodedBytes(processed_bytes);
+  return true;
+}
+
+Status FrameDecoder::InitFrame(BitReader* JXL_RESTRICT br, ImageBundle* decoded,
+                               bool is_preview) {
+  PROFILER_FUNC;
+  decoded_ = decoded;
+  JXL_ASSERT(is_finalized_);
+
+  // Reset the dequantization matrices to their default values.
+  dec_state_->shared_storage.matrices = DequantMatrices();
+
+  frame_header_.nonserialized_is_preview = is_preview;
+  JXL_ASSERT(frame_header_.nonserialized_metadata != nullptr);
+  JXL_RETURN_IF_ERROR(ReadFrameHeader(br, &frame_header_));
+  frame_dim_ = frame_header_.ToFrameDimensions();
+  JXL_DEBUG_V(2, "FrameHeader: %s", frame_header_.DebugString().c_str());
+
+  const size_t num_passes = frame_header_.passes.num_passes;
+  const size_t num_groups = frame_dim_.num_groups;
+
+  // If the previous frame was not a kRegularFrame, `decoded` may have different
+  // dimensions; must reset to avoid errors.
+  decoded->RemoveColor();
+  decoded->ClearExtraChannels();
+
+  decoded->duration = frame_header_.animation_frame.duration;
+
+  if (!frame_header_.nonserialized_is_preview &&
+      (frame_header_.is_last || frame_header_.animation_frame.duration > 0) &&
+      (frame_header_.frame_type == kRegularFrame ||
+       frame_header_.frame_type == kSkipProgressive)) {
+    ++dec_state_->visible_frame_index;
+    dec_state_->nonvisible_frame_index = 0;
+  } else {
+    ++dec_state_->nonvisible_frame_index;
+  }
+
+  // Read TOC.
+  const bool has_ac_global = true;
+  const size_t toc_entries = NumTocEntries(num_groups, frame_dim_.num_dc_groups,
+                                           num_passes, has_ac_global);
+  std::vector<uint32_t> sizes;
+  std::vector<coeff_order_t> permutation;
+  JXL_RETURN_IF_ERROR(ReadToc(toc_entries, br, &sizes, &permutation));
+  bool have_permutation = !permutation.empty();
+  toc_.resize(toc_entries);
+  section_sizes_sum_ = 0;
+  for (size_t i = 0; i < toc_entries; ++i) {
+    toc_[i].size = sizes[i];
+    size_t index = have_permutation ? permutation[i] : i;
+    toc_[index].id = i;
+    if (section_sizes_sum_ + toc_[i].size < section_sizes_sum_) {
+      return JXL_FAILURE("group offset overflow");
+    }
+    section_sizes_sum_ += toc_[i].size;
+  }
+
+  JXL_DASSERT((br->TotalBitsConsumed() % kBitsPerByte) == 0);
+  const size_t group_codes_begin = br->TotalBitsConsumed() / kBitsPerByte;
+  JXL_DASSERT(!toc_.empty());
+
+  // Overflow check.
+  if (group_codes_begin + section_sizes_sum_ < group_codes_begin) {
+    return JXL_FAILURE("Invalid group codes");
+  }
+
+  if (!frame_header_.chroma_subsampling.Is444() &&
+      !(frame_header_.flags & FrameHeader::kSkipAdaptiveDCSmoothing) &&
+      frame_header_.encoding == FrameEncoding::kVarDCT) {
+    return JXL_FAILURE(
+        "Non-444 chroma subsampling is not allowed when adaptive DC "
+        "smoothing is enabled");
+  }
+  return true;
+}
+
+Status FrameDecoder::InitFrameOutput() {
+  JXL_RETURN_IF_ERROR(
+      InitializePassesSharedState(frame_header_, &dec_state_->shared_storage));
+  JXL_RETURN_IF_ERROR(dec_state_->Init());
+  modular_frame_decoder_.Init(frame_dim_);
+
+  if (decoded_->IsJPEG()) {
+    if (frame_header_.encoding == FrameEncoding::kModular) {
+      return JXL_FAILURE("Cannot output JPEG from Modular");
+    }
+    jpeg::JPEGData* jpeg_data = decoded_->jpeg_data.get();
+    size_t num_components = jpeg_data->components.size();
+    if (num_components != 1 && num_components != 3) {
+      return JXL_FAILURE("Invalid number of components");
+    }
+    if (frame_header_.nonserialized_metadata->m.xyb_encoded) {
+      return JXL_FAILURE("Cannot decode to JPEG an XYB image");
+    }
+    auto jpeg_c_map = JpegOrder(ColorTransform::kYCbCr, num_components == 1);
+    decoded_->jpeg_data->width = frame_dim_.xsize;
+    decoded_->jpeg_data->height = frame_dim_.ysize;
+    for (size_t c = 0; c < num_components; c++) {
+      auto& component = jpeg_data->components[jpeg_c_map[c]];
+      component.width_in_blocks =
+          frame_dim_.xsize_blocks >> frame_header_.chroma_subsampling.HShift(c);
+      component.height_in_blocks =
+          frame_dim_.ysize_blocks >> frame_header_.chroma_subsampling.VShift(c);
+      component.h_samp_factor =
+          1 << frame_header_.chroma_subsampling.RawHShift(c);
+      component.v_samp_factor =
+          1 << frame_header_.chroma_subsampling.RawVShift(c);
+      component.coeffs.resize(component.width_in_blocks *
+                              component.height_in_blocks * jxl::kDCTBlockSize);
+    }
+  }
+
+  // Clear the state.
+  decoded_dc_global_ = false;
+  decoded_ac_global_ = false;
+  is_finalized_ = false;
+  finalized_dc_ = false;
+  num_sections_done_ = 0;
+  decoded_dc_groups_.clear();
+  decoded_dc_groups_.resize(frame_dim_.num_dc_groups);
+  decoded_passes_per_ac_group_.clear();
+  decoded_passes_per_ac_group_.resize(frame_dim_.num_groups, 0);
+  processed_section_.clear();
+  processed_section_.resize(toc_.size());
+  allocated_ = false;
+  return true;
+}
+
+Status FrameDecoder::ProcessDCGlobal(BitReader* br) {
+  PROFILER_FUNC;
+  PassesSharedState& shared = dec_state_->shared_storage;
+  if (shared.frame_header.flags & FrameHeader::kPatches) {
+    bool uses_extra_channels = false;
+    JXL_RETURN_IF_ERROR(shared.image_features.patches.Decode(
+        br, frame_dim_.xsize_padded, frame_dim_.ysize_padded,
+        &uses_extra_channels));
+    if (uses_extra_channels && frame_header_.upsampling != 1) {
+      for (size_t ecups : frame_header_.extra_channel_upsampling) {
+        if (ecups != frame_header_.upsampling) {
+          return JXL_FAILURE(
+              "Cannot use extra channels in patches if color channels are "
+              "subsampled differently from extra channels");
+        }
+      }
+    }
+  } else {
+    shared.image_features.patches.Clear();
+  }
+  shared.image_features.splines.Clear();
+  if (shared.frame_header.flags & FrameHeader::kSplines) {
+    JXL_RETURN_IF_ERROR(shared.image_features.splines.Decode(
+        br, frame_dim_.xsize * frame_dim_.ysize));
+  }
+  if (shared.frame_header.flags & FrameHeader::kNoise) {
+    JXL_RETURN_IF_ERROR(DecodeNoise(br, &shared.image_features.noise_params));
+  }
+  JXL_RETURN_IF_ERROR(dec_state_->shared_storage.matrices.DecodeDC(br));
+
+  if (frame_header_.encoding == FrameEncoding::kVarDCT) {
+    JXL_RETURN_IF_ERROR(
+        jxl::DecodeGlobalDCInfo(br, decoded_->IsJPEG(), dec_state_, pool_));
+  }
+  // Splines' draw cache uses the color correlation map.
+  if (shared.frame_header.flags & FrameHeader::kSplines) {
+    JXL_RETURN_IF_ERROR(shared.image_features.splines.InitializeDrawCache(
+        frame_dim_.xsize_upsampled, frame_dim_.ysize_upsampled,
+        dec_state_->shared->cmap));
+  }
+  Status dec_status = modular_frame_decoder_.DecodeGlobalInfo(
+      br, frame_header_, /*allow_truncated_group=*/false);
+  if (dec_status.IsFatalError()) return dec_status;
+  if (dec_status) {
+    decoded_dc_global_ = true;
+  }
+  return dec_status;
+}
+
+Status FrameDecoder::ProcessDCGroup(size_t dc_group_id, BitReader* br) {
+  PROFILER_FUNC;
+  const size_t gx = dc_group_id % frame_dim_.xsize_dc_groups;
+  const size_t gy = dc_group_id / frame_dim_.xsize_dc_groups;
+  const LoopFilter& lf = dec_state_->shared->frame_header.loop_filter;
+  if (frame_header_.encoding == FrameEncoding::kVarDCT &&
+      !(frame_header_.flags & FrameHeader::kUseDcFrame)) {
+    JXL_RETURN_IF_ERROR(
+        modular_frame_decoder_.DecodeVarDCTDC(dc_group_id, br, dec_state_));
+  }
+  const Rect mrect(gx * frame_dim_.dc_group_dim, gy * frame_dim_.dc_group_dim,
+                   frame_dim_.dc_group_dim, frame_dim_.dc_group_dim);
+  JXL_RETURN_IF_ERROR(modular_frame_decoder_.DecodeGroup(
+      mrect, br, 3, 1000, ModularStreamId::ModularDC(dc_group_id),
+      /*zerofill=*/false, nullptr, nullptr,
+      /*allow_truncated=*/false));
+  if (frame_header_.encoding == FrameEncoding::kVarDCT) {
+    JXL_RETURN_IF_ERROR(
+        modular_frame_decoder_.DecodeAcMetadata(dc_group_id, br, dec_state_));
+  } else if (lf.epf_iters > 0) {
+    FillImage(kInvSigmaNum / lf.epf_sigma_for_modular, &dec_state_->sigma);
+  }
+  decoded_dc_groups_[dc_group_id] = uint8_t{true};
+  return true;
+}
+
+void FrameDecoder::FinalizeDC() {
+  // Do Adaptive DC smoothing if enabled. This *must* happen between all the
+  // ProcessDCGroup and ProcessACGroup.
+  if (frame_header_.encoding == FrameEncoding::kVarDCT &&
+      !(frame_header_.flags & FrameHeader::kSkipAdaptiveDCSmoothing) &&
+      !(frame_header_.flags & FrameHeader::kUseDcFrame)) {
+    AdaptiveDCSmoothing(dec_state_->shared->quantizer.MulDC(),
+                        &dec_state_->shared_storage.dc_storage, pool_);
+  }
+
+  finalized_dc_ = true;
+}
+
+Status FrameDecoder::AllocateOutput() {
+  if (allocated_) return true;
+  modular_frame_decoder_.MaybeDropFullImage();
+  decoded_->origin = dec_state_->shared->frame_header.frame_origin;
+  JXL_RETURN_IF_ERROR(dec_state_->InitForAC(nullptr));
+  allocated_ = true;
+  return true;
+}
+
+Status FrameDecoder::ProcessACGlobal(BitReader* br) {
+  JXL_CHECK(finalized_dc_);
+
+  // Decode AC group.
+  if (frame_header_.encoding == FrameEncoding::kVarDCT) {
+    JXL_RETURN_IF_ERROR(dec_state_->shared_storage.matrices.Decode(
+        br, &modular_frame_decoder_));
+    JXL_RETURN_IF_ERROR(dec_state_->shared_storage.matrices.EnsureComputed(
+        dec_state_->used_acs));
+
+    size_t num_histo_bits =
+        CeilLog2Nonzero(dec_state_->shared->frame_dim.num_groups);
+    dec_state_->shared_storage.num_histograms =
+        1 + br->ReadBits(num_histo_bits);
+
+    dec_state_->code.resize(kMaxNumPasses);
+    dec_state_->context_map.resize(kMaxNumPasses);
+    // Read coefficient orders and histograms.
+    size_t max_num_bits_ac = 0;
+    for (size_t i = 0;
+         i < dec_state_->shared_storage.frame_header.passes.num_passes; i++) {
+      uint16_t used_orders = U32Coder::Read(kOrderEnc, br);
+      JXL_RETURN_IF_ERROR(DecodeCoeffOrders(
+          used_orders, dec_state_->used_acs,
+          &dec_state_->shared_storage
+               .coeff_orders[i * dec_state_->shared_storage.coeff_order_size],
+          br));
+      size_t num_contexts =
+          dec_state_->shared->num_histograms *
+          dec_state_->shared_storage.block_ctx_map.NumACContexts();
+      JXL_RETURN_IF_ERROR(DecodeHistograms(
+          br, num_contexts, &dec_state_->code[i], &dec_state_->context_map[i]));
+      // Add extra values to enable the cheat in hot loop of DecodeACVarBlock.
+      dec_state_->context_map[i].resize(
+          num_contexts + kZeroDensityContextLimit - kZeroDensityContextCount);
+      max_num_bits_ac =
+          std::max(max_num_bits_ac, dec_state_->code[i].max_num_bits);
+    }
+    max_num_bits_ac += CeilLog2Nonzero(
+        dec_state_->shared_storage.frame_header.passes.num_passes);
+    // 16-bit buffer for decoding to JPEG are not implemented.
+    // TODO(veluca): figure out the exact limit - 16 should still work with
+    // 16-bit buffers, but we are excluding it for safety.
+    bool use_16_bit = max_num_bits_ac < 16 && !decoded_->IsJPEG();
+    bool store = frame_header_.passes.num_passes > 1;
+    size_t xs = store ? kGroupDim * kGroupDim : 0;
+    size_t ys = store ? frame_dim_.num_groups : 0;
+    if (use_16_bit) {
+      dec_state_->coefficients = make_unique<ACImageT<int16_t>>(xs, ys);
+    } else {
+      dec_state_->coefficients = make_unique<ACImageT<int32_t>>(xs, ys);
+    }
+    if (store) {
+      dec_state_->coefficients->ZeroFill();
+    }
+  }
+
+  // Set JPEG decoding data.
+  if (decoded_->IsJPEG()) {
+    decoded_->color_transform = frame_header_.color_transform;
+    decoded_->chroma_subsampling = frame_header_.chroma_subsampling;
+    const std::vector<QuantEncoding>& qe =
+        dec_state_->shared_storage.matrices.encodings();
+    if (qe.empty() || qe[0].mode != QuantEncoding::Mode::kQuantModeRAW ||
+        std::abs(qe[0].qraw.qtable_den - 1.f / (8 * 255)) > 1e-8f) {
+      return JXL_FAILURE(
+          "Quantization table is not a JPEG quantization table.");
+    }
+    jpeg::JPEGData* jpeg_data = decoded_->jpeg_data.get();
+    size_t num_components = jpeg_data->components.size();
+    bool is_gray = (num_components == 1);
+    auto jpeg_c_map = JpegOrder(frame_header_.color_transform, is_gray);
+    size_t qt_set = 0;
+    for (size_t c = 0; c < num_components; c++) {
+      // TODO(eustas): why 1-st quant table for gray?
+      size_t quant_c = is_gray ? 1 : c;
+      size_t qpos = jpeg_data->components[jpeg_c_map[c]].quant_idx;
+      JXL_CHECK(qpos != jpeg_data->quant.size());
+      qt_set |= 1 << qpos;
+      for (size_t x = 0; x < 8; x++) {
+        for (size_t y = 0; y < 8; y++) {
+          jpeg_data->quant[qpos].values[x * 8 + y] =
+              (*qe[0].qraw.qtable)[quant_c * 64 + y * 8 + x];
+        }
+      }
+    }
+    for (size_t i = 0; i < jpeg_data->quant.size(); i++) {
+      if (qt_set & (1 << i)) continue;
+      if (i == 0) return JXL_FAILURE("First quant table unused.");
+      // Unused quant table is set to copy of previous quant table
+      for (size_t j = 0; j < 64; j++) {
+        jpeg_data->quant[i].values[j] = jpeg_data->quant[i - 1].values[j];
+      }
+    }
+  }
+  decoded_ac_global_ = true;
+  return true;
+}
+
+Status FrameDecoder::ProcessACGroup(size_t ac_group_id,
+                                    BitReader* JXL_RESTRICT* br,
+                                    size_t num_passes, size_t thread,
+                                    bool force_draw, bool dc_only) {
+  PROFILER_ZONE("process_group");
+  size_t group_dim = frame_dim_.group_dim;
+  const size_t gx = ac_group_id % frame_dim_.xsize_groups;
+  const size_t gy = ac_group_id / frame_dim_.xsize_groups;
+  const size_t x = gx * group_dim;
+  const size_t y = gy * group_dim;
+  JXL_DEBUG_V(3,
+              "Processing AC group %" PRIuS "(%" PRIuS ",%" PRIuS
+              ") group_dim: %" PRIuS " decoded passes: %u new passes: %" PRIuS,
+              ac_group_id, gx, gy, group_dim,
+              decoded_passes_per_ac_group_[ac_group_id], num_passes);
+
+  RenderPipelineInput render_pipeline_input =
+      dec_state_->render_pipeline->GetInputBuffers(ac_group_id, thread);
+
+  bool should_run_pipeline = true;
+
+  if (frame_header_.encoding == FrameEncoding::kVarDCT) {
+    group_dec_caches_[thread].InitOnce(frame_header_.passes.num_passes,
+                                       dec_state_->used_acs);
+    JXL_RETURN_IF_ERROR(DecodeGroup(br, num_passes, ac_group_id, dec_state_,
+                                    &group_dec_caches_[thread], thread,
+                                    render_pipeline_input, decoded_,
+                                    decoded_passes_per_ac_group_[ac_group_id],
+                                    force_draw, dc_only, &should_run_pipeline));
+  }
+
+  // don't limit to image dimensions here (is done in DecodeGroup)
+  const Rect mrect(x, y, group_dim, group_dim);
+  bool modular_ready = false;
+  size_t pass0 = decoded_passes_per_ac_group_[ac_group_id];
+  size_t pass1 =
+      force_draw ? frame_header_.passes.num_passes : pass0 + num_passes;
+  for (size_t i = pass0; i < pass1; ++i) {
+    int minShift, maxShift;
+    frame_header_.passes.GetDownsamplingBracket(i, minShift, maxShift);
+    bool modular_pass_ready = true;
+    if (i < pass0 + num_passes) {
+      JXL_RETURN_IF_ERROR(modular_frame_decoder_.DecodeGroup(
+          mrect, br[i - pass0], minShift, maxShift,
+          ModularStreamId::ModularAC(ac_group_id, i),
+          /*zerofill=*/false, dec_state_, &render_pipeline_input,
+          /*allow_truncated=*/false, &modular_pass_ready));
+    } else {
+      JXL_RETURN_IF_ERROR(modular_frame_decoder_.DecodeGroup(
+          mrect, nullptr, minShift, maxShift,
+          ModularStreamId::ModularAC(ac_group_id, i), /*zerofill=*/true,
+          dec_state_, &render_pipeline_input,
+          /*allow_truncated=*/false, &modular_pass_ready));
+    }
+    if (modular_pass_ready) modular_ready = true;
+  }
+  decoded_passes_per_ac_group_[ac_group_id] += num_passes;
+
+  if ((frame_header_.flags & FrameHeader::kNoise) != 0) {
+    PROFILER_ZONE("GenerateNoise");
+    size_t noise_c_start =
+        3 + frame_header_.nonserialized_metadata->m.num_extra_channels;
+    // When the color channels are downsampled, we need to generate more noise
+    // input for the current group than just the group dimensions.
+    std::pair<ImageF*, Rect> rects[3];
+    for (size_t iy = 0; iy < frame_header_.upsampling; iy++) {
+      for (size_t ix = 0; ix < frame_header_.upsampling; ix++) {
+        for (size_t c = 0; c < 3; c++) {
+          auto r = render_pipeline_input.GetBuffer(noise_c_start + c);
+          rects[c].first = r.first;
+          size_t x1 = r.second.x0() + r.second.xsize();
+          size_t y1 = r.second.y0() + r.second.ysize();
+          rects[c].second = Rect(r.second.x0() + ix * group_dim,
+                                 r.second.y0() + iy * group_dim, group_dim,
+                                 group_dim, x1, y1);
+        }
+        Random3Planes(dec_state_->visible_frame_index,
+                      dec_state_->nonvisible_frame_index,
+                      (gx * frame_header_.upsampling + ix) * group_dim,
+                      (gy * frame_header_.upsampling + iy) * group_dim,
+                      rects[0], rects[1], rects[2]);
+      }
+    }
+  }
+
+  if (!modular_frame_decoder_.UsesFullImage() && !decoded_->IsJPEG()) {
+    if (should_run_pipeline && modular_ready) {
+      render_pipeline_input.Done();
+    } else if (force_draw) {
+      return JXL_FAILURE("Modular group decoding failed.");
+    }
+  }
+  return true;
+}
+
+void FrameDecoder::MarkSections(const SectionInfo* sections, size_t num,
+                                SectionStatus* section_status) {
+  num_sections_done_ += num;
+  for (size_t i = 0; i < num; i++) {
+    if (section_status[i] != SectionStatus::kDone) {
+      processed_section_[sections[i].id] = false;
+      num_sections_done_--;
+    }
+  }
+}
+
+Status FrameDecoder::ProcessSections(const SectionInfo* sections, size_t num,
+                                     SectionStatus* section_status) {
+  if (num == 0) return true;  // Nothing to process
+  std::fill(section_status, section_status + num, SectionStatus::kSkipped);
+  size_t dc_global_sec = num;
+  size_t ac_global_sec = num;
+  std::vector<size_t> dc_group_sec(frame_dim_.num_dc_groups, num);
+  std::vector<std::vector<size_t>> ac_group_sec(
+      frame_dim_.num_groups,
+      std::vector<size_t>(frame_header_.passes.num_passes, num));
+  // This keeps track of the number of ac passes we want to process during this
+  // call of ProcessSections.
+  std::vector<size_t> desired_num_ac_passes(frame_dim_.num_groups);
+  bool single_section =
+      frame_dim_.num_groups == 1 && frame_header_.passes.num_passes == 1;
+  if (single_section) {
+    JXL_ASSERT(num == 1);
+    JXL_ASSERT(sections[0].id == 0);
+    if (processed_section_[0] == false) {
+      processed_section_[0] = true;
+      ac_group_sec[0].resize(1);
+      dc_global_sec = ac_global_sec = dc_group_sec[0] = ac_group_sec[0][0] = 0;
+      desired_num_ac_passes[0] = 1;
+    } else {
+      section_status[0] = SectionStatus::kDuplicate;
+    }
+  } else {
+    size_t ac_global_index = frame_dim_.num_dc_groups + 1;
+    for (size_t i = 0; i < num; i++) {
+      JXL_ASSERT(sections[i].id < processed_section_.size());
+      if (processed_section_[sections[i].id]) {
+        section_status[i] = SectionStatus::kDuplicate;
+        continue;
+      }
+      if (sections[i].id == 0) {
+        dc_global_sec = i;
+      } else if (sections[i].id < ac_global_index) {
+        dc_group_sec[sections[i].id - 1] = i;
+      } else if (sections[i].id == ac_global_index) {
+        ac_global_sec = i;
+      } else {
+        size_t ac_idx = sections[i].id - ac_global_index - 1;
+        size_t acg = ac_idx % frame_dim_.num_groups;
+        size_t acp = ac_idx / frame_dim_.num_groups;
+        if (acp >= frame_header_.passes.num_passes) {
+          return JXL_FAILURE("Invalid section ID");
+        }
+        ac_group_sec[acg][acp] = i;
+      }
+      processed_section_[sections[i].id] = true;
+    }
+    // Count number of new passes per group.
+    for (size_t g = 0; g < ac_group_sec.size(); g++) {
+      size_t j = 0;
+      for (; j + decoded_passes_per_ac_group_[g] <
+             frame_header_.passes.num_passes;
+           j++) {
+        if (ac_group_sec[g][j + decoded_passes_per_ac_group_[g]] == num) {
+          break;
+        }
+      }
+      desired_num_ac_passes[g] = j;
+    }
+  }
+  if (dc_global_sec != num) {
+    Status dc_global_status = ProcessDCGlobal(sections[dc_global_sec].br);
+    if (dc_global_status.IsFatalError()) return dc_global_status;
+    if (dc_global_status) {
+      section_status[dc_global_sec] = SectionStatus::kDone;
+    } else {
+      section_status[dc_global_sec] = SectionStatus::kPartial;
+    }
+  }
+
+  std::atomic<bool> has_error{false};
+  if (decoded_dc_global_) {
+    JXL_RETURN_IF_ERROR(RunOnPool(
+        pool_, 0, dc_group_sec.size(), ThreadPool::NoInit,
+        [this, &dc_group_sec, &num, &sections, &section_status, &has_error](
+            size_t i, size_t thread) {
+          if (dc_group_sec[i] != num) {
+            if (!ProcessDCGroup(i, sections[dc_group_sec[i]].br)) {
+              has_error = true;
+            } else {
+              section_status[dc_group_sec[i]] = SectionStatus::kDone;
+            }
+          }
+        },
+        "DecodeDCGroup"));
+  }
+  if (has_error) return JXL_FAILURE("Error in DC group");
+
+  if (*std::min_element(decoded_dc_groups_.begin(), decoded_dc_groups_.end()) &&
+      !finalized_dc_) {
+    PassesDecoderState::PipelineOptions pipeline_options;
+    pipeline_options.use_slow_render_pipeline = use_slow_rendering_pipeline_;
+    pipeline_options.coalescing = coalescing_;
+    pipeline_options.render_spotcolors = render_spotcolors_;
+    JXL_RETURN_IF_ERROR(
+        dec_state_->PreparePipeline(decoded_, pipeline_options));
+    FinalizeDC();
+    JXL_RETURN_IF_ERROR(AllocateOutput());
+    if (progressive_detail_ >= JxlProgressiveDetail::kDC) {
+      MarkSections(sections, num, section_status);
+      return true;
+    }
+  }
+
+  if (finalized_dc_ && ac_global_sec != num && !decoded_ac_global_) {
+    JXL_RETURN_IF_ERROR(ProcessACGlobal(sections[ac_global_sec].br));
+    section_status[ac_global_sec] = SectionStatus::kDone;
+  }
+
+  if (progressive_detail_ >= JxlProgressiveDetail::kLastPasses) {
+    // Mark that we only want the next progression pass.
+    size_t target_complete_passes = NextNumPassesToPause();
+    for (size_t i = 0; i < ac_group_sec.size(); i++) {
+      desired_num_ac_passes[i] =
+          std::min(desired_num_ac_passes[i],
+                   target_complete_passes - decoded_passes_per_ac_group_[i]);
+    }
+  }
+
+  if (decoded_ac_global_) {
+    // Mark all the AC groups that we received as not complete yet.
+    for (size_t i = 0; i < ac_group_sec.size(); i++) {
+      if (desired_num_ac_passes[i] != 0) {
+        dec_state_->render_pipeline->ClearDone(i);
+      }
+    }
+
+    JXL_RETURN_IF_ERROR(RunOnPool(
+        pool_, 0, ac_group_sec.size(),
+        [this](size_t num_threads) {
+          return PrepareStorage(num_threads,
+                                decoded_passes_per_ac_group_.size());
+        },
+        [this, &ac_group_sec, &desired_num_ac_passes, &num, &sections,
+         &section_status, &has_error](size_t g, size_t thread) {
+          if (desired_num_ac_passes[g] == 0) {
+            // no new AC pass, nothing to do
+            return;
+          }
+          (void)num;
+          size_t first_pass = decoded_passes_per_ac_group_[g];
+          BitReader* JXL_RESTRICT readers[kMaxNumPasses];
+          for (size_t i = 0; i < desired_num_ac_passes[g]; i++) {
+            JXL_ASSERT(ac_group_sec[g][first_pass + i] != num);
+            readers[i] = sections[ac_group_sec[g][first_pass + i]].br;
+          }
+          if (!ProcessACGroup(g, readers, desired_num_ac_passes[g],
+                              GetStorageLocation(thread, g),
+                              /*force_draw=*/false, /*dc_only=*/false)) {
+            has_error = true;
+          } else {
+            for (size_t i = 0; i < desired_num_ac_passes[g]; i++) {
+              section_status[ac_group_sec[g][first_pass + i]] =
+                  SectionStatus::kDone;
+            }
+          }
+        },
+        "DecodeGroup"));
+  }
+  if (has_error) return JXL_FAILURE("Error in AC group");
+
+  MarkSections(sections, num, section_status);
+  return true;
+}
+
+Status FrameDecoder::Flush() {
+  bool has_blending = frame_header_.blending_info.mode != BlendMode::kReplace ||
+                      frame_header_.custom_size_or_origin;
+  for (const auto& blending_info_ec :
+       frame_header_.extra_channel_blending_info) {
+    if (blending_info_ec.mode != BlendMode::kReplace) has_blending = true;
+  }
+  // No early Flush() if blending is enabled.
+  if (has_blending && !is_finalized_) {
+    return false;
+  }
+  // No early Flush() - nothing to do - if the frame is a kSkipProgressive
+  // frame.
+  if (frame_header_.frame_type == FrameType::kSkipProgressive &&
+      !is_finalized_) {
+    return true;
+  }
+  if (decoded_->IsJPEG()) {
+    // Nothing to do.
+    return true;
+  }
+  JXL_RETURN_IF_ERROR(AllocateOutput());
+
+  uint32_t completely_decoded_ac_pass = *std::min_element(
+      decoded_passes_per_ac_group_.begin(), decoded_passes_per_ac_group_.end());
+  if (completely_decoded_ac_pass < frame_header_.passes.num_passes) {
+    // We don't have all AC yet: force a draw of all the missing areas.
+    // Mark all sections as not complete.
+    for (size_t i = 0; i < decoded_passes_per_ac_group_.size(); i++) {
+      if (decoded_passes_per_ac_group_[i] < frame_header_.passes.num_passes) {
+        dec_state_->render_pipeline->ClearDone(i);
+      }
+    }
+    std::atomic<bool> has_error{false};
+    JXL_RETURN_IF_ERROR(RunOnPool(
+        pool_, 0, decoded_passes_per_ac_group_.size(),
+        [this](const size_t num_threads) {
+          return PrepareStorage(num_threads,
+                                decoded_passes_per_ac_group_.size());
+        },
+        [this, &has_error](const uint32_t g, size_t thread) {
+          if (decoded_passes_per_ac_group_[g] ==
+              frame_header_.passes.num_passes) {
+            // This group was drawn already, nothing to do.
+            return;
+          }
+          BitReader* JXL_RESTRICT readers[kMaxNumPasses] = {};
+          bool ok = ProcessACGroup(
+              g, readers, /*num_passes=*/0, GetStorageLocation(thread, g),
+              /*force_draw=*/true, /*dc_only=*/!decoded_ac_global_);
+          if (!ok) has_error = true;
+        },
+        "ForceDrawGroup"));
+    if (has_error) {
+      return JXL_FAILURE("Drawing groups failed");
+    }
+  }
+
+  // undo global modular transforms and copy int pixel buffers to float ones
+  JXL_RETURN_IF_ERROR(modular_frame_decoder_.FinalizeDecoding(dec_state_, pool_,
+                                                              is_finalized_));
+
+  return true;
+}
+
+int FrameDecoder::SavedAs(const FrameHeader& header) {
+  if (header.frame_type == FrameType::kDCFrame) {
+    // bits 16, 32, 64, 128 for DC level
+    return 16 << (header.dc_level - 1);
+  } else if (header.CanBeReferenced()) {
+    // bits 1, 2, 4 and 8 for the references
+    return 1 << header.save_as_reference;
+  }
+
+  return 0;
+}
+
+bool FrameDecoder::HasEverything() const {
+  if (!decoded_dc_global_) return false;
+  if (!decoded_ac_global_) return false;
+  for (auto& have_dc_group : decoded_dc_groups_) {
+    if (!have_dc_group) return false;
+  }
+  for (auto& nb_passes : decoded_passes_per_ac_group_) {
+    if (nb_passes < frame_header_.passes.num_passes) return false;
+  }
+  return true;
+}
+
+int FrameDecoder::References() const {
+  if (is_finalized_) {
+    return 0;
+  }
+  if (!HasEverything()) return 0;
+
+  int result = 0;
+
+  // Blending
+  if (frame_header_.frame_type == FrameType::kRegularFrame ||
+      frame_header_.frame_type == FrameType::kSkipProgressive) {
+    bool cropped = frame_header_.custom_size_or_origin;
+    if (cropped || frame_header_.blending_info.mode != BlendMode::kReplace) {
+      result |= (1 << frame_header_.blending_info.source);
+    }
+    const auto& extra = frame_header_.extra_channel_blending_info;
+    for (size_t i = 0; i < extra.size(); ++i) {
+      if (cropped || extra[i].mode != BlendMode::kReplace) {
+        result |= (1 << extra[i].source);
+      }
+    }
+  }
+
+  // Patches
+  if (frame_header_.flags & FrameHeader::kPatches) {
+    result |= dec_state_->shared->image_features.patches.GetReferences();
+  }
+
+  // DC Level
+  if (frame_header_.flags & FrameHeader::kUseDcFrame) {
+    // Reads from the next dc level
+    int dc_level = frame_header_.dc_level + 1;
+    // bits 16, 32, 64, 128 for DC level
+    result |= (16 << (dc_level - 1));
+  }
+
+  return result;
+}
+
+Status FrameDecoder::FinalizeFrame() {
+  if (is_finalized_) {
+    return JXL_FAILURE("FinalizeFrame called multiple times");
+  }
+  is_finalized_ = true;
+  if (decoded_->IsJPEG()) {
+    // Nothing to do.
+    return true;
+  }
+
+  // undo global modular transforms and copy int pixel buffers to float ones
+  JXL_RETURN_IF_ERROR(
+      modular_frame_decoder_.FinalizeDecoding(dec_state_, pool_,
+                                              /*inplace=*/true));
+
+  if (frame_header_.CanBeReferenced()) {
+    auto& info = dec_state_->shared_storage
+                     .reference_frames[frame_header_.save_as_reference];
+    info.frame = std::move(dec_state_->frame_storage_for_referencing);
+    info.ib_is_in_xyb = frame_header_.save_before_color_transform;
+  }
+  return true;
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/dec_frame.h b/third_party/jpeg-xl/lib/jxl/dec_frame.h
new file mode 100644
index 0000000000..6b54ac631f
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/dec_frame.h
@@ -0,0 +1,329 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_DEC_FRAME_H_
+#define LIB_JXL_DEC_FRAME_H_
+
+#include <jxl/decode.h>
+#include <jxl/types.h>
+#include <stdint.h>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/span.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/blending.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/dec_cache.h"
+#include "lib/jxl/dec_modular.h"
+#include "lib/jxl/frame_header.h"
+#include "lib/jxl/headers.h"
+#include "lib/jxl/image_bundle.h"
+
+namespace jxl {
+
+// Decodes a frame. Groups may be processed in parallel by `pool`.
+// `metadata` is the metadata that applies to all frames of the codestream
+// `decoded->metadata` must already be set and must match metadata.m.
+// Used in the encoder to model decoder behaviour, and in tests.
+Status DecodeFrame(PassesDecoderState* dec_state, ThreadPool* JXL_RESTRICT pool,
+                   const uint8_t* next_in, size_t avail_in,
+                   ImageBundle* decoded, const CodecMetadata& metadata,
+                   bool use_slow_rendering_pipeline = false);
+
+// TODO(veluca): implement "forced drawing".
+class FrameDecoder {
+ public:
+  // All parameters must outlive the FrameDecoder.
+  FrameDecoder(PassesDecoderState* dec_state, const CodecMetadata& metadata,
+               ThreadPool* pool, bool use_slow_rendering_pipeline)
+      : dec_state_(dec_state),
+        pool_(pool),
+        frame_header_(&metadata),
+        use_slow_rendering_pipeline_(use_slow_rendering_pipeline) {}
+
+  void SetRenderSpotcolors(bool rsc) { render_spotcolors_ = rsc; }
+  void SetCoalescing(bool c) { coalescing_ = c; }
+
+  // Read FrameHeader and table of contents from the given BitReader.
+  Status InitFrame(BitReader* JXL_RESTRICT br, ImageBundle* decoded,
+                   bool is_preview);
+
+  // Checks frame dimensions for their limits, and sets the output
+  // image buffer.
+  Status InitFrameOutput();
+
+  struct SectionInfo {
+    BitReader* JXL_RESTRICT br;
+    // Logical index of the section, regardless of any permutation that may be
+    // applied in the table of contents or of the physical position in the file.
+    size_t id;
+    // Index of the section in the order of the bytes inside the frame.
+    size_t index;
+  };
+
+  struct TocEntry {
+    size_t size;
+    size_t id;
+  };
+
+  enum SectionStatus {
+    // Processed correctly.
+    kDone = 0,
+    // Skipped because other required sections were not yet processed.
+    kSkipped = 1,
+    // Skipped because the section was already processed.
+    kDuplicate = 2,
+    // Only partially decoded: the section will need to be processed again.
+    kPartial = 3,
+  };
+
+  // Processes `num` sections; each SectionInfo contains the index
+  // of the section and a BitReader that only contains the data of the section.
+  // `section_status` should point to `num` elements, and will be filled with
+  // information about whether each section was processed or not.
+  // A section is a part of the encoded file that is indexed by the TOC.
+  Status ProcessSections(const SectionInfo* sections, size_t num,
+                         SectionStatus* section_status);
+
+  // Flushes all the data decoded so far to pixels.
+  Status Flush();
+
+  // Runs final operations once a frame data is decoded.
+  // Must be called exactly once per frame, after all calls to ProcessSections.
+  Status FinalizeFrame();
+
+  // Returns dependencies of this frame on reference ids as a bit mask: bits 0-3
+  // indicate reference frame 0-3 for patches and blending, bits 4-7 indicate DC
+  // frames this frame depends on. Only returns a valid result after all calls
+  // to ProcessSections are finished and before FinalizeFrame.
+  int References() const;
+
+  // Returns reference id of storage location where this frame is stored as a
+  // bit flag, or 0 if not stored.
+  // Matches the bit mask used for GetReferences: bits 0-3 indicate it is stored
+  // for patching or blending, bits 4-7 indicate DC frame.
+  // Unlike References, can be ran at any time as
+  // soon as the frame header is known.
+  static int SavedAs(const FrameHeader& header);
+
+  uint64_t SumSectionSizes() const { return section_sizes_sum_; }
+  const std::vector<TocEntry>& Toc() const { return toc_; }
+
+  const FrameHeader& GetFrameHeader() const { return frame_header_; }
+
+  // Returns whether a DC image has been decoded, accessible at low resolution
+  // at passes.shared_storage.dc_storage
+  bool HasDecodedDC() const { return finalized_dc_; }
+  bool HasDecodedAll() const { return toc_.size() == num_sections_done_; }
+
+  size_t NumCompletePasses() const {
+    return *std::min_element(decoded_passes_per_ac_group_.begin(),
+                             decoded_passes_per_ac_group_.end());
+  }
+
+  // If enabled, ProcessSections will stop and return true when the DC
+  // sections have been processed, instead of starting the AC sections. This
+  // will only occur if supported (that is, flushing will produce a valid
+  // 1/8th*1/8th resolution image). The return value of true then does not mean
+  // all sections have been processed, use HasDecodedDC and HasDecodedAll
+  // to check the true finished state.
+  // Returns the progressive detail that will be effective for the frame.
+  JxlProgressiveDetail SetPauseAtProgressive(JxlProgressiveDetail prog_detail) {
+    bool single_section =
+        frame_dim_.num_groups == 1 && frame_header_.passes.num_passes == 1;
+    if (frame_header_.frame_type != kSkipProgressive &&
+        // If there's only one group and one pass, there is no separate section
+        // for DC and the entire full resolution image is available at once.
+        !single_section &&
+        // If extra channels are encoded with modular without squeeze, they
+        // don't support DC. If the are encoded with squeeze, DC works in theory
+        // but the implementation may not yet correctly support this for Flush.
+        // Therefore, can't correctly pause for a progressive step if there is
+        // an extra channel (including alpha channel)
+        // TODO(firsching): Check if this is still the case.
+        decoded_->metadata()->extra_channel_info.empty() &&
+        // DC is not guaranteed to be available in modular mode and may be a
+        // black image. If squeeze is used, it may be available depending on the
+        // current implementation.
+        // TODO(lode): do return DC if it's known that flushing at this point
+        // will produce a valid 1/8th downscaled image with modular encoding.
+        frame_header_.encoding == FrameEncoding::kVarDCT) {
+      progressive_detail_ = prog_detail;
+    } else {
+      progressive_detail_ = JxlProgressiveDetail::kFrames;
+    }
+    if (progressive_detail_ >= JxlProgressiveDetail::kPasses) {
+      for (size_t i = 1; i < frame_header_.passes.num_passes; ++i) {
+        passes_to_pause_.push_back(i);
+      }
+    } else if (progressive_detail_ >= JxlProgressiveDetail::kLastPasses) {
+      for (size_t i = 0; i < frame_header_.passes.num_downsample; ++i) {
+        passes_to_pause_.push_back(frame_header_.passes.last_pass[i] + 1);
+      }
+      // The format does not guarantee that these values are sorted.
+      std::sort(passes_to_pause_.begin(), passes_to_pause_.end());
+    }
+    return progressive_detail_;
+  }
+
+  size_t NextNumPassesToPause() const {
+    auto it = std::upper_bound(passes_to_pause_.begin(), passes_to_pause_.end(),
+                               NumCompletePasses());
+    return (it != passes_to_pause_.end() ? *it
+                                         : std::numeric_limits<size_t>::max());
+  }
+
+  // Sets the pixel callback or image buffer where the pixels will be decoded.
+  //
+  // @param undo_orientation: if true, indicates the frame decoder should apply
+  // the exif orientation to bring the image to the intended display
+  // orientation.
+  void SetImageOutput(const PixelCallback& pixel_callback, void* image_buffer,
+                      size_t image_buffer_size, size_t xsize, size_t ysize,
+                      JxlPixelFormat format, size_t bits_per_sample,
+                      bool unpremul_alpha, bool undo_orientation) const {
+    dec_state_->width = xsize;
+    dec_state_->height = ysize;
+    dec_state_->main_output.format = format;
+    dec_state_->main_output.bits_per_sample = bits_per_sample;
+    dec_state_->main_output.callback = pixel_callback;
+    dec_state_->main_output.buffer = image_buffer;
+    dec_state_->main_output.buffer_size = image_buffer_size;
+    dec_state_->main_output.stride = GetStride(xsize, format);
+    const jxl::ExtraChannelInfo* alpha =
+        decoded_->metadata()->Find(jxl::ExtraChannel::kAlpha);
+    if (alpha && alpha->alpha_associated && unpremul_alpha) {
+      dec_state_->unpremul_alpha = true;
+    }
+    if (undo_orientation) {
+      dec_state_->undo_orientation = decoded_->metadata()->GetOrientation();
+      if (static_cast<int>(dec_state_->undo_orientation) > 4) {
+        std::swap(dec_state_->width, dec_state_->height);
+      }
+    }
+    dec_state_->extra_output.clear();
+#if !JXL_HIGH_PRECISION
+    if (dec_state_->main_output.buffer &&
+        (format.data_type == JXL_TYPE_UINT8) && (format.num_channels >= 3) &&
+        !dec_state_->unpremul_alpha &&
+        (dec_state_->undo_orientation == Orientation::kIdentity) &&
+        decoded_->metadata()->xyb_encoded &&
+        dec_state_->output_encoding_info.color_encoding.IsSRGB() &&
+        dec_state_->output_encoding_info.all_default_opsin &&
+        (dec_state_->output_encoding_info.desired_intensity_target ==
+         dec_state_->output_encoding_info.orig_intensity_target) &&
+        HasFastXYBTosRGB8() && frame_header_.needs_color_transform()) {
+      dec_state_->fast_xyb_srgb8_conversion = true;
+    }
+#endif
+  }
+
+  void AddExtraChannelOutput(void* buffer, size_t buffer_size, size_t xsize,
+                             JxlPixelFormat format, size_t bits_per_sample) {
+    ImageOutput out;
+    out.format = format;
+    out.bits_per_sample = bits_per_sample;
+    out.buffer = buffer;
+    out.buffer_size = buffer_size;
+    out.stride = GetStride(xsize, format);
+    dec_state_->extra_output.push_back(out);
+  }
+
+ private:
+  Status ProcessDCGlobal(BitReader* br);
+  Status ProcessDCGroup(size_t dc_group_id, BitReader* br);
+  void FinalizeDC();
+  Status AllocateOutput();
+  Status ProcessACGlobal(BitReader* br);
+  Status ProcessACGroup(size_t ac_group_id, BitReader* JXL_RESTRICT* br,
+                        size_t num_passes, size_t thread, bool force_draw,
+                        bool dc_only);
+  void MarkSections(const SectionInfo* sections, size_t num,
+                    SectionStatus* section_status);
+
+  // Allocates storage for parallel decoding using up to `num_threads` threads
+  // of up to `num_tasks` tasks. The value of `thread` passed to
+  // `GetStorageLocation` must be smaller than the `num_threads` value passed
+  // here. The value of `task` passed to `GetStorageLocation` must be smaller
+  // than the value of `num_tasks` passed here.
+  Status PrepareStorage(size_t num_threads, size_t num_tasks) {
+    size_t storage_size = std::min(num_threads, num_tasks);
+    if (storage_size > group_dec_caches_.size()) {
+      group_dec_caches_.resize(storage_size);
+    }
+    use_task_id_ = num_threads > num_tasks;
+    bool use_group_ids = (modular_frame_decoder_.UsesFullImage() &&
+                          (frame_header_.encoding == FrameEncoding::kVarDCT ||
+                           (frame_header_.flags & FrameHeader::kNoise)));
+    if (dec_state_->render_pipeline) {
+      JXL_RETURN_IF_ERROR(dec_state_->render_pipeline->PrepareForThreads(
+          storage_size, use_group_ids));
+    }
+    return true;
+  }
+
+  size_t GetStorageLocation(size_t thread, size_t task) {
+    if (use_task_id_) return task;
+    return thread;
+  }
+
+  static size_t BytesPerChannel(JxlDataType data_type) {
+    return (data_type == JXL_TYPE_UINT8   ? 1u
+            : data_type == JXL_TYPE_FLOAT ? 4u
+                                          : 2u);
+  }
+
+  static size_t GetStride(const size_t xsize, JxlPixelFormat format) {
+    size_t stride =
+        (xsize * BytesPerChannel(format.data_type) * format.num_channels);
+    if (format.align > 1) {
+      stride = (jxl::DivCeil(stride, format.align) * format.align);
+    }
+    return stride;
+  }
+
+  PassesDecoderState* dec_state_;
+  ThreadPool* pool_;
+  std::vector<TocEntry> toc_;
+  uint64_t section_sizes_sum_;
+  // TODO(veluca): figure out the duplication between these and dec_state_.
+  FrameHeader frame_header_;
+  FrameDimensions frame_dim_;
+  ImageBundle* decoded_;
+  ModularFrameDecoder modular_frame_decoder_;
+  bool render_spotcolors_ = true;
+  bool coalescing_ = true;
+
+  std::vector<uint8_t> processed_section_;
+  std::vector<uint8_t> decoded_passes_per_ac_group_;
+  std::vector<uint8_t> decoded_dc_groups_;
+  bool decoded_dc_global_;
+  bool decoded_ac_global_;
+  bool HasEverything() const;
+  bool finalized_dc_ = true;
+  size_t num_sections_done_ = 0;
+  bool is_finalized_ = true;
+  bool allocated_ = false;
+
+  std::vector<GroupDecCache> group_dec_caches_;
+
+  // Whether or not the task id should be used for storage indexing, instead of
+  // the thread id.
+  bool use_task_id_ = false;
+
+  // Testing setting: whether or not to use the slow rendering pipeline.
+  bool use_slow_rendering_pipeline_;
+
+  JxlProgressiveDetail progressive_detail_ = kFrames;
+  // Number of completed passes where section decoding should pause.
+  // Used for progressive details at least kLastPasses.
+  std::vector<int> passes_to_pause_;
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_DEC_FRAME_H_
diff --git a/third_party/jpeg-xl/lib/jxl/dec_group.cc b/third_party/jpeg-xl/lib/jxl/dec_group.cc
new file mode 100644
index 0000000000..be8df9b062
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/dec_group.cc
@@ -0,0 +1,801 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/dec_group.h"
+
+#include <stdint.h>
+#include <string.h>
+
+#include <algorithm>
+#include <memory>
+#include <utility>
+
+#include "lib/jxl/frame_header.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/dec_group.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/ac_context.h"
+#include "lib/jxl/ac_strategy.h"
+#include "lib/jxl/base/bits.h"
+#include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/base/profiler.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/coeff_order.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/convolve.h"
+#include "lib/jxl/dct_scales.h"
+#include "lib/jxl/dec_cache.h"
+#include "lib/jxl/dec_transforms-inl.h"
+#include "lib/jxl/dec_xyb.h"
+#include "lib/jxl/entropy_coder.h"
+#include "lib/jxl/epf.h"
+#include "lib/jxl/opsin_params.h"
+#include "lib/jxl/quant_weights.h"
+#include "lib/jxl/quantizer-inl.h"
+#include "lib/jxl/quantizer.h"
+
+#ifndef LIB_JXL_DEC_GROUP_CC
+#define LIB_JXL_DEC_GROUP_CC
+namespace jxl {
+
+struct AuxOut;
+
+// Interface for reading groups for DecodeGroupImpl.
+class GetBlock {
+ public:
+  virtual void StartRow(size_t by) = 0;
+  virtual Status LoadBlock(size_t bx, size_t by, const AcStrategy& acs,
+                           size_t size, size_t log2_covered_blocks,
+                           ACPtr block[3], ACType ac_type) = 0;
+  virtual ~GetBlock() {}
+};
+
+// Controls whether DecodeGroupImpl renders to pixels or not.
+enum DrawMode {
+  // Render to pixels.
+  kDraw = 0,
+  // Don't render to pixels.
+  kDontDraw = 1,
+};
+
+}  // namespace jxl
+#endif  // LIB_JXL_DEC_GROUP_CC
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Rebind;
+using hwy::HWY_NAMESPACE::ShiftRight;
+
+using D = HWY_FULL(float);
+using DU = HWY_FULL(uint32_t);
+using DI = HWY_FULL(int32_t);
+using DI16 = Rebind<int16_t, DI>;
+constexpr D d;
+constexpr DI di;
+constexpr DI16 di16;
+
+// TODO(veluca): consider SIMDfying.
+void Transpose8x8InPlace(int32_t* JXL_RESTRICT block) {
+  for (size_t x = 0; x < 8; x++) {
+    for (size_t y = x + 1; y < 8; y++) {
+      std::swap(block[y * 8 + x], block[x * 8 + y]);
+    }
+  }
+}
+
+template <ACType ac_type>
+void DequantLane(Vec<D> scaled_dequant_x, Vec<D> scaled_dequant_y,
+                 Vec<D> scaled_dequant_b,
+                 const float* JXL_RESTRICT dequant_matrices, size_t size,
+                 size_t k, Vec<D> x_cc_mul, Vec<D> b_cc_mul,
+                 const float* JXL_RESTRICT biases, ACPtr qblock[3],
+                 float* JXL_RESTRICT block) {
+  const auto x_mul = Mul(Load(d, dequant_matrices + k), scaled_dequant_x);
+  const auto y_mul =
+      Mul(Load(d, dequant_matrices + size + k), scaled_dequant_y);
+  const auto b_mul =
+      Mul(Load(d, dequant_matrices + 2 * size + k), scaled_dequant_b);
+
+  Vec<DI> quantized_x_int;
+  Vec<DI> quantized_y_int;
+  Vec<DI> quantized_b_int;
+  if (ac_type == ACType::k16) {
+    Rebind<int16_t, DI> di16;
+    quantized_x_int = PromoteTo(di, Load(di16, qblock[0].ptr16 + k));
+    quantized_y_int = PromoteTo(di, Load(di16, qblock[1].ptr16 + k));
+    quantized_b_int = PromoteTo(di, Load(di16, qblock[2].ptr16 + k));
+  } else {
+    quantized_x_int = Load(di, qblock[0].ptr32 + k);
+    quantized_y_int = Load(di, qblock[1].ptr32 + k);
+    quantized_b_int = Load(di, qblock[2].ptr32 + k);
+  }
+
+  const auto dequant_x_cc =
+      Mul(AdjustQuantBias(di, 0, quantized_x_int, biases), x_mul);
+  const auto dequant_y =
+      Mul(AdjustQuantBias(di, 1, quantized_y_int, biases), y_mul);
+  const auto dequant_b_cc =
+      Mul(AdjustQuantBias(di, 2, quantized_b_int, biases), b_mul);
+
+  const auto dequant_x = MulAdd(x_cc_mul, dequant_y, dequant_x_cc);
+  const auto dequant_b = MulAdd(b_cc_mul, dequant_y, dequant_b_cc);
+  Store(dequant_x, d, block + k);
+  Store(dequant_y, d, block + size + k);
+  Store(dequant_b, d, block + 2 * size + k);
+}
+
+template <ACType ac_type>
+void DequantBlock(const AcStrategy& acs, float inv_global_scale, int quant,
+                  float x_dm_multiplier, float b_dm_multiplier, Vec<D> x_cc_mul,
+                  Vec<D> b_cc_mul, size_t kind, size_t size,
+                  const Quantizer& quantizer, size_t covered_blocks,
+                  const size_t* sbx,
+                  const float* JXL_RESTRICT* JXL_RESTRICT dc_row,
+                  size_t dc_stride, const float* JXL_RESTRICT biases,
+                  ACPtr qblock[3], float* JXL_RESTRICT block) {
+  PROFILER_FUNC;
+
+  const auto scaled_dequant_s = inv_global_scale / quant;
+
+  const auto scaled_dequant_x = Set(d, scaled_dequant_s * x_dm_multiplier);
+  const auto scaled_dequant_y = Set(d, scaled_dequant_s);
+  const auto scaled_dequant_b = Set(d, scaled_dequant_s * b_dm_multiplier);
+
+  const float* dequant_matrices = quantizer.DequantMatrix(kind, 0);
+
+  for (size_t k = 0; k < covered_blocks * kDCTBlockSize; k += Lanes(d)) {
+    DequantLane<ac_type>(scaled_dequant_x, scaled_dequant_y, scaled_dequant_b,
+                         dequant_matrices, size, k, x_cc_mul, b_cc_mul, biases,
+                         qblock, block);
+  }
+  for (size_t c = 0; c < 3; c++) {
+    LowestFrequenciesFromDC(acs.Strategy(), dc_row[c] + sbx[c], dc_stride,
+                            block + c * size);
+  }
+}
+
+Status DecodeGroupImpl(GetBlock* JXL_RESTRICT get_block,
+                       GroupDecCache* JXL_RESTRICT group_dec_cache,
+                       PassesDecoderState* JXL_RESTRICT dec_state,
+                       size_t thread, size_t group_idx,
+                       RenderPipelineInput& render_pipeline_input,
+                       ImageBundle* decoded, DrawMode draw) {
+  // TODO(veluca): investigate cache usage in this function.
+  PROFILER_FUNC;
+  const Rect block_rect = dec_state->shared->BlockGroupRect(group_idx);
+  const AcStrategyImage& ac_strategy = dec_state->shared->ac_strategy;
+
+  const size_t xsize_blocks = block_rect.xsize();
+  const size_t ysize_blocks = block_rect.ysize();
+
+  const size_t dc_stride = dec_state->shared->dc->PixelsPerRow();
+
+  const float inv_global_scale = dec_state->shared->quantizer.InvGlobalScale();
+
+  const YCbCrChromaSubsampling& cs =
+      dec_state->shared->frame_header.chroma_subsampling;
+
+  size_t idct_stride[3];
+  for (size_t c = 0; c < 3; c++) {
+    idct_stride[c] = render_pipeline_input.GetBuffer(c).first->PixelsPerRow();
+  }
+
+  HWY_ALIGN int32_t scaled_qtable[64 * 3];
+
+  ACType ac_type = dec_state->coefficients->Type();
+  auto dequant_block = ac_type == ACType::k16 ? DequantBlock<ACType::k16>
+                                              : DequantBlock<ACType::k32>;
+  // Whether or not coefficients should be stored for future usage, and/or read
+  // from past usage.
+  bool accumulate = !dec_state->coefficients->IsEmpty();
+  // Offset of the current block in the group.
+  size_t offset = 0;
+
+  std::array<int, 3> jpeg_c_map;
+  bool jpeg_is_gray = false;
+  std::array<int, 3> dcoff = {};
+
+  // TODO(veluca): all of this should be done only once per image.
+  if (decoded->IsJPEG()) {
+    if (!dec_state->shared->cmap.IsJPEGCompatible()) {
+      return JXL_FAILURE("The CfL map is not JPEG-compatible");
+    }
+    jpeg_is_gray = (decoded->jpeg_data->components.size() == 1);
+    jpeg_c_map = JpegOrder(dec_state->shared->frame_header.color_transform,
+                           jpeg_is_gray);
+    const std::vector<QuantEncoding>& qe =
+        dec_state->shared->matrices.encodings();
+    if (qe.empty() || qe[0].mode != QuantEncoding::Mode::kQuantModeRAW ||
+        std::abs(qe[0].qraw.qtable_den - 1.f / (8 * 255)) > 1e-8f) {
+      return JXL_FAILURE(
+          "Quantization table is not a JPEG quantization table.");
+    }
+    for (size_t c = 0; c < 3; c++) {
+      if (dec_state->shared->frame_header.color_transform ==
+          ColorTransform::kNone) {
+        dcoff[c] = 1024 / (*qe[0].qraw.qtable)[64 * c];
+      }
+      for (size_t i = 0; i < 64; i++) {
+        // Transpose the matrix, as it will be used on the transposed block.
+        int n = qe[0].qraw.qtable->at(64 + i);
+        int d = qe[0].qraw.qtable->at(64 * c + i);
+        if (n <= 0 || d <= 0 || n >= 65536 || d >= 65536) {
+          return JXL_FAILURE("Invalid JPEG quantization table");
+        }
+        scaled_qtable[64 * c + (i % 8) * 8 + (i / 8)] =
+            (1 << kCFLFixedPointPrecision) * n / d;
+      }
+    }
+  }
+
+  size_t hshift[3] = {cs.HShift(0), cs.HShift(1), cs.HShift(2)};
+  size_t vshift[3] = {cs.VShift(0), cs.VShift(1), cs.VShift(2)};
+  Rect r[3];
+  for (size_t i = 0; i < 3; i++) {
+    r[i] =
+        Rect(block_rect.x0() >> hshift[i], block_rect.y0() >> vshift[i],
+             block_rect.xsize() >> hshift[i], block_rect.ysize() >> vshift[i]);
+    if (!r[i].IsInside({0, 0, dec_state->shared->dc->Plane(i).xsize(),
+                        dec_state->shared->dc->Plane(i).ysize()})) {
+      return JXL_FAILURE("Frame dimensions are too big for the image.");
+    }
+  }
+
+  for (size_t by = 0; by < ysize_blocks; ++by) {
+    get_block->StartRow(by);
+    size_t sby[3] = {by >> vshift[0], by >> vshift[1], by >> vshift[2]};
+
+    const int32_t* JXL_RESTRICT row_quant =
+        block_rect.ConstRow(dec_state->shared->raw_quant_field, by);
+
+    const float* JXL_RESTRICT dc_rows[3] = {
+        r[0].ConstPlaneRow(*dec_state->shared->dc, 0, sby[0]),
+        r[1].ConstPlaneRow(*dec_state->shared->dc, 1, sby[1]),
+        r[2].ConstPlaneRow(*dec_state->shared->dc, 2, sby[2]),
+    };
+
+    const size_t ty = (block_rect.y0() + by) / kColorTileDimInBlocks;
+    AcStrategyRow acs_row = ac_strategy.ConstRow(block_rect, by);
+
+    const int8_t* JXL_RESTRICT row_cmap[3] = {
+        dec_state->shared->cmap.ytox_map.ConstRow(ty),
+        nullptr,
+        dec_state->shared->cmap.ytob_map.ConstRow(ty),
+    };
+
+    float* JXL_RESTRICT idct_row[3];
+    int16_t* JXL_RESTRICT jpeg_row[3];
+    for (size_t c = 0; c < 3; c++) {
+      idct_row[c] = render_pipeline_input.GetBuffer(c).second.Row(
+          render_pipeline_input.GetBuffer(c).first, sby[c] * kBlockDim);
+      if (decoded->IsJPEG()) {
+        auto& component = decoded->jpeg_data->components[jpeg_c_map[c]];
+        jpeg_row[c] =
+            component.coeffs.data() +
+            (component.width_in_blocks * (r[c].y0() + sby[c]) + r[c].x0()) *
+                kDCTBlockSize;
+      }
+    }
+
+    size_t bx = 0;
+    for (size_t tx = 0; tx < DivCeil(xsize_blocks, kColorTileDimInBlocks);
+         tx++) {
+      size_t abs_tx = tx + block_rect.x0() / kColorTileDimInBlocks;
+      auto x_cc_mul =
+          Set(d, dec_state->shared->cmap.YtoXRatio(row_cmap[0][abs_tx]));
+      auto b_cc_mul =
+          Set(d, dec_state->shared->cmap.YtoBRatio(row_cmap[2][abs_tx]));
+      // Increment bx by llf_x because those iterations would otherwise
+      // immediately continue (!IsFirstBlock). Reduces mispredictions.
+      for (; bx < xsize_blocks && bx < (tx + 1) * kColorTileDimInBlocks;) {
+        size_t sbx[3] = {bx >> hshift[0], bx >> hshift[1], bx >> hshift[2]};
+        AcStrategy acs = acs_row[bx];
+        const size_t llf_x = acs.covered_blocks_x();
+
+        // Can only happen in the second or lower rows of a varblock.
+        if (JXL_UNLIKELY(!acs.IsFirstBlock())) {
+          bx += llf_x;
+          continue;
+        }
+        PROFILER_ZONE("DecodeGroupImpl inner");
+        const size_t log2_covered_blocks = acs.log2_covered_blocks();
+
+        const size_t covered_blocks = 1 << log2_covered_blocks;
+        const size_t size = covered_blocks * kDCTBlockSize;
+
+        ACPtr qblock[3];
+        if (accumulate) {
+          for (size_t c = 0; c < 3; c++) {
+            qblock[c] = dec_state->coefficients->PlaneRow(c, group_idx, offset);
+          }
+        } else {
+          // No point in reading from bitstream without accumulating and not
+          // drawing.
+          JXL_ASSERT(draw == kDraw);
+          if (ac_type == ACType::k16) {
+            memset(group_dec_cache->dec_group_qblock16, 0,
+                   size * 3 * sizeof(int16_t));
+            for (size_t c = 0; c < 3; c++) {
+              qblock[c].ptr16 = group_dec_cache->dec_group_qblock16 + c * size;
+            }
+          } else {
+            memset(group_dec_cache->dec_group_qblock, 0,
+                   size * 3 * sizeof(int32_t));
+            for (size_t c = 0; c < 3; c++) {
+              qblock[c].ptr32 = group_dec_cache->dec_group_qblock + c * size;
+            }
+          }
+        }
+        JXL_RETURN_IF_ERROR(get_block->LoadBlock(
+            bx, by, acs, size, log2_covered_blocks, qblock, ac_type));
+        offset += size;
+        if (draw == kDontDraw) {
+          bx += llf_x;
+          continue;
+        }
+
+        if (JXL_UNLIKELY(decoded->IsJPEG())) {
+          if (acs.Strategy() != AcStrategy::Type::DCT) {
+            return JXL_FAILURE(
+                "Can only decode to JPEG if only DCT-8 is used.");
+          }
+
+          HWY_ALIGN int32_t transposed_dct_y[64];
+          for (size_t c : {1, 0, 2}) {
+            // Propagate only Y for grayscale.
+            if (jpeg_is_gray && c != 1) {
+              continue;
+            }
+            if ((sbx[c] << hshift[c] != bx) || (sby[c] << vshift[c] != by)) {
+              continue;
+            }
+            int16_t* JXL_RESTRICT jpeg_pos =
+                jpeg_row[c] + sbx[c] * kDCTBlockSize;
+            // JPEG XL is transposed, JPEG is not.
+            auto transposed_dct = qblock[c].ptr32;
+            Transpose8x8InPlace(transposed_dct);
+            // No CfL - no need to store the y block converted to integers.
+            if (!cs.Is444() ||
+                (row_cmap[0][abs_tx] == 0 && row_cmap[2][abs_tx] == 0)) {
+              for (size_t i = 0; i < 64; i += Lanes(d)) {
+                const auto ini = Load(di, transposed_dct + i);
+                const auto ini16 = DemoteTo(di16, ini);
+                StoreU(ini16, di16, jpeg_pos + i);
+              }
+            } else if (c == 1) {
+              // Y channel: save for restoring X/B, but nothing else to do.
+              for (size_t i = 0; i < 64; i += Lanes(d)) {
+                const auto ini = Load(di, transposed_dct + i);
+                Store(ini, di, transposed_dct_y + i);
+                const auto ini16 = DemoteTo(di16, ini);
+                StoreU(ini16, di16, jpeg_pos + i);
+              }
+            } else {
+              // transposed_dct_y contains the y channel block, transposed.
+              const auto scale = Set(
+                  di, dec_state->shared->cmap.RatioJPEG(row_cmap[c][abs_tx]));
+              const auto round = Set(di, 1 << (kCFLFixedPointPrecision - 1));
+              for (int i = 0; i < 64; i += Lanes(d)) {
+                auto in = Load(di, transposed_dct + i);
+                auto in_y = Load(di, transposed_dct_y + i);
+                auto qt = Load(di, scaled_qtable + c * size + i);
+                auto coeff_scale = ShiftRight<kCFLFixedPointPrecision>(
+                    Add(Mul(qt, scale), round));
+                auto cfl_factor = ShiftRight<kCFLFixedPointPrecision>(
+                    Add(Mul(in_y, coeff_scale), round));
+                StoreU(DemoteTo(di16, Add(in, cfl_factor)), di16, jpeg_pos + i);
+              }
+            }
+            jpeg_pos[0] =
+                Clamp1<float>(dc_rows[c][sbx[c]] - dcoff[c], -2047, 2047);
+          }
+        } else {
+          HWY_ALIGN float* const block = group_dec_cache->dec_group_block;
+          // Dequantize and add predictions.
+          dequant_block(
+              acs, inv_global_scale, row_quant[bx], dec_state->x_dm_multiplier,
+              dec_state->b_dm_multiplier, x_cc_mul, b_cc_mul, acs.RawStrategy(),
+              size, dec_state->shared->quantizer,
+              acs.covered_blocks_y() * acs.covered_blocks_x(), sbx, dc_rows,
+              dc_stride,
+              dec_state->output_encoding_info.opsin_params.quant_biases, qblock,
+              block);
+
+          for (size_t c : {1, 0, 2}) {
+            if ((sbx[c] << hshift[c] != bx) || (sby[c] << vshift[c] != by)) {
+              continue;
+            }
+            // IDCT
+            float* JXL_RESTRICT idct_pos = idct_row[c] + sbx[c] * kBlockDim;
+            TransformToPixels(acs.Strategy(), block + c * size, idct_pos,
+                              idct_stride[c], group_dec_cache->scratch_space);
+          }
+        }
+        bx += llf_x;
+      }
+    }
+  }
+  if (draw == kDontDraw) {
+    return true;
+  }
+  return true;
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+namespace {
+// Decode quantized AC coefficients of DCT blocks.
+// LLF components in the output block will not be modified.
+template <ACType ac_type>
+Status DecodeACVarBlock(size_t ctx_offset, size_t log2_covered_blocks,
+                        int32_t* JXL_RESTRICT row_nzeros,
+                        const int32_t* JXL_RESTRICT row_nzeros_top,
+                        size_t nzeros_stride, size_t c, size_t bx, size_t by,
+                        size_t lbx, AcStrategy acs,
+                        const coeff_order_t* JXL_RESTRICT coeff_order,
+                        BitReader* JXL_RESTRICT br,
+                        ANSSymbolReader* JXL_RESTRICT decoder,
+                        const std::vector<uint8_t>& context_map,
+                        const uint8_t* qdc_row, const int32_t* qf_row,
+                        const BlockCtxMap& block_ctx_map, ACPtr block,
+                        size_t shift = 0) {
+  PROFILER_FUNC;
+  // Equal to number of LLF coefficients.
+  const size_t covered_blocks = 1 << log2_covered_blocks;
+  const size_t size = covered_blocks * kDCTBlockSize;
+  int32_t predicted_nzeros =
+      PredictFromTopAndLeft(row_nzeros_top, row_nzeros, bx, 32);
+
+  size_t ord = kStrategyOrder[acs.RawStrategy()];
+  const coeff_order_t* JXL_RESTRICT order =
+      &coeff_order[CoeffOrderOffset(ord, c)];
+
+  size_t block_ctx = block_ctx_map.Context(qdc_row[lbx], qf_row[bx], ord, c);
+  const int32_t nzero_ctx =
+      block_ctx_map.NonZeroContext(predicted_nzeros, block_ctx) + ctx_offset;
+
+  size_t nzeros = decoder->ReadHybridUint(nzero_ctx, br, context_map);
+  if (nzeros + covered_blocks > size) {
+    return JXL_FAILURE("Invalid AC: nzeros too large");
+  }
+  for (size_t y = 0; y < acs.covered_blocks_y(); y++) {
+    for (size_t x = 0; x < acs.covered_blocks_x(); x++) {
+      row_nzeros[bx + x + y * nzeros_stride] =
+          (nzeros + covered_blocks - 1) >> log2_covered_blocks;
+    }
+  }
+
+  const size_t histo_offset =
+      ctx_offset + block_ctx_map.ZeroDensityContextsOffset(block_ctx);
+
+  // Skip LLF
+  {
+    PROFILER_ZONE("AcDecSkipLLF, reader");
+    size_t prev = (nzeros > size / 16 ? 0 : 1);
+    for (size_t k = covered_blocks; k < size && nzeros != 0; ++k) {
+      const size_t ctx =
+          histo_offset + ZeroDensityContext(nzeros, k, covered_blocks,
+                                            log2_covered_blocks, prev);
+      const size_t u_coeff = decoder->ReadHybridUint(ctx, br, context_map);
+      // Hand-rolled version of UnpackSigned, shifting before the conversion to
+      // signed integer to avoid undefined behavior of shifting negative
+      // numbers.
+      const size_t magnitude = u_coeff >> 1;
+      const size_t neg_sign = (~u_coeff) & 1;
+      const intptr_t coeff =
+          static_cast<intptr_t>((magnitude ^ (neg_sign - 1)) << shift);
+      if (ac_type == ACType::k16) {
+        block.ptr16[order[k]] += coeff;
+      } else {
+        block.ptr32[order[k]] += coeff;
+      }
+      prev = static_cast<size_t>(u_coeff != 0);
+      nzeros -= prev;
+    }
+    if (JXL_UNLIKELY(nzeros != 0)) {
+      return JXL_FAILURE("Invalid AC: nzeros not 0. Block (%" PRIuS ", %" PRIuS
+                         "), channel %" PRIuS,
+                         bx, by, c);
+    }
+  }
+  return true;
+}
+
+// Structs used by DecodeGroupImpl to get a quantized block.
+// GetBlockFromBitstream uses ANS decoding (and thus keeps track of row
+// pointers in row_nzeros), GetBlockFromEncoder simply reads the coefficient
+// image provided by the encoder.
+
+struct GetBlockFromBitstream : public GetBlock {
+  void StartRow(size_t by) override {
+    qf_row = rect.ConstRow(*qf, by);
+    for (size_t c = 0; c < 3; c++) {
+      size_t sby = by >> vshift[c];
+      quant_dc_row = quant_dc->ConstRow(rect.y0() + by) + rect.x0();
+      for (size_t i = 0; i < num_passes; i++) {
+        row_nzeros[i][c] = group_dec_cache->num_nzeroes[i].PlaneRow(c, sby);
+        row_nzeros_top[i][c] =
+            sby == 0
+                ? nullptr
+                : group_dec_cache->num_nzeroes[i].ConstPlaneRow(c, sby - 1);
+      }
+    }
+  }
+
+  Status LoadBlock(size_t bx, size_t by, const AcStrategy& acs, size_t size,
+                   size_t log2_covered_blocks, ACPtr block[3],
+                   ACType ac_type) override {
+    auto decode_ac_varblock = ac_type == ACType::k16
+                                  ? DecodeACVarBlock<ACType::k16>
+                                  : DecodeACVarBlock<ACType::k32>;
+    for (size_t c : {1, 0, 2}) {
+      size_t sbx = bx >> hshift[c];
+      size_t sby = by >> vshift[c];
+      if (JXL_UNLIKELY((sbx << hshift[c] != bx) || (sby << vshift[c] != by))) {
+        continue;
+      }
+
+      for (size_t pass = 0; JXL_UNLIKELY(pass < num_passes); pass++) {
+        JXL_RETURN_IF_ERROR(decode_ac_varblock(
+            ctx_offset[pass], log2_covered_blocks, row_nzeros[pass][c],
+            row_nzeros_top[pass][c], nzeros_stride, c, sbx, sby, bx, acs,
+            &coeff_orders[pass * coeff_order_size], readers[pass],
+            &decoders[pass], context_map[pass], quant_dc_row, qf_row,
+            *block_ctx_map, block[c], shift_for_pass[pass]));
+      }
+    }
+    return true;
+  }
+
+  Status Init(BitReader* JXL_RESTRICT* JXL_RESTRICT readers, size_t num_passes,
+              size_t group_idx, size_t histo_selector_bits, const Rect& rect,
+              GroupDecCache* JXL_RESTRICT group_dec_cache,
+              PassesDecoderState* dec_state, size_t first_pass) {
+    for (size_t i = 0; i < 3; i++) {
+      hshift[i] = dec_state->shared->frame_header.chroma_subsampling.HShift(i);
+      vshift[i] = dec_state->shared->frame_header.chroma_subsampling.VShift(i);
+    }
+    this->coeff_order_size = dec_state->shared->coeff_order_size;
+    this->coeff_orders =
+        dec_state->shared->coeff_orders.data() + first_pass * coeff_order_size;
+    this->context_map = dec_state->context_map.data() + first_pass;
+    this->readers = readers;
+    this->num_passes = num_passes;
+    this->shift_for_pass =
+        dec_state->shared->frame_header.passes.shift + first_pass;
+    this->group_dec_cache = group_dec_cache;
+    this->rect = rect;
+    block_ctx_map = &dec_state->shared->block_ctx_map;
+    qf = &dec_state->shared->raw_quant_field;
+    quant_dc = &dec_state->shared->quant_dc;
+
+    for (size_t pass = 0; pass < num_passes; pass++) {
+      // Select which histogram set to use among those of the current pass.
+      size_t cur_histogram = 0;
+      if (histo_selector_bits != 0) {
+        cur_histogram = readers[pass]->ReadBits(histo_selector_bits);
+      }
+      if (cur_histogram >= dec_state->shared->num_histograms) {
+        return JXL_FAILURE("Invalid histogram selector");
+      }
+      ctx_offset[pass] = cur_histogram * block_ctx_map->NumACContexts();
+
+      decoders[pass] =
+          ANSSymbolReader(&dec_state->code[pass + first_pass], readers[pass]);
+    }
+    nzeros_stride = group_dec_cache->num_nzeroes[0].PixelsPerRow();
+    for (size_t i = 0; i < num_passes; i++) {
+      JXL_ASSERT(
+          nzeros_stride ==
+          static_cast<size_t>(group_dec_cache->num_nzeroes[i].PixelsPerRow()));
+    }
+    return true;
+  }
+
+  const uint32_t* shift_for_pass = nullptr;  // not owned
+  const coeff_order_t* JXL_RESTRICT coeff_orders;
+  size_t coeff_order_size;
+  const std::vector<uint8_t>* JXL_RESTRICT context_map;
+  ANSSymbolReader decoders[kMaxNumPasses];
+  BitReader* JXL_RESTRICT* JXL_RESTRICT readers;
+  size_t num_passes;
+  size_t ctx_offset[kMaxNumPasses];
+  size_t nzeros_stride;
+  int32_t* JXL_RESTRICT row_nzeros[kMaxNumPasses][3];
+  const int32_t* JXL_RESTRICT row_nzeros_top[kMaxNumPasses][3];
+  GroupDecCache* JXL_RESTRICT group_dec_cache;
+  const BlockCtxMap* block_ctx_map;
+  const ImageI* qf;
+  const ImageB* quant_dc;
+  const int32_t* qf_row;
+  const uint8_t* quant_dc_row;
+  Rect rect;
+  size_t hshift[3], vshift[3];
+};
+
+struct GetBlockFromEncoder : public GetBlock {
+  void StartRow(size_t by) override {}
+
+  Status LoadBlock(size_t bx, size_t by, const AcStrategy& acs, size_t size,
+                   size_t log2_covered_blocks, ACPtr block[3],
+                   ACType ac_type) override {
+    JXL_DASSERT(ac_type == ACType::k32);
+    for (size_t c = 0; c < 3; c++) {
+      // for each pass
+      for (size_t i = 0; i < quantized_ac->size(); i++) {
+        for (size_t k = 0; k < size; k++) {
+          // TODO(veluca): SIMD.
+          block[c].ptr32[k] +=
+              rows[i][c][offset + k] * (1 << shift_for_pass[i]);
+        }
+      }
+    }
+    offset += size;
+    return true;
+  }
+
+  GetBlockFromEncoder(const std::vector<std::unique_ptr<ACImage>>& ac,
+                      size_t group_idx, const uint32_t* shift_for_pass)
+      : quantized_ac(&ac), shift_for_pass(shift_for_pass) {
+    // TODO(veluca): not supported with chroma subsampling.
+    for (size_t i = 0; i < quantized_ac->size(); i++) {
+      JXL_CHECK((*quantized_ac)[i]->Type() == ACType::k32);
+      for (size_t c = 0; c < 3; c++) {
+        rows[i][c] = (*quantized_ac)[i]->PlaneRow(c, group_idx, 0).ptr32;
+      }
+    }
+  }
+
+  const std::vector<std::unique_ptr<ACImage>>* JXL_RESTRICT quantized_ac;
+  size_t offset = 0;
+  const int32_t* JXL_RESTRICT rows[kMaxNumPasses][3];
+  const uint32_t* shift_for_pass = nullptr;  // not owned
+};
+
+HWY_EXPORT(DecodeGroupImpl);
+
+}  // namespace
+
+Status DecodeGroup(BitReader* JXL_RESTRICT* JXL_RESTRICT readers,
+                   size_t num_passes, size_t group_idx,
+                   PassesDecoderState* JXL_RESTRICT dec_state,
+                   GroupDecCache* JXL_RESTRICT group_dec_cache, size_t thread,
+                   RenderPipelineInput& render_pipeline_input,
+                   ImageBundle* JXL_RESTRICT decoded, size_t first_pass,
+                   bool force_draw, bool dc_only, bool* should_run_pipeline) {
+  PROFILER_FUNC;
+
+  DrawMode draw = (num_passes + first_pass ==
+                   dec_state->shared->frame_header.passes.num_passes) ||
+                          force_draw
+                      ? kDraw
+                      : kDontDraw;
+
+  if (should_run_pipeline) {
+    *should_run_pipeline = draw != kDontDraw;
+  }
+
+  if (draw == kDraw && num_passes == 0 && first_pass == 0) {
+    group_dec_cache->InitDCBufferOnce();
+    const YCbCrChromaSubsampling& cs =
+        dec_state->shared->frame_header.chroma_subsampling;
+    for (size_t c : {0, 1, 2}) {
+      size_t hs = cs.HShift(c);
+      size_t vs = cs.VShift(c);
+      // We reuse filter_input_storage here as it is not currently in use.
+      const Rect src_rect_precs = dec_state->shared->BlockGroupRect(group_idx);
+      const Rect src_rect =
+          Rect(src_rect_precs.x0() >> hs, src_rect_precs.y0() >> vs,
+               src_rect_precs.xsize() >> hs, src_rect_precs.ysize() >> vs);
+      const Rect copy_rect(kRenderPipelineXOffset, 2, src_rect.xsize(),
+                           src_rect.ysize());
+      CopyImageToWithPadding(src_rect, dec_state->shared->dc->Plane(c), 2,
+                             copy_rect, &group_dec_cache->dc_buffer);
+      // Mirrorpad. Interleaving left and right padding ensures that padding
+      // works out correctly even for images with DC size of 1.
+      for (size_t y = 0; y < src_rect.ysize() + 4; y++) {
+        size_t xend = kRenderPipelineXOffset +
+                      (dec_state->shared->dc->Plane(c).xsize() >> hs) -
+                      src_rect.x0();
+        for (size_t ix = 0; ix < 2; ix++) {
+          if (src_rect.x0() == 0) {
+            group_dec_cache->dc_buffer.Row(y)[kRenderPipelineXOffset - ix - 1] =
+                group_dec_cache->dc_buffer.Row(y)[kRenderPipelineXOffset + ix];
+          }
+          if (src_rect.x0() + src_rect.xsize() + 2 >=
+              (dec_state->shared->dc->xsize() >> hs)) {
+            group_dec_cache->dc_buffer.Row(y)[xend + ix] =
+                group_dec_cache->dc_buffer.Row(y)[xend - ix - 1];
+          }
+        }
+      }
+      Rect dst_rect = render_pipeline_input.GetBuffer(c).second;
+      ImageF* upsampling_dst = render_pipeline_input.GetBuffer(c).first;
+      JXL_ASSERT(dst_rect.IsInside(*upsampling_dst));
+
+      RenderPipelineStage::RowInfo input_rows(1, std::vector<float*>(5));
+      RenderPipelineStage::RowInfo output_rows(1, std::vector<float*>(8));
+      for (size_t y = src_rect.y0(); y < src_rect.y0() + src_rect.ysize();
+           y++) {
+        for (ssize_t iy = 0; iy < 5; iy++) {
+          input_rows[0][iy] = group_dec_cache->dc_buffer.Row(
+              Mirror(ssize_t(y) + iy - 2,
+                     dec_state->shared->dc->Plane(c).ysize() >> vs) +
+              2 - src_rect.y0());
+        }
+        for (size_t iy = 0; iy < 8; iy++) {
+          output_rows[0][iy] =
+              dst_rect.Row(upsampling_dst, ((y - src_rect.y0()) << 3) + iy) -
+              kRenderPipelineXOffset;
+        }
+        // Arguments set to 0/nullptr are not used.
+        dec_state->upsampler8x->ProcessRow(input_rows, output_rows,
+                                           /*xextra=*/0, src_rect.xsize(), 0, 0,
+                                           thread);
+      }
+    }
+    return true;
+  }
+
+  size_t histo_selector_bits = 0;
+  if (dc_only) {
+    JXL_ASSERT(num_passes == 0);
+  } else {
+    JXL_ASSERT(dec_state->shared->num_histograms > 0);
+    histo_selector_bits = CeilLog2Nonzero(dec_state->shared->num_histograms);
+  }
+
+  GetBlockFromBitstream get_block;
+  JXL_RETURN_IF_ERROR(
+      get_block.Init(readers, num_passes, group_idx, histo_selector_bits,
+                     dec_state->shared->BlockGroupRect(group_idx),
+                     group_dec_cache, dec_state, first_pass));
+
+  JXL_RETURN_IF_ERROR(HWY_DYNAMIC_DISPATCH(DecodeGroupImpl)(
+      &get_block, group_dec_cache, dec_state, thread, group_idx,
+      render_pipeline_input, decoded, draw));
+
+  for (size_t pass = 0; pass < num_passes; pass++) {
+    if (!get_block.decoders[pass].CheckANSFinalState()) {
+      return JXL_FAILURE("ANS checksum failure.");
+    }
+  }
+  return true;
+}
+
+Status DecodeGroupForRoundtrip(const std::vector<std::unique_ptr<ACImage>>& ac,
+                               size_t group_idx,
+                               PassesDecoderState* JXL_RESTRICT dec_state,
+                               GroupDecCache* JXL_RESTRICT group_dec_cache,
+                               size_t thread,
+                               RenderPipelineInput& render_pipeline_input,
+                               ImageBundle* JXL_RESTRICT decoded,
+                               AuxOut* aux_out) {
+  PROFILER_FUNC;
+
+  GetBlockFromEncoder get_block(ac, group_idx,
+                                dec_state->shared->frame_header.passes.shift);
+  group_dec_cache->InitOnce(
+      /*num_passes=*/0,
+      /*used_acs=*/(1u << AcStrategy::kNumValidStrategies) - 1);
+
+  return HWY_DYNAMIC_DISPATCH(DecodeGroupImpl)(
+      &get_block, group_dec_cache, dec_state, thread, group_idx,
+      render_pipeline_input, decoded, kDraw);
+}
+
+}  // namespace jxl
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jxl/dec_group.h b/third_party/jpeg-xl/lib/jxl/dec_group.h
new file mode 100644
index 0000000000..e32ea67b5f
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/dec_group.h
@@ -0,0 +1,49 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_DEC_GROUP_H_
+#define LIB_JXL_DEC_GROUP_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <vector>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/chroma_from_luma.h"
+#include "lib/jxl/coeff_order_fwd.h"
+#include "lib/jxl/dct_util.h"
+#include "lib/jxl/dec_ans.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/dec_cache.h"
+#include "lib/jxl/frame_header.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/quantizer.h"
+
+namespace jxl {
+
+struct AuxOut;
+
+Status DecodeGroup(BitReader* JXL_RESTRICT* JXL_RESTRICT readers,
+                   size_t num_passes, size_t group_idx,
+                   PassesDecoderState* JXL_RESTRICT dec_state,
+                   GroupDecCache* JXL_RESTRICT group_dec_cache, size_t thread,
+                   RenderPipelineInput& render_pipeline_input,
+                   ImageBundle* JXL_RESTRICT decoded, size_t first_pass,
+                   bool force_draw, bool dc_only, bool* should_run_pipeline);
+
+Status DecodeGroupForRoundtrip(const std::vector<std::unique_ptr<ACImage>>& ac,
+                               size_t group_idx,
+                               PassesDecoderState* JXL_RESTRICT dec_state,
+                               GroupDecCache* JXL_RESTRICT group_dec_cache,
+                               size_t thread,
+                               RenderPipelineInput& render_pipeline_input,
+                               ImageBundle* JXL_RESTRICT decoded,
+                               AuxOut* aux_out);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_DEC_GROUP_H_
diff --git a/third_party/jpeg-xl/lib/jxl/dec_group_border.cc b/third_party/jpeg-xl/lib/jxl/dec_group_border.cc
new file mode 100644
index 0000000000..4bee3ae6ef
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/dec_group_border.cc
@@ -0,0 +1,184 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/dec_group_border.h"
+
+#include <atomic>
+
+namespace jxl {
+
+void GroupBorderAssigner::Init(const FrameDimensions& frame_dim) {
+  frame_dim_ = frame_dim;
+  size_t num_corners =
+      (frame_dim_.xsize_groups + 1) * (frame_dim_.ysize_groups + 1);
+  counters_.reset(new std::atomic<uint8_t>[num_corners]);
+  // Initialize counters.
+  for (size_t y = 0; y < frame_dim_.ysize_groups + 1; y++) {
+    for (size_t x = 0; x < frame_dim_.xsize_groups + 1; x++) {
+      // Counters at image borders don't have anything on the other side, we
+      // pre-fill their value to have more uniform handling afterwards.
+      uint8_t init_value = 0;
+      if (x == 0) {
+        init_value |= kTopLeft | kBottomLeft;
+      }
+      if (x == frame_dim_.xsize_groups) {
+        init_value |= kTopRight | kBottomRight;
+      }
+      if (y == 0) {
+        init_value |= kTopLeft | kTopRight;
+      }
+      if (y == frame_dim_.ysize_groups) {
+        init_value |= kBottomLeft | kBottomRight;
+      }
+      counters_[y * (frame_dim_.xsize_groups + 1) + x] = init_value;
+    }
+  }
+}
+
+void GroupBorderAssigner::ClearDone(size_t group_id) {
+  size_t x = group_id % frame_dim_.xsize_groups;
+  size_t y = group_id / frame_dim_.xsize_groups;
+  size_t top_left_idx = y * (frame_dim_.xsize_groups + 1) + x;
+  size_t top_right_idx = y * (frame_dim_.xsize_groups + 1) + x + 1;
+  size_t bottom_right_idx = (y + 1) * (frame_dim_.xsize_groups + 1) + x + 1;
+  size_t bottom_left_idx = (y + 1) * (frame_dim_.xsize_groups + 1) + x;
+  counters_[top_left_idx].fetch_and(~kBottomRight);
+  counters_[top_right_idx].fetch_and(~kBottomLeft);
+  counters_[bottom_left_idx].fetch_and(~kTopRight);
+  counters_[bottom_right_idx].fetch_and(~kTopLeft);
+}
+
+// Looking at each corner between groups, we can guarantee that the four
+// involved groups will agree between each other regarding the order in which
+// each of the four groups terminated. Thus, the last of the four groups
+// gets the responsibility of handling the corner. For borders, every border
+// is assigned to its top corner (for vertical borders) or to its left corner
+// (for horizontal borders): the order as seen on those corners will decide who
+// handles that border.
+
+void GroupBorderAssigner::GroupDone(size_t group_id, size_t padx, size_t pady,
+                                    Rect* rects_to_finalize,
+                                    size_t* num_to_finalize) {
+  size_t x = group_id % frame_dim_.xsize_groups;
+  size_t y = group_id / frame_dim_.xsize_groups;
+  Rect block_rect(x * frame_dim_.group_dim / kBlockDim,
+                  y * frame_dim_.group_dim / kBlockDim,
+                  frame_dim_.group_dim / kBlockDim,
+                  frame_dim_.group_dim / kBlockDim, frame_dim_.xsize_blocks,
+                  frame_dim_.ysize_blocks);
+
+  size_t top_left_idx = y * (frame_dim_.xsize_groups + 1) + x;
+  size_t top_right_idx = y * (frame_dim_.xsize_groups + 1) + x + 1;
+  size_t bottom_right_idx = (y + 1) * (frame_dim_.xsize_groups + 1) + x + 1;
+  size_t bottom_left_idx = (y + 1) * (frame_dim_.xsize_groups + 1) + x;
+
+  auto fetch_status = [this](size_t idx, uint8_t bit) {
+    // Note that the acq-rel semantics of this fetch are actually needed to
+    // ensure that the pixel data of the group is already written to memory.
+    size_t status = counters_[idx].fetch_or(bit);
+    JXL_DASSERT((bit & status) == 0);
+    return bit | status;
+  };
+
+  size_t top_left_status = fetch_status(top_left_idx, kBottomRight);
+  size_t top_right_status = fetch_status(top_right_idx, kBottomLeft);
+  size_t bottom_right_status = fetch_status(bottom_right_idx, kTopLeft);
+  size_t bottom_left_status = fetch_status(bottom_left_idx, kTopRight);
+
+  size_t x1 = block_rect.x0() + block_rect.xsize();
+  size_t y1 = block_rect.y0() + block_rect.ysize();
+
+  bool is_last_group_x = frame_dim_.xsize_groups == x + 1;
+  bool is_last_group_y = frame_dim_.ysize_groups == y + 1;
+
+  // Start of border of neighbouring group, end of border of this group, start
+  // of border of this group (on the other side), end of border of next group.
+  size_t xpos[4] = {
+      block_rect.x0() == 0 ? 0 : block_rect.x0() * kBlockDim - padx,
+      block_rect.x0() == 0
+          ? 0
+          : std::min(frame_dim_.xsize, block_rect.x0() * kBlockDim + padx),
+      is_last_group_x ? frame_dim_.xsize : x1 * kBlockDim - padx,
+      std::min(frame_dim_.xsize, x1 * kBlockDim + padx)};
+  size_t ypos[4] = {
+      block_rect.y0() == 0 ? 0 : block_rect.y0() * kBlockDim - pady,
+      block_rect.y0() == 0
+          ? 0
+          : std::min(frame_dim_.ysize, block_rect.y0() * kBlockDim + pady),
+      is_last_group_y ? frame_dim_.ysize : y1 * kBlockDim - pady,
+      std::min(frame_dim_.ysize, y1 * kBlockDim + pady)};
+
+  *num_to_finalize = 0;
+  auto append_rect = [&](size_t x0, size_t x1, size_t y0, size_t y1) {
+    Rect rect(xpos[x0], ypos[y0], xpos[x1] - xpos[x0], ypos[y1] - ypos[y0]);
+    if (rect.xsize() == 0 || rect.ysize() == 0) return;
+    JXL_DASSERT(*num_to_finalize < kMaxToFinalize);
+    rects_to_finalize[(*num_to_finalize)++] = rect;
+  };
+
+  // Because of how group borders are assigned, it is impossible that we need to
+  // process the left and right side of some area but not the center area. Thus,
+  // we compute the first/last part to process in every horizontal strip and
+  // merge them together. We first collect a mask of what parts should be
+  // processed.
+  // We do this horizontally rather than vertically because horizontal borders
+  // are larger.
+  bool available_parts_mask[3][3] = {};  // [x][y]
+  // Center
+  available_parts_mask[1][1] = true;
+  // Corners
+  if (top_left_status == 0xF) available_parts_mask[0][0] = true;
+  if (top_right_status == 0xF) available_parts_mask[2][0] = true;
+  if (bottom_right_status == 0xF) available_parts_mask[2][2] = true;
+  if (bottom_left_status == 0xF) available_parts_mask[0][2] = true;
+  // Other borders
+  if (top_left_status & kTopRight) available_parts_mask[1][0] = true;
+  if (top_left_status & kBottomLeft) available_parts_mask[0][1] = true;
+  if (top_right_status & kBottomRight) available_parts_mask[2][1] = true;
+  if (bottom_left_status & kBottomRight) available_parts_mask[1][2] = true;
+
+  // Collect horizontal ranges.
+  constexpr size_t kNoSegment = 3;
+  std::pair<size_t, size_t> horizontal_segments[3] = {{kNoSegment, kNoSegment},
+                                                      {kNoSegment, kNoSegment},
+                                                      {kNoSegment, kNoSegment}};
+  for (size_t y = 0; y < 3; y++) {
+    for (size_t x = 0; x < 3; x++) {
+      if (!available_parts_mask[x][y]) continue;
+      JXL_DASSERT(horizontal_segments[y].second == kNoSegment ||
+                  horizontal_segments[y].second == x);
+      JXL_DASSERT((horizontal_segments[y].first == kNoSegment) ==
+                  (horizontal_segments[y].second == kNoSegment));
+      if (horizontal_segments[y].first == kNoSegment) {
+        horizontal_segments[y].first = x;
+      }
+      horizontal_segments[y].second = x + 1;
+    }
+  }
+  if (horizontal_segments[0] == horizontal_segments[1] &&
+      horizontal_segments[0] == horizontal_segments[2]) {
+    append_rect(horizontal_segments[0].first, horizontal_segments[0].second, 0,
+                3);
+  } else if (horizontal_segments[0] == horizontal_segments[1]) {
+    append_rect(horizontal_segments[0].first, horizontal_segments[0].second, 0,
+                2);
+    append_rect(horizontal_segments[2].first, horizontal_segments[2].second, 2,
+                3);
+  } else if (horizontal_segments[1] == horizontal_segments[2]) {
+    append_rect(horizontal_segments[0].first, horizontal_segments[0].second, 0,
+                1);
+    append_rect(horizontal_segments[1].first, horizontal_segments[1].second, 1,
+                3);
+  } else {
+    append_rect(horizontal_segments[0].first, horizontal_segments[0].second, 0,
+                1);
+    append_rect(horizontal_segments[1].first, horizontal_segments[1].second, 1,
+                2);
+    append_rect(horizontal_segments[2].first, horizontal_segments[2].second, 2,
+                3);
+  }
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/dec_group_border.h b/third_party/jpeg-xl/lib/jxl/dec_group_border.h
new file mode 100644
index 0000000000..2d974c9987
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/dec_group_border.h
@@ -0,0 +1,47 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_DEC_GROUP_BORDER_H_
+#define LIB_JXL_DEC_GROUP_BORDER_H_
+
+#include <stddef.h>
+
+#include <atomic>
+
+#include "lib/jxl/base/arch_macros.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/image.h"
+
+namespace jxl {
+
+class GroupBorderAssigner {
+ public:
+  // Prepare the GroupBorderAssigner to handle a given frame.
+  void Init(const FrameDimensions& frame_dim);
+  // Marks a group as done, and returns the (at most 3) rects to run
+  // FinalizeImageRect on. `block_rect` must be the rect corresponding
+  // to the given `group_id`, measured in blocks.
+  void GroupDone(size_t group_id, size_t padx, size_t pady,
+                 Rect* rects_to_finalize, size_t* num_to_finalize);
+  // Marks a group as not-done, for running re-paints.
+  void ClearDone(size_t group_id);
+
+  static constexpr size_t kMaxToFinalize = 3;
+
+ private:
+  FrameDimensions frame_dim_;
+  std::unique_ptr<std::atomic<uint8_t>[]> counters_;
+
+  // Constants to identify group positions relative to the corners.
+  static constexpr uint8_t kTopLeft = 0x01;
+  static constexpr uint8_t kTopRight = 0x02;
+  static constexpr uint8_t kBottomRight = 0x04;
+  static constexpr uint8_t kBottomLeft = 0x08;
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_DEC_GROUP_BORDER_H_
diff --git a/third_party/jpeg-xl/lib/jxl/dec_huffman.cc b/third_party/jpeg-xl/lib/jxl/dec_huffman.cc
new file mode 100644
index 0000000000..05b275773a
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/dec_huffman.cc
@@ -0,0 +1,255 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/dec_huffman.h"
+
+#include <string.h> /* for memset */
+
+#include <vector>
+
+#include "lib/jxl/ans_params.h"
+#include "lib/jxl/base/bits.h"
+#include "lib/jxl/huffman_table.h"
+
+namespace jxl {
+
+static const int kCodeLengthCodes = 18;
+static const uint8_t kCodeLengthCodeOrder[kCodeLengthCodes] = {
+    1, 2, 3, 4, 0, 5, 17, 6, 16, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+};
+static const uint8_t kDefaultCodeLength = 8;
+static const uint8_t kCodeLengthRepeatCode = 16;
+
+int ReadHuffmanCodeLengths(const uint8_t* code_length_code_lengths,
+                           int num_symbols, uint8_t* code_lengths,
+                           BitReader* br) {
+  int symbol = 0;
+  uint8_t prev_code_len = kDefaultCodeLength;
+  int repeat = 0;
+  uint8_t repeat_code_len = 0;
+  int space = 32768;
+  HuffmanCode table[32];
+
+  uint16_t counts[16] = {0};
+  for (int i = 0; i < kCodeLengthCodes; ++i) {
+    ++counts[code_length_code_lengths[i]];
+  }
+  if (!BuildHuffmanTable(table, 5, code_length_code_lengths, kCodeLengthCodes,
+                         &counts[0])) {
+    return 0;
+  }
+
+  while (symbol < num_symbols && space > 0) {
+    const HuffmanCode* p = table;
+    uint8_t code_len;
+    br->Refill();
+    p += br->PeekFixedBits<5>();
+    br->Consume(p->bits);
+    code_len = (uint8_t)p->value;
+    if (code_len < kCodeLengthRepeatCode) {
+      repeat = 0;
+      code_lengths[symbol++] = code_len;
+      if (code_len != 0) {
+        prev_code_len = code_len;
+        space -= 32768u >> code_len;
+      }
+    } else {
+      const int extra_bits = code_len - 14;
+      int old_repeat;
+      int repeat_delta;
+      uint8_t new_len = 0;
+      if (code_len == kCodeLengthRepeatCode) {
+        new_len = prev_code_len;
+      }
+      if (repeat_code_len != new_len) {
+        repeat = 0;
+        repeat_code_len = new_len;
+      }
+      old_repeat = repeat;
+      if (repeat > 0) {
+        repeat -= 2;
+        repeat <<= extra_bits;
+      }
+      repeat += (int)br->ReadBits(extra_bits) + 3;
+      repeat_delta = repeat - old_repeat;
+      if (symbol + repeat_delta > num_symbols) {
+        return 0;
+      }
+      memset(&code_lengths[symbol], repeat_code_len, (size_t)repeat_delta);
+      symbol += repeat_delta;
+      if (repeat_code_len != 0) {
+        space -= repeat_delta << (15 - repeat_code_len);
+      }
+    }
+  }
+  if (space != 0) {
+    return 0;
+  }
+  memset(&code_lengths[symbol], 0, (size_t)(num_symbols - symbol));
+  return true;
+}
+
+static JXL_INLINE bool ReadSimpleCode(size_t alphabet_size, BitReader* br,
+                                      HuffmanCode* table) {
+  size_t max_bits =
+      (alphabet_size > 1u) ? FloorLog2Nonzero(alphabet_size - 1u) + 1 : 0;
+
+  size_t num_symbols = br->ReadFixedBits<2>() + 1;
+
+  uint16_t symbols[4] = {0};
+  for (size_t i = 0; i < num_symbols; ++i) {
+    uint16_t symbol = br->ReadBits(max_bits);
+    if (symbol >= alphabet_size) {
+      return false;
+    }
+    symbols[i] = symbol;
+  }
+
+  for (size_t i = 0; i < num_symbols - 1; ++i) {
+    for (size_t j = i + 1; j < num_symbols; ++j) {
+      if (symbols[i] == symbols[j]) return false;
+    }
+  }
+
+  // 4 symbols have to option to encode.
+  if (num_symbols == 4) num_symbols += br->ReadFixedBits<1>();
+
+  const auto swap_symbols = [&symbols](size_t i, size_t j) {
+    uint16_t t = symbols[j];
+    symbols[j] = symbols[i];
+    symbols[i] = t;
+  };
+
+  size_t table_size = 1;
+  switch (num_symbols) {
+    case 1:
+      table[0] = {0, symbols[0]};
+      break;
+    case 2:
+      if (symbols[0] > symbols[1]) swap_symbols(0, 1);
+      table[0] = {1, symbols[0]};
+      table[1] = {1, symbols[1]};
+      table_size = 2;
+      break;
+    case 3:
+      if (symbols[1] > symbols[2]) swap_symbols(1, 2);
+      table[0] = {1, symbols[0]};
+      table[2] = {1, symbols[0]};
+      table[1] = {2, symbols[1]};
+      table[3] = {2, symbols[2]};
+      table_size = 4;
+      break;
+    case 4: {
+      for (size_t i = 0; i < 3; ++i) {
+        for (size_t j = i + 1; j < 4; ++j) {
+          if (symbols[i] > symbols[j]) swap_symbols(i, j);
+        }
+      }
+      table[0] = {2, symbols[0]};
+      table[2] = {2, symbols[1]};
+      table[1] = {2, symbols[2]};
+      table[3] = {2, symbols[3]};
+      table_size = 4;
+      break;
+    }
+    case 5: {
+      if (symbols[2] > symbols[3]) swap_symbols(2, 3);
+      table[0] = {1, symbols[0]};
+      table[1] = {2, symbols[1]};
+      table[2] = {1, symbols[0]};
+      table[3] = {3, symbols[2]};
+      table[4] = {1, symbols[0]};
+      table[5] = {2, symbols[1]};
+      table[6] = {1, symbols[0]};
+      table[7] = {3, symbols[3]};
+      table_size = 8;
+      break;
+    }
+    default: {
+      // Unreachable.
+      return false;
+    }
+  }
+
+  const uint32_t goal_size = 1u << kHuffmanTableBits;
+  while (table_size != goal_size) {
+    memcpy(&table[table_size], &table[0],
+           (size_t)table_size * sizeof(table[0]));
+    table_size <<= 1;
+  }
+
+  return true;
+}
+
+bool HuffmanDecodingData::ReadFromBitStream(size_t alphabet_size,
+                                            BitReader* br) {
+  if (alphabet_size > (1 << PREFIX_MAX_BITS)) return false;
+
+  /* simple_code_or_skip is used as follows:
+     1 for simple code;
+     0 for no skipping, 2 skips 2 code lengths, 3 skips 3 code lengths */
+  uint32_t simple_code_or_skip = br->ReadFixedBits<2>();
+  if (simple_code_or_skip == 1u) {
+    table_.resize(1u << kHuffmanTableBits);
+    return ReadSimpleCode(alphabet_size, br, table_.data());
+  }
+
+  std::vector<uint8_t> code_lengths(alphabet_size, 0);
+  uint8_t code_length_code_lengths[kCodeLengthCodes] = {0};
+  int space = 32;
+  int num_codes = 0;
+  /* Static Huffman code for the code length code lengths */
+  static const HuffmanCode huff[16] = {
+      {2, 0}, {2, 4}, {2, 3}, {3, 2}, {2, 0}, {2, 4}, {2, 3}, {4, 1},
+      {2, 0}, {2, 4}, {2, 3}, {3, 2}, {2, 0}, {2, 4}, {2, 3}, {4, 5},
+  };
+  for (size_t i = simple_code_or_skip; i < kCodeLengthCodes && space > 0; ++i) {
+    const int code_len_idx = kCodeLengthCodeOrder[i];
+    const HuffmanCode* p = huff;
+    uint8_t v;
+    br->Refill();
+    p += br->PeekFixedBits<4>();
+    br->Consume(p->bits);
+    v = (uint8_t)p->value;
+    code_length_code_lengths[code_len_idx] = v;
+    if (v != 0) {
+      space -= (32u >> v);
+      ++num_codes;
+    }
+  }
+  bool ok = (num_codes == 1 || space == 0) &&
+            ReadHuffmanCodeLengths(code_length_code_lengths, alphabet_size,
+                                   &code_lengths[0], br);
+
+  if (!ok) return false;
+  uint16_t counts[16] = {0};
+  for (size_t i = 0; i < alphabet_size; ++i) {
+    ++counts[code_lengths[i]];
+  }
+  table_.resize(alphabet_size + 376);
+  uint32_t table_size =
+      BuildHuffmanTable(table_.data(), kHuffmanTableBits, &code_lengths[0],
+                        alphabet_size, &counts[0]);
+  table_.resize(table_size);
+  return (table_size > 0);
+}
+
+// Decodes the next Huffman coded symbol from the bit-stream.
+uint16_t HuffmanDecodingData::ReadSymbol(BitReader* br) const {
+  size_t n_bits;
+  const HuffmanCode* table = table_.data();
+  table += br->PeekBits(kHuffmanTableBits);
+  n_bits = table->bits;
+  if (n_bits > kHuffmanTableBits) {
+    br->Consume(kHuffmanTableBits);
+    n_bits -= kHuffmanTableBits;
+    table += table->value;
+    table += br->PeekBits(n_bits);
+  }
+  br->Consume(table->bits);
+  return table->value;
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/dec_huffman.h b/third_party/jpeg-xl/lib/jxl/dec_huffman.h
new file mode 100644
index 0000000000..162c3e309c
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/dec_huffman.h
@@ -0,0 +1,32 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_DEC_HUFFMAN_H_
+#define LIB_JXL_DEC_HUFFMAN_H_
+
+#include <memory>
+#include <vector>
+
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/huffman_table.h"
+
+namespace jxl {
+
+static constexpr size_t kHuffmanTableBits = 8u;
+
+struct HuffmanDecodingData {
+  // Decodes the Huffman code lengths from the bit-stream and fills in the
+  // pre-allocated table with the corresponding 2-level Huffman decoding table.
+  // Returns false if the Huffman code lengths can not de decoded.
+  bool ReadFromBitStream(size_t alphabet_size, BitReader* br);
+
+  uint16_t ReadSymbol(BitReader* br) const;
+
+  std::vector<HuffmanCode> table_;
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_DEC_HUFFMAN_H_
diff --git a/third_party/jpeg-xl/lib/jxl/dec_modular.cc b/third_party/jpeg-xl/lib/jxl/dec_modular.cc
new file mode 100644
index 0000000000..bf85eaa05c
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/dec_modular.cc
@@ -0,0 +1,774 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/dec_modular.h"
+
+#include <stdint.h>
+
+#include <atomic>
+#include <sstream>
+#include <vector>
+
+#include "lib/jxl/frame_header.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/dec_modular.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/alpha.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/base/span.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/compressed_dc.h"
+#include "lib/jxl/epf.h"
+#include "lib/jxl/modular/encoding/encoding.h"
+#include "lib/jxl/modular/modular_image.h"
+#include "lib/jxl/modular/transform/transform.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Add;
+using hwy::HWY_NAMESPACE::Mul;
+using hwy::HWY_NAMESPACE::Rebind;
+
+void MultiplySum(const size_t xsize,
+                 const pixel_type* const JXL_RESTRICT row_in,
+                 const pixel_type* const JXL_RESTRICT row_in_Y,
+                 const float factor, float* const JXL_RESTRICT row_out) {
+  const HWY_FULL(float) df;
+  const Rebind<pixel_type, HWY_FULL(float)> di;  // assumes pixel_type <= float
+  const auto factor_v = Set(df, factor);
+  for (size_t x = 0; x < xsize; x += Lanes(di)) {
+    const auto in = Add(Load(di, row_in + x), Load(di, row_in_Y + x));
+    const auto out = Mul(ConvertTo(df, in), factor_v);
+    Store(out, df, row_out + x);
+  }
+}
+
+void RgbFromSingle(const size_t xsize,
+                   const pixel_type* const JXL_RESTRICT row_in,
+                   const float factor, float* out_r, float* out_g,
+                   float* out_b) {
+  const HWY_FULL(float) df;
+  const Rebind<pixel_type, HWY_FULL(float)> di;  // assumes pixel_type <= float
+
+  const auto factor_v = Set(df, factor);
+  for (size_t x = 0; x < xsize; x += Lanes(di)) {
+    const auto in = Load(di, row_in + x);
+    const auto out = Mul(ConvertTo(df, in), factor_v);
+    Store(out, df, out_r + x);
+    Store(out, df, out_g + x);
+    Store(out, df, out_b + x);
+  }
+}
+
+void SingleFromSingle(const size_t xsize,
+                      const pixel_type* const JXL_RESTRICT row_in,
+                      const float factor, float* row_out) {
+  const HWY_FULL(float) df;
+  const Rebind<pixel_type, HWY_FULL(float)> di;  // assumes pixel_type <= float
+
+  const auto factor_v = Set(df, factor);
+  for (size_t x = 0; x < xsize; x += Lanes(di)) {
+    const auto in = Load(di, row_in + x);
+    const auto out = Mul(ConvertTo(df, in), factor_v);
+    Store(out, df, row_out + x);
+  }
+}
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+HWY_EXPORT(MultiplySum);       // Local function
+HWY_EXPORT(RgbFromSingle);     // Local function
+HWY_EXPORT(SingleFromSingle);  // Local function
+
+// Slow conversion using double precision multiplication, only
+// needed when the bit depth is too high for single precision
+void SingleFromSingleAccurate(const size_t xsize,
+                              const pixel_type* const JXL_RESTRICT row_in,
+                              const double factor, float* row_out) {
+  for (size_t x = 0; x < xsize; x++) {
+    row_out[x] = row_in[x] * factor;
+  }
+}
+
+// convert custom [bits]-bit float (with [exp_bits] exponent bits) stored as int
+// back to binary32 float
+void int_to_float(const pixel_type* const JXL_RESTRICT row_in,
+                  float* const JXL_RESTRICT row_out, const size_t xsize,
+                  const int bits, const int exp_bits) {
+  if (bits == 32) {
+    JXL_ASSERT(sizeof(pixel_type) == sizeof(float));
+    JXL_ASSERT(exp_bits == 8);
+    memcpy(row_out, row_in, xsize * sizeof(float));
+    return;
+  }
+  int exp_bias = (1 << (exp_bits - 1)) - 1;
+  int sign_shift = bits - 1;
+  int mant_bits = bits - exp_bits - 1;
+  int mant_shift = 23 - mant_bits;
+  for (size_t x = 0; x < xsize; ++x) {
+    uint32_t f;
+    memcpy(&f, &row_in[x], 4);
+    int signbit = (f >> sign_shift);
+    f &= (1 << sign_shift) - 1;
+    if (f == 0) {
+      row_out[x] = (signbit ? -0.f : 0.f);
+      continue;
+    }
+    int exp = (f >> mant_bits);
+    int mantissa = (f & ((1 << mant_bits) - 1));
+    mantissa <<= mant_shift;
+    // Try to normalize only if there is space for maneuver.
+    if (exp == 0 && exp_bits < 8) {
+      // subnormal number
+      while ((mantissa & 0x800000) == 0) {
+        mantissa <<= 1;
+        exp--;
+      }
+      exp++;
+      // remove leading 1 because it is implicit now
+      mantissa &= 0x7fffff;
+    }
+    exp -= exp_bias;
+    // broke up the arbitrary float into its parts, now reassemble into
+    // binary32
+    exp += 127;
+    JXL_ASSERT(exp >= 0);
+    f = (signbit ? 0x80000000 : 0);
+    f |= (exp << 23);
+    f |= mantissa;
+    memcpy(&row_out[x], &f, 4);
+  }
+}
+
+std::string ModularStreamId::DebugString() const {
+  std::ostringstream os;
+  os << (kind == kGlobalData   ? "ModularGlobal"
+         : kind == kVarDCTDC   ? "VarDCTDC"
+         : kind == kModularDC  ? "ModularDC"
+         : kind == kACMetadata ? "ACMeta"
+         : kind == kQuantTable ? "QuantTable"
+         : kind == kModularAC  ? "ModularAC"
+                               : "");
+  if (kind == kVarDCTDC || kind == kModularDC || kind == kACMetadata ||
+      kind == kModularAC) {
+    os << " group " << group_id;
+  }
+  if (kind == kModularAC) {
+    os << " pass " << pass_id;
+  }
+  if (kind == kQuantTable) {
+    os << " " << quant_table_id;
+  }
+  return os.str();
+}
+
+Status ModularFrameDecoder::DecodeGlobalInfo(BitReader* reader,
+                                             const FrameHeader& frame_header,
+                                             bool allow_truncated_group) {
+  bool decode_color = frame_header.encoding == FrameEncoding::kModular;
+  const auto& metadata = frame_header.nonserialized_metadata->m;
+  bool is_gray = metadata.color_encoding.IsGray();
+  size_t nb_chans = 3;
+  if (is_gray && frame_header.color_transform == ColorTransform::kNone) {
+    nb_chans = 1;
+  }
+  do_color = decode_color;
+  size_t nb_extra = metadata.extra_channel_info.size();
+  bool has_tree = reader->ReadBits(1);
+  if (!allow_truncated_group ||
+      reader->TotalBitsConsumed() < reader->TotalBytes() * kBitsPerByte) {
+    if (has_tree) {
+      size_t tree_size_limit =
+          std::min(static_cast<size_t>(1 << 22),
+                   1024 + frame_dim.xsize * frame_dim.ysize *
+                              (nb_chans + nb_extra) / 16);
+      JXL_RETURN_IF_ERROR(DecodeTree(reader, &tree, tree_size_limit));
+      JXL_RETURN_IF_ERROR(
+          DecodeHistograms(reader, (tree.size() + 1) / 2, &code, &context_map));
+    }
+  }
+  if (!do_color) nb_chans = 0;
+
+  bool fp = metadata.bit_depth.floating_point_sample;
+
+  // bits_per_sample is just metadata for XYB images.
+  if (metadata.bit_depth.bits_per_sample >= 32 && do_color &&
+      frame_header.color_transform != ColorTransform::kXYB) {
+    if (metadata.bit_depth.bits_per_sample == 32 && fp == false) {
+      return JXL_FAILURE("uint32_t not supported in dec_modular");
+    } else if (metadata.bit_depth.bits_per_sample > 32) {
+      return JXL_FAILURE("bits_per_sample > 32 not supported");
+    }
+  }
+
+  Image gi(frame_dim.xsize, frame_dim.ysize, metadata.bit_depth.bits_per_sample,
+           nb_chans + nb_extra);
+
+  all_same_shift = true;
+  if (frame_header.color_transform == ColorTransform::kYCbCr) {
+    for (size_t c = 0; c < nb_chans; c++) {
+      gi.channel[c].hshift = frame_header.chroma_subsampling.HShift(c);
+      gi.channel[c].vshift = frame_header.chroma_subsampling.VShift(c);
+      size_t xsize_shifted =
+          DivCeil(frame_dim.xsize, 1 << gi.channel[c].hshift);
+      size_t ysize_shifted =
+          DivCeil(frame_dim.ysize, 1 << gi.channel[c].vshift);
+      gi.channel[c].shrink(xsize_shifted, ysize_shifted);
+      if (gi.channel[c].hshift != gi.channel[0].hshift ||
+          gi.channel[c].vshift != gi.channel[0].vshift)
+        all_same_shift = false;
+    }
+  }
+
+  for (size_t ec = 0, c = nb_chans; ec < nb_extra; ec++, c++) {
+    size_t ecups = frame_header.extra_channel_upsampling[ec];
+    gi.channel[c].shrink(DivCeil(frame_dim.xsize_upsampled, ecups),
+                         DivCeil(frame_dim.ysize_upsampled, ecups));
+    gi.channel[c].hshift = gi.channel[c].vshift =
+        CeilLog2Nonzero(ecups) - CeilLog2Nonzero(frame_header.upsampling);
+    if (gi.channel[c].hshift != gi.channel[0].hshift ||
+        gi.channel[c].vshift != gi.channel[0].vshift)
+      all_same_shift = false;
+  }
+
+  JXL_DEBUG_V(6, "DecodeGlobalInfo: full_image (w/o transforms) %s",
+              gi.DebugString().c_str());
+  ModularOptions options;
+  options.max_chan_size = frame_dim.group_dim;
+  options.group_dim = frame_dim.group_dim;
+  Status dec_status = ModularGenericDecompress(
+      reader, gi, &global_header, ModularStreamId::Global().ID(frame_dim),
+      &options,
+      /*undo_transforms=*/false, &tree, &code, &context_map,
+      allow_truncated_group);
+  if (!allow_truncated_group) JXL_RETURN_IF_ERROR(dec_status);
+  if (dec_status.IsFatalError()) {
+    return JXL_FAILURE("Failed to decode global modular info");
+  }
+
+  // TODO(eustas): are we sure this can be done after partial decode?
+  have_something = false;
+  for (size_t c = 0; c < gi.channel.size(); c++) {
+    Channel& gic = gi.channel[c];
+    if (c >= gi.nb_meta_channels && gic.w <= frame_dim.group_dim &&
+        gic.h <= frame_dim.group_dim)
+      have_something = true;
+  }
+  // move global transforms to groups if possible
+  if (!have_something && all_same_shift) {
+    if (gi.transform.size() == 1 && gi.transform[0].id == TransformId::kRCT) {
+      global_transform = gi.transform;
+      gi.transform.clear();
+      // TODO(jon): also move no-delta-palette out (trickier though)
+    }
+  }
+  full_image = std::move(gi);
+  JXL_DEBUG_V(6, "DecodeGlobalInfo: full_image (with transforms) %s",
+              full_image.DebugString().c_str());
+  return dec_status;
+}
+
+void ModularFrameDecoder::MaybeDropFullImage() {
+  if (full_image.transform.empty() && !have_something && all_same_shift) {
+    use_full_image = false;
+    JXL_DEBUG_V(6, "Dropping full image");
+    for (auto& ch : full_image.channel) {
+      // keep metadata on channels around, but dealloc their planes
+      ch.plane = Plane<pixel_type>();
+    }
+  }
+}
+
+Status ModularFrameDecoder::DecodeGroup(
+    const Rect& rect, BitReader* reader, int minShift, int maxShift,
+    const ModularStreamId& stream, bool zerofill, PassesDecoderState* dec_state,
+    RenderPipelineInput* render_pipeline_input, bool allow_truncated,
+    bool* should_run_pipeline) {
+  JXL_DEBUG_V(6, "Decoding %s with rect %s and shift bracket %d..%d %s",
+              stream.DebugString().c_str(), Description(rect).c_str(), minShift,
+              maxShift, zerofill ? "using zerofill" : "");
+  JXL_DASSERT(stream.kind == ModularStreamId::kModularDC ||
+              stream.kind == ModularStreamId::kModularAC);
+  const size_t xsize = rect.xsize();
+  const size_t ysize = rect.ysize();
+  Image gi(xsize, ysize, full_image.bitdepth, 0);
+  // start at the first bigger-than-groupsize non-metachannel
+  size_t c = full_image.nb_meta_channels;
+  for (; c < full_image.channel.size(); c++) {
+    Channel& fc = full_image.channel[c];
+    if (fc.w > frame_dim.group_dim || fc.h > frame_dim.group_dim) break;
+  }
+  size_t beginc = c;
+  for (; c < full_image.channel.size(); c++) {
+    Channel& fc = full_image.channel[c];
+    int shift = std::min(fc.hshift, fc.vshift);
+    if (shift > maxShift) continue;
+    if (shift < minShift) continue;
+    Rect r(rect.x0() >> fc.hshift, rect.y0() >> fc.vshift,
+           rect.xsize() >> fc.hshift, rect.ysize() >> fc.vshift, fc.w, fc.h);
+    if (r.xsize() == 0 || r.ysize() == 0) continue;
+    if (zerofill && use_full_image) {
+      for (size_t y = 0; y < r.ysize(); ++y) {
+        pixel_type* const JXL_RESTRICT row_out = r.Row(&fc.plane, y);
+        memset(row_out, 0, r.xsize() * sizeof(*row_out));
+      }
+    } else {
+      Channel gc(r.xsize(), r.ysize());
+      if (zerofill) ZeroFillImage(&gc.plane);
+      gc.hshift = fc.hshift;
+      gc.vshift = fc.vshift;
+      gi.channel.emplace_back(std::move(gc));
+    }
+  }
+  if (zerofill && use_full_image) return true;
+  // Return early if there's nothing to decode. Otherwise there might be
+  // problems later (in ModularImageToDecodedRect).
+  if (gi.channel.empty()) {
+    if (dec_state && should_run_pipeline) {
+      const auto& frame_header = dec_state->shared->frame_header;
+      const auto* metadata = frame_header.nonserialized_metadata;
+      if (do_color || metadata->m.num_extra_channels > 0) {
+        // Signal to FrameDecoder that we do not have some of the required input
+        // for the render pipeline.
+        *should_run_pipeline = false;
+      }
+    }
+    JXL_DEBUG_V(6, "Nothing to decode, returning early.");
+    return true;
+  }
+  ModularOptions options;
+  if (!zerofill) {
+    auto status = ModularGenericDecompress(
+        reader, gi, /*header=*/nullptr, stream.ID(frame_dim), &options,
+        /*undo_transforms=*/true, &tree, &code, &context_map, allow_truncated);
+    if (!allow_truncated) JXL_RETURN_IF_ERROR(status);
+    if (status.IsFatalError()) return status;
+  }
+  // Undo global transforms that have been pushed to the group level
+  if (!use_full_image) {
+    JXL_ASSERT(render_pipeline_input);
+    for (auto t : global_transform) {
+      JXL_RETURN_IF_ERROR(t.Inverse(gi, global_header.wp_header));
+    }
+    JXL_RETURN_IF_ERROR(ModularImageToDecodedRect(gi, dec_state, nullptr,
+                                                  *render_pipeline_input,
+                                                  Rect(0, 0, gi.w, gi.h)));
+    return true;
+  }
+  int gic = 0;
+  for (c = beginc; c < full_image.channel.size(); c++) {
+    Channel& fc = full_image.channel[c];
+    int shift = std::min(fc.hshift, fc.vshift);
+    if (shift > maxShift) continue;
+    if (shift < minShift) continue;
+    Rect r(rect.x0() >> fc.hshift, rect.y0() >> fc.vshift,
+           rect.xsize() >> fc.hshift, rect.ysize() >> fc.vshift, fc.w, fc.h);
+    if (r.xsize() == 0 || r.ysize() == 0) continue;
+    JXL_ASSERT(use_full_image);
+    CopyImageTo(/*rect_from=*/Rect(0, 0, r.xsize(), r.ysize()),
+                /*from=*/gi.channel[gic].plane,
+                /*rect_to=*/r, /*to=*/&fc.plane);
+    gic++;
+  }
+  return true;
+}
+
+Status ModularFrameDecoder::DecodeVarDCTDC(size_t group_id, BitReader* reader,
+                                           PassesDecoderState* dec_state) {
+  const Rect r = dec_state->shared->DCGroupRect(group_id);
+  // TODO(eustas): investigate if we could reduce the impact of
+  //               EvalRationalPolynomial; generally speaking, the limit is
+  //               2**(128/(3*magic)), where 128 comes from IEEE 754 exponent,
+  //               3 comes from XybToRgb that cubes the values, and "magic" is
+  //               the sum of all other contributions. 2**18 is known to lead
+  //               to NaN on input found by fuzzing (see commit message).
+  Image image(r.xsize(), r.ysize(), full_image.bitdepth, 3);
+  size_t stream_id = ModularStreamId::VarDCTDC(group_id).ID(frame_dim);
+  reader->Refill();
+  size_t extra_precision = reader->ReadFixedBits<2>();
+  float mul = 1.0f / (1 << extra_precision);
+  ModularOptions options;
+  for (size_t c = 0; c < 3; c++) {
+    Channel& ch = image.channel[c < 2 ? c ^ 1 : c];
+    ch.w >>= dec_state->shared->frame_header.chroma_subsampling.HShift(c);
+    ch.h >>= dec_state->shared->frame_header.chroma_subsampling.VShift(c);
+    ch.shrink();
+  }
+  if (!ModularGenericDecompress(
+          reader, image, /*header=*/nullptr, stream_id, &options,
+          /*undo_transforms=*/true, &tree, &code, &context_map)) {
+    return JXL_FAILURE("Failed to decode modular DC group");
+  }
+  DequantDC(r, &dec_state->shared_storage.dc_storage,
+            &dec_state->shared_storage.quant_dc, image,
+            dec_state->shared->quantizer.MulDC(), mul,
+            dec_state->shared->cmap.DCFactors(),
+            dec_state->shared->frame_header.chroma_subsampling,
+            dec_state->shared->block_ctx_map);
+  return true;
+}
+
+Status ModularFrameDecoder::DecodeAcMetadata(size_t group_id, BitReader* reader,
+                                             PassesDecoderState* dec_state) {
+  const Rect r = dec_state->shared->DCGroupRect(group_id);
+  size_t upper_bound = r.xsize() * r.ysize();
+  reader->Refill();
+  size_t count = reader->ReadBits(CeilLog2Nonzero(upper_bound)) + 1;
+  size_t stream_id = ModularStreamId::ACMetadata(group_id).ID(frame_dim);
+  // YToX, YToB, ACS + QF, EPF
+  Image image(r.xsize(), r.ysize(), full_image.bitdepth, 4);
+  static_assert(kColorTileDimInBlocks == 8, "Color tile size changed");
+  Rect cr(r.x0() >> 3, r.y0() >> 3, (r.xsize() + 7) >> 3, (r.ysize() + 7) >> 3);
+  image.channel[0] = Channel(cr.xsize(), cr.ysize(), 3, 3);
+  image.channel[1] = Channel(cr.xsize(), cr.ysize(), 3, 3);
+  image.channel[2] = Channel(count, 2, 0, 0);
+  ModularOptions options;
+  if (!ModularGenericDecompress(
+          reader, image, /*header=*/nullptr, stream_id, &options,
+          /*undo_transforms=*/true, &tree, &code, &context_map)) {
+    return JXL_FAILURE("Failed to decode AC metadata");
+  }
+  ConvertPlaneAndClamp(Rect(image.channel[0].plane), image.channel[0].plane, cr,
+                       &dec_state->shared_storage.cmap.ytox_map);
+  ConvertPlaneAndClamp(Rect(image.channel[1].plane), image.channel[1].plane, cr,
+                       &dec_state->shared_storage.cmap.ytob_map);
+  size_t num = 0;
+  bool is444 = dec_state->shared->frame_header.chroma_subsampling.Is444();
+  auto& ac_strategy = dec_state->shared_storage.ac_strategy;
+  size_t xlim = std::min(ac_strategy.xsize(), r.x0() + r.xsize());
+  size_t ylim = std::min(ac_strategy.ysize(), r.y0() + r.ysize());
+  uint32_t local_used_acs = 0;
+  for (size_t iy = 0; iy < r.ysize(); iy++) {
+    size_t y = r.y0() + iy;
+    int32_t* row_qf = r.Row(&dec_state->shared_storage.raw_quant_field, iy);
+    uint8_t* row_epf = r.Row(&dec_state->shared_storage.epf_sharpness, iy);
+    int32_t* row_in_1 = image.channel[2].plane.Row(0);
+    int32_t* row_in_2 = image.channel[2].plane.Row(1);
+    int32_t* row_in_3 = image.channel[3].plane.Row(iy);
+    for (size_t ix = 0; ix < r.xsize(); ix++) {
+      size_t x = r.x0() + ix;
+      int sharpness = row_in_3[ix];
+      if (sharpness < 0 || sharpness >= LoopFilter::kEpfSharpEntries) {
+        return JXL_FAILURE("Corrupted sharpness field");
+      }
+      row_epf[ix] = sharpness;
+      if (ac_strategy.IsValid(x, y)) {
+        continue;
+      }
+
+      if (num >= count) return JXL_FAILURE("Corrupted stream");
+
+      if (!AcStrategy::IsRawStrategyValid(row_in_1[num])) {
+        return JXL_FAILURE("Invalid AC strategy");
+      }
+      local_used_acs |= 1u << row_in_1[num];
+      AcStrategy acs = AcStrategy::FromRawStrategy(row_in_1[num]);
+      if ((acs.covered_blocks_x() > 1 || acs.covered_blocks_y() > 1) &&
+          !is444) {
+        return JXL_FAILURE(
+            "AC strategy not compatible with chroma subsampling");
+      }
+      // Ensure that blocks do not overflow *AC* groups.
+      size_t next_x_ac_block = (x / kGroupDimInBlocks + 1) * kGroupDimInBlocks;
+      size_t next_y_ac_block = (y / kGroupDimInBlocks + 1) * kGroupDimInBlocks;
+      size_t next_x_dct_block = x + acs.covered_blocks_x();
+      size_t next_y_dct_block = y + acs.covered_blocks_y();
+      if (next_x_dct_block > next_x_ac_block || next_x_dct_block > xlim) {
+        return JXL_FAILURE("Invalid AC strategy, x overflow");
+      }
+      if (next_y_dct_block > next_y_ac_block || next_y_dct_block > ylim) {
+        return JXL_FAILURE("Invalid AC strategy, y overflow");
+      }
+      JXL_RETURN_IF_ERROR(
+          ac_strategy.SetNoBoundsCheck(x, y, AcStrategy::Type(row_in_1[num])));
+      row_qf[ix] = 1 + std::max<int32_t>(0, std::min(Quantizer::kQuantMax - 1,
+                                                     row_in_2[num]));
+      num++;
+    }
+  }
+  dec_state->used_acs |= local_used_acs;
+  if (dec_state->shared->frame_header.loop_filter.epf_iters > 0) {
+    ComputeSigma(r, dec_state);
+  }
+  return true;
+}
+
+Status ModularFrameDecoder::ModularImageToDecodedRect(
+    Image& gi, PassesDecoderState* dec_state, jxl::ThreadPool* pool,
+    RenderPipelineInput& render_pipeline_input, Rect modular_rect) {
+  const auto& frame_header = dec_state->shared->frame_header;
+  const auto* metadata = frame_header.nonserialized_metadata;
+  JXL_CHECK(gi.transform.empty());
+
+  auto get_row = [&](size_t c, size_t y) {
+    const auto& buffer = render_pipeline_input.GetBuffer(c);
+    return buffer.second.Row(buffer.first, y);
+  };
+
+  size_t c = 0;
+  if (do_color) {
+    const bool rgb_from_gray =
+        metadata->m.color_encoding.IsGray() &&
+        frame_header.color_transform == ColorTransform::kNone;
+    const bool fp = metadata->m.bit_depth.floating_point_sample &&
+                    frame_header.color_transform != ColorTransform::kXYB;
+    for (; c < 3; c++) {
+      double factor = full_image.bitdepth < 32
+                          ? 1.0 / ((1u << full_image.bitdepth) - 1)
+                          : 0;
+      size_t c_in = c;
+      if (frame_header.color_transform == ColorTransform::kXYB) {
+        factor = dec_state->shared->matrices.DCQuants()[c];
+        // XYB is encoded as YX(B-Y)
+        if (c < 2) c_in = 1 - c;
+      } else if (rgb_from_gray) {
+        c_in = 0;
+      }
+      JXL_ASSERT(c_in < gi.channel.size());
+      Channel& ch_in = gi.channel[c_in];
+      // TODO(eustas): could we detect it on earlier stage?
+      if (ch_in.w == 0 || ch_in.h == 0) {
+        return JXL_FAILURE("Empty image");
+      }
+      JXL_CHECK(ch_in.hshift <= 3 && ch_in.vshift <= 3);
+      Rect r = render_pipeline_input.GetBuffer(c).second;
+      Rect mr(modular_rect.x0() >> ch_in.hshift,
+              modular_rect.y0() >> ch_in.vshift,
+              DivCeil(modular_rect.xsize(), 1 << ch_in.hshift),
+              DivCeil(modular_rect.ysize(), 1 << ch_in.vshift));
+      mr = mr.Crop(ch_in.plane);
+      size_t xsize_shifted = r.xsize();
+      size_t ysize_shifted = r.ysize();
+      if (r.ysize() != mr.ysize() || r.xsize() != mr.xsize()) {
+        return JXL_FAILURE("Dimension mismatch: trying to fit a %" PRIuS
+                           "x%" PRIuS
+                           " modular channel into "
+                           "a %" PRIuS "x%" PRIuS " rect",
+                           mr.xsize(), mr.ysize(), r.xsize(), r.ysize());
+      }
+      if (frame_header.color_transform == ColorTransform::kXYB && c == 2) {
+        JXL_ASSERT(!fp);
+        JXL_RETURN_IF_ERROR(RunOnPool(
+            pool, 0, ysize_shifted, ThreadPool::NoInit,
+            [&](const uint32_t task, size_t /* thread */) {
+              const size_t y = task;
+              const pixel_type* const JXL_RESTRICT row_in =
+                  mr.Row(&ch_in.plane, y);
+              const pixel_type* const JXL_RESTRICT row_in_Y =
+                  mr.Row(&gi.channel[0].plane, y);
+              float* const JXL_RESTRICT row_out = get_row(c, y);
+              HWY_DYNAMIC_DISPATCH(MultiplySum)
+              (xsize_shifted, row_in, row_in_Y, factor, row_out);
+            },
+            "ModularIntToFloat"));
+      } else if (fp) {
+        int bits = metadata->m.bit_depth.bits_per_sample;
+        int exp_bits = metadata->m.bit_depth.exponent_bits_per_sample;
+        JXL_RETURN_IF_ERROR(RunOnPool(
+            pool, 0, ysize_shifted, ThreadPool::NoInit,
+            [&](const uint32_t task, size_t /* thread */) {
+              const size_t y = task;
+              const pixel_type* const JXL_RESTRICT row_in =
+                  mr.Row(&ch_in.plane, y);
+              if (rgb_from_gray) {
+                for (size_t cc = 0; cc < 3; cc++) {
+                  float* const JXL_RESTRICT row_out = get_row(cc, y);
+                  int_to_float(row_in, row_out, xsize_shifted, bits, exp_bits);
+                }
+              } else {
+                float* const JXL_RESTRICT row_out = get_row(c, y);
+                int_to_float(row_in, row_out, xsize_shifted, bits, exp_bits);
+              }
+            },
+            "ModularIntToFloat_losslessfloat"));
+      } else {
+        JXL_RETURN_IF_ERROR(RunOnPool(
+            pool, 0, ysize_shifted, ThreadPool::NoInit,
+            [&](const uint32_t task, size_t /* thread */) {
+              const size_t y = task;
+              const pixel_type* const JXL_RESTRICT row_in =
+                  mr.Row(&ch_in.plane, y);
+              if (rgb_from_gray) {
+                if (full_image.bitdepth < 23) {
+                  HWY_DYNAMIC_DISPATCH(RgbFromSingle)
+                  (xsize_shifted, row_in, factor, get_row(0, y), get_row(1, y),
+                   get_row(2, y));
+                } else {
+                  SingleFromSingleAccurate(xsize_shifted, row_in, factor,
+                                           get_row(0, y));
+                  SingleFromSingleAccurate(xsize_shifted, row_in, factor,
+                                           get_row(1, y));
+                  SingleFromSingleAccurate(xsize_shifted, row_in, factor,
+                                           get_row(2, y));
+                }
+              } else {
+                float* const JXL_RESTRICT row_out = get_row(c, y);
+                if (full_image.bitdepth < 23) {
+                  HWY_DYNAMIC_DISPATCH(SingleFromSingle)
+                  (xsize_shifted, row_in, factor, row_out);
+                } else {
+                  SingleFromSingleAccurate(xsize_shifted, row_in, factor,
+                                           row_out);
+                }
+              }
+            },
+            "ModularIntToFloat"));
+      }
+      if (rgb_from_gray) {
+        break;
+      }
+    }
+    if (rgb_from_gray) {
+      c = 1;
+    }
+  }
+  size_t num_extra_channels = metadata->m.num_extra_channels;
+  for (size_t ec = 0; ec < num_extra_channels; ec++, c++) {
+    const ExtraChannelInfo& eci = metadata->m.extra_channel_info[ec];
+    int bits = eci.bit_depth.bits_per_sample;
+    int exp_bits = eci.bit_depth.exponent_bits_per_sample;
+    bool fp = eci.bit_depth.floating_point_sample;
+    JXL_ASSERT(fp || bits < 32);
+    const double factor = fp ? 0 : (1.0 / ((1u << bits) - 1));
+    JXL_ASSERT(c < gi.channel.size());
+    Channel& ch_in = gi.channel[c];
+    Rect r = render_pipeline_input.GetBuffer(3 + ec).second;
+    Rect mr(modular_rect.x0() >> ch_in.hshift,
+            modular_rect.y0() >> ch_in.vshift,
+            DivCeil(modular_rect.xsize(), 1 << ch_in.hshift),
+            DivCeil(modular_rect.ysize(), 1 << ch_in.vshift));
+    mr = mr.Crop(ch_in.plane);
+    if (r.ysize() != mr.ysize() || r.xsize() != mr.xsize()) {
+      return JXL_FAILURE("Dimension mismatch: trying to fit a %" PRIuS
+                         "x%" PRIuS
+                         " modular channel into "
+                         "a %" PRIuS "x%" PRIuS " rect",
+                         mr.xsize(), mr.ysize(), r.xsize(), r.ysize());
+    }
+    for (size_t y = 0; y < r.ysize(); ++y) {
+      float* const JXL_RESTRICT row_out =
+          r.Row(render_pipeline_input.GetBuffer(3 + ec).first, y);
+      const pixel_type* const JXL_RESTRICT row_in = mr.Row(&ch_in.plane, y);
+      if (fp) {
+        int_to_float(row_in, row_out, r.xsize(), bits, exp_bits);
+      } else {
+        if (full_image.bitdepth < 23) {
+          HWY_DYNAMIC_DISPATCH(SingleFromSingle)
+          (r.xsize(), row_in, factor, row_out);
+        } else {
+          SingleFromSingleAccurate(r.xsize(), row_in, factor, row_out);
+        }
+      }
+    }
+  }
+  return true;
+}
+
+Status ModularFrameDecoder::FinalizeDecoding(PassesDecoderState* dec_state,
+                                             jxl::ThreadPool* pool,
+                                             bool inplace) {
+  if (!use_full_image) return true;
+  Image gi = (inplace ? std::move(full_image) : full_image.clone());
+  size_t xsize = gi.w;
+  size_t ysize = gi.h;
+
+  JXL_DEBUG_V(3, "Finalizing decoding for modular image: %s",
+              gi.DebugString().c_str());
+
+  // Don't use threads if total image size is smaller than a group
+  if (xsize * ysize < frame_dim.group_dim * frame_dim.group_dim) pool = nullptr;
+
+  // Undo the global transforms
+  gi.undo_transforms(global_header.wp_header, pool);
+  JXL_DASSERT(global_transform.empty());
+  if (gi.error) return JXL_FAILURE("Undoing transforms failed");
+
+  for (size_t i = 0; i < dec_state->shared->frame_dim.num_groups; i++) {
+    dec_state->render_pipeline->ClearDone(i);
+  }
+  std::atomic<bool> has_error{false};
+  JXL_RETURN_IF_ERROR(RunOnPool(
+      pool, 0, dec_state->shared->frame_dim.num_groups,
+      [&](size_t num_threads) {
+        const auto& frame_header = dec_state->shared->frame_header;
+        bool use_group_ids = (frame_header.encoding == FrameEncoding::kVarDCT ||
+                              (frame_header.flags & FrameHeader::kNoise));
+        return dec_state->render_pipeline->PrepareForThreads(num_threads,
+                                                             use_group_ids);
+      },
+      [&](const uint32_t group, size_t thread_id) {
+        RenderPipelineInput input =
+            dec_state->render_pipeline->GetInputBuffers(group, thread_id);
+        if (!ModularImageToDecodedRect(gi, dec_state, nullptr, input,
+                                       dec_state->shared->GroupRect(group))) {
+          has_error = true;
+          return;
+        }
+        input.Done();
+      },
+      "ModularToRect"));
+  if (has_error) {
+    return JXL_FAILURE("Error producing input to render pipeline");
+  }
+  return true;
+}
+
+static constexpr const float kAlmostZero = 1e-8f;
+
+Status ModularFrameDecoder::DecodeQuantTable(
+    size_t required_size_x, size_t required_size_y, BitReader* br,
+    QuantEncoding* encoding, size_t idx,
+    ModularFrameDecoder* modular_frame_decoder) {
+  JXL_RETURN_IF_ERROR(F16Coder::Read(br, &encoding->qraw.qtable_den));
+  if (encoding->qraw.qtable_den < kAlmostZero) {
+    // qtable[] values are already checked for <= 0 so the denominator may not
+    // be negative.
+    return JXL_FAILURE("Invalid qtable_den: value too small");
+  }
+  Image image(required_size_x, required_size_y, 8, 3);
+  ModularOptions options;
+  if (modular_frame_decoder) {
+    JXL_RETURN_IF_ERROR(ModularGenericDecompress(
+        br, image, /*header=*/nullptr,
+        ModularStreamId::QuantTable(idx).ID(modular_frame_decoder->frame_dim),
+        &options, /*undo_transforms=*/true, &modular_frame_decoder->tree,
+        &modular_frame_decoder->code, &modular_frame_decoder->context_map));
+  } else {
+    JXL_RETURN_IF_ERROR(ModularGenericDecompress(br, image, /*header=*/nullptr,
+                                                 0, &options,
+                                                 /*undo_transforms=*/true));
+  }
+  if (!encoding->qraw.qtable) {
+    encoding->qraw.qtable = new std::vector<int>();
+  }
+  encoding->qraw.qtable->resize(required_size_x * required_size_y * 3);
+  for (size_t c = 0; c < 3; c++) {
+    for (size_t y = 0; y < required_size_y; y++) {
+      int32_t* JXL_RESTRICT row = image.channel[c].Row(y);
+      for (size_t x = 0; x < required_size_x; x++) {
+        (*encoding->qraw.qtable)[c * required_size_x * required_size_y +
+                                 y * required_size_x + x] = row[x];
+        if (row[x] <= 0) {
+          return JXL_FAILURE("Invalid raw quantization table");
+        }
+      }
+    }
+  }
+  return true;
+}
+
+}  // namespace jxl
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jxl/dec_modular.h b/third_party/jpeg-xl/lib/jxl/dec_modular.h
new file mode 100644
index 0000000000..aae643cf1f
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/dec_modular.h
@@ -0,0 +1,140 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_DEC_MODULAR_H_
+#define LIB_JXL_DEC_MODULAR_H_
+
+#include <stddef.h>
+
+#include <string>
+
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/dec_cache.h"
+#include "lib/jxl/frame_header.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/modular/encoding/encoding.h"
+#include "lib/jxl/modular/modular_image.h"
+
+namespace jxl {
+
+struct ModularStreamId {
+  enum Kind {
+    kGlobalData,
+    kVarDCTDC,
+    kModularDC,
+    kACMetadata,
+    kQuantTable,
+    kModularAC
+  };
+  Kind kind;
+  size_t quant_table_id;
+  size_t group_id;  // DC or AC group id.
+  size_t pass_id;   // Only for kModularAC.
+  size_t ID(const FrameDimensions& frame_dim) const {
+    size_t id = 0;
+    switch (kind) {
+      case kGlobalData:
+        id = 0;
+        break;
+      case kVarDCTDC:
+        id = 1 + group_id;
+        break;
+      case kModularDC:
+        id = 1 + frame_dim.num_dc_groups + group_id;
+        break;
+      case kACMetadata:
+        id = 1 + 2 * frame_dim.num_dc_groups + group_id;
+        break;
+      case kQuantTable:
+        id = 1 + 3 * frame_dim.num_dc_groups + quant_table_id;
+        break;
+      case kModularAC:
+        id = 1 + 3 * frame_dim.num_dc_groups + DequantMatrices::kNum +
+             frame_dim.num_groups * pass_id + group_id;
+        break;
+    };
+    return id;
+  }
+  static ModularStreamId Global() {
+    return ModularStreamId{kGlobalData, 0, 0, 0};
+  }
+  static ModularStreamId VarDCTDC(size_t group_id) {
+    return ModularStreamId{kVarDCTDC, 0, group_id, 0};
+  }
+  static ModularStreamId ModularDC(size_t group_id) {
+    return ModularStreamId{kModularDC, 0, group_id, 0};
+  }
+  static ModularStreamId ACMetadata(size_t group_id) {
+    return ModularStreamId{kACMetadata, 0, group_id, 0};
+  }
+  static ModularStreamId QuantTable(size_t quant_table_id) {
+    JXL_ASSERT(quant_table_id < DequantMatrices::kNum);
+    return ModularStreamId{kQuantTable, quant_table_id, 0, 0};
+  }
+  static ModularStreamId ModularAC(size_t group_id, size_t pass_id) {
+    return ModularStreamId{kModularAC, 0, group_id, pass_id};
+  }
+  static size_t Num(const FrameDimensions& frame_dim, size_t passes) {
+    return ModularAC(0, passes).ID(frame_dim);
+  }
+  std::string DebugString() const;
+};
+
+class ModularFrameDecoder {
+ public:
+  void Init(const FrameDimensions& frame_dim) { this->frame_dim = frame_dim; }
+  Status DecodeGlobalInfo(BitReader* reader, const FrameHeader& frame_header,
+                          bool allow_truncated_group);
+  Status DecodeGroup(const Rect& rect, BitReader* reader, int minShift,
+                     int maxShift, const ModularStreamId& stream, bool zerofill,
+                     PassesDecoderState* dec_state,
+                     RenderPipelineInput* render_pipeline_input,
+                     bool allow_truncated, bool* should_run_pipeline = nullptr);
+  // Decodes a VarDCT DC group (`group_id`) from the given `reader`.
+  Status DecodeVarDCTDC(size_t group_id, BitReader* reader,
+                        PassesDecoderState* dec_state);
+  // Decodes a VarDCT AC Metadata group (`group_id`) from the given `reader`.
+  Status DecodeAcMetadata(size_t group_id, BitReader* reader,
+                          PassesDecoderState* dec_state);
+  // Decodes a RAW quant table from `br` into the given `encoding`, of size
+  // `required_size_x x required_size_y`. If `modular_frame_decoder` is passed,
+  // its global tree is used, otherwise no global tree is used.
+  static Status DecodeQuantTable(size_t required_size_x, size_t required_size_y,
+                                 BitReader* br, QuantEncoding* encoding,
+                                 size_t idx,
+                                 ModularFrameDecoder* modular_frame_decoder);
+  // if inplace is true, this can only be called once
+  // if it is false, it can be called multiple times (e.g. for progressive
+  // steps)
+  Status FinalizeDecoding(PassesDecoderState* dec_state, jxl::ThreadPool* pool,
+                          bool inplace);
+  bool have_dc() const { return have_something; }
+  void MaybeDropFullImage();
+  bool UsesFullImage() const { return use_full_image; }
+
+ private:
+  Status ModularImageToDecodedRect(Image& gi, PassesDecoderState* dec_state,
+                                   jxl::ThreadPool* pool,
+                                   RenderPipelineInput& render_pipeline_input,
+                                   Rect modular_rect);
+
+  Image full_image;
+  std::vector<Transform> global_transform;
+  FrameDimensions frame_dim;
+  bool do_color;
+  bool have_something;
+  bool use_full_image = true;
+  bool all_same_shift;
+  Tree tree;
+  ANSCode code;
+  std::vector<uint8_t> context_map;
+  GroupHeader global_header;
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_DEC_MODULAR_H_
diff --git a/third_party/jpeg-xl/lib/jxl/dec_noise.cc b/third_party/jpeg-xl/lib/jxl/dec_noise.cc
new file mode 100644
index 0000000000..275a6d0b21
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/dec_noise.cc
@@ -0,0 +1,131 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/dec_noise.h"
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <algorithm>
+#include <numeric>
+#include <utility>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/dec_noise.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/chroma_from_luma.h"
+#include "lib/jxl/image_ops.h"
+#include "lib/jxl/opsin_params.h"
+#include "lib/jxl/sanitizers.h"
+#include "lib/jxl/xorshift128plus-inl.h"
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Or;
+using hwy::HWY_NAMESPACE::ShiftRight;
+using hwy::HWY_NAMESPACE::Vec;
+
+using D = HWY_CAPPED(float, kBlockDim);
+using DI = hwy::HWY_NAMESPACE::Rebind<int, D>;
+using DI8 = hwy::HWY_NAMESPACE::Repartition<uint8_t, D>;
+
+// Converts one vector's worth of random bits to floats in [1, 2).
+// NOTE: as the convolution kernel sums to 0, it doesn't matter if inputs are in
+// [0, 1) or in [1, 2).
+void BitsToFloat(const uint32_t* JXL_RESTRICT random_bits,
+                 float* JXL_RESTRICT floats) {
+  const HWY_FULL(float) df;
+  const HWY_FULL(uint32_t) du;
+
+  const auto bits = Load(du, random_bits);
+  // 1.0 + 23 random mantissa bits = [1, 2)
+  const auto rand12 = BitCast(df, Or(ShiftRight<9>(bits), Set(du, 0x3F800000)));
+  Store(rand12, df, floats);
+}
+
+void RandomImage(Xorshift128Plus* rng, const Rect& rect,
+                 ImageF* JXL_RESTRICT noise) {
+  const size_t xsize = rect.xsize();
+  const size_t ysize = rect.ysize();
+
+  // May exceed the vector size, hence we have two loops over x below.
+  constexpr size_t kFloatsPerBatch =
+      Xorshift128Plus::N * sizeof(uint64_t) / sizeof(float);
+  HWY_ALIGN uint64_t batch[Xorshift128Plus::N] = {};
+
+  const HWY_FULL(float) df;
+  const size_t N = Lanes(df);
+
+  for (size_t y = 0; y < ysize; ++y) {
+    float* JXL_RESTRICT row = rect.Row(noise, y);
+
+    size_t x = 0;
+    // Only entire batches (avoids exceeding the image padding).
+    for (; x + kFloatsPerBatch < xsize; x += kFloatsPerBatch) {
+      rng->Fill(batch);
+      for (size_t i = 0; i < kFloatsPerBatch; i += Lanes(df)) {
+        BitsToFloat(reinterpret_cast<const uint32_t*>(batch) + i, row + x + i);
+      }
+    }
+
+    // Any remaining pixels, rounded up to vectors (safe due to padding).
+    rng->Fill(batch);
+    size_t batch_pos = 0;  // < kFloatsPerBatch
+    for (; x < xsize; x += N) {
+      BitsToFloat(reinterpret_cast<const uint32_t*>(batch) + batch_pos,
+                  row + x);
+      batch_pos += N;
+    }
+  }
+}
+void Random3Planes(size_t visible_frame_index, size_t nonvisible_frame_index,
+                   size_t x0, size_t y0, const std::pair<ImageF*, Rect>& plane0,
+                   const std::pair<ImageF*, Rect>& plane1,
+                   const std::pair<ImageF*, Rect>& plane2) {
+  HWY_ALIGN Xorshift128Plus rng(visible_frame_index, nonvisible_frame_index, x0,
+                                y0);
+  RandomImage(&rng, plane0.second, plane0.first);
+  RandomImage(&rng, plane1.second, plane1.first);
+  RandomImage(&rng, plane2.second, plane2.first);
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+HWY_EXPORT(Random3Planes);
+void Random3Planes(size_t visible_frame_index, size_t nonvisible_frame_index,
+                   size_t x0, size_t y0, const std::pair<ImageF*, Rect>& plane0,
+                   const std::pair<ImageF*, Rect>& plane1,
+                   const std::pair<ImageF*, Rect>& plane2) {
+  return HWY_DYNAMIC_DISPATCH(Random3Planes)(visible_frame_index,
+                                             nonvisible_frame_index, x0, y0,
+                                             plane0, plane1, plane2);
+}
+
+void DecodeFloatParam(float precision, float* val, BitReader* br) {
+  const int absval_quant = br->ReadFixedBits<10>();
+  *val = absval_quant / precision;
+}
+
+Status DecodeNoise(BitReader* br, NoiseParams* noise_params) {
+  for (float& i : noise_params->lut) {
+    DecodeFloatParam(kNoisePrecision, &i, br);
+  }
+  return true;
+}
+
+}  // namespace jxl
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jxl/dec_noise.h b/third_party/jpeg-xl/lib/jxl/dec_noise.h
new file mode 100644
index 0000000000..ac05866470
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/dec_noise.h
@@ -0,0 +1,32 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_DEC_NOISE_H_
+#define LIB_JXL_DEC_NOISE_H_
+
+// Noise synthesis. Currently disabled.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/chroma_from_luma.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/noise.h"
+
+namespace jxl {
+
+void Random3Planes(size_t visible_frame_index, size_t nonvisible_frame_index,
+                   size_t x0, size_t y0, const std::pair<ImageF*, Rect>& plane0,
+                   const std::pair<ImageF*, Rect>& plane1,
+                   const std::pair<ImageF*, Rect>& plane2);
+
+// Must only call if FrameHeader.flags.kNoise.
+Status DecodeNoise(BitReader* br, NoiseParams* noise_params);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_DEC_NOISE_H_
diff --git a/third_party/jpeg-xl/lib/jxl/dec_patch_dictionary.cc b/third_party/jpeg-xl/lib/jxl/dec_patch_dictionary.cc
new file mode 100644
index 0000000000..85e5de3c8d
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/dec_patch_dictionary.cc
@@ -0,0 +1,347 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/dec_patch_dictionary.h"
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <sys/types.h>
+
+#include <algorithm>
+#include <string>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+#include "lib/jxl/ans_params.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/override.h"
+#include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/blending.h"
+#include "lib/jxl/chroma_from_luma.h"
+#include "lib/jxl/color_management.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/dec_ans.h"
+#include "lib/jxl/dec_frame.h"
+#include "lib/jxl/entropy_coder.h"
+#include "lib/jxl/frame_header.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/image_bundle.h"
+#include "lib/jxl/image_ops.h"
+#include "lib/jxl/patch_dictionary_internal.h"
+
+namespace jxl {
+
+Status PatchDictionary::Decode(BitReader* br, size_t xsize, size_t ysize,
+                               bool* uses_extra_channels) {
+  positions_.clear();
+  std::vector<uint8_t> context_map;
+  ANSCode code;
+  JXL_RETURN_IF_ERROR(
+      DecodeHistograms(br, kNumPatchDictionaryContexts, &code, &context_map));
+  ANSSymbolReader decoder(&code, br);
+
+  auto read_num = [&](size_t context) {
+    size_t r = decoder.ReadHybridUint(context, br, context_map);
+    return r;
+  };
+
+  size_t num_ref_patch = read_num(kNumRefPatchContext);
+  // Limit max memory usage of patches to about 66 bytes per pixel (assuming 8
+  // bytes per size_t)
+  const size_t num_pixels = xsize * ysize;
+  const size_t max_ref_patches = 1024 + num_pixels / 4;
+  const size_t max_patches = max_ref_patches * 4;
+  const size_t max_blending_infos = max_patches * 4;
+  if (num_ref_patch > max_ref_patches) {
+    return JXL_FAILURE("Too many patches in dictionary");
+  }
+  size_t num_ec = shared_->metadata->m.num_extra_channels;
+
+  size_t total_patches = 0;
+  size_t next_size = 1;
+
+  for (size_t id = 0; id < num_ref_patch; id++) {
+    PatchReferencePosition ref_pos;
+    ref_pos.ref = read_num(kReferenceFrameContext);
+    if (ref_pos.ref >= kMaxNumReferenceFrames ||
+        shared_->reference_frames[ref_pos.ref].frame.xsize() == 0) {
+      return JXL_FAILURE("Invalid reference frame ID");
+    }
+    if (!shared_->reference_frames[ref_pos.ref].ib_is_in_xyb) {
+      return JXL_FAILURE(
+          "Patches cannot use frames saved post color transforms");
+    }
+    const ImageBundle& ib = shared_->reference_frames[ref_pos.ref].frame;
+    ref_pos.x0 = read_num(kPatchReferencePositionContext);
+    ref_pos.y0 = read_num(kPatchReferencePositionContext);
+    ref_pos.xsize = read_num(kPatchSizeContext) + 1;
+    ref_pos.ysize = read_num(kPatchSizeContext) + 1;
+    if (ref_pos.x0 + ref_pos.xsize > ib.xsize()) {
+      return JXL_FAILURE("Invalid position specified in reference frame");
+    }
+    if (ref_pos.y0 + ref_pos.ysize > ib.ysize()) {
+      return JXL_FAILURE("Invalid position specified in reference frame");
+    }
+    size_t id_count = read_num(kPatchCountContext) + 1;
+    total_patches += id_count;
+    if (total_patches > max_patches) {
+      return JXL_FAILURE("Too many patches in dictionary");
+    }
+    if (next_size < total_patches) {
+      next_size *= 2;
+      next_size = std::min<size_t>(next_size, max_patches);
+    }
+    if (next_size * (num_ec + 1) > max_blending_infos) {
+      return JXL_FAILURE("Too many patches in dictionary");
+    }
+    positions_.reserve(next_size);
+    blendings_.reserve(next_size * (num_ec + 1));
+    for (size_t i = 0; i < id_count; i++) {
+      PatchPosition pos;
+      pos.ref_pos_idx = ref_positions_.size();
+      if (i == 0) {
+        pos.x = read_num(kPatchPositionContext);
+        pos.y = read_num(kPatchPositionContext);
+      } else {
+        pos.x =
+            positions_.back().x + UnpackSigned(read_num(kPatchOffsetContext));
+        pos.y =
+            positions_.back().y + UnpackSigned(read_num(kPatchOffsetContext));
+      }
+      if (pos.x + ref_pos.xsize > xsize) {
+        return JXL_FAILURE("Invalid patch x: at %" PRIuS " + %" PRIuS
+                           " > %" PRIuS,
+                           pos.x, ref_pos.xsize, xsize);
+      }
+      if (pos.y + ref_pos.ysize > ysize) {
+        return JXL_FAILURE("Invalid patch y: at %" PRIuS " + %" PRIuS
+                           " > %" PRIuS,
+                           pos.y, ref_pos.ysize, ysize);
+      }
+      for (size_t j = 0; j < num_ec + 1; j++) {
+        uint32_t blend_mode = read_num(kPatchBlendModeContext);
+        if (blend_mode >= uint32_t(PatchBlendMode::kNumBlendModes)) {
+          return JXL_FAILURE("Invalid patch blend mode: %u", blend_mode);
+        }
+        PatchBlending info;
+        info.mode = static_cast<PatchBlendMode>(blend_mode);
+        if (UsesAlpha(info.mode)) {
+          *uses_extra_channels = true;
+        }
+        if (info.mode != PatchBlendMode::kNone && j > 0) {
+          *uses_extra_channels = true;
+        }
+        if (UsesAlpha(info.mode) &&
+            shared_->metadata->m.extra_channel_info.size() > 1) {
+          info.alpha_channel = read_num(kPatchAlphaChannelContext);
+          if (info.alpha_channel >=
+              shared_->metadata->m.extra_channel_info.size()) {
+            return JXL_FAILURE(
+                "Invalid alpha channel for blending: %u out of %u\n",
+                info.alpha_channel,
+                (uint32_t)shared_->metadata->m.extra_channel_info.size());
+          }
+        } else {
+          info.alpha_channel = 0;
+        }
+        if (UsesClamp(info.mode)) {
+          info.clamp = read_num(kPatchClampContext);
+        } else {
+          info.clamp = false;
+        }
+        blendings_.push_back(info);
+      }
+      positions_.push_back(std::move(pos));
+    }
+    ref_positions_.emplace_back(std::move(ref_pos));
+  }
+  positions_.shrink_to_fit();
+
+  if (!decoder.CheckANSFinalState()) {
+    return JXL_FAILURE("ANS checksum failure.");
+  }
+
+  ComputePatchTree();
+  return true;
+}
+
+int PatchDictionary::GetReferences() const {
+  int result = 0;
+  for (size_t i = 0; i < ref_positions_.size(); ++i) {
+    result |= (1 << static_cast<int>(ref_positions_[i].ref));
+  }
+  return result;
+}
+
+namespace {
+struct PatchInterval {
+  size_t idx;
+  size_t y0, y1;
+};
+}  // namespace
+
+void PatchDictionary::ComputePatchTree() {
+  patch_tree_.clear();
+  num_patches_.clear();
+  sorted_patches_y0_.clear();
+  sorted_patches_y1_.clear();
+  if (positions_.empty()) {
+    return;
+  }
+  // Create a y-interval for each patch.
+  std::vector<PatchInterval> intervals(positions_.size());
+  for (size_t i = 0; i < positions_.size(); ++i) {
+    const auto& pos = positions_[i];
+    intervals[i].idx = i;
+    intervals[i].y0 = pos.y;
+    intervals[i].y1 = pos.y + ref_positions_[pos.ref_pos_idx].ysize;
+  }
+  auto sort_by_y0 = [&intervals](size_t start, size_t end) {
+    std::sort(intervals.data() + start, intervals.data() + end,
+              [](const PatchInterval& i0, const PatchInterval& i1) {
+                return i0.y0 < i1.y0;
+              });
+  };
+  auto sort_by_y1 = [&intervals](size_t start, size_t end) {
+    std::sort(intervals.data() + start, intervals.data() + end,
+              [](const PatchInterval& i0, const PatchInterval& i1) {
+                return i0.y1 < i1.y1;
+              });
+  };
+  // Count the number of patches for each row.
+  sort_by_y1(0, intervals.size());
+  num_patches_.resize(intervals.back().y1);
+  for (auto iv : intervals) {
+    for (size_t y = iv.y0; y < iv.y1; ++y) num_patches_[y]++;
+  }
+  PatchTreeNode root;
+  root.start = 0;
+  root.num = intervals.size();
+  patch_tree_.push_back(root);
+  size_t next = 0;
+  while (next < patch_tree_.size()) {
+    auto& node = patch_tree_[next];
+    size_t start = node.start;
+    size_t end = node.start + node.num;
+    // Choose the y_center for this node to be the median of interval starts.
+    sort_by_y0(start, end);
+    size_t middle_idx = start + node.num / 2;
+    node.y_center = intervals[middle_idx].y0;
+    // Divide the intervals in [start, end) into three groups:
+    //   * those completely to the right of y_center: [right_start, end)
+    //   * those overlapping y_center: [left_end, right_start)
+    //   * those completely to the left of y_center: [start, left_end)
+    size_t right_start = middle_idx;
+    while (right_start < end && intervals[right_start].y0 == node.y_center) {
+      ++right_start;
+    }
+    sort_by_y1(start, right_start);
+    size_t left_end = right_start;
+    while (left_end > start && intervals[left_end - 1].y1 > node.y_center) {
+      --left_end;
+    }
+    // Fill in sorted_patches_y0_ and sorted_patches_y1_ for the current node.
+    node.num = right_start - left_end;
+    node.start = sorted_patches_y0_.size();
+    for (ssize_t i = static_cast<ssize_t>(right_start) - 1;
+         i >= static_cast<ssize_t>(left_end); --i) {
+      sorted_patches_y1_.push_back({intervals[i].y1, intervals[i].idx});
+    }
+    sort_by_y0(left_end, right_start);
+    for (size_t i = left_end; i < right_start; ++i) {
+      sorted_patches_y0_.push_back({intervals[i].y0, intervals[i].idx});
+    }
+    // Create the left and right nodes (if not empty).
+    node.left_child = node.right_child = -1;
+    if (left_end > start) {
+      PatchTreeNode left;
+      left.start = start;
+      left.num = left_end - left.start;
+      patch_tree_[next].left_child = patch_tree_.size();
+      patch_tree_.push_back(left);
+    }
+    if (right_start < end) {
+      PatchTreeNode right;
+      right.start = right_start;
+      right.num = end - right.start;
+      patch_tree_[next].right_child = patch_tree_.size();
+      patch_tree_.push_back(right);
+    }
+    ++next;
+  }
+}
+
+std::vector<size_t> PatchDictionary::GetPatchesForRow(size_t y) const {
+  std::vector<size_t> result;
+  if (y < num_patches_.size() && num_patches_[y] > 0) {
+    result.reserve(num_patches_[y]);
+    for (ssize_t tree_idx = 0; tree_idx != -1;) {
+      JXL_DASSERT(tree_idx < (ssize_t)patch_tree_.size());
+      const auto& node = patch_tree_[tree_idx];
+      if (y <= node.y_center) {
+        for (size_t i = 0; i < node.num; ++i) {
+          const auto& p = sorted_patches_y0_[node.start + i];
+          if (y < p.first) break;
+          result.push_back(p.second);
+        }
+        tree_idx = y < node.y_center ? node.left_child : -1;
+      } else {
+        for (size_t i = 0; i < node.num; ++i) {
+          const auto& p = sorted_patches_y1_[node.start + i];
+          if (y >= p.first) break;
+          result.push_back(p.second);
+        }
+        tree_idx = node.right_child;
+      }
+    }
+    // Ensure that he relative order of patches that affect the same pixels is
+    // preserved. This is important for patches that have a blend mode
+    // different from kAdd.
+    std::sort(result.begin(), result.end());
+  }
+  return result;
+}
+
+// Adds patches to a segment of `xsize` pixels, starting at `inout`, assumed
+// to be located at position (x0, y) in the frame.
+void PatchDictionary::AddOneRow(float* const* inout, size_t y, size_t x0,
+                                size_t xsize) const {
+  size_t num_ec = shared_->metadata->m.num_extra_channels;
+  std::vector<const float*> fg_ptrs(3 + num_ec);
+  for (size_t pos_idx : GetPatchesForRow(y)) {
+    const size_t blending_idx = pos_idx * (num_ec + 1);
+    const PatchPosition& pos = positions_[pos_idx];
+    const PatchReferencePosition& ref_pos = ref_positions_[pos.ref_pos_idx];
+    size_t by = pos.y;
+    size_t bx = pos.x;
+    size_t patch_xsize = ref_pos.xsize;
+    JXL_DASSERT(y >= by);
+    JXL_DASSERT(y < by + ref_pos.ysize);
+    size_t iy = y - by;
+    size_t ref = ref_pos.ref;
+    if (bx >= x0 + xsize) continue;
+    if (bx + patch_xsize < x0) continue;
+    size_t patch_x0 = std::max(bx, x0);
+    size_t patch_x1 = std::min(bx + patch_xsize, x0 + xsize);
+    for (size_t c = 0; c < 3; c++) {
+      fg_ptrs[c] = shared_->reference_frames[ref].frame.color().ConstPlaneRow(
+                       c, ref_pos.y0 + iy) +
+                   ref_pos.x0 + x0 - bx;
+    }
+    for (size_t i = 0; i < num_ec; i++) {
+      fg_ptrs[3 + i] =
+          shared_->reference_frames[ref].frame.extra_channels()[i].ConstRow(
+              ref_pos.y0 + iy) +
+          ref_pos.x0 + x0 - bx;
+    }
+    PerformBlending(inout, fg_ptrs.data(), inout, patch_x0 - x0,
+                    patch_x1 - patch_x0, blendings_[blending_idx],
+                    blendings_.data() + blending_idx + 1,
+                    shared_->metadata->m.extra_channel_info);
+  }
+}
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/dec_patch_dictionary.h b/third_party/jpeg-xl/lib/jxl/dec_patch_dictionary.h
new file mode 100644
index 0000000000..a950e83e85
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/dec_patch_dictionary.h
@@ -0,0 +1,151 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_DEC_PATCH_DICTIONARY_H_
+#define LIB_JXL_DEC_PATCH_DICTIONARY_H_
+
+// Chooses reference patches, and avoids encoding them once per occurrence.
+
+#include <stddef.h>
+#include <string.h>
+#include <sys/types.h>
+
+#include <tuple>
+#include <vector>
+
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/opsin_params.h"
+
+namespace jxl {
+
+enum class PatchBlendMode : uint8_t {
+  // The new values are the old ones. Useful to skip some channels.
+  kNone = 0,
+  // The new values (in the crop) replace the old ones: sample = new
+  kReplace = 1,
+  // The new values (in the crop) get added to the old ones: sample = old + new
+  kAdd = 2,
+  // The new values (in the crop) get multiplied by the old ones:
+  // sample = old * new
+  // This blend mode is only supported if BlendColorSpace is kEncoded. The
+  // range of the new value matters for multiplication purposes, and its
+  // nominal range of 0..1 is computed the same way as this is done for the
+  // alpha values in kBlend and kAlphaWeightedAdd.
+  kMul = 3,
+  // The new values (in the crop) replace the old ones if alpha>0:
+  // For first alpha channel:
+  // alpha = old + new * (1 - old)
+  // For other channels if !alpha_associated:
+  // sample = ((1 - new_alpha) * old * old_alpha + new_alpha * new) / alpha
+  // For other channels if alpha_associated:
+  // sample = (1 - new_alpha) * old + new
+  // The alpha formula applies to the alpha used for the division in the other
+  // channels formula, and applies to the alpha channel itself if its
+  // blend_channel value matches itself.
+  // If using kBlendAbove, new is the patch and old is the original image; if
+  // using kBlendBelow, the meaning is inverted.
+  kBlendAbove = 4,
+  kBlendBelow = 5,
+  // The new values (in the crop) are added to the old ones if alpha>0:
+  // For first alpha channel: sample = sample = old + new * (1 - old)
+  // For other channels: sample = old + alpha * new
+  kAlphaWeightedAddAbove = 6,
+  kAlphaWeightedAddBelow = 7,
+  kNumBlendModes,
+};
+
+inline bool UsesAlpha(PatchBlendMode mode) {
+  return mode == PatchBlendMode::kBlendAbove ||
+         mode == PatchBlendMode::kBlendBelow ||
+         mode == PatchBlendMode::kAlphaWeightedAddAbove ||
+         mode == PatchBlendMode::kAlphaWeightedAddBelow;
+}
+inline bool UsesClamp(PatchBlendMode mode) {
+  return UsesAlpha(mode) || mode == PatchBlendMode::kMul;
+}
+
+struct PatchBlending {
+  PatchBlendMode mode;
+  uint32_t alpha_channel;
+  bool clamp;
+};
+
+// Position and size of the patch in the reference frame.
+struct PatchReferencePosition {
+  size_t ref, x0, y0, xsize, ysize;
+};
+
+struct PatchPosition {
+  // Position of top-left corner of the patch in the image.
+  size_t x, y;
+  size_t ref_pos_idx;
+};
+
+struct PassesSharedState;
+
+// Encoder-side helper class to encode the PatchesDictionary.
+class PatchDictionaryEncoder;
+
+class PatchDictionary {
+ public:
+  PatchDictionary() = default;
+
+  void SetPassesSharedState(const PassesSharedState* shared) {
+    shared_ = shared;
+  }
+
+  bool HasAny() const { return !positions_.empty(); }
+
+  Status Decode(BitReader* br, size_t xsize, size_t ysize,
+                bool* uses_extra_channels);
+
+  void Clear() {
+    positions_.clear();
+    ComputePatchTree();
+  }
+
+  // Adds patches to a segment of `xsize` pixels, starting at `inout`, assumed
+  // to be located at position (x0, y) in the frame.
+  void AddOneRow(float* const* inout, size_t y, size_t x0, size_t xsize) const;
+
+  // Returns dependencies of this patch dictionary on reference frame ids as a
+  // bit mask: bits 0-3 indicate reference frame 0-3.
+  int GetReferences() const;
+
+  std::vector<size_t> GetPatchesForRow(size_t y) const;
+
+ private:
+  friend class PatchDictionaryEncoder;
+
+  const PassesSharedState* shared_;
+  std::vector<PatchPosition> positions_;
+  std::vector<PatchReferencePosition> ref_positions_;
+  std::vector<PatchBlending> blendings_;
+
+  // Interval tree on the y coordinates of the patches.
+  struct PatchTreeNode {
+    ssize_t left_child;
+    ssize_t right_child;
+    size_t y_center;
+    // Range of patches in sorted_patches_y0_ and sorted_patches_y1_ that
+    // contain the row y_center.
+    size_t start;
+    size_t num;
+  };
+  std::vector<PatchTreeNode> patch_tree_;
+  // Number of patches for each row.
+  std::vector<size_t> num_patches_;
+  std::vector<std::pair<size_t, size_t>> sorted_patches_y0_;
+  std::vector<std::pair<size_t, size_t>> sorted_patches_y1_;
+
+  void ComputePatchTree();
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_DEC_PATCH_DICTIONARY_H_
diff --git a/third_party/jpeg-xl/lib/jxl/dec_tone_mapping-inl.h b/third_party/jpeg-xl/lib/jxl/dec_tone_mapping-inl.h
new file mode 100644
index 0000000000..26bf643152
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/dec_tone_mapping-inl.h
@@ -0,0 +1,234 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#if defined(LIB_JXL_DEC_TONE_MAPPING_INL_H_) == defined(HWY_TARGET_TOGGLE)
+#ifdef LIB_JXL_DEC_TONE_MAPPING_INL_H_
+#undef LIB_JXL_DEC_TONE_MAPPING_INL_H_
+#else
+#define LIB_JXL_DEC_TONE_MAPPING_INL_H_
+#endif
+
+#include <hwy/highway.h>
+
+#include "lib/jxl/transfer_functions-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+namespace {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Clamp;
+using hwy::HWY_NAMESPACE::Max;
+using hwy::HWY_NAMESPACE::ZeroIfNegative;
+
+template <typename D>
+class Rec2408ToneMapper {
+ private:
+  using V = hwy::HWY_NAMESPACE::Vec<D>;
+
+ public:
+  explicit Rec2408ToneMapper(std::pair<float, float> source_range,
+                             std::pair<float, float> target_range,
+                             const float primaries_luminances[3])
+      : source_range_(source_range),
+        target_range_(target_range),
+        red_Y_(primaries_luminances[0]),
+        green_Y_(primaries_luminances[1]),
+        blue_Y_(primaries_luminances[2]) {}
+
+  void ToneMap(V* red, V* green, V* blue) const {
+    const V luminance = Mul(Set(df_, source_range_.second),
+                            (MulAdd(Set(df_, red_Y_), *red,
+                                    MulAdd(Set(df_, green_Y_), *green,
+                                           Mul(Set(df_, blue_Y_), *blue)))));
+    const V pq_mastering_min = Set(df_, pq_mastering_min_);
+    const V inv_pq_mastering_range = Set(df_, inv_pq_mastering_range_);
+    const V normalized_pq = Min(
+        Set(df_, 1.f),
+        Mul(Sub(InvEOTF(luminance), pq_mastering_min), inv_pq_mastering_range));
+    const V ks = Set(df_, ks_);
+    const V e2 =
+        IfThenElse(Lt(normalized_pq, ks), normalized_pq, P(normalized_pq));
+    const V one_minus_e2 = Sub(Set(df_, 1), e2);
+    const V one_minus_e2_2 = Mul(one_minus_e2, one_minus_e2);
+    const V one_minus_e2_4 = Mul(one_minus_e2_2, one_minus_e2_2);
+    const V b = Set(df_, min_lum_);
+    const V e3 = MulAdd(b, one_minus_e2_4, e2);
+    const V pq_mastering_range = Set(df_, pq_mastering_range_);
+    const V e4 = MulAdd(e3, pq_mastering_range, pq_mastering_min);
+    const V new_luminance =
+        Min(Set(df_, target_range_.second),
+            ZeroIfNegative(
+                Mul(Set(df_, 10000), TF_PQ().DisplayFromEncoded(df_, e4))));
+
+    const V ratio = Div(new_luminance, luminance);
+    const V inv_target_peak = Set(df_, inv_target_peak_);
+    const V normalizer = Set(df_, normalizer_);
+    const V multiplier = Mul(ratio, normalizer);
+    for (V* const val : {red, green, blue}) {
+      *val = IfThenElse(Le(luminance, Set(df_, 1e-6f)),
+                        Mul(new_luminance, inv_target_peak),
+                        Mul(*val, multiplier));
+    }
+  }
+
+ private:
+  V InvEOTF(const V luminance) const {
+    return TF_PQ().EncodedFromDisplay(df_,
+                                      Mul(luminance, Set(df_, 1. / 10000)));
+  }
+  float InvEOTF(const float luminance) const {
+    return TF_PQ().EncodedFromDisplay(luminance / 10000.0f);
+  }
+  V T(const V a) const {
+    const V ks = Set(df_, ks_);
+    const V inv_one_minus_ks = Set(df_, inv_one_minus_ks_);
+    return Mul(Sub(a, ks), inv_one_minus_ks);
+  }
+  V P(const V b) const {
+    const V t_b = T(b);
+    const V t_b_2 = Mul(t_b, t_b);
+    const V t_b_3 = Mul(t_b_2, t_b);
+    const V ks = Set(df_, ks_);
+    const V max_lum = Set(df_, max_lum_);
+    return MulAdd(
+        MulAdd(Set(df_, 2), t_b_3, MulAdd(Set(df_, -3), t_b_2, Set(df_, 1))),
+        ks,
+        MulAdd(Add(t_b_3, MulAdd(Set(df_, -2), t_b_2, t_b)),
+               Sub(Set(df_, 1), ks),
+               Mul(MulAdd(Set(df_, -2), t_b_3, Mul(Set(df_, 3), t_b_2)),
+                   max_lum)));
+  }
+
+  D df_;
+  const std::pair<float, float> source_range_;
+  const std::pair<float, float> target_range_;
+  const float red_Y_;
+  const float green_Y_;
+  const float blue_Y_;
+
+  const float pq_mastering_min_ = InvEOTF(source_range_.first);
+  const float pq_mastering_max_ = InvEOTF(source_range_.second);
+  const float pq_mastering_range_ = pq_mastering_max_ - pq_mastering_min_;
+  const float inv_pq_mastering_range_ = 1.0f / pq_mastering_range_;
+  // TODO(eustas): divide instead of inverse-multiply?
+  const float min_lum_ = (InvEOTF(target_range_.first) - pq_mastering_min_) *
+                         inv_pq_mastering_range_;
+  // TODO(eustas): divide instead of inverse-multiply?
+  const float max_lum_ = (InvEOTF(target_range_.second) - pq_mastering_min_) *
+                         inv_pq_mastering_range_;
+  const float ks_ = 1.5f * max_lum_ - 0.5f;
+  const float b_ = min_lum_;
+
+  const float inv_one_minus_ks_ = 1.0f / std::max(1e-6f, 1.0f - ks_);
+
+  const float normalizer_ = source_range_.second / target_range_.second;
+  const float inv_target_peak_ = 1.f / target_range_.second;
+};
+
+class HlgOOTF {
+ public:
+  explicit HlgOOTF(float source_luminance, float target_luminance,
+                   const float primaries_luminances[3])
+      : HlgOOTF(/*gamma=*/std::pow(
+                    1.111f, std::log2(target_luminance / source_luminance)),
+                primaries_luminances) {}
+
+  static HlgOOTF FromSceneLight(float display_luminance,
+                                const float primaries_luminances[3]) {
+    return HlgOOTF(/*gamma=*/1.2f *
+                       std::pow(1.111f, std::log2(display_luminance / 1000.f)),
+                   primaries_luminances);
+  }
+
+  static HlgOOTF ToSceneLight(float display_luminance,
+                              const float primaries_luminances[3]) {
+    return HlgOOTF(
+        /*gamma=*/(1 / 1.2f) *
+            std::pow(1.111f, -std::log2(display_luminance / 1000.f)),
+        primaries_luminances);
+  }
+
+  template <typename V>
+  void Apply(V* red, V* green, V* blue) const {
+    hwy::HWY_NAMESPACE::DFromV<V> df;
+    if (!apply_ootf_) return;
+    const V luminance =
+        MulAdd(Set(df, red_Y_), *red,
+               MulAdd(Set(df, green_Y_), *green, Mul(Set(df, blue_Y_), *blue)));
+    const V ratio =
+        Min(FastPowf(df, luminance, Set(df, exponent_)), Set(df, 1e9));
+    *red = Mul(*red, ratio);
+    *green = Mul(*green, ratio);
+    *blue = Mul(*blue, ratio);
+  }
+
+  bool WarrantsGamutMapping() const { return apply_ootf_ && exponent_ < 0; }
+
+ private:
+  explicit HlgOOTF(float gamma, const float luminances[3])
+      : exponent_(gamma - 1),
+        red_Y_(luminances[0]),
+        green_Y_(luminances[1]),
+        blue_Y_(luminances[2]) {}
+  const float exponent_;
+  const bool apply_ootf_ = exponent_ < -0.01f || 0.01f < exponent_;
+  const float red_Y_;
+  const float green_Y_;
+  const float blue_Y_;
+};
+
+template <typename V>
+void GamutMap(V* red, V* green, V* blue, const float primaries_luminances[3],
+              float preserve_saturation = 0.1f) {
+  hwy::HWY_NAMESPACE::DFromV<V> df;
+  const V luminance =
+      MulAdd(Set(df, primaries_luminances[0]), *red,
+             MulAdd(Set(df, primaries_luminances[1]), *green,
+                    Mul(Set(df, primaries_luminances[2]), *blue)));
+
+  // Desaturate out-of-gamut pixels. This is done by mixing each pixel
+  // with just enough gray of the target luminance to make all
+  // components non-negative.
+  // - For saturation preservation, if a component is still larger than
+  // 1 then the pixel is normalized to have a maximum component of 1.
+  // That will reduce its luminance.
+  // - For luminance preservation, getting all components below 1 is
+  // done by mixing in yet more gray. That will desaturate it further.
+  V gray_mix_saturation = Zero(df);
+  V gray_mix_luminance = Zero(df);
+  for (const V* ch : {red, green, blue}) {
+    const V& val = *ch;
+    const V inv_val_minus_gray = Div(Set(df, 1), (Sub(val, luminance)));
+    gray_mix_saturation =
+        IfThenElse(Ge(val, luminance), gray_mix_saturation,
+                   Max(gray_mix_saturation, Mul(val, inv_val_minus_gray)));
+    gray_mix_luminance =
+        Max(gray_mix_luminance,
+            IfThenElse(Le(val, luminance), gray_mix_saturation,
+                       Mul(Sub(val, Set(df, 1)), inv_val_minus_gray)));
+  }
+  const V gray_mix = Clamp(
+      MulAdd(Set(df, preserve_saturation),
+             Sub(gray_mix_saturation, gray_mix_luminance), gray_mix_luminance),
+      Zero(df), Set(df, 1));
+  for (V* const val : {red, green, blue}) {
+    *val = MulAdd(gray_mix, Sub(luminance, *val), *val);
+  }
+  const V normalizer =
+      Div(Set(df, 1), Max(Set(df, 1), Max(*red, Max(*green, *blue))));
+  for (V* const val : {red, green, blue}) {
+    *val = Mul(*val, normalizer);
+  }
+}
+
+}  // namespace
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#endif  // LIB_JXL_DEC_TONE_MAPPING_INL_H_
diff --git a/third_party/jpeg-xl/lib/jxl/dec_transforms-inl.h b/third_party/jpeg-xl/lib/jxl/dec_transforms-inl.h
new file mode 100644
index 0000000000..075619b3b9
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/dec_transforms-inl.h
@@ -0,0 +1,853 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#if defined(LIB_JXL_DEC_TRANSFORMS_INL_H_) == defined(HWY_TARGET_TOGGLE)
+#ifdef LIB_JXL_DEC_TRANSFORMS_INL_H_
+#undef LIB_JXL_DEC_TRANSFORMS_INL_H_
+#else
+#define LIB_JXL_DEC_TRANSFORMS_INL_H_
+#endif
+
+#include <stddef.h>
+
+#include <hwy/highway.h>
+
+#include "lib/jxl/ac_strategy.h"
+#include "lib/jxl/coeff_order_fwd.h"
+#include "lib/jxl/dct-inl.h"
+#include "lib/jxl/dct_scales.h"
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+namespace {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::MulAdd;
+
+// Computes the lowest-frequency LF_ROWSxLF_COLS-sized square in output, which
+// is a DCT_ROWS*DCT_COLS-sized DCT block, by doing a ROWS*COLS DCT on the
+// input block.
+template <size_t DCT_ROWS, size_t DCT_COLS, size_t LF_ROWS, size_t LF_COLS,
+          size_t ROWS, size_t COLS>
+JXL_INLINE void ReinterpretingDCT(const float* input, const size_t input_stride,
+                                  float* output, const size_t output_stride) {
+  static_assert(LF_ROWS == ROWS,
+                "ReinterpretingDCT should only be called with LF == N");
+  static_assert(LF_COLS == COLS,
+                "ReinterpretingDCT should only be called with LF == N");
+  HWY_ALIGN float block[ROWS * COLS];
+
+  // ROWS, COLS <= 8, so we can put scratch space on the stack.
+  HWY_ALIGN float scratch_space[ROWS * COLS];
+  ComputeScaledDCT<ROWS, COLS>()(DCTFrom(input, input_stride), block,
+                                 scratch_space);
+  if (ROWS < COLS) {
+    for (size_t y = 0; y < LF_ROWS; y++) {
+      for (size_t x = 0; x < LF_COLS; x++) {
+        output[y * output_stride + x] =
+            block[y * COLS + x] * DCTTotalResampleScale<ROWS, DCT_ROWS>(y) *
+            DCTTotalResampleScale<COLS, DCT_COLS>(x);
+      }
+    }
+  } else {
+    for (size_t y = 0; y < LF_COLS; y++) {
+      for (size_t x = 0; x < LF_ROWS; x++) {
+        output[y * output_stride + x] =
+            block[y * ROWS + x] * DCTTotalResampleScale<COLS, DCT_COLS>(y) *
+            DCTTotalResampleScale<ROWS, DCT_ROWS>(x);
+      }
+    }
+  }
+}
+
+template <size_t S>
+void IDCT2TopBlock(const float* block, size_t stride_out, float* out) {
+  static_assert(kBlockDim % S == 0, "S should be a divisor of kBlockDim");
+  static_assert(S % 2 == 0, "S should be even");
+  float temp[kDCTBlockSize];
+  constexpr size_t num_2x2 = S / 2;
+  for (size_t y = 0; y < num_2x2; y++) {
+    for (size_t x = 0; x < num_2x2; x++) {
+      float c00 = block[y * kBlockDim + x];
+      float c01 = block[y * kBlockDim + num_2x2 + x];
+      float c10 = block[(y + num_2x2) * kBlockDim + x];
+      float c11 = block[(y + num_2x2) * kBlockDim + num_2x2 + x];
+      float r00 = c00 + c01 + c10 + c11;
+      float r01 = c00 + c01 - c10 - c11;
+      float r10 = c00 - c01 + c10 - c11;
+      float r11 = c00 - c01 - c10 + c11;
+      temp[y * 2 * kBlockDim + x * 2] = r00;
+      temp[y * 2 * kBlockDim + x * 2 + 1] = r01;
+      temp[(y * 2 + 1) * kBlockDim + x * 2] = r10;
+      temp[(y * 2 + 1) * kBlockDim + x * 2 + 1] = r11;
+    }
+  }
+  for (size_t y = 0; y < S; y++) {
+    for (size_t x = 0; x < S; x++) {
+      out[y * stride_out + x] = temp[y * kBlockDim + x];
+    }
+  }
+}
+
+void AFVIDCT4x4(const float* JXL_RESTRICT coeffs, float* JXL_RESTRICT pixels) {
+  HWY_ALIGN static constexpr float k4x4AFVBasis[16][16] = {
+      {
+          0.25,
+          0.25,
+          0.25,
+          0.25,
+          0.25,
+          0.25,
+          0.25,
+          0.25,
+          0.25,
+          0.25,
+          0.25,
+          0.25,
+          0.25,
+          0.25,
+          0.25,
+          0.25,
+      },
+      {
+          0.876902929799142f,
+          0.2206518106944235f,
+          -0.10140050393753763f,
+          -0.1014005039375375f,
+          0.2206518106944236f,
+          -0.10140050393753777f,
+          -0.10140050393753772f,
+          -0.10140050393753763f,
+          -0.10140050393753758f,
+          -0.10140050393753769f,
+          -0.1014005039375375f,
+          -0.10140050393753768f,
+          -0.10140050393753768f,
+          -0.10140050393753759f,
+          -0.10140050393753763f,
+          -0.10140050393753741f,
+      },
+      {
+          0.0,
+          0.0,
+          0.40670075830260755f,
+          0.44444816619734445f,
+          0.0,
+          0.0,
+          0.19574399372042936f,
+          0.2929100136981264f,
+          -0.40670075830260716f,
+          -0.19574399372042872f,
+          0.0,
+          0.11379074460448091f,
+          -0.44444816619734384f,
+          -0.29291001369812636f,
+          -0.1137907446044814f,
+          0.0,
+      },
+      {
+          0.0,
+          0.0,
+          -0.21255748058288748f,
+          0.3085497062849767f,
+          0.0,
+          0.4706702258572536f,
+          -0.1621205195722993f,
+          0.0,
+          -0.21255748058287047f,
+          -0.16212051957228327f,
+          -0.47067022585725277f,
+          -0.1464291867126764f,
+          0.3085497062849487f,
+          0.0,
+          -0.14642918671266536f,
+          0.4251149611657548f,
+      },
+      {
+          0.0,
+          -0.7071067811865474f,
+          0.0,
+          0.0,
+          0.7071067811865476f,
+          0.0,
+          0.0,
+          0.0,
+          0.0,
+          0.0,
+          0.0,
+          0.0,
+          0.0,
+          0.0,
+          0.0,
+          0.0,
+      },
+      {
+          -0.4105377591765233f,
+          0.6235485373547691f,
+          -0.06435071657946274f,
+          -0.06435071657946266f,
+          0.6235485373547694f,
+          -0.06435071657946284f,
+          -0.0643507165794628f,
+          -0.06435071657946274f,
+          -0.06435071657946272f,
+          -0.06435071657946279f,
+          -0.06435071657946266f,
+          -0.06435071657946277f,
+          -0.06435071657946277f,
+          -0.06435071657946273f,
+          -0.06435071657946274f,
+          -0.0643507165794626f,
+      },
+      {
+          0.0,
+          0.0,
+          -0.4517556589999482f,
+          0.15854503551840063f,
+          0.0,
+          -0.04038515160822202f,
+          0.0074182263792423875f,
+          0.39351034269210167f,
+          -0.45175565899994635f,
+          0.007418226379244351f,
+          0.1107416575309343f,
+          0.08298163094882051f,
+          0.15854503551839705f,
+          0.3935103426921022f,
+          0.0829816309488214f,
+          -0.45175565899994796f,
+      },
+      {
+          0.0,
+          0.0,
+          -0.304684750724869f,
+          0.5112616136591823f,
+          0.0,
+          0.0,
+          -0.290480129728998f,
+          -0.06578701549142804f,
+          0.304684750724884f,
+          0.2904801297290076f,
+          0.0,
+          -0.23889773523344604f,
+          -0.5112616136592012f,
+          0.06578701549142545f,
+          0.23889773523345467f,
+          0.0,
+      },
+      {
+          0.0,
+          0.0,
+          0.3017929516615495f,
+          0.25792362796341184f,
+          0.0,
+          0.16272340142866204f,
+          0.09520022653475037f,
+          0.0,
+          0.3017929516615503f,
+          0.09520022653475055f,
+          -0.16272340142866173f,
+          -0.35312385449816297f,
+          0.25792362796341295f,
+          0.0,
+          -0.3531238544981624f,
+          -0.6035859033230976f,
+      },
+      {
+          0.0,
+          0.0,
+          0.40824829046386274f,
+          0.0,
+          0.0,
+          0.0,
+          0.0,
+          -0.4082482904638628f,
+          -0.4082482904638635f,
+          0.0,
+          0.0,
+          -0.40824829046386296f,
+          0.0,
+          0.4082482904638634f,
+          0.408248290463863f,
+          0.0,
+      },
+      {
+          0.0,
+          0.0,
+          0.1747866975480809f,
+          0.0812611176717539f,
+          0.0,
+          0.0,
+          -0.3675398009862027f,
+          -0.307882213957909f,
+          -0.17478669754808135f,
+          0.3675398009862011f,
+          0.0,
+          0.4826689115059883f,
+          -0.08126111767175039f,
+          0.30788221395790305f,
+          -0.48266891150598584f,
+          0.0,
+      },
+      {
+          0.0,
+          0.0,
+          -0.21105601049335784f,
+          0.18567180916109802f,
+          0.0,
+          0.0,
+          0.49215859013738733f,
+          -0.38525013709251915f,
+          0.21105601049335806f,
+          -0.49215859013738905f,
+          0.0,
+          0.17419412659916217f,
+          -0.18567180916109904f,
+          0.3852501370925211f,
+          -0.1741941265991621f,
+          0.0,
+      },
+      {
+          0.0,
+          0.0,
+          -0.14266084808807264f,
+          -0.3416446842253372f,
+          0.0,
+          0.7367497537172237f,
+          0.24627107722075148f,
+          -0.08574019035519306f,
+          -0.14266084808807344f,
+          0.24627107722075137f,
+          0.14883399227113567f,
+          -0.04768680350229251f,
+          -0.3416446842253373f,
+          -0.08574019035519267f,
+          -0.047686803502292804f,
+          -0.14266084808807242f,
+      },
+      {
+          0.0,
+          0.0,
+          -0.13813540350758585f,
+          0.3302282550303788f,
+          0.0,
+          0.08755115000587084f,
+          -0.07946706605909573f,
+          -0.4613374887461511f,
+          -0.13813540350758294f,
+          -0.07946706605910261f,
+          0.49724647109535086f,
+          0.12538059448563663f,
+          0.3302282550303805f,
+          -0.4613374887461554f,
+          0.12538059448564315f,
+          -0.13813540350758452f,
+      },
+      {
+          0.0,
+          0.0,
+          -0.17437602599651067f,
+          0.0702790691196284f,
+          0.0,
+          -0.2921026642334881f,
+          0.3623817333531167f,
+          0.0,
+          -0.1743760259965108f,
+          0.36238173335311646f,
+          0.29210266423348785f,
+          -0.4326608024727445f,
+          0.07027906911962818f,
+          0.0,
+          -0.4326608024727457f,
+          0.34875205199302267f,
+      },
+      {
+          0.0,
+          0.0,
+          0.11354987314994337f,
+          -0.07417504595810355f,
+          0.0,
+          0.19402893032594343f,
+          -0.435190496523228f,
+          0.21918684838857466f,
+          0.11354987314994257f,
+          -0.4351904965232251f,
+          0.5550443808910661f,
+          -0.25468277124066463f,
+          -0.07417504595810233f,
+          0.2191868483885728f,
+          -0.25468277124066413f,
+          0.1135498731499429f,
+      },
+  };
+
+  const HWY_CAPPED(float, 16) d;
+  for (size_t i = 0; i < 16; i += Lanes(d)) {
+    auto pixel = Zero(d);
+    for (size_t j = 0; j < 16; j++) {
+      auto cf = Set(d, coeffs[j]);
+      auto basis = Load(d, k4x4AFVBasis[j] + i);
+      pixel = MulAdd(cf, basis, pixel);
+    }
+    Store(pixel, d, pixels + i);
+  }
+}
+
+template <size_t afv_kind>
+void AFVTransformToPixels(const float* JXL_RESTRICT coefficients,
+                          float* JXL_RESTRICT pixels, size_t pixels_stride) {
+  HWY_ALIGN float scratch_space[4 * 8];
+  size_t afv_x = afv_kind & 1;
+  size_t afv_y = afv_kind / 2;
+  float dcs[3] = {};
+  float block00 = coefficients[0];
+  float block01 = coefficients[1];
+  float block10 = coefficients[8];
+  dcs[0] = (block00 + block10 + block01) * 4.0f;
+  dcs[1] = (block00 + block10 - block01);
+  dcs[2] = block00 - block10;
+  // IAFV: (even, even) positions.
+  HWY_ALIGN float coeff[4 * 4];
+  coeff[0] = dcs[0];
+  for (size_t iy = 0; iy < 4; iy++) {
+    for (size_t ix = 0; ix < 4; ix++) {
+      if (ix == 0 && iy == 0) continue;
+      coeff[iy * 4 + ix] = coefficients[iy * 2 * 8 + ix * 2];
+    }
+  }
+  HWY_ALIGN float block[4 * 8];
+  AFVIDCT4x4(coeff, block);
+  for (size_t iy = 0; iy < 4; iy++) {
+    for (size_t ix = 0; ix < 4; ix++) {
+      pixels[(iy + afv_y * 4) * pixels_stride + afv_x * 4 + ix] =
+          block[(afv_y == 1 ? 3 - iy : iy) * 4 + (afv_x == 1 ? 3 - ix : ix)];
+    }
+  }
+  // IDCT4x4 in (odd, even) positions.
+  block[0] = dcs[1];
+  for (size_t iy = 0; iy < 4; iy++) {
+    for (size_t ix = 0; ix < 4; ix++) {
+      if (ix == 0 && iy == 0) continue;
+      block[iy * 4 + ix] = coefficients[iy * 2 * 8 + ix * 2 + 1];
+    }
+  }
+  ComputeScaledIDCT<4, 4>()(
+      block,
+      DCTTo(pixels + afv_y * 4 * pixels_stride + (afv_x == 1 ? 0 : 4),
+            pixels_stride),
+      scratch_space);
+  // IDCT4x8.
+  block[0] = dcs[2];
+  for (size_t iy = 0; iy < 4; iy++) {
+    for (size_t ix = 0; ix < 8; ix++) {
+      if (ix == 0 && iy == 0) continue;
+      block[iy * 8 + ix] = coefficients[(1 + iy * 2) * 8 + ix];
+    }
+  }
+  ComputeScaledIDCT<4, 8>()(
+      block,
+      DCTTo(pixels + (afv_y == 1 ? 0 : 4) * pixels_stride, pixels_stride),
+      scratch_space);
+}
+
+HWY_MAYBE_UNUSED void TransformToPixels(const AcStrategy::Type strategy,
+                                        float* JXL_RESTRICT coefficients,
+                                        float* JXL_RESTRICT pixels,
+                                        size_t pixels_stride,
+                                        float* scratch_space) {
+  using Type = AcStrategy::Type;
+  switch (strategy) {
+    case Type::IDENTITY: {
+      PROFILER_ZONE("IDCT Identity");
+      float dcs[4] = {};
+      float block00 = coefficients[0];
+      float block01 = coefficients[1];
+      float block10 = coefficients[8];
+      float block11 = coefficients[9];
+      dcs[0] = block00 + block01 + block10 + block11;
+      dcs[1] = block00 + block01 - block10 - block11;
+      dcs[2] = block00 - block01 + block10 - block11;
+      dcs[3] = block00 - block01 - block10 + block11;
+      for (size_t y = 0; y < 2; y++) {
+        for (size_t x = 0; x < 2; x++) {
+          float block_dc = dcs[y * 2 + x];
+          float residual_sum = 0;
+          for (size_t iy = 0; iy < 4; iy++) {
+            for (size_t ix = 0; ix < 4; ix++) {
+              if (ix == 0 && iy == 0) continue;
+              residual_sum += coefficients[(y + iy * 2) * 8 + x + ix * 2];
+            }
+          }
+          pixels[(4 * y + 1) * pixels_stride + 4 * x + 1] =
+              block_dc - residual_sum * (1.0f / 16);
+          for (size_t iy = 0; iy < 4; iy++) {
+            for (size_t ix = 0; ix < 4; ix++) {
+              if (ix == 1 && iy == 1) continue;
+              pixels[(y * 4 + iy) * pixels_stride + x * 4 + ix] =
+                  coefficients[(y + iy * 2) * 8 + x + ix * 2] +
+                  pixels[(4 * y + 1) * pixels_stride + 4 * x + 1];
+            }
+          }
+          pixels[y * 4 * pixels_stride + x * 4] =
+              coefficients[(y + 2) * 8 + x + 2] +
+              pixels[(4 * y + 1) * pixels_stride + 4 * x + 1];
+        }
+      }
+      break;
+    }
+    case Type::DCT8X4: {
+      PROFILER_ZONE("IDCT 8x4");
+      float dcs[2] = {};
+      float block0 = coefficients[0];
+      float block1 = coefficients[8];
+      dcs[0] = block0 + block1;
+      dcs[1] = block0 - block1;
+      for (size_t x = 0; x < 2; x++) {
+        HWY_ALIGN float block[4 * 8];
+        block[0] = dcs[x];
+        for (size_t iy = 0; iy < 4; iy++) {
+          for (size_t ix = 0; ix < 8; ix++) {
+            if (ix == 0 && iy == 0) continue;
+            block[iy * 8 + ix] = coefficients[(x + iy * 2) * 8 + ix];
+          }
+        }
+        ComputeScaledIDCT<8, 4>()(block, DCTTo(pixels + x * 4, pixels_stride),
+                                  scratch_space);
+      }
+      break;
+    }
+    case Type::DCT4X8: {
+      PROFILER_ZONE("IDCT 4x8");
+      float dcs[2] = {};
+      float block0 = coefficients[0];
+      float block1 = coefficients[8];
+      dcs[0] = block0 + block1;
+      dcs[1] = block0 - block1;
+      for (size_t y = 0; y < 2; y++) {
+        HWY_ALIGN float block[4 * 8];
+        block[0] = dcs[y];
+        for (size_t iy = 0; iy < 4; iy++) {
+          for (size_t ix = 0; ix < 8; ix++) {
+            if (ix == 0 && iy == 0) continue;
+            block[iy * 8 + ix] = coefficients[(y + iy * 2) * 8 + ix];
+          }
+        }
+        ComputeScaledIDCT<4, 8>()(
+            block, DCTTo(pixels + y * 4 * pixels_stride, pixels_stride),
+            scratch_space);
+      }
+      break;
+    }
+    case Type::DCT4X4: {
+      PROFILER_ZONE("IDCT 4");
+      float dcs[4] = {};
+      float block00 = coefficients[0];
+      float block01 = coefficients[1];
+      float block10 = coefficients[8];
+      float block11 = coefficients[9];
+      dcs[0] = block00 + block01 + block10 + block11;
+      dcs[1] = block00 + block01 - block10 - block11;
+      dcs[2] = block00 - block01 + block10 - block11;
+      dcs[3] = block00 - block01 - block10 + block11;
+      for (size_t y = 0; y < 2; y++) {
+        for (size_t x = 0; x < 2; x++) {
+          HWY_ALIGN float block[4 * 4];
+          block[0] = dcs[y * 2 + x];
+          for (size_t iy = 0; iy < 4; iy++) {
+            for (size_t ix = 0; ix < 4; ix++) {
+              if (ix == 0 && iy == 0) continue;
+              block[iy * 4 + ix] = coefficients[(y + iy * 2) * 8 + x + ix * 2];
+            }
+          }
+          ComputeScaledIDCT<4, 4>()(
+              block,
+              DCTTo(pixels + y * 4 * pixels_stride + x * 4, pixels_stride),
+              scratch_space);
+        }
+      }
+      break;
+    }
+    case Type::DCT2X2: {
+      PROFILER_ZONE("IDCT 2");
+      HWY_ALIGN float coeffs[kDCTBlockSize];
+      memcpy(coeffs, coefficients, sizeof(float) * kDCTBlockSize);
+      IDCT2TopBlock<2>(coeffs, kBlockDim, coeffs);
+      IDCT2TopBlock<4>(coeffs, kBlockDim, coeffs);
+      IDCT2TopBlock<8>(coeffs, kBlockDim, coeffs);
+      for (size_t y = 0; y < kBlockDim; y++) {
+        for (size_t x = 0; x < kBlockDim; x++) {
+          pixels[y * pixels_stride + x] = coeffs[y * kBlockDim + x];
+        }
+      }
+      break;
+    }
+    case Type::DCT16X16: {
+      PROFILER_ZONE("IDCT 16");
+      ComputeScaledIDCT<16, 16>()(coefficients, DCTTo(pixels, pixels_stride),
+                                  scratch_space);
+      break;
+    }
+    case Type::DCT16X8: {
+      PROFILER_ZONE("IDCT 16x8");
+      ComputeScaledIDCT<16, 8>()(coefficients, DCTTo(pixels, pixels_stride),
+                                 scratch_space);
+      break;
+    }
+    case Type::DCT8X16: {
+      PROFILER_ZONE("IDCT 8x16");
+      ComputeScaledIDCT<8, 16>()(coefficients, DCTTo(pixels, pixels_stride),
+                                 scratch_space);
+      break;
+    }
+    case Type::DCT32X8: {
+      PROFILER_ZONE("IDCT 32x8");
+      ComputeScaledIDCT<32, 8>()(coefficients, DCTTo(pixels, pixels_stride),
+                                 scratch_space);
+      break;
+    }
+    case Type::DCT8X32: {
+      PROFILER_ZONE("IDCT 8x32");
+      ComputeScaledIDCT<8, 32>()(coefficients, DCTTo(pixels, pixels_stride),
+                                 scratch_space);
+      break;
+    }
+    case Type::DCT32X16: {
+      PROFILER_ZONE("IDCT 32x16");
+      ComputeScaledIDCT<32, 16>()(coefficients, DCTTo(pixels, pixels_stride),
+                                  scratch_space);
+      break;
+    }
+    case Type::DCT16X32: {
+      PROFILER_ZONE("IDCT 16x32");
+      ComputeScaledIDCT<16, 32>()(coefficients, DCTTo(pixels, pixels_stride),
+                                  scratch_space);
+      break;
+    }
+    case Type::DCT32X32: {
+      PROFILER_ZONE("IDCT 32");
+      ComputeScaledIDCT<32, 32>()(coefficients, DCTTo(pixels, pixels_stride),
+                                  scratch_space);
+      break;
+    }
+    case Type::DCT: {
+      PROFILER_ZONE("IDCT 8");
+      ComputeScaledIDCT<8, 8>()(coefficients, DCTTo(pixels, pixels_stride),
+                                scratch_space);
+      break;
+    }
+    case Type::AFV0: {
+      PROFILER_ZONE("IAFV0");
+      AFVTransformToPixels<0>(coefficients, pixels, pixels_stride);
+      break;
+    }
+    case Type::AFV1: {
+      PROFILER_ZONE("IAFV1");
+      AFVTransformToPixels<1>(coefficients, pixels, pixels_stride);
+      break;
+    }
+    case Type::AFV2: {
+      PROFILER_ZONE("IAFV2");
+      AFVTransformToPixels<2>(coefficients, pixels, pixels_stride);
+      break;
+    }
+    case Type::AFV3: {
+      PROFILER_ZONE("IAFV3");
+      AFVTransformToPixels<3>(coefficients, pixels, pixels_stride);
+      break;
+    }
+    case Type::DCT64X32: {
+      PROFILER_ZONE("IDCT 64x32");
+      ComputeScaledIDCT<64, 32>()(coefficients, DCTTo(pixels, pixels_stride),
+                                  scratch_space);
+      break;
+    }
+    case Type::DCT32X64: {
+      PROFILER_ZONE("IDCT 32x64");
+      ComputeScaledIDCT<32, 64>()(coefficients, DCTTo(pixels, pixels_stride),
+                                  scratch_space);
+      break;
+    }
+    case Type::DCT64X64: {
+      PROFILER_ZONE("IDCT 64");
+      ComputeScaledIDCT<64, 64>()(coefficients, DCTTo(pixels, pixels_stride),
+                                  scratch_space);
+      break;
+    }
+    case Type::DCT128X64: {
+      PROFILER_ZONE("IDCT 128x64");
+      ComputeScaledIDCT<128, 64>()(coefficients, DCTTo(pixels, pixels_stride),
+                                   scratch_space);
+      break;
+    }
+    case Type::DCT64X128: {
+      PROFILER_ZONE("IDCT 64x128");
+      ComputeScaledIDCT<64, 128>()(coefficients, DCTTo(pixels, pixels_stride),
+                                   scratch_space);
+      break;
+    }
+    case Type::DCT128X128: {
+      PROFILER_ZONE("IDCT 128");
+      ComputeScaledIDCT<128, 128>()(coefficients, DCTTo(pixels, pixels_stride),
+                                    scratch_space);
+      break;
+    }
+    case Type::DCT256X128: {
+      PROFILER_ZONE("IDCT 256x128");
+      ComputeScaledIDCT<256, 128>()(coefficients, DCTTo(pixels, pixels_stride),
+                                    scratch_space);
+      break;
+    }
+    case Type::DCT128X256: {
+      PROFILER_ZONE("IDCT 128x256");
+      ComputeScaledIDCT<128, 256>()(coefficients, DCTTo(pixels, pixels_stride),
+                                    scratch_space);
+      break;
+    }
+    case Type::DCT256X256: {
+      PROFILER_ZONE("IDCT 256");
+      ComputeScaledIDCT<256, 256>()(coefficients, DCTTo(pixels, pixels_stride),
+                                    scratch_space);
+      break;
+    }
+    case Type::kNumValidStrategies:
+      JXL_ABORT("Invalid strategy");
+  }
+}
+
+HWY_MAYBE_UNUSED void LowestFrequenciesFromDC(const AcStrategy::Type strategy,
+                                              const float* dc, size_t dc_stride,
+                                              float* llf) {
+  using Type = AcStrategy::Type;
+  switch (strategy) {
+    case Type::DCT16X8: {
+      ReinterpretingDCT</*DCT_ROWS=*/2 * kBlockDim, /*DCT_COLS=*/kBlockDim,
+                        /*LF_ROWS=*/2, /*LF_COLS=*/1, /*ROWS=*/2, /*COLS=*/1>(
+          dc, dc_stride, llf, 2 * kBlockDim);
+      break;
+    }
+    case Type::DCT8X16: {
+      ReinterpretingDCT</*DCT_ROWS=*/kBlockDim, /*DCT_COLS=*/2 * kBlockDim,
+                        /*LF_ROWS=*/1, /*LF_COLS=*/2, /*ROWS=*/1, /*COLS=*/2>(
+          dc, dc_stride, llf, 2 * kBlockDim);
+      break;
+    }
+    case Type::DCT16X16: {
+      ReinterpretingDCT</*DCT_ROWS=*/2 * kBlockDim, /*DCT_COLS=*/2 * kBlockDim,
+                        /*LF_ROWS=*/2, /*LF_COLS=*/2, /*ROWS=*/2, /*COLS=*/2>(
+          dc, dc_stride, llf, 2 * kBlockDim);
+      break;
+    }
+    case Type::DCT32X8: {
+      ReinterpretingDCT</*DCT_ROWS=*/4 * kBlockDim, /*DCT_COLS=*/kBlockDim,
+                        /*LF_ROWS=*/4, /*LF_COLS=*/1, /*ROWS=*/4, /*COLS=*/1>(
+          dc, dc_stride, llf, 4 * kBlockDim);
+      break;
+    }
+    case Type::DCT8X32: {
+      ReinterpretingDCT</*DCT_ROWS=*/kBlockDim, /*DCT_COLS=*/4 * kBlockDim,
+                        /*LF_ROWS=*/1, /*LF_COLS=*/4, /*ROWS=*/1, /*COLS=*/4>(
+          dc, dc_stride, llf, 4 * kBlockDim);
+      break;
+    }
+    case Type::DCT32X16: {
+      ReinterpretingDCT</*DCT_ROWS=*/4 * kBlockDim, /*DCT_COLS=*/2 * kBlockDim,
+                        /*LF_ROWS=*/4, /*LF_COLS=*/2, /*ROWS=*/4, /*COLS=*/2>(
+          dc, dc_stride, llf, 4 * kBlockDim);
+      break;
+    }
+    case Type::DCT16X32: {
+      ReinterpretingDCT</*DCT_ROWS=*/2 * kBlockDim, /*DCT_COLS=*/4 * kBlockDim,
+                        /*LF_ROWS=*/2, /*LF_COLS=*/4, /*ROWS=*/2, /*COLS=*/4>(
+          dc, dc_stride, llf, 4 * kBlockDim);
+      break;
+    }
+    case Type::DCT32X32: {
+      ReinterpretingDCT</*DCT_ROWS=*/4 * kBlockDim, /*DCT_COLS=*/4 * kBlockDim,
+                        /*LF_ROWS=*/4, /*LF_COLS=*/4, /*ROWS=*/4, /*COLS=*/4>(
+          dc, dc_stride, llf, 4 * kBlockDim);
+      break;
+    }
+    case Type::DCT64X32: {
+      ReinterpretingDCT</*DCT_ROWS=*/8 * kBlockDim, /*DCT_COLS=*/4 * kBlockDim,
+                        /*LF_ROWS=*/8, /*LF_COLS=*/4, /*ROWS=*/8, /*COLS=*/4>(
+          dc, dc_stride, llf, 8 * kBlockDim);
+      break;
+    }
+    case Type::DCT32X64: {
+      ReinterpretingDCT</*DCT_ROWS=*/4 * kBlockDim, /*DCT_COLS=*/8 * kBlockDim,
+                        /*LF_ROWS=*/4, /*LF_COLS=*/8, /*ROWS=*/4, /*COLS=*/8>(
+          dc, dc_stride, llf, 8 * kBlockDim);
+      break;
+    }
+    case Type::DCT64X64: {
+      ReinterpretingDCT</*DCT_ROWS=*/8 * kBlockDim, /*DCT_COLS=*/8 * kBlockDim,
+                        /*LF_ROWS=*/8, /*LF_COLS=*/8, /*ROWS=*/8, /*COLS=*/8>(
+          dc, dc_stride, llf, 8 * kBlockDim);
+      break;
+    }
+    case Type::DCT128X64: {
+      ReinterpretingDCT</*DCT_ROWS=*/16 * kBlockDim, /*DCT_COLS=*/8 * kBlockDim,
+                        /*LF_ROWS=*/16, /*LF_COLS=*/8, /*ROWS=*/16, /*COLS=*/8>(
+          dc, dc_stride, llf, 16 * kBlockDim);
+      break;
+    }
+    case Type::DCT64X128: {
+      ReinterpretingDCT</*DCT_ROWS=*/8 * kBlockDim, /*DCT_COLS=*/16 * kBlockDim,
+                        /*LF_ROWS=*/8, /*LF_COLS=*/16, /*ROWS=*/8, /*COLS=*/16>(
+          dc, dc_stride, llf, 16 * kBlockDim);
+      break;
+    }
+    case Type::DCT128X128: {
+      ReinterpretingDCT<
+          /*DCT_ROWS=*/16 * kBlockDim, /*DCT_COLS=*/16 * kBlockDim,
+          /*LF_ROWS=*/16, /*LF_COLS=*/16, /*ROWS=*/16, /*COLS=*/16>(
+          dc, dc_stride, llf, 16 * kBlockDim);
+      break;
+    }
+    case Type::DCT256X128: {
+      ReinterpretingDCT<
+          /*DCT_ROWS=*/32 * kBlockDim, /*DCT_COLS=*/16 * kBlockDim,
+          /*LF_ROWS=*/32, /*LF_COLS=*/16, /*ROWS=*/32, /*COLS=*/16>(
+          dc, dc_stride, llf, 32 * kBlockDim);
+      break;
+    }
+    case Type::DCT128X256: {
+      ReinterpretingDCT<
+          /*DCT_ROWS=*/16 * kBlockDim, /*DCT_COLS=*/32 * kBlockDim,
+          /*LF_ROWS=*/16, /*LF_COLS=*/32, /*ROWS=*/16, /*COLS=*/32>(
+          dc, dc_stride, llf, 32 * kBlockDim);
+      break;
+    }
+    case Type::DCT256X256: {
+      ReinterpretingDCT<
+          /*DCT_ROWS=*/32 * kBlockDim, /*DCT_COLS=*/32 * kBlockDim,
+          /*LF_ROWS=*/32, /*LF_COLS=*/32, /*ROWS=*/32, /*COLS=*/32>(
+          dc, dc_stride, llf, 32 * kBlockDim);
+      break;
+    }
+    case Type::DCT:
+    case Type::DCT2X2:
+    case Type::DCT4X4:
+    case Type::DCT4X8:
+    case Type::DCT8X4:
+    case Type::AFV0:
+    case Type::AFV1:
+    case Type::AFV2:
+    case Type::AFV3:
+    case Type::IDENTITY:
+      llf[0] = dc[0];
+      break;
+    case Type::kNumValidStrategies:
+      JXL_ABORT("Invalid strategy");
+  };
+}
+
+}  // namespace
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#endif  // LIB_JXL_DEC_TRANSFORMS_INL_H_
diff --git a/third_party/jpeg-xl/lib/jxl/dec_transforms_testonly.cc b/third_party/jpeg-xl/lib/jxl/dec_transforms_testonly.cc
new file mode 100644
index 0000000000..9ee80c59dc
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/dec_transforms_testonly.cc
@@ -0,0 +1,41 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/dec_transforms_testonly.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/dec_transforms_testonly.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/dct_scales.h"
+#include "lib/jxl/dec_transforms-inl.h"
+
+namespace jxl {
+
+#if HWY_ONCE
+HWY_EXPORT(TransformToPixels);
+void TransformToPixels(AcStrategy::Type strategy,
+                       float* JXL_RESTRICT coefficients,
+                       float* JXL_RESTRICT pixels, size_t pixels_stride,
+                       float* scratch_space) {
+  return HWY_DYNAMIC_DISPATCH(TransformToPixels)(strategy, coefficients, pixels,
+                                                 pixels_stride, scratch_space);
+}
+
+HWY_EXPORT(LowestFrequenciesFromDC);
+void LowestFrequenciesFromDC(const jxl::AcStrategy::Type strategy,
+                             const float* dc, size_t dc_stride, float* llf) {
+  return HWY_DYNAMIC_DISPATCH(LowestFrequenciesFromDC)(strategy, dc, dc_stride,
+                                                       llf);
+}
+
+HWY_EXPORT(AFVIDCT4x4);
+void AFVIDCT4x4(const float* JXL_RESTRICT coeffs, float* JXL_RESTRICT pixels) {
+  return HWY_DYNAMIC_DISPATCH(AFVIDCT4x4)(coeffs, pixels);
+}
+#endif  // HWY_ONCE
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/dec_transforms_testonly.h b/third_party/jpeg-xl/lib/jxl/dec_transforms_testonly.h
new file mode 100644
index 0000000000..97c4ca543d
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/dec_transforms_testonly.h
@@ -0,0 +1,32 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_DEC_TRANSFORMS_TESTONLY_H_
+#define LIB_JXL_DEC_TRANSFORMS_TESTONLY_H_
+
+// Facade for (non-inlined) inverse integral transforms.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "lib/jxl/ac_strategy.h"
+#include "lib/jxl/base/compiler_specific.h"
+
+namespace jxl {
+
+void TransformToPixels(AcStrategy::Type strategy,
+                       float* JXL_RESTRICT coefficients,
+                       float* JXL_RESTRICT pixels, size_t pixels_stride,
+                       float* JXL_RESTRICT scratch_space);
+
+// Equivalent of the above for DC image.
+void LowestFrequenciesFromDC(const jxl::AcStrategy::Type strategy,
+                             const float* dc, size_t dc_stride, float* llf);
+
+void AFVIDCT4x4(const float* JXL_RESTRICT coeffs, float* JXL_RESTRICT pixels);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_DEC_TRANSFORMS_TESTONLY_H_
diff --git a/third_party/jpeg-xl/lib/jxl/dec_xyb-inl.h b/third_party/jpeg-xl/lib/jxl/dec_xyb-inl.h
new file mode 100644
index 0000000000..a4f24cd123
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/dec_xyb-inl.h
@@ -0,0 +1,346 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// XYB -> linear sRGB helper function.
+
+#if defined(LIB_JXL_DEC_XYB_INL_H_) == defined(HWY_TARGET_TOGGLE)
+#ifdef LIB_JXL_DEC_XYB_INL_H_
+#undef LIB_JXL_DEC_XYB_INL_H_
+#else
+#define LIB_JXL_DEC_XYB_INL_H_
+#endif
+
+#include <hwy/highway.h>
+
+#include "lib/jxl/dec_xyb.h"
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+namespace {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Add;
+using hwy::HWY_NAMESPACE::Broadcast;
+using hwy::HWY_NAMESPACE::Mul;
+using hwy::HWY_NAMESPACE::MulAdd;
+using hwy::HWY_NAMESPACE::Sub;
+
+// Inverts the pixel-wise RGB->XYB conversion in OpsinDynamicsImage() (including
+// the gamma mixing and simple gamma). Avoids clamping to [0, 1] - out of (sRGB)
+// gamut values may be in-gamut after transforming to a wider space.
+// "inverse_matrix" points to 9 broadcasted vectors, which are the 3x3 entries
+// of the (row-major) opsin absorbance matrix inverse. Pre-multiplying its
+// entries by c is equivalent to multiplying linear_* by c afterwards.
+template <class D, class V>
+HWY_INLINE HWY_MAYBE_UNUSED void XybToRgb(D d, const V opsin_x, const V opsin_y,
+                                          const V opsin_b,
+                                          const OpsinParams& opsin_params,
+                                          V* const HWY_RESTRICT linear_r,
+                                          V* const HWY_RESTRICT linear_g,
+                                          V* const HWY_RESTRICT linear_b) {
+#if HWY_TARGET == HWY_SCALAR
+  const auto neg_bias_r = Set(d, opsin_params.opsin_biases[0]);
+  const auto neg_bias_g = Set(d, opsin_params.opsin_biases[1]);
+  const auto neg_bias_b = Set(d, opsin_params.opsin_biases[2]);
+#else
+  const auto neg_bias_rgb = LoadDup128(d, opsin_params.opsin_biases);
+  const auto neg_bias_r = Broadcast<0>(neg_bias_rgb);
+  const auto neg_bias_g = Broadcast<1>(neg_bias_rgb);
+  const auto neg_bias_b = Broadcast<2>(neg_bias_rgb);
+#endif
+
+  // Color space: XYB -> RGB
+  auto gamma_r = Add(opsin_y, opsin_x);
+  auto gamma_g = Sub(opsin_y, opsin_x);
+  auto gamma_b = opsin_b;
+
+  gamma_r = Sub(gamma_r, Set(d, opsin_params.opsin_biases_cbrt[0]));
+  gamma_g = Sub(gamma_g, Set(d, opsin_params.opsin_biases_cbrt[1]));
+  gamma_b = Sub(gamma_b, Set(d, opsin_params.opsin_biases_cbrt[2]));
+
+  // Undo gamma compression: linear = gamma^3 for efficiency.
+  const auto gamma_r2 = Mul(gamma_r, gamma_r);
+  const auto gamma_g2 = Mul(gamma_g, gamma_g);
+  const auto gamma_b2 = Mul(gamma_b, gamma_b);
+  const auto mixed_r = MulAdd(gamma_r2, gamma_r, neg_bias_r);
+  const auto mixed_g = MulAdd(gamma_g2, gamma_g, neg_bias_g);
+  const auto mixed_b = MulAdd(gamma_b2, gamma_b, neg_bias_b);
+
+  const float* HWY_RESTRICT inverse_matrix = opsin_params.inverse_opsin_matrix;
+
+  // Unmix (multiply by 3x3 inverse_matrix)
+  // TODO(eustas): ref would be more readable than pointer
+  *linear_r = Mul(LoadDup128(d, &inverse_matrix[0 * 4]), mixed_r);
+  *linear_g = Mul(LoadDup128(d, &inverse_matrix[3 * 4]), mixed_r);
+  *linear_b = Mul(LoadDup128(d, &inverse_matrix[6 * 4]), mixed_r);
+  *linear_r = MulAdd(LoadDup128(d, &inverse_matrix[1 * 4]), mixed_g, *linear_r);
+  *linear_g = MulAdd(LoadDup128(d, &inverse_matrix[4 * 4]), mixed_g, *linear_g);
+  *linear_b = MulAdd(LoadDup128(d, &inverse_matrix[7 * 4]), mixed_g, *linear_b);
+  *linear_r = MulAdd(LoadDup128(d, &inverse_matrix[2 * 4]), mixed_b, *linear_r);
+  *linear_g = MulAdd(LoadDup128(d, &inverse_matrix[5 * 4]), mixed_b, *linear_g);
+  *linear_b = MulAdd(LoadDup128(d, &inverse_matrix[8 * 4]), mixed_b, *linear_b);
+}
+
+static inline HWY_MAYBE_UNUSED bool HasFastXYBTosRGB8() {
+#if HWY_TARGET == HWY_NEON
+  return true;
+#else
+  return false;
+#endif
+}
+
+static inline HWY_MAYBE_UNUSED void FastXYBTosRGB8(const float* input[4],
+                                                   uint8_t* output,
+                                                   bool is_rgba, size_t xsize) {
+  // This function is very NEON-specific. As such, it uses intrinsics directly.
+#if HWY_TARGET == HWY_NEON
+  // WARNING: doing fixed point arithmetic correctly is very complicated.
+  // Changes to this function should be thoroughly tested.
+
+  // Note that the input is assumed to have 13 bits of mantissa, and the output
+  // will have 14 bits.
+  auto srgb_tf = [&](int16x8_t v16) {
+    int16x8_t clz = vclzq_s16(v16);
+    // Convert to [0.25, 0.5) range.
+    int16x8_t v025_05_16 = vqshlq_s16(v16, vqsubq_s16(clz, vdupq_n_s16(2)));
+
+    // third degree polynomial approximation between 0.25 and 0.5
+    // of 1.055/2^(7/2.4) * x^(1/2.4) / 32.
+    // poly ~ ((0.95x-1.75)*x+1.72)*x+0.29
+    // We actually compute ~ ((0.47x-0.87)*x+0.86)*(2x)+0.29 as 1.75 and 1.72
+    // overflow our fixed point representation.
+
+    int16x8_t twov = vqaddq_s16(v025_05_16, v025_05_16);
+
+    // 0.47 * x
+    int16x8_t step1 = vqrdmulhq_n_s16(v025_05_16, 15706);
+    // - 0.87
+    int16x8_t step2 = vsubq_s16(step1, vdupq_n_s16(28546));
+    // * x
+    int16x8_t step3 = vqrdmulhq_s16(step2, v025_05_16);
+    // + 0.86
+    int16x8_t step4 = vaddq_s16(step3, vdupq_n_s16(28302));
+    // * 2x
+    int16x8_t step5 = vqrdmulhq_s16(step4, twov);
+    // + 0.29
+    int16x8_t mul16 = vaddq_s16(step5, vdupq_n_s16(9485));
+
+    int16x8_t exp16 = vsubq_s16(vdupq_n_s16(11), clz);
+    // Compute 2**(1/2.4*exp16)/32. Values of exp16 that would overflow are
+    // capped to 1.
+    // Generated with the following Python script:
+    // a = []
+    // b = []
+    //
+    // for i in range(0, 16):
+    //   v = 2**(5/12.*i)
+    //   v /= 16
+    //   v *= 256 * 128
+    //   v = int(v)
+    //   a.append(v // 256)
+    //   b.append(v % 256)
+    //
+    // print(", ".join("0x%02x" % x for x in a))
+    //
+    // print(", ".join("0x%02x" % x for x in b))
+
+    HWY_ALIGN constexpr uint8_t k2to512powersm1div32_high[16] = {
+        0x08, 0x0a, 0x0e, 0x13, 0x19, 0x21, 0x2d, 0x3c,
+        0x50, 0x6b, 0x8f, 0x8f, 0x8f, 0x8f, 0x8f, 0x8f,
+    };
+    HWY_ALIGN constexpr uint8_t k2to512powersm1div32_low[16] = {
+        0x00, 0xad, 0x41, 0x06, 0x65, 0xe7, 0x41, 0x68,
+        0xa2, 0xa2, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    };
+    // Using the highway implementation here since vqtbl1q is aarch64-only.
+    using hwy::HWY_NAMESPACE::Vec128;
+    uint8x16_t pow_low =
+        TableLookupBytes(
+            Vec128<uint8_t, 16>(vld1q_u8(k2to512powersm1div32_low)),
+            Vec128<uint8_t, 16>(vreinterpretq_u8_s16(exp16)))
+            .raw;
+    uint8x16_t pow_high =
+        TableLookupBytes(
+            Vec128<uint8_t, 16>(vld1q_u8(k2to512powersm1div32_high)),
+            Vec128<uint8_t, 16>(vreinterpretq_u8_s16(exp16)))
+            .raw;
+    int16x8_t pow16 = vreinterpretq_s16_u16(vsliq_n_u16(
+        vreinterpretq_u16_u8(pow_low), vreinterpretq_u16_u8(pow_high), 8));
+
+    // approximation of v * 12.92, divided by 2
+    // Note that our input is using 13 mantissa bits instead of 15.
+    int16x8_t v16_linear = vrshrq_n_s16(vmulq_n_s16(v16, 826), 5);
+    // 1.055*pow(v, 1/2.4) - 0.055, divided by 2
+    auto v16_pow = vsubq_s16(vqrdmulhq_s16(mul16, pow16), vdupq_n_s16(901));
+    // > 0.0031308f (note that v16 has 13 mantissa bits)
+    return vbslq_s16(vcgeq_s16(v16, vdupq_n_s16(26)), v16_pow, v16_linear);
+  };
+
+  const float* JXL_RESTRICT row_in_x = input[0];
+  const float* JXL_RESTRICT row_in_y = input[1];
+  const float* JXL_RESTRICT row_in_b = input[2];
+  const float* JXL_RESTRICT row_in_a = input[3];
+  for (size_t x = 0; x < xsize; x += 8) {
+    // Normal ranges for xyb for in-gamut sRGB colors:
+    // x: -0.015386 0.028100
+    // y: 0.000000 0.845308
+    // b: 0.000000 0.845308
+
+    // We actually want x * 8 to have some extra precision.
+    // TODO(veluca): consider different approaches here, like vld1q_f32_x2.
+    float32x4_t opsin_x_left = vld1q_f32(row_in_x + x);
+    int16x4_t opsin_x16_times8_left =
+        vqmovn_s32(vcvtq_n_s32_f32(opsin_x_left, 18));
+    float32x4_t opsin_x_right =
+        vld1q_f32(row_in_x + x + (x + 4 < xsize ? 4 : 0));
+    int16x4_t opsin_x16_times8_right =
+        vqmovn_s32(vcvtq_n_s32_f32(opsin_x_right, 18));
+    int16x8_t opsin_x16_times8 =
+        vcombine_s16(opsin_x16_times8_left, opsin_x16_times8_right);
+
+    float32x4_t opsin_y_left = vld1q_f32(row_in_y + x);
+    int16x4_t opsin_y16_left = vqmovn_s32(vcvtq_n_s32_f32(opsin_y_left, 15));
+    float32x4_t opsin_y_right =
+        vld1q_f32(row_in_y + x + (x + 4 < xsize ? 4 : 0));
+    int16x4_t opsin_y16_right = vqmovn_s32(vcvtq_n_s32_f32(opsin_y_right, 15));
+    int16x8_t opsin_y16 = vcombine_s16(opsin_y16_left, opsin_y16_right);
+
+    float32x4_t opsin_b_left = vld1q_f32(row_in_b + x);
+    int16x4_t opsin_b16_left = vqmovn_s32(vcvtq_n_s32_f32(opsin_b_left, 15));
+    float32x4_t opsin_b_right =
+        vld1q_f32(row_in_b + x + (x + 4 < xsize ? 4 : 0));
+    int16x4_t opsin_b16_right = vqmovn_s32(vcvtq_n_s32_f32(opsin_b_right, 15));
+    int16x8_t opsin_b16 = vcombine_s16(opsin_b16_left, opsin_b16_right);
+
+    int16x8_t neg_bias16 = vdupq_n_s16(-124);        // -0.0037930732552754493
+    int16x8_t neg_bias_cbrt16 = vdupq_n_s16(-5110);  // -0.155954201
+    int16x8_t neg_bias_half16 = vdupq_n_s16(-62);
+
+    // Color space: XYB -> RGB
+    // Compute ((y+x-bias_cbrt)^3-(y-x-bias_cbrt)^3)/2,
+    // ((y+x-bias_cbrt)^3+(y-x-bias_cbrt)^3)/2+bias, (b-bias_cbrt)^3+bias.
+    // Note that ignoring x2 in the formulas below (as x << y) results in
+    // errors of at least 3 in the final sRGB values.
+    int16x8_t opsin_yp16 = vqsubq_s16(opsin_y16, neg_bias_cbrt16);
+    int16x8_t ysq16 = vqrdmulhq_s16(opsin_yp16, opsin_yp16);
+    int16x8_t twentyfourx16 = vmulq_n_s16(opsin_x16_times8, 3);
+    int16x8_t twentyfourxy16 = vqrdmulhq_s16(opsin_yp16, twentyfourx16);
+    int16x8_t threexsq16 =
+        vrshrq_n_s16(vqrdmulhq_s16(opsin_x16_times8, twentyfourx16), 6);
+
+    // We can ignore x^3 here. Note that this is multiplied by 8.
+    int16x8_t mixed_rmg16 = vqrdmulhq_s16(twentyfourxy16, opsin_yp16);
+
+    int16x8_t mixed_rpg_sos_half = vhaddq_s16(ysq16, threexsq16);
+    int16x8_t mixed_rpg16 = vhaddq_s16(
+        vqrdmulhq_s16(opsin_yp16, mixed_rpg_sos_half), neg_bias_half16);
+
+    int16x8_t gamma_b16 = vqsubq_s16(opsin_b16, neg_bias_cbrt16);
+    int16x8_t gamma_bsq16 = vqrdmulhq_s16(gamma_b16, gamma_b16);
+    int16x8_t gamma_bcb16 = vqrdmulhq_s16(gamma_bsq16, gamma_b16);
+    int16x8_t mixed_b16 = vqaddq_s16(gamma_bcb16, neg_bias16);
+    // mixed_rpg and mixed_b are in 0-1 range.
+    // mixed_rmg has a smaller range (-0.035 to 0.035 for valid sRGB). Note
+    // that at this point it is already multiplied by 8.
+
+    // We multiply all the mixed values by 1/4 (i.e. shift them to 13-bit
+    // fixed point) to ensure intermediate quantities are in range. Note that
+    // r-g is not shifted, and was x8 before here; this corresponds to a x32
+    // overall multiplicative factor and ensures that all the matrix constants
+    // are in 0-1 range.
+    // Similarly, mixed_rpg16 is already multiplied by 1/4 because of the two
+    // vhadd + using neg_bias_half.
+    mixed_b16 = vshrq_n_s16(mixed_b16, 2);
+
+    // Unmix (multiply by 3x3 inverse_matrix)
+    // For increased precision, we use a matrix for converting from
+    // ((mixed_r - mixed_g)/2, (mixed_r + mixed_g)/2, mixed_b) to rgb. This
+    // avoids cancellation effects when computing (y+x)^3-(y-x)^3.
+    // We compute mixed_rpg - mixed_b because the (1+c)*mixed_rpg - c *
+    // mixed_b pattern is repeated frequently in the code below. This allows
+    // us to save a multiply per channel, and removes the presence of
+    // some constants above 1. Moreover, mixed_rmg - mixed_b is in (-1, 1)
+    // range, so the subtraction is safe.
+    // All the magic-looking constants here are derived by computing the
+    // inverse opsin matrix for the transformation modified as described
+    // above.
+
+    // Precomputation common to multiple color values.
+    int16x8_t mixed_rpgmb16 = vqsubq_s16(mixed_rpg16, mixed_b16);
+    int16x8_t mixed_rpgmb_times_016 = vqrdmulhq_n_s16(mixed_rpgmb16, 5394);
+    int16x8_t mixed_rg16 = vqaddq_s16(mixed_rpgmb_times_016, mixed_rpg16);
+
+    // R
+    int16x8_t linear_r16 =
+        vqaddq_s16(mixed_rg16, vqrdmulhq_n_s16(mixed_rmg16, 21400));
+
+    // G
+    int16x8_t linear_g16 =
+        vqaddq_s16(mixed_rg16, vqrdmulhq_n_s16(mixed_rmg16, -7857));
+
+    // B
+    int16x8_t linear_b16 = vqrdmulhq_n_s16(mixed_rpgmb16, -30996);
+    linear_b16 = vqaddq_s16(linear_b16, mixed_b16);
+    linear_b16 = vqaddq_s16(linear_b16, vqrdmulhq_n_s16(mixed_rmg16, -6525));
+
+    // Apply SRGB transfer function.
+    int16x8_t r = srgb_tf(linear_r16);
+    int16x8_t g = srgb_tf(linear_g16);
+    int16x8_t b = srgb_tf(linear_b16);
+
+    uint8x8_t r8 =
+        vqmovun_s16(vrshrq_n_s16(vsubq_s16(r, vshrq_n_s16(r, 8)), 6));
+    uint8x8_t g8 =
+        vqmovun_s16(vrshrq_n_s16(vsubq_s16(g, vshrq_n_s16(g, 8)), 6));
+    uint8x8_t b8 =
+        vqmovun_s16(vrshrq_n_s16(vsubq_s16(b, vshrq_n_s16(b, 8)), 6));
+
+    size_t n = xsize - x;
+    if (is_rgba) {
+      float32x4_t a_f32_left =
+          row_in_a ? vld1q_f32(row_in_a + x) : vdupq_n_f32(1.0f);
+      float32x4_t a_f32_right =
+          row_in_a ? vld1q_f32(row_in_a + x + (x + 4 < xsize ? 4 : 0))
+                   : vdupq_n_f32(1.0f);
+      int16x4_t a16_left = vqmovn_s32(vcvtq_n_s32_f32(a_f32_left, 8));
+      int16x4_t a16_right = vqmovn_s32(vcvtq_n_s32_f32(a_f32_right, 8));
+      uint8x8_t a8 = vqmovun_s16(vcombine_s16(a16_left, a16_right));
+      uint8_t* buf = output + 4 * x;
+      uint8x8x4_t data = {r8, g8, b8, a8};
+      if (n >= 8) {
+        vst4_u8(buf, data);
+      } else {
+        uint8_t tmp[8 * 4];
+        vst4_u8(tmp, data);
+        memcpy(buf, tmp, n * 4);
+      }
+    } else {
+      uint8_t* buf = output + 3 * x;
+      uint8x8x3_t data = {r8, g8, b8};
+      if (n >= 8) {
+        vst3_u8(buf, data);
+      } else {
+        uint8_t tmp[8 * 3];
+        vst3_u8(tmp, data);
+        memcpy(buf, tmp, n * 3);
+      }
+    }
+  }
+#else
+  (void)input;
+  (void)output;
+  (void)is_rgba;
+  (void)xsize;
+  JXL_ABORT("Unreachable");
+#endif
+}
+
+}  // namespace
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#endif  // LIB_JXL_DEC_XYB_INL_H_
diff --git a/third_party/jpeg-xl/lib/jxl/dec_xyb.cc b/third_party/jpeg-xl/lib/jxl/dec_xyb.cc
new file mode 100644
index 0000000000..46fc63c49e
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/dec_xyb.cc
@@ -0,0 +1,329 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/dec_xyb.h"
+
+#include <string.h>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/dec_xyb.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/profiler.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/dec_group_border.h"
+#include "lib/jxl/dec_xyb-inl.h"
+#include "lib/jxl/fields.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/matrix_ops.h"
+#include "lib/jxl/opsin_params.h"
+#include "lib/jxl/quantizer.h"
+#include "lib/jxl/sanitizers.h"
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::MulAdd;
+
+void OpsinToLinearInplace(Image3F* JXL_RESTRICT inout, ThreadPool* pool,
+                          const OpsinParams& opsin_params) {
+  PROFILER_FUNC;
+  JXL_CHECK_IMAGE_INITIALIZED(*inout, Rect(*inout));
+
+  const size_t xsize = inout->xsize();  // not padded
+  JXL_CHECK(RunOnPool(
+      pool, 0, inout->ysize(), ThreadPool::NoInit,
+      [&](const uint32_t task, size_t /* thread */) {
+        const size_t y = task;
+
+        // Faster than adding via ByteOffset at end of loop.
+        float* JXL_RESTRICT row0 = inout->PlaneRow(0, y);
+        float* JXL_RESTRICT row1 = inout->PlaneRow(1, y);
+        float* JXL_RESTRICT row2 = inout->PlaneRow(2, y);
+
+        const HWY_FULL(float) d;
+
+        for (size_t x = 0; x < xsize; x += Lanes(d)) {
+          const auto in_opsin_x = Load(d, row0 + x);
+          const auto in_opsin_y = Load(d, row1 + x);
+          const auto in_opsin_b = Load(d, row2 + x);
+          auto linear_r = Undefined(d);
+          auto linear_g = Undefined(d);
+          auto linear_b = Undefined(d);
+          XybToRgb(d, in_opsin_x, in_opsin_y, in_opsin_b, opsin_params,
+                   &linear_r, &linear_g, &linear_b);
+
+          Store(linear_r, d, row0 + x);
+          Store(linear_g, d, row1 + x);
+          Store(linear_b, d, row2 + x);
+        }
+      },
+      "OpsinToLinear"));
+}
+
+// Same, but not in-place.
+void OpsinToLinear(const Image3F& opsin, const Rect& rect, ThreadPool* pool,
+                   Image3F* JXL_RESTRICT linear,
+                   const OpsinParams& opsin_params) {
+  PROFILER_FUNC;
+
+  JXL_ASSERT(SameSize(rect, *linear));
+  JXL_CHECK_IMAGE_INITIALIZED(opsin, rect);
+
+  JXL_CHECK(RunOnPool(
+      pool, 0, static_cast<int>(rect.ysize()), ThreadPool::NoInit,
+      [&](const uint32_t task, size_t /*thread*/) {
+        const size_t y = static_cast<size_t>(task);
+
+        // Faster than adding via ByteOffset at end of loop.
+        const float* JXL_RESTRICT row_opsin_0 = rect.ConstPlaneRow(opsin, 0, y);
+        const float* JXL_RESTRICT row_opsin_1 = rect.ConstPlaneRow(opsin, 1, y);
+        const float* JXL_RESTRICT row_opsin_2 = rect.ConstPlaneRow(opsin, 2, y);
+        float* JXL_RESTRICT row_linear_0 = linear->PlaneRow(0, y);
+        float* JXL_RESTRICT row_linear_1 = linear->PlaneRow(1, y);
+        float* JXL_RESTRICT row_linear_2 = linear->PlaneRow(2, y);
+
+        const HWY_FULL(float) d;
+
+        for (size_t x = 0; x < rect.xsize(); x += Lanes(d)) {
+          const auto in_opsin_x = Load(d, row_opsin_0 + x);
+          const auto in_opsin_y = Load(d, row_opsin_1 + x);
+          const auto in_opsin_b = Load(d, row_opsin_2 + x);
+          auto linear_r = Undefined(d);
+          auto linear_g = Undefined(d);
+          auto linear_b = Undefined(d);
+          XybToRgb(d, in_opsin_x, in_opsin_y, in_opsin_b, opsin_params,
+                   &linear_r, &linear_g, &linear_b);
+
+          Store(linear_r, d, row_linear_0 + x);
+          Store(linear_g, d, row_linear_1 + x);
+          Store(linear_b, d, row_linear_2 + x);
+        }
+      },
+      "OpsinToLinear(Rect)"));
+  JXL_CHECK_IMAGE_INITIALIZED(*linear, rect);
+}
+
+// Transform YCbCr to RGB.
+// Could be performed in-place (i.e. Y, Cb and Cr could alias R, B and B).
+void YcbcrToRgb(const Image3F& ycbcr, Image3F* rgb, const Rect& rect) {
+  JXL_CHECK_IMAGE_INITIALIZED(ycbcr, rect);
+  const HWY_CAPPED(float, kBlockDim) df;
+  const size_t S = Lanes(df);  // Step.
+
+  const size_t xsize = rect.xsize();
+  const size_t ysize = rect.ysize();
+  if ((xsize == 0) || (ysize == 0)) return;
+
+  // Full-range BT.601 as defined by JFIF Clause 7:
+  // https://www.itu.int/rec/T-REC-T.871-201105-I/en
+  const auto c128 = Set(df, 128.0f / 255);
+  const auto crcr = Set(df, 1.402f);
+  const auto cgcb = Set(df, -0.114f * 1.772f / 0.587f);
+  const auto cgcr = Set(df, -0.299f * 1.402f / 0.587f);
+  const auto cbcb = Set(df, 1.772f);
+
+  for (size_t y = 0; y < ysize; y++) {
+    const float* y_row = rect.ConstPlaneRow(ycbcr, 1, y);
+    const float* cb_row = rect.ConstPlaneRow(ycbcr, 0, y);
+    const float* cr_row = rect.ConstPlaneRow(ycbcr, 2, y);
+    float* r_row = rect.PlaneRow(rgb, 0, y);
+    float* g_row = rect.PlaneRow(rgb, 1, y);
+    float* b_row = rect.PlaneRow(rgb, 2, y);
+    for (size_t x = 0; x < xsize; x += S) {
+      const auto y_vec = Add(Load(df, y_row + x), c128);
+      const auto cb_vec = Load(df, cb_row + x);
+      const auto cr_vec = Load(df, cr_row + x);
+      const auto r_vec = MulAdd(crcr, cr_vec, y_vec);
+      const auto g_vec = MulAdd(cgcr, cr_vec, MulAdd(cgcb, cb_vec, y_vec));
+      const auto b_vec = MulAdd(cbcb, cb_vec, y_vec);
+      Store(r_vec, df, r_row + x);
+      Store(g_vec, df, g_row + x);
+      Store(b_vec, df, b_row + x);
+    }
+  }
+  JXL_CHECK_IMAGE_INITIALIZED(*rgb, rect);
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+HWY_EXPORT(OpsinToLinearInplace);
+void OpsinToLinearInplace(Image3F* JXL_RESTRICT inout, ThreadPool* pool,
+                          const OpsinParams& opsin_params) {
+  return HWY_DYNAMIC_DISPATCH(OpsinToLinearInplace)(inout, pool, opsin_params);
+}
+
+HWY_EXPORT(OpsinToLinear);
+void OpsinToLinear(const Image3F& opsin, const Rect& rect, ThreadPool* pool,
+                   Image3F* JXL_RESTRICT linear,
+                   const OpsinParams& opsin_params) {
+  return HWY_DYNAMIC_DISPATCH(OpsinToLinear)(opsin, rect, pool, linear,
+                                             opsin_params);
+}
+
+HWY_EXPORT(YcbcrToRgb);
+void YcbcrToRgb(const Image3F& ycbcr, Image3F* rgb, const Rect& rect) {
+  return HWY_DYNAMIC_DISPATCH(YcbcrToRgb)(ycbcr, rgb, rect);
+}
+
+HWY_EXPORT(HasFastXYBTosRGB8);
+bool HasFastXYBTosRGB8() { return HWY_DYNAMIC_DISPATCH(HasFastXYBTosRGB8)(); }
+
+HWY_EXPORT(FastXYBTosRGB8);
+void FastXYBTosRGB8(const float* input[4], uint8_t* output, bool is_rgba,
+                    size_t xsize) {
+  return HWY_DYNAMIC_DISPATCH(FastXYBTosRGB8)(input, output, is_rgba, xsize);
+}
+
+void OpsinParams::Init(float intensity_target) {
+  InitSIMDInverseMatrix(GetOpsinAbsorbanceInverseMatrix(), inverse_opsin_matrix,
+                        intensity_target);
+  memcpy(opsin_biases, kNegOpsinAbsorbanceBiasRGB,
+         sizeof(kNegOpsinAbsorbanceBiasRGB));
+  memcpy(quant_biases, kDefaultQuantBias, sizeof(kDefaultQuantBias));
+  for (size_t c = 0; c < 4; c++) {
+    opsin_biases_cbrt[c] = cbrtf(opsin_biases[c]);
+  }
+}
+
+bool CanOutputToColorEncoding(const ColorEncoding& c_desired) {
+  if (!c_desired.HaveFields()) {
+    return false;
+  }
+  // TODO(veluca): keep in sync with dec_reconstruct.cc
+  if (!c_desired.tf.IsPQ() && !c_desired.tf.IsSRGB() &&
+      !c_desired.tf.IsGamma() && !c_desired.tf.IsLinear() &&
+      !c_desired.tf.IsHLG() && !c_desired.tf.IsDCI() && !c_desired.tf.Is709()) {
+    return false;
+  }
+  if (c_desired.IsGray() && c_desired.white_point != WhitePoint::kD65) {
+    // TODO(veluca): figure out what should happen here.
+    return false;
+  }
+  return true;
+}
+
+Status OutputEncodingInfo::SetFromMetadata(const CodecMetadata& metadata) {
+  orig_color_encoding = metadata.m.color_encoding;
+  orig_intensity_target = metadata.m.IntensityTarget();
+  desired_intensity_target = orig_intensity_target;
+  const auto& im = metadata.transform_data.opsin_inverse_matrix;
+  memcpy(orig_inverse_matrix, im.inverse_matrix, sizeof(orig_inverse_matrix));
+  default_transform = im.all_default;
+  xyb_encoded = metadata.m.xyb_encoded;
+  std::copy(std::begin(im.opsin_biases), std::end(im.opsin_biases),
+            opsin_params.opsin_biases);
+  for (int i = 0; i < 3; ++i) {
+    opsin_params.opsin_biases_cbrt[i] = cbrtf(opsin_params.opsin_biases[i]);
+  }
+  opsin_params.opsin_biases_cbrt[3] = opsin_params.opsin_biases[3] = 1;
+  std::copy(std::begin(im.quant_biases), std::end(im.quant_biases),
+            opsin_params.quant_biases);
+  bool orig_ok = CanOutputToColorEncoding(orig_color_encoding);
+  bool orig_grey = orig_color_encoding.IsGray();
+  return SetColorEncoding(!xyb_encoded || orig_ok
+                              ? orig_color_encoding
+                              : ColorEncoding::LinearSRGB(orig_grey));
+}
+
+Status OutputEncodingInfo::MaybeSetColorEncoding(
+    const ColorEncoding& c_desired) {
+  if (c_desired.GetColorSpace() == ColorSpace::kXYB &&
+      ((color_encoding.GetColorSpace() == ColorSpace::kRGB &&
+        color_encoding.primaries != Primaries::kSRGB) ||
+       color_encoding.tf.IsPQ())) {
+    return false;
+  }
+  if (!xyb_encoded && !CanOutputToColorEncoding(c_desired)) {
+    return false;
+  }
+  return SetColorEncoding(c_desired);
+}
+
+Status OutputEncodingInfo::SetColorEncoding(const ColorEncoding& c_desired) {
+  color_encoding = c_desired;
+  color_encoding_is_original = orig_color_encoding.SameColorEncoding(c_desired);
+
+  // Compute the opsin inverse matrix and luminances based on primaries and
+  // white point.
+  float inverse_matrix[9];
+  bool inverse_matrix_is_default = default_transform;
+  memcpy(inverse_matrix, orig_inverse_matrix, sizeof(inverse_matrix));
+  constexpr float kSRGBLuminances[3] = {0.2126, 0.7152, 0.0722};
+  memcpy(luminances, kSRGBLuminances, sizeof(luminances));
+  if ((c_desired.primaries != Primaries::kSRGB ||
+       c_desired.white_point != WhitePoint::kD65) &&
+      !c_desired.IsGray()) {
+    float srgb_to_xyzd50[9];
+    const auto& srgb = ColorEncoding::SRGB(/*is_gray=*/false);
+    JXL_CHECK(PrimariesToXYZD50(
+        srgb.GetPrimaries().r.x, srgb.GetPrimaries().r.y,
+        srgb.GetPrimaries().g.x, srgb.GetPrimaries().g.y,
+        srgb.GetPrimaries().b.x, srgb.GetPrimaries().b.y,
+        srgb.GetWhitePoint().x, srgb.GetWhitePoint().y, srgb_to_xyzd50));
+    float original_to_xyz[3][3];
+    JXL_RETURN_IF_ERROR(PrimariesToXYZ(
+        c_desired.GetPrimaries().r.x, c_desired.GetPrimaries().r.y,
+        c_desired.GetPrimaries().g.x, c_desired.GetPrimaries().g.y,
+        c_desired.GetPrimaries().b.x, c_desired.GetPrimaries().b.y,
+        c_desired.GetWhitePoint().x, c_desired.GetWhitePoint().y,
+        &original_to_xyz[0][0]));
+    memcpy(luminances, original_to_xyz[1], sizeof luminances);
+    if (xyb_encoded) {
+      float adapt_to_d50[9];
+      JXL_RETURN_IF_ERROR(AdaptToXYZD50(c_desired.GetWhitePoint().x,
+                                        c_desired.GetWhitePoint().y,
+                                        adapt_to_d50));
+      float xyzd50_to_original[9];
+      Mul3x3Matrix(adapt_to_d50, &original_to_xyz[0][0], xyzd50_to_original);
+      JXL_RETURN_IF_ERROR(Inv3x3Matrix(xyzd50_to_original));
+      float srgb_to_original[9];
+      Mul3x3Matrix(xyzd50_to_original, srgb_to_xyzd50, srgb_to_original);
+      Mul3x3Matrix(srgb_to_original, orig_inverse_matrix, inverse_matrix);
+      inverse_matrix_is_default = false;
+    }
+  }
+
+  if (c_desired.IsGray()) {
+    float tmp_inv_matrix[9];
+    memcpy(tmp_inv_matrix, inverse_matrix, sizeof(inverse_matrix));
+    float srgb_to_luma[9];
+    memcpy(&srgb_to_luma[0], luminances, sizeof(luminances));
+    memcpy(&srgb_to_luma[3], luminances, sizeof(luminances));
+    memcpy(&srgb_to_luma[6], luminances, sizeof(luminances));
+    Mul3x3Matrix(srgb_to_luma, tmp_inv_matrix, inverse_matrix);
+  }
+
+  // The internal XYB color space uses absolute luminance, so we scale back the
+  // opsin inverse matrix to relative luminance where 1.0 corresponds to the
+  // original intensity target, or to absolute luminance for PQ, where 1.0
+  // corresponds to 10000 nits.
+  if (xyb_encoded) {
+    float intensity_target =
+        (c_desired.tf.IsPQ() ? 10000 : orig_intensity_target);
+    InitSIMDInverseMatrix(inverse_matrix, opsin_params.inverse_opsin_matrix,
+                          intensity_target);
+    all_default_opsin = (std::abs(intensity_target - 255.0) <= 0.1f &&
+                         inverse_matrix_is_default);
+  }
+
+  // Set the inverse gamma based on color space transfer function.
+  inverse_gamma = (c_desired.tf.IsGamma() ? c_desired.tf.GetGamma()
+                   : c_desired.tf.IsDCI() ? 1.0f / 2.6f
+                                          : 1.0);
+  return true;
+}
+
+}  // namespace jxl
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jxl/dec_xyb.h b/third_party/jpeg-xl/lib/jxl/dec_xyb.h
new file mode 100644
index 0000000000..ebaae9a176
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/dec_xyb.h
@@ -0,0 +1,89 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_DEC_XYB_H_
+#define LIB_JXL_DEC_XYB_H_
+
+// XYB -> linear sRGB.
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/color_encoding_internal.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/image_metadata.h"
+#include "lib/jxl/opsin_params.h"
+
+namespace jxl {
+
+// Parameters for XYB->sRGB conversion.
+struct OpsinParams {
+  float inverse_opsin_matrix[9 * 4];
+  float opsin_biases[4];
+  float opsin_biases_cbrt[4];
+  float quant_biases[4];
+  void Init(float intensity_target);
+};
+
+struct OutputEncodingInfo {
+  //
+  // Fields depending only on image metadata
+  //
+  ColorEncoding orig_color_encoding;
+  // Used for the HLG OOTF and PQ tone mapping.
+  float orig_intensity_target;
+  // Opsin inverse matrix taken from the metadata.
+  float orig_inverse_matrix[9];
+  bool default_transform;
+  bool xyb_encoded;
+  //
+  // Fields depending on output color encoding
+  //
+  ColorEncoding color_encoding;
+  bool color_encoding_is_original;
+  // Contains an opsin matrix that converts to the primaries of the output
+  // encoding.
+  OpsinParams opsin_params;
+  bool all_default_opsin;
+  // Used for Gamma and DCI transfer functions.
+  float inverse_gamma;
+  // Luminances of color_encoding's primaries, used for the HLG inverse OOTF and
+  // for PQ tone mapping.
+  // Default to sRGB's.
+  float luminances[3];
+  // Used for the HLG inverse OOTF and PQ tone mapping.
+  float desired_intensity_target;
+
+  Status SetFromMetadata(const CodecMetadata& metadata);
+  Status MaybeSetColorEncoding(const ColorEncoding& c_desired);
+
+ private:
+  Status SetColorEncoding(const ColorEncoding& c_desired);
+};
+
+// Converts `inout` (not padded) from opsin to linear sRGB in-place. Called from
+// per-pass postprocessing, hence parallelized.
+void OpsinToLinearInplace(Image3F* JXL_RESTRICT inout, ThreadPool* pool,
+                          const OpsinParams& opsin_params);
+
+// Converts `opsin:rect` (opsin may be padded, rect.x0 must be vector-aligned)
+// to linear sRGB. Called from whole-frame encoder, hence parallelized.
+void OpsinToLinear(const Image3F& opsin, const Rect& rect, ThreadPool* pool,
+                   Image3F* JXL_RESTRICT linear,
+                   const OpsinParams& opsin_params);
+
+// Bt.601 to match JPEG/JFIF. Inputs are _signed_ YCbCr values suitable for DCT,
+// see F.1.1.3 of T.81 (because our data type is float, there is no need to add
+// a bias to make the values unsigned).
+void YcbcrToRgb(const Image3F& ycbcr, Image3F* rgb, const Rect& rect);
+
+bool HasFastXYBTosRGB8();
+void FastXYBTosRGB8(const float* input[4], uint8_t* output, bool is_rgba,
+                    size_t xsize);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_DEC_XYB_H_
diff --git a/third_party/jpeg-xl/lib/jxl/decode.cc b/third_party/jpeg-xl/lib/jxl/decode.cc
new file mode 100644
index 0000000000..5476f686f6
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/decode.cc
@@ -0,0 +1,2809 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <jxl/decode.h>
+#include <jxl/types.h>
+
+#include <algorithm>
+#include <array>
+#include <functional>
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include "lib/jxl/base/byte_order.h"
+#include "lib/jxl/base/span.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/common.h"
+#if JPEGXL_ENABLE_BOXES || JPEGXL_ENABLE_TRANSCODE_JPEG
+#include "lib/jxl/box_content_decoder.h"
+#endif
+#include "lib/jxl/dec_external_image.h"
+#include "lib/jxl/dec_frame.h"
+#include "lib/jxl/dec_modular.h"
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
+#include "lib/jxl/decode_to_jpeg.h"
+#endif
+#include "lib/jxl/fields.h"
+#include "lib/jxl/frame_header.h"
+#include "lib/jxl/headers.h"
+#include "lib/jxl/icc_codec.h"
+#include "lib/jxl/image_bundle.h"
+#include "lib/jxl/loop_filter.h"
+#include "lib/jxl/memory_manager_internal.h"
+#include "lib/jxl/sanitizers.h"
+#include "lib/jxl/toc.h"
+
+namespace {
+
+// Checks if a + b > size, taking possible integer overflow into account.
+bool OutOfBounds(size_t a, size_t b, size_t size) {
+  size_t pos = a + b;
+  if (pos > size) return true;
+  if (pos < a) return true;  // overflow happened
+  return false;
+}
+
+bool SumOverflows(size_t a, size_t b, size_t c) {
+  size_t sum = a + b;
+  if (sum < b) return true;
+  sum += c;
+  if (sum < c) return true;
+  return false;
+}
+
+JXL_INLINE size_t InitialBasicInfoSizeHint() {
+  // Amount of bytes before the start of the codestream in the container format,
+  // assuming that the codestream is the first box after the signature and
+  // filetype boxes. 12 bytes signature box + 20 bytes filetype box + 16 bytes
+  // codestream box length + name + optional XLBox length.
+  const size_t container_header_size = 48;
+
+  // Worst-case amount of bytes for basic info of the JPEG XL codestream header,
+  // that is all information up to and including extra_channel_bits. Up to
+  // around 2 bytes signature + 8 bytes SizeHeader + 31 bytes ColorEncoding + 4
+  // bytes rest of ImageMetadata + 5 bytes part of ImageMetadata2.
+  // TODO(lode): recompute and update this value when alpha_bits is moved to
+  // extra channels info.
+  const size_t max_codestream_basic_info_size = 50;
+
+  return container_header_size + max_codestream_basic_info_size;
+}
+
+// Debug-printing failure macro similar to JXL_FAILURE, but for the status code
+// JXL_DEC_ERROR
+#ifdef JXL_CRASH_ON_ERROR
+#define JXL_API_ERROR(format, ...)                                           \
+  (::jxl::Debug(("%s:%d: " format "\n"), __FILE__, __LINE__, ##__VA_ARGS__), \
+   ::jxl::Abort(), JXL_DEC_ERROR)
+#else  // JXL_CRASH_ON_ERROR
+#define JXL_API_ERROR(format, ...)                                             \
+  (((JXL_DEBUG_ON_ERROR) &&                                                    \
+    ::jxl::Debug(("%s:%d: " format "\n"), __FILE__, __LINE__, ##__VA_ARGS__)), \
+   JXL_DEC_ERROR)
+#endif  // JXL_CRASH_ON_ERROR
+
+JxlDecoderStatus ConvertStatus(JxlDecoderStatus status) { return status; }
+
+JxlDecoderStatus ConvertStatus(jxl::Status status) {
+  return status ? JXL_DEC_SUCCESS : JXL_DEC_ERROR;
+}
+
+JxlSignature ReadSignature(const uint8_t* buf, size_t len, size_t* pos) {
+  if (*pos >= len) return JXL_SIG_NOT_ENOUGH_BYTES;
+
+  buf += *pos;
+  len -= *pos;
+
+  // JPEG XL codestream: 0xff 0x0a
+  if (len >= 1 && buf[0] == 0xff) {
+    if (len < 2) {
+      return JXL_SIG_NOT_ENOUGH_BYTES;
+    } else if (buf[1] == jxl::kCodestreamMarker) {
+      *pos += 2;
+      return JXL_SIG_CODESTREAM;
+    } else {
+      return JXL_SIG_INVALID;
+    }
+  }
+
+  // JPEG XL container
+  if (len >= 1 && buf[0] == 0) {
+    if (len < 12) {
+      return JXL_SIG_NOT_ENOUGH_BYTES;
+    } else if (buf[1] == 0 && buf[2] == 0 && buf[3] == 0xC && buf[4] == 'J' &&
+               buf[5] == 'X' && buf[6] == 'L' && buf[7] == ' ' &&
+               buf[8] == 0xD && buf[9] == 0xA && buf[10] == 0x87 &&
+               buf[11] == 0xA) {
+      *pos += 12;
+      return JXL_SIG_CONTAINER;
+    } else {
+      return JXL_SIG_INVALID;
+    }
+  }
+
+  return JXL_SIG_INVALID;
+}
+
+}  // namespace
+
+uint32_t JxlDecoderVersion(void) {
+  return JPEGXL_MAJOR_VERSION * 1000000 + JPEGXL_MINOR_VERSION * 1000 +
+         JPEGXL_PATCH_VERSION;
+}
+
+JxlSignature JxlSignatureCheck(const uint8_t* buf, size_t len) {
+  size_t pos = 0;
+  return ReadSignature(buf, len, &pos);
+}
+
+namespace {
+
+size_t BitsPerChannel(JxlDataType data_type) {
+  switch (data_type) {
+    case JXL_TYPE_UINT8:
+      return 8;
+    case JXL_TYPE_UINT16:
+      return 16;
+    case JXL_TYPE_FLOAT:
+      return 32;
+    case JXL_TYPE_FLOAT16:
+      return 16;
+    default:
+      return 0;  // signals unhandled JxlDataType
+  }
+}
+
+template <typename T>
+uint32_t GetBitDepth(JxlBitDepth bit_depth, const T& metadata,
+                     JxlPixelFormat format) {
+  if (bit_depth.type == JXL_BIT_DEPTH_FROM_PIXEL_FORMAT) {
+    return BitsPerChannel(format.data_type);
+  } else if (bit_depth.type == JXL_BIT_DEPTH_FROM_CODESTREAM) {
+    return metadata.bit_depth.bits_per_sample;
+  } else if (bit_depth.type == JXL_BIT_DEPTH_CUSTOM) {
+    return bit_depth.bits_per_sample;
+  } else {
+    return 0;
+  }
+}
+
+enum class DecoderStage : uint32_t {
+  kInited,              // Decoder created, no JxlDecoderProcessInput called yet
+  kStarted,             // Running JxlDecoderProcessInput calls
+  kCodestreamFinished,  // Codestream done, but other boxes could still occur.
+                        // This stage can also occur before having seen the
+                        // entire codestream if the user didn't subscribe to any
+                        // codestream events at all, e.g. only to box events,
+                        // or, the user only subscribed to basic info, and only
+                        // the header of the codestream was parsed.
+  kError,               // Error occurred, decoder object no longer usable
+};
+
+enum class FrameStage : uint32_t {
+  kHeader,  // Must parse frame header.
+  kTOC,     // Must parse TOC
+  kFull,    // Must parse full pixels
+};
+
+enum class BoxStage : uint32_t {
+  kHeader,      // Parsing box header of the next box, or start of non-container
+                // stream
+  kFtyp,        // The ftyp box
+  kSkip,        // Box whose contents are skipped
+  kCodestream,  // Handling codestream box contents, or non-container stream
+  kPartialCodestream,  // Handling the extra header of partial codestream box
+  kJpegRecon,          // Handling jpeg reconstruction box
+};
+
+enum class JpegReconStage : uint32_t {
+  kNone,             // Not outputting
+  kSettingMetadata,  // Ready to output, must set metadata to the jpeg_data
+  kOutputting,       // Currently outputting the JPEG bytes
+  kFinished,         // JPEG reconstruction fully handled
+};
+
+/*
+Given list of frame references to storage slots, and storage slots in which this
+frame is saved, computes which frames are required to decode the frame at the
+given index and any frames after it. The frames on which this depends are
+returned as a vector of their indices, in no particular order. The given index
+must be smaller than saved_as.size(), and references.size() must equal
+saved_as.size(). Any frames beyond saved_as and references are considered
+unknown future frames and must be treated as if something depends on them.
+*/
+std::vector<size_t> GetFrameDependencies(size_t index,
+                                         const std::vector<int>& saved_as,
+                                         const std::vector<int>& references) {
+  JXL_ASSERT(references.size() == saved_as.size());
+  JXL_ASSERT(index < references.size());
+
+  std::vector<size_t> result;
+
+  constexpr size_t kNumStorage = 8;
+
+  // value which indicates nothing is stored in this storage slot
+  const size_t invalid = references.size();
+  // for each of the 8 storage slots, a vector that translates frame index to
+  // frame stored in this storage slot at this point, that is, the last
+  // frame that was stored in this slot before or at this index.
+  std::array<std::vector<size_t>, kNumStorage> storage;
+  for (size_t s = 0; s < kNumStorage; ++s) {
+    storage[s].resize(saved_as.size());
+    int mask = 1 << s;
+    size_t id = invalid;
+    for (size_t i = 0; i < saved_as.size(); ++i) {
+      if (saved_as[i] & mask) {
+        id = i;
+      }
+      storage[s][i] = id;
+    }
+  }
+
+  std::vector<char> seen(index + 1, 0);
+  std::vector<size_t> stack;
+  stack.push_back(index);
+  seen[index] = 1;
+
+  // For frames after index, assume they can depend on any of the 8 storage
+  // slots, so push the frame for each stored reference to the stack and result.
+  // All frames after index are treated as having unknown references and with
+  // the possibility that there are more frames after the last known.
+  // TODO(lode): take values of saved_as and references after index, and a
+  // input flag indicating if they are all frames of the image, to further
+  // optimize this.
+  for (size_t s = 0; s < kNumStorage; ++s) {
+    size_t frame_ref = storage[s][index];
+    if (frame_ref == invalid) continue;
+    if (seen[frame_ref]) continue;
+    stack.push_back(frame_ref);
+    seen[frame_ref] = 1;
+    result.push_back(frame_ref);
+  }
+
+  while (!stack.empty()) {
+    size_t frame_index = stack.back();
+    stack.pop_back();
+    if (frame_index == 0) continue;  // first frame cannot have references
+    for (size_t s = 0; s < kNumStorage; ++s) {
+      int mask = 1 << s;
+      if (!(references[frame_index] & mask)) continue;
+      size_t frame_ref = storage[s][frame_index - 1];
+      if (frame_ref == invalid) continue;
+      if (seen[frame_ref]) continue;
+      stack.push_back(frame_ref);
+      seen[frame_ref] = 1;
+      result.push_back(frame_ref);
+    }
+  }
+
+  return result;
+}
+
+// Parameters for user-requested extra channel output.
+struct ExtraChannelOutput {
+  JxlPixelFormat format;
+  void* buffer;
+  size_t buffer_size;
+};
+
+}  // namespace
+
+namespace jxl {
+
+typedef struct JxlDecoderFrameIndexBoxEntryStruct {
+  // OFFi: offset of start byte of this frame compared to start
+  // byte of previous frame from this index in the JPEG XL codestream. For the
+  // first frame, this is the offset from the first byte of the JPEG XL
+  // codestream.
+  uint64_t OFFi;
+  // Ti: duration in ticks between the start of this frame and
+  // the start of the next frame in the index. If this is the last frame in the
+  // index, this is the duration in ticks between the start of this frame and
+  // the end of the stream. A tick lasts TNUM / TDEN seconds.
+  uint32_t Ti;
+  // Fi: amount of frames the next frame in the index occurs
+  // after this frame. If this is the last frame in the index, this is the
+  // amount of frames after this frame in the remainder of the stream. Only
+  // frames that are presented by the decoder are counted for this purpose, this
+  // excludes frames that are not intended for display but for compositing with
+  // other frames, such as frames that aren't the last frame with a duration of
+  // 0 ticks.
+  uint32_t Fi;
+} JxlDecoderFrameIndexBoxEntry;
+
+typedef struct JxlDecoderFrameIndexBoxStruct {
+  int64_t NF() const { return entries.size(); }
+  int32_t TNUM = 1;
+  int32_t TDEN = 1000;
+
+  std::vector<JxlDecoderFrameIndexBoxEntry> entries;
+
+  // That way we can ensure that every index box will have the first frame.
+  // If the API user decides to mark it as an indexed frame, we call
+  // the AddFrame again, this time with requested.
+  void AddFrame(uint64_t OFFi, uint32_t Ti, uint32_t Fi) {
+    JxlDecoderFrameIndexBoxEntry e;
+    e.OFFi = OFFi;
+    e.Ti = Ti;
+    e.Fi = Fi;
+    entries.push_back(e);
+  }
+} JxlDecoderFrameIndexBox;
+
+}  // namespace jxl
+
+// NOLINTNEXTLINE(clang-analyzer-optin.performance.Padding)
+struct JxlDecoderStruct {
+  JxlDecoderStruct() = default;
+
+  JxlMemoryManager memory_manager;
+  std::unique_ptr<jxl::ThreadPool> thread_pool;
+
+  DecoderStage stage;
+
+  // Status of progression, internal.
+  bool got_signature;
+  // Indicates we know that we've seen the last codestream box: either this
+  // was a jxlc box, or a jxlp box that has its index indicated as last by
+  // having its most significant bit set, or no boxes are used at all. This
+  // does not indicate the full codestream has already been seen, only the
+  // last box of it has been initiated.
+  bool last_codestream_seen;
+  bool got_codestream_signature;
+  bool got_basic_info;
+  bool got_transform_data;  // To skip everything before ICC.
+  bool got_all_headers;     // Codestream metadata headers.
+  bool post_headers;        // Already decoding pixels.
+  jxl::ICCReader icc_reader;
+  jxl::JxlDecoderFrameIndexBox frame_index_box;
+  // This means either we actually got the preview image, or determined we
+  // cannot get it or there is none.
+  bool got_preview_image;
+  bool preview_frame;
+
+  // Position of next_in in the original file including box format if present
+  // (as opposed to position in the codestream)
+  size_t file_pos;
+
+  size_t box_contents_begin;
+  size_t box_contents_end;
+  size_t box_contents_size;
+  size_t box_size;
+  size_t header_size;
+  // Either a final box that runs until EOF, or the case of no container format
+  // at all.
+  bool box_contents_unbounded;
+
+  JxlBoxType box_type;
+  JxlBoxType box_decoded_type;  // Underlying type for brob boxes
+  // Set to true right after a JXL_DEC_BOX event only.
+  bool box_event;
+  bool decompress_boxes;
+
+  bool box_out_buffer_set;
+  // Whether the out buffer is set for the current box, if the user did not yet
+  // release the buffer while the next box is encountered, this will be set to
+  // false. If this is false, no JXL_DEC_NEED_MORE_INPUT is emitted
+  // (irrespective of the value of box_out_buffer_set), because not setting
+  // output indicates the user does not wish the data of this box.
+  bool box_out_buffer_set_current_box;
+  uint8_t* box_out_buffer;
+  size_t box_out_buffer_size;
+  // which byte of the full box content the start of the out buffer points to
+  size_t box_out_buffer_begin;
+  // which byte of box_out_buffer to write to next
+  size_t box_out_buffer_pos;
+
+  // Settings
+  bool keep_orientation;
+  bool unpremul_alpha;
+  bool render_spotcolors;
+  bool coalescing;
+  float desired_intensity_target;
+
+  // Bitfield, for which informative events (JXL_DEC_BASIC_INFO, etc...) the
+  // decoder returns a status. By default, do not return for any of the events,
+  // only return when the decoder cannot continue because it needs more input or
+  // output data.
+  int events_wanted;
+  int orig_events_wanted;
+
+  // Fields for reading the basic info from the header.
+  size_t basic_info_size_hint;
+  bool have_container;
+  size_t box_count;
+
+  // The level of progressive detail in frame decoding.
+  JxlProgressiveDetail prog_detail = kDC;
+  // The progressive detail of the current frame.
+  JxlProgressiveDetail frame_prog_detail;
+  // The intended downsampling ratio for the current progression step.
+  size_t downsampling_target;
+
+  // Set to true if either an image out buffer or an image out callback was set.
+  bool image_out_buffer_set;
+
+  // Owned by the caller, buffer for preview or full resolution image.
+  void* image_out_buffer;
+  JxlImageOutInitCallback image_out_init_callback;
+  JxlImageOutRunCallback image_out_run_callback;
+  JxlImageOutDestroyCallback image_out_destroy_callback;
+  void* image_out_init_opaque;
+  struct SimpleImageOutCallback {
+    JxlImageOutCallback callback;
+    void* opaque;
+  };
+  SimpleImageOutCallback simple_image_out_callback;
+
+  size_t image_out_size;
+
+  JxlPixelFormat image_out_format;
+  JxlBitDepth image_out_bit_depth;
+
+  // For extra channels. Empty if no extra channels are requested, and they are
+  // reset each frame
+  std::vector<ExtraChannelOutput> extra_channel_output;
+
+  jxl::CodecMetadata metadata;
+  // Same as metadata.m, except for the color_encoding, which is set to the
+  // output encoding.
+  jxl::ImageMetadata image_metadata;
+  std::unique_ptr<jxl::ImageBundle> ib;
+
+  std::unique_ptr<jxl::PassesDecoderState> passes_state;
+  std::unique_ptr<jxl::FrameDecoder> frame_dec;
+  size_t next_section;
+  std::vector<char> section_processed;
+
+  // headers and TOC for the current frame. When got_toc is true, this is
+  // always the frame header of the last frame of the current still series,
+  // that is, the displayed frame.
+  std::unique_ptr<jxl::FrameHeader> frame_header;
+
+  size_t remaining_frame_size;
+  FrameStage frame_stage;
+  bool dc_frame_progression_done;
+  // The currently processed frame is the last of the current composite still,
+  // and so must be returned as pixels
+  bool is_last_of_still;
+  // The currently processed frame is the last of the codestream
+  bool is_last_total;
+  // How many frames to skip.
+  size_t skip_frames;
+  // Skipping the current frame. May be false if skip_frames was just set to
+  // a positive value while already processing a current frame, then
+  // skipping_frame will be enabled only for the next frame.
+  bool skipping_frame;
+
+  // Amount of internal frames and external frames started. External frames are
+  // user-visible frames, internal frames includes all external frames and
+  // also invisible frames such as patches, blending-only and dc_level frames.
+  size_t internal_frames;
+  size_t external_frames;
+
+  // For each internal frame, which storage locations it references, and which
+  // storage locations it is stored in, using the bit mask as defined in
+  // FrameDecoder::References and FrameDecoder::SaveAs.
+  std::vector<int> frame_references;
+  std::vector<int> frame_saved_as;
+
+  // Translates external frame index to internal frame index. The external
+  // index is the index of user-visible frames. The internal index can be larger
+  // since non-visible frames (such as frames with patches, ...) are included.
+  std::vector<size_t> frame_external_to_internal;
+
+  // Whether the frame with internal index is required to decode the frame
+  // being skipped to or any frames after that. If no skipping is active,
+  // this vector is ignored. If the current internal frame index is beyond this
+  // vector, it must be treated as a required frame.
+  std::vector<char> frame_required;
+
+  // Codestream input data is copied here temporarily when the decoder needs
+  // more input bytes to process the next part of the stream. We copy the input
+  // data in order to be able to release it all through the API it when
+  // returning JXL_DEC_NEED_MORE_INPUT.
+  std::vector<uint8_t> codestream_copy;
+  // Number of bytes at the end of codestream_copy that were not yet consumed
+  // by calling AdvanceInput().
+  size_t codestream_unconsumed;
+  // Position in the codestream_copy vector that the decoder already finished
+  // processing. It can be greater than the current size of codestream_copy in
+  // case where the decoder skips some parts of the frame that were not yet
+  // provided.
+  size_t codestream_pos;
+  // Number of bits after codestream_pos that were already processed.
+  size_t codestream_bits_ahead;
+
+  BoxStage box_stage;
+
+#if JPEGXL_ENABLE_BOXES
+  jxl::JxlBoxContentDecoder box_content_decoder;
+#endif
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
+  jxl::JxlToJpegDecoder jpeg_decoder;
+  // Decodes Exif or XMP metadata for JPEG reconstruction
+  jxl::JxlBoxContentDecoder metadata_decoder;
+  std::vector<uint8_t> exif_metadata;
+  std::vector<uint8_t> xmp_metadata;
+  // must store JPEG reconstruction metadata from the current box
+  // 0 = not stored, 1 = currently storing, 2 = finished
+  int store_exif;
+  int store_xmp;
+  size_t recon_out_buffer_pos;
+  size_t recon_exif_size;  // Expected exif size as read from the jbrd box
+  size_t recon_xmp_size;   // Expected exif size as read from the jbrd box
+  JpegReconStage recon_output_jpeg;
+
+  bool JbrdNeedMoreBoxes() const {
+    // jbrd box wants exif but exif box not yet seen
+    if (store_exif < 2 && recon_exif_size > 0) return true;
+    // jbrd box wants xmp but xmp box not yet seen
+    if (store_xmp < 2 && recon_xmp_size > 0) return true;
+    return false;
+  }
+#endif
+
+  const uint8_t* next_in;
+  size_t avail_in;
+  bool input_closed;
+
+  void AdvanceInput(size_t size) {
+    JXL_DASSERT(avail_in >= size);
+    next_in += size;
+    avail_in -= size;
+    file_pos += size;
+  }
+
+  size_t AvailableCodestream() const {
+    size_t avail_codestream = avail_in;
+    if (!box_contents_unbounded) {
+      avail_codestream =
+          std::min<size_t>(avail_codestream, box_contents_end - file_pos);
+    }
+    return avail_codestream;
+  }
+
+  void AdvanceCodestream(size_t size) {
+    size_t avail_codestream = AvailableCodestream();
+    if (codestream_copy.empty()) {
+      if (size <= avail_codestream) {
+        AdvanceInput(size);
+      } else {
+        codestream_pos = size - avail_codestream;
+        AdvanceInput(avail_codestream);
+      }
+    } else {
+      codestream_pos += size;
+      if (codestream_pos + codestream_unconsumed >= codestream_copy.size()) {
+        size_t advance = std::min(
+            codestream_unconsumed,
+            codestream_unconsumed + codestream_pos - codestream_copy.size());
+        AdvanceInput(advance);
+        codestream_pos -= std::min(codestream_pos, codestream_copy.size());
+        codestream_unconsumed = 0;
+        codestream_copy.clear();
+      }
+    }
+  }
+
+  JxlDecoderStatus RequestMoreInput() {
+    if (codestream_copy.empty()) {
+      size_t avail_codestream = AvailableCodestream();
+      codestream_copy.insert(codestream_copy.end(), next_in,
+                             next_in + avail_codestream);
+      AdvanceInput(avail_codestream);
+    } else {
+      AdvanceInput(codestream_unconsumed);
+      codestream_unconsumed = 0;
+    }
+    return JXL_DEC_NEED_MORE_INPUT;
+  }
+
+  JxlDecoderStatus GetCodestreamInput(jxl::Span<const uint8_t>* span) {
+    if (codestream_copy.empty() && codestream_pos > 0) {
+      size_t avail_codestream = AvailableCodestream();
+      size_t skip = std::min<size_t>(codestream_pos, avail_codestream);
+      AdvanceInput(skip);
+      codestream_pos -= skip;
+      if (codestream_pos > 0) {
+        return RequestMoreInput();
+      }
+    }
+    JXL_ASSERT(codestream_pos <= codestream_copy.size());
+    JXL_ASSERT(codestream_unconsumed <= codestream_copy.size());
+    size_t avail_codestream = AvailableCodestream();
+    if (codestream_copy.empty()) {
+      if (avail_codestream == 0) {
+        return RequestMoreInput();
+      }
+      *span = jxl::Span<const uint8_t>(next_in, avail_codestream);
+      return JXL_DEC_SUCCESS;
+    } else {
+      codestream_copy.insert(codestream_copy.end(),
+                             next_in + codestream_unconsumed,
+                             next_in + avail_codestream);
+      codestream_unconsumed = avail_codestream;
+      *span = jxl::Span<const uint8_t>(codestream_copy.data() + codestream_pos,
+                                       codestream_copy.size() - codestream_pos);
+      return JXL_DEC_SUCCESS;
+    }
+  }
+
+  // Whether the decoder can use more codestream input for a purpose it needs.
+  // This returns false if the user didn't subscribe to any events that
+  // require the codestream (e.g. only subscribed to metadata boxes), or all
+  // parts of the codestream that are subscribed to (e.g. only basic info) have
+  // already occurred.
+  bool CanUseMoreCodestreamInput() const {
+    // The decoder can set this to finished early if all relevant events were
+    // processed, so this check works.
+    return stage != DecoderStage::kCodestreamFinished;
+  }
+
+  // If set then some operations will fail, if those would require
+  // allocating large objects. Actual memory usage might be two orders of
+  // magnitude bigger.
+  // TODO(eustas): remove once there is working API for memory / CPU limit.
+  size_t memory_limit_base = 0;
+  size_t cpu_limit_base = 0;
+  size_t used_cpu_base = 0;
+};
+
+namespace {
+
+bool CheckSizeLimit(JxlDecoder* dec, size_t xsize, size_t ysize) {
+  if (!dec->memory_limit_base) return true;
+  if (xsize == 0 || ysize == 0) return true;
+  if (xsize >= dec->memory_limit_base || ysize >= dec->memory_limit_base) {
+    return false;
+  }
+  // Rough estimate of real row length.
+  xsize = jxl::DivCeil(xsize, 32) * 32;
+  size_t num_pixels = xsize * ysize;
+  if (num_pixels / xsize != ysize) return false;  // overflow
+  if (num_pixels > dec->memory_limit_base) return false;
+  return true;
+}
+
+}  // namespace
+
+// TODO(zond): Make this depend on the data loaded into the decoder.
+JxlDecoderStatus JxlDecoderDefaultPixelFormat(const JxlDecoder* dec,
+                                              JxlPixelFormat* format) {
+  if (!dec->got_basic_info) return JXL_DEC_NEED_MORE_INPUT;
+  *format = {4, JXL_TYPE_FLOAT, JXL_LITTLE_ENDIAN, 0};
+  return JXL_DEC_SUCCESS;
+}
+
+// Resets the state that must be reset for both Rewind and Reset
+void JxlDecoderRewindDecodingState(JxlDecoder* dec) {
+  dec->stage = DecoderStage::kInited;
+  dec->got_signature = false;
+  dec->last_codestream_seen = false;
+  dec->got_codestream_signature = false;
+  dec->got_basic_info = false;
+  dec->got_transform_data = false;
+  dec->got_all_headers = false;
+  dec->post_headers = false;
+  dec->icc_reader.Reset();
+  dec->got_preview_image = false;
+  dec->preview_frame = false;
+  dec->file_pos = 0;
+  dec->box_contents_begin = 0;
+  dec->box_contents_end = 0;
+  dec->box_contents_size = 0;
+  dec->box_size = 0;
+  dec->header_size = 0;
+  dec->box_contents_unbounded = false;
+  memset(dec->box_type, 0, sizeof(dec->box_type));
+  memset(dec->box_decoded_type, 0, sizeof(dec->box_decoded_type));
+  dec->box_event = false;
+  dec->box_stage = BoxStage::kHeader;
+  dec->box_out_buffer_set = false;
+  dec->box_out_buffer_set_current_box = false;
+  dec->box_out_buffer = nullptr;
+  dec->box_out_buffer_size = 0;
+  dec->box_out_buffer_begin = 0;
+  dec->box_out_buffer_pos = 0;
+
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
+  dec->exif_metadata.clear();
+  dec->xmp_metadata.clear();
+  dec->store_exif = 0;
+  dec->store_xmp = 0;
+  dec->recon_out_buffer_pos = 0;
+  dec->recon_exif_size = 0;
+  dec->recon_xmp_size = 0;
+  dec->recon_output_jpeg = JpegReconStage::kNone;
+#endif
+
+  dec->events_wanted = dec->orig_events_wanted;
+  dec->basic_info_size_hint = InitialBasicInfoSizeHint();
+  dec->have_container = 0;
+  dec->box_count = 0;
+  dec->downsampling_target = 8;
+  dec->image_out_buffer_set = false;
+  dec->image_out_buffer = nullptr;
+  dec->image_out_init_callback = nullptr;
+  dec->image_out_run_callback = nullptr;
+  dec->image_out_destroy_callback = nullptr;
+  dec->image_out_init_opaque = nullptr;
+  dec->image_out_size = 0;
+  dec->image_out_bit_depth.type = JXL_BIT_DEPTH_FROM_PIXEL_FORMAT;
+  dec->extra_channel_output.clear();
+  dec->next_in = 0;
+  dec->avail_in = 0;
+  dec->input_closed = false;
+
+  dec->passes_state.reset(nullptr);
+  dec->frame_dec.reset(nullptr);
+  dec->next_section = 0;
+  dec->section_processed.clear();
+
+  dec->ib.reset();
+  dec->metadata = jxl::CodecMetadata();
+  dec->image_metadata = dec->metadata.m;
+  dec->frame_header.reset(new jxl::FrameHeader(&dec->metadata));
+
+  dec->codestream_copy.clear();
+  dec->codestream_unconsumed = 0;
+  dec->codestream_pos = 0;
+  dec->codestream_bits_ahead = 0;
+
+  dec->frame_stage = FrameStage::kHeader;
+  dec->remaining_frame_size = 0;
+  dec->is_last_of_still = false;
+  dec->is_last_total = false;
+  dec->skip_frames = 0;
+  dec->skipping_frame = false;
+  dec->internal_frames = 0;
+  dec->external_frames = 0;
+}
+
+void JxlDecoderReset(JxlDecoder* dec) {
+  JxlDecoderRewindDecodingState(dec);
+
+  dec->thread_pool.reset();
+  dec->keep_orientation = false;
+  dec->unpremul_alpha = false;
+  dec->render_spotcolors = true;
+  dec->coalescing = true;
+  dec->desired_intensity_target = 0;
+  dec->orig_events_wanted = 0;
+  dec->events_wanted = 0;
+  dec->frame_references.clear();
+  dec->frame_saved_as.clear();
+  dec->frame_external_to_internal.clear();
+  dec->frame_required.clear();
+  dec->decompress_boxes = false;
+}
+
+JxlDecoder* JxlDecoderCreate(const JxlMemoryManager* memory_manager) {
+  JxlMemoryManager local_memory_manager;
+  if (!jxl::MemoryManagerInit(&local_memory_manager, memory_manager))
+    return nullptr;
+
+  void* alloc =
+      jxl::MemoryManagerAlloc(&local_memory_manager, sizeof(JxlDecoder));
+  if (!alloc) return nullptr;
+  // Placement new constructor on allocated memory
+  JxlDecoder* dec = new (alloc) JxlDecoder();
+  dec->memory_manager = local_memory_manager;
+
+#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+  if (!memory_manager) {
+    dec->memory_limit_base = 53 << 16;
+    // Allow 5 x max_image_size processing units; every frame is accounted
+    // as W x H CPU processing units, so there could be numerous small frames
+    // or few larger ones.
+    dec->cpu_limit_base = 5 * dec->memory_limit_base;
+  }
+#endif
+
+  JxlDecoderReset(dec);
+
+  return dec;
+}
+
+void JxlDecoderDestroy(JxlDecoder* dec) {
+  if (dec) {
+    JxlMemoryManager local_memory_manager = dec->memory_manager;
+    // Call destructor directly since custom free function is used.
+    dec->~JxlDecoder();
+    jxl::MemoryManagerFree(&local_memory_manager, dec);
+  }
+}
+
+void JxlDecoderRewind(JxlDecoder* dec) { JxlDecoderRewindDecodingState(dec); }
+
+void JxlDecoderSkipFrames(JxlDecoder* dec, size_t amount) {
+  // Increment amount, rather than set it: making the amount smaller is
+  // impossible because the decoder may already have skipped frames required to
+  // decode earlier frames, and making the amount larger compared to an existing
+  // amount is impossible because if JxlDecoderSkipFrames is called in the
+  // middle of already skipping frames, the user cannot know how many frames
+  // have already been skipped internally so far so an absolute value cannot
+  // be defined.
+  dec->skip_frames += amount;
+
+  dec->frame_required.clear();
+  size_t next_frame = dec->external_frames + dec->skip_frames;
+
+  // A frame that has been seen before a rewind
+  if (next_frame < dec->frame_external_to_internal.size()) {
+    size_t internal_index = dec->frame_external_to_internal[next_frame];
+    if (internal_index < dec->frame_saved_as.size()) {
+      std::vector<size_t> deps = GetFrameDependencies(
+          internal_index, dec->frame_saved_as, dec->frame_references);
+
+      dec->frame_required.resize(internal_index + 1, 0);
+      for (size_t i = 0; i < deps.size(); i++) {
+        JXL_ASSERT(deps[i] < dec->frame_required.size());
+        dec->frame_required[deps[i]] = 1;
+      }
+    }
+  }
+}
+
+JxlDecoderStatus JxlDecoderSkipCurrentFrame(JxlDecoder* dec) {
+  if (dec->frame_stage != FrameStage::kFull) {
+    return JXL_DEC_ERROR;
+  }
+  JXL_DASSERT(dec->frame_dec);
+  dec->frame_stage = FrameStage::kHeader;
+  dec->AdvanceCodestream(dec->remaining_frame_size);
+  if (dec->is_last_of_still) {
+    dec->image_out_buffer_set = false;
+  }
+  return JXL_DEC_SUCCESS;
+}
+
+JXL_EXPORT JxlDecoderStatus
+JxlDecoderSetParallelRunner(JxlDecoder* dec, JxlParallelRunner parallel_runner,
+                            void* parallel_runner_opaque) {
+  if (dec->stage != DecoderStage::kInited) {
+    return JXL_API_ERROR("parallel_runner must be set before starting");
+  }
+  dec->thread_pool.reset(
+      new jxl::ThreadPool(parallel_runner, parallel_runner_opaque));
+  return JXL_DEC_SUCCESS;
+}
+
+size_t JxlDecoderSizeHintBasicInfo(const JxlDecoder* dec) {
+  if (dec->got_basic_info) return 0;
+  return dec->basic_info_size_hint;
+}
+
+JxlDecoderStatus JxlDecoderSubscribeEvents(JxlDecoder* dec, int events_wanted) {
+  if (dec->stage != DecoderStage::kInited) {
+    return JXL_DEC_ERROR;  // Cannot subscribe to events after having started.
+  }
+  if (events_wanted & 63) {
+    return JXL_DEC_ERROR;  // Can only subscribe to informative events.
+  }
+  dec->events_wanted = events_wanted;
+  dec->orig_events_wanted = events_wanted;
+  return JXL_DEC_SUCCESS;
+}
+
+JxlDecoderStatus JxlDecoderSetKeepOrientation(JxlDecoder* dec,
+                                              JXL_BOOL skip_reorientation) {
+  if (dec->stage != DecoderStage::kInited) {
+    return JXL_API_ERROR("Must set keep_orientation option before starting");
+  }
+  dec->keep_orientation = !!skip_reorientation;
+  return JXL_DEC_SUCCESS;
+}
+
+JxlDecoderStatus JxlDecoderSetUnpremultiplyAlpha(JxlDecoder* dec,
+                                                 JXL_BOOL unpremul_alpha) {
+  if (dec->stage != DecoderStage::kInited) {
+    return JXL_API_ERROR("Must set unpremul_alpha option before starting");
+  }
+  dec->unpremul_alpha = !!unpremul_alpha;
+  return JXL_DEC_SUCCESS;
+}
+
+JxlDecoderStatus JxlDecoderSetRenderSpotcolors(JxlDecoder* dec,
+                                               JXL_BOOL render_spotcolors) {
+  if (dec->stage != DecoderStage::kInited) {
+    return JXL_API_ERROR("Must set render_spotcolors option before starting");
+  }
+  dec->render_spotcolors = !!render_spotcolors;
+  return JXL_DEC_SUCCESS;
+}
+
+JxlDecoderStatus JxlDecoderSetCoalescing(JxlDecoder* dec, JXL_BOOL coalescing) {
+  if (dec->stage != DecoderStage::kInited) {
+    return JXL_API_ERROR("Must set coalescing option before starting");
+  }
+  dec->coalescing = !!coalescing;
+  return JXL_DEC_SUCCESS;
+}
+
+namespace {
+// helper function to get the dimensions of the current image buffer
+void GetCurrentDimensions(const JxlDecoder* dec, size_t& xsize, size_t& ysize) {
+  if (dec->frame_header->nonserialized_is_preview) {
+    xsize = dec->metadata.oriented_preview_xsize(dec->keep_orientation);
+    ysize = dec->metadata.oriented_preview_ysize(dec->keep_orientation);
+    return;
+  }
+  xsize = dec->metadata.oriented_xsize(dec->keep_orientation);
+  ysize = dec->metadata.oriented_ysize(dec->keep_orientation);
+  if (!dec->coalescing) {
+    const auto frame_dim = dec->frame_header->ToFrameDimensions();
+    xsize = frame_dim.xsize_upsampled;
+    ysize = frame_dim.ysize_upsampled;
+    if (!dec->keep_orientation &&
+        static_cast<int>(dec->metadata.m.GetOrientation()) > 4) {
+      std::swap(xsize, ysize);
+    }
+  }
+}
+}  // namespace
+
+namespace jxl {
+namespace {
+
+template <class T>
+bool CanRead(Span<const uint8_t> data, BitReader* reader, T* JXL_RESTRICT t) {
+  // Use a copy of the bit reader because CanRead advances bits.
+  BitReader reader2(data);
+  reader2.SkipBits(reader->TotalBitsConsumed());
+  bool result = Bundle::CanRead(&reader2, t);
+  JXL_ASSERT(reader2.Close());
+  return result;
+}
+
+// Returns JXL_DEC_SUCCESS if the full bundle was successfully read, status
+// indicating either error or need more input otherwise.
+template <class T>
+JxlDecoderStatus ReadBundle(JxlDecoder* dec, Span<const uint8_t> data,
+                            BitReader* reader, T* JXL_RESTRICT t) {
+  if (!CanRead(data, reader, t)) {
+    return dec->RequestMoreInput();
+  }
+  if (!Bundle::Read(reader, t)) {
+    return JXL_DEC_ERROR;
+  }
+  return JXL_DEC_SUCCESS;
+}
+
+#define JXL_API_RETURN_IF_ERROR(expr)               \
+  {                                                 \
+    JxlDecoderStatus status_ = ConvertStatus(expr); \
+    if (status_ != JXL_DEC_SUCCESS) return status_; \
+  }
+
+std::unique_ptr<BitReader, std::function<void(BitReader*)>> GetBitReader(
+    Span<const uint8_t> span) {
+  BitReader* reader = new BitReader(span);
+  return std::unique_ptr<BitReader, std::function<void(BitReader*)>>(
+      reader, [](BitReader* reader) {
+        // We can't allow Close to abort the program if the reader is out of
+        // bounds, or all return paths in the code, even those that already
+        // return failure, would have to manually call AllReadsWithinBounds().
+        // Invalid JXL codestream should not cause program to quit.
+        (void)reader->AllReadsWithinBounds();
+        (void)reader->Close();
+        delete reader;
+      });
+}
+
+JxlDecoderStatus JxlDecoderReadBasicInfo(JxlDecoder* dec) {
+  if (!dec->got_codestream_signature) {
+    // Check and skip the codestream signature
+    Span<const uint8_t> span;
+    JXL_API_RETURN_IF_ERROR(dec->GetCodestreamInput(&span));
+    if (span.size() < 2) {
+      return dec->RequestMoreInput();
+    }
+    if (span.data()[0] != 0xff || span.data()[1] != jxl::kCodestreamMarker) {
+      return JXL_API_ERROR("invalid signature");
+    }
+    dec->got_codestream_signature = true;
+    dec->AdvanceCodestream(2);
+  }
+
+  Span<const uint8_t> span;
+  JXL_API_RETURN_IF_ERROR(dec->GetCodestreamInput(&span));
+  auto reader = GetBitReader(span);
+  JXL_API_RETURN_IF_ERROR(
+      ReadBundle(dec, span, reader.get(), &dec->metadata.size));
+  JXL_API_RETURN_IF_ERROR(
+      ReadBundle(dec, span, reader.get(), &dec->metadata.m));
+  size_t total_bits = reader->TotalBitsConsumed();
+  dec->AdvanceCodestream(total_bits / jxl::kBitsPerByte);
+  dec->codestream_bits_ahead = total_bits % jxl::kBitsPerByte;
+  dec->got_basic_info = true;
+  dec->basic_info_size_hint = 0;
+  dec->image_metadata = dec->metadata.m;
+  JXL_DEBUG_V(2, "Decoded BasicInfo: %s", dec->metadata.DebugString().c_str());
+
+  if (!CheckSizeLimit(dec, dec->metadata.size.xsize(),
+                      dec->metadata.size.ysize())) {
+    return JXL_API_ERROR("image is too large");
+  }
+
+  return JXL_DEC_SUCCESS;
+}
+
+// Reads all codestream headers (but not frame headers)
+JxlDecoderStatus JxlDecoderReadAllHeaders(JxlDecoder* dec) {
+  if (!dec->got_transform_data) {
+    Span<const uint8_t> span;
+    JXL_API_RETURN_IF_ERROR(dec->GetCodestreamInput(&span));
+    auto reader = GetBitReader(span);
+    reader->SkipBits(dec->codestream_bits_ahead);
+    dec->metadata.transform_data.nonserialized_xyb_encoded =
+        dec->metadata.m.xyb_encoded;
+    JXL_API_RETURN_IF_ERROR(
+        ReadBundle(dec, span, reader.get(), &dec->metadata.transform_data));
+    size_t total_bits = reader->TotalBitsConsumed();
+    dec->AdvanceCodestream(total_bits / jxl::kBitsPerByte);
+    dec->codestream_bits_ahead = total_bits % jxl::kBitsPerByte;
+    dec->got_transform_data = true;
+  }
+
+  Span<const uint8_t> span;
+  JXL_API_RETURN_IF_ERROR(dec->GetCodestreamInput(&span));
+  auto reader = GetBitReader(span);
+  reader->SkipBits(dec->codestream_bits_ahead);
+
+  if (dec->metadata.m.color_encoding.WantICC()) {
+    jxl::Status status =
+        dec->icc_reader.Init(reader.get(), dec->memory_limit_base);
+    // Always check AllReadsWithinBounds, not all the C++ decoder implementation
+    // handles reader out of bounds correctly  yet (e.g. context map). Not
+    // checking AllReadsWithinBounds can cause reader->Close() to trigger an
+    // assert, but we don't want library to quit program for invalid codestream.
+    if (!reader->AllReadsWithinBounds() ||
+        status.code() == StatusCode::kNotEnoughBytes) {
+      return dec->RequestMoreInput();
+    }
+    if (!status) {
+      // Other non-successful status is an error
+      return JXL_DEC_ERROR;
+    }
+    PaddedBytes icc;
+    status = dec->icc_reader.Process(reader.get(), &icc);
+    if (status.code() == StatusCode::kNotEnoughBytes) {
+      return dec->RequestMoreInput();
+    }
+    if (!status) {
+      // Other non-successful status is an error
+      return JXL_DEC_ERROR;
+    }
+    if (!dec->metadata.m.color_encoding.SetICCRaw(std::move(icc))) {
+      return JXL_DEC_ERROR;
+    }
+  }
+
+  dec->got_all_headers = true;
+  JXL_API_RETURN_IF_ERROR(reader->JumpToByteBoundary());
+
+  dec->AdvanceCodestream(reader->TotalBitsConsumed() / jxl::kBitsPerByte);
+  dec->codestream_bits_ahead = 0;
+
+  if (!dec->passes_state) {
+    dec->passes_state.reset(new jxl::PassesDecoderState());
+  }
+
+  JXL_API_RETURN_IF_ERROR(
+      dec->passes_state->output_encoding_info.SetFromMetadata(dec->metadata));
+  if (dec->desired_intensity_target > 0) {
+    dec->passes_state->output_encoding_info.desired_intensity_target =
+        dec->desired_intensity_target;
+  }
+  dec->image_metadata = dec->metadata.m;
+
+  return JXL_DEC_SUCCESS;
+}
+
+JxlDecoderStatus JxlDecoderProcessSections(JxlDecoder* dec) {
+  Span<const uint8_t> span;
+  JXL_API_RETURN_IF_ERROR(dec->GetCodestreamInput(&span));
+  const auto& toc = dec->frame_dec->Toc();
+  size_t pos = 0;
+  std::vector<jxl::FrameDecoder::SectionInfo> section_info;
+  std::vector<jxl::FrameDecoder::SectionStatus> section_status;
+  for (size_t i = dec->next_section; i < toc.size(); ++i) {
+    if (dec->section_processed[i]) {
+      pos += toc[i].size;
+      continue;
+    }
+    size_t id = toc[i].id;
+    size_t size = toc[i].size;
+    if (OutOfBounds(pos, size, span.size())) {
+      break;
+    }
+    auto br =
+        new jxl::BitReader(jxl::Span<const uint8_t>(span.data() + pos, size));
+    section_info.emplace_back(jxl::FrameDecoder::SectionInfo{br, id, i});
+    section_status.emplace_back();
+    pos += size;
+  }
+  jxl::Status status = dec->frame_dec->ProcessSections(
+      section_info.data(), section_info.size(), section_status.data());
+  bool out_of_bounds = false;
+  for (const auto& info : section_info) {
+    if (!info.br->AllReadsWithinBounds()) {
+      // Mark out of bounds section, but keep closing and deleting the next
+      // ones as well.
+      out_of_bounds = true;
+    }
+    JXL_ASSERT(info.br->Close());
+    delete info.br;
+  }
+  if (out_of_bounds) {
+    // If any bit reader indicates out of bounds, it's an error, not just
+    // needing more input, since we ensure only bit readers containing
+    // a complete section are provided to the FrameDecoder.
+    return JXL_API_ERROR("frame out of bounds");
+  }
+  if (!status) {
+    return JXL_API_ERROR("frame processing failed");
+  }
+  for (size_t i = 0; i < section_status.size(); ++i) {
+    auto status = section_status[i];
+    if (status == jxl::FrameDecoder::kDone) {
+      dec->section_processed[section_info[i].index] = 1;
+    } else if (status != jxl::FrameDecoder::kSkipped) {
+      return JXL_API_ERROR("unexpected section status");
+    }
+  }
+  size_t completed_prefix_bytes = 0;
+  while (dec->next_section < dec->section_processed.size() &&
+         dec->section_processed[dec->next_section] == 1) {
+    completed_prefix_bytes += toc[dec->next_section].size;
+    ++dec->next_section;
+  }
+  dec->remaining_frame_size -= completed_prefix_bytes;
+  dec->AdvanceCodestream(completed_prefix_bytes);
+  return JXL_DEC_SUCCESS;
+}
+
+// TODO(eustas): no CodecInOut -> no image size reinforcement -> possible OOM.
+JxlDecoderStatus JxlDecoderProcessCodestream(JxlDecoder* dec) {
+  // If no parallel runner is set, use the default
+  // TODO(lode): move this initialization to an appropriate location once the
+  // runner is used to decode pixels.
+  if (!dec->thread_pool) {
+    dec->thread_pool.reset(new jxl::ThreadPool(nullptr, nullptr));
+  }
+
+  // No matter what events are wanted, the basic info is always required.
+  if (!dec->got_basic_info) {
+    JxlDecoderStatus status = JxlDecoderReadBasicInfo(dec);
+    if (status != JXL_DEC_SUCCESS) return status;
+  }
+
+  if (dec->events_wanted & JXL_DEC_BASIC_INFO) {
+    dec->events_wanted &= ~JXL_DEC_BASIC_INFO;
+    return JXL_DEC_BASIC_INFO;
+  }
+
+  if (!dec->events_wanted) {
+    dec->stage = DecoderStage::kCodestreamFinished;
+    return JXL_DEC_SUCCESS;
+  }
+
+  if (!dec->got_all_headers) {
+    JxlDecoderStatus status = JxlDecoderReadAllHeaders(dec);
+    if (status != JXL_DEC_SUCCESS) return status;
+  }
+
+  if (dec->events_wanted & JXL_DEC_COLOR_ENCODING) {
+    dec->events_wanted &= ~JXL_DEC_COLOR_ENCODING;
+    return JXL_DEC_COLOR_ENCODING;
+  }
+
+  if (!dec->events_wanted) {
+    dec->stage = DecoderStage::kCodestreamFinished;
+    return JXL_DEC_SUCCESS;
+  }
+
+  dec->post_headers = true;
+
+  if (!dec->got_preview_image && dec->metadata.m.have_preview) {
+    dec->preview_frame = true;
+  }
+
+  // Handle frames
+  for (;;) {
+    bool parse_frames =
+        (dec->events_wanted &
+         (JXL_DEC_PREVIEW_IMAGE | JXL_DEC_FRAME | JXL_DEC_FULL_IMAGE));
+    if (!parse_frames) {
+      break;
+    }
+    if (dec->frame_stage == FrameStage::kHeader && dec->is_last_total) {
+      break;
+    }
+    if (dec->frame_stage == FrameStage::kHeader) {
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
+      if (dec->recon_output_jpeg == JpegReconStage::kSettingMetadata ||
+          dec->recon_output_jpeg == JpegReconStage::kOutputting) {
+        // The image bundle contains the JPEG reconstruction frame, but the
+        // decoder is still waiting to decode an EXIF or XMP box. It's not
+        // implemented to decode additional frames during this, and a JPEG
+        // reconstruction image should have only one frame.
+        return JXL_API_ERROR(
+            "cannot decode a next frame after JPEG reconstruction frame");
+      }
+#endif
+      if (!dec->ib) {
+        dec->ib.reset(new jxl::ImageBundle(&dec->image_metadata));
+      }
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
+      // If JPEG reconstruction is wanted and possible, set the jpeg_data of
+      // the ImageBundle.
+      if (!dec->jpeg_decoder.SetImageBundleJpegData(dec->ib.get()))
+        return JXL_DEC_ERROR;
+#endif
+      dec->frame_dec.reset(new FrameDecoder(
+          dec->passes_state.get(), dec->metadata, dec->thread_pool.get(),
+          /*use_slow_rendering_pipeline=*/false));
+      dec->frame_header.reset(new FrameHeader(&dec->metadata));
+      Span<const uint8_t> span;
+      JXL_API_RETURN_IF_ERROR(dec->GetCodestreamInput(&span));
+      auto reader = GetBitReader(span);
+      jxl::Status status = dec->frame_dec->InitFrame(
+          reader.get(), dec->ib.get(), dec->preview_frame);
+      if (!reader->AllReadsWithinBounds() ||
+          status.code() == StatusCode::kNotEnoughBytes) {
+        return dec->RequestMoreInput();
+      } else if (!status) {
+        return JXL_API_ERROR("invalid frame header");
+      }
+      dec->AdvanceCodestream(reader->TotalBitsConsumed() / kBitsPerByte);
+      *dec->frame_header = dec->frame_dec->GetFrameHeader();
+      jxl::FrameDimensions frame_dim = dec->frame_header->ToFrameDimensions();
+      if (!CheckSizeLimit(dec, frame_dim.xsize_upsampled_padded,
+                          frame_dim.ysize_upsampled_padded)) {
+        return JXL_API_ERROR("frame is too large");
+      }
+      bool output_needed =
+          (dec->preview_frame ? (dec->events_wanted & JXL_DEC_PREVIEW_IMAGE)
+                              : (dec->events_wanted & JXL_DEC_FULL_IMAGE));
+      if (output_needed) {
+        JXL_API_RETURN_IF_ERROR(dec->frame_dec->InitFrameOutput());
+      }
+      if (dec->cpu_limit_base != 0) {
+        // No overflow, checked in CheckSizeLimit.
+        size_t num_pixels = frame_dim.xsize * frame_dim.ysize;
+        if (dec->used_cpu_base + num_pixels < dec->used_cpu_base) {
+          return JXL_API_ERROR("used too much CPU");
+        }
+        dec->used_cpu_base += num_pixels;
+        if (dec->used_cpu_base > dec->cpu_limit_base) {
+          return JXL_API_ERROR("used too much CPU");
+        }
+      }
+      dec->remaining_frame_size = dec->frame_dec->SumSectionSizes();
+
+      dec->frame_stage = FrameStage::kTOC;
+      if (dec->preview_frame) {
+        if (!(dec->events_wanted & JXL_DEC_PREVIEW_IMAGE)) {
+          dec->frame_stage = FrameStage::kHeader;
+          dec->AdvanceCodestream(dec->remaining_frame_size);
+          dec->got_preview_image = true;
+          dec->preview_frame = false;
+        }
+        continue;
+      }
+
+      int saved_as = FrameDecoder::SavedAs(*dec->frame_header);
+      // is last in entire codestream
+      dec->is_last_total = dec->frame_header->is_last;
+      // is last of current still
+      dec->is_last_of_still =
+          dec->is_last_total || dec->frame_header->animation_frame.duration > 0;
+      // is kRegularFrame and coalescing is disabled
+      dec->is_last_of_still |=
+          (!dec->coalescing &&
+           dec->frame_header->frame_type == FrameType::kRegularFrame);
+      const size_t internal_frame_index = dec->internal_frames;
+      const size_t external_frame_index = dec->external_frames;
+      if (dec->is_last_of_still) dec->external_frames++;
+      dec->internal_frames++;
+
+      if (dec->skip_frames > 0) {
+        dec->skipping_frame = true;
+        if (dec->is_last_of_still) {
+          dec->skip_frames--;
+        }
+      } else {
+        dec->skipping_frame = false;
+      }
+
+      if (external_frame_index >= dec->frame_external_to_internal.size()) {
+        dec->frame_external_to_internal.push_back(internal_frame_index);
+        JXL_ASSERT(dec->frame_external_to_internal.size() ==
+                   external_frame_index + 1);
+      }
+
+      if (internal_frame_index >= dec->frame_saved_as.size()) {
+        dec->frame_saved_as.push_back(saved_as);
+        JXL_ASSERT(dec->frame_saved_as.size() == internal_frame_index + 1);
+
+        // add the value 0xff (which means all references) to new slots: we only
+        // know the references of the frame at FinalizeFrame, and fill in the
+        // correct values there. As long as this information is not known, the
+        // worst case where the frame depends on all storage slots is assumed.
+        dec->frame_references.push_back(0xff);
+        JXL_ASSERT(dec->frame_references.size() == internal_frame_index + 1);
+      }
+
+      if (dec->skipping_frame) {
+        // Whether this frame could be referenced by any future frame: either
+        // because it's a frame saved for blending or patches, or because it's
+        // a DC frame.
+        bool referenceable =
+            dec->frame_header->CanBeReferenced() ||
+            dec->frame_header->frame_type == FrameType::kDCFrame;
+        if (internal_frame_index < dec->frame_required.size() &&
+            !dec->frame_required[internal_frame_index]) {
+          referenceable = false;
+        }
+        if (!referenceable) {
+          // Skip all decoding for this frame, since the user is skipping this
+          // frame and no future frames can reference it.
+          dec->frame_stage = FrameStage::kHeader;
+          dec->AdvanceCodestream(dec->remaining_frame_size);
+          continue;
+        }
+      }
+
+      if ((dec->events_wanted & JXL_DEC_FRAME) && dec->is_last_of_still) {
+        // Only return this for the last of a series of stills: patches frames
+        // etc... before this one do not contain the correct information such
+        // as animation timing, ...
+        if (!dec->skipping_frame) {
+          return JXL_DEC_FRAME;
+        }
+      }
+    }
+
+    if (dec->frame_stage == FrameStage::kTOC) {
+      dec->frame_dec->SetRenderSpotcolors(dec->render_spotcolors);
+      dec->frame_dec->SetCoalescing(dec->coalescing);
+
+      if (!dec->preview_frame &&
+          (dec->events_wanted & JXL_DEC_FRAME_PROGRESSION)) {
+        dec->frame_prog_detail =
+            dec->frame_dec->SetPauseAtProgressive(dec->prog_detail);
+      } else {
+        dec->frame_prog_detail = JxlProgressiveDetail::kFrames;
+      }
+      dec->dc_frame_progression_done = 0;
+
+      dec->next_section = 0;
+      dec->section_processed.clear();
+      dec->section_processed.resize(dec->frame_dec->Toc().size(), 0);
+
+      // If we don't need pixels, we can skip actually decoding the frames.
+      if (dec->preview_frame || (dec->events_wanted & JXL_DEC_FULL_IMAGE)) {
+        dec->frame_stage = FrameStage::kFull;
+      } else if (!dec->is_last_total) {
+        dec->frame_stage = FrameStage::kHeader;
+        dec->AdvanceCodestream(dec->remaining_frame_size);
+        continue;
+      } else {
+        break;
+      }
+    }
+
+    if (dec->frame_stage == FrameStage::kFull) {
+      if (!dec->image_out_buffer_set) {
+        if (dec->preview_frame) {
+          return JXL_DEC_NEED_PREVIEW_OUT_BUFFER;
+        }
+        if (
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
+            (!dec->jpeg_decoder.IsOutputSet() ||
+             dec->ib->jpeg_data == nullptr) &&
+#endif
+            dec->is_last_of_still && !dec->skipping_frame) {
+          // TODO(lode): remove the dec->is_last_of_still condition if the
+          // frame decoder needs the image buffer as working space for decoding
+          // non-visible or blending frames too
+          return JXL_DEC_NEED_IMAGE_OUT_BUFFER;
+        }
+      }
+
+      if (dec->image_out_buffer_set) {
+        size_t xsize, ysize;
+        GetCurrentDimensions(dec, xsize, ysize);
+        size_t bits_per_sample = GetBitDepth(
+            dec->image_out_bit_depth, dec->metadata.m, dec->image_out_format);
+        dec->frame_dec->SetImageOutput(
+            PixelCallback{
+                dec->image_out_init_callback, dec->image_out_run_callback,
+                dec->image_out_destroy_callback, dec->image_out_init_opaque},
+            reinterpret_cast<uint8_t*>(dec->image_out_buffer),
+            dec->image_out_size, xsize, ysize, dec->image_out_format,
+            bits_per_sample, dec->unpremul_alpha, !dec->keep_orientation);
+        for (size_t i = 0; i < dec->extra_channel_output.size(); ++i) {
+          const auto& extra = dec->extra_channel_output[i];
+          size_t ec_bits_per_sample =
+              GetBitDepth(dec->image_out_bit_depth,
+                          dec->metadata.m.extra_channel_info[i], extra.format);
+          dec->frame_dec->AddExtraChannelOutput(extra.buffer, extra.buffer_size,
+                                                xsize, extra.format,
+                                                ec_bits_per_sample);
+        }
+      }
+
+      size_t next_num_passes_to_pause = dec->frame_dec->NextNumPassesToPause();
+
+      JXL_API_RETURN_IF_ERROR(JxlDecoderProcessSections(dec));
+
+      bool all_sections_done = dec->frame_dec->HasDecodedAll();
+      bool got_dc_only = !all_sections_done && dec->frame_dec->HasDecodedDC();
+
+      if (dec->frame_prog_detail >= JxlProgressiveDetail::kDC &&
+          !dec->dc_frame_progression_done && got_dc_only) {
+        dec->dc_frame_progression_done = true;
+        dec->downsampling_target = 8;
+        return JXL_DEC_FRAME_PROGRESSION;
+      }
+
+      bool new_progression_step_done =
+          dec->frame_dec->NumCompletePasses() >= next_num_passes_to_pause;
+
+      if (!all_sections_done &&
+          dec->frame_prog_detail >= JxlProgressiveDetail::kLastPasses &&
+          new_progression_step_done) {
+        dec->downsampling_target =
+            dec->frame_header->passes.GetDownsamplingTargetForCompletedPasses(
+                dec->frame_dec->NumCompletePasses());
+        return JXL_DEC_FRAME_PROGRESSION;
+      }
+
+      if (!all_sections_done) {
+        // Not all sections have been processed yet
+        return dec->RequestMoreInput();
+      }
+
+      if (!dec->preview_frame) {
+        size_t internal_index = dec->internal_frames - 1;
+        JXL_ASSERT(dec->frame_references.size() > internal_index);
+        // Always fill this in, even if it was already written, it could be that
+        // this frame was skipped before and set to 255, while only now we know
+        // the true value.
+        dec->frame_references[internal_index] = dec->frame_dec->References();
+      }
+
+      if (!dec->frame_dec->FinalizeFrame()) {
+        return JXL_API_ERROR("decoding frame failed");
+      }
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
+      // If jpeg output was requested, we merely return the JXL_DEC_FULL_IMAGE
+      // status without outputting pixels.
+      if (dec->jpeg_decoder.IsOutputSet() && dec->ib->jpeg_data != nullptr) {
+        dec->frame_stage = FrameStage::kHeader;
+        dec->recon_output_jpeg = JpegReconStage::kSettingMetadata;
+        return JXL_DEC_FULL_IMAGE;
+      }
+#endif
+      if (dec->preview_frame || dec->is_last_of_still) {
+        dec->image_out_buffer_set = false;
+        dec->extra_channel_output.clear();
+      }
+    }
+
+    dec->frame_stage = FrameStage::kHeader;
+
+    // The pixels have been output or are not needed, do not keep them in
+    // memory here.
+    dec->ib.reset();
+    if (dec->preview_frame) {
+      dec->got_preview_image = true;
+      dec->preview_frame = false;
+      dec->events_wanted &= ~JXL_DEC_PREVIEW_IMAGE;
+      return JXL_DEC_PREVIEW_IMAGE;
+    } else if (dec->is_last_of_still &&
+               (dec->events_wanted & JXL_DEC_FULL_IMAGE) &&
+               !dec->skipping_frame) {
+      return JXL_DEC_FULL_IMAGE;
+    }
+  }
+
+  dec->stage = DecoderStage::kCodestreamFinished;
+  // Return success, this means there is nothing more to do.
+  return JXL_DEC_SUCCESS;
+}
+
+}  // namespace
+}  // namespace jxl
+
+JxlDecoderStatus JxlDecoderSetInput(JxlDecoder* dec, const uint8_t* data,
+                                    size_t size) {
+  if (dec->next_in) {
+    return JXL_API_ERROR("already set input, use JxlDecoderReleaseInput first");
+  }
+  if (dec->input_closed) {
+    return JXL_API_ERROR("input already closed");
+  }
+
+  dec->next_in = data;
+  dec->avail_in = size;
+  return JXL_DEC_SUCCESS;
+}
+
+size_t JxlDecoderReleaseInput(JxlDecoder* dec) {
+  size_t result = dec->avail_in;
+  dec->next_in = nullptr;
+  dec->avail_in = 0;
+  return result;
+}
+
+void JxlDecoderCloseInput(JxlDecoder* dec) { dec->input_closed = true; }
+
+JxlDecoderStatus JxlDecoderSetJPEGBuffer(JxlDecoder* dec, uint8_t* data,
+                                         size_t size) {
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
+  // JPEG reconstruction buffer can only set and updated before or during the
+  // first frame, the reconstruction box refers to the first frame and in
+  // theory multi-frame images should not be used with a jbrd box.
+  if (dec->internal_frames > 1) {
+    return JXL_API_ERROR("JPEG reconstruction only works for the first frame");
+  }
+  if (dec->jpeg_decoder.IsOutputSet()) {
+    return JXL_API_ERROR("Already set JPEG buffer");
+  }
+  return dec->jpeg_decoder.SetOutputBuffer(data, size);
+#else
+  return JXL_API_ERROR("JPEG reconstruction is not supported.");
+#endif
+}
+
+size_t JxlDecoderReleaseJPEGBuffer(JxlDecoder* dec) {
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
+  return dec->jpeg_decoder.ReleaseOutputBuffer();
+#else
+  return JXL_API_ERROR("JPEG reconstruction is not supported.");
+#endif
+}
+
+// Parses the header of the box, outputting the 4-character type and the box
+// size, including header size, as stored in the box header.
+// @param in current input bytes.
+// @param size available input size.
+// @param pos position in the input, must begin at the header of the box.
+// @param file_pos position of pos since the start of the JXL file, rather than
+// the current input, used for integer overflow checking.
+// @param type the output box type.
+// @param box_size output the total box size, including header, in bytes, or 0
+// if it's a final unbounded box.
+// @param header_size output size of the box header.
+// @return JXL_DEC_SUCCESS if the box header was fully parsed. In that case the
+// parsing position must be incremented by header_size bytes.
+// JXL_DEC_NEED_MORE_INPUT if not enough input bytes available, in that case
+// header_size indicates a lower bound for the known size the header has to be
+// at least. JXL_DEC_ERROR if the box header is invalid.
+static JxlDecoderStatus ParseBoxHeader(const uint8_t* in, size_t size,
+                                       size_t pos, size_t file_pos,
+                                       JxlBoxType type, uint64_t* box_size,
+                                       uint64_t* header_size) {
+  if (OutOfBounds(pos, 8, size)) {
+    *header_size = 8;
+    return JXL_DEC_NEED_MORE_INPUT;
+  }
+  size_t box_start = pos;
+  // Box size, including this header itself.
+  *box_size = LoadBE32(in + pos);
+  pos += 4;
+  if (*box_size == 1) {
+    *header_size = 16;
+    if (OutOfBounds(pos, 12, size)) return JXL_DEC_NEED_MORE_INPUT;
+    *box_size = LoadBE64(in + pos);
+    pos += 8;
+  }
+  memcpy(type, in + pos, 4);
+  pos += 4;
+  *header_size = pos - box_start;
+  if (*box_size > 0 && *box_size < *header_size) {
+    return JXL_API_ERROR("invalid box size");
+  }
+  if (SumOverflows(file_pos, pos, *box_size)) {
+    return JXL_API_ERROR("Box size overflow");
+  }
+  return JXL_DEC_SUCCESS;
+}
+
+// This includes handling the codestream if it is not a box-based jxl file.
+static JxlDecoderStatus HandleBoxes(JxlDecoder* dec) {
+  // Box handling loop
+  for (;;) {
+    if (dec->box_stage != BoxStage::kHeader) {
+      dec->AdvanceInput(dec->header_size);
+      dec->header_size = 0;
+#if JPEGXL_ENABLE_BOXES
+      if ((dec->events_wanted & JXL_DEC_BOX) &&
+          dec->box_out_buffer_set_current_box) {
+        uint8_t* next_out = dec->box_out_buffer + dec->box_out_buffer_pos;
+        size_t avail_out = dec->box_out_buffer_size - dec->box_out_buffer_pos;
+
+        JxlDecoderStatus box_result = dec->box_content_decoder.Process(
+            dec->next_in, dec->avail_in,
+            dec->file_pos - dec->box_contents_begin, &next_out, &avail_out);
+        size_t produced =
+            next_out - (dec->box_out_buffer + dec->box_out_buffer_pos);
+        dec->box_out_buffer_pos += produced;
+
+        // Don't return JXL_DEC_NEED_MORE_INPUT: the box stages below, instead,
+        // handle the input progression, and the above only outputs the part of
+        // the box seen so far.
+        if (box_result != JXL_DEC_SUCCESS &&
+            box_result != JXL_DEC_NEED_MORE_INPUT) {
+          return box_result;
+        }
+      }
+#endif
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
+      if (dec->store_exif == 1 || dec->store_xmp == 1) {
+        std::vector<uint8_t>& metadata =
+            (dec->store_exif == 1) ? dec->exif_metadata : dec->xmp_metadata;
+        for (;;) {
+          if (metadata.empty()) metadata.resize(64);
+          uint8_t* orig_next_out = metadata.data() + dec->recon_out_buffer_pos;
+          uint8_t* next_out = orig_next_out;
+          size_t avail_out = metadata.size() - dec->recon_out_buffer_pos;
+          JxlDecoderStatus box_result = dec->metadata_decoder.Process(
+              dec->next_in, dec->avail_in,
+              dec->file_pos - dec->box_contents_begin, &next_out, &avail_out);
+          size_t produced = next_out - orig_next_out;
+          dec->recon_out_buffer_pos += produced;
+          if (box_result == JXL_DEC_BOX_NEED_MORE_OUTPUT) {
+            metadata.resize(metadata.size() * 2);
+          } else if (box_result == JXL_DEC_NEED_MORE_INPUT) {
+            break;  // box stage handling below will handle this instead
+          } else if (box_result == JXL_DEC_SUCCESS) {
+            size_t needed_size = (dec->store_exif == 1) ? dec->recon_exif_size
+                                                        : dec->recon_xmp_size;
+            if (dec->box_contents_unbounded &&
+                dec->recon_out_buffer_pos < needed_size) {
+              // Unbounded box, but we know the expected size due to the jbrd
+              // box's data. Treat this as the JXL_DEC_NEED_MORE_INPUT case.
+              break;
+            } else {
+              metadata.resize(dec->recon_out_buffer_pos);
+              if (dec->store_exif == 1) dec->store_exif = 2;
+              if (dec->store_xmp == 1) dec->store_xmp = 2;
+              break;
+            }
+          } else {
+            // error
+            return box_result;
+          }
+        }
+      }
+#endif
+    }
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
+    if (dec->recon_output_jpeg == JpegReconStage::kSettingMetadata &&
+        !dec->JbrdNeedMoreBoxes()) {
+      jxl::jpeg::JPEGData* jpeg_data = dec->ib->jpeg_data.get();
+      if (dec->recon_exif_size) {
+        JxlDecoderStatus status = jxl::JxlToJpegDecoder::SetExif(
+            dec->exif_metadata.data(), dec->exif_metadata.size(), jpeg_data);
+        if (status != JXL_DEC_SUCCESS) return status;
+      }
+      if (dec->recon_xmp_size) {
+        JxlDecoderStatus status = jxl::JxlToJpegDecoder::SetXmp(
+            dec->xmp_metadata.data(), dec->xmp_metadata.size(), jpeg_data);
+        if (status != JXL_DEC_SUCCESS) return status;
+      }
+      dec->recon_output_jpeg = JpegReconStage::kOutputting;
+    }
+
+    if (dec->recon_output_jpeg == JpegReconStage::kOutputting &&
+        !dec->JbrdNeedMoreBoxes()) {
+      JxlDecoderStatus status =
+          dec->jpeg_decoder.WriteOutput(*dec->ib->jpeg_data);
+      if (status != JXL_DEC_SUCCESS) return status;
+      dec->recon_output_jpeg = JpegReconStage::kFinished;
+      dec->ib.reset();
+      if (dec->events_wanted & JXL_DEC_FULL_IMAGE) {
+        // Return the full image event here now, this may be delayed if this
+        // could only be done after decoding an exif or xmp box after the
+        // codestream.
+        return JXL_DEC_FULL_IMAGE;
+      }
+    }
+#endif
+
+    if (dec->box_stage == BoxStage::kHeader) {
+      if (!dec->have_container) {
+        if (dec->stage == DecoderStage::kCodestreamFinished)
+          return JXL_DEC_SUCCESS;
+        dec->box_stage = BoxStage::kCodestream;
+        dec->box_contents_unbounded = true;
+        continue;
+      }
+      if (dec->avail_in == 0) {
+        if (dec->stage != DecoderStage::kCodestreamFinished) {
+          // Not yet seen (all) codestream boxes.
+          return JXL_DEC_NEED_MORE_INPUT;
+        }
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
+        if (dec->JbrdNeedMoreBoxes()) {
+          return JXL_DEC_NEED_MORE_INPUT;
+        }
+#endif
+        if (dec->input_closed) {
+          return JXL_DEC_SUCCESS;
+        }
+        if (!(dec->events_wanted & JXL_DEC_BOX)) {
+          // All codestream and jbrd metadata boxes finished, and no individual
+          // boxes requested by user, so no need to request any more input.
+          // This returns success for backwards compatibility, when
+          // JxlDecoderCloseInput and JXL_DEC_BOX did not exist, as well
+          // as for efficiency.
+          return JXL_DEC_SUCCESS;
+        }
+        // Even though we are exactly at a box end, there still may be more
+        // boxes. The user may call JxlDecoderCloseInput to indicate the input
+        // is finished and get success instead.
+        return JXL_DEC_NEED_MORE_INPUT;
+      }
+
+      bool boxed_codestream_done =
+          ((dec->events_wanted & JXL_DEC_BOX) &&
+           dec->stage == DecoderStage::kCodestreamFinished &&
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
+           !dec->JbrdNeedMoreBoxes() &&
+#endif
+           dec->last_codestream_seen);
+      if (boxed_codestream_done && dec->avail_in >= 2 &&
+          dec->next_in[0] == 0xff &&
+          dec->next_in[1] == jxl::kCodestreamMarker) {
+        // We detected the start of the next naked codestream, so we can return
+        // success here.
+        return JXL_DEC_SUCCESS;
+      }
+
+      uint64_t box_size, header_size;
+      JxlDecoderStatus status =
+          ParseBoxHeader(dec->next_in, dec->avail_in, 0, dec->file_pos,
+                         dec->box_type, &box_size, &header_size);
+      if (status != JXL_DEC_SUCCESS) {
+        if (status == JXL_DEC_NEED_MORE_INPUT) {
+          dec->basic_info_size_hint =
+              InitialBasicInfoSizeHint() + header_size - dec->file_pos;
+        }
+        return status;
+      }
+      if (memcmp(dec->box_type, "brob", 4) == 0) {
+        if (dec->avail_in < header_size + 4) {
+          return JXL_DEC_NEED_MORE_INPUT;
+        }
+        memcpy(dec->box_decoded_type, dec->next_in + header_size,
+               sizeof(dec->box_decoded_type));
+      } else {
+        memcpy(dec->box_decoded_type, dec->box_type,
+               sizeof(dec->box_decoded_type));
+      }
+
+      // Box order validity checks
+      // The signature box at box_count == 1 is not checked here since that's
+      // already done at the beginning.
+      dec->box_count++;
+      if (boxed_codestream_done && memcmp(dec->box_type, "JXL ", 4) == 0) {
+        // We detected the start of the next boxed stream, so we can return
+        // success here.
+        return JXL_DEC_SUCCESS;
+      }
+      if (dec->box_count == 2 && memcmp(dec->box_type, "ftyp", 4) != 0) {
+        return JXL_API_ERROR("the second box must be the ftyp box");
+      }
+      if (memcmp(dec->box_type, "ftyp", 4) == 0 && dec->box_count != 2) {
+        return JXL_API_ERROR("the ftyp box must come second");
+      }
+
+      dec->box_contents_unbounded = (box_size == 0);
+      dec->box_contents_begin = dec->file_pos + header_size;
+      dec->box_contents_end =
+          dec->box_contents_unbounded ? 0 : (dec->file_pos + box_size);
+      dec->box_contents_size =
+          dec->box_contents_unbounded ? 0 : (box_size - header_size);
+      dec->box_size = box_size;
+      dec->header_size = header_size;
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
+      if (dec->orig_events_wanted & JXL_DEC_JPEG_RECONSTRUCTION) {
+        // Initiate storing of Exif or XMP data for JPEG reconstruction
+        if (dec->store_exif == 0 &&
+            memcmp(dec->box_decoded_type, "Exif", 4) == 0) {
+          dec->store_exif = 1;
+          dec->recon_out_buffer_pos = 0;
+        }
+        if (dec->store_xmp == 0 &&
+            memcmp(dec->box_decoded_type, "xml ", 4) == 0) {
+          dec->store_xmp = 1;
+          dec->recon_out_buffer_pos = 0;
+        }
+      }
+#endif
+#if JPEGXL_ENABLE_BOXES
+      if (dec->events_wanted & JXL_DEC_BOX) {
+        bool decompress =
+            dec->decompress_boxes && memcmp(dec->box_type, "brob", 4) == 0;
+        dec->box_content_decoder.StartBox(
+            decompress, dec->box_contents_unbounded, dec->box_contents_size);
+      }
+#endif
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
+      if (dec->store_exif == 1 || dec->store_xmp == 1) {
+        bool brob = memcmp(dec->box_type, "brob", 4) == 0;
+        dec->metadata_decoder.StartBox(brob, dec->box_contents_unbounded,
+                                       dec->box_contents_size);
+      }
+#endif
+      if (memcmp(dec->box_type, "ftyp", 4) == 0) {
+        dec->box_stage = BoxStage::kFtyp;
+      } else if (memcmp(dec->box_type, "jxlc", 4) == 0) {
+        if (dec->last_codestream_seen) {
+          return JXL_API_ERROR("there can only be one jxlc box");
+        }
+        dec->last_codestream_seen = true;
+        dec->box_stage = BoxStage::kCodestream;
+      } else if (memcmp(dec->box_type, "jxlp", 4) == 0) {
+        dec->box_stage = BoxStage::kPartialCodestream;
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
+      } else if ((dec->orig_events_wanted & JXL_DEC_JPEG_RECONSTRUCTION) &&
+                 memcmp(dec->box_type, "jbrd", 4) == 0) {
+        if (!(dec->events_wanted & JXL_DEC_JPEG_RECONSTRUCTION)) {
+          return JXL_API_ERROR(
+              "multiple JPEG reconstruction boxes not supported");
+        }
+        dec->box_stage = BoxStage::kJpegRecon;
+#endif
+      } else {
+        dec->box_stage = BoxStage::kSkip;
+      }
+
+      if (dec->events_wanted & JXL_DEC_BOX) {
+        dec->box_event = true;
+        dec->box_out_buffer_set_current_box = false;
+        return JXL_DEC_BOX;
+      }
+    } else if (dec->box_stage == BoxStage::kFtyp) {
+      if (dec->box_contents_size < 12) {
+        return JXL_API_ERROR("file type box too small");
+      }
+      if (dec->avail_in < 4) return JXL_DEC_NEED_MORE_INPUT;
+      if (memcmp(dec->next_in, "jxl ", 4) != 0) {
+        return JXL_API_ERROR("file type box major brand must be \"jxl \"");
+      }
+      dec->AdvanceInput(4);
+      dec->box_stage = BoxStage::kSkip;
+    } else if (dec->box_stage == BoxStage::kPartialCodestream) {
+      if (dec->last_codestream_seen) {
+        return JXL_API_ERROR("cannot have jxlp box after last jxlp box");
+      }
+      // TODO(lode): error if box is unbounded but last bit not set
+      if (dec->avail_in < 4) return JXL_DEC_NEED_MORE_INPUT;
+      if (!dec->box_contents_unbounded && dec->box_contents_size < 4) {
+        return JXL_API_ERROR("jxlp box too small to contain index");
+      }
+      size_t jxlp_index = LoadBE32(dec->next_in);
+      // The high bit of jxlp_index indicates whether this is the last
+      // jxlp box.
+      if (jxlp_index & 0x80000000) {
+        dec->last_codestream_seen = true;
+      }
+      dec->AdvanceInput(4);
+      dec->box_stage = BoxStage::kCodestream;
+    } else if (dec->box_stage == BoxStage::kCodestream) {
+      JxlDecoderStatus status = jxl::JxlDecoderProcessCodestream(dec);
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
+      if (status == JXL_DEC_FULL_IMAGE) {
+        if (dec->recon_output_jpeg != JpegReconStage::kNone) {
+          continue;
+        }
+      }
+#endif
+      if (status == JXL_DEC_NEED_MORE_INPUT) {
+        if (dec->file_pos == dec->box_contents_end &&
+            !dec->box_contents_unbounded) {
+          dec->box_stage = BoxStage::kHeader;
+          continue;
+        }
+      }
+
+      if (status == JXL_DEC_SUCCESS) {
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
+        if (dec->JbrdNeedMoreBoxes()) {
+          dec->box_stage = BoxStage::kSkip;
+          continue;
+        }
+#endif
+        if (dec->box_contents_unbounded) {
+          // Last box reached and codestream done, nothing more to do.
+          break;
+        }
+        if (dec->events_wanted & JXL_DEC_BOX) {
+          // Codestream done, but there may be more other boxes.
+          dec->box_stage = BoxStage::kSkip;
+          continue;
+        }
+      }
+      return status;
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
+    } else if (dec->box_stage == BoxStage::kJpegRecon) {
+      if (!dec->jpeg_decoder.IsParsingBox()) {
+        // This is a new JPEG reconstruction metadata box.
+        dec->jpeg_decoder.StartBox(dec->box_contents_unbounded,
+                                   dec->box_contents_size);
+      }
+      const uint8_t* next_in = dec->next_in;
+      size_t avail_in = dec->avail_in;
+      JxlDecoderStatus recon_result =
+          dec->jpeg_decoder.Process(&next_in, &avail_in);
+      size_t consumed = next_in - dec->next_in;
+      dec->AdvanceInput(consumed);
+      if (recon_result == JXL_DEC_JPEG_RECONSTRUCTION) {
+        jxl::jpeg::JPEGData* jpeg_data = dec->jpeg_decoder.GetJpegData();
+        size_t num_exif = jxl::JxlToJpegDecoder::NumExifMarkers(*jpeg_data);
+        size_t num_xmp = jxl::JxlToJpegDecoder::NumXmpMarkers(*jpeg_data);
+        if (num_exif) {
+          if (num_exif > 1) {
+            return JXL_API_ERROR(
+                "multiple exif markers for JPEG reconstruction not supported");
+          }
+          if (JXL_DEC_SUCCESS != jxl::JxlToJpegDecoder::ExifBoxContentSize(
+                                     *jpeg_data, &dec->recon_exif_size)) {
+            return JXL_API_ERROR("invalid jbrd exif size");
+          }
+        }
+        if (num_xmp) {
+          if (num_xmp > 1) {
+            return JXL_API_ERROR(
+                "multiple XMP markers for JPEG reconstruction not supported");
+          }
+          if (JXL_DEC_SUCCESS != jxl::JxlToJpegDecoder::XmlBoxContentSize(
+                                     *jpeg_data, &dec->recon_xmp_size)) {
+            return JXL_API_ERROR("invalid jbrd XMP size");
+          }
+        }
+
+        dec->box_stage = BoxStage::kHeader;
+        // If successful JPEG reconstruction, return the success if the user
+        // cares about it, otherwise continue.
+        if (dec->events_wanted & recon_result) {
+          dec->events_wanted &= ~recon_result;
+          return recon_result;
+        }
+      } else {
+        // If anything else, return the result.
+        return recon_result;
+      }
+#endif
+    } else if (dec->box_stage == BoxStage::kSkip) {
+      if (dec->box_contents_unbounded) {
+        if (dec->input_closed) {
+          return JXL_DEC_SUCCESS;
+        }
+        if (!(dec->box_out_buffer_set)) {
+          // An unbounded box is always the last box. Not requesting box data,
+          // so return success even if JxlDecoderCloseInput was not called for
+          // backwards compatibility as well as efficiency since this box is
+          // being skipped.
+          return JXL_DEC_SUCCESS;
+        }
+        // Arbitrarily more bytes may follow, only JxlDecoderCloseInput can
+        // mark the end.
+        dec->AdvanceInput(dec->avail_in);
+        return JXL_DEC_NEED_MORE_INPUT;
+      }
+      // Amount of remaining bytes in the box that is being skipped.
+      size_t remaining = dec->box_contents_end - dec->file_pos;
+      if (dec->avail_in < remaining) {
+        // Indicate how many more bytes needed starting from next_in.
+        dec->basic_info_size_hint =
+            InitialBasicInfoSizeHint() + dec->box_contents_end - dec->file_pos;
+        // Don't have the full box yet, skip all we have so far
+        dec->AdvanceInput(dec->avail_in);
+        return JXL_DEC_NEED_MORE_INPUT;
+      } else {
+        // Full box available, skip all its remaining bytes
+        dec->AdvanceInput(remaining);
+        dec->box_stage = BoxStage::kHeader;
+      }
+    } else {
+      JXL_DASSERT(false);  // unknown box stage
+    }
+  }
+  return JXL_DEC_SUCCESS;
+}
+
+JxlDecoderStatus JxlDecoderProcessInput(JxlDecoder* dec) {
+  if (dec->stage == DecoderStage::kInited) {
+    dec->stage = DecoderStage::kStarted;
+  }
+  if (dec->stage == DecoderStage::kError) {
+    return JXL_API_ERROR(
+        "Cannot keep using decoder after it encountered an error, use "
+        "JxlDecoderReset to reset it");
+  }
+
+  if (!dec->got_signature) {
+    JxlSignature sig = JxlSignatureCheck(dec->next_in, dec->avail_in);
+    if (sig == JXL_SIG_INVALID) return JXL_API_ERROR("invalid signature");
+    if (sig == JXL_SIG_NOT_ENOUGH_BYTES) {
+      if (dec->input_closed) {
+        return JXL_API_ERROR("file too small for signature");
+      }
+      return JXL_DEC_NEED_MORE_INPUT;
+    }
+
+    dec->got_signature = true;
+
+    if (sig == JXL_SIG_CONTAINER) {
+      dec->have_container = 1;
+    } else {
+      dec->last_codestream_seen = true;
+    }
+  }
+
+  JxlDecoderStatus status = HandleBoxes(dec);
+
+  if (status == JXL_DEC_NEED_MORE_INPUT && dec->input_closed) {
+    return JXL_API_ERROR("missing input");
+  }
+
+  // Even if the box handling returns success, certain types of
+  // data may be missing.
+  if (status == JXL_DEC_SUCCESS) {
+    if (dec->CanUseMoreCodestreamInput()) {
+      return JXL_API_ERROR("codestream never finished");
+    }
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
+    if (dec->JbrdNeedMoreBoxes()) {
+      return JXL_API_ERROR("missing metadata boxes for jpeg reconstruction");
+    }
+#endif
+  }
+
+  return status;
+}
+
+// To ensure ABI forward-compatibility, this struct has a constant size.
+static_assert(sizeof(JxlBasicInfo) == 204,
+              "JxlBasicInfo struct size should remain constant");
+
+JxlDecoderStatus JxlDecoderGetBasicInfo(const JxlDecoder* dec,
+                                        JxlBasicInfo* info) {
+  if (!dec->got_basic_info) return JXL_DEC_NEED_MORE_INPUT;
+
+  if (info) {
+    memset(info, 0, sizeof(*info));
+
+    const jxl::ImageMetadata& meta = dec->metadata.m;
+
+    info->have_container = dec->have_container;
+    info->xsize = dec->metadata.size.xsize();
+    info->ysize = dec->metadata.size.ysize();
+    info->uses_original_profile = !meta.xyb_encoded;
+
+    info->bits_per_sample = meta.bit_depth.bits_per_sample;
+    info->exponent_bits_per_sample = meta.bit_depth.exponent_bits_per_sample;
+
+    info->have_preview = meta.have_preview;
+    info->have_animation = meta.have_animation;
+    info->orientation = static_cast<JxlOrientation>(meta.orientation);
+
+    if (!dec->keep_orientation) {
+      if (info->orientation >= JXL_ORIENT_TRANSPOSE) {
+        std::swap(info->xsize, info->ysize);
+      }
+      info->orientation = JXL_ORIENT_IDENTITY;
+    }
+
+    info->intensity_target = meta.IntensityTarget();
+    if (dec->desired_intensity_target > 0) {
+      info->intensity_target = dec->desired_intensity_target;
+    }
+    info->min_nits = meta.tone_mapping.min_nits;
+    info->relative_to_max_display = meta.tone_mapping.relative_to_max_display;
+    info->linear_below = meta.tone_mapping.linear_below;
+
+    const jxl::ExtraChannelInfo* alpha = meta.Find(jxl::ExtraChannel::kAlpha);
+    if (alpha != nullptr) {
+      info->alpha_bits = alpha->bit_depth.bits_per_sample;
+      info->alpha_exponent_bits = alpha->bit_depth.exponent_bits_per_sample;
+      info->alpha_premultiplied = alpha->alpha_associated;
+    } else {
+      info->alpha_bits = 0;
+      info->alpha_exponent_bits = 0;
+      info->alpha_premultiplied = 0;
+    }
+
+    info->num_color_channels =
+        meta.color_encoding.GetColorSpace() == jxl::ColorSpace::kGray ? 1 : 3;
+
+    info->num_extra_channels = meta.num_extra_channels;
+
+    if (info->have_preview) {
+      info->preview.xsize = dec->metadata.m.preview_size.xsize();
+      info->preview.ysize = dec->metadata.m.preview_size.ysize();
+    }
+
+    if (info->have_animation) {
+      info->animation.tps_numerator = dec->metadata.m.animation.tps_numerator;
+      info->animation.tps_denominator =
+          dec->metadata.m.animation.tps_denominator;
+      info->animation.num_loops = dec->metadata.m.animation.num_loops;
+      info->animation.have_timecodes = dec->metadata.m.animation.have_timecodes;
+    }
+
+    if (meta.have_intrinsic_size) {
+      info->intrinsic_xsize = dec->metadata.m.intrinsic_size.xsize();
+      info->intrinsic_ysize = dec->metadata.m.intrinsic_size.ysize();
+    } else {
+      info->intrinsic_xsize = info->xsize;
+      info->intrinsic_ysize = info->ysize;
+    }
+  }
+
+  return JXL_DEC_SUCCESS;
+}
+
+JxlDecoderStatus JxlDecoderGetExtraChannelInfo(const JxlDecoder* dec,
+                                               size_t index,
+                                               JxlExtraChannelInfo* info) {
+  if (!dec->got_basic_info) return JXL_DEC_NEED_MORE_INPUT;
+
+  const std::vector<jxl::ExtraChannelInfo>& channels =
+      dec->metadata.m.extra_channel_info;
+
+  if (index >= channels.size()) return JXL_DEC_ERROR;  // out of bounds
+  const jxl::ExtraChannelInfo& channel = channels[index];
+
+  info->type = static_cast<JxlExtraChannelType>(channel.type);
+  info->bits_per_sample = channel.bit_depth.bits_per_sample;
+  info->exponent_bits_per_sample =
+      channel.bit_depth.floating_point_sample
+          ? channel.bit_depth.exponent_bits_per_sample
+          : 0;
+  info->dim_shift = channel.dim_shift;
+  info->name_length = channel.name.size();
+  info->alpha_premultiplied = channel.alpha_associated;
+  info->spot_color[0] = channel.spot_color[0];
+  info->spot_color[1] = channel.spot_color[1];
+  info->spot_color[2] = channel.spot_color[2];
+  info->spot_color[3] = channel.spot_color[3];
+  info->cfa_channel = channel.cfa_channel;
+
+  return JXL_DEC_SUCCESS;
+}
+
+JxlDecoderStatus JxlDecoderGetExtraChannelName(const JxlDecoder* dec,
+                                               size_t index, char* name,
+                                               size_t size) {
+  if (!dec->got_basic_info) return JXL_DEC_NEED_MORE_INPUT;
+
+  const std::vector<jxl::ExtraChannelInfo>& channels =
+      dec->metadata.m.extra_channel_info;
+
+  if (index >= channels.size()) return JXL_DEC_ERROR;  // out of bounds
+  const jxl::ExtraChannelInfo& channel = channels[index];
+
+  // Also need null-termination character
+  if (channel.name.size() + 1 > size) return JXL_DEC_ERROR;
+
+  memcpy(name, channel.name.c_str(), channel.name.size() + 1);
+
+  return JXL_DEC_SUCCESS;
+}
+
+namespace {
+
+// Gets the jxl::ColorEncoding for the desired target, and checks errors.
+// Returns the object regardless of whether the actual color space is in ICC,
+// but ensures that if the color encoding is not the encoding from the
+// codestream header metadata, it cannot require ICC profile.
+JxlDecoderStatus GetColorEncodingForTarget(
+    const JxlDecoder* dec, JxlColorProfileTarget target,
+    const jxl::ColorEncoding** encoding) {
+  if (!dec->got_all_headers) return JXL_DEC_NEED_MORE_INPUT;
+  *encoding = nullptr;
+  if (target == JXL_COLOR_PROFILE_TARGET_DATA && dec->metadata.m.xyb_encoded) {
+    *encoding = &dec->passes_state->output_encoding_info.color_encoding;
+  } else {
+    *encoding = &dec->metadata.m.color_encoding;
+  }
+  return JXL_DEC_SUCCESS;
+}
+}  // namespace
+
+JxlDecoderStatus JxlDecoderGetColorAsEncodedProfile(
+    const JxlDecoder* dec, const JxlPixelFormat* unused_format,
+    JxlColorProfileTarget target, JxlColorEncoding* color_encoding) {
+  const jxl::ColorEncoding* jxl_color_encoding = nullptr;
+  JxlDecoderStatus status =
+      GetColorEncodingForTarget(dec, target, &jxl_color_encoding);
+  if (status) return status;
+
+  if (jxl_color_encoding->WantICC())
+    return JXL_DEC_ERROR;  // Indicate no encoded profile available.
+
+  if (color_encoding) {
+    ConvertInternalToExternalColorEncoding(*jxl_color_encoding, color_encoding);
+  }
+
+  return JXL_DEC_SUCCESS;
+}
+
+JxlDecoderStatus JxlDecoderGetICCProfileSize(
+    const JxlDecoder* dec, const JxlPixelFormat* unused_format,
+    JxlColorProfileTarget target, size_t* size) {
+  const jxl::ColorEncoding* jxl_color_encoding = nullptr;
+  JxlDecoderStatus status =
+      GetColorEncodingForTarget(dec, target, &jxl_color_encoding);
+  if (status != JXL_DEC_SUCCESS) return status;
+
+  if (jxl_color_encoding->WantICC()) {
+    jxl::ColorSpace color_space =
+        dec->metadata.m.color_encoding.GetColorSpace();
+    if (color_space == jxl::ColorSpace::kUnknown ||
+        color_space == jxl::ColorSpace::kXYB) {
+      // This indicates there's no ICC profile available
+      // TODO(lode): for the XYB case, do we want to craft an ICC profile that
+      // represents XYB as an RGB profile? It may be possible, but not with
+      // only 1D transfer functions.
+      return JXL_DEC_ERROR;
+    }
+  }
+
+  if (size) {
+    *size = jxl_color_encoding->ICC().size();
+  }
+
+  return JXL_DEC_SUCCESS;
+}
+
+JxlDecoderStatus JxlDecoderGetColorAsICCProfile(
+    const JxlDecoder* dec, const JxlPixelFormat* unused_format,
+    JxlColorProfileTarget target, uint8_t* icc_profile, size_t size) {
+  size_t wanted_size;
+  // This also checks the NEED_MORE_INPUT and the unknown/xyb cases
+  JxlDecoderStatus status =
+      JxlDecoderGetICCProfileSize(dec, nullptr, target, &wanted_size);
+  if (status != JXL_DEC_SUCCESS) return status;
+  if (size < wanted_size) return JXL_API_ERROR("ICC profile output too small");
+
+  const jxl::ColorEncoding* jxl_color_encoding = nullptr;
+  status = GetColorEncodingForTarget(dec, target, &jxl_color_encoding);
+  if (status != JXL_DEC_SUCCESS) return status;
+
+  memcpy(icc_profile, jxl_color_encoding->ICC().data(),
+         jxl_color_encoding->ICC().size());
+
+  return JXL_DEC_SUCCESS;
+}
+
+namespace {
+
+// Returns the amount of bits needed for getting memory buffer size, and does
+// all error checking required for size checking and format validity.
+JxlDecoderStatus PrepareSizeCheck(const JxlDecoder* dec,
+                                  const JxlPixelFormat* format, size_t* bits) {
+  if (!dec->got_basic_info) {
+    // Don't know image dimensions yet, cannot check for valid size.
+    return JXL_DEC_NEED_MORE_INPUT;
+  }
+  if (!dec->coalescing &&
+      (!dec->frame_header || dec->frame_stage == FrameStage::kHeader)) {
+    return JXL_API_ERROR("Don't know frame dimensions yet");
+  }
+  if (format->num_channels > 4) {
+    return JXL_API_ERROR("More than 4 channels not supported");
+  }
+
+  *bits = BitsPerChannel(format->data_type);
+
+  if (*bits == 0) {
+    return JXL_API_ERROR("Invalid/unsupported data type");
+  }
+
+  return JXL_DEC_SUCCESS;
+}
+
+}  // namespace
+
+size_t JxlDecoderGetIntendedDownsamplingRatio(JxlDecoder* dec) {
+  return dec->downsampling_target;
+}
+
+JxlDecoderStatus JxlDecoderFlushImage(JxlDecoder* dec) {
+  if (!dec->image_out_buffer_set) return JXL_DEC_ERROR;
+  if (dec->frame_stage != FrameStage::kFull) {
+    return JXL_DEC_ERROR;
+  }
+  JXL_DASSERT(dec->frame_dec);
+  if (!dec->frame_dec->HasDecodedDC()) {
+    // FrameDecoder::Flush currently requires DC to have been decoded already
+    // to work correctly.
+    return JXL_DEC_ERROR;
+  }
+
+  if (!dec->frame_dec->Flush()) {
+    return JXL_DEC_ERROR;
+  }
+
+  return JXL_DEC_SUCCESS;
+}
+
+JXL_EXPORT JxlDecoderStatus JxlDecoderPreviewOutBufferSize(
+    const JxlDecoder* dec, const JxlPixelFormat* format, size_t* size) {
+  size_t bits;
+  JxlDecoderStatus status = PrepareSizeCheck(dec, format, &bits);
+  if (status != JXL_DEC_SUCCESS) return status;
+  if (format->num_channels < 3 &&
+      !dec->image_metadata.color_encoding.IsGray()) {
+    return JXL_API_ERROR("Number of channels is too low for color output");
+  }
+
+  size_t xsize = dec->metadata.oriented_preview_xsize(dec->keep_orientation);
+  size_t ysize = dec->metadata.oriented_preview_ysize(dec->keep_orientation);
+
+  size_t row_size =
+      jxl::DivCeil(xsize * format->num_channels * bits, jxl::kBitsPerByte);
+  size_t last_row_size = row_size;
+  if (format->align > 1) {
+    row_size = jxl::DivCeil(row_size, format->align) * format->align;
+  }
+  *size = row_size * (ysize - 1) + last_row_size;
+  return JXL_DEC_SUCCESS;
+}
+
+JXL_EXPORT JxlDecoderStatus JxlDecoderSetPreviewOutBuffer(
+    JxlDecoder* dec, const JxlPixelFormat* format, void* buffer, size_t size) {
+  if (!dec->got_basic_info || !dec->metadata.m.have_preview ||
+      !(dec->orig_events_wanted & JXL_DEC_PREVIEW_IMAGE)) {
+    return JXL_API_ERROR("No preview out buffer needed at this time");
+  }
+  if (format->num_channels < 3 &&
+      !dec->image_metadata.color_encoding.IsGray()) {
+    return JXL_API_ERROR("Number of channels is too low for color output");
+  }
+
+  size_t min_size;
+  // This also checks whether the format is valid and supported and basic info
+  // is available.
+  JxlDecoderStatus status =
+      JxlDecoderPreviewOutBufferSize(dec, format, &min_size);
+  if (status != JXL_DEC_SUCCESS) return status;
+
+  if (size < min_size) return JXL_DEC_ERROR;
+
+  dec->image_out_buffer_set = true;
+  dec->image_out_buffer = buffer;
+  dec->image_out_size = size;
+  dec->image_out_format = *format;
+
+  return JXL_DEC_SUCCESS;
+}
+
+JXL_EXPORT JxlDecoderStatus JxlDecoderImageOutBufferSize(
+    const JxlDecoder* dec, const JxlPixelFormat* format, size_t* size) {
+  size_t bits;
+  JxlDecoderStatus status = PrepareSizeCheck(dec, format, &bits);
+  if (status != JXL_DEC_SUCCESS) return status;
+  if (format->num_channels < 3 &&
+      !dec->image_metadata.color_encoding.IsGray()) {
+    return JXL_API_ERROR("Number of channels is too low for color output");
+  }
+  size_t xsize, ysize;
+  GetCurrentDimensions(dec, xsize, ysize);
+  size_t row_size =
+      jxl::DivCeil(xsize * format->num_channels * bits, jxl::kBitsPerByte);
+  if (format->align > 1) {
+    row_size = jxl::DivCeil(row_size, format->align) * format->align;
+  }
+  *size = row_size * ysize;
+
+  return JXL_DEC_SUCCESS;
+}
+
+JxlDecoderStatus JxlDecoderSetImageOutBuffer(JxlDecoder* dec,
+                                             const JxlPixelFormat* format,
+                                             void* buffer, size_t size) {
+  if (!dec->got_basic_info || !(dec->orig_events_wanted & JXL_DEC_FULL_IMAGE)) {
+    return JXL_API_ERROR("No image out buffer needed at this time");
+  }
+  if (dec->image_out_buffer_set && !!dec->image_out_run_callback) {
+    return JXL_API_ERROR(
+        "Cannot change from image out callback to image out buffer");
+  }
+  if (format->num_channels < 3 &&
+      !dec->image_metadata.color_encoding.IsGray()) {
+    return JXL_API_ERROR("Number of channels is too low for color output");
+  }
+  size_t min_size;
+  // This also checks whether the format is valid and supported and basic info
+  // is available.
+  JxlDecoderStatus status =
+      JxlDecoderImageOutBufferSize(dec, format, &min_size);
+  if (status != JXL_DEC_SUCCESS) return status;
+
+  if (size < min_size) return JXL_DEC_ERROR;
+
+  dec->image_out_buffer_set = true;
+  dec->image_out_buffer = buffer;
+  dec->image_out_size = size;
+  dec->image_out_format = *format;
+
+  return JXL_DEC_SUCCESS;
+}
+
+JxlDecoderStatus JxlDecoderExtraChannelBufferSize(const JxlDecoder* dec,
+                                                  const JxlPixelFormat* format,
+                                                  size_t* size,
+                                                  uint32_t index) {
+  if (!dec->got_basic_info || !(dec->orig_events_wanted & JXL_DEC_FULL_IMAGE)) {
+    return JXL_API_ERROR("No extra channel buffer needed at this time");
+  }
+
+  if (index >= dec->metadata.m.num_extra_channels) {
+    return JXL_API_ERROR("Invalid extra channel index");
+  }
+
+  size_t num_channels = 1;  // Do not use format's num_channels
+
+  size_t bits;
+  JxlDecoderStatus status = PrepareSizeCheck(dec, format, &bits);
+  if (status != JXL_DEC_SUCCESS) return status;
+
+  size_t xsize, ysize;
+  GetCurrentDimensions(dec, xsize, ysize);
+  size_t row_size =
+      jxl::DivCeil(xsize * num_channels * bits, jxl::kBitsPerByte);
+  if (format->align > 1) {
+    row_size = jxl::DivCeil(row_size, format->align) * format->align;
+  }
+  *size = row_size * ysize;
+
+  return JXL_DEC_SUCCESS;
+}
+
+JxlDecoderStatus JxlDecoderSetExtraChannelBuffer(JxlDecoder* dec,
+                                                 const JxlPixelFormat* format,
+                                                 void* buffer, size_t size,
+                                                 uint32_t index) {
+  size_t min_size;
+  // This also checks whether the format and index are valid and supported and
+  // basic info is available.
+  JxlDecoderStatus status =
+      JxlDecoderExtraChannelBufferSize(dec, format, &min_size, index);
+  if (status != JXL_DEC_SUCCESS) return status;
+
+  if (size < min_size) return JXL_DEC_ERROR;
+
+  if (dec->extra_channel_output.size() <= index) {
+    dec->extra_channel_output.resize(dec->metadata.m.num_extra_channels,
+                                     {{}, nullptr, 0});
+  }
+  // Guaranteed correct thanks to check in JxlDecoderExtraChannelBufferSize.
+  JXL_ASSERT(index < dec->extra_channel_output.size());
+
+  dec->extra_channel_output[index].format = *format;
+  dec->extra_channel_output[index].format.num_channels = 1;
+  dec->extra_channel_output[index].buffer = buffer;
+  dec->extra_channel_output[index].buffer_size = size;
+
+  return JXL_DEC_SUCCESS;
+}
+
+JxlDecoderStatus JxlDecoderSetImageOutCallback(JxlDecoder* dec,
+                                               const JxlPixelFormat* format,
+                                               JxlImageOutCallback callback,
+                                               void* opaque) {
+  dec->simple_image_out_callback.callback = callback;
+  dec->simple_image_out_callback.opaque = opaque;
+  const auto init_callback =
+      +[](void* init_opaque, size_t num_threads, size_t num_pixels_per_thread) {
+        // No initialization to do, just reuse init_opaque as run_opaque.
+        return init_opaque;
+      };
+  const auto run_callback =
+      +[](void* run_opaque, size_t thread_id, size_t x, size_t y,
+          size_t num_pixels, const void* pixels) {
+        const auto* const simple_callback =
+            static_cast<const JxlDecoder::SimpleImageOutCallback*>(run_opaque);
+        simple_callback->callback(simple_callback->opaque, x, y, num_pixels,
+                                  pixels);
+      };
+  const auto destroy_callback = +[](void* run_opaque) {};
+  return JxlDecoderSetMultithreadedImageOutCallback(
+      dec, format, init_callback, run_callback,
+      /*destroy_callback=*/destroy_callback, &dec->simple_image_out_callback);
+}
+
+JxlDecoderStatus JxlDecoderSetMultithreadedImageOutCallback(
+    JxlDecoder* dec, const JxlPixelFormat* format,
+    JxlImageOutInitCallback init_callback, JxlImageOutRunCallback run_callback,
+    JxlImageOutDestroyCallback destroy_callback, void* init_opaque) {
+  if (dec->image_out_buffer_set && !!dec->image_out_buffer) {
+    return JXL_API_ERROR(
+        "Cannot change from image out buffer to image out callback");
+  }
+
+  if (init_callback == nullptr || run_callback == nullptr ||
+      destroy_callback == nullptr) {
+    return JXL_API_ERROR("All callbacks are required");
+  }
+
+  // Perform error checking for invalid format.
+  size_t bits_dummy;
+  JxlDecoderStatus status = PrepareSizeCheck(dec, format, &bits_dummy);
+  if (status != JXL_DEC_SUCCESS) return status;
+
+  dec->image_out_buffer_set = true;
+  dec->image_out_init_callback = init_callback;
+  dec->image_out_run_callback = run_callback;
+  dec->image_out_destroy_callback = destroy_callback;
+  dec->image_out_init_opaque = init_opaque;
+  dec->image_out_format = *format;
+
+  return JXL_DEC_SUCCESS;
+}
+
+JxlDecoderStatus JxlDecoderGetFrameHeader(const JxlDecoder* dec,
+                                          JxlFrameHeader* header) {
+  if (!dec->frame_header || dec->frame_stage == FrameStage::kHeader) {
+    return JXL_API_ERROR("no frame header available");
+  }
+  const auto& metadata = dec->metadata.m;
+  memset(header, 0, sizeof(*header));
+  if (metadata.have_animation) {
+    header->duration = dec->frame_header->animation_frame.duration;
+    if (metadata.animation.have_timecodes) {
+      header->timecode = dec->frame_header->animation_frame.timecode;
+    }
+  }
+  header->name_length = dec->frame_header->name.size();
+  header->is_last = dec->frame_header->is_last;
+  size_t xsize, ysize;
+  GetCurrentDimensions(dec, xsize, ysize);
+  header->layer_info.xsize = xsize;
+  header->layer_info.ysize = ysize;
+  if (!dec->coalescing && dec->frame_header->custom_size_or_origin) {
+    header->layer_info.crop_x0 = dec->frame_header->frame_origin.x0;
+    header->layer_info.crop_y0 = dec->frame_header->frame_origin.y0;
+    header->layer_info.have_crop = JXL_TRUE;
+  } else {
+    header->layer_info.crop_x0 = 0;
+    header->layer_info.crop_y0 = 0;
+    header->layer_info.have_crop = JXL_FALSE;
+  }
+  if (!dec->keep_orientation && !dec->coalescing) {
+    // orient the crop offset
+    size_t W = dec->metadata.oriented_xsize(false);
+    size_t H = dec->metadata.oriented_ysize(false);
+    if (metadata.orientation > 4) {
+      std::swap(header->layer_info.crop_x0, header->layer_info.crop_y0);
+    }
+    size_t o = (metadata.orientation - 1) & 3;
+    if (o > 0 && o < 3) {
+      header->layer_info.crop_x0 = W - xsize - header->layer_info.crop_x0;
+    }
+    if (o > 1) {
+      header->layer_info.crop_y0 = H - ysize - header->layer_info.crop_y0;
+    }
+  }
+  if (dec->coalescing) {
+    header->layer_info.blend_info.blendmode = JXL_BLEND_REPLACE;
+    header->layer_info.blend_info.source = 0;
+    header->layer_info.blend_info.alpha = 0;
+    header->layer_info.blend_info.clamp = JXL_FALSE;
+    header->layer_info.save_as_reference = 0;
+  } else {
+    header->layer_info.blend_info.blendmode =
+        static_cast<JxlBlendMode>(dec->frame_header->blending_info.mode);
+    header->layer_info.blend_info.source =
+        dec->frame_header->blending_info.source;
+    header->layer_info.blend_info.alpha =
+        dec->frame_header->blending_info.alpha_channel;
+    header->layer_info.blend_info.clamp =
+        dec->frame_header->blending_info.clamp;
+    header->layer_info.save_as_reference = dec->frame_header->save_as_reference;
+  }
+  return JXL_DEC_SUCCESS;
+}
+
+JxlDecoderStatus JxlDecoderGetExtraChannelBlendInfo(const JxlDecoder* dec,
+                                                    size_t index,
+                                                    JxlBlendInfo* blend_info) {
+  if (!dec->frame_header || dec->frame_stage == FrameStage::kHeader) {
+    return JXL_API_ERROR("no frame header available");
+  }
+  const auto& metadata = dec->metadata.m;
+  if (index >= metadata.num_extra_channels) {
+    return JXL_API_ERROR("Invalid extra channel index");
+  }
+  blend_info->blendmode = static_cast<JxlBlendMode>(
+      dec->frame_header->extra_channel_blending_info[index].mode);
+  blend_info->source =
+      dec->frame_header->extra_channel_blending_info[index].source;
+  blend_info->alpha =
+      dec->frame_header->extra_channel_blending_info[index].alpha_channel;
+  blend_info->clamp =
+      dec->frame_header->extra_channel_blending_info[index].clamp;
+  return JXL_DEC_SUCCESS;
+}
+
+JxlDecoderStatus JxlDecoderGetFrameName(const JxlDecoder* dec, char* name,
+                                        size_t size) {
+  if (!dec->frame_header || dec->frame_stage == FrameStage::kHeader) {
+    return JXL_API_ERROR("no frame header available");
+  }
+  if (size < dec->frame_header->name.size() + 1) {
+    return JXL_API_ERROR("too small frame name output buffer");
+  }
+  memcpy(name, dec->frame_header->name.c_str(),
+         dec->frame_header->name.size() + 1);
+
+  return JXL_DEC_SUCCESS;
+}
+
+JxlDecoderStatus JxlDecoderSetPreferredColorProfile(
+    JxlDecoder* dec, const JxlColorEncoding* color_encoding) {
+  if (!dec->got_all_headers) {
+    return JXL_API_ERROR("color info not yet available");
+  }
+  if (dec->post_headers) {
+    return JXL_API_ERROR("too late to set the color encoding");
+  }
+  if (dec->image_metadata.color_encoding.IsGray() &&
+      color_encoding->color_space != JXL_COLOR_SPACE_GRAY &&
+      dec->image_out_buffer_set && dec->image_out_format.num_channels < 3) {
+    return JXL_API_ERROR("Number of channels is too low for color output");
+  }
+  if (color_encoding->color_space == JXL_COLOR_SPACE_UNKNOWN) {
+    return JXL_API_ERROR("Unknown output colorspace");
+  }
+  jxl::ColorEncoding c_out;
+  JXL_API_RETURN_IF_ERROR(
+      ConvertExternalToInternalColorEncoding(*color_encoding, &c_out));
+  JXL_API_RETURN_IF_ERROR(!c_out.ICC().empty());
+  auto& output_encoding = dec->passes_state->output_encoding_info;
+  if (!c_out.SameColorEncoding(output_encoding.color_encoding)) {
+    JXL_API_RETURN_IF_ERROR(output_encoding.MaybeSetColorEncoding(c_out));
+    dec->image_metadata.color_encoding = output_encoding.color_encoding;
+  }
+  return JXL_DEC_SUCCESS;
+}
+
+JxlDecoderStatus JxlDecoderSetDesiredIntensityTarget(
+    JxlDecoder* dec, float desired_intensity_target) {
+  if (desired_intensity_target < 0) {
+    return JXL_API_ERROR("negative intensity target requested");
+  }
+  dec->desired_intensity_target = desired_intensity_target;
+  return JXL_DEC_SUCCESS;
+}
+
+JxlDecoderStatus JxlDecoderSetBoxBuffer(JxlDecoder* dec, uint8_t* data,
+                                        size_t size) {
+  if (dec->box_out_buffer_set) {
+    return JXL_API_ERROR("must release box buffer before setting it again");
+  }
+  if (!dec->box_event) {
+    return JXL_API_ERROR("can only set box buffer after box event");
+  }
+
+  dec->box_out_buffer_set = true;
+  dec->box_out_buffer_set_current_box = true;
+  dec->box_out_buffer = data;
+  dec->box_out_buffer_size = size;
+  dec->box_out_buffer_pos = 0;
+  return JXL_DEC_SUCCESS;
+}
+
+size_t JxlDecoderReleaseBoxBuffer(JxlDecoder* dec) {
+  if (!dec->box_out_buffer_set) {
+    return 0;
+  }
+  size_t result = dec->box_out_buffer_size - dec->box_out_buffer_pos;
+  dec->box_out_buffer_set = false;
+  dec->box_out_buffer = nullptr;
+  dec->box_out_buffer_size = 0;
+  if (!dec->box_out_buffer_set_current_box) {
+    dec->box_out_buffer_begin = 0;
+  } else {
+    dec->box_out_buffer_begin += dec->box_out_buffer_pos;
+  }
+  dec->box_out_buffer_set_current_box = false;
+  return result;
+}
+
+JxlDecoderStatus JxlDecoderSetDecompressBoxes(JxlDecoder* dec,
+                                              JXL_BOOL decompress) {
+  // TODO(lode): return error if libbrotli is not compiled in the jxl decoding
+  // library
+  dec->decompress_boxes = decompress;
+  return JXL_DEC_SUCCESS;
+}
+
+JxlDecoderStatus JxlDecoderGetBoxType(JxlDecoder* dec, JxlBoxType type,
+                                      JXL_BOOL decompressed) {
+  if (!dec->box_event) {
+    return JXL_API_ERROR("can only get box info after JXL_DEC_BOX event");
+  }
+  if (decompressed) {
+    memcpy(type, dec->box_decoded_type, sizeof(dec->box_decoded_type));
+  } else {
+    memcpy(type, dec->box_type, sizeof(dec->box_type));
+  }
+
+  return JXL_DEC_SUCCESS;
+}
+
+JxlDecoderStatus JxlDecoderGetBoxSizeRaw(const JxlDecoder* dec,
+                                         uint64_t* size) {
+  if (!dec->box_event) {
+    return JXL_API_ERROR("can only get box info after JXL_DEC_BOX event");
+  }
+  if (size) {
+    *size = dec->box_size;
+  }
+  return JXL_DEC_SUCCESS;
+}
+
+JxlDecoderStatus JxlDecoderSetProgressiveDetail(JxlDecoder* dec,
+                                                JxlProgressiveDetail detail) {
+  if (detail != kDC && detail != kLastPasses && detail != kPasses) {
+    return JXL_API_ERROR(
+        "Values other than kDC (%d), kLastPasses (%d) and kPasses (%d), "
+        "like %d are not implemented.",
+        kDC, kLastPasses, kPasses, detail);
+  }
+  dec->prog_detail = detail;
+  return JXL_DEC_SUCCESS;
+}
+
+namespace {
+
+template <typename T>
+JxlDecoderStatus VerifyOutputBitDepth(JxlBitDepth bit_depth, const T& metadata,
+                                      JxlPixelFormat format) {
+  if ((format.data_type == JXL_TYPE_FLOAT ||
+       format.data_type == JXL_TYPE_FLOAT16) &&
+      bit_depth.type != JXL_BIT_DEPTH_FROM_PIXEL_FORMAT) {
+    return JXL_API_ERROR(
+        "Only JXL_BIT_DEPTH_FROM_PIXEL_FORMAT is implemented "
+        "for float types.");
+  }
+  uint32_t bits_per_sample = GetBitDepth(bit_depth, metadata, format);
+  if (format.data_type == JXL_TYPE_UINT8 &&
+      (bits_per_sample == 0 || bits_per_sample > 8)) {
+    return JXL_API_ERROR("Inavlid bit depth %u for uint8 output",
+                         bits_per_sample);
+  } else if (format.data_type == JXL_TYPE_UINT16 &&
+             (bits_per_sample == 0 || bits_per_sample > 16)) {
+    return JXL_API_ERROR("Inavlid bit depth %u for uint16 output",
+                         bits_per_sample);
+  }
+  return JXL_DEC_SUCCESS;
+}
+
+}  // namespace
+
+JxlDecoderStatus JxlDecoderSetImageOutBitDepth(JxlDecoder* dec,
+                                               const JxlBitDepth* bit_depth) {
+  if (!dec->image_out_buffer_set) {
+    return JXL_API_ERROR("No image out buffer was set.");
+  }
+  JXL_API_RETURN_IF_ERROR(
+      VerifyOutputBitDepth(*bit_depth, dec->metadata.m, dec->image_out_format));
+  dec->image_out_bit_depth = *bit_depth;
+  return JXL_DEC_SUCCESS;
+}
diff --git a/third_party/jpeg-xl/lib/jxl/decode_test.cc b/third_party/jpeg-xl/lib/jxl/decode_test.cc
new file mode 100644
index 0000000000..30f6b61183
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/decode_test.cc
@@ -0,0 +1,5507 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <jxl/decode.h>
+#include <jxl/decode_cxx.h>
+#include <jxl/resizable_parallel_runner_cxx.h>
+#include <jxl/thread_parallel_runner_cxx.h>
+#include <jxl/types.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+#include <sstream>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "lib/extras/codec.h"
+#include "lib/extras/dec/color_description.h"
+#include "lib/jxl/base/byte_order.h"
+#include "lib/jxl/base/file_io.h"
+#include "lib/jxl/base/padded_bytes.h"
+#include "lib/jxl/base/span.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/dec_external_image.h"
+#include "lib/jxl/enc_aux_out.h"
+#include "lib/jxl/enc_butteraugli_comparator.h"
+#include "lib/jxl/enc_color_management.h"
+#include "lib/jxl/enc_external_image.h"
+#include "lib/jxl/enc_fields.h"
+#include "lib/jxl/enc_file.h"
+#include "lib/jxl/enc_icc_codec.h"
+#include "lib/jxl/enc_progressive_split.h"
+#include "lib/jxl/encode_internal.h"
+#include "lib/jxl/fields.h"
+#include "lib/jxl/frame_header.h"
+#include "lib/jxl/headers.h"
+#include "lib/jxl/icc_codec.h"
+#include "lib/jxl/image_metadata.h"
+#include "lib/jxl/jpeg/enc_jpeg_data.h"
+#include "lib/jxl/test_image.h"
+#include "lib/jxl/test_utils.h"
+#include "lib/jxl/testing.h"
+#include "lib/jxl/toc.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace {
+void AppendU32BE(uint32_t u32, jxl::PaddedBytes* bytes) {
+  bytes->push_back(u32 >> 24);
+  bytes->push_back(u32 >> 16);
+  bytes->push_back(u32 >> 8);
+  bytes->push_back(u32 >> 0);
+}
+
+// What type of codestream format in the boxes to use for testing
+enum CodeStreamBoxFormat {
+  // Do not use box format at all, only pure codestream
+  kCSBF_None,
+  // Have a single codestream box, with its actual size given in the box
+  kCSBF_Single,
+  // Have a single codestream box, with box size 0 (final box running to end)
+  kCSBF_Single_Zero_Terminated,
+  // Single codestream box, with another unknown box behind it
+  kCSBF_Single_Other,
+  // Have multiple partial codestream boxes
+  kCSBF_Multi,
+  // Have multiple partial codestream boxes, with final box size 0 (running
+  // to end)
+  kCSBF_Multi_Zero_Terminated,
+  // Have multiple partial codestream boxes, terminated by non-codestream box
+  kCSBF_Multi_Other_Terminated,
+  // Have multiple partial codestream boxes, terminated by non-codestream box
+  // that has its size set to 0 (running to end)
+  kCSBF_Multi_Other_Zero_Terminated,
+  // Have multiple partial codestream boxes, and the first one has a content
+  // of zero length
+  kCSBF_Multi_First_Empty,
+  // Have multiple partial codestream boxes, and the last one has a content
+  // of zero length and there is an unknown empty box at the end
+  kCSBF_Multi_Last_Empty_Other,
+  // Have a compressed exif box before a regular codestream box
+  kCSBF_Brob_Exif,
+  // Not a value but used for counting amount of enum entries
+  kCSBF_NUM_ENTRIES,
+};
+
+// Unknown boxes for testing
+static const char* unk1_box_type = "unk1";
+static const char* unk1_box_contents = "abcdefghijklmnopqrstuvwxyz";
+static const size_t unk1_box_size = strlen(unk1_box_contents);
+static const char* unk2_box_type = "unk2";
+static const char* unk2_box_contents = "0123456789";
+static const size_t unk2_box_size = strlen(unk2_box_contents);
+static const char* unk3_box_type = "unk3";
+static const char* unk3_box_contents = "ABCDEF123456";
+static const size_t unk3_box_size = strlen(unk3_box_contents);
+// Box with brob-compressed exif, including header
+static const uint8_t* box_brob_exif = reinterpret_cast<const uint8_t*>(
+    "\0\0\0@brobExif\241\350\2\300\177\244v\2525\304\360\27=?\267{"
+    "\33\37\314\332\214QX17PT\"\256\0\0\202s\214\313t\333\310\320k\20\276\30"
+    "\204\277l$\326c#\1\b");
+size_t box_brob_exif_size = 64;
+// The uncompressed Exif data from the brob box
+static const uint8_t* exif_uncompressed = reinterpret_cast<const uint8_t*>(
+    "\0\0\0\0MM\0*"
+    "\0\0\0\b\0\5\1\22\0\3\0\0\0\1\0\5\0\0\1\32\0\5\0\0\0\1\0\0\0J\1\33\0\5\0\0"
+    "\0\1\0\0\0R\1("
+    "\0\3\0\0\0\1\0\1\0\0\2\23\0\3\0\0\0\1\0\1\0\0\0\0\0\0\0\0\0\1\0\0\0\1\0\0"
+    "\0\1\0\0\0\1");
+size_t exif_uncompressed_size = 94;
+
+// Returns an ICC profile output by the JPEG XL decoder for RGB_D65_SRG_Rel_Lin,
+// but with, on purpose, rXYZ, bXYZ and gXYZ (the RGB primaries) switched to a
+// different order to ensure the profile does not match any known profile, so
+// the encoder cannot encode it in a compact struct instead.
+jxl::PaddedBytes GetIccTestProfile() {
+  const uint8_t* profile = reinterpret_cast<const uint8_t*>(
+      "\0\0\3\200lcms\0040\0\0mntrRGB XYZ "
+      "\a\344\0\a\0\27\0\21\0$"
+      "\0\37acspAPPL\0\0\0\1\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\1\0\0\366"
+      "\326\0\1\0\0\0\0\323-lcms\372c\207\36\227\200{"
+      "\2\232s\255\327\340\0\n\26\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+      "\0\0\0\0\0\0\0\0\rdesc\0\0\1 "
+      "\0\0\0Bcprt\0\0\1d\0\0\1\0wtpt\0\0\2d\0\0\0\24chad\0\0\2x\0\0\0,"
+      "bXYZ\0\0\2\244\0\0\0\24gXYZ\0\0\2\270\0\0\0\24rXYZ\0\0\2\314\0\0\0\24rTR"
+      "C\0\0\2\340\0\0\0 gTRC\0\0\2\340\0\0\0 bTRC\0\0\2\340\0\0\0 "
+      "chrm\0\0\3\0\0\0\0$dmnd\0\0\3$\0\0\0("
+      "dmdd\0\0\3L\0\0\0002mluc\0\0\0\0\0\0\0\1\0\0\0\fenUS\0\0\0&"
+      "\0\0\0\34\0R\0G\0B\0_\0D\0006\0005\0_\0S\0R\0G\0_\0R\0e\0l\0_"
+      "\0L\0i\0n\0\0mluc\0\0\0\0\0\0\0\1\0\0\0\fenUS\0\0\0\344\0\0\0\34\0C\0o\0"
+      "p\0y\0r\0i\0g\0h\0t\0 \0002\0000\0001\08\0 \0G\0o\0o\0g\0l\0e\0 "
+      "\0L\0L\0C\0,\0 \0C\0C\0-\0B\0Y\0-\0S\0A\0 \0003\0.\0000\0 "
+      "\0U\0n\0p\0o\0r\0t\0e\0d\0 "
+      "\0l\0i\0c\0e\0n\0s\0e\0(\0h\0t\0t\0p\0s\0:\0/\0/"
+      "\0c\0r\0e\0a\0t\0i\0v\0e\0c\0o\0m\0m\0o\0n\0s\0.\0o\0r\0g\0/"
+      "\0l\0i\0c\0e\0n\0s\0e\0s\0/\0b\0y\0-\0s\0a\0/\0003\0.\0000\0/"
+      "\0l\0e\0g\0a\0l\0c\0o\0d\0e\0)XYZ "
+      "\0\0\0\0\0\0\366\326\0\1\0\0\0\0\323-"
+      "sf32\0\0\0\0\0\1\fB\0\0\5\336\377\377\363%"
+      "\0\0\a\223\0\0\375\220\377\377\373\241\377\377\375\242\0\0\3\334\0\0\300"
+      "nXYZ \0\0\0\0\0\0o\240\0\08\365\0\0\3\220XYZ "
+      "\0\0\0\0\0\0$\237\0\0\17\204\0\0\266\304XYZ "
+      "\0\0\0\0\0\0b\227\0\0\267\207\0\0\30\331para\0\0\0\0\0\3\0\0\0\1\0\0\0\1"
+      "\0\0\0\0\0\0\0\1\0\0\0\0\0\0chrm\0\0\0\0\0\3\0\0\0\0\243\327\0\0T|"
+      "\0\0L\315\0\0\231\232\0\0&"
+      "g\0\0\17\\mluc\0\0\0\0\0\0\0\1\0\0\0\fenUS\0\0\0\f\0\0\0\34\0G\0o\0o\0g"
+      "\0l\0emluc\0\0\0\0\0\0\0\1\0\0\0\fenUS\0\0\0\26\0\0\0\34\0I\0m\0a\0g\0e"
+      "\0 \0c\0o\0d\0e\0c\0\0");
+  size_t profile_size = 896;
+  jxl::PaddedBytes icc_profile;
+  icc_profile.assign(profile, profile + profile_size);
+  return icc_profile;
+}
+
+}  // namespace
+
+namespace jxl {
+namespace {
+
+void AppendTestBox(const char* type, const char* contents, size_t contents_size,
+                   bool unbounded, PaddedBytes* bytes) {
+  AppendU32BE(contents_size + 8, bytes);
+  bytes->push_back(type[0]);
+  bytes->push_back(type[1]);
+  bytes->push_back(type[2]);
+  bytes->push_back(type[3]);
+  const uint8_t* contents_u = reinterpret_cast<const uint8_t*>(contents);
+  bytes->append(contents_u, contents_u + contents_size);
+}
+
+enum PreviewMode {
+  kNoPreview,
+  kSmallPreview,
+  kBigPreview,
+  kNumPreviewModes,
+};
+
+void GeneratePreview(PreviewMode preview_mode, ImageBundle* ib) {
+  if (preview_mode == kSmallPreview) {
+    ib->ShrinkTo(ib->xsize() / 7, ib->ysize() / 7);
+  } else if (preview_mode == kBigPreview) {
+    auto upsample7 = [&](const ImageF& in, ImageF* out) {
+      for (size_t y = 0; y < out->ysize(); ++y) {
+        for (size_t x = 0; x < out->xsize(); ++x) {
+          out->Row(y)[x] = in.ConstRow(y / 7)[x / 7];
+        }
+      }
+    };
+    Image3F preview(ib->xsize() * 7, ib->ysize() * 7);
+    for (size_t c = 0; c < 3; ++c) {
+      upsample7(ib->color()->Plane(c), &preview.Plane(c));
+    }
+    std::vector<ImageF> extra_channels;
+    for (size_t i = 0; i < ib->extra_channels().size(); ++i) {
+      ImageF ec(ib->xsize() * 7, ib->ysize() * 7);
+      upsample7(ib->extra_channels()[i], &ec);
+      extra_channels.emplace_back(std::move(ec));
+    }
+    ib->RemoveColor();
+    ib->ClearExtraChannels();
+    ib->SetFromImage(std::move(preview), ib->c_current());
+    ib->SetExtraChannels(std::move(extra_channels));
+  }
+}
+
+struct TestCodestreamParams {
+  CompressParams cparams;
+  CodeStreamBoxFormat box_format = kCSBF_None;
+  JxlOrientation orientation = JXL_ORIENT_IDENTITY;
+  PreviewMode preview_mode = kNoPreview;
+  bool add_intrinsic_size = false;
+  bool add_icc_profile = false;
+  float intensity_target = 0.0;
+  std::string color_space;
+  PaddedBytes* jpeg_codestream = nullptr;
+  const ProgressiveMode* progressive_mode = nullptr;
+};
+
+// Input pixels always given as 16-bit RGBA, 8 bytes per pixel.
+// include_alpha determines if the encoded image should contain the alpha
+// channel.
+// add_icc_profile: if false, encodes the image as sRGB using the JXL fields,
+// for grayscale or RGB images. If true, encodes the image using the ICC profile
+// returned by GetIccTestProfile, without the JXL fields, this requires the
+// image is RGB, not grayscale.
+// Providing jpeg_codestream will populate the jpeg_codestream with compressed
+// JPEG bytes, and make it possible to reconstruct those exact JPEG bytes using
+// the return value _if_ add_container indicates a box format.
+PaddedBytes CreateTestJXLCodestream(Span<const uint8_t> pixels, size_t xsize,
+                                    size_t ysize, size_t num_channels,
+                                    const TestCodestreamParams& params) {
+  // Compress the pixels with JPEG XL.
+  bool grayscale = (num_channels <= 2);
+  bool include_alpha = !(num_channels & 1) && params.jpeg_codestream == nullptr;
+  size_t bitdepth = params.jpeg_codestream == nullptr ? 16 : 8;
+  CodecInOut io;
+  io.SetSize(xsize, ysize);
+  ColorEncoding color_encoding;
+  if (params.add_icc_profile) {
+    // the hardcoded ICC profile we attach requires RGB.
+    EXPECT_EQ(false, grayscale);
+    EXPECT_TRUE(params.color_space.empty());
+    EXPECT_TRUE(color_encoding.SetICC(GetIccTestProfile()));
+  } else if (!params.color_space.empty()) {
+    JxlColorEncoding c;
+    EXPECT_TRUE(jxl::ParseDescription(params.color_space, &c));
+    EXPECT_TRUE(ConvertExternalToInternalColorEncoding(c, &color_encoding));
+    EXPECT_EQ(color_encoding.IsGray(), grayscale);
+  } else {
+    color_encoding = jxl::ColorEncoding::SRGB(/*is_gray=*/grayscale);
+  }
+  ThreadPool pool(nullptr, nullptr);
+  io.metadata.m.SetUintSamples(bitdepth);
+  if (include_alpha) {
+    io.metadata.m.SetAlphaBits(bitdepth);
+  }
+  if (params.intensity_target != 0) {
+    io.metadata.m.SetIntensityTarget(params.intensity_target);
+  }
+  JxlPixelFormat format = {static_cast<uint32_t>(num_channels), JXL_TYPE_UINT16,
+                           JXL_BIG_ENDIAN, 0};
+  // Make the grayscale-ness of the io metadata color_encoding and the packed
+  // image match.
+  io.metadata.m.color_encoding = color_encoding;
+  EXPECT_TRUE(ConvertFromExternal(pixels, xsize, ysize, color_encoding,
+                                  /*bits_per_sample=*/16, format, &pool,
+                                  &io.Main()));
+  jxl::PaddedBytes jpeg_data;
+  if (params.jpeg_codestream != nullptr) {
+#if JPEGXL_ENABLE_JPEG
+    std::vector<uint8_t> jpeg_bytes;
+    io.jpeg_quality = 70;
+    EXPECT_TRUE(Encode(io, extras::Codec::kJPG, io.metadata.m.color_encoding,
+                       /*bits_per_sample=*/8, &jpeg_bytes, &pool));
+    params.jpeg_codestream->append(jpeg_bytes.data(),
+                                   jpeg_bytes.data() + jpeg_bytes.size());
+    EXPECT_TRUE(jxl::jpeg::DecodeImageJPG(
+        jxl::Span<const uint8_t>(jpeg_bytes.data(), jpeg_bytes.size()), &io));
+    EXPECT_TRUE(
+        EncodeJPEGData(*io.Main().jpeg_data, &jpeg_data, params.cparams));
+    io.metadata.m.xyb_encoded = false;
+#else   // JPEGXL_ENABLE_JPEG
+    JXL_ABORT(
+        "unable to create reconstructible JPEG without JPEG support enabled");
+#endif  // JPEGXL_ENABLE_JPEG
+  }
+  if (params.preview_mode) {
+    io.preview_frame = io.Main().Copy();
+    GeneratePreview(params.preview_mode, &io.preview_frame);
+    io.metadata.m.have_preview = true;
+    EXPECT_TRUE(io.metadata.m.preview_size.Set(io.preview_frame.xsize(),
+                                               io.preview_frame.ysize()));
+  }
+  if (params.add_intrinsic_size) {
+    EXPECT_TRUE(io.metadata.m.intrinsic_size.Set(xsize / 3, ysize / 3));
+  }
+  io.metadata.m.orientation = params.orientation;
+  AuxOut aux_out;
+  PaddedBytes compressed;
+  PassesEncoderState enc_state;
+  if (params.progressive_mode) {
+    enc_state.progressive_splitter.SetProgressiveMode(*params.progressive_mode);
+  }
+  EXPECT_TRUE(EncodeFile(params.cparams, &io, &enc_state, &compressed,
+                         GetJxlCms(), &aux_out, &pool));
+  CodeStreamBoxFormat add_container = params.box_format;
+  if (add_container != kCSBF_None) {
+    // Header with signature box and ftyp box.
+    const uint8_t header[] = {0,    0,    0,    0xc,  0x4a, 0x58, 0x4c, 0x20,
+                              0xd,  0xa,  0x87, 0xa,  0,    0,    0,    0x14,
+                              0x66, 0x74, 0x79, 0x70, 0x6a, 0x78, 0x6c, 0x20,
+                              0,    0,    0,    0,    0x6a, 0x78, 0x6c, 0x20};
+
+    bool is_multi = add_container == kCSBF_Multi ||
+                    add_container == kCSBF_Multi_Zero_Terminated ||
+                    add_container == kCSBF_Multi_Other_Terminated ||
+                    add_container == kCSBF_Multi_Other_Zero_Terminated ||
+                    add_container == kCSBF_Multi_First_Empty ||
+                    add_container == kCSBF_Multi_Last_Empty_Other;
+
+    if (is_multi) {
+      size_t third = compressed.size() / 3;
+      std::vector<uint8_t> compressed0(compressed.data(),
+                                       compressed.data() + third);
+      std::vector<uint8_t> compressed1(compressed.data() + third,
+                                       compressed.data() + 2 * third);
+      std::vector<uint8_t> compressed2(compressed.data() + 2 * third,
+                                       compressed.data() + compressed.size());
+
+      PaddedBytes c;
+      c.append(header, header + sizeof(header));
+      if (params.jpeg_codestream != nullptr) {
+        jxl::AppendBoxHeader(jxl::MakeBoxType("jbrd"), jpeg_data.size(), false,
+                             &c);
+        c.append(jpeg_data.data(), jpeg_data.data() + jpeg_data.size());
+      }
+      uint32_t jxlp_index = 0;
+      if (add_container == kCSBF_Multi_First_Empty) {
+        // Dummy (empty) codestream part
+        AppendU32BE(12, &c);
+        c.push_back('j');
+        c.push_back('x');
+        c.push_back('l');
+        c.push_back('p');
+        AppendU32BE(jxlp_index++, &c);
+      }
+      // First codestream part
+      AppendU32BE(compressed0.size() + 12, &c);
+      c.push_back('j');
+      c.push_back('x');
+      c.push_back('l');
+      c.push_back('p');
+      AppendU32BE(jxlp_index++, &c);
+      c.append(compressed0.data(), compressed0.data() + compressed0.size());
+      // A few non-codestream boxes in between
+      AppendTestBox(unk1_box_type, unk1_box_contents, unk1_box_size, false, &c);
+      AppendTestBox(unk2_box_type, unk2_box_contents, unk2_box_size, false, &c);
+      // Dummy (empty) codestream part
+      AppendU32BE(12, &c);
+      c.push_back('j');
+      c.push_back('x');
+      c.push_back('l');
+      c.push_back('p');
+      AppendU32BE(jxlp_index++, &c);
+      // Second codestream part
+      AppendU32BE(compressed1.size() + 12, &c);
+      c.push_back('j');
+      c.push_back('x');
+      c.push_back('l');
+      c.push_back('p');
+      AppendU32BE(jxlp_index++, &c);
+      c.append(compressed1.data(), compressed1.data() + compressed1.size());
+      // Third (last) codestream part
+      AppendU32BE(add_container == kCSBF_Multi_Zero_Terminated
+                      ? 0
+                      : (compressed2.size() + 12),
+                  &c);
+      c.push_back('j');
+      c.push_back('x');
+      c.push_back('l');
+      c.push_back('p');
+      if (add_container != kCSBF_Multi_Last_Empty_Other) {
+        AppendU32BE(jxlp_index++ | 0x80000000, &c);
+      } else {
+        AppendU32BE(jxlp_index++, &c);
+      }
+      c.append(compressed2.data(), compressed2.data() + compressed2.size());
+      if (add_container == kCSBF_Multi_Last_Empty_Other) {
+        // Dummy (empty) codestream part
+        AppendU32BE(12, &c);
+        c.push_back('j');
+        c.push_back('x');
+        c.push_back('l');
+        c.push_back('p');
+        AppendU32BE(jxlp_index++ | 0x80000000, &c);
+        AppendTestBox(unk3_box_type, unk3_box_contents, unk3_box_size, false,
+                      &c);
+      }
+      if (add_container == kCSBF_Multi_Other_Terminated) {
+        AppendTestBox(unk3_box_type, unk3_box_contents, unk3_box_size, false,
+                      &c);
+      }
+      if (add_container == kCSBF_Multi_Other_Zero_Terminated) {
+        AppendTestBox(unk3_box_type, unk3_box_contents, unk3_box_size, true,
+                      &c);
+      }
+      compressed.swap(c);
+    } else {
+      PaddedBytes c;
+      c.append(header, header + sizeof(header));
+      if (params.jpeg_codestream != nullptr) {
+        jxl::AppendBoxHeader(jxl::MakeBoxType("jbrd"), jpeg_data.size(), false,
+                             &c);
+        c.append(jpeg_data.data(), jpeg_data.data() + jpeg_data.size());
+      }
+      if (add_container == kCSBF_Brob_Exif) {
+        c.append(box_brob_exif, box_brob_exif + box_brob_exif_size);
+      }
+      AppendU32BE(add_container == kCSBF_Single_Zero_Terminated
+                      ? 0
+                      : (compressed.size() + 8),
+                  &c);
+      c.push_back('j');
+      c.push_back('x');
+      c.push_back('l');
+      c.push_back('c');
+      c.append(compressed.data(), compressed.data() + compressed.size());
+      if (add_container == kCSBF_Single_Other) {
+        AppendTestBox(unk1_box_type, unk1_box_contents, unk1_box_size, false,
+                      &c);
+      }
+      compressed.swap(c);
+    }
+  }
+
+  return compressed;
+}
+
+JxlDecoderStatus ProcessInputIgnoreBoxes(JxlDecoder* dec) {
+  JxlDecoderStatus status = JXL_DEC_BOX;
+  while (status == JXL_DEC_BOX) {
+    status = JxlDecoderProcessInput(dec);
+  }
+  return status;
+}
+
+// Decodes one-shot with the API for non-streaming decoding tests.
+std::vector<uint8_t> DecodeWithAPI(JxlDecoder* dec,
+                                   Span<const uint8_t> compressed,
+                                   const JxlPixelFormat& format,
+                                   bool use_callback, bool set_buffer_early,
+                                   bool use_resizable_runner,
+                                   bool require_boxes, bool expect_success,
+                                   PaddedBytes* icc = nullptr) {
+  JxlThreadParallelRunnerPtr runner_fixed;
+  JxlResizableParallelRunnerPtr runner_resizable;
+  JxlParallelRunner runner_fn;
+  void* runner;
+
+  if (use_resizable_runner) {
+    runner_resizable = JxlResizableParallelRunnerMake(nullptr);
+    runner = runner_resizable.get();
+    runner_fn = JxlResizableParallelRunner;
+  } else {
+    size_t hw_threads = JxlThreadParallelRunnerDefaultNumWorkerThreads();
+    runner_fixed =
+        JxlThreadParallelRunnerMake(nullptr, std::min<size_t>(hw_threads, 16));
+    runner = runner_fixed.get();
+    runner_fn = JxlThreadParallelRunner;
+  }
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSetParallelRunner(dec, runner_fn, runner));
+
+  auto process_input =
+      require_boxes ? ProcessInputIgnoreBoxes : JxlDecoderProcessInput;
+
+  EXPECT_EQ(
+      JXL_DEC_SUCCESS,
+      JxlDecoderSubscribeEvents(
+          dec, JXL_DEC_BASIC_INFO | (set_buffer_early ? JXL_DEC_FRAME : 0) |
+                   JXL_DEC_PREVIEW_IMAGE | JXL_DEC_FULL_IMAGE |
+                   (require_boxes ? JXL_DEC_BOX : 0) |
+                   (icc != nullptr ? JXL_DEC_COLOR_ENCODING : 0)));
+
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSetInput(dec, compressed.data(), compressed.size()));
+  EXPECT_EQ(JXL_DEC_BASIC_INFO, process_input(dec));
+  size_t buffer_size;
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderImageOutBufferSize(dec, &format, &buffer_size));
+  JxlBasicInfo info;
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info));
+  if (use_resizable_runner) {
+    JxlResizableParallelRunnerSetThreads(
+        runner,
+        JxlResizableParallelRunnerSuggestThreads(info.xsize, info.ysize));
+  }
+
+  std::vector<uint8_t> pixels(buffer_size);
+  size_t bytes_per_pixel = format.num_channels *
+                           test::GetDataBits(format.data_type) /
+                           jxl::kBitsPerByte;
+  size_t stride = bytes_per_pixel * info.xsize;
+  if (format.align > 1) {
+    stride = jxl::DivCeil(stride, format.align) * format.align;
+  }
+  auto callback = [&](size_t x, size_t y, size_t num_pixels,
+                      const void* pixels_row) {
+    memcpy(pixels.data() + stride * y + bytes_per_pixel * x, pixels_row,
+           num_pixels * bytes_per_pixel);
+  };
+
+  JxlDecoderStatus status = process_input(dec);
+
+  if (status == JXL_DEC_COLOR_ENCODING) {
+    size_t icc_size = 0;
+    EXPECT_EQ(JXL_DEC_SUCCESS,
+              JxlDecoderGetICCProfileSize(
+                  dec, &format, JXL_COLOR_PROFILE_TARGET_DATA, &icc_size));
+    icc->resize(icc_size);
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetColorAsICCProfile(
+                                   dec, &format, JXL_COLOR_PROFILE_TARGET_DATA,
+                                   icc->data(), icc_size));
+
+    status = process_input(dec);
+  }
+
+  std::vector<uint8_t> preview;
+  if (status == JXL_DEC_NEED_PREVIEW_OUT_BUFFER) {
+    size_t buffer_size;
+    EXPECT_EQ(JXL_DEC_SUCCESS,
+              JxlDecoderPreviewOutBufferSize(dec, &format, &buffer_size));
+    preview.resize(buffer_size);
+    EXPECT_EQ(JXL_DEC_SUCCESS,
+              JxlDecoderSetPreviewOutBuffer(dec, &format, preview.data(),
+                                            preview.size()));
+    EXPECT_EQ(JXL_DEC_PREVIEW_IMAGE, process_input(dec));
+
+    status = process_input(dec);
+  }
+
+  if (set_buffer_early) {
+    EXPECT_EQ(JXL_DEC_FRAME, status);
+  } else {
+    EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, status);
+  }
+
+  if (use_callback) {
+    EXPECT_EQ(JXL_DEC_SUCCESS,
+              JxlDecoderSetImageOutCallback(
+                  dec, &format,
+                  [](void* opaque, size_t x, size_t y, size_t xsize,
+                     const void* pixels_row) {
+                    auto cb = static_cast<decltype(&callback)>(opaque);
+                    (*cb)(x, y, xsize, pixels_row);
+                  },
+                  /*opaque=*/&callback));
+  } else {
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetImageOutBuffer(
+                                   dec, &format, pixels.data(), pixels.size()));
+  }
+
+  EXPECT_EQ(JXL_DEC_FULL_IMAGE, process_input(dec));
+
+  // After the full image was output, JxlDecoderProcessInput should return
+  // success to indicate all is done, unless we requested boxes and the last
+  // box was not a terminal unbounded box, in which case it should ask for
+  // more input.
+  JxlDecoderStatus expected_status =
+      expect_success ? JXL_DEC_SUCCESS : JXL_DEC_NEED_MORE_INPUT;
+  EXPECT_EQ(expected_status, process_input(dec));
+
+  return pixels;
+}
+
+// Decodes one-shot with the API for non-streaming decoding tests.
+std::vector<uint8_t> DecodeWithAPI(Span<const uint8_t> compressed,
+                                   const JxlPixelFormat& format,
+                                   bool use_callback, bool set_buffer_early,
+                                   bool use_resizable_runner,
+                                   bool require_boxes, bool expect_success) {
+  JxlDecoder* dec = JxlDecoderCreate(NULL);
+  std::vector<uint8_t> pixels =
+      DecodeWithAPI(dec, compressed, format, use_callback, set_buffer_early,
+                    use_resizable_runner, require_boxes, expect_success);
+  JxlDecoderDestroy(dec);
+  return pixels;
+}
+
+}  // namespace
+}  // namespace jxl
+
+////////////////////////////////////////////////////////////////////////////////
+
+TEST(DecodeTest, JxlSignatureCheckTest) {
+  std::vector<std::pair<int, std::vector<uint8_t>>> tests = {
+      // No JPEGXL header starts with 'a'.
+      {JXL_SIG_INVALID, {'a'}},
+      {JXL_SIG_INVALID, {'a', 'b', 'c', 'd', 'e', 'f'}},
+
+      // Empty file is not enough bytes.
+      {JXL_SIG_NOT_ENOUGH_BYTES, {}},
+
+      // JPEGXL headers.
+      {JXL_SIG_NOT_ENOUGH_BYTES, {0xff}},  // Part of a signature.
+      {JXL_SIG_INVALID, {0xff, 0xD8}},     // JPEG-1
+      {JXL_SIG_CODESTREAM, {0xff, 0x0a}},
+
+      // JPEGXL container file.
+      {JXL_SIG_CONTAINER,
+       {0, 0, 0, 0xc, 'J', 'X', 'L', ' ', 0xD, 0xA, 0x87, 0xA}},
+      // Ending with invalid byte.
+      {JXL_SIG_INVALID, {0, 0, 0, 0xc, 'J', 'X', 'L', ' ', 0xD, 0xA, 0x87, 0}},
+      // Part of signature.
+      {JXL_SIG_NOT_ENOUGH_BYTES,
+       {0, 0, 0, 0xc, 'J', 'X', 'L', ' ', 0xD, 0xA, 0x87}},
+      {JXL_SIG_NOT_ENOUGH_BYTES, {0}},
+  };
+  for (const auto& test : tests) {
+    EXPECT_EQ(test.first,
+              JxlSignatureCheck(test.second.data(), test.second.size()))
+        << "Where test data is " << ::testing::PrintToString(test.second);
+  }
+}
+
+TEST(DecodeTest, DefaultAllocTest) {
+  JxlDecoder* dec = JxlDecoderCreate(nullptr);
+  EXPECT_NE(nullptr, dec);
+  JxlDecoderDestroy(dec);
+}
+
+TEST(DecodeTest, CustomAllocTest) {
+  struct CalledCounters {
+    int allocs = 0;
+    int frees = 0;
+  } counters;
+
+  JxlMemoryManager mm;
+  mm.opaque = &counters;
+  mm.alloc = [](void* opaque, size_t size) {
+    reinterpret_cast<CalledCounters*>(opaque)->allocs++;
+    return malloc(size);
+  };
+  mm.free = [](void* opaque, void* address) {
+    reinterpret_cast<CalledCounters*>(opaque)->frees++;
+    free(address);
+  };
+
+  JxlDecoder* dec = JxlDecoderCreate(&mm);
+  EXPECT_NE(nullptr, dec);
+  EXPECT_LE(1, counters.allocs);
+  EXPECT_EQ(0, counters.frees);
+  JxlDecoderDestroy(dec);
+  EXPECT_LE(1, counters.frees);
+}
+
+// TODO(lode): add multi-threaded test when multithreaded pixel decoding from
+// API is implemented.
+TEST(DecodeTest, DefaultParallelRunnerTest) {
+  JxlDecoder* dec = JxlDecoderCreate(nullptr);
+  EXPECT_NE(nullptr, dec);
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSetParallelRunner(dec, nullptr, nullptr));
+  JxlDecoderDestroy(dec);
+}
+
+// Creates the header of a JPEG XL file with various custom parameters for
+// testing.
+// xsize, ysize: image dimensions to store in the SizeHeader, max 512.
+// bits_per_sample, orientation: a selection of header parameters to test with.
+// orientation: image orientation to set in the metadata
+// alpha_bits: if non-0, alpha extra channel bits to set in the metadata. Also
+//   gives the alpha channel the name "alpha_test"
+// have_container: add box container format around the codestream.
+// metadata_default: if true, ImageMetadata is set to default and
+//   bits_per_sample, orientation and alpha_bits are ignored.
+// insert_box: insert an extra box before the codestream box, making the header
+// farther away from the front than is ideal. Only used if have_container.
+std::vector<uint8_t> GetTestHeader(size_t xsize, size_t ysize,
+                                   size_t bits_per_sample, size_t orientation,
+                                   size_t alpha_bits, bool xyb_encoded,
+                                   bool have_container, bool metadata_default,
+                                   bool insert_extra_box,
+                                   const jxl::PaddedBytes& icc_profile) {
+  jxl::BitWriter writer;
+  jxl::BitWriter::Allotment allotment(&writer, 65536);  // Large enough
+
+  if (have_container) {
+    const std::vector<uint8_t> signature_box = {0,   0,   0,   0xc, 'J',  'X',
+                                                'L', ' ', 0xd, 0xa, 0x87, 0xa};
+    const std::vector<uint8_t> filetype_box = {
+        0,   0,   0, 0x14, 'f', 't', 'y', 'p', 'j', 'x',
+        'l', ' ', 0, 0,    0,   0,   'j', 'x', 'l', ' '};
+    const std::vector<uint8_t> extra_box_header = {0,   0,   0,   0xff,
+                                                   't', 'e', 's', 't'};
+    // Beginning of codestream box, with an arbitrary size certainly large
+    // enough to contain the header
+    const std::vector<uint8_t> codestream_box_header = {0,   0,   0,   0xff,
+                                                        'j', 'x', 'l', 'c'};
+
+    for (size_t i = 0; i < signature_box.size(); i++) {
+      writer.Write(8, signature_box[i]);
+    }
+    for (size_t i = 0; i < filetype_box.size(); i++) {
+      writer.Write(8, filetype_box[i]);
+    }
+    if (insert_extra_box) {
+      for (size_t i = 0; i < extra_box_header.size(); i++) {
+        writer.Write(8, extra_box_header[i]);
+      }
+      for (size_t i = 0; i < 255 - 8; i++) {
+        writer.Write(8, 0);
+      }
+    }
+    for (size_t i = 0; i < codestream_box_header.size(); i++) {
+      writer.Write(8, codestream_box_header[i]);
+    }
+  }
+
+  // JXL signature
+  writer.Write(8, 0xff);
+  writer.Write(8, 0x0a);
+
+  // SizeHeader
+  jxl::CodecMetadata metadata;
+  EXPECT_TRUE(metadata.size.Set(xsize, ysize));
+  EXPECT_TRUE(WriteSizeHeader(metadata.size, &writer, 0, nullptr));
+
+  if (!metadata_default) {
+    metadata.m.SetUintSamples(bits_per_sample);
+    metadata.m.orientation = orientation;
+    metadata.m.SetAlphaBits(alpha_bits);
+    metadata.m.xyb_encoded = xyb_encoded;
+    if (alpha_bits != 0) {
+      metadata.m.extra_channel_info[0].name = "alpha_test";
+    }
+  }
+
+  if (!icc_profile.empty()) {
+    jxl::PaddedBytes copy = icc_profile;
+    EXPECT_TRUE(metadata.m.color_encoding.SetICC(std::move(copy)));
+  }
+
+  EXPECT_TRUE(jxl::Bundle::Write(metadata.m, &writer, 0, nullptr));
+  metadata.transform_data.nonserialized_xyb_encoded = metadata.m.xyb_encoded;
+  EXPECT_TRUE(jxl::Bundle::Write(metadata.transform_data, &writer, 0, nullptr));
+
+  if (!icc_profile.empty()) {
+    EXPECT_TRUE(metadata.m.color_encoding.WantICC());
+    EXPECT_TRUE(jxl::WriteICC(icc_profile, &writer, 0, nullptr));
+  }
+
+  writer.ZeroPadToByte();
+  allotment.ReclaimAndCharge(&writer, 0, nullptr);
+  return std::vector<uint8_t>(
+      writer.GetSpan().data(),
+      writer.GetSpan().data() + writer.GetSpan().size());
+}
+
+TEST(DecodeTest, BasicInfoTest) {
+  size_t xsize[2] = {50, 33};
+  size_t ysize[2] = {50, 77};
+  size_t bits_per_sample[2] = {8, 23};
+  size_t orientation[2] = {3, 5};
+  size_t alpha_bits[2] = {0, 8};
+  JXL_BOOL have_container[2] = {0, 1};
+  bool xyb_encoded = false;
+
+  std::vector<std::vector<uint8_t>> test_samples;
+  // Test with direct codestream
+  test_samples.push_back(GetTestHeader(
+      xsize[0], ysize[0], bits_per_sample[0], orientation[0], alpha_bits[0],
+      xyb_encoded, have_container[0], /*metadata_default=*/false,
+      /*insert_extra_box=*/false, {}));
+  // Test with container and different parameters
+  test_samples.push_back(GetTestHeader(
+      xsize[1], ysize[1], bits_per_sample[1], orientation[1], alpha_bits[1],
+      xyb_encoded, have_container[1], /*metadata_default=*/false,
+      /*insert_extra_box=*/false, {}));
+
+  for (size_t i = 0; i < test_samples.size(); ++i) {
+    const std::vector<uint8_t>& data = test_samples[i];
+    // Test decoding too small header first, until we reach the final byte.
+    for (size_t size = 0; size <= data.size(); ++size) {
+      // Test with a new decoder for each tested byte size.
+      JxlDecoder* dec = JxlDecoderCreate(nullptr);
+      EXPECT_EQ(JXL_DEC_SUCCESS,
+                JxlDecoderSubscribeEvents(dec, JXL_DEC_BASIC_INFO));
+      const uint8_t* next_in = data.data();
+      size_t avail_in = size;
+      EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in));
+      JxlDecoderStatus status = JxlDecoderProcessInput(dec);
+
+      JxlBasicInfo info;
+      bool have_basic_info = !JxlDecoderGetBasicInfo(dec, &info);
+
+      if (size == data.size()) {
+        EXPECT_EQ(JXL_DEC_BASIC_INFO, status);
+
+        // All header bytes given so the decoder must have the basic info.
+        EXPECT_EQ(true, have_basic_info);
+        EXPECT_EQ(have_container[i], info.have_container);
+        EXPECT_EQ(alpha_bits[i], info.alpha_bits);
+        // Orientations 5..8 swap the dimensions
+        if (orientation[i] >= 5) {
+          EXPECT_EQ(xsize[i], info.ysize);
+          EXPECT_EQ(ysize[i], info.xsize);
+        } else {
+          EXPECT_EQ(xsize[i], info.xsize);
+          EXPECT_EQ(ysize[i], info.ysize);
+        }
+        // The API should set the orientation to identity by default since it
+        // already applies the transformation internally by default.
+        EXPECT_EQ(1u, info.orientation);
+
+        EXPECT_EQ(3u, info.num_color_channels);
+
+        if (alpha_bits[i] != 0) {
+          // Expect an extra channel
+          EXPECT_EQ(1u, info.num_extra_channels);
+          JxlExtraChannelInfo extra;
+          EXPECT_EQ(0, JxlDecoderGetExtraChannelInfo(dec, 0, &extra));
+          EXPECT_EQ(alpha_bits[i], extra.bits_per_sample);
+          EXPECT_EQ(JXL_CHANNEL_ALPHA, extra.type);
+          EXPECT_EQ(0, extra.alpha_premultiplied);
+          // Verify the name "alpha_test" given to the alpha channel
+          EXPECT_EQ(10u, extra.name_length);
+          char name[11];
+          EXPECT_EQ(0,
+                    JxlDecoderGetExtraChannelName(dec, 0, name, sizeof(name)));
+          EXPECT_EQ(std::string("alpha_test"), std::string(name));
+        } else {
+          EXPECT_EQ(0u, info.num_extra_channels);
+        }
+
+        EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderProcessInput(dec));
+      } else {
+        // If we did not give the full header, the basic info should not be
+        // available. Allow a few bytes of slack due to some bits for default
+        // opsinmatrix/extension bits.
+        if (size + 2 < data.size()) {
+          EXPECT_EQ(false, have_basic_info);
+          EXPECT_EQ(JXL_DEC_NEED_MORE_INPUT, status);
+        }
+      }
+
+      // Test that decoder doesn't allow setting a setting required at beginning
+      // unless it's reset
+      EXPECT_EQ(JXL_DEC_ERROR,
+                JxlDecoderSubscribeEvents(dec, JXL_DEC_BASIC_INFO));
+      JxlDecoderReset(dec);
+      EXPECT_EQ(JXL_DEC_SUCCESS,
+                JxlDecoderSubscribeEvents(dec, JXL_DEC_BASIC_INFO));
+
+      JxlDecoderDestroy(dec);
+    }
+  }
+}
+
+TEST(DecodeTest, BufferSizeTest) {
+  size_t xsize = 33;
+  size_t ysize = 77;
+  size_t bits_per_sample = 8;
+  size_t orientation = 1;
+  size_t alpha_bits = 8;
+  bool have_container = false;
+  bool xyb_encoded = false;
+
+  std::vector<uint8_t> header =
+      GetTestHeader(xsize, ysize, bits_per_sample, orientation, alpha_bits,
+                    xyb_encoded, have_container, /*metadata_default=*/false,
+                    /*insert_extra_box=*/false, {});
+
+  JxlDecoder* dec = JxlDecoderCreate(nullptr);
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSubscribeEvents(dec, JXL_DEC_BASIC_INFO));
+  const uint8_t* next_in = header.data();
+  size_t avail_in = header.size();
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in));
+  JxlDecoderStatus status = JxlDecoderProcessInput(dec);
+  EXPECT_EQ(JXL_DEC_BASIC_INFO, status);
+
+  JxlBasicInfo info;
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info));
+  EXPECT_EQ(xsize, info.xsize);
+  EXPECT_EQ(ysize, info.ysize);
+
+  JxlPixelFormat format = {4, JXL_TYPE_UINT8, JXL_LITTLE_ENDIAN, 0};
+  size_t image_out_size;
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderImageOutBufferSize(dec, &format, &image_out_size));
+  EXPECT_EQ(xsize * ysize * 4, image_out_size);
+
+  JxlDecoderDestroy(dec);
+}
+
+TEST(DecodeTest, BasicInfoSizeHintTest) {
+  // Test on a file where the size hint is too small initially due to inserting
+  // a box before the codestream (something that is normally not recommended)
+  size_t xsize = 50;
+  size_t ysize = 50;
+  size_t bits_per_sample = 16;
+  size_t orientation = 1;
+  size_t alpha_bits = 0;
+  bool xyb_encoded = false;
+  std::vector<uint8_t> data = GetTestHeader(
+      xsize, ysize, bits_per_sample, orientation, alpha_bits, xyb_encoded,
+      /*have_container=*/true, /*metadata_default=*/false,
+      /*insert_extra_box=*/true, {});
+
+  JxlDecoderStatus status;
+  JxlDecoder* dec = JxlDecoderCreate(nullptr);
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSubscribeEvents(dec, JXL_DEC_BASIC_INFO));
+
+  size_t hint0 = JxlDecoderSizeHintBasicInfo(dec);
+  // Test that the test works as intended: we construct a file on purpose to
+  // be larger than the first hint by having that extra box.
+  EXPECT_LT(hint0, data.size());
+  const uint8_t* next_in = data.data();
+  // Do as if we have only as many bytes as indicated by the hint available
+  size_t avail_in = std::min(hint0, data.size());
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in));
+  status = JxlDecoderProcessInput(dec);
+  EXPECT_EQ(JXL_DEC_NEED_MORE_INPUT, status);
+  // Basic info cannot be available yet due to the extra inserted box.
+  EXPECT_EQ(false, !JxlDecoderGetBasicInfo(dec, nullptr));
+
+  size_t num_read = avail_in - JxlDecoderReleaseInput(dec);
+  EXPECT_LT(num_read, data.size());
+
+  size_t hint1 = JxlDecoderSizeHintBasicInfo(dec);
+  // The hint must be larger than the previous hint (taking already processed
+  // bytes into account, the hint is a hint for the next avail_in) since the
+  // decoder now knows there is a box in between.
+  EXPECT_GT(hint1 + num_read, hint0);
+  avail_in = std::min<size_t>(hint1, data.size() - num_read);
+  next_in += num_read;
+
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in));
+  status = JxlDecoderProcessInput(dec);
+  EXPECT_EQ(JXL_DEC_BASIC_INFO, status);
+  JxlBasicInfo info;
+  // We should have the basic info now, since we only added one box in-between,
+  // and the decoder should have known its size, its implementation can return
+  // a correct hint.
+  EXPECT_EQ(true, !JxlDecoderGetBasicInfo(dec, &info));
+
+  // Also test if the basic info is correct.
+  EXPECT_EQ(1, info.have_container);
+  EXPECT_EQ(xsize, info.xsize);
+  EXPECT_EQ(ysize, info.ysize);
+  EXPECT_EQ(orientation, info.orientation);
+  EXPECT_EQ(bits_per_sample, info.bits_per_sample);
+
+  JxlDecoderDestroy(dec);
+}
+
+std::vector<uint8_t> GetIccTestHeader(const jxl::PaddedBytes& icc_profile,
+                                      bool xyb_encoded) {
+  size_t xsize = 50;
+  size_t ysize = 50;
+  size_t bits_per_sample = 16;
+  size_t orientation = 1;
+  size_t alpha_bits = 0;
+  return GetTestHeader(xsize, ysize, bits_per_sample, orientation, alpha_bits,
+                       xyb_encoded,
+                       /*have_container=*/false, /*metadata_default=*/false,
+                       /*insert_extra_box=*/false, icc_profile);
+}
+
+// Tests the case where pixels and metadata ICC profile are the same
+TEST(DecodeTest, IccProfileTestOriginal) {
+  jxl::PaddedBytes icc_profile = GetIccTestProfile();
+  bool xyb_encoded = false;
+  std::vector<uint8_t> data = GetIccTestHeader(icc_profile, xyb_encoded);
+  JxlPixelFormat format = {4, JXL_TYPE_FLOAT, JXL_LITTLE_ENDIAN, 0};
+
+  JxlDecoder* dec = JxlDecoderCreate(nullptr);
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSubscribeEvents(
+                dec, JXL_DEC_BASIC_INFO | JXL_DEC_COLOR_ENCODING));
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, data.data(), data.size()));
+
+  EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec));
+
+  // Expect the opposite of xyb_encoded for uses_original_profile
+  JxlBasicInfo info;
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info));
+  EXPECT_EQ(JXL_TRUE, info.uses_original_profile);
+
+  EXPECT_EQ(JXL_DEC_COLOR_ENCODING, JxlDecoderProcessInput(dec));
+
+  // the encoded color profile expected to be not available, since the image
+  // has an ICC profile instead
+  EXPECT_EQ(JXL_DEC_ERROR,
+            JxlDecoderGetColorAsEncodedProfile(
+                dec, &format, JXL_COLOR_PROFILE_TARGET_ORIGINAL, nullptr));
+
+  size_t dec_profile_size;
+  EXPECT_EQ(
+      JXL_DEC_SUCCESS,
+      JxlDecoderGetICCProfileSize(
+          dec, &format, JXL_COLOR_PROFILE_TARGET_ORIGINAL, &dec_profile_size));
+
+  // Check that can get return status with NULL size
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderGetICCProfileSize(
+                dec, &format, JXL_COLOR_PROFILE_TARGET_ORIGINAL, nullptr));
+
+  // The profiles must be equal. This requires they have equal size, and if
+  // they do, we can get the profile and compare the contents.
+  EXPECT_EQ(icc_profile.size(), dec_profile_size);
+  if (icc_profile.size() == dec_profile_size) {
+    jxl::PaddedBytes icc_profile2(icc_profile.size());
+    EXPECT_EQ(JXL_DEC_SUCCESS,
+              JxlDecoderGetColorAsICCProfile(
+                  dec, &format, JXL_COLOR_PROFILE_TARGET_ORIGINAL,
+                  icc_profile2.data(), icc_profile2.size()));
+    EXPECT_EQ(icc_profile, icc_profile2);
+  }
+
+  // the data is not xyb_encoded, so same result expected for the pixel data
+  // color profile
+  EXPECT_EQ(JXL_DEC_ERROR,
+            JxlDecoderGetColorAsEncodedProfile(
+                dec, &format, JXL_COLOR_PROFILE_TARGET_DATA, nullptr));
+
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetICCProfileSize(
+                                 dec, &format, JXL_COLOR_PROFILE_TARGET_DATA,
+                                 &dec_profile_size));
+  EXPECT_EQ(icc_profile.size(), dec_profile_size);
+
+  JxlDecoderDestroy(dec);
+}
+
+// Tests the case where pixels and metadata ICC profile are different
+TEST(DecodeTest, IccProfileTestXybEncoded) {
+  jxl::PaddedBytes icc_profile = GetIccTestProfile();
+  bool xyb_encoded = true;
+  std::vector<uint8_t> data = GetIccTestHeader(icc_profile, xyb_encoded);
+  JxlPixelFormat format = {4, JXL_TYPE_FLOAT, JXL_LITTLE_ENDIAN, 0};
+  JxlPixelFormat format_int = {4, JXL_TYPE_UINT8, JXL_LITTLE_ENDIAN, 0};
+
+  JxlDecoder* dec = JxlDecoderCreate(nullptr);
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSubscribeEvents(
+                dec, JXL_DEC_BASIC_INFO | JXL_DEC_COLOR_ENCODING));
+
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, data.data(), data.size()));
+  EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec));
+
+  // Expect the opposite of xyb_encoded for uses_original_profile
+  JxlBasicInfo info;
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info));
+  EXPECT_EQ(JXL_FALSE, info.uses_original_profile);
+
+  EXPECT_EQ(JXL_DEC_COLOR_ENCODING, JxlDecoderProcessInput(dec));
+
+  // the encoded color profile expected to be not available, since the image
+  // has an ICC profile instead
+  EXPECT_EQ(JXL_DEC_ERROR,
+            JxlDecoderGetColorAsEncodedProfile(
+                dec, &format, JXL_COLOR_PROFILE_TARGET_ORIGINAL, nullptr));
+
+  // Check that can get return status with NULL size
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderGetICCProfileSize(
+                dec, &format, JXL_COLOR_PROFILE_TARGET_ORIGINAL, nullptr));
+
+  size_t dec_profile_size;
+  EXPECT_EQ(
+      JXL_DEC_SUCCESS,
+      JxlDecoderGetICCProfileSize(
+          dec, &format, JXL_COLOR_PROFILE_TARGET_ORIGINAL, &dec_profile_size));
+
+  // The profiles must be equal. This requires they have equal size, and if
+  // they do, we can get the profile and compare the contents.
+  EXPECT_EQ(icc_profile.size(), dec_profile_size);
+  if (icc_profile.size() == dec_profile_size) {
+    jxl::PaddedBytes icc_profile2(icc_profile.size());
+    EXPECT_EQ(JXL_DEC_SUCCESS,
+              JxlDecoderGetColorAsICCProfile(
+                  dec, &format, JXL_COLOR_PROFILE_TARGET_ORIGINAL,
+                  icc_profile2.data(), icc_profile2.size()));
+    EXPECT_EQ(icc_profile, icc_profile2);
+  }
+
+  // Data is xyb_encoded, so the data profile is a different profile, encoded
+  // as structured profile.
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderGetColorAsEncodedProfile(
+                dec, &format, JXL_COLOR_PROFILE_TARGET_DATA, nullptr));
+  JxlColorEncoding pixel_encoding;
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderGetColorAsEncodedProfile(
+                dec, &format, JXL_COLOR_PROFILE_TARGET_DATA, &pixel_encoding));
+  EXPECT_EQ(JXL_PRIMARIES_SRGB, pixel_encoding.primaries);
+  // The API returns LINEAR by default when the colorspace cannot be represented
+  // by enum values.
+  EXPECT_EQ(JXL_TRANSFER_FUNCTION_LINEAR, pixel_encoding.transfer_function);
+
+  // Test the same but with integer format.
+  EXPECT_EQ(
+      JXL_DEC_SUCCESS,
+      JxlDecoderGetColorAsEncodedProfile(
+          dec, &format_int, JXL_COLOR_PROFILE_TARGET_DATA, &pixel_encoding));
+  EXPECT_EQ(JXL_PRIMARIES_SRGB, pixel_encoding.primaries);
+  EXPECT_EQ(JXL_TRANSFER_FUNCTION_LINEAR, pixel_encoding.transfer_function);
+
+  // Test after setting the preferred color profile to non-linear sRGB:
+  // for XYB images with ICC profile, this setting is expected to take effect.
+  jxl::ColorEncoding temp_jxl_srgb = jxl::ColorEncoding::SRGB(false);
+  JxlColorEncoding pixel_encoding_srgb;
+  ConvertInternalToExternalColorEncoding(temp_jxl_srgb, &pixel_encoding_srgb);
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSetPreferredColorProfile(dec, &pixel_encoding_srgb));
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderGetColorAsEncodedProfile(
+                dec, &format, JXL_COLOR_PROFILE_TARGET_DATA, &pixel_encoding));
+  EXPECT_EQ(JXL_TRANSFER_FUNCTION_SRGB, pixel_encoding.transfer_function);
+
+  // The decoder can also output this as a generated ICC profile anyway, and
+  // we're certain that it will differ from the above defined profile since
+  // the sRGB data should not have swapped R/G/B primaries.
+
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetICCProfileSize(
+                                 dec, &format, JXL_COLOR_PROFILE_TARGET_DATA,
+                                 &dec_profile_size));
+  // We don't need to dictate exactly what size the generated ICC profile
+  // must be (since there are many ways to represent the same color space),
+  // but it should not be zero.
+  EXPECT_NE(0u, dec_profile_size);
+  jxl::PaddedBytes icc_profile2(dec_profile_size);
+  if (0 != dec_profile_size) {
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetColorAsICCProfile(
+                                   dec, &format, JXL_COLOR_PROFILE_TARGET_DATA,
+                                   icc_profile2.data(), icc_profile2.size()));
+    // expected not equal
+    EXPECT_NE(icc_profile, icc_profile2);
+  }
+
+  // Test setting another different preferred profile, to verify that the
+  // returned JXL_COLOR_PROFILE_TARGET_DATA ICC profile is correctly
+  // updated.
+
+  jxl::ColorEncoding temp_jxl_linear = jxl::ColorEncoding::LinearSRGB(false);
+  JxlColorEncoding pixel_encoding_linear;
+  ConvertInternalToExternalColorEncoding(temp_jxl_linear,
+                                         &pixel_encoding_linear);
+
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSetPreferredColorProfile(dec, &pixel_encoding_linear));
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderGetColorAsEncodedProfile(
+                dec, &format, JXL_COLOR_PROFILE_TARGET_DATA, &pixel_encoding));
+  EXPECT_EQ(JXL_TRANSFER_FUNCTION_LINEAR, pixel_encoding.transfer_function);
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetICCProfileSize(
+                                 dec, &format, JXL_COLOR_PROFILE_TARGET_DATA,
+                                 &dec_profile_size));
+  EXPECT_NE(0u, dec_profile_size);
+  jxl::PaddedBytes icc_profile3(dec_profile_size);
+  if (0 != dec_profile_size) {
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetColorAsICCProfile(
+                                   dec, &format, JXL_COLOR_PROFILE_TARGET_DATA,
+                                   icc_profile3.data(), icc_profile3.size()));
+    // expected not equal to the previously set preferred profile.
+    EXPECT_NE(icc_profile2, icc_profile3);
+  }
+
+  JxlDecoderDestroy(dec);
+}
+
+// Test decoding ICC from partial files byte for byte.
+// This test must pass also if JXL_CRASH_ON_ERROR is enabled, that is, the
+// decoding of the ANS histogram and stream of the encoded ICC profile must also
+// handle the case of not enough input bytes with StatusCode::kNotEnoughBytes
+// rather than fatal error status codes.
+TEST(DecodeTest, ICCPartialTest) {
+  jxl::PaddedBytes icc_profile = GetIccTestProfile();
+  std::vector<uint8_t> data = GetIccTestHeader(icc_profile, false);
+  JxlPixelFormat format = {4, JXL_TYPE_UINT8, JXL_LITTLE_ENDIAN, 0};
+
+  const uint8_t* next_in = data.data();
+  size_t avail_in = 0;
+
+  JxlDecoder* dec = JxlDecoderCreate(nullptr);
+
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSubscribeEvents(
+                dec, JXL_DEC_BASIC_INFO | JXL_DEC_COLOR_ENCODING));
+
+  bool seen_basic_info = false;
+  bool seen_color_encoding = false;
+  size_t total_size = 0;
+
+  for (;;) {
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in));
+    JxlDecoderStatus status = JxlDecoderProcessInput(dec);
+    size_t remaining = JxlDecoderReleaseInput(dec);
+    EXPECT_LE(remaining, avail_in);
+    next_in += avail_in - remaining;
+    avail_in = remaining;
+    if (status == JXL_DEC_NEED_MORE_INPUT) {
+      if (total_size >= data.size()) {
+        // End of partial codestream with codestrema headers and ICC profile
+        // reached, it should not require more input since full image is not
+        // requested
+        FAIL();
+        break;
+      }
+      size_t increment = 1;
+      if (total_size + increment > data.size()) {
+        increment = data.size() - total_size;
+      }
+      total_size += increment;
+      avail_in += increment;
+    } else if (status == JXL_DEC_BASIC_INFO) {
+      EXPECT_FALSE(seen_basic_info);
+      seen_basic_info = true;
+    } else if (status == JXL_DEC_COLOR_ENCODING) {
+      EXPECT_TRUE(seen_basic_info);
+      EXPECT_FALSE(seen_color_encoding);
+      seen_color_encoding = true;
+
+      // Sanity check that the ICC profile was decoded correctly
+      size_t dec_profile_size;
+      EXPECT_EQ(JXL_DEC_SUCCESS,
+                JxlDecoderGetICCProfileSize(dec, &format,
+                                            JXL_COLOR_PROFILE_TARGET_ORIGINAL,
+                                            &dec_profile_size));
+      EXPECT_EQ(icc_profile.size(), dec_profile_size);
+
+    } else if (status == JXL_DEC_SUCCESS) {
+      EXPECT_TRUE(seen_color_encoding);
+      break;
+    } else {
+      // We do not expect any other events or errors
+      FAIL();
+      break;
+    }
+  }
+
+  EXPECT_TRUE(seen_basic_info);
+  EXPECT_TRUE(seen_color_encoding);
+
+  JxlDecoderDestroy(dec);
+}
+
+struct PixelTestConfig {
+  // Input image definition.
+  bool grayscale;
+  bool include_alpha;
+  size_t xsize;
+  size_t ysize;
+  jxl::PreviewMode preview_mode;
+  bool add_intrinsic_size;
+  // Output format.
+  JxlEndianness endianness;
+  JxlDataType data_type;
+  uint32_t output_channels;
+  // Container options.
+  CodeStreamBoxFormat add_container;
+  // Decoding mode.
+  bool use_callback;
+  bool set_buffer_early;
+  bool use_resizable_runner;
+  // Exif orientation, 1-8
+  JxlOrientation orientation;
+  bool keep_orientation;
+  size_t upsampling;
+};
+
+class DecodeTestParam : public ::testing::TestWithParam<PixelTestConfig> {};
+
+TEST_P(DecodeTestParam, PixelTest) {
+  PixelTestConfig config = GetParam();
+  JxlDecoder* dec = JxlDecoderCreate(NULL);
+
+  if (config.keep_orientation) {
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetKeepOrientation(dec, JXL_TRUE));
+  }
+
+  size_t num_pixels = config.xsize * config.ysize;
+  uint32_t orig_channels =
+      (config.grayscale ? 1 : 3) + (config.include_alpha ? 1 : 0);
+  std::vector<uint8_t> pixels =
+      jxl::test::GetSomeTestImage(config.xsize, config.ysize, orig_channels, 0);
+  JxlPixelFormat format_orig = {orig_channels, JXL_TYPE_UINT16, JXL_BIG_ENDIAN,
+                                0};
+  jxl::TestCodestreamParams params;
+  // Lossless to verify pixels exactly after roundtrip.
+  params.cparams.SetLossless();
+  params.cparams.speed_tier = jxl::SpeedTier::kThunder;
+  params.cparams.resampling = config.upsampling;
+  params.cparams.ec_resampling = config.upsampling;
+  params.box_format = config.add_container;
+  params.orientation = config.orientation;
+  params.preview_mode = config.preview_mode;
+  params.add_intrinsic_size = config.add_intrinsic_size;
+  jxl::PaddedBytes compressed = jxl::CreateTestJXLCodestream(
+      jxl::Span<const uint8_t>(pixels.data(), pixels.size()), config.xsize,
+      config.ysize, orig_channels, params);
+
+  JxlPixelFormat format = {config.output_channels, config.data_type,
+                           config.endianness, 0};
+
+  bool swap_xy = !config.keep_orientation && (config.orientation > 4);
+  size_t xsize = swap_xy ? config.ysize : config.xsize;
+  size_t ysize = swap_xy ? config.xsize : config.ysize;
+
+  std::vector<uint8_t> pixels2 = jxl::DecodeWithAPI(
+      dec, jxl::Span<const uint8_t>(compressed.data(), compressed.size()),
+      format, config.use_callback, config.set_buffer_early,
+      config.use_resizable_runner, /*require_boxes=*/false,
+      /*expect_success=*/true);
+  JxlDecoderReset(dec);
+  EXPECT_EQ(num_pixels * config.output_channels *
+                jxl::test::GetDataBits(config.data_type) / jxl::kBitsPerByte,
+            pixels2.size());
+
+  // If an orientation transformation is expected, to compare the pixels, also
+  // apply this transformation to the original pixels. ConvertToExternal is
+  // used to achieve this, with a temporary conversion to CodecInOut and back.
+  if (config.orientation > 1 && !config.keep_orientation) {
+    jxl::Span<const uint8_t> bytes(pixels.data(), pixels.size());
+    jxl::ColorEncoding color_encoding =
+        jxl::ColorEncoding::SRGB(config.grayscale);
+
+    jxl::CodecInOut io;
+    if (config.include_alpha) io.metadata.m.SetAlphaBits(16);
+    io.metadata.m.color_encoding = color_encoding;
+    io.SetSize(config.xsize, config.ysize);
+
+    EXPECT_TRUE(ConvertFromExternal(bytes, config.xsize, config.ysize,
+                                    color_encoding, 16, format_orig, nullptr,
+                                    &io.Main()));
+
+    for (size_t i = 0; i < pixels.size(); i++) pixels[i] = 0;
+    EXPECT_TRUE(ConvertToExternal(
+        io.Main(), 16,
+        /*float_out=*/false, orig_channels, JXL_BIG_ENDIAN,
+        xsize * 2 * orig_channels, nullptr, pixels.data(), pixels.size(),
+        /*out_callback=*/{},
+        static_cast<jxl::Orientation>(config.orientation)));
+  }
+  if (config.upsampling == 1) {
+    EXPECT_EQ(0u, jxl::test::ComparePixels(pixels.data(), pixels2.data(), xsize,
+                                           ysize, format_orig, format));
+  } else {
+    // resampling is of course not lossless, so as a rough check:
+    // count pixels that are more than off-by-25 in the 8-bit value of one of
+    // the channels
+    EXPECT_LE(
+        jxl::test::ComparePixels(
+            pixels.data(), pixels2.data(), xsize, ysize, format_orig, format,
+            50.0 * (config.data_type == JXL_TYPE_UINT8 ? 1.0 : 256.0)),
+        300u);
+  }
+
+  JxlDecoderDestroy(dec);
+}
+
+std::vector<PixelTestConfig> GeneratePixelTests() {
+  std::vector<PixelTestConfig> all_tests;
+  struct ChannelInfo {
+    bool grayscale;
+    bool include_alpha;
+    size_t output_channels;
+  };
+  ChannelInfo ch_info[] = {
+      {false, true, 4},   // RGBA -> RGBA
+      {true, false, 1},   // G -> G
+      {true, true, 1},    // GA -> G
+      {true, true, 2},    // GA -> GA
+      {false, false, 3},  // RGB -> RGB
+      {false, true, 3},   // RGBA -> RGB
+      {false, false, 4},  // RGB -> RGBA
+  };
+
+  struct OutputFormat {
+    JxlEndianness endianness;
+    JxlDataType data_type;
+  };
+  OutputFormat out_formats[] = {
+      {JXL_NATIVE_ENDIAN, JXL_TYPE_UINT8},
+      {JXL_LITTLE_ENDIAN, JXL_TYPE_UINT16},
+      {JXL_BIG_ENDIAN, JXL_TYPE_UINT16},
+      {JXL_NATIVE_ENDIAN, JXL_TYPE_FLOAT16},
+      {JXL_LITTLE_ENDIAN, JXL_TYPE_FLOAT},
+      {JXL_BIG_ENDIAN, JXL_TYPE_FLOAT},
+  };
+
+  auto make_test = [&](ChannelInfo ch, size_t xsize, size_t ysize,
+                       jxl::PreviewMode preview_mode, bool intrinsic_size,
+                       CodeStreamBoxFormat box, JxlOrientation orientation,
+                       bool keep_orientation, OutputFormat format,
+                       bool use_callback, bool set_buffer_early,
+                       bool resizable_runner, size_t upsampling) {
+    PixelTestConfig c;
+    c.grayscale = ch.grayscale;
+    c.include_alpha = ch.include_alpha;
+    c.preview_mode = preview_mode;
+    c.add_intrinsic_size = intrinsic_size;
+    c.xsize = xsize;
+    c.ysize = ysize;
+    c.add_container = (CodeStreamBoxFormat)box;
+    c.output_channels = ch.output_channels;
+    c.data_type = format.data_type;
+    c.endianness = format.endianness;
+    c.use_callback = use_callback;
+    c.set_buffer_early = set_buffer_early;
+    c.use_resizable_runner = resizable_runner;
+    c.orientation = orientation;
+    c.keep_orientation = keep_orientation;
+    c.upsampling = upsampling;
+    all_tests.push_back(c);
+  };
+
+  // Test output formats and methods.
+  for (ChannelInfo ch : ch_info) {
+    for (int use_callback = 0; use_callback <= 1; use_callback++) {
+      for (size_t upsampling : {1, 2, 4, 8}) {
+        for (OutputFormat fmt : out_formats) {
+          make_test(ch, 301, 33, jxl::kNoPreview,
+                    /*add_intrinsic_size=*/false,
+                    CodeStreamBoxFormat::kCSBF_None, JXL_ORIENT_IDENTITY,
+                    /*keep_orientation=*/false, fmt, use_callback,
+                    /*set_buffer_early=*/false, /*resizable_runner=*/false,
+                    upsampling);
+        }
+      }
+    }
+  }
+  // Test codestream formats.
+  for (size_t box = 1; box < kCSBF_NUM_ENTRIES; ++box) {
+    make_test(ch_info[0], 77, 33, jxl::kNoPreview,
+              /*add_intrinsic_size=*/false, (CodeStreamBoxFormat)box,
+              JXL_ORIENT_IDENTITY,
+              /*keep_orientation=*/false, out_formats[0],
+              /*use_callback=*/false,
+              /*set_buffer_early=*/false, /*resizable_runner=*/false, 1);
+  }
+  // Test previews.
+  for (int preview_mode = 0; preview_mode < jxl::kNumPreviewModes;
+       preview_mode++) {
+    make_test(ch_info[0], 77, 33, (jxl::PreviewMode)preview_mode,
+              /*add_intrinsic_size=*/false, CodeStreamBoxFormat::kCSBF_None,
+              JXL_ORIENT_IDENTITY,
+              /*keep_orientation=*/false, out_formats[0],
+              /*use_callback=*/false, /*set_buffer_early=*/false,
+              /*resizable_runner=*/false, 1);
+  }
+  // Test intrinsic sizes.
+  for (int add_intrinsic_size = 0; add_intrinsic_size <= 1;
+       add_intrinsic_size++) {
+    make_test(ch_info[0], 55, 34, jxl::kNoPreview, add_intrinsic_size,
+              CodeStreamBoxFormat::kCSBF_None, JXL_ORIENT_IDENTITY,
+              /*keep_orientation=*/false, out_formats[0],
+              /*use_callback=*/false, /*set_buffer_early=*/false,
+              /*resizable_runner=*/false, 1);
+  }
+  // Test setting buffers early.
+  make_test(ch_info[0], 300, 33, jxl::kNoPreview,
+            /*add_intrinsic_size=*/false, CodeStreamBoxFormat::kCSBF_None,
+            JXL_ORIENT_IDENTITY,
+            /*keep_orientation=*/false, out_formats[0],
+            /*use_callback=*/false, /*set_buffer_early=*/true,
+            /*resizable_runner=*/false, 1);
+
+  // Test using the resizable runner
+  for (size_t i = 0; i < 4; i++) {
+    make_test(ch_info[0], 300 << i, 33 << i, jxl::kNoPreview,
+              /*add_intrinsic_size=*/false, CodeStreamBoxFormat::kCSBF_None,
+              JXL_ORIENT_IDENTITY,
+              /*keep_orientation=*/false, out_formats[0],
+              /*use_callback=*/false, /*set_buffer_early=*/false,
+              /*resizable_runner=*/true, 1);
+  }
+
+  // Test orientations.
+  for (int orientation = 2; orientation <= 8; ++orientation) {
+    for (int keep_orientation = 0; keep_orientation <= 1; keep_orientation++) {
+      for (int use_callback = 0; use_callback <= 1; use_callback++) {
+        for (ChannelInfo ch : ch_info) {
+          for (OutputFormat fmt : out_formats) {
+            make_test(ch, 280, 12, jxl::kNoPreview,
+                      /*add_intrinsic_size=*/false,
+                      CodeStreamBoxFormat::kCSBF_None,
+                      static_cast<JxlOrientation>(orientation),
+                      /*keep_orientation=*/keep_orientation, fmt,
+                      /*use_callback=*/use_callback, /*set_buffer_early=*/true,
+                      /*resizable_runner=*/false, 1);
+          }
+        }
+      }
+    }
+  }
+
+  return all_tests;
+}
+
+std::ostream& operator<<(std::ostream& os, const PixelTestConfig& c) {
+  os << c.xsize << "x" << c.ysize;
+  const char* colors[] = {"", "G", "GA", "RGB", "RGBA"};
+  os << colors[(c.grayscale ? 1 : 3) + (c.include_alpha ? 1 : 0)];
+  os << "to";
+  os << colors[c.output_channels];
+  switch (c.data_type) {
+    case JXL_TYPE_UINT8:
+      os << "u8";
+      break;
+    case JXL_TYPE_UINT16:
+      os << "u16";
+      break;
+    case JXL_TYPE_FLOAT:
+      os << "f32";
+      break;
+    case JXL_TYPE_FLOAT16:
+      os << "f16";
+      break;
+    default:
+      JXL_ASSERT(false);
+  };
+  if (jxl::test::GetDataBits(c.data_type) > jxl::kBitsPerByte) {
+    if (c.endianness == JXL_NATIVE_ENDIAN) {
+      // add nothing
+    } else if (c.endianness == JXL_BIG_ENDIAN) {
+      os << "BE";
+    } else if (c.endianness == JXL_LITTLE_ENDIAN) {
+      os << "LE";
+    }
+  }
+  if (c.add_container != CodeStreamBoxFormat::kCSBF_None) {
+    os << "Box";
+    os << (size_t)c.add_container;
+  }
+  if (c.preview_mode == jxl::kSmallPreview) os << "Preview";
+  if (c.preview_mode == jxl::kBigPreview) os << "BigPreview";
+  if (c.add_intrinsic_size) os << "IntrinicSize";
+  if (c.use_callback) os << "Callback";
+  if (c.set_buffer_early) os << "EarlyBuffer";
+  if (c.use_resizable_runner) os << "ResizableRunner";
+  if (c.orientation != 1) os << "O" << c.orientation;
+  if (c.keep_orientation) os << "Keep";
+  if (c.upsampling > 1) os << "x" << c.upsampling;
+  return os;
+}
+
+std::string PixelTestDescription(
+    const testing::TestParamInfo<DecodeTestParam::ParamType>& info) {
+  std::stringstream name;
+  name << info.param;
+  return name.str();
+}
+
+JXL_GTEST_INSTANTIATE_TEST_SUITE_P(DecodeTest, DecodeTestParam,
+                                   testing::ValuesIn(GeneratePixelTests()),
+                                   PixelTestDescription);
+
+TEST(DecodeTest, PixelTestWithICCProfileLossless) {
+  JxlDecoder* dec = JxlDecoderCreate(NULL);
+
+  size_t xsize = 123, ysize = 77;
+  size_t num_pixels = xsize * ysize;
+  std::vector<uint8_t> pixels = jxl::test::GetSomeTestImage(xsize, ysize, 4, 0);
+  JxlPixelFormat format_orig = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+  jxl::TestCodestreamParams params;
+  // Lossless to verify pixels exactly after roundtrip.
+  params.cparams.SetLossless();
+  params.cparams.speed_tier = jxl::SpeedTier::kThunder;
+  params.add_icc_profile = true;
+  // For variation: some have container and no preview, others have preview
+  // and no container.
+  jxl::PaddedBytes compressed = jxl::CreateTestJXLCodestream(
+      jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize, 4,
+      params);
+
+  for (uint32_t channels = 3; channels <= 4; ++channels) {
+    {
+      JxlPixelFormat format = {channels, JXL_TYPE_UINT8, JXL_LITTLE_ENDIAN, 0};
+
+      std::vector<uint8_t> pixels2 = jxl::DecodeWithAPI(
+          dec, jxl::Span<const uint8_t>(compressed.data(), compressed.size()),
+          format, /*use_callback=*/false, /*set_buffer_early=*/false,
+          /*use_resizable_runner=*/false, /*require_boxes=*/false,
+          /*expect_success=*/true);
+      JxlDecoderReset(dec);
+      EXPECT_EQ(num_pixels * channels, pixels2.size());
+      EXPECT_EQ(0u,
+                jxl::test::ComparePixels(pixels.data(), pixels2.data(), xsize,
+                                         ysize, format_orig, format));
+    }
+    {
+      JxlPixelFormat format = {channels, JXL_TYPE_UINT16, JXL_LITTLE_ENDIAN, 0};
+
+      // Test with the container for one of the pixel formats.
+      std::vector<uint8_t> pixels2 = jxl::DecodeWithAPI(
+          dec, jxl::Span<const uint8_t>(compressed.data(), compressed.size()),
+          format, /*use_callback=*/true, /*set_buffer_early=*/true,
+          /*use_resizable_runner=*/false, /*require_boxes=*/false,
+          /*expect_success=*/true);
+      JxlDecoderReset(dec);
+      EXPECT_EQ(num_pixels * channels * 2, pixels2.size());
+      EXPECT_EQ(0u,
+                jxl::test::ComparePixels(pixels.data(), pixels2.data(), xsize,
+                                         ysize, format_orig, format));
+    }
+
+    {
+      JxlPixelFormat format = {channels, JXL_TYPE_FLOAT, JXL_LITTLE_ENDIAN, 0};
+
+      std::vector<uint8_t> pixels2 = jxl::DecodeWithAPI(
+          dec, jxl::Span<const uint8_t>(compressed.data(), compressed.size()),
+          format, /*use_callback=*/false, /*set_buffer_early=*/false,
+          /*use_resizable_runner=*/false, /*require_boxes=*/false,
+          /*expect_success=*/true);
+      JxlDecoderReset(dec);
+      EXPECT_EQ(num_pixels * channels * 4, pixels2.size());
+      EXPECT_EQ(0u,
+                jxl::test::ComparePixels(pixels.data(), pixels2.data(), xsize,
+                                         ysize, format_orig, format));
+    }
+  }
+
+  JxlDecoderDestroy(dec);
+}
+
+TEST(DecodeTest, PixelTestWithICCProfileLossy) {
+  JxlDecoder* dec = JxlDecoderCreate(NULL);
+
+  size_t xsize = 123, ysize = 77;
+  size_t num_pixels = xsize * ysize;
+  std::vector<uint8_t> pixels = jxl::test::GetSomeTestImage(xsize, ysize, 3, 0);
+  JxlPixelFormat format_orig = {3, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+  jxl::TestCodestreamParams params;
+  params.add_icc_profile = true;
+  jxl::PaddedBytes compressed = jxl::CreateTestJXLCodestream(
+      jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize, 3,
+      params);
+  uint32_t channels = 3;
+
+  JxlPixelFormat format = {channels, JXL_TYPE_FLOAT, JXL_LITTLE_ENDIAN, 0};
+
+  jxl::PaddedBytes icc;
+  std::vector<uint8_t> pixels2 = jxl::DecodeWithAPI(
+      dec, jxl::Span<const uint8_t>(compressed.data(), compressed.size()),
+      format, /*use_callback=*/false, /*set_buffer_early=*/true,
+      /*use_resizable_runner=*/false, /*require_boxes=*/false,
+      /*expect_success=*/true, /*icc=*/&icc);
+  JxlDecoderReset(dec);
+  EXPECT_EQ(num_pixels * channels * 4, pixels2.size());
+
+  // The input pixels use the profile matching GetIccTestProfile, since we set
+  // add_icc_profile for CreateTestJXLCodestream to true.
+  jxl::ColorEncoding color_encoding0;
+  EXPECT_TRUE(color_encoding0.SetICC(GetIccTestProfile()));
+  jxl::Span<const uint8_t> span0(pixels.data(), pixels.size());
+  jxl::CodecInOut io0;
+  io0.SetSize(xsize, ysize);
+  EXPECT_TRUE(ConvertFromExternal(span0, xsize, ysize, color_encoding0,
+                                  /*bits_per_sample=*/16, format_orig,
+                                  /*pool=*/nullptr, &io0.Main()));
+
+  jxl::ColorEncoding color_encoding1;
+  EXPECT_TRUE(color_encoding1.SetICC(std::move(icc)));
+  jxl::Span<const uint8_t> span1(pixels2.data(), pixels2.size());
+  jxl::CodecInOut io1;
+  io1.SetSize(xsize, ysize);
+  EXPECT_TRUE(ConvertFromExternal(span1, xsize, ysize, color_encoding1,
+                                  /*bits_per_sample=*/32, format,
+                                  /*pool=*/nullptr, &io1.Main()));
+
+  jxl::ButteraugliParams ba;
+  EXPECT_THAT(ButteraugliDistance(io0.frames, io1.frames, ba, jxl::GetJxlCms(),
+                                  /*distmap=*/nullptr, nullptr),
+              IsSlightlyBelow(0.79f));
+
+  JxlDecoderDestroy(dec);
+}
+
+std::string ColorDescription(JxlColorEncoding c) {
+  jxl::ColorEncoding color_encoding;
+  EXPECT_TRUE(ConvertExternalToInternalColorEncoding(c, &color_encoding));
+  return Description(color_encoding);
+}
+
+std::string GetOrigProfile(JxlDecoder* dec) {
+  JxlColorEncoding c;
+  JxlColorProfileTarget target = JXL_COLOR_PROFILE_TARGET_ORIGINAL;
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderGetColorAsEncodedProfile(dec, nullptr, target, &c));
+  return ColorDescription(c);
+}
+
+std::string GetDataProfile(JxlDecoder* dec) {
+  JxlColorEncoding c;
+  JxlColorProfileTarget target = JXL_COLOR_PROFILE_TARGET_DATA;
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderGetColorAsEncodedProfile(dec, nullptr, target, &c));
+  return ColorDescription(c);
+}
+
+double ButteraugliDistance(size_t xsize, size_t ysize,
+                           const std::vector<uint8_t>& pixels_in,
+                           const jxl::ColorEncoding& color_in,
+                           float intensity_in,
+                           const std::vector<uint8_t>& pixels_out,
+                           const jxl::ColorEncoding& color_out,
+                           float intensity_out) {
+  jxl::CodecInOut in;
+  in.metadata.m.color_encoding = color_in;
+  in.metadata.m.SetIntensityTarget(intensity_in);
+  JxlPixelFormat format_in = {static_cast<uint32_t>(color_in.Channels()),
+                              JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+  EXPECT_TRUE(jxl::ConvertFromExternal(
+      jxl::Span<const uint8_t>(pixels_in.data(), pixels_in.size()), xsize,
+      ysize, color_in,
+      /*bits_per_sample=*/16, format_in,
+      /*pool=*/nullptr, &in.Main()));
+  jxl::CodecInOut out;
+  out.metadata.m.color_encoding = color_out;
+  out.metadata.m.SetIntensityTarget(intensity_out);
+  JxlPixelFormat format_out = {static_cast<uint32_t>(color_out.Channels()),
+                               JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+  EXPECT_TRUE(jxl::ConvertFromExternal(
+      jxl::Span<const uint8_t>(pixels_out.data(), pixels_out.size()), xsize,
+      ysize, color_out,
+      /*bits_per_sample=*/16, format_out,
+      /*pool=*/nullptr, &out.Main()));
+  return ButteraugliDistance(in.frames, out.frames, jxl::ButteraugliParams(),
+                             jxl::GetJxlCms(), nullptr, nullptr);
+}
+
+class DecodeAllEncodingsTest
+    : public ::testing::TestWithParam<jxl::test::ColorEncodingDescriptor> {};
+JXL_GTEST_INSTANTIATE_TEST_SUITE_P(
+    DecodeAllEncodingsTestInstantiation, DecodeAllEncodingsTest,
+    ::testing::ValuesIn(jxl::test::AllEncodings()));
+TEST_P(DecodeAllEncodingsTest, PreserveOriginalProfileTest) {
+  size_t xsize = 123, ysize = 77;
+  std::vector<uint8_t> pixels = jxl::test::GetSomeTestImage(xsize, ysize, 3, 0);
+  JxlPixelFormat format = {3, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+  int events = JXL_DEC_BASIC_INFO | JXL_DEC_COLOR_ENCODING | JXL_DEC_FULL_IMAGE;
+  const auto& cdesc = GetParam();
+  jxl::ColorEncoding c_in = jxl::test::ColorEncodingFromDescriptor(cdesc);
+  if (c_in.rendering_intent != jxl::RenderingIntent::kRelative) return;
+  std::string color_space_in = Description(c_in);
+  float intensity_in = c_in.tf.IsPQ() ? 10000 : 255;
+  printf("Testing input color space %s\n", color_space_in.c_str());
+  jxl::TestCodestreamParams params;
+  params.color_space = color_space_in;
+  params.intensity_target = intensity_in;
+  jxl::PaddedBytes data = jxl::CreateTestJXLCodestream(
+      jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize, 3,
+      params);
+  JxlDecoder* dec = JxlDecoderCreate(nullptr);
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSubscribeEvents(dec, events));
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, data.data(), data.size()));
+  EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec));
+  JxlBasicInfo info;
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info));
+  EXPECT_EQ(xsize, info.xsize);
+  EXPECT_EQ(ysize, info.ysize);
+  EXPECT_FALSE(info.uses_original_profile);
+  EXPECT_EQ(JXL_DEC_COLOR_ENCODING, JxlDecoderProcessInput(dec));
+  EXPECT_EQ(GetOrigProfile(dec), color_space_in);
+  EXPECT_EQ(GetDataProfile(dec), color_space_in);
+  EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec));
+  std::vector<uint8_t> out(pixels.size());
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSetImageOutBuffer(dec, &format, out.data(), out.size()));
+  EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec));
+  double dist = ButteraugliDistance(xsize, ysize, pixels, c_in, intensity_in,
+                                    out, c_in, intensity_in);
+  EXPECT_LT(dist, 1.29);
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderProcessInput(dec));
+  JxlDecoderDestroy(dec);
+}
+
+namespace {
+void SetPreferredColorProfileTest(
+    const jxl::test::ColorEncodingDescriptor& from) {
+  size_t xsize = 123, ysize = 77;
+  int events = JXL_DEC_BASIC_INFO | JXL_DEC_COLOR_ENCODING | JXL_DEC_FULL_IMAGE;
+  jxl::ColorEncoding c_in = jxl::test::ColorEncodingFromDescriptor(from);
+  if (c_in.rendering_intent != jxl::RenderingIntent::kRelative) return;
+  if (c_in.white_point != jxl::WhitePoint::kD65) return;
+  uint32_t num_channels = c_in.Channels();
+  std::vector<uint8_t> pixels =
+      jxl::test::GetSomeTestImage(xsize, ysize, num_channels, 0);
+  JxlPixelFormat format = {num_channels, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+  std::string color_space_in = Description(c_in);
+  float intensity_in = c_in.tf.IsPQ() ? 10000 : 255;
+  jxl::TestCodestreamParams params;
+  params.color_space = color_space_in;
+  params.intensity_target = intensity_in;
+  jxl::PaddedBytes data = jxl::CreateTestJXLCodestream(
+      jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize,
+      num_channels, params);
+  auto all_encodings = jxl::test::AllEncodings();
+  all_encodings.push_back(
+      {jxl::ColorSpace::kXYB, jxl::WhitePoint::kD65, jxl::Primaries::kCustom,
+       jxl::TransferFunction::kUnknown, jxl::RenderingIntent::kPerceptual});
+  for (const auto& c1 : all_encodings) {
+    jxl::ColorEncoding c_out = jxl::test::ColorEncodingFromDescriptor(c1);
+    float intensity_out = intensity_in;
+    if (c_out.GetColorSpace() != jxl::ColorSpace::kXYB) {
+      if (c_out.rendering_intent != jxl::RenderingIntent::kRelative) {
+        continue;
+      }
+      if ((c_in.primaries == jxl::Primaries::k2100 &&
+           c_out.primaries != jxl::Primaries::k2100) ||
+          (c_in.primaries == jxl::Primaries::kP3 &&
+           c_out.primaries == jxl::Primaries::kSRGB)) {
+        // Converting to a narrower gamut does not work without gammut mapping.
+        continue;
+      }
+    }
+    if (c_out.tf.IsHLG() && intensity_out > 300) {
+      // The Linear->HLG OOTF function at this intensity level can push
+      // saturated colors out of gamut, so we would need gamut mapping in
+      // this case too.
+      continue;
+    }
+    std::string color_space_out = Description(c_out);
+    if (color_space_in == color_space_out) continue;
+    printf("Testing input color space %s with output color space %s\n",
+           color_space_in.c_str(), color_space_out.c_str());
+    JxlDecoder* dec = JxlDecoderCreate(nullptr);
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSubscribeEvents(dec, events));
+    EXPECT_EQ(JXL_DEC_SUCCESS,
+              JxlDecoderSetInput(dec, data.data(), data.size()));
+    EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec));
+    JxlBasicInfo info;
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info));
+    EXPECT_EQ(xsize, info.xsize);
+    EXPECT_EQ(ysize, info.ysize);
+    EXPECT_FALSE(info.uses_original_profile);
+    EXPECT_EQ(JXL_DEC_COLOR_ENCODING, JxlDecoderProcessInput(dec));
+    EXPECT_EQ(GetOrigProfile(dec), color_space_in);
+    EXPECT_EQ(GetDataProfile(dec), color_space_in);
+    JxlColorEncoding encoding_out;
+    EXPECT_TRUE(jxl::ParseDescription(color_space_out, &encoding_out));
+    if (c_out.GetColorSpace() == jxl::ColorSpace::kXYB &&
+        (c_in.primaries != jxl::Primaries::kSRGB || c_in.tf.IsPQ())) {
+      EXPECT_EQ(JXL_DEC_ERROR,
+                JxlDecoderSetPreferredColorProfile(dec, &encoding_out));
+      JxlDecoderDestroy(dec);
+      continue;
+    }
+    EXPECT_EQ(JXL_DEC_SUCCESS,
+              JxlDecoderSetPreferredColorProfile(dec, &encoding_out));
+    EXPECT_EQ(GetOrigProfile(dec), color_space_in);
+    EXPECT_EQ(GetDataProfile(dec), color_space_out);
+    EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec));
+    size_t buffer_size;
+    JxlPixelFormat out_format = format;
+    out_format.num_channels = c_out.Channels();
+    EXPECT_EQ(JXL_DEC_SUCCESS,
+              JxlDecoderImageOutBufferSize(dec, &out_format, &buffer_size));
+    std::vector<uint8_t> out(buffer_size);
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetImageOutBuffer(
+                                   dec, &out_format, out.data(), out.size()));
+    EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec));
+    double dist = ButteraugliDistance(xsize, ysize, pixels, c_in, intensity_in,
+                                      out, c_out, intensity_out);
+    if (c_in.white_point == c_out.white_point) {
+      EXPECT_LT(dist, 1.29);
+    } else {
+      EXPECT_LT(dist, 4.0);
+    }
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderProcessInput(dec));
+    JxlDecoderDestroy(dec);
+  }
+}
+}  // namespace
+
+TEST(DecodeTest, SetPreferredColorProfileTestFromGray) {
+  jxl::test::ColorEncodingDescriptor gray = {
+      jxl::ColorSpace::kGray, jxl::WhitePoint::kD65, jxl::Primaries::kSRGB,
+      jxl::TransferFunction::kSRGB, jxl::RenderingIntent::kRelative};
+  SetPreferredColorProfileTest(gray);
+}
+
+TEST_P(DecodeAllEncodingsTest, SetPreferredColorProfileTest) {
+  const auto& from = GetParam();
+  SetPreferredColorProfileTest(from);
+}
+
+// Tests the case of lossy sRGB image without alpha channel, decoded to RGB8
+// and to RGBA8
+TEST(DecodeTest, PixelTestOpaqueSrgbLossy) {
+  for (unsigned channels = 3; channels <= 4; channels++) {
+    JxlDecoder* dec = JxlDecoderCreate(NULL);
+
+    size_t xsize = 123, ysize = 77;
+    size_t num_pixels = xsize * ysize;
+    std::vector<uint8_t> pixels =
+        jxl::test::GetSomeTestImage(xsize, ysize, 3, 0);
+    JxlPixelFormat format_orig = {3, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+    jxl::PaddedBytes compressed = jxl::CreateTestJXLCodestream(
+        jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize, 3,
+        jxl::TestCodestreamParams());
+
+    JxlPixelFormat format = {channels, JXL_TYPE_UINT8, JXL_LITTLE_ENDIAN, 0};
+
+    std::vector<uint8_t> pixels2 = jxl::DecodeWithAPI(
+        dec, jxl::Span<const uint8_t>(compressed.data(), compressed.size()),
+        format, /*use_callback=*/true, /*set_buffer_early=*/false,
+        /*use_resizable_runner=*/false, /*require_boxes=*/false,
+        /*expect_success*/ true);
+    JxlDecoderReset(dec);
+    EXPECT_EQ(num_pixels * channels, pixels2.size());
+
+    jxl::ColorEncoding color_encoding0 = jxl::ColorEncoding::SRGB(false);
+    jxl::Span<const uint8_t> span0(pixels.data(), pixels.size());
+    jxl::CodecInOut io0;
+    io0.SetSize(xsize, ysize);
+    EXPECT_TRUE(ConvertFromExternal(span0, xsize, ysize, color_encoding0,
+                                    /*bits_per_sample=*/16, format_orig,
+                                    /*pool=*/nullptr, &io0.Main()));
+
+    jxl::ColorEncoding color_encoding1 = jxl::ColorEncoding::SRGB(false);
+    jxl::Span<const uint8_t> span1(pixels2.data(), pixels2.size());
+    jxl::CodecInOut io1;
+    EXPECT_TRUE(ConvertFromExternal(span1, xsize, ysize, color_encoding1,
+                                    /*bits_per_sample=*/8, format,
+                                    /*pool=*/nullptr, &io1.Main()));
+
+    jxl::ButteraugliParams ba;
+    EXPECT_THAT(
+        ButteraugliDistance(io0.frames, io1.frames, ba, jxl::GetJxlCms(),
+                            /*distmap=*/nullptr, nullptr),
+        IsSlightlyBelow(0.7f));
+
+    JxlDecoderDestroy(dec);
+  }
+}
+
+// Opaque image with noise enabled, decoded to RGB8 and RGBA8.
+TEST(DecodeTest, PixelTestOpaqueSrgbLossyNoise) {
+  for (unsigned channels = 3; channels <= 4; channels++) {
+    JxlDecoder* dec = JxlDecoderCreate(NULL);
+
+    size_t xsize = 512, ysize = 300;
+    size_t num_pixels = xsize * ysize;
+    std::vector<uint8_t> pixels =
+        jxl::test::GetSomeTestImage(xsize, ysize, 3, 0);
+    JxlPixelFormat format_orig = {3, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+    jxl::TestCodestreamParams params;
+    params.cparams.noise = jxl::Override::kOn;
+    jxl::PaddedBytes compressed = jxl::CreateTestJXLCodestream(
+        jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize, 3,
+        params);
+
+    JxlPixelFormat format = {channels, JXL_TYPE_UINT8, JXL_LITTLE_ENDIAN, 0};
+
+    std::vector<uint8_t> pixels2 = jxl::DecodeWithAPI(
+        dec, jxl::Span<const uint8_t>(compressed.data(), compressed.size()),
+        format, /*use_callback=*/false, /*set_buffer_early=*/true,
+        /*use_resizable_runner=*/false, /*require_boxes=*/false,
+        /*expect_success=*/true);
+    JxlDecoderReset(dec);
+    EXPECT_EQ(num_pixels * channels, pixels2.size());
+
+    jxl::ColorEncoding color_encoding0 = jxl::ColorEncoding::SRGB(false);
+    jxl::Span<const uint8_t> span0(pixels.data(), pixels.size());
+    jxl::CodecInOut io0;
+    io0.SetSize(xsize, ysize);
+    EXPECT_TRUE(ConvertFromExternal(span0, xsize, ysize, color_encoding0,
+                                    /*bits_per_sample=*/16, format_orig,
+                                    /*pool=*/nullptr, &io0.Main()));
+
+    jxl::ColorEncoding color_encoding1 = jxl::ColorEncoding::SRGB(false);
+    jxl::Span<const uint8_t> span1(pixels2.data(), pixels2.size());
+    jxl::CodecInOut io1;
+    EXPECT_TRUE(ConvertFromExternal(span1, xsize, ysize, color_encoding1,
+                                    /*bits_per_sample=*/8, format,
+                                    /*pool=*/nullptr, &io1.Main()));
+
+    jxl::ButteraugliParams ba;
+    EXPECT_THAT(
+        ButteraugliDistance(io0.frames, io1.frames, ba, jxl::GetJxlCms(),
+                            /*distmap=*/nullptr, nullptr),
+        IsSlightlyBelow(1.7f));
+
+    JxlDecoderDestroy(dec);
+  }
+}
+
+TEST(DecodeTest, ProcessEmptyInputWithBoxes) {
+  size_t xsize = 123, ysize = 77;
+  std::vector<uint8_t> pixels = jxl::test::GetSomeTestImage(xsize, ysize, 3, 0);
+  jxl::CompressParams cparams;
+  uint32_t channels = 3;
+  JxlPixelFormat format = {channels, JXL_TYPE_FLOAT, JXL_LITTLE_ENDIAN, 0};
+  for (int i = 0; i < kCSBF_NUM_ENTRIES; ++i) {
+    JxlDecoder* dec = JxlDecoderCreate(NULL);
+    jxl::TestCodestreamParams params;
+    params.box_format = (CodeStreamBoxFormat)i;
+    printf("Testing empty input with box format %d\n", (int)params.box_format);
+    jxl::PaddedBytes compressed = jxl::CreateTestJXLCodestream(
+        jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize, 3,
+        params);
+    const int events =
+        JXL_DEC_BASIC_INFO | JXL_DEC_FULL_IMAGE | JXL_DEC_COLOR_ENCODING;
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSubscribeEvents(dec, events));
+    EXPECT_EQ(JXL_DEC_NEED_MORE_INPUT, JxlDecoderProcessInput(dec));
+    EXPECT_EQ(JXL_DEC_SUCCESS,
+              JxlDecoderSetInput(dec, compressed.data(), compressed.size()));
+    EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec));
+    EXPECT_EQ(JXL_DEC_COLOR_ENCODING, JxlDecoderProcessInput(dec));
+    size_t buffer_size;
+    EXPECT_EQ(JXL_DEC_SUCCESS,
+              JxlDecoderImageOutBufferSize(dec, &format, &buffer_size));
+    JxlBasicInfo info;
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info));
+    const size_t remaining = JxlDecoderReleaseInput(dec);
+    EXPECT_LE(remaining, compressed.size());
+    EXPECT_EQ(JXL_DEC_NEED_MORE_INPUT, JxlDecoderProcessInput(dec));
+    JxlDecoderDestroy(dec);
+  }
+}
+
+TEST(DecodeTest, ExtraBytesAfterCompressedStream) {
+  size_t xsize = 123, ysize = 77;
+  size_t num_pixels = xsize * ysize;
+  std::vector<uint8_t> pixels = jxl::test::GetSomeTestImage(xsize, ysize, 3, 0);
+  jxl::CompressParams cparams;
+  for (int i = 0; i < kCSBF_NUM_ENTRIES; ++i) {
+    CodeStreamBoxFormat box_format = (CodeStreamBoxFormat)i;
+    if (box_format == kCSBF_Multi_Other_Zero_Terminated) continue;
+    printf("Testing with box format %d\n", (int)box_format);
+    size_t last_unknown_box_size = 0;
+    if (box_format == kCSBF_Single_Other) {
+      last_unknown_box_size = unk1_box_size + 8;
+    } else if (box_format == kCSBF_Multi_Other_Terminated) {
+      last_unknown_box_size = unk3_box_size + 8;
+    } else if (box_format == kCSBF_Multi_Last_Empty_Other) {
+      // If boxes are not required, the decoder won't consume the last empty
+      // jxlp box.
+      last_unknown_box_size = 12 + unk3_box_size + 8;
+    }
+    jxl::TestCodestreamParams params;
+    params.box_format = box_format;
+    jxl::PaddedBytes compressed = jxl::CreateTestJXLCodestream(
+        jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize, 3,
+        params);
+    // Add some more bytes after compressed data.
+    compressed.push_back(0);
+    compressed.push_back(1);
+    compressed.push_back(2);
+    JxlDecoder* dec = JxlDecoderCreate(NULL);
+    uint32_t channels = 3;
+    JxlPixelFormat format = {channels, JXL_TYPE_FLOAT, JXL_LITTLE_ENDIAN, 0};
+    std::vector<uint8_t> pixels2 = jxl::DecodeWithAPI(
+        dec, jxl::Span<const uint8_t>(compressed.data(), compressed.size()),
+        format, /*use_callback=*/false, /*set_buffer_early=*/true,
+        /*use_resizable_runner=*/false, /*require_boxes=*/false,
+        /*expect_success=*/true);
+    size_t unconsumed_bytes = JxlDecoderReleaseInput(dec);
+    EXPECT_EQ(last_unknown_box_size + 3, unconsumed_bytes);
+    EXPECT_EQ(num_pixels * channels * 4, pixels2.size());
+    JxlDecoderDestroy(dec);
+  }
+}
+
+TEST(DecodeTest, ExtraBytesAfterCompressedStreamRequireBoxes) {
+  size_t xsize = 123, ysize = 77;
+  size_t num_pixels = xsize * ysize;
+  std::vector<uint8_t> pixels = jxl::test::GetSomeTestImage(xsize, ysize, 3, 0);
+  jxl::CompressParams cparams;
+  for (int i = 0; i < kCSBF_NUM_ENTRIES; ++i) {
+    CodeStreamBoxFormat box_format = (CodeStreamBoxFormat)i;
+    if (box_format == kCSBF_Multi_Other_Zero_Terminated) continue;
+    printf("Testing with box format %d\n", (int)box_format);
+    bool expect_success = (box_format == kCSBF_None ||
+                           box_format == kCSBF_Single_Zero_Terminated ||
+                           box_format == kCSBF_Multi_Zero_Terminated);
+    jxl::TestCodestreamParams params;
+    params.box_format = box_format;
+    jxl::PaddedBytes compressed = jxl::CreateTestJXLCodestream(
+        jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize, 3,
+        params);
+    // Add some more bytes after compressed data.
+    compressed.push_back(0);
+    compressed.push_back(1);
+    compressed.push_back(2);
+    JxlDecoder* dec = JxlDecoderCreate(NULL);
+    uint32_t channels = 3;
+    JxlPixelFormat format = {channels, JXL_TYPE_FLOAT, JXL_LITTLE_ENDIAN, 0};
+    std::vector<uint8_t> pixels2 = jxl::DecodeWithAPI(
+        dec, jxl::Span<const uint8_t>(compressed.data(), compressed.size()),
+        format, /*use_callback=*/false, /*set_buffer_early=*/true,
+        /*use_resizable_runner=*/false, /*require_boxes=*/true, expect_success);
+    size_t unconsumed_bytes = JxlDecoderReleaseInput(dec);
+    EXPECT_EQ(3, unconsumed_bytes);
+    EXPECT_EQ(num_pixels * channels * 4, pixels2.size());
+    JxlDecoderDestroy(dec);
+  }
+}
+
+TEST(DecodeTest, ConcatenatedCompressedStreams) {
+  size_t xsize = 123, ysize = 77;
+  size_t num_pixels = xsize * ysize;
+  std::vector<uint8_t> pixels = jxl::test::GetSomeTestImage(xsize, ysize, 3, 0);
+  jxl::CompressParams cparams;
+  for (int i = 0; i < kCSBF_NUM_ENTRIES; ++i) {
+    CodeStreamBoxFormat first_box_format = (CodeStreamBoxFormat)i;
+    if (first_box_format == kCSBF_Multi_Other_Zero_Terminated) continue;
+    jxl::TestCodestreamParams params1;
+    params1.box_format = first_box_format;
+    jxl::PaddedBytes compressed1 = jxl::CreateTestJXLCodestream(
+        jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize, 3,
+        params1);
+    for (int j = 0; j < kCSBF_NUM_ENTRIES; ++j) {
+      CodeStreamBoxFormat second_box_format = (CodeStreamBoxFormat)j;
+      if (second_box_format == kCSBF_Multi_Other_Zero_Terminated) continue;
+      printf("Testing with box format pair %d, %d\n", (int)first_box_format,
+             (int)second_box_format);
+      jxl::TestCodestreamParams params2;
+      params2.box_format = second_box_format;
+      jxl::PaddedBytes compressed2 = jxl::CreateTestJXLCodestream(
+          jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize,
+          3, params2);
+      jxl::PaddedBytes concat;
+      concat.append(compressed1);
+      concat.append(compressed2);
+      uint32_t channels = 3;
+      JxlPixelFormat format = {channels, JXL_TYPE_FLOAT, JXL_LITTLE_ENDIAN, 0};
+      size_t remaining = concat.size();
+      for (int part = 0; part < 2; ++part) {
+        printf("  Decoding part %d\n", part + 1);
+        JxlDecoder* dec = JxlDecoderCreate(NULL);
+        size_t pos = concat.size() - remaining;
+        bool expect_success =
+            (part == 0 || second_box_format == kCSBF_None ||
+             second_box_format == kCSBF_Single_Zero_Terminated ||
+             second_box_format == kCSBF_Multi_Zero_Terminated);
+        std::vector<uint8_t> pixels2 = jxl::DecodeWithAPI(
+            dec, jxl::Span<const uint8_t>(concat.data() + pos, remaining),
+            format, /*use_callback=*/false, /*set_buffer_early=*/true,
+            /*use_resizable_runner=*/false, /*require_boxes=*/true,
+            expect_success);
+        EXPECT_EQ(num_pixels * channels * 4, pixels2.size());
+        remaining = JxlDecoderReleaseInput(dec);
+        JxlDecoderDestroy(dec);
+      }
+      EXPECT_EQ(0, remaining);
+    }
+  }
+}
+
+void TestPartialStream(bool reconstructible_jpeg) {
+  size_t xsize = 123, ysize = 77;
+  uint32_t channels = 4;
+  if (reconstructible_jpeg) {
+    channels = 3;
+  }
+  std::vector<uint8_t> pixels =
+      jxl::test::GetSomeTestImage(xsize, ysize, channels, 0);
+  JxlPixelFormat format_orig = {channels, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+  jxl::TestCodestreamParams params;
+  if (reconstructible_jpeg) {
+    params.cparams.color_transform = jxl::ColorTransform::kNone;
+  } else {
+    // Lossless to verify pixels exactly after roundtrip.
+    params.cparams.SetLossless();
+  }
+
+  std::vector<uint8_t> pixels2;
+  pixels2.resize(pixels.size());
+
+  jxl::PaddedBytes jpeg_output(64);
+  size_t used_jpeg_output = 0;
+
+  std::vector<jxl::PaddedBytes> codestreams(kCSBF_NUM_ENTRIES);
+  std::vector<jxl::PaddedBytes> jpeg_codestreams(kCSBF_NUM_ENTRIES);
+  for (size_t i = 0; i < kCSBF_NUM_ENTRIES; ++i) {
+    params.box_format = (CodeStreamBoxFormat)i;
+    if (reconstructible_jpeg) {
+      params.jpeg_codestream = &jpeg_codestreams[i];
+    }
+    codestreams[i] = jxl::CreateTestJXLCodestream(
+        jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize,
+        channels, params);
+  }
+
+  // Test multiple step sizes, to test different combinations of the streaming
+  // box parsing.
+  std::vector<size_t> increments = {1, 3, 17, 23, 120, 700, 1050};
+
+  for (size_t index = 0; index < increments.size(); index++) {
+    for (size_t i = 0; i < kCSBF_NUM_ENTRIES; ++i) {
+      if (reconstructible_jpeg &&
+          (CodeStreamBoxFormat)i == CodeStreamBoxFormat::kCSBF_None) {
+        continue;
+      }
+      const jxl::PaddedBytes& data = codestreams[i];
+      const uint8_t* next_in = data.data();
+      size_t avail_in = 0;
+
+      JxlDecoder* dec = JxlDecoderCreate(nullptr);
+
+      EXPECT_EQ(JXL_DEC_SUCCESS,
+                JxlDecoderSubscribeEvents(
+                    dec, JXL_DEC_BASIC_INFO | JXL_DEC_FULL_IMAGE |
+                             JXL_DEC_JPEG_RECONSTRUCTION));
+
+      bool seen_basic_info = false;
+      bool seen_full_image = false;
+      bool seen_jpeg_recon = false;
+
+      size_t total_size = 0;
+
+      for (;;) {
+        EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in));
+        JxlDecoderStatus status = JxlDecoderProcessInput(dec);
+        size_t remaining = JxlDecoderReleaseInput(dec);
+        EXPECT_LE(remaining, avail_in);
+        next_in += avail_in - remaining;
+        avail_in = remaining;
+        if (status == JXL_DEC_NEED_MORE_INPUT) {
+          if (total_size >= data.size()) {
+            // End of test data reached, it should have successfully decoded the
+            // image now.
+            FAIL();
+            break;
+          }
+
+          size_t increment = increments[index];
+          // End of the file reached, should be the final test.
+          if (total_size + increment > data.size()) {
+            increment = data.size() - total_size;
+          }
+          total_size += increment;
+          avail_in += increment;
+        } else if (status == JXL_DEC_BASIC_INFO) {
+          // This event should happen exactly once
+          EXPECT_FALSE(seen_basic_info);
+          if (seen_basic_info) break;
+          seen_basic_info = true;
+          JxlBasicInfo info;
+          EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info));
+          EXPECT_EQ(info.xsize, xsize);
+          EXPECT_EQ(info.ysize, ysize);
+        } else if (status == JXL_DEC_JPEG_RECONSTRUCTION) {
+          EXPECT_FALSE(seen_basic_info);
+          EXPECT_FALSE(seen_full_image);
+          EXPECT_EQ(JXL_DEC_SUCCESS,
+                    JxlDecoderSetJPEGBuffer(dec, jpeg_output.data(),
+                                            jpeg_output.size()));
+          seen_jpeg_recon = true;
+        } else if (status == JXL_DEC_JPEG_NEED_MORE_OUTPUT) {
+          EXPECT_TRUE(seen_jpeg_recon);
+          used_jpeg_output =
+              jpeg_output.size() - JxlDecoderReleaseJPEGBuffer(dec);
+          jpeg_output.resize(jpeg_output.size() * 2);
+          EXPECT_EQ(JXL_DEC_SUCCESS,
+                    JxlDecoderSetJPEGBuffer(
+                        dec, jpeg_output.data() + used_jpeg_output,
+                        jpeg_output.size() - used_jpeg_output));
+        } else if (status == JXL_DEC_NEED_IMAGE_OUT_BUFFER) {
+          EXPECT_EQ(JXL_DEC_SUCCESS,
+                    JxlDecoderSetImageOutBuffer(
+                        dec, &format_orig, pixels2.data(), pixels2.size()));
+        } else if (status == JXL_DEC_FULL_IMAGE) {
+          // This event should happen exactly once
+          EXPECT_FALSE(seen_full_image);
+          if (seen_full_image) break;
+          // This event should happen after basic info
+          EXPECT_TRUE(seen_basic_info);
+          seen_full_image = true;
+          if (reconstructible_jpeg) {
+            used_jpeg_output =
+                jpeg_output.size() - JxlDecoderReleaseJPEGBuffer(dec);
+            EXPECT_EQ(used_jpeg_output, jpeg_codestreams[i].size());
+            EXPECT_EQ(0, memcmp(jpeg_output.data(), jpeg_codestreams[i].data(),
+                                used_jpeg_output));
+          } else {
+            EXPECT_EQ(pixels, pixels2);
+          }
+        } else if (status == JXL_DEC_SUCCESS) {
+          EXPECT_TRUE(seen_full_image);
+          break;
+        } else {
+          // We do not expect any other events or errors
+          FAIL();
+          break;
+        }
+      }
+
+      // Ensure the decoder emitted the basic info and full image events
+      EXPECT_TRUE(seen_basic_info);
+      EXPECT_TRUE(seen_full_image);
+
+      JxlDecoderDestroy(dec);
+    }
+  }
+}
+
+// Tests the return status when trying to decode pixels on incomplete file: it
+// should return JXL_DEC_NEED_MORE_INPUT, not error.
+TEST(DecodeTest, PixelPartialTest) { TestPartialStream(false); }
+
+#if JPEGXL_ENABLE_JPEG
+// Tests the return status when trying to decode JPEG bytes on incomplete file.
+TEST(DecodeTest, JXL_TRANSCODE_JPEG_TEST(JPEGPartialTest)) {
+  TestPartialStream(true);
+}
+#endif  // JPEGXL_ENABLE_JPEG
+
+// The DC event still exists, but is no longer implemented, it is deprecated.
+TEST(DecodeTest, DCNotGettableTest) {
+  // 1x1 pixel JXL image
+  std::string compressed(
+      "\377\n\0\20\260\23\0H\200("
+      "\0\334\0U\17\0\0\250P\31e\334\340\345\\\317\227\37:,"
+      "\246m\\gh\253m\vK\22E\306\261I\252C&pH\22\353 "
+      "\363\6\22\bp\0\200\237\34\231W2d\255$\1",
+      68);
+
+  JxlDecoder* dec = JxlDecoderCreate(NULL);
+
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSubscribeEvents(dec, JXL_DEC_BASIC_INFO));
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSetInput(
+                dec, reinterpret_cast<const uint8_t*>(compressed.data()),
+                compressed.size()));
+
+  EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec));
+
+  // Since the image is only 1x1 pixel, there is only 1 group, the decoder is
+  // unable to get DC size from this, and will not return the DC at all. Since
+  // no full image is requested either, it is expected to return success.
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderProcessInput(dec));
+
+  JxlDecoderDestroy(dec);
+}
+
+TEST(DecodeTest, PreviewTest) {
+  size_t xsize = 77, ysize = 120;
+  std::vector<uint8_t> pixels = jxl::test::GetSomeTestImage(xsize, ysize, 3, 0);
+  JxlPixelFormat format_orig = {3, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+  for (jxl::PreviewMode mode : {jxl::kSmallPreview, jxl::kBigPreview}) {
+    jxl::TestCodestreamParams params;
+    params.preview_mode = mode;
+
+    jxl::PaddedBytes compressed = jxl::CreateTestJXLCodestream(
+        jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize, 3,
+        params);
+
+    JxlPixelFormat format = {3, JXL_TYPE_UINT8, JXL_LITTLE_ENDIAN, 0};
+
+    JxlDecoder* dec = JxlDecoderCreate(NULL);
+    const uint8_t* next_in = compressed.data();
+    size_t avail_in = compressed.size();
+
+    EXPECT_EQ(JXL_DEC_SUCCESS,
+              JxlDecoderSubscribeEvents(
+                  dec, JXL_DEC_BASIC_INFO | JXL_DEC_PREVIEW_IMAGE));
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in));
+
+    EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec));
+    JxlBasicInfo info;
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info));
+    size_t buffer_size;
+    EXPECT_EQ(JXL_DEC_SUCCESS,
+              JxlDecoderPreviewOutBufferSize(dec, &format, &buffer_size));
+
+    jxl::ColorEncoding c_srgb = jxl::ColorEncoding::SRGB(false);
+    jxl::CodecInOut io0;
+    EXPECT_TRUE(jxl::ConvertFromExternal(
+        jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize,
+        c_srgb, /*bits_per_sample=*/16, format_orig, /*pool=*/nullptr,
+        &io0.Main()));
+    GeneratePreview(params.preview_mode, &io0.Main());
+
+    size_t xsize_preview = io0.Main().xsize();
+    size_t ysize_preview = io0.Main().ysize();
+    EXPECT_EQ(xsize_preview, info.preview.xsize);
+    EXPECT_EQ(ysize_preview, info.preview.ysize);
+    EXPECT_EQ(xsize_preview * ysize_preview * 3, buffer_size);
+
+    EXPECT_EQ(JXL_DEC_NEED_PREVIEW_OUT_BUFFER, JxlDecoderProcessInput(dec));
+
+    std::vector<uint8_t> preview(buffer_size);
+    EXPECT_EQ(JXL_DEC_SUCCESS,
+              JxlDecoderSetPreviewOutBuffer(dec, &format, preview.data(),
+                                            preview.size()));
+
+    EXPECT_EQ(JXL_DEC_PREVIEW_IMAGE, JxlDecoderProcessInput(dec));
+
+    jxl::CodecInOut io1;
+    EXPECT_TRUE(jxl::ConvertFromExternal(
+        jxl::Span<const uint8_t>(preview.data(), preview.size()), xsize_preview,
+        ysize_preview, c_srgb,
+        /*bits_per_sample=*/8, format,
+        /*pool=*/nullptr, &io1.Main()));
+
+    jxl::ButteraugliParams ba;
+    // TODO(lode): this ButteraugliDistance silently returns 0 (dangerous for
+    // tests) if xsize or ysize is < 8, no matter how different the images, a
+    // tiny size that could happen for a preview. ButteraugliDiffmap does
+    // support smaller than 8x8, but jxl's ButteraugliDistance does not. Perhaps
+    // move butteraugli's <8x8 handling from ButteraugliDiffmap to
+    // ButteraugliComparator::Diffmap in butteraugli.cc.
+    EXPECT_LE(ButteraugliDistance(io0.frames, io1.frames, ba, jxl::GetJxlCms(),
+                                  /*distmap=*/nullptr, nullptr),
+              mode == jxl::kSmallPreview ? 0.7f : 1.2f);
+
+    JxlDecoderDestroy(dec);
+  }
+}
+
+TEST(DecodeTest, AlignTest) {
+  size_t xsize = 123, ysize = 77;
+  std::vector<uint8_t> pixels = jxl::test::GetSomeTestImage(xsize, ysize, 4, 0);
+  JxlPixelFormat format_orig = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+
+  jxl::TestCodestreamParams params;
+  // Lossless to verify pixels exactly after roundtrip.
+  params.cparams.SetLossless();
+  params.cparams.speed_tier = jxl::SpeedTier::kThunder;
+  jxl::PaddedBytes compressed = jxl::CreateTestJXLCodestream(
+      jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize, 4,
+      params);
+
+  size_t align = 17;
+  JxlPixelFormat format = {3, JXL_TYPE_UINT8, JXL_LITTLE_ENDIAN, align};
+  // On purpose not using jxl::RoundUpTo to test it independently.
+  size_t expected_line_bytes = (1 * 3 * xsize + align - 1) / align * align;
+
+  for (int use_callback = 0; use_callback <= 1; ++use_callback) {
+    std::vector<uint8_t> pixels2 = jxl::DecodeWithAPI(
+        jxl::Span<const uint8_t>(compressed.data(), compressed.size()), format,
+        use_callback, /*set_buffer_early=*/false,
+        /*use_resizable_runner=*/false, /*require_boxes=*/false,
+        /*expect_success=*/true);
+    EXPECT_EQ(expected_line_bytes * ysize, pixels2.size());
+    EXPECT_EQ(0u, jxl::test::ComparePixels(pixels.data(), pixels2.data(), xsize,
+                                           ysize, format_orig, format));
+  }
+}
+
+TEST(DecodeTest, AnimationTest) {
+  size_t xsize = 123, ysize = 77;
+  static const size_t num_frames = 2;
+  std::vector<uint8_t> frames[2];
+  frames[0] = jxl::test::GetSomeTestImage(xsize, ysize, 3, 0);
+  frames[1] = jxl::test::GetSomeTestImage(xsize, ysize, 3, 1);
+  JxlPixelFormat format = {3, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+
+  jxl::CodecInOut io;
+  io.SetSize(xsize, ysize);
+  io.metadata.m.SetUintSamples(16);
+  io.metadata.m.color_encoding = jxl::ColorEncoding::SRGB(false);
+  io.metadata.m.have_animation = true;
+  io.frames.clear();
+  io.frames.reserve(num_frames);
+  io.SetSize(xsize, ysize);
+
+  std::vector<uint32_t> frame_durations(num_frames);
+  for (size_t i = 0; i < num_frames; ++i) {
+    frame_durations[i] = 5 + i;
+  }
+
+  for (size_t i = 0; i < num_frames; ++i) {
+    jxl::ImageBundle bundle(&io.metadata.m);
+
+    EXPECT_TRUE(ConvertFromExternal(
+        jxl::Span<const uint8_t>(frames[i].data(), frames[i].size()), xsize,
+        ysize, jxl::ColorEncoding::SRGB(/*is_gray=*/false),
+        /*bits_per_sample=*/16, format,
+        /*pool=*/nullptr, &bundle));
+    bundle.duration = frame_durations[i];
+    io.frames.push_back(std::move(bundle));
+  }
+
+  jxl::CompressParams cparams;
+  cparams.SetLossless();  // Lossless to verify pixels exactly after roundtrip.
+  cparams.speed_tier = jxl::SpeedTier::kThunder;
+  jxl::AuxOut aux_out;
+  jxl::PaddedBytes compressed;
+  jxl::PassesEncoderState enc_state;
+  EXPECT_TRUE(jxl::EncodeFile(cparams, &io, &enc_state, &compressed,
+                              jxl::GetJxlCms(), &aux_out, nullptr));
+
+  // Decode and test the animation frames
+
+  JxlDecoder* dec = JxlDecoderCreate(NULL);
+  const uint8_t* next_in = compressed.data();
+  size_t avail_in = compressed.size();
+
+  void* runner = JxlThreadParallelRunnerCreate(
+      NULL, JxlThreadParallelRunnerDefaultNumWorkerThreads());
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSetParallelRunner(dec, JxlThreadParallelRunner, runner));
+
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSubscribeEvents(
+                dec, JXL_DEC_BASIC_INFO | JXL_DEC_FRAME | JXL_DEC_FULL_IMAGE));
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in));
+
+  EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec));
+  size_t buffer_size;
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderImageOutBufferSize(dec, &format, &buffer_size));
+  JxlBasicInfo info;
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info));
+
+  for (size_t i = 0; i < num_frames; ++i) {
+    std::vector<uint8_t> pixels(buffer_size);
+
+    EXPECT_EQ(JXL_DEC_FRAME, JxlDecoderProcessInput(dec));
+
+    JxlFrameHeader frame_header;
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetFrameHeader(dec, &frame_header));
+    EXPECT_EQ(frame_durations[i], frame_header.duration);
+    EXPECT_EQ(0u, frame_header.name_length);
+    // For now, test with empty name, there's currently no easy way to encode
+    // a jxl file with a frame name because ImageBundle doesn't have a
+    // jxl::FrameHeader to set the name in. We can test the null termination
+    // character though.
+    char name;
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetFrameName(dec, &name, 1));
+    EXPECT_EQ(0, name);
+
+    EXPECT_EQ(i + 1 == num_frames, frame_header.is_last);
+
+    EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec));
+
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetImageOutBuffer(
+                                   dec, &format, pixels.data(), pixels.size()));
+
+    EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec));
+    EXPECT_EQ(0u, jxl::test::ComparePixels(frames[i].data(), pixels.data(),
+                                           xsize, ysize, format, format));
+  }
+
+  // After all frames were decoded, JxlDecoderProcessInput should return
+  // success to indicate all is done.
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderProcessInput(dec));
+
+  JxlThreadParallelRunnerDestroy(runner);
+  JxlDecoderDestroy(dec);
+}
+
+TEST(DecodeTest, AnimationTestStreaming) {
+  size_t xsize = 123, ysize = 77;
+  static const size_t num_frames = 2;
+  std::vector<uint8_t> frames[2];
+  frames[0] = jxl::test::GetSomeTestImage(xsize, ysize, 3, 0);
+  frames[1] = jxl::test::GetSomeTestImage(xsize, ysize, 3, 1);
+  JxlPixelFormat format = {3, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+
+  jxl::CodecInOut io;
+  io.SetSize(xsize, ysize);
+  io.metadata.m.SetUintSamples(16);
+  io.metadata.m.color_encoding = jxl::ColorEncoding::SRGB(false);
+  io.metadata.m.have_animation = true;
+  io.frames.clear();
+  io.frames.reserve(num_frames);
+  io.SetSize(xsize, ysize);
+
+  std::vector<uint32_t> frame_durations(num_frames);
+  for (size_t i = 0; i < num_frames; ++i) {
+    frame_durations[i] = 5 + i;
+  }
+
+  for (size_t i = 0; i < num_frames; ++i) {
+    jxl::ImageBundle bundle(&io.metadata.m);
+
+    EXPECT_TRUE(ConvertFromExternal(
+        jxl::Span<const uint8_t>(frames[i].data(), frames[i].size()), xsize,
+        ysize, jxl::ColorEncoding::SRGB(/*is_gray=*/false),
+        /*bits_per_sample=*/16, format,
+        /*pool=*/nullptr, &bundle));
+    bundle.duration = frame_durations[i];
+    io.frames.push_back(std::move(bundle));
+  }
+
+  jxl::CompressParams cparams;
+  cparams.SetLossless();  // Lossless to verify pixels exactly after roundtrip.
+  cparams.speed_tier = jxl::SpeedTier::kThunder;
+  jxl::AuxOut aux_out;
+  jxl::PaddedBytes compressed;
+  jxl::PassesEncoderState enc_state;
+  EXPECT_TRUE(jxl::EncodeFile(cparams, &io, &enc_state, &compressed,
+                              jxl::GetJxlCms(), &aux_out, nullptr));
+
+  // Decode and test the animation frames
+
+  const size_t step_size = 16;
+
+  JxlDecoder* dec = JxlDecoderCreate(NULL);
+  const uint8_t* next_in = compressed.data();
+  size_t avail_in = 0;
+  size_t frame_headers_seen = 0;
+  size_t frames_seen = 0;
+  bool seen_basic_info = false;
+
+  void* runner = JxlThreadParallelRunnerCreate(
+      NULL, JxlThreadParallelRunnerDefaultNumWorkerThreads());
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSetParallelRunner(dec, JxlThreadParallelRunner, runner));
+
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSubscribeEvents(
+                dec, JXL_DEC_BASIC_INFO | JXL_DEC_FRAME | JXL_DEC_FULL_IMAGE));
+
+  std::vector<uint8_t> frames2[2];
+  for (size_t i = 0; i < num_frames; ++i) {
+    frames2[i].resize(frames[i].size());
+  }
+
+  size_t total_in = 0;
+  size_t loop_count = 0;
+
+  for (;;) {
+    if (loop_count++ > compressed.size()) {
+      fprintf(stderr, "Too many loops\n");
+      FAIL();
+      break;
+    }
+
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in));
+    auto status = JxlDecoderProcessInput(dec);
+    size_t remaining = JxlDecoderReleaseInput(dec);
+    EXPECT_LE(remaining, avail_in);
+    next_in += avail_in - remaining;
+    avail_in = remaining;
+
+    if (status == JXL_DEC_SUCCESS) {
+      break;
+    } else if (status == JXL_DEC_ERROR) {
+      FAIL();
+    } else if (status == JXL_DEC_NEED_MORE_INPUT) {
+      if (total_in >= compressed.size()) {
+        fprintf(stderr, "Already gave all input data\n");
+        FAIL();
+        break;
+      }
+      size_t amount = step_size;
+      if (total_in + amount > compressed.size()) {
+        amount = compressed.size() - total_in;
+      }
+      avail_in += amount;
+      total_in += amount;
+    } else if (status == JXL_DEC_NEED_IMAGE_OUT_BUFFER) {
+      EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetImageOutBuffer(
+                                     dec, &format, frames2[frames_seen].data(),
+                                     frames2[frames_seen].size()));
+    } else if (status == JXL_DEC_BASIC_INFO) {
+      EXPECT_EQ(false, seen_basic_info);
+      seen_basic_info = true;
+      JxlBasicInfo info;
+      EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info));
+      EXPECT_EQ(xsize, info.xsize);
+      EXPECT_EQ(ysize, info.ysize);
+    } else if (status == JXL_DEC_FRAME) {
+      EXPECT_EQ(true, seen_basic_info);
+      frame_headers_seen++;
+    } else if (status == JXL_DEC_FULL_IMAGE) {
+      frames_seen++;
+      EXPECT_EQ(frame_headers_seen, frames_seen);
+    } else {
+      fprintf(stderr, "Unexpected status: %d\n", (int)status);
+      FAIL();
+    }
+  }
+
+  EXPECT_EQ(true, seen_basic_info);
+  EXPECT_EQ(num_frames, frames_seen);
+  EXPECT_EQ(num_frames, frame_headers_seen);
+  for (size_t i = 0; i < num_frames; ++i) {
+    EXPECT_EQ(frames[i], frames2[i]);
+  }
+
+  JxlThreadParallelRunnerDestroy(runner);
+  JxlDecoderDestroy(dec);
+}
+
+TEST(DecodeTest, ExtraChannelTest) {
+  size_t xsize = 55, ysize = 257;
+  std::vector<uint8_t> pixels = jxl::test::GetSomeTestImage(xsize, ysize, 4, 0);
+  JxlPixelFormat format_orig = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+
+  jxl::TestCodestreamParams params;
+  // Lossless to verify pixels exactly after roundtrip.
+  params.cparams.SetLossless();
+  params.cparams.speed_tier = jxl::SpeedTier::kThunder;
+  jxl::PaddedBytes compressed = jxl::CreateTestJXLCodestream(
+      jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize, 4,
+      params);
+
+  size_t align = 17;
+  JxlPixelFormat format = {3, JXL_TYPE_UINT8, JXL_LITTLE_ENDIAN, align};
+
+  JxlDecoder* dec = JxlDecoderCreate(NULL);
+
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSubscribeEvents(
+                                 dec, JXL_DEC_BASIC_INFO | JXL_DEC_FULL_IMAGE));
+
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSetInput(dec, compressed.data(), compressed.size()));
+  EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec));
+  JxlBasicInfo info;
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info));
+  EXPECT_EQ(1u, info.num_extra_channels);
+  EXPECT_EQ(JXL_FALSE, info.alpha_premultiplied);
+
+  JxlExtraChannelInfo extra_info;
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderGetExtraChannelInfo(dec, 0, &extra_info));
+  EXPECT_EQ(0, extra_info.type);
+
+  EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec));
+  size_t buffer_size;
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderImageOutBufferSize(dec, &format, &buffer_size));
+  size_t extra_size;
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderExtraChannelBufferSize(dec, &format, &extra_size, 0));
+
+  std::vector<uint8_t> image(buffer_size);
+  std::vector<uint8_t> extra(extra_size);
+
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetImageOutBuffer(
+                                 dec, &format, image.data(), image.size()));
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetExtraChannelBuffer(
+                                 dec, &format, extra.data(), extra.size(), 0));
+
+  EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec));
+
+  // After the full image was output, JxlDecoderProcessInput should return
+  // success to indicate all is done.
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderProcessInput(dec));
+  JxlDecoderDestroy(dec);
+
+  EXPECT_EQ(0u, jxl::test::ComparePixels(pixels.data(), image.data(), xsize,
+                                         ysize, format_orig, format));
+
+  // Compare the extracted extra channel with the original alpha channel
+
+  std::vector<uint8_t> alpha(pixels.size() / 4);
+  for (size_t i = 0; i < pixels.size(); i += 8) {
+    size_t index_alpha = i / 4;
+    alpha[index_alpha + 0] = pixels[i + 6];
+    alpha[index_alpha + 1] = pixels[i + 7];
+  }
+  JxlPixelFormat format_alpha = format;
+  format_alpha.num_channels = 1;
+  JxlPixelFormat format_orig_alpha = format_orig;
+  format_orig_alpha.num_channels = 1;
+
+  EXPECT_EQ(0u,
+            jxl::test::ComparePixels(alpha.data(), extra.data(), xsize, ysize,
+                                     format_orig_alpha, format_alpha));
+}
+
+TEST(DecodeTest, SkipCurrentFrameTest) {
+  size_t xsize = 90, ysize = 120;
+  constexpr size_t num_frames = 7;
+  std::vector<uint8_t> frames[num_frames];
+  for (size_t i = 0; i < num_frames; i++) {
+    frames[i] = jxl::test::GetSomeTestImage(xsize, ysize, 3, i);
+  }
+  JxlPixelFormat format = {3, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+
+  jxl::CodecInOut io;
+  io.SetSize(xsize, ysize);
+  io.metadata.m.SetUintSamples(16);
+  io.metadata.m.color_encoding = jxl::ColorEncoding::SRGB(false);
+  io.metadata.m.have_animation = true;
+  io.frames.clear();
+  io.frames.reserve(num_frames);
+  io.SetSize(xsize, ysize);
+
+  std::vector<uint32_t> frame_durations(num_frames);
+  for (size_t i = 0; i < num_frames; ++i) {
+    frame_durations[i] = 5 + i;
+  }
+
+  for (size_t i = 0; i < num_frames; ++i) {
+    jxl::ImageBundle bundle(&io.metadata.m);
+    if (i & 1) {
+      // Mark some frames as referenceable, others not.
+      bundle.use_for_next_frame = true;
+    }
+
+    EXPECT_TRUE(ConvertFromExternal(
+        jxl::Span<const uint8_t>(frames[i].data(), frames[i].size()), xsize,
+        ysize, jxl::ColorEncoding::SRGB(/*is_gray=*/false),
+        /*bits_per_sample=*/16, format,
+        /*pool=*/nullptr, &bundle));
+    bundle.duration = frame_durations[i];
+    io.frames.push_back(std::move(bundle));
+  }
+
+  jxl::CompressParams cparams;
+  cparams.speed_tier = jxl::SpeedTier::kThunder;
+  jxl::AuxOut aux_out;
+  jxl::PaddedBytes compressed;
+  jxl::PassesEncoderState enc_state;
+  jxl::PassDefinition passes[] = {{2, 0, 4}, {4, 0, 4}, {8, 2, 2}, {8, 0, 1}};
+  jxl::ProgressiveMode progressive_mode{passes};
+  enc_state.progressive_splitter.SetProgressiveMode(progressive_mode);
+  EXPECT_TRUE(jxl::EncodeFile(cparams, &io, &enc_state, &compressed,
+                              jxl::GetJxlCms(), &aux_out, nullptr));
+
+  JxlDecoder* dec = JxlDecoderCreate(NULL);
+  const uint8_t* next_in = compressed.data();
+  size_t avail_in = compressed.size();
+
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSubscribeEvents(dec, JXL_DEC_BASIC_INFO | JXL_DEC_FRAME |
+                                               JXL_DEC_FRAME_PROGRESSION |
+                                               JXL_DEC_FULL_IMAGE));
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetProgressiveDetail(dec, kLastPasses));
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in));
+
+  EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec));
+  size_t buffer_size;
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderImageOutBufferSize(dec, &format, &buffer_size));
+  JxlBasicInfo info;
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info));
+
+  for (size_t i = 0; i < num_frames; ++i) {
+    printf("Decoding frame %d\n", (int)i);
+    EXPECT_EQ(JXL_DEC_ERROR, JxlDecoderSkipCurrentFrame(dec));
+    std::vector<uint8_t> pixels(buffer_size);
+    EXPECT_EQ(JXL_DEC_FRAME, JxlDecoderProcessInput(dec));
+    EXPECT_EQ(JXL_DEC_ERROR, JxlDecoderSkipCurrentFrame(dec));
+    JxlFrameHeader frame_header;
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetFrameHeader(dec, &frame_header));
+    EXPECT_EQ(frame_durations[i], frame_header.duration);
+    EXPECT_EQ(i + 1 == num_frames, frame_header.is_last);
+    EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec));
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetImageOutBuffer(
+                                   dec, &format, pixels.data(), pixels.size()));
+    if (i == 2) {
+      EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSkipCurrentFrame(dec));
+      continue;
+    }
+    EXPECT_EQ(JXL_DEC_FRAME_PROGRESSION, JxlDecoderProcessInput(dec));
+    EXPECT_EQ(8, JxlDecoderGetIntendedDownsamplingRatio(dec));
+    if (i == 3) {
+      EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSkipCurrentFrame(dec));
+      continue;
+    }
+    EXPECT_EQ(JXL_DEC_FRAME_PROGRESSION, JxlDecoderProcessInput(dec));
+    EXPECT_EQ(4, JxlDecoderGetIntendedDownsamplingRatio(dec));
+    if (i == 4) {
+      EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSkipCurrentFrame(dec));
+      continue;
+    }
+    EXPECT_EQ(JXL_DEC_FRAME_PROGRESSION, JxlDecoderProcessInput(dec));
+    EXPECT_EQ(2, JxlDecoderGetIntendedDownsamplingRatio(dec));
+    if (i == 5) {
+      EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSkipCurrentFrame(dec));
+      continue;
+    }
+    EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec));
+    EXPECT_EQ(JXL_DEC_ERROR, JxlDecoderSkipCurrentFrame(dec));
+  }
+
+  // After all frames were decoded, JxlDecoderProcessInput should return
+  // success to indicate all is done.
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderProcessInput(dec));
+
+  JxlDecoderDestroy(dec);
+}
+
+TEST(DecodeTest, SkipFrameTest) {
+  size_t xsize = 90, ysize = 120;
+  constexpr size_t num_frames = 16;
+  std::vector<uint8_t> frames[num_frames];
+  for (size_t i = 0; i < num_frames; i++) {
+    frames[i] = jxl::test::GetSomeTestImage(xsize, ysize, 3, i);
+  }
+  JxlPixelFormat format = {3, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+
+  jxl::CodecInOut io;
+  io.SetSize(xsize, ysize);
+  io.metadata.m.SetUintSamples(16);
+  io.metadata.m.color_encoding = jxl::ColorEncoding::SRGB(false);
+  io.metadata.m.have_animation = true;
+  io.frames.clear();
+  io.frames.reserve(num_frames);
+  io.SetSize(xsize, ysize);
+
+  std::vector<uint32_t> frame_durations(num_frames);
+  for (size_t i = 0; i < num_frames; ++i) {
+    frame_durations[i] = 5 + i;
+  }
+
+  for (size_t i = 0; i < num_frames; ++i) {
+    jxl::ImageBundle bundle(&io.metadata.m);
+    if (i & 1) {
+      // Mark some frames as referenceable, others not.
+      bundle.use_for_next_frame = true;
+    }
+
+    EXPECT_TRUE(ConvertFromExternal(
+        jxl::Span<const uint8_t>(frames[i].data(), frames[i].size()), xsize,
+        ysize, jxl::ColorEncoding::SRGB(/*is_gray=*/false),
+        /*bits_per_sample=*/16, format,
+        /*pool=*/nullptr, &bundle));
+    bundle.duration = frame_durations[i];
+    io.frames.push_back(std::move(bundle));
+  }
+
+  jxl::CompressParams cparams;
+  cparams.SetLossless();  // Lossless to verify pixels exactly after roundtrip.
+  cparams.speed_tier = jxl::SpeedTier::kThunder;
+  jxl::AuxOut aux_out;
+  jxl::PaddedBytes compressed;
+  jxl::PassesEncoderState enc_state;
+  EXPECT_TRUE(jxl::EncodeFile(cparams, &io, &enc_state, &compressed,
+                              jxl::GetJxlCms(), &aux_out, nullptr));
+
+  // Decode and test the animation frames
+
+  JxlDecoder* dec = JxlDecoderCreate(NULL);
+  const uint8_t* next_in = compressed.data();
+  size_t avail_in = compressed.size();
+
+  void* runner = JxlThreadParallelRunnerCreate(
+      NULL, JxlThreadParallelRunnerDefaultNumWorkerThreads());
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSetParallelRunner(dec, JxlThreadParallelRunner, runner));
+
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSubscribeEvents(
+                dec, JXL_DEC_BASIC_INFO | JXL_DEC_FRAME | JXL_DEC_FULL_IMAGE));
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in));
+
+  EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec));
+  size_t buffer_size;
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderImageOutBufferSize(dec, &format, &buffer_size));
+  JxlBasicInfo info;
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info));
+
+  for (size_t i = 0; i < num_frames; ++i) {
+    if (i == 3) {
+      JxlDecoderSkipFrames(dec, 5);
+      i += 5;
+    }
+    std::vector<uint8_t> pixels(buffer_size);
+
+    EXPECT_EQ(JXL_DEC_FRAME, JxlDecoderProcessInput(dec));
+
+    JxlFrameHeader frame_header;
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetFrameHeader(dec, &frame_header));
+    EXPECT_EQ(frame_durations[i], frame_header.duration);
+
+    EXPECT_EQ(i + 1 == num_frames, frame_header.is_last);
+
+    EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec));
+
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetImageOutBuffer(
+                                   dec, &format, pixels.data(), pixels.size()));
+
+    EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec));
+    EXPECT_EQ(0u, jxl::test::ComparePixels(frames[i].data(), pixels.data(),
+                                           xsize, ysize, format, format));
+  }
+
+  // After all frames were decoded, JxlDecoderProcessInput should return
+  // success to indicate all is done.
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderProcessInput(dec));
+
+  // Test rewinding the decoder and skipping different frames
+
+  JxlDecoderRewind(dec);
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSubscribeEvents(dec, JXL_DEC_FRAME | JXL_DEC_FULL_IMAGE));
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in));
+
+  for (size_t i = 0; i < num_frames; ++i) {
+    int test_skipping = (i == 9) ? 3 : 0;
+    std::vector<uint8_t> pixels(buffer_size);
+
+    EXPECT_EQ(JXL_DEC_FRAME, JxlDecoderProcessInput(dec));
+
+    // Since this is after JXL_DEC_FRAME but before JXL_DEC_FULL_IMAGE, this
+    // should only skip the next frame, not the currently processed one.
+    if (test_skipping) JxlDecoderSkipFrames(dec, test_skipping);
+
+    JxlFrameHeader frame_header;
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetFrameHeader(dec, &frame_header));
+    EXPECT_EQ(frame_durations[i], frame_header.duration);
+
+    EXPECT_EQ(i + 1 == num_frames, frame_header.is_last);
+
+    EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec));
+
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetImageOutBuffer(
+                                   dec, &format, pixels.data(), pixels.size()));
+
+    EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec));
+    EXPECT_EQ(0u, jxl::test::ComparePixels(frames[i].data(), pixels.data(),
+                                           xsize, ysize, format, format));
+
+    if (test_skipping) i += test_skipping;
+  }
+
+  JxlThreadParallelRunnerDestroy(runner);
+  JxlDecoderDestroy(dec);
+}
+
+TEST(DecodeTest, SkipFrameWithBlendingTest) {
+  size_t xsize = 90, ysize = 120;
+  constexpr size_t num_frames = 16;
+  std::vector<uint8_t> frames[num_frames];
+  JxlPixelFormat format = {3, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+
+  jxl::CodecInOut io;
+  io.SetSize(xsize, ysize);
+  io.metadata.m.SetUintSamples(16);
+  io.metadata.m.color_encoding = jxl::ColorEncoding::SRGB(false);
+  io.metadata.m.have_animation = true;
+  io.frames.clear();
+  io.frames.reserve(num_frames);
+  io.SetSize(xsize, ysize);
+
+  std::vector<uint32_t> frame_durations(num_frames);
+
+  for (size_t i = 0; i < num_frames; ++i) {
+    if (i < 5) {
+      std::vector<uint8_t> frame_internal =
+          jxl::test::GetSomeTestImage(xsize, ysize, 3, i * 2 + 1);
+      // An internal frame with 0 duration, and use_for_next_frame, this is a
+      // frame that is not rendered and not output by the API, but on which the
+      // rendered frames depend
+      jxl::ImageBundle bundle_internal(&io.metadata.m);
+      EXPECT_TRUE(ConvertFromExternal(
+          jxl::Span<const uint8_t>(frame_internal.data(),
+                                   frame_internal.size()),
+          xsize, ysize, jxl::ColorEncoding::SRGB(/*is_gray=*/false),
+          /*bits_per_sample=*/16, format,
+          /*pool=*/nullptr, &bundle_internal));
+      bundle_internal.duration = 0;
+      bundle_internal.use_for_next_frame = true;
+      io.frames.push_back(std::move(bundle_internal));
+    }
+
+    std::vector<uint8_t> frame =
+        jxl::test::GetSomeTestImage(xsize, ysize, 3, i * 2);
+    // Actual rendered frame
+    frame_durations[i] = 5 + i;
+    jxl::ImageBundle bundle(&io.metadata.m);
+    EXPECT_TRUE(ConvertFromExternal(
+        jxl::Span<const uint8_t>(frame.data(), frame.size()), xsize, ysize,
+        jxl::ColorEncoding::SRGB(/*is_gray=*/false),
+        /*bits_per_sample=*/16, format,
+        /*pool=*/nullptr, &bundle));
+    bundle.duration = frame_durations[i];
+    // Create some variation in which frames depend on which.
+    if (i != 3 && i != 9 && i != 10) {
+      bundle.use_for_next_frame = true;
+    }
+    if (i != 12) {
+      bundle.blend = true;
+      // Choose a blend mode that depends on the pixels of the saved frame and
+      // doesn't use alpha
+      bundle.blendmode = jxl::BlendMode::kMul;
+    }
+    io.frames.push_back(std::move(bundle));
+  }
+
+  jxl::CompressParams cparams;
+  cparams.SetLossless();  // Lossless to verify pixels exactly after roundtrip.
+  cparams.speed_tier = jxl::SpeedTier::kThunder;
+  jxl::AuxOut aux_out;
+  jxl::PaddedBytes compressed;
+  jxl::PassesEncoderState enc_state;
+  EXPECT_TRUE(jxl::EncodeFile(cparams, &io, &enc_state, &compressed,
+                              jxl::GetJxlCms(), &aux_out, nullptr));
+
+  // Independently decode all frames without any skipping, to create the
+  // expected blended frames, for the actual tests below to compare with.
+  {
+    JxlDecoder* dec = JxlDecoderCreate(NULL);
+    const uint8_t* next_in = compressed.data();
+    size_t avail_in = compressed.size();
+
+    void* runner = JxlThreadParallelRunnerCreate(
+        NULL, JxlThreadParallelRunnerDefaultNumWorkerThreads());
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetParallelRunner(
+                                   dec, JxlThreadParallelRunner, runner));
+    EXPECT_EQ(JXL_DEC_SUCCESS,
+              JxlDecoderSubscribeEvents(dec, JXL_DEC_FULL_IMAGE));
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in));
+    for (size_t i = 0; i < num_frames; ++i) {
+      EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec));
+      frames[i].resize(xsize * ysize * 6);
+      EXPECT_EQ(JXL_DEC_SUCCESS,
+                JxlDecoderSetImageOutBuffer(dec, &format, frames[i].data(),
+                                            frames[i].size()));
+      EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec));
+    }
+
+    // After all frames were decoded, JxlDecoderProcessInput should return
+    // success to indicate all is done.
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderProcessInput(dec));
+    JxlThreadParallelRunnerDestroy(runner);
+    JxlDecoderDestroy(dec);
+  }
+
+  JxlDecoder* dec = JxlDecoderCreate(NULL);
+  const uint8_t* next_in = compressed.data();
+  size_t avail_in = compressed.size();
+
+  void* runner = JxlThreadParallelRunnerCreate(
+      NULL, JxlThreadParallelRunnerDefaultNumWorkerThreads());
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSetParallelRunner(dec, JxlThreadParallelRunner, runner));
+
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSubscribeEvents(
+                dec, JXL_DEC_BASIC_INFO | JXL_DEC_FRAME | JXL_DEC_FULL_IMAGE));
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in));
+  EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec));
+  size_t buffer_size;
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderImageOutBufferSize(dec, &format, &buffer_size));
+  JxlBasicInfo info;
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info));
+
+  for (size_t i = 0; i < num_frames; ++i) {
+    std::vector<uint8_t> pixels(buffer_size);
+
+    EXPECT_EQ(JXL_DEC_FRAME, JxlDecoderProcessInput(dec));
+
+    JxlFrameHeader frame_header;
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetFrameHeader(dec, &frame_header));
+    EXPECT_EQ(frame_durations[i], frame_header.duration);
+
+    EXPECT_EQ(i + 1 == num_frames, frame_header.is_last);
+
+    EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec));
+
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetImageOutBuffer(
+                                   dec, &format, pixels.data(), pixels.size()));
+
+    EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec));
+    EXPECT_EQ(0u, jxl::test::ComparePixels(frames[i].data(), pixels.data(),
+                                           xsize, ysize, format, format));
+
+    // Test rewinding mid-way, not decoding all frames.
+    if (i == 8) {
+      break;
+    }
+  }
+
+  JxlDecoderRewind(dec);
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSubscribeEvents(dec, JXL_DEC_FRAME | JXL_DEC_FULL_IMAGE));
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in));
+
+  for (size_t i = 0; i < num_frames; ++i) {
+    if (i == 3) {
+      JxlDecoderSkipFrames(dec, 5);
+      i += 5;
+    }
+    std::vector<uint8_t> pixels(buffer_size);
+
+    EXPECT_EQ(JXL_DEC_FRAME, JxlDecoderProcessInput(dec));
+
+    JxlFrameHeader frame_header;
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetFrameHeader(dec, &frame_header));
+    EXPECT_EQ(frame_durations[i], frame_header.duration);
+
+    EXPECT_EQ(i + 1 == num_frames, frame_header.is_last);
+
+    EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec));
+
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetImageOutBuffer(
+                                   dec, &format, pixels.data(), pixels.size()));
+
+    EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec));
+    EXPECT_EQ(0u, jxl::test::ComparePixels(frames[i].data(), pixels.data(),
+                                           xsize, ysize, format, format));
+  }
+
+  // After all frames were decoded, JxlDecoderProcessInput should return
+  // success to indicate all is done.
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderProcessInput(dec));
+
+  // Test rewinding the decoder and skipping different frames
+
+  JxlDecoderRewind(dec);
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSubscribeEvents(dec, JXL_DEC_FRAME | JXL_DEC_FULL_IMAGE));
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in));
+
+  for (size_t i = 0; i < num_frames; ++i) {
+    int test_skipping = (i == 9) ? 3 : 0;
+    std::vector<uint8_t> pixels(buffer_size);
+
+    EXPECT_EQ(JXL_DEC_FRAME, JxlDecoderProcessInput(dec));
+
+    // Since this is after JXL_DEC_FRAME but before JXL_DEC_FULL_IMAGE, this
+    // should only skip the next frame, not the currently processed one.
+    if (test_skipping) JxlDecoderSkipFrames(dec, test_skipping);
+
+    JxlFrameHeader frame_header;
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetFrameHeader(dec, &frame_header));
+    EXPECT_EQ(frame_durations[i], frame_header.duration);
+
+    EXPECT_EQ(i + 1 == num_frames, frame_header.is_last);
+
+    EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec));
+
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetImageOutBuffer(
+                                   dec, &format, pixels.data(), pixels.size()));
+
+    EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec));
+    EXPECT_EQ(0u, jxl::test::ComparePixels(frames[i].data(), pixels.data(),
+                                           xsize, ysize, format, format));
+
+    if (test_skipping) i += test_skipping;
+  }
+
+  JxlThreadParallelRunnerDestroy(runner);
+  JxlDecoderDestroy(dec);
+}
+
+TEST(DecodeTest, SkipFrameWithAlphaBlendingTest) {
+  size_t xsize = 90, ysize = 120;
+  constexpr size_t num_frames = 16;
+  std::vector<uint8_t> frames[num_frames + 5];
+  JxlPixelFormat format = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+
+  jxl::CodecInOut io;
+  io.SetSize(xsize, ysize);
+  io.metadata.m.SetUintSamples(16);
+  io.metadata.m.color_encoding = jxl::ColorEncoding::SRGB(false);
+  io.metadata.m.have_animation = true;
+  io.frames.clear();
+  io.frames.reserve(num_frames + 5);
+  io.SetSize(xsize, ysize);
+
+  std::vector<uint32_t> frame_durations_c;
+  std::vector<uint32_t> frame_durations_nc;
+  std::vector<uint32_t> frame_xsize, frame_ysize, frame_x0, frame_y0;
+
+  for (size_t i = 0; i < num_frames; ++i) {
+    size_t cropxsize = 1 + xsize * 2 / (i + 1);
+    size_t cropysize = 1 + ysize * 3 / (i + 2);
+    int cropx0 = i * 3 - 8;
+    int cropy0 = i * 4 - 7;
+    if (i < 5) {
+      std::vector<uint8_t> frame_internal =
+          jxl::test::GetSomeTestImage(xsize / 2, ysize / 2, 4, i * 2 + 1);
+      // An internal frame with 0 duration, and use_for_next_frame, this is a
+      // frame that is not rendered and not output by default by the API, but on
+      // which the rendered frames depend
+      jxl::ImageBundle bundle_internal(&io.metadata.m);
+      EXPECT_TRUE(ConvertFromExternal(
+          jxl::Span<const uint8_t>(frame_internal.data(),
+                                   frame_internal.size()),
+          xsize / 2, ysize / 2, jxl::ColorEncoding::SRGB(/*is_gray=*/false),
+          /*bits_per_sample=*/16, format,
+          /*pool=*/nullptr, &bundle_internal));
+      bundle_internal.duration = 0;
+      bundle_internal.use_for_next_frame = true;
+      bundle_internal.origin = {13, 17};
+      io.frames.push_back(std::move(bundle_internal));
+      frame_durations_nc.push_back(0);
+      frame_xsize.push_back(xsize / 2);
+      frame_ysize.push_back(ysize / 2);
+      frame_x0.push_back(13);
+      frame_y0.push_back(17);
+    }
+
+    std::vector<uint8_t> frame =
+        jxl::test::GetSomeTestImage(cropxsize, cropysize, 4, i * 2);
+    // Actual rendered frame
+    jxl::ImageBundle bundle(&io.metadata.m);
+    EXPECT_TRUE(ConvertFromExternal(
+        jxl::Span<const uint8_t>(frame.data(), frame.size()), cropxsize,
+        cropysize, jxl::ColorEncoding::SRGB(/*is_gray=*/false),
+        /*bits_per_sample=*/16, format,
+        /*pool=*/nullptr, &bundle));
+    bundle.duration = 5 + i;
+    frame_durations_nc.push_back(5 + i);
+    frame_durations_c.push_back(5 + i);
+    frame_xsize.push_back(cropxsize);
+    frame_ysize.push_back(cropysize);
+    frame_x0.push_back(cropx0);
+    frame_y0.push_back(cropy0);
+    bundle.origin = {cropx0, cropy0};
+    // Create some variation in which frames depend on which.
+    if (i != 3 && i != 9 && i != 10) {
+      bundle.use_for_next_frame = true;
+    }
+    if (i != 12) {
+      bundle.blend = true;
+      bundle.blendmode = jxl::BlendMode::kBlend;
+    }
+    io.frames.push_back(std::move(bundle));
+  }
+
+  jxl::CompressParams cparams;
+  cparams.SetLossless();  // Lossless to verify pixels exactly after roundtrip.
+  cparams.speed_tier = jxl::SpeedTier::kThunder;
+  jxl::AuxOut aux_out;
+  jxl::PaddedBytes compressed;
+  jxl::PassesEncoderState enc_state;
+  EXPECT_TRUE(jxl::EncodeFile(cparams, &io, &enc_state, &compressed,
+                              jxl::GetJxlCms(), &aux_out, nullptr));
+  // try both with and without coalescing
+  for (auto coalescing : {JXL_TRUE, JXL_FALSE}) {
+    // Independently decode all frames without any skipping, to create the
+    // expected blended frames, for the actual tests below to compare with.
+    {
+      JxlDecoder* dec = JxlDecoderCreate(NULL);
+      const uint8_t* next_in = compressed.data();
+      size_t avail_in = compressed.size();
+      EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetCoalescing(dec, coalescing));
+      void* runner = JxlThreadParallelRunnerCreate(
+          NULL, JxlThreadParallelRunnerDefaultNumWorkerThreads());
+      EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetParallelRunner(
+                                     dec, JxlThreadParallelRunner, runner));
+      EXPECT_EQ(JXL_DEC_SUCCESS,
+                JxlDecoderSubscribeEvents(dec, JXL_DEC_FULL_IMAGE));
+      EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in));
+      for (size_t i = 0; i < num_frames + (coalescing ? 0 : 5); ++i) {
+        EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec));
+        size_t buffer_size;
+        EXPECT_EQ(JXL_DEC_SUCCESS,
+                  JxlDecoderImageOutBufferSize(dec, &format, &buffer_size));
+        if (coalescing) {
+          EXPECT_EQ(xsize * ysize * 8, buffer_size);
+        } else {
+          EXPECT_EQ(frame_xsize[i] * frame_ysize[i] * 8, buffer_size);
+        }
+        frames[i].resize(buffer_size);
+        EXPECT_EQ(JXL_DEC_SUCCESS,
+                  JxlDecoderSetImageOutBuffer(dec, &format, frames[i].data(),
+                                              frames[i].size()));
+        EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec));
+      }
+
+      // After all frames were decoded, JxlDecoderProcessInput should return
+      // success to indicate all is done.
+      EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderProcessInput(dec));
+      JxlThreadParallelRunnerDestroy(runner);
+      JxlDecoderDestroy(dec);
+    }
+
+    JxlDecoder* dec = JxlDecoderCreate(NULL);
+    const uint8_t* next_in = compressed.data();
+    size_t avail_in = compressed.size();
+
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetCoalescing(dec, coalescing));
+    void* runner = JxlThreadParallelRunnerCreate(
+        NULL, JxlThreadParallelRunnerDefaultNumWorkerThreads());
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetParallelRunner(
+                                   dec, JxlThreadParallelRunner, runner));
+
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSubscribeEvents(
+                                   dec, JXL_DEC_BASIC_INFO | JXL_DEC_FRAME |
+                                            JXL_DEC_FULL_IMAGE));
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in));
+    EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec));
+    JxlBasicInfo info;
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info));
+
+    for (size_t i = 0; i < num_frames; ++i) {
+      EXPECT_EQ(JXL_DEC_FRAME, JxlDecoderProcessInput(dec));
+
+      size_t buffer_size;
+      EXPECT_EQ(JXL_DEC_SUCCESS,
+                JxlDecoderImageOutBufferSize(dec, &format, &buffer_size));
+      std::vector<uint8_t> pixels(buffer_size);
+
+      JxlFrameHeader frame_header;
+      EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetFrameHeader(dec, &frame_header));
+      EXPECT_EQ((coalescing ? frame_durations_c[i] : frame_durations_nc[i]),
+                frame_header.duration);
+
+      EXPECT_EQ(i + 1 == num_frames, frame_header.is_last);
+
+      EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec));
+
+      EXPECT_EQ(JXL_DEC_SUCCESS,
+                JxlDecoderSetImageOutBuffer(dec, &format, pixels.data(),
+                                            pixels.size()));
+
+      EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec));
+      if (coalescing) {
+        EXPECT_EQ(frame_header.layer_info.xsize, xsize);
+      } else {
+        EXPECT_EQ(frame_header.layer_info.xsize, frame_xsize[i]);
+      }
+      if (coalescing) {
+        EXPECT_EQ(frame_header.layer_info.ysize, ysize);
+      } else {
+        EXPECT_EQ(frame_header.layer_info.ysize, frame_ysize[i]);
+      }
+      EXPECT_EQ(0u, jxl::test::ComparePixels(frames[i].data(), pixels.data(),
+                                             frame_header.layer_info.xsize,
+                                             frame_header.layer_info.ysize,
+                                             format, format));
+
+      // Test rewinding mid-way, not decoding all frames.
+      if (i == 8) {
+        break;
+      }
+    }
+
+    JxlDecoderRewind(dec);
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSubscribeEvents(
+                                   dec, JXL_DEC_FRAME | JXL_DEC_FULL_IMAGE));
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in));
+
+    for (size_t i = 0; i < num_frames + (coalescing ? 0 : 5); ++i) {
+      if (i == 3) {
+        JxlDecoderSkipFrames(dec, 5);
+        i += 5;
+      }
+
+      EXPECT_EQ(JXL_DEC_FRAME, JxlDecoderProcessInput(dec));
+      size_t buffer_size;
+      EXPECT_EQ(JXL_DEC_SUCCESS,
+                JxlDecoderImageOutBufferSize(dec, &format, &buffer_size));
+      std::vector<uint8_t> pixels(buffer_size);
+
+      JxlFrameHeader frame_header;
+      EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetFrameHeader(dec, &frame_header));
+      EXPECT_EQ((coalescing ? frame_durations_c[i] : frame_durations_nc[i]),
+                frame_header.duration);
+
+      EXPECT_EQ(i + 1 == num_frames + (coalescing ? 0 : 5),
+                frame_header.is_last);
+
+      EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec));
+
+      EXPECT_EQ(JXL_DEC_SUCCESS,
+                JxlDecoderSetImageOutBuffer(dec, &format, pixels.data(),
+                                            pixels.size()));
+
+      EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec));
+      if (coalescing) {
+        EXPECT_EQ(frame_header.layer_info.xsize, xsize);
+        EXPECT_EQ(frame_header.layer_info.ysize, ysize);
+        EXPECT_EQ(frame_header.layer_info.crop_x0, 0);
+        EXPECT_EQ(frame_header.layer_info.crop_y0, 0);
+      } else {
+        EXPECT_EQ(frame_header.layer_info.xsize, frame_xsize[i]);
+        EXPECT_EQ(frame_header.layer_info.ysize, frame_ysize[i]);
+        EXPECT_EQ(frame_header.layer_info.crop_x0, frame_x0[i]);
+        EXPECT_EQ(frame_header.layer_info.crop_y0, frame_y0[i]);
+        EXPECT_EQ(frame_header.layer_info.blend_info.blendmode,
+                  i != 12 + 5 && frame_header.duration != 0
+                      ? 2
+                      : 0);  // kBlend or the default kReplace
+      }
+      EXPECT_EQ(0u, jxl::test::ComparePixels(frames[i].data(), pixels.data(),
+                                             frame_header.layer_info.xsize,
+                                             frame_header.layer_info.ysize,
+                                             format, format));
+    }
+
+    // After all frames were decoded, JxlDecoderProcessInput should return
+    // success to indicate all is done.
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderProcessInput(dec));
+
+    // Test rewinding the decoder and skipping different frames
+
+    JxlDecoderRewind(dec);
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSubscribeEvents(
+                                   dec, JXL_DEC_FRAME | JXL_DEC_FULL_IMAGE));
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in));
+
+    for (size_t i = 0; i < num_frames + (coalescing ? 0 : 5); ++i) {
+      int test_skipping = (i == 9) ? 3 : 0;
+
+      EXPECT_EQ(JXL_DEC_FRAME, JxlDecoderProcessInput(dec));
+      size_t buffer_size;
+      EXPECT_EQ(JXL_DEC_SUCCESS,
+                JxlDecoderImageOutBufferSize(dec, &format, &buffer_size));
+      std::vector<uint8_t> pixels(buffer_size);
+
+      // Since this is after JXL_DEC_FRAME but before JXL_DEC_FULL_IMAGE, this
+      // should only skip the next frame, not the currently processed one.
+      if (test_skipping) JxlDecoderSkipFrames(dec, test_skipping);
+
+      JxlFrameHeader frame_header;
+      EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetFrameHeader(dec, &frame_header));
+      EXPECT_EQ((coalescing ? frame_durations_c[i] : frame_durations_nc[i]),
+                frame_header.duration);
+
+      EXPECT_EQ(i + 1 == num_frames + (coalescing ? 0 : 5),
+                frame_header.is_last);
+
+      EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec));
+
+      EXPECT_EQ(JXL_DEC_SUCCESS,
+                JxlDecoderSetImageOutBuffer(dec, &format, pixels.data(),
+                                            pixels.size()));
+
+      EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec));
+      EXPECT_EQ(0u, jxl::test::ComparePixels(frames[i].data(), pixels.data(),
+                                             frame_header.layer_info.xsize,
+                                             frame_header.layer_info.ysize,
+                                             format, format));
+
+      if (test_skipping) i += test_skipping;
+    }
+
+    JxlThreadParallelRunnerDestroy(runner);
+    JxlDecoderDestroy(dec);
+  }
+}
+
+TEST(DecodeTest, OrientedCroppedFrameTest) {
+  const auto test = [](bool keep_orientation, uint32_t orientation,
+                       uint32_t resampling) {
+    size_t xsize = 90, ysize = 120;
+    JxlPixelFormat format = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+    size_t oxsize = (!keep_orientation && orientation > 4 ? ysize : xsize);
+    size_t oysize = (!keep_orientation && orientation > 4 ? xsize : ysize);
+    jxl::CodecInOut io;
+    io.SetSize(xsize, ysize);
+    io.metadata.m.SetUintSamples(16);
+    io.metadata.m.color_encoding = jxl::ColorEncoding::SRGB(false);
+    io.metadata.m.orientation = orientation;
+    io.frames.clear();
+    io.SetSize(xsize, ysize);
+
+    for (size_t i = 0; i < 3; ++i) {
+      size_t cropxsize = 1 + xsize * 2 / (i + 1);
+      size_t cropysize = 1 + ysize * 3 / (i + 2);
+      int cropx0 = i * 3 - 8;
+      int cropy0 = i * 4 - 7;
+
+      std::vector<uint8_t> frame =
+          jxl::test::GetSomeTestImage(cropxsize, cropysize, 4, i * 2);
+      jxl::ImageBundle bundle(&io.metadata.m);
+      EXPECT_TRUE(ConvertFromExternal(
+          jxl::Span<const uint8_t>(frame.data(), frame.size()), cropxsize,
+          cropysize, jxl::ColorEncoding::SRGB(/*is_gray=*/false),
+          /*bits_per_sample=*/16, format,
+          /*pool=*/nullptr, &bundle));
+      bundle.origin = {cropx0, cropy0};
+      bundle.use_for_next_frame = true;
+      io.frames.push_back(std::move(bundle));
+    }
+
+    jxl::CompressParams cparams;
+    cparams
+        .SetLossless();  // Lossless to verify pixels exactly after roundtrip.
+    cparams.speed_tier = jxl::SpeedTier::kThunder;
+    cparams.resampling = resampling;
+    jxl::AuxOut aux_out;
+    jxl::PaddedBytes compressed;
+    jxl::PassesEncoderState enc_state;
+    EXPECT_TRUE(jxl::EncodeFile(cparams, &io, &enc_state, &compressed,
+                                jxl::GetJxlCms(), &aux_out, nullptr));
+
+    // 0 is merged frame as decoded with coalescing enabled (default)
+    // 1-3 are non-coalesced frames as decoded with coalescing disabled
+    // 4 is the manually merged frame
+    std::vector<uint8_t> frames[5];
+    frames[4].resize(xsize * ysize * 8, 0);
+
+    // try both with and without coalescing
+    for (auto coalescing : {JXL_TRUE, JXL_FALSE}) {
+      // Independently decode all frames without any skipping, to create the
+      // expected blended frames, for the actual tests below to compare with.
+      {
+        JxlDecoder* dec = JxlDecoderCreate(NULL);
+        const uint8_t* next_in = compressed.data();
+        size_t avail_in = compressed.size();
+        EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetCoalescing(dec, coalescing));
+        EXPECT_EQ(JXL_DEC_SUCCESS,
+                  JxlDecoderSetKeepOrientation(dec, keep_orientation));
+        void* runner = JxlThreadParallelRunnerCreate(
+            NULL, JxlThreadParallelRunnerDefaultNumWorkerThreads());
+        EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetParallelRunner(
+                                       dec, JxlThreadParallelRunner, runner));
+        EXPECT_EQ(JXL_DEC_SUCCESS,
+                  JxlDecoderSubscribeEvents(dec, JXL_DEC_FULL_IMAGE));
+        EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in));
+        for (size_t i = (coalescing ? 0 : 1); i < (coalescing ? 1 : 4); ++i) {
+          EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec));
+          JxlFrameHeader frame_header;
+          EXPECT_EQ(JXL_DEC_SUCCESS,
+                    JxlDecoderGetFrameHeader(dec, &frame_header));
+          size_t buffer_size;
+          EXPECT_EQ(JXL_DEC_SUCCESS,
+                    JxlDecoderImageOutBufferSize(dec, &format, &buffer_size));
+          if (coalescing) {
+            EXPECT_EQ(xsize * ysize * 8, buffer_size);
+          } else {
+            EXPECT_EQ(frame_header.layer_info.xsize *
+                          frame_header.layer_info.ysize * 8,
+                      buffer_size);
+          }
+          frames[i].resize(buffer_size);
+          EXPECT_EQ(JXL_DEC_SUCCESS,
+                    JxlDecoderSetImageOutBuffer(dec, &format, frames[i].data(),
+                                                frames[i].size()));
+          EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec));
+          EXPECT_EQ(frame_header.layer_info.blend_info.blendmode,
+                    JXL_BLEND_REPLACE);
+          if (coalescing) {
+            EXPECT_EQ(frame_header.layer_info.xsize, oxsize);
+            EXPECT_EQ(frame_header.layer_info.ysize, oysize);
+            EXPECT_EQ(frame_header.layer_info.crop_x0, 0);
+            EXPECT_EQ(frame_header.layer_info.crop_y0, 0);
+          } else {
+            // manually merge this layer
+            int x0 = frame_header.layer_info.crop_x0;
+            int y0 = frame_header.layer_info.crop_y0;
+            int w = frame_header.layer_info.xsize;
+            int h = frame_header.layer_info.ysize;
+            for (int y = 0; y < static_cast<int>(oysize); y++) {
+              if (y < y0 || y >= y0 + h) continue;
+              // pointers do whole 16-bit RGBA pixels at a time
+              uint64_t* row_merged = static_cast<uint64_t*>(
+                  (void*)(frames[4].data() + y * oxsize * 8));
+              uint64_t* row_layer = static_cast<uint64_t*>(
+                  (void*)(frames[i].data() + (y - y0) * w * 8));
+              for (int x = 0; x < static_cast<int>(oxsize); x++) {
+                if (x < x0 || x >= x0 + w) continue;
+                row_merged[x] = row_layer[x - x0];
+              }
+            }
+          }
+        }
+
+        // After all frames were decoded, JxlDecoderProcessInput should return
+        // success to indicate all is done.
+        EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderProcessInput(dec));
+        JxlThreadParallelRunnerDestroy(runner);
+        JxlDecoderDestroy(dec);
+      }
+    }
+
+    EXPECT_EQ(0u, jxl::test::ComparePixels(frames[0].data(), frames[4].data(),
+                                           oxsize, oysize, format, format));
+  };
+
+  for (bool keep_orientation : {true, false}) {
+    for (uint32_t orientation = 1; orientation <= 8; orientation++) {
+      for (uint32_t resampling : {1, 2, 4, 8}) {
+        SCOPED_TRACE(testing::Message()
+                     << "keep_orientation: " << keep_orientation << ", "
+                     << "orientation: " << orientation << ", "
+                     << "resampling: " << resampling);
+        test(keep_orientation, orientation, resampling);
+      }
+    }
+  }
+}
+
+struct FramePositions {
+  size_t frame_start;
+  size_t header_end;
+  size_t toc_end;
+  std::vector<size_t> section_end;
+};
+
+struct StreamPositions {
+  size_t codestream_start;
+  size_t codestream_end;
+  size_t basic_info;
+  size_t jbrd_end = 0;
+  std::vector<size_t> box_start;
+  std::vector<FramePositions> frames;
+};
+
+void AnalyzeCodestream(const jxl::PaddedBytes& data,
+                       StreamPositions* streampos) {
+  // Unbox data to codestream and mark where it is broken up by boxes.
+  std::vector<uint8_t> codestream;
+  std::vector<std::pair<size_t, size_t>> breakpoints;
+  bool codestream_end = false;
+  ASSERT_LE(2, data.size());
+  if (data[0] == 0xff && data[1] == 0x0a) {
+    codestream = std::vector<uint8_t>(data.begin(), data.end());
+    streampos->codestream_start = 0;
+  } else {
+    const uint8_t* in = data.data();
+    size_t pos = 0;
+    while (pos < data.size()) {
+      ASSERT_LE(pos + 8, data.size());
+      streampos->box_start.push_back(pos);
+      size_t box_size = LoadBE32(in + pos);
+      if (box_size == 0) box_size = data.size() - pos;
+      ASSERT_LE(pos + box_size, data.size());
+      if (memcmp(in + pos + 4, "jxlc", 4) == 0) {
+        EXPECT_TRUE(codestream.empty());
+        streampos->codestream_start = pos + 8;
+        codestream.insert(codestream.end(), in + pos + 8, in + pos + box_size);
+        codestream_end = true;
+      } else if (memcmp(in + pos + 4, "jxlp", 4) == 0) {
+        codestream_end = (LoadBE32(in + pos + 8) & 0x80000000);
+        if (codestream.empty()) {
+          streampos->codestream_start = pos + 12;
+        } else if (box_size > 12 || !codestream_end) {
+          breakpoints.push_back({codestream.size(), 12});
+        }
+        codestream.insert(codestream.end(), in + pos + 12, in + pos + box_size);
+      } else if (memcmp(in + pos + 4, "jbrd", 4) == 0) {
+        EXPECT_TRUE(codestream.empty());
+        streampos->jbrd_end = pos + box_size;
+      } else if (!codestream.empty() && !codestream_end) {
+        breakpoints.push_back({codestream.size(), box_size});
+      }
+      pos += box_size;
+    }
+    ASSERT_EQ(pos, data.size());
+  }
+  // Translate codestream positions to boxed stream positions.
+  size_t offset = streampos->codestream_start;
+  size_t bp = 0;
+  auto add_offset = [&](size_t pos) {
+    while (bp < breakpoints.size() && pos >= breakpoints[bp].first) {
+      offset += breakpoints[bp++].second;
+    }
+    return pos + offset;
+  };
+  // Analyze the unboxed codestream.
+  jxl::BitReader br(
+      jxl::Span<const uint8_t>(codestream.data(), codestream.size()));
+  ASSERT_EQ(br.ReadFixedBits<16>(), 0x0AFF);
+  jxl::CodecMetadata metadata;
+  EXPECT_TRUE(ReadSizeHeader(&br, &metadata.size));
+  EXPECT_TRUE(ReadImageMetadata(&br, &metadata.m));
+  streampos->basic_info =
+      add_offset(br.TotalBitsConsumed() / jxl::kBitsPerByte);
+  metadata.transform_data.nonserialized_xyb_encoded = metadata.m.xyb_encoded;
+  EXPECT_TRUE(jxl::Bundle::Read(&br, &metadata.transform_data));
+  EXPECT_TRUE(br.JumpToByteBoundary());
+  bool has_preview = metadata.m.have_preview;
+  while (br.TotalBitsConsumed() < br.TotalBytes() * jxl::kBitsPerByte) {
+    FramePositions p;
+    p.frame_start = add_offset(br.TotalBitsConsumed() / jxl::kBitsPerByte);
+    jxl::FrameHeader frame_header(&metadata);
+    if (has_preview) {
+      frame_header.nonserialized_is_preview = true;
+      has_preview = false;
+    }
+    EXPECT_TRUE(ReadFrameHeader(&br, &frame_header));
+    p.header_end =
+        add_offset(jxl::DivCeil(br.TotalBitsConsumed(), jxl::kBitsPerByte));
+    jxl::FrameDimensions frame_dim = frame_header.ToFrameDimensions();
+    uint64_t groups_total_size;
+    const size_t toc_entries = jxl::NumTocEntries(
+        frame_dim.num_groups, frame_dim.num_dc_groups,
+        frame_header.passes.num_passes, /*has_ac_global=*/true);
+    std::vector<uint64_t> section_offsets;
+    std::vector<uint32_t> section_sizes;
+    EXPECT_TRUE(ReadGroupOffsets(toc_entries, &br, &section_offsets,
+                                 &section_sizes, &groups_total_size));
+    EXPECT_EQ(br.TotalBitsConsumed() % jxl::kBitsPerByte, 0);
+    size_t sections_start = br.TotalBitsConsumed() / jxl::kBitsPerByte;
+    p.toc_end = add_offset(sections_start);
+    for (size_t i = 0; i < toc_entries; ++i) {
+      size_t end = sections_start + section_offsets[i] + section_sizes[i];
+      p.section_end.push_back(add_offset(end));
+    }
+    br.SkipBits(groups_total_size * jxl::kBitsPerByte);
+    streampos->frames.push_back(p);
+  }
+  streampos->codestream_end = add_offset(codestream.size());
+  EXPECT_EQ(br.TotalBitsConsumed(), br.TotalBytes() * jxl::kBitsPerByte);
+  EXPECT_TRUE(br.Close());
+}
+
+enum ExpectedFlushState { NO_FLUSH, SAME_FLUSH, NEW_FLUSH };
+struct Breakpoint {
+  size_t file_pos;
+  ExpectedFlushState expect_flush;
+};
+
+void VerifyProgression(size_t xsize, size_t ysize, uint32_t num_channels,
+                       const std::vector<uint8_t>& pixels,
+                       const jxl::PaddedBytes& data,
+                       std::vector<Breakpoint> breakpoints) {
+  // Size large enough for multiple groups, required to have progressive stages.
+  ASSERT_LT(256, xsize);
+  ASSERT_LT(256, ysize);
+  std::vector<uint8_t> pixels2;
+  pixels2.resize(pixels.size());
+  JxlPixelFormat format = {num_channels, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+  JxlDecoder* dec = JxlDecoderCreate(nullptr);
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSubscribeEvents(
+                dec, JXL_DEC_BASIC_INFO | JXL_DEC_FRAME | JXL_DEC_FULL_IMAGE));
+  int bp = 0;
+  const uint8_t* next_in = data.data();
+  size_t avail_in = breakpoints[bp].file_pos;
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in));
+  double prev_dist = 1.0;
+  for (;;) {
+    JxlDecoderStatus status = JxlDecoderProcessInput(dec);
+    printf("bp: %d  status: 0x%x\n", bp, (int)status);
+    if (status == JXL_DEC_BASIC_INFO) {
+      JxlBasicInfo info;
+      EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info));
+      EXPECT_EQ(info.xsize, xsize);
+      EXPECT_EQ(info.ysize, ysize);
+      // Output buffer/callback not yet set
+      EXPECT_EQ(JXL_DEC_ERROR, JxlDecoderFlushImage(dec));
+      size_t buffer_size;
+      EXPECT_EQ(JXL_DEC_SUCCESS,
+                JxlDecoderImageOutBufferSize(dec, &format, &buffer_size));
+      EXPECT_EQ(pixels2.size(), buffer_size);
+      EXPECT_EQ(JXL_DEC_SUCCESS,
+                JxlDecoderSetImageOutBuffer(dec, &format, pixels2.data(),
+                                            pixels2.size()));
+    } else if (status == JXL_DEC_FRAME) {
+      // Nothing to do.
+    } else if (status == JXL_DEC_SUCCESS) {
+      EXPECT_EQ(bp + 1, breakpoints.size());
+      break;
+    } else if (status == JXL_DEC_NEED_MORE_INPUT ||
+               status == JXL_DEC_FULL_IMAGE) {
+      if (breakpoints[bp].expect_flush == NO_FLUSH) {
+        EXPECT_EQ(JXL_DEC_ERROR, JxlDecoderFlushImage(dec));
+      } else {
+        if (status != JXL_DEC_FULL_IMAGE) {
+          EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderFlushImage(dec));
+        }
+        double dist = jxl::test::DistanceRMS(pixels2.data(), pixels.data(),
+                                             xsize, ysize, format);
+        if (breakpoints[bp].expect_flush == NEW_FLUSH) {
+          EXPECT_LT(dist, prev_dist);
+          prev_dist = dist;
+        } else {
+          EXPECT_EQ(dist, prev_dist);
+        }
+      }
+      if (status == JXL_DEC_FULL_IMAGE) {
+        EXPECT_EQ(bp + 1, breakpoints.size());
+        continue;
+      }
+      ASSERT_LT(++bp, breakpoints.size());
+      next_in += avail_in - JxlDecoderReleaseInput(dec);
+      avail_in = breakpoints[bp].file_pos - (next_in - data.data());
+      EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in));
+    } else {
+      printf("Unexpected status: 0x%x\n", (int)status);
+      FAIL();  // unexpected returned status
+    }
+  }
+  JxlDecoderDestroy(dec);
+}
+
+TEST(DecodeTest, ProgressionTest) {
+  size_t xsize = 508, ysize = 470;
+  uint32_t num_channels = 3;
+  std::vector<uint8_t> pixels =
+      jxl::test::GetSomeTestImage(xsize, ysize, num_channels, 0);
+  jxl::TestCodestreamParams params;
+  params.cparams.progressive_dc = 1;
+  params.preview_mode = jxl::kSmallPreview;
+  jxl::PaddedBytes data = jxl::CreateTestJXLCodestream(
+      jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize,
+      num_channels, params);
+  StreamPositions streampos;
+  AnalyzeCodestream(data, &streampos);
+  const std::vector<FramePositions>& fp = streampos.frames;
+  // We have preview, dc frame and regular frame.
+  EXPECT_EQ(3, fp.size());
+  EXPECT_EQ(7, fp[2].section_end.size());
+  EXPECT_EQ(data.size(), fp[2].section_end[6]);
+  std::vector<Breakpoint> breakpoints{
+      {fp[0].frame_start, NO_FLUSH},           // headers
+      {fp[1].frame_start, NO_FLUSH},           // preview
+      {fp[2].frame_start, NO_FLUSH},           // dc frame
+      {fp[2].section_end[0], NO_FLUSH},        // DC global
+      {fp[2].section_end[1] - 1, NO_FLUSH},    // partial DC group
+      {fp[2].section_end[1], NEW_FLUSH},       // DC group
+      {fp[2].section_end[2], SAME_FLUSH},      // AC global
+      {fp[2].section_end[3], NEW_FLUSH},       // AC group 0
+      {fp[2].section_end[4] - 1, SAME_FLUSH},  // partial AC group 1
+      {fp[2].section_end[4], NEW_FLUSH},       // AC group 1
+      {fp[2].section_end[5], NEW_FLUSH},       // AC group 2
+      {data.size() - 1, SAME_FLUSH},           // partial AC group 3
+      {data.size(), NEW_FLUSH}};               // full image
+  VerifyProgression(xsize, ysize, num_channels, pixels, data, breakpoints);
+}
+
+TEST(DecodeTest, ProgressionTestLosslessAlpha) {
+  size_t xsize = 508, ysize = 470;
+  uint32_t num_channels = 4;
+  std::vector<uint8_t> pixels =
+      jxl::test::GetSomeTestImage(xsize, ysize, num_channels, 0);
+  jxl::TestCodestreamParams params;
+  params.cparams.SetLossless();
+  params.cparams.speed_tier = jxl::SpeedTier::kThunder;
+  params.cparams.responsive = 1;
+  jxl::PaddedBytes data = jxl::CreateTestJXLCodestream(
+      jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize,
+      num_channels, params);
+  StreamPositions streampos;
+  AnalyzeCodestream(data, &streampos);
+  const std::vector<FramePositions>& fp = streampos.frames;
+  // We have preview, dc frame and regular frame.
+  EXPECT_EQ(1, fp.size());
+  EXPECT_EQ(7, fp[0].section_end.size());
+  EXPECT_EQ(data.size(), fp[0].section_end[6]);
+  std::vector<Breakpoint> breakpoints{
+      {fp[0].frame_start, NO_FLUSH},           // headers
+      {fp[0].section_end[0] - 1, NO_FLUSH},    // partial DC global
+      {fp[0].section_end[0], NEW_FLUSH},       // DC global
+      {fp[0].section_end[1], SAME_FLUSH},      // DC group
+      {fp[0].section_end[2], SAME_FLUSH},      // AC global
+      {fp[0].section_end[3], NEW_FLUSH},       // AC group 0
+      {fp[0].section_end[4] - 1, SAME_FLUSH},  // partial AC group 1
+      {fp[0].section_end[4], NEW_FLUSH},       // AC group 1
+      {fp[0].section_end[5], NEW_FLUSH},       // AC group 2
+      {data.size() - 1, SAME_FLUSH},           // partial AC group 3
+      {data.size(), NEW_FLUSH}};               // full image
+  VerifyProgression(xsize, ysize, num_channels, pixels, data, breakpoints);
+}
+
+void VerifyFilePosition(size_t expected_pos, const jxl::PaddedBytes& data,
+                        JxlDecoder* dec) {
+  size_t remaining = JxlDecoderReleaseInput(dec);
+  size_t pos = data.size() - remaining;
+  EXPECT_EQ(expected_pos, pos);
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSetInput(dec, data.data() + pos, remaining));
+}
+
+TEST(DecodeTest, InputHandlingTestOneShot) {
+  size_t xsize = 508, ysize = 470;
+  uint32_t num_channels = 3;
+  std::vector<uint8_t> pixels =
+      jxl::test::GetSomeTestImage(xsize, ysize, num_channels, 0);
+  for (int i = 0; i < kCSBF_NUM_ENTRIES; ++i) {
+    printf("Testing with box format %d\n", i);
+    jxl::TestCodestreamParams params;
+    params.cparams.progressive_dc = 1;
+    params.preview_mode = jxl::kSmallPreview;
+    params.box_format = (CodeStreamBoxFormat)i;
+    jxl::PaddedBytes data = jxl::CreateTestJXLCodestream(
+        jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize,
+        num_channels, params);
+    JxlPixelFormat format = {num_channels, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+    StreamPositions streampos;
+    AnalyzeCodestream(data, &streampos);
+    const std::vector<FramePositions>& fp = streampos.frames;
+    // We have preview, dc frame and regular frame.
+    EXPECT_EQ(3, fp.size());
+
+    std::vector<uint8_t> pixels2;
+    pixels2.resize(pixels.size());
+
+    int kNumEvents = 6;
+    int events[] = {
+        JXL_DEC_BASIC_INFO, JXL_DEC_COLOR_ENCODING, JXL_DEC_PREVIEW_IMAGE,
+        JXL_DEC_FRAME,      JXL_DEC_FULL_IMAGE,     JXL_DEC_FRAME_PROGRESSION,
+    };
+    size_t end_positions[] = {
+        streampos.basic_info,     fp[0].frame_start,
+        fp[1].frame_start,        fp[2].toc_end,
+        streampos.codestream_end, streampos.codestream_end};
+    int events_wanted = 0;
+    for (int j = 0; j < kNumEvents; ++j) {
+      events_wanted |= events[j];
+      size_t end_pos = end_positions[j];
+      JxlDecoder* dec = JxlDecoderCreate(nullptr);
+      EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSubscribeEvents(dec, events_wanted));
+      EXPECT_EQ(JXL_DEC_SUCCESS,
+                JxlDecoderSetInput(dec, data.data(), data.size()));
+      EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec));
+      VerifyFilePosition(streampos.basic_info, data, dec);
+      if (j >= 1) {
+        EXPECT_EQ(JXL_DEC_COLOR_ENCODING, JxlDecoderProcessInput(dec));
+        VerifyFilePosition(fp[0].frame_start, data, dec);
+      }
+      if (j >= 2) {
+        EXPECT_EQ(JXL_DEC_NEED_PREVIEW_OUT_BUFFER, JxlDecoderProcessInput(dec));
+        VerifyFilePosition(fp[0].toc_end, data, dec);
+        size_t buffer_size;
+        EXPECT_EQ(JXL_DEC_SUCCESS,
+                  JxlDecoderPreviewOutBufferSize(dec, &format, &buffer_size));
+        EXPECT_GE(pixels2.size(), buffer_size);
+        EXPECT_EQ(JXL_DEC_SUCCESS,
+                  JxlDecoderSetPreviewOutBuffer(dec, &format, pixels2.data(),
+                                                buffer_size));
+        EXPECT_EQ(JXL_DEC_PREVIEW_IMAGE, JxlDecoderProcessInput(dec));
+        VerifyFilePosition(fp[1].frame_start, data, dec);
+      }
+      if (j >= 3) {
+        EXPECT_EQ(JXL_DEC_FRAME, JxlDecoderProcessInput(dec));
+        VerifyFilePosition(fp[2].toc_end, data, dec);
+        if (j >= 5) {
+          EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetProgressiveDetail(dec, kDC));
+        }
+      }
+      if (j >= 4) {
+        EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec));
+        VerifyFilePosition(fp[2].toc_end, data, dec);
+        size_t buffer_size;
+        EXPECT_EQ(JXL_DEC_SUCCESS,
+                  JxlDecoderImageOutBufferSize(dec, &format, &buffer_size));
+        EXPECT_EQ(pixels2.size(), buffer_size);
+        EXPECT_EQ(JXL_DEC_SUCCESS,
+                  JxlDecoderSetImageOutBuffer(dec, &format, pixels2.data(),
+                                              pixels2.size()));
+        if (j >= 5) {
+          EXPECT_EQ(JXL_DEC_FRAME_PROGRESSION, JxlDecoderProcessInput(dec));
+          VerifyFilePosition(fp[2].section_end[1], data, dec);
+        }
+        EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec));
+        VerifyFilePosition(streampos.codestream_end, data, dec);
+      }
+      EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderProcessInput(dec));
+      VerifyFilePosition(end_pos, data, dec);
+      JxlDecoderDestroy(dec);
+    }
+  }
+}
+
+#if JPEGXL_ENABLE_JPEG
+TEST(DecodeTest, JXL_TRANSCODE_JPEG_TEST(InputHandlingTestJPEGOneshot)) {
+  size_t xsize = 123;
+  size_t ysize = 77;
+  size_t channels = 3;
+  std::vector<uint8_t> pixels =
+      jxl::test::GetSomeTestImage(xsize, ysize, channels, /*seed=*/0);
+  for (int i = 1; i < kCSBF_NUM_ENTRIES; ++i) {
+    printf("Testing with box format %d\n", i);
+    jxl::PaddedBytes jpeg_codestream;
+    jxl::TestCodestreamParams params;
+    params.cparams.color_transform = jxl::ColorTransform::kNone;
+    params.jpeg_codestream = &jpeg_codestream;
+    params.preview_mode = jxl::kSmallPreview;
+    params.box_format = (CodeStreamBoxFormat)i;
+    jxl::PaddedBytes data = jxl::CreateTestJXLCodestream(
+        jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize,
+        channels, params);
+    JxlPixelFormat format = {3, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+    StreamPositions streampos;
+    AnalyzeCodestream(data, &streampos);
+    const std::vector<FramePositions>& fp = streampos.frames;
+    // We have preview and regular frame.
+    EXPECT_EQ(2, fp.size());
+    EXPECT_LT(0, streampos.jbrd_end);
+
+    std::vector<uint8_t> pixels2;
+    pixels2.resize(pixels.size());
+
+    int kNumEvents = 6;
+    int events[] = {JXL_DEC_BASIC_INFO,     JXL_DEC_JPEG_RECONSTRUCTION,
+                    JXL_DEC_COLOR_ENCODING, JXL_DEC_PREVIEW_IMAGE,
+                    JXL_DEC_FRAME,          JXL_DEC_FULL_IMAGE};
+    size_t end_positions[] = {streampos.basic_info, streampos.basic_info,
+                              fp[0].frame_start,    fp[1].frame_start,
+                              fp[1].toc_end,        streampos.codestream_end};
+    int events_wanted = 0;
+    for (int j = 0; j < kNumEvents; ++j) {
+      printf("j = %d\n", j);
+      events_wanted |= events[j];
+      size_t end_pos = end_positions[j];
+      JxlDecoder* dec = JxlDecoderCreate(nullptr);
+      EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSubscribeEvents(dec, events_wanted));
+      EXPECT_EQ(JXL_DEC_SUCCESS,
+                JxlDecoderSetInput(dec, data.data(), data.size()));
+      if (j >= 1) {
+        EXPECT_EQ(JXL_DEC_JPEG_RECONSTRUCTION, JxlDecoderProcessInput(dec));
+        VerifyFilePosition(streampos.jbrd_end, data, dec);
+      }
+      EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec));
+      VerifyFilePosition(streampos.basic_info, data, dec);
+      if (j >= 2) {
+        EXPECT_EQ(JXL_DEC_COLOR_ENCODING, JxlDecoderProcessInput(dec));
+        VerifyFilePosition(fp[0].frame_start, data, dec);
+      }
+      if (j >= 3) {
+        EXPECT_EQ(JXL_DEC_NEED_PREVIEW_OUT_BUFFER, JxlDecoderProcessInput(dec));
+        VerifyFilePosition(fp[0].toc_end, data, dec);
+        size_t buffer_size;
+        EXPECT_EQ(JXL_DEC_SUCCESS,
+                  JxlDecoderPreviewOutBufferSize(dec, &format, &buffer_size));
+        EXPECT_GE(pixels2.size(), buffer_size);
+        EXPECT_EQ(JXL_DEC_SUCCESS,
+                  JxlDecoderSetPreviewOutBuffer(dec, &format, pixels2.data(),
+                                                buffer_size));
+        EXPECT_EQ(JXL_DEC_PREVIEW_IMAGE, JxlDecoderProcessInput(dec));
+        VerifyFilePosition(fp[1].frame_start, data, dec);
+      }
+      if (j >= 4) {
+        EXPECT_EQ(JXL_DEC_FRAME, JxlDecoderProcessInput(dec));
+        VerifyFilePosition(fp[1].toc_end, data, dec);
+      }
+      if (j >= 5) {
+        EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec));
+        VerifyFilePosition(fp[1].toc_end, data, dec);
+        size_t buffer_size;
+        EXPECT_EQ(JXL_DEC_SUCCESS,
+                  JxlDecoderImageOutBufferSize(dec, &format, &buffer_size));
+        EXPECT_EQ(pixels2.size(), buffer_size);
+        EXPECT_EQ(JXL_DEC_SUCCESS,
+                  JxlDecoderSetImageOutBuffer(dec, &format, pixels2.data(),
+                                              pixels2.size()));
+        EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec));
+        VerifyFilePosition(streampos.codestream_end, data, dec);
+      }
+      EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderProcessInput(dec));
+      VerifyFilePosition(end_pos, data, dec);
+      JxlDecoderDestroy(dec);
+    }
+  }
+}
+#endif  // JPEGXL_ENABLE_JPEG
+
+TEST(DecodeTest, InputHandlingTestStreaming) {
+  size_t xsize = 508, ysize = 470;
+  uint32_t num_channels = 3;
+  std::vector<uint8_t> pixels =
+      jxl::test::GetSomeTestImage(xsize, ysize, num_channels, 0);
+  for (int i = 0; i < kCSBF_NUM_ENTRIES; ++i) {
+    printf("Testing with box format %d\n", i);
+    fflush(stdout);
+    jxl::TestCodestreamParams params;
+    params.cparams.progressive_dc = 1;
+    params.box_format = (CodeStreamBoxFormat)i;
+    params.preview_mode = jxl::kSmallPreview;
+    jxl::PaddedBytes data = jxl::CreateTestJXLCodestream(
+        jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize,
+        num_channels, params);
+    JxlPixelFormat format = {num_channels, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+    StreamPositions streampos;
+    AnalyzeCodestream(data, &streampos);
+    const std::vector<FramePositions>& fp = streampos.frames;
+    // We have preview, dc frame and regular frame.
+    EXPECT_EQ(3, fp.size());
+    std::vector<uint8_t> pixels2;
+    pixels2.resize(pixels.size());
+    int events_wanted =
+        (JXL_DEC_BASIC_INFO | JXL_DEC_COLOR_ENCODING | JXL_DEC_PREVIEW_IMAGE |
+         JXL_DEC_FRAME | JXL_DEC_FULL_IMAGE | JXL_DEC_FRAME_PROGRESSION |
+         JXL_DEC_BOX);
+    for (size_t increment : {1, 7, 27, 1024}) {
+      JxlDecoder* dec = JxlDecoderCreate(nullptr);
+      EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSubscribeEvents(dec, events_wanted));
+      size_t file_pos = 0;
+      size_t box_index = 0;
+      size_t avail_in = 0;
+      for (;;) {
+        const uint8_t* next_in = data.data() + file_pos;
+        EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in));
+        JxlDecoderStatus status = JxlDecoderProcessInput(dec);
+        size_t remaining = JxlDecoderReleaseInput(dec);
+        size_t consumed = avail_in - remaining;
+        file_pos += consumed;
+        avail_in += increment;
+        avail_in = std::min<size_t>(avail_in, data.size() - file_pos);
+        if (status == JXL_DEC_BASIC_INFO) {
+          EXPECT_EQ(file_pos, streampos.basic_info);
+        } else if (status == JXL_DEC_COLOR_ENCODING) {
+          EXPECT_EQ(file_pos, streampos.frames[0].frame_start);
+        } else if (status == JXL_DEC_NEED_PREVIEW_OUT_BUFFER) {
+          EXPECT_EQ(file_pos, streampos.frames[0].toc_end);
+          size_t buffer_size;
+          EXPECT_EQ(JXL_DEC_SUCCESS,
+                    JxlDecoderPreviewOutBufferSize(dec, &format, &buffer_size));
+          EXPECT_GE(pixels2.size(), buffer_size);
+          EXPECT_EQ(JXL_DEC_SUCCESS,
+                    JxlDecoderSetPreviewOutBuffer(dec, &format, pixels2.data(),
+                                                  buffer_size));
+        } else if (status == JXL_DEC_PREVIEW_IMAGE) {
+          EXPECT_EQ(file_pos, streampos.frames[1].frame_start);
+        } else if (status == JXL_DEC_FRAME) {
+          EXPECT_EQ(file_pos, streampos.frames[2].toc_end);
+          EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetProgressiveDetail(dec, kDC));
+        } else if (status == JXL_DEC_NEED_IMAGE_OUT_BUFFER) {
+          EXPECT_EQ(file_pos, streampos.frames[2].toc_end);
+          size_t buffer_size;
+          EXPECT_EQ(JXL_DEC_SUCCESS,
+                    JxlDecoderImageOutBufferSize(dec, &format, &buffer_size));
+          EXPECT_EQ(pixels2.size(), buffer_size);
+          EXPECT_EQ(JXL_DEC_SUCCESS,
+                    JxlDecoderSetImageOutBuffer(dec, &format, pixels2.data(),
+                                                pixels2.size()));
+        } else if (status == JXL_DEC_FRAME_PROGRESSION) {
+          EXPECT_EQ(file_pos, streampos.frames[2].section_end[1]);
+        } else if (status == JXL_DEC_FULL_IMAGE) {
+          EXPECT_EQ(file_pos, streampos.codestream_end);
+        } else if (status == JXL_DEC_SUCCESS) {
+          EXPECT_EQ(file_pos, streampos.codestream_end);
+          break;
+        } else if (status == JXL_DEC_NEED_MORE_INPUT) {
+          EXPECT_LT(remaining, 12);
+          if ((i == kCSBF_None && file_pos >= 2) ||
+              (box_index > 0 && box_index < streampos.box_start.size() &&
+               file_pos >= streampos.box_start[box_index - 1] + 12 &&
+               file_pos < streampos.box_start[box_index])) {
+            EXPECT_EQ(remaining, 0);
+          }
+          if (file_pos == data.size()) break;
+        } else if (status == JXL_DEC_BOX) {
+          ASSERT_LT(box_index, streampos.box_start.size());
+          EXPECT_EQ(file_pos, streampos.box_start[box_index++]);
+        } else {
+          printf("Unexpected status: 0x%x\n", (int)status);
+          FAIL();
+        }
+      }
+      JxlDecoderDestroy(dec);
+    }
+  }
+}
+
+TEST(DecodeTest, FlushTest) {
+  // Size large enough for multiple groups, required to have progressive
+  // stages
+  size_t xsize = 333, ysize = 300;
+  uint32_t num_channels = 3;
+  std::vector<uint8_t> pixels =
+      jxl::test::GetSomeTestImage(xsize, ysize, num_channels, 0);
+  jxl::TestCodestreamParams params;
+  params.preview_mode = jxl::kSmallPreview;
+  jxl::PaddedBytes data = jxl::CreateTestJXLCodestream(
+      jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize,
+      num_channels, params);
+  JxlPixelFormat format = {num_channels, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+
+  std::vector<uint8_t> pixels2;
+  pixels2.resize(pixels.size());
+
+  JxlDecoder* dec = JxlDecoderCreate(nullptr);
+
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSubscribeEvents(
+                dec, JXL_DEC_BASIC_INFO | JXL_DEC_FRAME | JXL_DEC_FULL_IMAGE));
+
+  // Ensure that the first part contains at least the full DC of the image,
+  // otherwise flush does not work.
+  size_t first_part = data.size() - 1;
+
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, data.data(), first_part));
+
+  EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec));
+  JxlBasicInfo info;
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info));
+  EXPECT_EQ(info.xsize, xsize);
+  EXPECT_EQ(info.ysize, ysize);
+
+  EXPECT_EQ(JXL_DEC_FRAME, JxlDecoderProcessInput(dec));
+
+  // Output buffer not yet set
+  EXPECT_EQ(JXL_DEC_ERROR, JxlDecoderFlushImage(dec));
+
+  size_t buffer_size;
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderImageOutBufferSize(dec, &format, &buffer_size));
+  EXPECT_EQ(pixels2.size(), buffer_size);
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetImageOutBuffer(
+                                 dec, &format, pixels2.data(), pixels2.size()));
+
+  // Must process input further until we get JXL_DEC_NEED_MORE_INPUT, even if
+  // data was already input before, since the processing of the frame only
+  // happens at the JxlDecoderProcessInput call after JXL_DEC_FRAME.
+  EXPECT_EQ(JXL_DEC_NEED_MORE_INPUT, JxlDecoderProcessInput(dec));
+
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderFlushImage(dec));
+
+  // Crude test of actual pixel data: pixel threshold of about 4% (2560/65535).
+  // 29000 pixels can be above the threshold
+  EXPECT_LE(jxl::test::ComparePixels(pixels2.data(), pixels.data(), xsize,
+                                     ysize, format, format, 2560.0),
+            29000u);
+
+  EXPECT_EQ(JXL_DEC_NEED_MORE_INPUT, JxlDecoderProcessInput(dec));
+
+  size_t consumed = first_part - JxlDecoderReleaseInput(dec);
+
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, data.data() + consumed,
+                                                data.size() - consumed));
+  EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec));
+  // Lower threshold for the final (still lossy) image
+  EXPECT_LE(jxl::test::ComparePixels(pixels2.data(), pixels.data(), xsize,
+                                     ysize, format, format, 2560.0),
+            11000u);
+
+  JxlDecoderDestroy(dec);
+}
+
+TEST(DecodeTest, FlushTestImageOutCallback) {
+  // Size large enough for multiple groups, required to have progressive
+  // stages
+  size_t xsize = 333, ysize = 300;
+  uint32_t num_channels = 3;
+  std::vector<uint8_t> pixels =
+      jxl::test::GetSomeTestImage(xsize, ysize, num_channels, 0);
+  jxl::TestCodestreamParams params;
+  params.preview_mode = jxl::kSmallPreview;
+  jxl::PaddedBytes data = jxl::CreateTestJXLCodestream(
+      jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize,
+      num_channels, params);
+  JxlPixelFormat format = {num_channels, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+
+  std::vector<uint8_t> pixels2;
+  pixels2.resize(pixels.size());
+
+  size_t bytes_per_pixel = format.num_channels * 2;
+  size_t stride = bytes_per_pixel * xsize;
+  auto callback = [&](size_t x, size_t y, size_t num_pixels,
+                      const void* pixels_row) {
+    memcpy(pixels2.data() + stride * y + bytes_per_pixel * x, pixels_row,
+           num_pixels * bytes_per_pixel);
+  };
+
+  JxlDecoder* dec = JxlDecoderCreate(nullptr);
+
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSubscribeEvents(
+                dec, JXL_DEC_BASIC_INFO | JXL_DEC_FRAME | JXL_DEC_FULL_IMAGE));
+
+  // Ensure that the first part contains at least the full DC of the image,
+  // otherwise flush does not work.
+  size_t first_part = data.size() - 1;
+
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, data.data(), first_part));
+
+  EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec));
+  JxlBasicInfo info;
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info));
+  EXPECT_EQ(info.xsize, xsize);
+  EXPECT_EQ(info.ysize, ysize);
+
+  EXPECT_EQ(JXL_DEC_FRAME, JxlDecoderProcessInput(dec));
+
+  // Output callback not yet set
+  EXPECT_EQ(JXL_DEC_ERROR, JxlDecoderFlushImage(dec));
+
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetImageOutCallback(
+                                 dec, &format,
+                                 [](void* opaque, size_t x, size_t y,
+                                    size_t xsize, const void* pixels_row) {
+                                   auto cb =
+                                       static_cast<decltype(&callback)>(opaque);
+                                   (*cb)(x, y, xsize, pixels_row);
+                                 },
+                                 /*opaque=*/&callback));
+
+  // Must process input further until we get JXL_DEC_NEED_MORE_INPUT, even if
+  // data was already input before, since the processing of the frame only
+  // happens at the JxlDecoderProcessInput call after JXL_DEC_FRAME.
+  EXPECT_EQ(JXL_DEC_NEED_MORE_INPUT, JxlDecoderProcessInput(dec));
+
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderFlushImage(dec));
+
+  // Crude test of actual pixel data: pixel threshold of about 4% (2560/65535).
+  // 29000 pixels can be above the threshold
+  EXPECT_LE(jxl::test::ComparePixels(pixels2.data(), pixels.data(), xsize,
+                                     ysize, format, format, 2560.0),
+            29000u);
+
+  EXPECT_EQ(JXL_DEC_NEED_MORE_INPUT, JxlDecoderProcessInput(dec));
+
+  size_t consumed = first_part - JxlDecoderReleaseInput(dec);
+
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, data.data() + consumed,
+                                                data.size() - consumed));
+  EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec));
+  // Lower threshold for the final (still lossy) image
+  EXPECT_LE(jxl::test::ComparePixels(pixels2.data(), pixels.data(), xsize,
+                                     ysize, format, format, 2560.0),
+            11000u);
+
+  JxlDecoderDestroy(dec);
+}
+
+TEST(DecodeTest, FlushTestLossyProgressiveAlpha) {
+  // Size large enough for multiple groups, required to have progressive
+  // stages
+  size_t xsize = 333, ysize = 300;
+  uint32_t num_channels = 4;
+  std::vector<uint8_t> pixels =
+      jxl::test::GetSomeTestImage(xsize, ysize, num_channels, 0);
+  jxl::TestCodestreamParams params;
+  params.preview_mode = jxl::kSmallPreview;
+  jxl::PaddedBytes data = jxl::CreateTestJXLCodestream(
+      jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize,
+      num_channels, params);
+  JxlPixelFormat format = {num_channels, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+
+  std::vector<uint8_t> pixels2;
+  pixels2.resize(pixels.size());
+
+  JxlDecoder* dec = JxlDecoderCreate(nullptr);
+
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSubscribeEvents(
+                dec, JXL_DEC_BASIC_INFO | JXL_DEC_FRAME | JXL_DEC_FULL_IMAGE));
+
+  // Ensure that the first part contains at least the full DC of the image,
+  // otherwise flush does not work.
+  size_t first_part = data.size() - 1;
+
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, data.data(), first_part));
+
+  EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec));
+  JxlBasicInfo info;
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info));
+  EXPECT_EQ(info.xsize, xsize);
+  EXPECT_EQ(info.ysize, ysize);
+
+  EXPECT_EQ(JXL_DEC_FRAME, JxlDecoderProcessInput(dec));
+
+  // Output buffer not yet set
+  EXPECT_EQ(JXL_DEC_ERROR, JxlDecoderFlushImage(dec));
+
+  size_t buffer_size;
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderImageOutBufferSize(dec, &format, &buffer_size));
+  EXPECT_EQ(pixels2.size(), buffer_size);
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetImageOutBuffer(
+                                 dec, &format, pixels2.data(), pixels2.size()));
+
+  // Must process input further until we get JXL_DEC_NEED_MORE_INPUT, even if
+  // data was already input before, since the processing of the frame only
+  // happens at the JxlDecoderProcessInput call after JXL_DEC_FRAME.
+  EXPECT_EQ(JXL_DEC_NEED_MORE_INPUT, JxlDecoderProcessInput(dec));
+
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderFlushImage(dec));
+
+  EXPECT_LE(jxl::test::ComparePixels(pixels2.data(), pixels.data(), xsize,
+                                     ysize, format, format, 2560.0),
+            30000u);
+
+  EXPECT_EQ(JXL_DEC_NEED_MORE_INPUT, JxlDecoderProcessInput(dec));
+
+  size_t consumed = first_part - JxlDecoderReleaseInput(dec);
+
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, data.data() + consumed,
+                                                data.size() - consumed));
+
+  EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec));
+  EXPECT_LE(jxl::test::ComparePixels(pixels2.data(), pixels.data(), xsize,
+                                     ysize, format, format, 2560.0),
+            11000u);
+
+  JxlDecoderDestroy(dec);
+}
+TEST(DecodeTest, FlushTestLossyProgressiveAlphaUpsampling) {
+  size_t xsize = 533, ysize = 401;
+  uint32_t num_channels = 4;
+  std::vector<uint8_t> pixels =
+      jxl::test::GetSomeTestImage(xsize, ysize, num_channels, 0);
+  jxl::TestCodestreamParams params;
+  params.cparams.resampling = 2;
+  params.cparams.ec_resampling = 4;
+  params.preview_mode = jxl::kSmallPreview;
+  jxl::PaddedBytes data = jxl::CreateTestJXLCodestream(
+      jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize,
+      num_channels, params);
+  JxlPixelFormat format = {num_channels, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+
+  std::vector<uint8_t> pixels2;
+  pixels2.resize(pixels.size());
+
+  JxlDecoder* dec = JxlDecoderCreate(nullptr);
+
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSubscribeEvents(
+                dec, JXL_DEC_BASIC_INFO | JXL_DEC_FRAME | JXL_DEC_FULL_IMAGE));
+
+  // Ensure that the first part contains at least the full DC of the image,
+  // otherwise flush does not work.
+  size_t first_part = data.size() * 2 / 3;
+
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, data.data(), first_part));
+
+  EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec));
+  JxlBasicInfo info;
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info));
+  EXPECT_EQ(info.xsize, xsize);
+  EXPECT_EQ(info.ysize, ysize);
+
+  EXPECT_EQ(JXL_DEC_FRAME, JxlDecoderProcessInput(dec));
+
+  // Output buffer not yet set
+  EXPECT_EQ(JXL_DEC_ERROR, JxlDecoderFlushImage(dec));
+
+  size_t buffer_size;
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderImageOutBufferSize(dec, &format, &buffer_size));
+  EXPECT_EQ(pixels2.size(), buffer_size);
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetImageOutBuffer(
+                                 dec, &format, pixels2.data(), pixels2.size()));
+
+  // Must process input further until we get JXL_DEC_NEED_MORE_INPUT, even if
+  // data was already input before, since the processing of the frame only
+  // happens at the JxlDecoderProcessInput call after JXL_DEC_FRAME.
+  EXPECT_EQ(JXL_DEC_NEED_MORE_INPUT, JxlDecoderProcessInput(dec));
+
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderFlushImage(dec));
+
+  EXPECT_LE(jxl::test::ComparePixels(pixels2.data(), pixels.data(), xsize,
+                                     ysize, format, format, 2560.0),
+            125000u);
+
+  EXPECT_EQ(JXL_DEC_NEED_MORE_INPUT, JxlDecoderProcessInput(dec));
+
+  size_t consumed = first_part - JxlDecoderReleaseInput(dec);
+
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, data.data() + consumed,
+                                                data.size() - consumed));
+
+  EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec));
+  EXPECT_LE(jxl::test::ComparePixels(pixels2.data(), pixels.data(), xsize,
+                                     ysize, format, format, 2560.0),
+            70000u);
+
+  JxlDecoderDestroy(dec);
+}
+TEST(DecodeTest, FlushTestLosslessProgressiveAlpha) {
+  // Size large enough for multiple groups, required to have progressive
+  // stages
+  size_t xsize = 333, ysize = 300;
+  uint32_t num_channels = 4;
+  std::vector<uint8_t> pixels =
+      jxl::test::GetSomeTestImage(xsize, ysize, num_channels, 0);
+  jxl::TestCodestreamParams params;
+  params.cparams.SetLossless();
+  params.cparams.speed_tier = jxl::SpeedTier::kThunder;
+  params.cparams.responsive = 1;
+  params.preview_mode = jxl::kSmallPreview;
+  jxl::PaddedBytes data = jxl::CreateTestJXLCodestream(
+      jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize,
+      num_channels, params);
+  JxlPixelFormat format = {num_channels, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+
+  std::vector<uint8_t> pixels2;
+  pixels2.resize(pixels.size());
+
+  JxlDecoder* dec = JxlDecoderCreate(nullptr);
+
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSubscribeEvents(
+                dec, JXL_DEC_BASIC_INFO | JXL_DEC_FRAME | JXL_DEC_FULL_IMAGE));
+
+  // Ensure that the first part contains at least the full DC of the image,
+  // otherwise flush does not work.
+  size_t first_part = data.size() / 2;
+
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, data.data(), first_part));
+
+  EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec));
+  JxlBasicInfo info;
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info));
+  EXPECT_EQ(info.xsize, xsize);
+  EXPECT_EQ(info.ysize, ysize);
+
+  EXPECT_EQ(JXL_DEC_FRAME, JxlDecoderProcessInput(dec));
+
+  // Output buffer not yet set
+  EXPECT_EQ(JXL_DEC_ERROR, JxlDecoderFlushImage(dec));
+
+  size_t buffer_size;
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderImageOutBufferSize(dec, &format, &buffer_size));
+  EXPECT_EQ(pixels2.size(), buffer_size);
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetImageOutBuffer(
+                                 dec, &format, pixels2.data(), pixels2.size()));
+
+  // Must process input further until we get JXL_DEC_NEED_MORE_INPUT, even if
+  // data was already input before, since the processing of the frame only
+  // happens at the JxlDecoderProcessInput call after JXL_DEC_FRAME.
+  EXPECT_EQ(JXL_DEC_NEED_MORE_INPUT, JxlDecoderProcessInput(dec));
+
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderFlushImage(dec));
+
+  EXPECT_LE(jxl::test::ComparePixels(pixels2.data(), pixels.data(), xsize,
+                                     ysize, format, format, 2560.0),
+            2700u);
+
+  EXPECT_EQ(JXL_DEC_NEED_MORE_INPUT, JxlDecoderProcessInput(dec));
+
+  size_t consumed = first_part - JxlDecoderReleaseInput(dec);
+
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, data.data() + consumed,
+                                                data.size() - consumed));
+
+  EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec));
+  EXPECT_LE(jxl::test::ComparePixels(pixels2.data(), pixels.data(), xsize,
+                                     ysize, format, format),
+            0u);
+
+  JxlDecoderDestroy(dec);
+}
+
+class DecodeProgressiveTest : public ::testing::TestWithParam<int> {};
+JXL_GTEST_INSTANTIATE_TEST_SUITE_P(DecodeProgressiveTestInstantiation,
+                                   DecodeProgressiveTest,
+                                   ::testing::Range(0, 8));
+TEST_P(DecodeProgressiveTest, ProgressiveEventTest) {
+  const int params = GetParam();
+  int single_group = params & 1;
+  int lossless = (params >> 1) & 1;
+  uint32_t num_channels = 3 + ((params >> 2) & 1);
+  std::set<JxlProgressiveDetail> progressive_details = {kDC, kLastPasses,
+                                                        kPasses};
+  for (auto prog_detail : progressive_details) {
+    // Only few combinations are expected to support outputting
+    // intermediate flushes for complete DC and complete passes.
+    // The test can be updated if more cases are expected to support it.
+    bool expect_flush = (num_channels & 1) && !lossless;
+    size_t xsize, ysize;
+    if (single_group) {
+      // An image smaller than 256x256 ensures it contains only 1 group.
+      xsize = 99;
+      ysize = 100;
+    } else {
+      xsize = 277;
+      ysize = 280;
+    }
+    std::vector<uint8_t> pixels =
+        jxl::test::GetSomeTestImage(xsize, ysize, num_channels, 0);
+    JxlPixelFormat format = {num_channels, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+    jxl::ColorEncoding color_encoding = jxl::ColorEncoding::SRGB(false);
+    jxl::CodecInOut io;
+    EXPECT_TRUE(jxl::ConvertFromExternal(
+        jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize,
+        color_encoding,
+        /*bits_per_sample=*/16, format,
+        /*pool=*/nullptr, &io.Main()));
+    jxl::TestCodestreamParams params;
+    if (lossless) {
+      params.cparams.SetLossless();
+    } else {
+      params.cparams.butteraugli_distance = 0.5f;
+    }
+    jxl::PassDefinition passes[] = {
+        {2, 0, 4}, {4, 0, 4}, {8, 2, 2}, {8, 1, 2}, {8, 0, 1}};
+    const int kNumPasses = 5;
+    jxl::ProgressiveMode progressive_mode{passes};
+    params.progressive_mode = &progressive_mode;
+    jxl::PaddedBytes data = jxl::CreateTestJXLCodestream(
+        jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize,
+        num_channels, params);
+
+    for (size_t increment : {(size_t)1, data.size()}) {
+      printf(
+          "Testing with single_group=%d, lossless=%d, "
+          "num_channels=%d, prog_detail=%d, increment=%d\n",
+          single_group, lossless, (int)num_channels, (int)prog_detail,
+          (int)increment);
+      std::vector<std::vector<uint8_t>> passes(kNumPasses + 1);
+      for (int i = 0; i <= kNumPasses; ++i) {
+        passes[i].resize(pixels.size());
+      }
+
+      JxlDecoder* dec = JxlDecoderCreate(nullptr);
+
+      EXPECT_EQ(JXL_DEC_SUCCESS,
+                JxlDecoderSubscribeEvents(
+                    dec, JXL_DEC_BASIC_INFO | JXL_DEC_FRAME |
+                             JXL_DEC_FULL_IMAGE | JXL_DEC_FRAME_PROGRESSION));
+      EXPECT_EQ(JXL_DEC_ERROR, JxlDecoderSetProgressiveDetail(dec, kFrames));
+      EXPECT_EQ(JXL_DEC_ERROR,
+                JxlDecoderSetProgressiveDetail(dec, kDCProgressive));
+      EXPECT_EQ(JXL_DEC_ERROR, JxlDecoderSetProgressiveDetail(dec, kDCGroups));
+      EXPECT_EQ(JXL_DEC_ERROR, JxlDecoderSetProgressiveDetail(dec, kGroups));
+      EXPECT_EQ(JXL_DEC_SUCCESS,
+                JxlDecoderSetProgressiveDetail(dec, prog_detail));
+
+      uint8_t* next_in = data.data();
+      size_t avail_in = 0;
+      size_t pos = 0;
+
+      auto process_input = [&]() {
+        for (;;) {
+          EXPECT_EQ(JXL_DEC_SUCCESS,
+                    JxlDecoderSetInput(dec, next_in, avail_in));
+          JxlDecoderStatus status = JxlDecoderProcessInput(dec);
+          size_t remaining = JxlDecoderReleaseInput(dec);
+          EXPECT_LE(remaining, avail_in);
+          next_in += avail_in - remaining;
+          avail_in = remaining;
+          if (status == JXL_DEC_NEED_MORE_INPUT && pos < data.size()) {
+            size_t chunk = std::min<size_t>(increment, data.size() - pos);
+            pos += chunk;
+            avail_in += chunk;
+            continue;
+          }
+          return status;
+        }
+      };
+
+      EXPECT_EQ(JXL_DEC_BASIC_INFO, process_input());
+      JxlBasicInfo info;
+      EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info));
+      EXPECT_EQ(info.xsize, xsize);
+      EXPECT_EQ(info.ysize, ysize);
+
+      EXPECT_EQ(JXL_DEC_FRAME, process_input());
+
+      size_t buffer_size;
+      EXPECT_EQ(JXL_DEC_SUCCESS,
+                JxlDecoderImageOutBufferSize(dec, &format, &buffer_size));
+      EXPECT_EQ(pixels.size(), buffer_size);
+      EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetImageOutBuffer(
+                                     dec, &format, passes[kNumPasses].data(),
+                                     passes[kNumPasses].size()));
+
+      auto next_pass = [&](int pass) {
+        if (prog_detail <= kDC) return kNumPasses;
+        if (prog_detail <= kLastPasses) {
+          return std::min(pass + 2, kNumPasses);
+        }
+        return pass + 1;
+      };
+
+      if (expect_flush) {
+        // Return a particular downsampling ratio only after the last
+        // pass for that downsampling was processed.
+        int expected_downsampling_ratios[] = {8, 8, 4, 4, 2};
+        for (int p = 0; p < kNumPasses; p = next_pass(p)) {
+          EXPECT_EQ(JXL_DEC_FRAME_PROGRESSION, process_input());
+          EXPECT_EQ(expected_downsampling_ratios[p],
+                    JxlDecoderGetIntendedDownsamplingRatio(dec));
+          EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderFlushImage(dec));
+          passes[p] = passes[kNumPasses];
+        }
+      }
+
+      EXPECT_EQ(JXL_DEC_FULL_IMAGE, process_input());
+      EXPECT_EQ(JXL_DEC_SUCCESS, process_input());
+
+      JxlDecoderDestroy(dec);
+
+      if (!expect_flush) {
+        continue;
+      }
+      jxl::ButteraugliParams ba;
+      std::vector<float> distances(kNumPasses + 1);
+      for (int p = 0;; p = next_pass(p)) {
+        jxl::CodecInOut io1;
+        EXPECT_TRUE(jxl::ConvertFromExternal(
+            jxl::Span<const uint8_t>(passes[p].data(), passes[p].size()), xsize,
+            ysize, color_encoding,
+            /*bits_per_sample=*/16, format,
+            /*pool=*/nullptr, &io1.Main()));
+        distances[p] = ButteraugliDistance(io.frames, io1.frames, ba,
+                                           jxl::GetJxlCms(), nullptr, nullptr);
+        if (p == kNumPasses) break;
+      }
+      const float kMaxDistance[kNumPasses + 1] = {30.0f, 20.0f, 10.0f,
+                                                  5.0f,  3.0f,  2.0f};
+      EXPECT_LT(distances[kNumPasses], kMaxDistance[kNumPasses]);
+      for (int p = 0; p < kNumPasses;) {
+        int next_p = next_pass(p);
+        EXPECT_LT(distances[p], kMaxDistance[p]);
+        // Verify that the returned pass image is actually not the
+        // same as the next pass image, by checking that it has a bit
+        // worse butteraugli score.
+        EXPECT_LT(distances[next_p] * 1.1f, distances[p]);
+        p = next_p;
+      }
+    }
+  }
+}
+
+void VerifyJPEGReconstruction(const jxl::PaddedBytes& container,
+                              const jxl::PaddedBytes& jpeg_bytes) {
+  JxlDecoderPtr dec = JxlDecoderMake(nullptr);
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSubscribeEvents(
+                dec.get(), JXL_DEC_JPEG_RECONSTRUCTION | JXL_DEC_FULL_IMAGE));
+  JxlDecoderSetInput(dec.get(), container.data(), container.size());
+  EXPECT_EQ(JXL_DEC_JPEG_RECONSTRUCTION, JxlDecoderProcessInput(dec.get()));
+  std::vector<uint8_t> reconstructed_buffer(128);
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSetJPEGBuffer(dec.get(), reconstructed_buffer.data(),
+                                    reconstructed_buffer.size()));
+  size_t used = 0;
+  JxlDecoderStatus process_result = JXL_DEC_JPEG_NEED_MORE_OUTPUT;
+  while (process_result == JXL_DEC_JPEG_NEED_MORE_OUTPUT) {
+    used = reconstructed_buffer.size() - JxlDecoderReleaseJPEGBuffer(dec.get());
+    reconstructed_buffer.resize(reconstructed_buffer.size() * 2);
+    EXPECT_EQ(
+        JXL_DEC_SUCCESS,
+        JxlDecoderSetJPEGBuffer(dec.get(), reconstructed_buffer.data() + used,
+                                reconstructed_buffer.size() - used));
+    process_result = JxlDecoderProcessInput(dec.get());
+  }
+  ASSERT_EQ(JXL_DEC_FULL_IMAGE, process_result);
+  used = reconstructed_buffer.size() - JxlDecoderReleaseJPEGBuffer(dec.get());
+  ASSERT_EQ(used, jpeg_bytes.size());
+  EXPECT_EQ(0, memcmp(reconstructed_buffer.data(), jpeg_bytes.data(), used));
+}
+
+#if JPEGXL_ENABLE_JPEG
+TEST(DecodeTest, JXL_TRANSCODE_JPEG_TEST(JPEGReconstructTestCodestream)) {
+  size_t xsize = 123;
+  size_t ysize = 77;
+  size_t channels = 3;
+  std::vector<uint8_t> pixels =
+      jxl::test::GetSomeTestImage(xsize, ysize, channels, /*seed=*/0);
+  jxl::PaddedBytes jpeg_codestream;
+  jxl::TestCodestreamParams params;
+  params.cparams.color_transform = jxl::ColorTransform::kNone;
+  params.box_format = kCSBF_Single;
+  params.jpeg_codestream = &jpeg_codestream;
+  params.preview_mode = jxl::kSmallPreview;
+  jxl::PaddedBytes compressed = jxl::CreateTestJXLCodestream(
+      jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize,
+      channels, params);
+  VerifyJPEGReconstruction(compressed, jpeg_codestream);
+}
+#endif  // JPEGXL_ENABLE_JPEG
+
+TEST(DecodeTest, JXL_TRANSCODE_JPEG_TEST(JPEGReconstructionTest)) {
+  const std::string jpeg_path = "jxl/flower/flower.png.im_q85_420.jpg";
+  const jxl::PaddedBytes orig = jxl::test::ReadTestData(jpeg_path);
+  jxl::CodecInOut orig_io;
+  ASSERT_TRUE(
+      jxl::jpeg::DecodeImageJPG(jxl::Span<const uint8_t>(orig), &orig_io));
+  orig_io.metadata.m.xyb_encoded = false;
+  jxl::BitWriter writer;
+  ASSERT_TRUE(WriteCodestreamHeaders(&orig_io.metadata, &writer, nullptr));
+  writer.ZeroPadToByte();
+  jxl::PassesEncoderState enc_state;
+  jxl::CompressParams cparams;
+  cparams.color_transform = jxl::ColorTransform::kNone;
+  ASSERT_TRUE(jxl::EncodeFrame(cparams, jxl::FrameInfo{}, &orig_io.metadata,
+                               orig_io.Main(), &enc_state, jxl::GetJxlCms(),
+                               /*pool=*/nullptr, &writer,
+                               /*aux_out=*/nullptr));
+
+  jxl::PaddedBytes jpeg_data;
+  ASSERT_TRUE(
+      EncodeJPEGData(*orig_io.Main().jpeg_data.get(), &jpeg_data, cparams));
+  jxl::PaddedBytes container;
+  container.append(jxl::kContainerHeader,
+                   jxl::kContainerHeader + sizeof(jxl::kContainerHeader));
+  jxl::AppendBoxHeader(jxl::MakeBoxType("jbrd"), jpeg_data.size(), false,
+                       &container);
+  container.append(jpeg_data.data(), jpeg_data.data() + jpeg_data.size());
+  jxl::AppendBoxHeader(jxl::MakeBoxType("jxlc"), 0, true, &container);
+  jxl::PaddedBytes codestream = std::move(writer).TakeBytes();
+  container.append(codestream.data(), codestream.data() + codestream.size());
+  VerifyJPEGReconstruction(container, orig);
+}
+
+TEST(DecodeTest, JXL_TRANSCODE_JPEG_TEST(JPEGReconstructionMetadataTest)) {
+  const std::string jpeg_path = "jxl/jpeg_reconstruction/1x1_exif_xmp.jpg";
+  const std::string jxl_path = "jxl/jpeg_reconstruction/1x1_exif_xmp.jxl";
+  const jxl::PaddedBytes jpeg = jxl::test::ReadTestData(jpeg_path);
+  const jxl::PaddedBytes jxl = jxl::test::ReadTestData(jxl_path);
+  VerifyJPEGReconstruction(jxl, jpeg);
+}
+
+TEST(DecodeTest, ContinueFinalNonEssentialBoxTest) {
+  size_t xsize = 80, ysize = 90;
+  std::vector<uint8_t> pixels = jxl::test::GetSomeTestImage(xsize, ysize, 4, 0);
+  jxl::TestCodestreamParams params;
+  params.box_format = kCSBF_Multi_Other_Terminated;
+  params.add_icc_profile = true;
+  jxl::PaddedBytes compressed = jxl::CreateTestJXLCodestream(
+      jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize, 4,
+      params);
+  StreamPositions streampos;
+  AnalyzeCodestream(compressed, &streampos);
+
+  // The non-essential final box size including 8-byte header
+  size_t final_box_size = unk3_box_size + 8;
+  size_t last_box_begin = compressed.size() - final_box_size;
+  // Verify that the test is indeed setup correctly to be at the beginning of
+  // the 'unkn' box header.
+  ASSERT_EQ(compressed[last_box_begin + 3], final_box_size);
+  ASSERT_EQ(compressed[last_box_begin + 4], 'u');
+  ASSERT_EQ(compressed[last_box_begin + 5], 'n');
+  ASSERT_EQ(compressed[last_box_begin + 6], 'k');
+  ASSERT_EQ(compressed[last_box_begin + 7], '3');
+
+  JxlDecoder* dec = JxlDecoderCreate(nullptr);
+
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSubscribeEvents(dec, JXL_DEC_BASIC_INFO | JXL_DEC_FRAME));
+
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSetInput(dec, compressed.data(), last_box_begin));
+
+  EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec));
+  EXPECT_EQ(JXL_DEC_FRAME, JxlDecoderProcessInput(dec));
+  // The decoder returns success despite not having seen the final unknown box
+  // yet. This is because calling JxlDecoderCloseInput is not mandatory for
+  // backwards compatibility, so it doesn't know more bytes follow, the current
+  // bytes ended at a perfectly valid place.
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderProcessInput(dec));
+
+  size_t remaining = JxlDecoderReleaseInput(dec);
+  // Since the test was set up to end exactly at the boundary of the final
+  // codestream box, and the decoder returned success, all bytes are expected to
+  // be consumed until the end of the  frame header.
+  EXPECT_EQ(remaining, last_box_begin - streampos.frames[0].toc_end);
+
+  // Now set the remaining non-codestream box as input.
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSetInput(dec, compressed.data() + last_box_begin,
+                               compressed.size() - last_box_begin));
+  // Even though JxlDecoderProcessInput already returned JXL_DEC_SUCCESS before,
+  // when calling it again now after setting more input, success is expected, no
+  // event occurs but the box has been successfully skipped.
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderProcessInput(dec));
+
+  JxlDecoderDestroy(dec);
+}
+
+namespace {
+bool BoxTypeEquals(const std::string& type_string, JxlBoxType type) {
+  return type_string.size() == 4 && type_string[0] == type[0] &&
+         type_string[1] == type[1] && type_string[2] == type[2] &&
+         type_string[3] == type[3];
+}
+}  // namespace
+
+TEST(DecodeTest, ExtentedBoxSizeTest) {
+  const std::string jxl_path = "jxl/boxes/square-extended-size-container.jxl";
+  const jxl::PaddedBytes orig = jxl::test::ReadTestData(jxl_path);
+  JxlDecoder* dec = JxlDecoderCreate(nullptr);
+
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSubscribeEvents(dec, JXL_DEC_BOX));
+
+  JxlBoxType type;
+  uint64_t box_size;
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, orig.data(), orig.size()));
+  EXPECT_EQ(JXL_DEC_BOX, JxlDecoderProcessInput(dec));
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBoxType(dec, type, JXL_FALSE));
+  EXPECT_TRUE(BoxTypeEquals("JXL ", type));
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBoxSizeRaw(dec, &box_size));
+  EXPECT_EQ(12, box_size);
+  EXPECT_EQ(JXL_DEC_BOX, JxlDecoderProcessInput(dec));
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBoxType(dec, type, JXL_FALSE));
+  EXPECT_TRUE(BoxTypeEquals("ftyp", type));
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBoxSizeRaw(dec, &box_size));
+  EXPECT_EQ(20, box_size);
+  EXPECT_EQ(JXL_DEC_BOX, JxlDecoderProcessInput(dec));
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBoxType(dec, type, JXL_FALSE));
+  EXPECT_TRUE(BoxTypeEquals("jxlc", type));
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBoxSizeRaw(dec, &box_size));
+  EXPECT_EQ(72, box_size);
+
+  JxlDecoderDestroy(dec);
+}
+
+TEST(DecodeTest, JXL_BOXES_TEST(BoxTest)) {
+  size_t xsize = 1, ysize = 1;
+  std::vector<uint8_t> pixels = jxl::test::GetSomeTestImage(xsize, ysize, 4, 0);
+  jxl::TestCodestreamParams params;
+  params.box_format = kCSBF_Multi_Other_Terminated;
+  params.add_icc_profile = true;
+  jxl::PaddedBytes compressed = jxl::CreateTestJXLCodestream(
+      jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize, 4,
+      params);
+
+  JxlDecoder* dec = JxlDecoderCreate(nullptr);
+
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSubscribeEvents(dec, JXL_DEC_BOX));
+
+  std::vector<std::string> expected_box_types = {
+      "JXL ", "ftyp", "jxlp", "unk1", "unk2", "jxlp", "jxlp", "jxlp", "unk3"};
+
+  // Value 0 means to not test the size: codestream is not required to be a
+  // particular exact size.
+  std::vector<size_t> expected_box_sizes = {12, 20, 0, 34, 18, 0, 0, 0, 20};
+
+  JxlBoxType type;
+  uint64_t box_size;
+  std::vector<uint8_t> contents(50);
+  size_t expected_release_size = 0;
+
+  // Cannot get these when decoding didn't start yet
+  EXPECT_EQ(JXL_DEC_ERROR, JxlDecoderGetBoxType(dec, type, JXL_FALSE));
+  EXPECT_EQ(JXL_DEC_ERROR, JxlDecoderGetBoxSizeRaw(dec, &box_size));
+
+  uint8_t* next_in = compressed.data();
+  size_t avail_in = compressed.size();
+  for (size_t i = 0; i < expected_box_types.size(); i++) {
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in));
+    EXPECT_EQ(JXL_DEC_BOX, JxlDecoderProcessInput(dec));
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBoxType(dec, type, JXL_FALSE));
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBoxSizeRaw(dec, &box_size));
+    EXPECT_TRUE(BoxTypeEquals(expected_box_types[i], type));
+    if (expected_box_sizes[i]) {
+      EXPECT_EQ(expected_box_sizes[i], box_size);
+    }
+
+    if (expected_release_size > 0) {
+      EXPECT_EQ(expected_release_size, JxlDecoderReleaseBoxBuffer(dec));
+      expected_release_size = 0;
+    }
+
+    if (type[0] == 'u' && type[1] == 'n' && type[2] == 'k') {
+      JxlDecoderSetBoxBuffer(dec, contents.data(), contents.size());
+      size_t expected_box_contents_size =
+          type[3] == '1' ? unk1_box_size
+                         : (type[3] == '2' ? unk2_box_size : unk3_box_size);
+      expected_release_size = contents.size() - expected_box_contents_size;
+    }
+    size_t consumed = avail_in - JxlDecoderReleaseInput(dec);
+    next_in += consumed;
+    avail_in -= consumed;
+  }
+
+  // After the last DEC_BOX event, check that the input position is exactly at
+  // the stat of the box header.
+  EXPECT_EQ(avail_in, expected_box_sizes.back());
+
+  // Even though all input is given, the decoder cannot assume there aren't
+  // more boxes if the input was not closed.
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in));
+  EXPECT_EQ(JXL_DEC_NEED_MORE_INPUT, JxlDecoderProcessInput(dec));
+  JxlDecoderCloseInput(dec);
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderProcessInput(dec));
+
+  JxlDecoderDestroy(dec);
+}
+
+TEST(DecodeTest, JXL_BOXES_TEST(ExifBrobBoxTest)) {
+  size_t xsize = 1, ysize = 1;
+  std::vector<uint8_t> pixels = jxl::test::GetSomeTestImage(xsize, ysize, 4, 0);
+  jxl::TestCodestreamParams params;
+  // Lossless to verify pixels exactly after roundtrip.
+  params.cparams.SetLossless();
+  params.box_format = kCSBF_Brob_Exif;
+  params.add_icc_profile = true;
+  jxl::PaddedBytes compressed = jxl::CreateTestJXLCodestream(
+      jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize, 4,
+      params);
+
+  // Test raw brob box, not brotli-decompressing
+  for (int streaming = 0; streaming < 2; ++streaming) {
+    JxlDecoder* dec = JxlDecoderCreate(nullptr);
+
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSubscribeEvents(dec, JXL_DEC_BOX));
+    if (!streaming) {
+      EXPECT_EQ(JXL_DEC_SUCCESS,
+                JxlDecoderSetInput(dec, compressed.data(), compressed.size()));
+      JxlDecoderCloseInput(dec);
+    }
+    // for streaming input case
+    const uint8_t* next_in = compressed.data();
+    size_t avail_in = 0;
+    size_t total_in = 0;
+    size_t step_size = 64;
+
+    std::vector<uint8_t> box_buffer;
+    size_t box_num_output;
+    bool seen_brob_begin = false;
+    bool seen_brob_end = false;
+
+    for (;;) {
+      JxlDecoderStatus status = JxlDecoderProcessInput(dec);
+      if (status == JXL_DEC_NEED_MORE_INPUT) {
+        if (streaming) {
+          size_t remaining = JxlDecoderReleaseInput(dec);
+          EXPECT_LE(remaining, avail_in);
+          next_in += avail_in - remaining;
+          avail_in = remaining;
+          size_t amount = step_size;
+          if (total_in + amount > compressed.size()) {
+            amount = compressed.size() - total_in;
+          }
+          avail_in += amount;
+          total_in += amount;
+          EXPECT_EQ(JXL_DEC_SUCCESS,
+                    JxlDecoderSetInput(dec, next_in, avail_in));
+          if (total_in == compressed.size()) JxlDecoderCloseInput(dec);
+        } else {
+          FAIL();
+          break;
+        }
+      } else if (status == JXL_DEC_BOX || status == JXL_DEC_SUCCESS) {
+        if (!box_buffer.empty()) {
+          EXPECT_EQ(false, seen_brob_end);
+          seen_brob_end = true;
+          size_t remaining = JxlDecoderReleaseBoxBuffer(dec);
+          box_num_output = box_buffer.size() - remaining;
+          EXPECT_EQ(box_num_output, box_brob_exif_size - 8);
+          EXPECT_EQ(
+              0, memcmp(box_buffer.data(), box_brob_exif + 8, box_num_output));
+          box_buffer.clear();
+        }
+        if (status == JXL_DEC_SUCCESS) break;
+        JxlBoxType type;
+        EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBoxType(dec, type, JXL_FALSE));
+        if (BoxTypeEquals("brob", type)) {
+          EXPECT_EQ(false, seen_brob_begin);
+          seen_brob_begin = true;
+          box_buffer.resize(8);
+          JxlDecoderSetBoxBuffer(dec, box_buffer.data(), box_buffer.size());
+        }
+      } else if (status == JXL_DEC_BOX_NEED_MORE_OUTPUT) {
+        size_t remaining = JxlDecoderReleaseBoxBuffer(dec);
+        box_num_output = box_buffer.size() - remaining;
+        box_buffer.resize(box_buffer.size() * 2);
+        JxlDecoderSetBoxBuffer(dec, box_buffer.data() + box_num_output,
+                               box_buffer.size() - box_num_output);
+      } else {
+        // We do not expect any other events or errors
+        FAIL();
+        break;
+      }
+    }
+
+    EXPECT_EQ(true, seen_brob_begin);
+    EXPECT_EQ(true, seen_brob_end);
+
+    JxlDecoderDestroy(dec);
+  }
+
+  // Test decompressed brob box
+  for (int streaming = 0; streaming < 2; ++streaming) {
+    JxlDecoder* dec = JxlDecoderCreate(nullptr);
+
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSubscribeEvents(dec, JXL_DEC_BOX));
+    if (!streaming) {
+      EXPECT_EQ(JXL_DEC_SUCCESS,
+                JxlDecoderSetInput(dec, compressed.data(), compressed.size()));
+      JxlDecoderCloseInput(dec);
+    }
+    // for streaming input case
+    const uint8_t* next_in = compressed.data();
+    size_t avail_in = 0;
+    size_t total_in = 0;
+    size_t step_size = 64;
+
+    std::vector<uint8_t> box_buffer;
+    size_t box_num_output;
+    bool seen_exif_begin = false;
+    bool seen_exif_end = false;
+
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetDecompressBoxes(dec, JXL_TRUE));
+
+    for (;;) {
+      JxlDecoderStatus status = JxlDecoderProcessInput(dec);
+      if (status == JXL_DEC_NEED_MORE_INPUT) {
+        if (streaming) {
+          size_t remaining = JxlDecoderReleaseInput(dec);
+          EXPECT_LE(remaining, avail_in);
+          next_in += avail_in - remaining;
+          avail_in = remaining;
+          size_t amount = step_size;
+          if (total_in + amount > compressed.size()) {
+            amount = compressed.size() - total_in;
+          }
+          avail_in += amount;
+          total_in += amount;
+          EXPECT_EQ(JXL_DEC_SUCCESS,
+                    JxlDecoderSetInput(dec, next_in, avail_in));
+          if (total_in == compressed.size()) JxlDecoderCloseInput(dec);
+        } else {
+          FAIL();
+          break;
+        }
+      } else if (status == JXL_DEC_BOX || status == JXL_DEC_SUCCESS) {
+        if (!box_buffer.empty()) {
+          EXPECT_EQ(false, seen_exif_end);
+          seen_exif_end = true;
+          size_t remaining = JxlDecoderReleaseBoxBuffer(dec);
+          box_num_output = box_buffer.size() - remaining;
+          // Expect that the output has the same size and contents as the
+          // uncompressed exif data. Only check contents if the sizes match to
+          // avoid comparing uninitialized memory in the test.
+          EXPECT_EQ(box_num_output, exif_uncompressed_size);
+          if (box_num_output == exif_uncompressed_size) {
+            EXPECT_EQ(0, memcmp(box_buffer.data(), exif_uncompressed,
+                                exif_uncompressed_size));
+          }
+          box_buffer.clear();
+        }
+        if (status == JXL_DEC_SUCCESS) break;
+        JxlBoxType type;
+        EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBoxType(dec, type, JXL_TRUE));
+        if (BoxTypeEquals("Exif", type)) {
+          EXPECT_EQ(false, seen_exif_begin);
+          seen_exif_begin = true;
+          box_buffer.resize(8);
+          JxlDecoderSetBoxBuffer(dec, box_buffer.data(), box_buffer.size());
+        }
+      } else if (status == JXL_DEC_BOX_NEED_MORE_OUTPUT) {
+        size_t remaining = JxlDecoderReleaseBoxBuffer(dec);
+        box_num_output = box_buffer.size() - remaining;
+        box_buffer.resize(box_buffer.size() * 2);
+        JxlDecoderSetBoxBuffer(dec, box_buffer.data() + box_num_output,
+                               box_buffer.size() - box_num_output);
+      } else {
+        // We do not expect any other events or errors
+        FAIL();
+        break;
+      }
+    }
+
+    EXPECT_EQ(true, seen_exif_begin);
+    EXPECT_EQ(true, seen_exif_end);
+
+    JxlDecoderDestroy(dec);
+  }
+}
+
+TEST(DecodeTest, JXL_BOXES_TEST(PartialCodestreamBoxTest)) {
+  size_t xsize = 23, ysize = 81;
+  std::vector<uint8_t> pixels = jxl::test::GetSomeTestImage(xsize, ysize, 4, 0);
+  JxlPixelFormat format_orig = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+  // Lossless to verify pixels exactly after roundtrip.
+  jxl::TestCodestreamParams params;
+  params.cparams.SetLossless();
+  params.cparams.speed_tier = jxl::SpeedTier::kThunder;
+  params.box_format = kCSBF_Multi;
+  params.add_icc_profile = true;
+  jxl::PaddedBytes compressed = jxl::CreateTestJXLCodestream(
+      jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize, 4,
+      params);
+
+  std::vector<uint8_t> extracted_codestream;
+
+  {
+    JxlDecoder* dec = JxlDecoderCreate(nullptr);
+
+    EXPECT_EQ(JXL_DEC_SUCCESS,
+              JxlDecoderSubscribeEvents(
+                  dec, JXL_DEC_BASIC_INFO | JXL_DEC_FULL_IMAGE | JXL_DEC_BOX));
+    EXPECT_EQ(JXL_DEC_SUCCESS,
+              JxlDecoderSetInput(dec, compressed.data(), compressed.size()));
+    JxlDecoderCloseInput(dec);
+
+    size_t num_jxlp = 0;
+
+    std::vector<uint8_t> pixels2;
+    pixels2.resize(pixels.size());
+
+    std::vector<uint8_t> box_buffer;
+    size_t box_num_output;
+
+    for (;;) {
+      JxlDecoderStatus status = JxlDecoderProcessInput(dec);
+      if (status == JXL_DEC_NEED_MORE_INPUT) {
+        FAIL();
+        break;
+      } else if (status == JXL_DEC_BASIC_INFO) {
+        JxlBasicInfo info;
+        EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info));
+        EXPECT_EQ(info.xsize, xsize);
+        EXPECT_EQ(info.ysize, ysize);
+      } else if (status == JXL_DEC_NEED_IMAGE_OUT_BUFFER) {
+        EXPECT_EQ(JXL_DEC_SUCCESS,
+                  JxlDecoderSetImageOutBuffer(dec, &format_orig, pixels2.data(),
+                                              pixels2.size()));
+      } else if (status == JXL_DEC_FULL_IMAGE) {
+        continue;
+      } else if (status == JXL_DEC_BOX || status == JXL_DEC_SUCCESS) {
+        if (!box_buffer.empty()) {
+          size_t remaining = JxlDecoderReleaseBoxBuffer(dec);
+          box_num_output = box_buffer.size() - remaining;
+          EXPECT_GE(box_num_output, 4);
+          // Do not insert the first 4 bytes, which are not part of the
+          // codestream, but the partial codestream box index
+          extracted_codestream.insert(extracted_codestream.end(),
+                                      box_buffer.begin() + 4,
+                                      box_buffer.begin() + box_num_output);
+          box_buffer.clear();
+        }
+        if (status == JXL_DEC_SUCCESS) break;
+        JxlBoxType type;
+        EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBoxType(dec, type, JXL_FALSE));
+        if (BoxTypeEquals("jxlp", type)) {
+          num_jxlp++;
+          box_buffer.resize(8);
+          JxlDecoderSetBoxBuffer(dec, box_buffer.data(), box_buffer.size());
+        }
+      } else if (status == JXL_DEC_BOX_NEED_MORE_OUTPUT) {
+        size_t remaining = JxlDecoderReleaseBoxBuffer(dec);
+        box_num_output = box_buffer.size() - remaining;
+        box_buffer.resize(box_buffer.size() * 2);
+        JxlDecoderSetBoxBuffer(dec, box_buffer.data() + box_num_output,
+                               box_buffer.size() - box_num_output);
+      } else {
+        // We do not expect any other events or errors
+        FAIL();
+        break;
+      }
+    }
+
+    // The test file created with kCSBF_Multi is expected to have 4 jxlp boxes.
+    EXPECT_EQ(4, num_jxlp);
+
+    EXPECT_EQ(0u, jxl::test::ComparePixels(pixels.data(), pixels2.data(), xsize,
+                                           ysize, format_orig, format_orig));
+
+    JxlDecoderDestroy(dec);
+  }
+
+  // Now test whether the codestream extracted from the jxlp boxes can itself
+  // also be decoded and gives the same pixels
+  {
+    JxlDecoder* dec = JxlDecoderCreate(nullptr);
+
+    EXPECT_EQ(JXL_DEC_SUCCESS,
+              JxlDecoderSubscribeEvents(
+                  dec, JXL_DEC_BASIC_INFO | JXL_DEC_FULL_IMAGE | JXL_DEC_BOX));
+    EXPECT_EQ(JXL_DEC_SUCCESS,
+              JxlDecoderSetInput(dec, extracted_codestream.data(),
+                                 extracted_codestream.size()));
+    JxlDecoderCloseInput(dec);
+
+    size_t num_boxes = 0;
+
+    std::vector<uint8_t> pixels2;
+    pixels2.resize(pixels.size());
+
+    std::vector<uint8_t> box_buffer;
+    size_t box_num_output;
+
+    for (;;) {
+      JxlDecoderStatus status = JxlDecoderProcessInput(dec);
+      if (status == JXL_DEC_NEED_MORE_INPUT) {
+        FAIL();
+        break;
+      } else if (status == JXL_DEC_BASIC_INFO) {
+        JxlBasicInfo info;
+        EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info));
+        EXPECT_EQ(info.xsize, xsize);
+        EXPECT_EQ(info.ysize, ysize);
+      } else if (status == JXL_DEC_NEED_IMAGE_OUT_BUFFER) {
+        EXPECT_EQ(JXL_DEC_SUCCESS,
+                  JxlDecoderSetImageOutBuffer(dec, &format_orig, pixels2.data(),
+                                              pixels2.size()));
+      } else if (status == JXL_DEC_FULL_IMAGE) {
+        continue;
+      } else if (status == JXL_DEC_BOX) {
+        num_boxes++;
+      } else if (status == JXL_DEC_BOX_NEED_MORE_OUTPUT) {
+        size_t remaining = JxlDecoderReleaseBoxBuffer(dec);
+        box_num_output = box_buffer.size() - remaining;
+        box_buffer.resize(box_buffer.size() * 2);
+        JxlDecoderSetBoxBuffer(dec, box_buffer.data() + box_num_output,
+                               box_buffer.size() - box_num_output);
+      } else if (status == JXL_DEC_SUCCESS) {
+        break;
+      } else {
+        // We do not expect any other events or errors
+        FAIL();
+        break;
+      }
+    }
+
+    EXPECT_EQ(0, num_boxes);  // The data does not use the container format.
+    EXPECT_EQ(0u, jxl::test::ComparePixels(pixels.data(), pixels2.data(), xsize,
+                                           ysize, format_orig, format_orig));
+
+    JxlDecoderDestroy(dec);
+  }
+}
+
+TEST(DecodeTest, SpotColorTest) {
+  jxl::ThreadPool* pool = nullptr;
+  jxl::CodecInOut io;
+  size_t xsize = 55, ysize = 257;
+  io.metadata.m.color_encoding = jxl::ColorEncoding::LinearSRGB();
+  jxl::Image3F main(xsize, ysize);
+  jxl::ImageF spot(xsize, ysize);
+  jxl::ZeroFillImage(&main);
+  jxl::ZeroFillImage(&spot);
+
+  for (size_t y = 0; y < ysize; y++) {
+    float* JXL_RESTRICT rowm = main.PlaneRow(1, y);
+    float* JXL_RESTRICT rows = spot.Row(y);
+    for (size_t x = 0; x < xsize; x++) {
+      rowm[x] = (x + y) * (1.f / 255.f);
+      rows[x] = ((x ^ y) & 255) * (1.f / 255.f);
+    }
+  }
+  io.SetFromImage(std::move(main), jxl::ColorEncoding::LinearSRGB());
+  jxl::ExtraChannelInfo info;
+  info.bit_depth.bits_per_sample = 8;
+  info.dim_shift = 0;
+  info.type = jxl::ExtraChannel::kSpotColor;
+  info.spot_color[0] = 0.5f;
+  info.spot_color[1] = 0.2f;
+  info.spot_color[2] = 1.f;
+  info.spot_color[3] = 0.5f;
+
+  io.metadata.m.extra_channel_info.push_back(info);
+  std::vector<jxl::ImageF> ec;
+  ec.push_back(std::move(spot));
+  io.frames[0].SetExtraChannels(std::move(ec));
+
+  jxl::CompressParams cparams;
+  cparams.speed_tier = jxl::SpeedTier::kLightning;
+  cparams.modular_mode = true;
+  cparams.color_transform = jxl::ColorTransform::kNone;
+  cparams.butteraugli_distance = 0.f;
+
+  jxl::PaddedBytes compressed;
+  std::unique_ptr<jxl::PassesEncoderState> enc_state =
+      jxl::make_unique<jxl::PassesEncoderState>();
+  EXPECT_TRUE(jxl::EncodeFile(cparams, &io, enc_state.get(), &compressed,
+                              jxl::GetJxlCms(), nullptr, pool));
+
+  for (size_t render_spot = 0; render_spot < 2; render_spot++) {
+    JxlPixelFormat format = {3, JXL_TYPE_UINT8, JXL_LITTLE_ENDIAN, 0};
+
+    JxlDecoder* dec = JxlDecoderCreate(NULL);
+
+    EXPECT_EQ(JXL_DEC_SUCCESS,
+              JxlDecoderSubscribeEvents(
+                  dec, JXL_DEC_BASIC_INFO | JXL_DEC_FULL_IMAGE));
+    if (!render_spot) {
+      EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetRenderSpotcolors(dec, JXL_FALSE));
+    }
+
+    EXPECT_EQ(JXL_DEC_SUCCESS,
+              JxlDecoderSetInput(dec, compressed.data(), compressed.size()));
+    EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec));
+    JxlBasicInfo binfo;
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &binfo));
+    EXPECT_EQ(1u, binfo.num_extra_channels);
+    EXPECT_EQ(xsize, binfo.xsize);
+    EXPECT_EQ(ysize, binfo.ysize);
+
+    JxlExtraChannelInfo extra_info;
+    EXPECT_EQ(JXL_DEC_SUCCESS,
+              JxlDecoderGetExtraChannelInfo(dec, 0, &extra_info));
+    EXPECT_EQ((unsigned int)jxl::ExtraChannel::kSpotColor, extra_info.type);
+
+    EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec));
+    size_t buffer_size;
+    EXPECT_EQ(JXL_DEC_SUCCESS,
+              JxlDecoderImageOutBufferSize(dec, &format, &buffer_size));
+    size_t extra_size;
+    EXPECT_EQ(JXL_DEC_SUCCESS,
+              JxlDecoderExtraChannelBufferSize(dec, &format, &extra_size, 0));
+
+    std::vector<uint8_t> image(buffer_size);
+    std::vector<uint8_t> extra(extra_size);
+    size_t bytes_per_pixel = format.num_channels *
+                             jxl::test::GetDataBits(format.data_type) /
+                             jxl::kBitsPerByte;
+    size_t stride = bytes_per_pixel * binfo.xsize;
+
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetImageOutBuffer(
+                                   dec, &format, image.data(), image.size()));
+    EXPECT_EQ(JXL_DEC_SUCCESS,
+              JxlDecoderSetExtraChannelBuffer(dec, &format, extra.data(),
+                                              extra.size(), 0));
+
+    EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec));
+
+    // After the full image was output, JxlDecoderProcessInput should return
+    // success to indicate all is done.
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderProcessInput(dec));
+    JxlDecoderDestroy(dec);
+
+    for (size_t y = 0; y < ysize; y++) {
+      uint8_t* JXL_RESTRICT rowm = image.data() + stride * y;
+      uint8_t* JXL_RESTRICT rows = extra.data() + xsize * y;
+      for (size_t x = 0; x < xsize; x++) {
+        if (!render_spot) {
+          // if spot color isn't rendered, main image should be as we made it
+          // (red and blue are all zeroes)
+
+          EXPECT_EQ(rowm[x * 3 + 0], 0);
+          EXPECT_EQ(rowm[x * 3 + 1], (x + y > 255 ? 255 : x + y));
+          EXPECT_EQ(rowm[x * 3 + 2], 0);
+        }
+        if (render_spot) {
+          // if spot color is rendered, expect red and blue to look like the
+          // spot color channel
+          EXPECT_LT(abs(rowm[x * 3 + 0] - (rows[x] * 0.25f)), 1);
+          EXPECT_LT(abs(rowm[x * 3 + 2] - (rows[x] * 0.5f)), 1);
+        }
+        EXPECT_EQ(rows[x], ((x ^ y) & 255));
+      }
+    }
+  }
+}
+
+TEST(DecodeTest, CloseInput) {
+  std::vector<uint8_t> partial_file = {0xff};
+
+  JxlDecoderPtr dec = JxlDecoderMake(nullptr);
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSubscribeEvents(dec.get(),
+                                      JXL_DEC_BASIC_INFO | JXL_DEC_FULL_IMAGE));
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec.get(), partial_file.data(),
+                                                partial_file.size()));
+  EXPECT_EQ(JXL_DEC_NEED_MORE_INPUT, JxlDecoderProcessInput(dec.get()));
+  EXPECT_EQ(JXL_DEC_NEED_MORE_INPUT, JxlDecoderProcessInput(dec.get()));
+  JxlDecoderCloseInput(dec.get());
+  EXPECT_EQ(JXL_DEC_ERROR, JxlDecoderProcessInput(dec.get()));
+}
diff --git a/third_party/jpeg-xl/lib/jxl/decode_to_jpeg.cc b/third_party/jpeg-xl/lib/jxl/decode_to_jpeg.cc
new file mode 100644
index 0000000000..aa57b2723f
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/decode_to_jpeg.cc
@@ -0,0 +1,169 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/decode_to_jpeg.h"
+
+namespace jxl {
+
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
+
+JxlDecoderStatus JxlToJpegDecoder::Process(const uint8_t** next_in,
+                                           size_t* avail_in) {
+  if (!inside_box_) {
+    JXL_ABORT(
+        "processing of JPEG reconstruction data outside JPEG reconstruction "
+        "box");
+  }
+  Span<const uint8_t> to_decode;
+  if (box_until_eof_) {
+    // Until EOF means consume all data.
+    to_decode = Span<const uint8_t>(*next_in, *avail_in);
+    *next_in += *avail_in;
+    *avail_in = 0;
+  } else {
+    // Defined size means consume min(available, needed).
+    size_t avail_recon_in =
+        std::min<size_t>(*avail_in, box_size_ - buffer_.size());
+    to_decode = Span<const uint8_t>(*next_in, avail_recon_in);
+    *next_in += avail_recon_in;
+    *avail_in -= avail_recon_in;
+  }
+  bool old_data_exists = !buffer_.empty();
+  if (old_data_exists) {
+    // Append incoming data to buffer if we already had data in the buffer.
+    buffer_.insert(buffer_.end(), to_decode.data(),
+                   to_decode.data() + to_decode.size());
+    to_decode = Span<const uint8_t>(buffer_.data(), buffer_.size());
+  }
+  if (!box_until_eof_ && to_decode.size() > box_size_) {
+    JXL_ABORT("JPEG reconstruction data to decode larger than expected");
+  }
+  if (box_until_eof_ || to_decode.size() == box_size_) {
+    // If undefined size, or the right size, try to decode.
+    jpeg_data_ = make_unique<jpeg::JPEGData>();
+    const auto status = jpeg::DecodeJPEGData(to_decode, jpeg_data_.get());
+    if (status.IsFatalError()) return JXL_DEC_ERROR;
+    if (status) {
+      // Successful decoding, emit event after updating state to track that we
+      // are no longer parsing JPEG reconstruction data.
+      inside_box_ = false;
+      return JXL_DEC_JPEG_RECONSTRUCTION;
+    }
+    if (box_until_eof_) {
+      // Unsuccessful decoding and undefined size, assume incomplete data. Copy
+      // the data if we haven't already.
+      if (!old_data_exists) {
+        buffer_.insert(buffer_.end(), to_decode.data(),
+                       to_decode.data() + to_decode.size());
+      }
+    } else {
+      // Unsuccessful decoding of correct amount of data, assume error.
+      return JXL_DEC_ERROR;
+    }
+  } else {
+    // Not enough data, copy the data if we haven't already.
+    if (!old_data_exists) {
+      buffer_.insert(buffer_.end(), to_decode.data(),
+                     to_decode.data() + to_decode.size());
+    }
+  }
+  return JXL_DEC_NEED_MORE_INPUT;
+}
+
+size_t JxlToJpegDecoder::NumExifMarkers(const jpeg::JPEGData& jpeg_data) {
+  size_t num = 0;
+  for (size_t i = 0; i < jpeg_data.app_data.size(); ++i) {
+    if (jpeg_data.app_marker_type[i] == jxl::jpeg::AppMarkerType::kExif) {
+      num++;
+    }
+  }
+  return num;
+}
+
+size_t JxlToJpegDecoder::NumXmpMarkers(const jpeg::JPEGData& jpeg_data) {
+  size_t num = 0;
+  for (size_t i = 0; i < jpeg_data.app_data.size(); ++i) {
+    if (jpeg_data.app_marker_type[i] == jxl::jpeg::AppMarkerType::kXMP) {
+      num++;
+    }
+  }
+  return num;
+}
+
+JxlDecoderStatus JxlToJpegDecoder::ExifBoxContentSize(
+    const jpeg::JPEGData& jpeg_data, size_t* size) {
+  for (size_t i = 0; i < jpeg_data.app_data.size(); ++i) {
+    if (jpeg_data.app_marker_type[i] == jxl::jpeg::AppMarkerType::kExif) {
+      if (jpeg_data.app_data[i].size() < 3 + sizeof(jpeg::kExifTag)) {
+        // too small for app marker header
+        return JXL_DEC_ERROR;
+      }
+      // The first 4 bytes are the TIFF header from the box contents, and are
+      // not included in the JPEG
+      *size = jpeg_data.app_data[i].size() + 4 - 3 - sizeof(jpeg::kExifTag);
+      return JXL_DEC_SUCCESS;
+    }
+  }
+  return JXL_DEC_ERROR;
+}
+
+JxlDecoderStatus JxlToJpegDecoder::XmlBoxContentSize(
+    const jpeg::JPEGData& jpeg_data, size_t* size) {
+  for (size_t i = 0; i < jpeg_data.app_data.size(); ++i) {
+    if (jpeg_data.app_marker_type[i] == jxl::jpeg::AppMarkerType::kXMP) {
+      if (jpeg_data.app_data[i].size() < 3 + sizeof(jpeg::kXMPTag)) {
+        // too small for app marker header
+        return JXL_DEC_ERROR;
+      }
+      *size = jpeg_data.app_data[i].size() - 3 - sizeof(jpeg::kXMPTag);
+      return JXL_DEC_SUCCESS;
+    }
+  }
+  return JXL_DEC_ERROR;
+}
+
+JxlDecoderStatus JxlToJpegDecoder::SetExif(const uint8_t* data, size_t size,
+                                           jpeg::JPEGData* jpeg_data) {
+  for (size_t i = 0; i < jpeg_data->app_data.size(); ++i) {
+    if (jpeg_data->app_marker_type[i] == jxl::jpeg::AppMarkerType::kExif) {
+      if (jpeg_data->app_data[i].size() !=
+          size + 3 + sizeof(jpeg::kExifTag) - 4)
+        return JXL_DEC_ERROR;
+      // The first 9 bytes are used for JPEG marker header.
+      jpeg_data->app_data[i][0] = 0xE1;
+      // The second and third byte are already filled in correctly
+      memcpy(jpeg_data->app_data[i].data() + 3, jpeg::kExifTag,
+             sizeof(jpeg::kExifTag));
+      // The first 4 bytes are the TIFF header from the box contents, and are
+      // not included in the JPEG
+      memcpy(jpeg_data->app_data[i].data() + 3 + sizeof(jpeg::kExifTag),
+             data + 4, size - 4);
+      return JXL_DEC_SUCCESS;
+    }
+  }
+  return JXL_DEC_ERROR;
+}
+JxlDecoderStatus JxlToJpegDecoder::SetXmp(const uint8_t* data, size_t size,
+                                          jpeg::JPEGData* jpeg_data) {
+  for (size_t i = 0; i < jpeg_data->app_data.size(); ++i) {
+    if (jpeg_data->app_marker_type[i] == jxl::jpeg::AppMarkerType::kXMP) {
+      if (jpeg_data->app_data[i].size() != size + 3 + sizeof(jpeg::kXMPTag))
+        return JXL_DEC_ERROR;
+      // The first 9 bytes are used for JPEG marker header.
+      jpeg_data->app_data[i][0] = 0xE1;
+      // The second and third byte are already filled in correctly
+      memcpy(jpeg_data->app_data[i].data() + 3, jpeg::kXMPTag,
+             sizeof(jpeg::kXMPTag));
+      memcpy(jpeg_data->app_data[i].data() + 3 + sizeof(jpeg::kXMPTag), data,
+             size);
+      return JXL_DEC_SUCCESS;
+    }
+  }
+  return JXL_DEC_ERROR;
+}
+
+#endif  // JPEGXL_ENABLE_TRANSCODE_JPEG
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/decode_to_jpeg.h b/third_party/jpeg-xl/lib/jxl/decode_to_jpeg.h
new file mode 100644
index 0000000000..a64ace27a2
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/decode_to_jpeg.h
@@ -0,0 +1,217 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_DECODE_TO_JPEG_H_
+#define LIB_JXL_DECODE_TO_JPEG_H_
+
+// JPEG XL to JPEG bytes decoder logic. The JxlToJpegDecoder class keeps track
+// of the decoder state needed to parse the JPEG reconstruction box and provide
+// the reconstructed JPEG to the output buffer.
+
+#include <jxl/decode.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+#include <memory>
+#include <vector>
+
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/common.h"  // JPEGXL_ENABLE_TRANSCODE_JPEG
+#include "lib/jxl/image_bundle.h"
+#include "lib/jxl/jpeg/dec_jpeg_data.h"
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
+#include "lib/jxl/jpeg/dec_jpeg_data_writer.h"
+#endif  // JPEGXL_ENABLE_TRANSCODE_JPEG
+
+namespace jxl {
+
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
+
+class JxlToJpegDecoder {
+ public:
+  // Returns whether an output buffer is set.
+  bool IsOutputSet() const { return next_out_ != nullptr; }
+
+  // Returns whether the decoder is parsing a boxa JPEG box was parsed.
+  bool IsParsingBox() const { return inside_box_; }
+
+  // Sets the output buffer used when producing JPEG output.
+  JxlDecoderStatus SetOutputBuffer(uint8_t* data, size_t size) {
+    if (next_out_) return JXL_DEC_ERROR;
+    next_out_ = data;
+    avail_size_ = size;
+    return JXL_DEC_SUCCESS;
+  }
+
+  // Releases the buffer set with SetOutputBuffer().
+  size_t ReleaseOutputBuffer() {
+    size_t result = avail_size_;
+    next_out_ = nullptr;
+    avail_size_ = 0;
+    return result;
+  }
+
+  void StartBox(bool box_until_eof, size_t contents_size) {
+    // A new box implies that we clear the buffer.
+    buffer_.clear();
+    inside_box_ = true;
+    if (box_until_eof) {
+      box_until_eof_ = true;
+    } else {
+      box_size_ = contents_size;
+    }
+  }
+
+  // Consumes data from next_in/avail_in to reconstruct JPEG data.
+  // Uses box_size_, inside_box_ and box_until_eof_ to calculate how much to
+  // consume. Potentially stores unparsed data in buffer_.
+  // Potentially populates jpeg_data_. Potentially updates inside_box_.
+  // Returns JXL_DEC_JPEG_RECONSTRUCTION when finished, JXL_DEC_NEED_MORE_INPUT
+  // if more input is needed, JXL_DEC_ERROR on parsing error.
+  JxlDecoderStatus Process(const uint8_t** next_in, size_t* avail_in);
+
+  // Returns non-owned copy of the JPEGData, only after Process finished and
+  // the JPEGData was not yet moved to an image bundle with
+  // SetImageBundleJpegData.
+  jpeg::JPEGData* GetJpegData() { return jpeg_data_.get(); }
+
+  // Returns how many exif or xmp app markers are present in the JPEG data. A
+  // return value higher than 1 would require multiple exif boxes or multiple
+  // xmp boxes in the container format, and this is not supported by the API and
+  // considered an error. May only be called after Process returned success.
+  static size_t NumExifMarkers(const jpeg::JPEGData& jpeg_data);
+  static size_t NumXmpMarkers(const jpeg::JPEGData& jpeg_data);
+
+  // Returns box content size for metadata, using the known data from the app
+  // markers.
+  static JxlDecoderStatus ExifBoxContentSize(const jpeg::JPEGData& jpeg_data,
+                                             size_t* size);
+  static JxlDecoderStatus XmlBoxContentSize(const jpeg::JPEGData& jpeg_data,
+                                            size_t* size);
+
+  // Returns JXL_DEC_ERROR if there is no exif/XMP marker or the data size
+  // does not match, or this function is called before Process returned
+  // success, JXL_DEC_SUCCESS otherwise. As input, provide the full box contents
+  // but not the box header. In case of exif, this includes the 4-byte TIFF
+  // header, even though it won't be copied into the JPEG.
+  static JxlDecoderStatus SetExif(const uint8_t* data, size_t size,
+                                  jpeg::JPEGData* jpeg_data);
+  static JxlDecoderStatus SetXmp(const uint8_t* data, size_t size,
+                                 jpeg::JPEGData* jpeg_data);
+
+  // Sets the JpegData of the ImageBundle passed if there is anything to set.
+  // Releases the JpegData from this decoder if set.
+  Status SetImageBundleJpegData(ImageBundle* ib) {
+    if (IsOutputSet() && jpeg_data_ != nullptr) {
+      if (!jpeg::SetJPEGDataFromICC(ib->metadata()->color_encoding.ICC(),
+                                    jpeg_data_.get())) {
+        return false;
+      }
+      ib->jpeg_data.reset(jpeg_data_.release());
+    }
+    return true;
+  }
+
+  JxlDecoderStatus WriteOutput(const jpeg::JPEGData& jpeg_data) {
+    // Copy JPEG bytestream if desired.
+    uint8_t* tmp_next_out = next_out_;
+    size_t tmp_avail_size = avail_size_;
+    auto write = [&tmp_next_out, &tmp_avail_size](const uint8_t* buf,
+                                                  size_t len) {
+      size_t to_write = std::min<size_t>(tmp_avail_size, len);
+      if (to_write != 0) memcpy(tmp_next_out, buf, to_write);
+      tmp_next_out += to_write;
+      tmp_avail_size -= to_write;
+      return to_write;
+    };
+    Status write_result = jpeg::WriteJpeg(jpeg_data, write);
+    if (!write_result) {
+      if (tmp_avail_size == 0) {
+        return JXL_DEC_JPEG_NEED_MORE_OUTPUT;
+      }
+      return JXL_DEC_ERROR;
+    }
+    next_out_ = tmp_next_out;
+    avail_size_ = tmp_avail_size;
+    return JXL_DEC_SUCCESS;
+  }
+
+ private:
+  // Content of the most recently parsed JPEG reconstruction box if any.
+  std::vector<uint8_t> buffer_;
+
+  // Decoded content of the most recently parsed JPEG reconstruction box is
+  // stored here.
+  std::unique_ptr<jpeg::JPEGData> jpeg_data_;
+
+  // True if the decoder is currently reading bytes inside a JPEG reconstruction
+  // box.
+  bool inside_box_ = false;
+
+  // True if the JPEG reconstruction box had undefined size (all remaining
+  // bytes).
+  bool box_until_eof_ = false;
+  // Size of most recently parsed JPEG reconstruction box contents.
+  size_t box_size_ = 0;
+
+  // Next bytes to write JPEG reconstruction to.
+  uint8_t* next_out_ = nullptr;
+  // Available bytes to write JPEG reconstruction to.
+  size_t avail_size_ = 0;
+};
+
+#else
+
+// Fake class that disables support for decoding JPEG XL to JPEG.
+class JxlToJpegDecoder {
+ public:
+  bool IsOutputSet() const { return false; }
+  bool IsParsingBox() const { return false; }
+
+  JxlDecoderStatus SetOutputBuffer(uint8_t* /* data */, size_t /* size */) {
+    return JXL_DEC_ERROR;
+  }
+  size_t ReleaseOutputBuffer() { return 0; }
+
+  void StartBox(bool /* box_until_eof */, size_t /* contents_size */) {}
+
+  JxlDecoderStatus Process(const uint8_t** next_in, size_t* avail_in) {
+    return JXL_DEC_ERROR;
+  }
+  jpeg::JPEGData* GetJpegData() { return nullptr; }
+
+  Status SetImageBundleJpegData(ImageBundle* /* ib */) { return true; }
+
+  static size_t NumExifMarkers(const jpeg::JPEGData& /*jpeg_data*/) {
+    return 0;
+  }
+  static size_t NumXmpMarkers(const jpeg::JPEGData& /*jpeg_data*/) { return 0; }
+  static size_t ExifBoxContentSize(const jpeg::JPEGData& /*jpeg_data*/,
+                                   size_t* /*size*/) {
+    return JXL_DEC_ERROR;
+  }
+  static size_t XmlBoxContentSize(const jpeg::JPEGData& /*jpeg_data*/,
+                                  size_t* /*size*/) {
+    return JXL_DEC_ERROR;
+  }
+  static JxlDecoderStatus SetExif(const uint8_t* /*data*/, size_t /*size*/,
+                                  jpeg::JPEGData* /*jpeg_data*/) {
+    return JXL_DEC_ERROR;
+  }
+  static JxlDecoderStatus SetXmp(const uint8_t* /*data*/, size_t /*size*/,
+                                 jpeg::JPEGData* /*jpeg_data*/) {
+    return JXL_DEC_ERROR;
+  }
+
+  JxlDecoderStatus WriteOutput(const jpeg::JPEGData& /* jpeg_data */) {
+    return JXL_DEC_SUCCESS;
+  }
+};
+
+#endif  // JPEGXL_ENABLE_TRANSCODE_JPEG
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_DECODE_TO_JPEG_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_ac_strategy.cc b/third_party/jpeg-xl/lib/jxl/enc_ac_strategy.cc
new file mode 100644
index 0000000000..2b4d84196f
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_ac_strategy.cc
@@ -0,0 +1,1168 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_ac_strategy.h"
+
+#include <stdint.h>
+#include <string.h>
+
+#include <algorithm>
+#include <cmath>
+#include <cstdio>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/enc_ac_strategy.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/ac_strategy.h"
+#include "lib/jxl/ans_params.h"
+#include "lib/jxl/base/bits.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/profiler.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/coeff_order_fwd.h"
+#include "lib/jxl/convolve.h"
+#include "lib/jxl/dct_scales.h"
+#include "lib/jxl/enc_aux_out.h"
+#include "lib/jxl/enc_params.h"
+#include "lib/jxl/enc_transforms-inl.h"
+#include "lib/jxl/entropy_coder.h"
+#include "lib/jxl/fast_math-inl.h"
+
+// Some of the floating point constants in this file and in other
+// files in the libjxl project have been obtained using the
+// tools/optimizer/simplex_fork.py tool. It is a variation of
+// Nelder-Mead optimization, and we generally try to minimize
+// BPP * pnorm aggregate as reported by the benchmark_xl tool,
+// but occasionally the values are optimized by using additional
+// constraints such as maintaining a certain density, or ratio of
+// popularity of integral transforms. Jyrki visually reviews all
+// such changes and often makes manual changes to maintain good
+// visual quality to changes where butteraugli was not sufficiently
+// sensitive to some kind of degradation. Unfortunately image quality
+// is still more of an art than science.
+
+// This must come before the begin/end_target, but HWY_ONCE is only true
+// after that, so use an "include guard".
+#ifndef LIB_JXL_ENC_AC_STRATEGY_
+#define LIB_JXL_ENC_AC_STRATEGY_
+// Parameters of the heuristic are marked with a OPTIMIZE comment.
+namespace jxl {
+
+// Debugging utilities.
+
+// Returns a linear sRGB color (as bytes) for each AC strategy.
+const uint8_t* TypeColor(const uint8_t& raw_strategy) {
+  JXL_ASSERT(AcStrategy::IsRawStrategyValid(raw_strategy));
+  static_assert(AcStrategy::kNumValidStrategies == 27, "Change colors");
+  static constexpr uint8_t kColors[][3] = {
+      {0xFF, 0xFF, 0x00},  // DCT8
+      {0xFF, 0x80, 0x80},  // HORNUSS
+      {0xFF, 0x80, 0x80},  // DCT2x2
+      {0xFF, 0x80, 0x80},  // DCT4x4
+      {0x80, 0xFF, 0x00},  // DCT16x16
+      {0x00, 0xC0, 0x00},  // DCT32x32
+      {0xC0, 0xFF, 0x00},  // DCT16x8
+      {0xC0, 0xFF, 0x00},  // DCT8x16
+      {0x00, 0xFF, 0x00},  // DCT32x8
+      {0x00, 0xFF, 0x00},  // DCT8x32
+      {0x00, 0xFF, 0x00},  // DCT32x16
+      {0x00, 0xFF, 0x00},  // DCT16x32
+      {0xFF, 0x80, 0x00},  // DCT4x8
+      {0xFF, 0x80, 0x00},  // DCT8x4
+      {0xFF, 0xFF, 0x80},  // AFV0
+      {0xFF, 0xFF, 0x80},  // AFV1
+      {0xFF, 0xFF, 0x80},  // AFV2
+      {0xFF, 0xFF, 0x80},  // AFV3
+      {0x00, 0xC0, 0xFF},  // DCT64x64
+      {0x00, 0xFF, 0xFF},  // DCT64x32
+      {0x00, 0xFF, 0xFF},  // DCT32x64
+      {0x00, 0x40, 0xFF},  // DCT128x128
+      {0x00, 0x80, 0xFF},  // DCT128x64
+      {0x00, 0x80, 0xFF},  // DCT64x128
+      {0x00, 0x00, 0xC0},  // DCT256x256
+      {0x00, 0x00, 0xFF},  // DCT256x128
+      {0x00, 0x00, 0xFF},  // DCT128x256
+  };
+  return kColors[raw_strategy];
+}
+
+const uint8_t* TypeMask(const uint8_t& raw_strategy) {
+  JXL_ASSERT(AcStrategy::IsRawStrategyValid(raw_strategy));
+  static_assert(AcStrategy::kNumValidStrategies == 27, "Add masks");
+  // implicitly, first row and column is made dark
+  static constexpr uint8_t kMask[][64] = {
+      {
+          0, 0, 0, 0, 0, 0, 0, 0,  //
+          0, 0, 0, 0, 0, 0, 0, 0,  //
+          0, 0, 0, 0, 0, 0, 0, 0,  //
+          0, 0, 0, 0, 0, 0, 0, 0,  //
+          0, 0, 0, 0, 0, 0, 0, 0,  //
+          0, 0, 0, 0, 0, 0, 0, 0,  //
+          0, 0, 0, 0, 0, 0, 0, 0,  //
+          0, 0, 0, 0, 0, 0, 0, 0,  //
+      },                           // DCT8
+      {
+          0, 0, 0, 0, 0, 0, 0, 0,  //
+          0, 0, 0, 0, 0, 0, 0, 0,  //
+          0, 0, 1, 0, 0, 1, 0, 0,  //
+          0, 0, 1, 0, 0, 1, 0, 0,  //
+          0, 0, 1, 1, 1, 1, 0, 0,  //
+          0, 0, 1, 0, 0, 1, 0, 0,  //
+          0, 0, 1, 0, 0, 1, 0, 0,  //
+          0, 0, 0, 0, 0, 0, 0, 0,  //
+      },                           // HORNUSS
+      {
+          1, 1, 1, 1, 1, 1, 1, 1,  //
+          1, 0, 1, 0, 1, 0, 1, 0,  //
+          1, 1, 1, 1, 1, 1, 1, 1,  //
+          1, 0, 1, 0, 1, 0, 1, 0,  //
+          1, 1, 1, 1, 1, 1, 1, 1,  //
+          1, 0, 1, 0, 1, 0, 1, 0,  //
+          1, 1, 1, 1, 1, 1, 1, 1,  //
+          1, 0, 1, 0, 1, 0, 1, 0,  //
+      },                           // 2x2
+      {
+          0, 0, 0, 0, 1, 0, 0, 0,  //
+          0, 0, 0, 0, 1, 0, 0, 0,  //
+          0, 0, 0, 0, 1, 0, 0, 0,  //
+          0, 0, 0, 0, 1, 0, 0, 0,  //
+          1, 1, 1, 1, 1, 1, 1, 1,  //
+          0, 0, 0, 0, 1, 0, 0, 0,  //
+          0, 0, 0, 0, 1, 0, 0, 0,  //
+          0, 0, 0, 0, 1, 0, 0, 0,  //
+      },                           // 4x4
+      {},                          // DCT16x16 (unused)
+      {},                          // DCT32x32 (unused)
+      {},                          // DCT16x8 (unused)
+      {},                          // DCT8x16 (unused)
+      {},                          // DCT32x8 (unused)
+      {},                          // DCT8x32 (unused)
+      {},                          // DCT32x16 (unused)
+      {},                          // DCT16x32 (unused)
+      {
+          0, 0, 0, 0, 0, 0, 0, 0,  //
+          0, 0, 0, 0, 0, 0, 0, 0,  //
+          0, 0, 0, 0, 0, 0, 0, 0,  //
+          0, 0, 0, 0, 0, 0, 0, 0,  //
+          1, 1, 1, 1, 1, 1, 1, 1,  //
+          0, 0, 0, 0, 0, 0, 0, 0,  //
+          0, 0, 0, 0, 0, 0, 0, 0,  //
+          0, 0, 0, 0, 0, 0, 0, 0,  //
+      },                           // DCT4x8
+      {
+          0, 0, 0, 0, 1, 0, 0, 0,  //
+          0, 0, 0, 0, 1, 0, 0, 0,  //
+          0, 0, 0, 0, 1, 0, 0, 0,  //
+          0, 0, 0, 0, 1, 0, 0, 0,  //
+          0, 0, 0, 0, 1, 0, 0, 0,  //
+          0, 0, 0, 0, 1, 0, 0, 0,  //
+          0, 0, 0, 0, 1, 0, 0, 0,  //
+          0, 0, 0, 0, 1, 0, 0, 0,  //
+      },                           // DCT8x4
+      {
+          1, 1, 1, 1, 1, 0, 0, 0,  //
+          1, 1, 1, 1, 0, 0, 0, 0,  //
+          1, 1, 1, 0, 0, 0, 0, 0,  //
+          1, 1, 0, 0, 0, 0, 0, 0,  //
+          1, 0, 0, 0, 0, 0, 0, 0,  //
+          0, 0, 0, 0, 0, 0, 0, 0,  //
+          0, 0, 0, 0, 0, 0, 0, 0,  //
+          0, 0, 0, 0, 0, 0, 0, 0,  //
+      },                           // AFV0
+      {
+          0, 0, 0, 0, 1, 1, 1, 1,  //
+          0, 0, 0, 0, 0, 1, 1, 1,  //
+          0, 0, 0, 0, 0, 0, 1, 1,  //
+          0, 0, 0, 0, 0, 0, 0, 1,  //
+          0, 0, 0, 0, 0, 0, 0, 0,  //
+          0, 0, 0, 0, 0, 0, 0, 0,  //
+          0, 0, 0, 0, 0, 0, 0, 0,  //
+          0, 0, 0, 0, 0, 0, 0, 0,  //
+      },                           // AFV1
+      {
+          0, 0, 0, 0, 0, 0, 0, 0,  //
+          0, 0, 0, 0, 0, 0, 0, 0,  //
+          0, 0, 0, 0, 0, 0, 0, 0,  //
+          0, 0, 0, 0, 0, 0, 0, 0,  //
+          1, 0, 0, 0, 0, 0, 0, 0,  //
+          1, 1, 0, 0, 0, 0, 0, 0,  //
+          1, 1, 1, 0, 0, 0, 0, 0,  //
+          1, 1, 1, 1, 0, 0, 0, 0,  //
+      },                           // AFV2
+      {
+          0, 0, 0, 0, 0, 0, 0, 0,  //
+          0, 0, 0, 0, 0, 0, 0, 0,  //
+          0, 0, 0, 0, 0, 0, 0, 0,  //
+          0, 0, 0, 0, 0, 0, 0, 0,  //
+          0, 0, 0, 0, 0, 0, 0, 0,  //
+          0, 0, 0, 0, 0, 0, 0, 1,  //
+          0, 0, 0, 0, 0, 0, 1, 1,  //
+          0, 0, 0, 0, 0, 1, 1, 1,  //
+      },                           // AFV3
+  };
+  return kMask[raw_strategy];
+}
+
+void DumpAcStrategy(const AcStrategyImage& ac_strategy, size_t xsize,
+                    size_t ysize, const char* tag, AuxOut* aux_out) {
+  Image3F color_acs(xsize, ysize);
+  for (size_t y = 0; y < ysize; y++) {
+    float* JXL_RESTRICT rows[3] = {
+        color_acs.PlaneRow(0, y),
+        color_acs.PlaneRow(1, y),
+        color_acs.PlaneRow(2, y),
+    };
+    const AcStrategyRow acs_row = ac_strategy.ConstRow(y / kBlockDim);
+    for (size_t x = 0; x < xsize; x++) {
+      AcStrategy acs = acs_row[x / kBlockDim];
+      const uint8_t* JXL_RESTRICT color = TypeColor(acs.RawStrategy());
+      for (size_t c = 0; c < 3; c++) {
+        rows[c][x] = color[c] / 255.f;
+      }
+    }
+  }
+  size_t stride = color_acs.PixelsPerRow();
+  for (size_t c = 0; c < 3; c++) {
+    for (size_t by = 0; by < DivCeil(ysize, kBlockDim); by++) {
+      float* JXL_RESTRICT row = color_acs.PlaneRow(c, by * kBlockDim);
+      const AcStrategyRow acs_row = ac_strategy.ConstRow(by);
+      for (size_t bx = 0; bx < DivCeil(xsize, kBlockDim); bx++) {
+        AcStrategy acs = acs_row[bx];
+        if (!acs.IsFirstBlock()) continue;
+        const uint8_t* JXL_RESTRICT color = TypeColor(acs.RawStrategy());
+        const uint8_t* JXL_RESTRICT mask = TypeMask(acs.RawStrategy());
+        if (acs.covered_blocks_x() == 1 && acs.covered_blocks_y() == 1) {
+          for (size_t iy = 0; iy < kBlockDim && by * kBlockDim + iy < ysize;
+               iy++) {
+            for (size_t ix = 0; ix < kBlockDim && bx * kBlockDim + ix < xsize;
+                 ix++) {
+              if (mask[iy * kBlockDim + ix]) {
+                row[iy * stride + bx * kBlockDim + ix] = color[c] / 800.f;
+              }
+            }
+          }
+        }
+        // draw block edges
+        for (size_t ix = 0; ix < kBlockDim * acs.covered_blocks_x() &&
+                            bx * kBlockDim + ix < xsize;
+             ix++) {
+          row[0 * stride + bx * kBlockDim + ix] = color[c] / 350.f;
+        }
+        for (size_t iy = 0; iy < kBlockDim * acs.covered_blocks_y() &&
+                            by * kBlockDim + iy < ysize;
+             iy++) {
+          row[iy * stride + bx * kBlockDim + 0] = color[c] / 350.f;
+        }
+      }
+    }
+  }
+  aux_out->DumpImage(tag, color_acs);
+}
+
+}  // namespace jxl
+#endif  // LIB_JXL_ENC_AC_STRATEGY_
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::AbsDiff;
+using hwy::HWY_NAMESPACE::Eq;
+using hwy::HWY_NAMESPACE::IfThenElseZero;
+using hwy::HWY_NAMESPACE::IfThenZeroElse;
+using hwy::HWY_NAMESPACE::Round;
+using hwy::HWY_NAMESPACE::Sqrt;
+
+bool MultiBlockTransformCrossesHorizontalBoundary(
+    const AcStrategyImage& ac_strategy, size_t start_x, size_t y,
+    size_t end_x) {
+  if (start_x >= ac_strategy.xsize() || y >= ac_strategy.ysize()) {
+    return false;
+  }
+  if (y % 8 == 0) {
+    // Nothing crosses 64x64 boundaries, and the memory on the other side
+    // of the 64x64 block may still uninitialized.
+    return false;
+  }
+  end_x = std::min(end_x, ac_strategy.xsize());
+  // The first multiblock might be before the start_x, let's adjust it
+  // to point to the first IsFirstBlock() == true block we find by backward
+  // tracing.
+  AcStrategyRow row = ac_strategy.ConstRow(y);
+  const size_t start_x_limit = start_x & ~7;
+  while (start_x != start_x_limit && !row[start_x].IsFirstBlock()) {
+    --start_x;
+  }
+  for (size_t x = start_x; x < end_x;) {
+    if (row[x].IsFirstBlock()) {
+      x += row[x].covered_blocks_x();
+    } else {
+      return true;
+    }
+  }
+  return false;
+}
+
+bool MultiBlockTransformCrossesVerticalBoundary(
+    const AcStrategyImage& ac_strategy, size_t x, size_t start_y,
+    size_t end_y) {
+  if (x >= ac_strategy.xsize() || start_y >= ac_strategy.ysize()) {
+    return false;
+  }
+  if (x % 8 == 0) {
+    // Nothing crosses 64x64 boundaries, and the memory on the other side
+    // of the 64x64 block may still uninitialized.
+    return false;
+  }
+  end_y = std::min(end_y, ac_strategy.ysize());
+  // The first multiblock might be before the start_y, let's adjust it
+  // to point to the first IsFirstBlock() == true block we find by backward
+  // tracing.
+  const size_t start_y_limit = start_y & ~7;
+  while (start_y != start_y_limit &&
+         !ac_strategy.ConstRow(start_y)[x].IsFirstBlock()) {
+    --start_y;
+  }
+
+  for (size_t y = start_y; y < end_y;) {
+    AcStrategyRow row = ac_strategy.ConstRow(y);
+    if (row[x].IsFirstBlock()) {
+      y += row[x].covered_blocks_y();
+    } else {
+      return true;
+    }
+  }
+  return false;
+}
+
+static const float kChromaErrorWeight[AcStrategy::kNumValidStrategies] = {
+    0.95f,  // DCT = 0,
+    1.0f,   // IDENTITY = 1,
+    0.5f,   // DCT2X2 = 2,
+    1.0f,   // DCT4X4 = 3,
+    2.0f,   // DCT16X16 = 4,
+    2.0f,   // DCT32X32 = 5,
+    1.4f,   // DCT16X8 = 6,
+    1.4f,   // DCT8X16 = 7,
+    2.0f,   // DCT32X8 = 8,
+    2.0f,   // DCT8X32 = 9,
+    2.0f,   // DCT32X16 = 10,
+    2.0f,   // DCT16X32 = 11,
+    2.0f,   // DCT4X8 = 12,
+    2.0f,   // DCT8X4 = 13,
+    1.7f,   // AFV0 = 14,
+    1.7f,   // AFV1 = 15,
+    1.7f,   // AFV2 = 16,
+    1.7f,   // AFV3 = 17,
+    2.0f,   // DCT64X64 = 18,
+    2.0f,   // DCT64X32 = 19,
+    2.0f,   // DCT32X64 = 20,
+    2.0f,   // DCT128X128 = 21,
+    2.0f,   // DCT128X64 = 22,
+    2.0f,   // DCT64X128 = 23,
+    2.0f,   // DCT256X256 = 24,
+    2.0f,   // DCT256X128 = 25,
+    2.0f,   // DCT128X256 = 26,
+};
+
+float EstimateEntropy(const AcStrategy& acs, size_t x, size_t y,
+                      const ACSConfig& config,
+                      const float* JXL_RESTRICT cmap_factors, float* block,
+                      float* scratch_space, uint32_t* quantized) {
+  const size_t size = (1 << acs.log2_covered_blocks()) * kDCTBlockSize;
+
+  // Apply transform.
+  for (size_t c = 0; c < 3; c++) {
+    float* JXL_RESTRICT block_c = block + size * c;
+    TransformFromPixels(acs.Strategy(), &config.Pixel(c, x, y),
+                        config.src_stride, block_c, scratch_space);
+  }
+
+  HWY_FULL(float) df;
+
+  const size_t num_blocks = acs.covered_blocks_x() * acs.covered_blocks_y();
+  // avoid large blocks when there is a lot going on in red-green.
+  float cmul[3] = {kChromaErrorWeight[acs.RawStrategy()], 1.0f, 1.0f};
+  float quant_norm8 = 0;
+  float masking = 0;
+  if (num_blocks == 1) {
+    // When it is only one 8x8, we don't need aggregation of values.
+    quant_norm8 = config.Quant(x / 8, y / 8);
+    masking = 2.0f * config.Masking(x / 8, y / 8);
+  } else if (num_blocks == 2) {
+    // Taking max instead of 8th norm seems to work
+    // better for smallest blocks up to 16x8. Jyrki couldn't get
+    // improvements in trying the same for 16x16 blocks.
+    if (acs.covered_blocks_y() == 2) {
+      quant_norm8 =
+          std::max(config.Quant(x / 8, y / 8), config.Quant(x / 8, y / 8 + 1));
+      masking = 2.0f * std::max(config.Masking(x / 8, y / 8),
+                                config.Masking(x / 8, y / 8 + 1));
+    } else {
+      quant_norm8 =
+          std::max(config.Quant(x / 8, y / 8), config.Quant(x / 8 + 1, y / 8));
+      masking = 2.0f * std::max(config.Masking(x / 8, y / 8),
+                                config.Masking(x / 8 + 1, y / 8));
+    }
+  } else {
+    float masking_norm2 = 0;
+    float masking_max = 0;
+    // Load QF value, calculate empirical heuristic on masking field
+    // for weighting the information loss. Information loss manifests
+    // itself as ringing, and masking could hide it.
+    for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) {
+      for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) {
+        float qval = config.Quant(x / 8 + ix, y / 8 + iy);
+        qval *= qval;
+        qval *= qval;
+        quant_norm8 += qval * qval;
+        float maskval = config.Masking(x / 8 + ix, y / 8 + iy);
+        masking_max = std::max<float>(masking_max, maskval);
+        masking_norm2 += maskval * maskval;
+      }
+    }
+    quant_norm8 /= num_blocks;
+    quant_norm8 = FastPowf(quant_norm8, 1.0f / 8.0f);
+    masking_norm2 = sqrt(masking_norm2 / num_blocks);
+    // This is a highly empirical formula.
+    masking = (masking_norm2 + masking_max);
+  }
+  const auto q = Set(df, quant_norm8);
+
+  // Compute entropy.
+  float entropy = config.base_entropy;
+  auto info_loss = Zero(df);
+  auto info_loss2 = Zero(df);
+
+  for (size_t c = 0; c < 3; c++) {
+    const float* inv_matrix = config.dequant->InvMatrix(acs.RawStrategy(), c);
+    const auto cmap_factor = Set(df, cmap_factors[c]);
+
+    auto entropy_v = Zero(df);
+    auto nzeros_v = Zero(df);
+    auto cost1 = Set(df, config.cost1);
+    auto cost2 = Set(df, config.cost2);
+    auto cost_delta = Set(df, config.cost_delta);
+    for (size_t i = 0; i < num_blocks * kDCTBlockSize; i += Lanes(df)) {
+      const auto in = Load(df, block + c * size + i);
+      const auto in_y = Mul(Load(df, block + size + i), cmap_factor);
+      const auto im = Load(df, inv_matrix + i);
+      const auto val = Mul(Sub(in, in_y), Mul(im, q));
+      const auto rval = Round(val);
+      const auto diff = AbsDiff(val, rval);
+      info_loss = Add(info_loss, diff);
+      info_loss2 = MulAdd(diff, diff, info_loss2);
+      const auto q = Abs(rval);
+      const auto q_is_zero = Eq(q, Zero(df));
+      entropy_v = Add(entropy_v, IfThenElseZero(Ge(q, Set(df, 1.5f)), cost2));
+      // We used to have q * C here, but that cost model seems to
+      // be punishing large values more than necessary. Sqrt tries
+      // to avoid large values less aggressively. Having high accuracy
+      // around zero is most important at low qualities, and there
+      // we have directly specified costs for 0, 1, and 2.
+      entropy_v = MulAdd(Sqrt(q), cost_delta, entropy_v);
+      nzeros_v = Add(nzeros_v, IfThenZeroElse(q_is_zero, Set(df, 1.0f)));
+    }
+    entropy_v = MulAdd(nzeros_v, cost1, entropy_v);
+
+    entropy += cmul[c] * GetLane(SumOfLanes(df, entropy_v));
+    size_t num_nzeros = GetLane(SumOfLanes(df, nzeros_v));
+    // Add #bit of num_nonzeros, as an estimate of the cost for encoding the
+    // number of non-zeros of the block.
+    size_t nbits = CeilLog2Nonzero(num_nzeros + 1) + 1;
+    // Also add #bit of #bit of num_nonzeros, to estimate the ANS cost, with a
+    // bias.
+    entropy += config.zeros_mul * (CeilLog2Nonzero(nbits + 17) + nbits);
+  }
+  float ret =
+      entropy +
+      masking *
+          ((config.info_loss_multiplier * GetLane(SumOfLanes(df, info_loss))) +
+           (config.info_loss_multiplier2 *
+            sqrt(num_blocks * GetLane(SumOfLanes(df, info_loss2)))));
+  return ret;
+}
+
+uint8_t FindBest8x8Transform(size_t x, size_t y, int encoding_speed_tier,
+                             const ACSConfig& config,
+                             const float* JXL_RESTRICT cmap_factors,
+                             AcStrategyImage* JXL_RESTRICT ac_strategy,
+                             float* block, float* scratch_space,
+                             uint32_t* quantized, float* entropy_out) {
+  struct TransformTry8x8 {
+    AcStrategy::Type type;
+    int encoding_speed_tier_max_limit;
+    float entropy_add;
+    float entropy_mul;
+  };
+  static const TransformTry8x8 kTransforms8x8[] = {
+      {
+          AcStrategy::Type::DCT,
+          9,
+          3.0f,
+          0.745f,
+      },
+      {
+          AcStrategy::Type::DCT4X4,
+          5,
+          4.0f,
+          0.7f,
+      },
+      {
+          AcStrategy::Type::DCT2X2,
+          5,
+          0.0f,
+          0.66f,
+      },
+      {
+          AcStrategy::Type::DCT4X8,
+          4,
+          0.0f,
+          0.700754622182473063f,
+      },
+      {
+          AcStrategy::Type::DCT8X4,
+          4,
+          0.0f,
+          0.700754622182473063f,
+      },
+      {
+          AcStrategy::Type::IDENTITY,
+          5,
+          8.0f,
+          0.81217614513585534f,
+      },
+      {
+          AcStrategy::Type::AFV0,
+          4,
+          3.0f,
+          0.70086131125719425f,
+      },
+      {
+          AcStrategy::Type::AFV1,
+          4,
+          3.0f,
+          0.70086131125719425f,
+      },
+      {
+          AcStrategy::Type::AFV2,
+          4,
+          3.0f,
+          0.70086131125719425f,
+      },
+      {
+          AcStrategy::Type::AFV3,
+          4,
+          3.0f,
+          0.70086131125719425f,
+      },
+  };
+  double best = 1e30;
+  uint8_t best_tx = kTransforms8x8[0].type;
+  for (auto tx : kTransforms8x8) {
+    if (tx.encoding_speed_tier_max_limit < encoding_speed_tier) {
+      continue;
+    }
+    AcStrategy acs = AcStrategy::FromRawStrategy(tx.type);
+    float entropy = EstimateEntropy(acs, x, y, config, cmap_factors, block,
+                                    scratch_space, quantized);
+    entropy = tx.entropy_add + tx.entropy_mul * entropy;
+    if (entropy < best) {
+      best_tx = tx.type;
+      best = entropy;
+    }
+  }
+  *entropy_out = best;
+  return best_tx;
+}
+
+// bx, by addresses the 64x64 block at 8x8 subresolution
+// cx, cy addresses the left, upper 8x8 block position of the candidate
+// transform.
+void TryMergeAcs(AcStrategy::Type acs_raw, size_t bx, size_t by, size_t cx,
+                 size_t cy, const ACSConfig& config,
+                 const float* JXL_RESTRICT cmap_factors,
+                 AcStrategyImage* JXL_RESTRICT ac_strategy,
+                 const float entropy_mul, const uint8_t candidate_priority,
+                 uint8_t* priority, float* JXL_RESTRICT entropy_estimate,
+                 float* block, float* scratch_space, uint32_t* quantized) {
+  AcStrategy acs = AcStrategy::FromRawStrategy(acs_raw);
+  float entropy_current = 0;
+  for (size_t iy = 0; iy < acs.covered_blocks_y(); ++iy) {
+    for (size_t ix = 0; ix < acs.covered_blocks_x(); ++ix) {
+      if (priority[(cy + iy) * 8 + (cx + ix)] >= candidate_priority) {
+        // Transform would reuse already allocated blocks and
+        // lead to invalid overlaps, for example DCT64X32 vs.
+        // DCT32X64.
+        return;
+      }
+      entropy_current += entropy_estimate[(cy + iy) * 8 + (cx + ix)];
+    }
+  }
+  float entropy_candidate =
+      entropy_mul * EstimateEntropy(acs, (bx + cx) * 8, (by + cy) * 8, config,
+                                    cmap_factors, block, scratch_space,
+                                    quantized);
+  if (entropy_candidate >= entropy_current) return;
+  // Accept the candidate.
+  for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) {
+    for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) {
+      entropy_estimate[(cy + iy) * 8 + cx + ix] = 0;
+      priority[(cy + iy) * 8 + cx + ix] = candidate_priority;
+    }
+  }
+  ac_strategy->Set(bx + cx, by + cy, acs_raw);
+  entropy_estimate[cy * 8 + cx] = entropy_candidate;
+}
+
+static void SetEntropyForTransform(size_t cx, size_t cy,
+                                   const AcStrategy::Type acs_raw,
+                                   float entropy,
+                                   float* JXL_RESTRICT entropy_estimate) {
+  const AcStrategy acs = AcStrategy::FromRawStrategy(acs_raw);
+  for (size_t dy = 0; dy < acs.covered_blocks_y(); ++dy) {
+    for (size_t dx = 0; dx < acs.covered_blocks_x(); ++dx) {
+      entropy_estimate[(cy + dy) * 8 + cx + dx] = 0.0;
+    }
+  }
+  entropy_estimate[cy * 8 + cx] = entropy;
+}
+
+AcStrategy::Type AcsSquare(size_t blocks) {
+  if (blocks == 2) {
+    return AcStrategy::Type::DCT16X16;
+  } else if (blocks == 4) {
+    return AcStrategy::Type::DCT32X32;
+  } else {
+    return AcStrategy::Type::DCT64X64;
+  }
+}
+
+AcStrategy::Type AcsVerticalSplit(size_t blocks) {
+  if (blocks == 2) {
+    return AcStrategy::Type::DCT16X8;
+  } else if (blocks == 4) {
+    return AcStrategy::Type::DCT32X16;
+  } else {
+    return AcStrategy::Type::DCT64X32;
+  }
+}
+
+AcStrategy::Type AcsHorizontalSplit(size_t blocks) {
+  if (blocks == 2) {
+    return AcStrategy::Type::DCT8X16;
+  } else if (blocks == 4) {
+    return AcStrategy::Type::DCT16X32;
+  } else {
+    return AcStrategy::Type::DCT32X64;
+  }
+}
+
+// The following function tries to merge smaller transforms into
+// squares and the rectangles originating from a single middle division
+// (horizontal or vertical) fairly.
+//
+// This is now generalized to concern about squares
+// of blocks X blocks size, where a block is 8x8 pixels.
+void FindBestFirstLevelDivisionForSquare(
+    size_t blocks, bool allow_square_transform, size_t bx, size_t by, size_t cx,
+    size_t cy, const ACSConfig& config, const float* JXL_RESTRICT cmap_factors,
+    AcStrategyImage* JXL_RESTRICT ac_strategy, const float entropy_mul_JXK,
+    const float entropy_mul_JXJ, float* JXL_RESTRICT entropy_estimate,
+    float* block, float* scratch_space, uint32_t* quantized) {
+  // We denote J for the larger dimension here, and K for the smaller.
+  // For example, for 32x32 block splitting, J would be 32, K 16.
+  const size_t blocks_half = blocks / 2;
+  const AcStrategy::Type acs_rawJXK = AcsVerticalSplit(blocks);
+  const AcStrategy::Type acs_rawKXJ = AcsHorizontalSplit(blocks);
+  const AcStrategy::Type acs_rawJXJ = AcsSquare(blocks);
+  const AcStrategy acsJXK = AcStrategy::FromRawStrategy(acs_rawJXK);
+  const AcStrategy acsKXJ = AcStrategy::FromRawStrategy(acs_rawKXJ);
+  const AcStrategy acsJXJ = AcStrategy::FromRawStrategy(acs_rawJXJ);
+  AcStrategyRow row0 = ac_strategy->ConstRow(by + cy + 0);
+  AcStrategyRow row1 = ac_strategy->ConstRow(by + cy + blocks_half);
+  // Let's check if we can consider a JXJ block here at all.
+  // This is not necessary in the basic use of hierarchically merging
+  // blocks in the simplest possible way, but is needed when we try other
+  // 'floating' options of merging, possibly after a simple hierarchical
+  // merge has been explored.
+  if (MultiBlockTransformCrossesHorizontalBoundary(*ac_strategy, bx + cx,
+                                                   by + cy, bx + cx + blocks) ||
+      MultiBlockTransformCrossesHorizontalBoundary(
+          *ac_strategy, bx + cx, by + cy + blocks, bx + cx + blocks) ||
+      MultiBlockTransformCrossesVerticalBoundary(*ac_strategy, bx + cx, by + cy,
+                                                 by + cy + blocks) ||
+      MultiBlockTransformCrossesVerticalBoundary(*ac_strategy, bx + cx + blocks,
+                                                 by + cy, by + cy + blocks)) {
+    return;  // not suitable for JxJ analysis, some transforms leak out.
+  }
+  // For floating transforms there may be
+  // already blocks selected that make either or both JXK and
+  // KXJ not feasible for this location.
+  const bool allow_JXK = !MultiBlockTransformCrossesVerticalBoundary(
+      *ac_strategy, bx + cx + blocks_half, by + cy, by + cy + blocks);
+  const bool allow_KXJ = !MultiBlockTransformCrossesHorizontalBoundary(
+      *ac_strategy, bx + cx, by + cy + blocks_half, bx + cx + blocks);
+  // Current entropies aggregated on NxN resolution.
+  float entropy[2][2] = {};
+  for (size_t dy = 0; dy < blocks; ++dy) {
+    for (size_t dx = 0; dx < blocks; ++dx) {
+      entropy[dy / blocks_half][dx / blocks_half] +=
+          entropy_estimate[(cy + dy) * 8 + (cx + dx)];
+    }
+  }
+  float entropy_JXK_left = std::numeric_limits<float>::max();
+  float entropy_JXK_right = std::numeric_limits<float>::max();
+  float entropy_KXJ_top = std::numeric_limits<float>::max();
+  float entropy_KXJ_bottom = std::numeric_limits<float>::max();
+  float entropy_JXJ = std::numeric_limits<float>::max();
+  if (allow_JXK) {
+    if (row0[bx + cx + 0].RawStrategy() != acs_rawJXK) {
+      entropy_JXK_left =
+          entropy_mul_JXK *
+          EstimateEntropy(acsJXK, (bx + cx + 0) * 8, (by + cy + 0) * 8, config,
+                          cmap_factors, block, scratch_space, quantized);
+    }
+    if (row0[bx + cx + blocks_half].RawStrategy() != acs_rawJXK) {
+      entropy_JXK_right =
+          entropy_mul_JXK * EstimateEntropy(acsJXK, (bx + cx + blocks_half) * 8,
+                                            (by + cy + 0) * 8, config,
+                                            cmap_factors, block, scratch_space,
+                                            quantized);
+    }
+  }
+  if (allow_KXJ) {
+    if (row0[bx + cx].RawStrategy() != acs_rawKXJ) {
+      entropy_KXJ_top =
+          entropy_mul_JXK *
+          EstimateEntropy(acsKXJ, (bx + cx + 0) * 8, (by + cy + 0) * 8, config,
+                          cmap_factors, block, scratch_space, quantized);
+    }
+    if (row1[bx + cx].RawStrategy() != acs_rawKXJ) {
+      entropy_KXJ_bottom =
+          entropy_mul_JXK * EstimateEntropy(acsKXJ, (bx + cx + 0) * 8,
+                                            (by + cy + blocks_half) * 8, config,
+                                            cmap_factors, block, scratch_space,
+                                            quantized);
+    }
+  }
+  if (allow_square_transform) {
+    // We control the exploration of the square transform separately so that
+    // we can turn it off at high decoding speeds for 32x32, but still allow
+    // exploring 16x32 and 32x16.
+    entropy_JXJ = entropy_mul_JXJ * EstimateEntropy(acsJXJ, (bx + cx + 0) * 8,
+                                                    (by + cy + 0) * 8, config,
+                                                    cmap_factors, block,
+                                                    scratch_space, quantized);
+  }
+
+  // Test if this block should have JXK or KXJ transforms,
+  // because it can have only one or the other.
+  float costJxN = std::min(entropy_JXK_left, entropy[0][0] + entropy[1][0]) +
+                  std::min(entropy_JXK_right, entropy[0][1] + entropy[1][1]);
+  float costNxJ = std::min(entropy_KXJ_top, entropy[0][0] + entropy[0][1]) +
+                  std::min(entropy_KXJ_bottom, entropy[1][0] + entropy[1][1]);
+  if (entropy_JXJ < costJxN && entropy_JXJ < costNxJ) {
+    ac_strategy->Set(bx + cx, by + cy, acs_rawJXJ);
+    SetEntropyForTransform(cx, cy, acs_rawJXJ, entropy_JXJ, entropy_estimate);
+  } else if (costJxN < costNxJ) {
+    if (entropy_JXK_left < entropy[0][0] + entropy[1][0]) {
+      ac_strategy->Set(bx + cx, by + cy, acs_rawJXK);
+      SetEntropyForTransform(cx, cy, acs_rawJXK, entropy_JXK_left,
+                             entropy_estimate);
+    }
+    if (entropy_JXK_right < entropy[0][1] + entropy[1][1]) {
+      ac_strategy->Set(bx + cx + blocks_half, by + cy, acs_rawJXK);
+      SetEntropyForTransform(cx + blocks_half, cy, acs_rawJXK,
+                             entropy_JXK_right, entropy_estimate);
+    }
+  } else {
+    if (entropy_KXJ_top < entropy[0][0] + entropy[0][1]) {
+      ac_strategy->Set(bx + cx, by + cy, acs_rawKXJ);
+      SetEntropyForTransform(cx, cy, acs_rawKXJ, entropy_KXJ_top,
+                             entropy_estimate);
+    }
+    if (entropy_KXJ_bottom < entropy[1][0] + entropy[1][1]) {
+      ac_strategy->Set(bx + cx, by + cy + blocks_half, acs_rawKXJ);
+      SetEntropyForTransform(cx, cy + blocks_half, acs_rawKXJ,
+                             entropy_KXJ_bottom, entropy_estimate);
+    }
+  }
+}
+
+void ProcessRectACS(PassesEncoderState* JXL_RESTRICT enc_state,
+                    const ACSConfig& config, const Rect& rect) {
+  // Main philosophy here:
+  // 1. First find best 8x8 transform for each area.
+  // 2. Merging them into larger transforms where possibly, but
+  // starting from the smallest transforms (16x8 and 8x16).
+  // Additional complication: 16x8 and 8x16 are considered
+  // simultanouesly and fairly against each other.
+  // We are looking at 64x64 squares since the YtoX and YtoB
+  // maps happen to be at that resolution, and having
+  // integral transforms cross these boundaries leads to
+  // additional complications.
+  const CompressParams& cparams = enc_state->cparams;
+  const float butteraugli_target = cparams.butteraugli_distance;
+  AcStrategyImage* ac_strategy = &enc_state->shared.ac_strategy;
+  // TODO(veluca): reuse allocations
+  auto mem = hwy::AllocateAligned<float>(5 * AcStrategy::kMaxCoeffArea);
+  auto qmem = hwy::AllocateAligned<uint32_t>(AcStrategy::kMaxCoeffArea);
+  uint32_t* JXL_RESTRICT quantized = qmem.get();
+  float* JXL_RESTRICT block = mem.get();
+  float* JXL_RESTRICT scratch_space = mem.get() + 3 * AcStrategy::kMaxCoeffArea;
+  size_t bx = rect.x0();
+  size_t by = rect.y0();
+  JXL_ASSERT(rect.xsize() <= 8);
+  JXL_ASSERT(rect.ysize() <= 8);
+  size_t tx = bx / kColorTileDimInBlocks;
+  size_t ty = by / kColorTileDimInBlocks;
+  const float cmap_factors[3] = {
+      enc_state->shared.cmap.YtoXRatio(
+          enc_state->shared.cmap.ytox_map.ConstRow(ty)[tx]),
+      0.0f,
+      enc_state->shared.cmap.YtoBRatio(
+          enc_state->shared.cmap.ytob_map.ConstRow(ty)[tx]),
+  };
+  if (cparams.speed_tier > SpeedTier::kHare) return;
+  // First compute the best 8x8 transform for each square. Later, we do not
+  // experiment with different combinations, but only use the best of the 8x8s
+  // when DCT8X8 is specified in the tree search.
+  // 8x8 transforms have 10 variants, but every larger transform is just a DCT.
+  float entropy_estimate[64] = {};
+  // Favor all 8x8 transforms (against 16x8 and larger transforms)) at
+  // low butteraugli_target distances.
+  static const float k8x8mul1 = -0.55;
+  static const float k8x8mul2 = 1.0;
+  static const float k8x8base = 1.4;
+  const float mul8x8 = k8x8mul2 + k8x8mul1 / (butteraugli_target + k8x8base);
+  for (size_t iy = 0; iy < rect.ysize(); iy++) {
+    for (size_t ix = 0; ix < rect.xsize(); ix++) {
+      float entropy = 0.0;
+      const uint8_t best_of_8x8s = FindBest8x8Transform(
+          8 * (bx + ix), 8 * (by + iy), static_cast<int>(cparams.speed_tier),
+          config, cmap_factors, ac_strategy, block, scratch_space, quantized,
+          &entropy);
+      ac_strategy->Set(bx + ix, by + iy,
+                       static_cast<AcStrategy::Type>(best_of_8x8s));
+      entropy_estimate[iy * 8 + ix] = entropy * mul8x8;
+    }
+  }
+  // Merge when a larger transform is better than the previously
+  // searched best combination of 8x8 transforms.
+  struct MergeTry {
+    AcStrategy::Type type;
+    uint8_t priority;
+    uint8_t decoding_speed_tier_max_limit;
+    uint8_t encoding_speed_tier_max_limit;
+    float entropy_mul;
+  };
+  static const float k8X16mul1 = -0.55;
+  static const float k8X16mul2 = 0.865;
+  static const float k8X16base = 1.6;
+  const float entropy_mul16X8 =
+      k8X16mul2 + k8X16mul1 / (butteraugli_target + k8X16base);
+  //  const float entropy_mul16X8 = mul8X16 * 0.91195782912371126f;
+
+  static const float k16X16mul1 = -0.35;
+  static const float k16X16mul2 = 0.798;
+  static const float k16X16base = 2.0;
+  const float entropy_mul16X16 =
+      k16X16mul2 + k16X16mul1 / (butteraugli_target + k16X16base);
+  //  const float entropy_mul16X16 = mul16X16 * 0.83183417727960129f;
+
+  static const float k32X16mul1 = -0.1;
+  static const float k32X16mul2 = 0.854;
+  static const float k32X16base = 2.5;
+  const float entropy_mul16X32 =
+      k32X16mul2 + k32X16mul1 / (butteraugli_target + k32X16base);
+
+  const float entropy_mul32X32 = 0.93;
+  const float entropy_mul64X64 = 1.52f;
+  // TODO(jyrki): Consider this feedback in further changes:
+  // Also effectively when the multipliers for smaller blocks are
+  // below 1, this raises the bar for the bigger blocks even higher
+  // in that sense these constants are not independent (e.g. changing
+  // the constant for DCT16x32 by -5% (making it more likely) also
+  // means that DCT32x32 becomes harder to do when starting from
+  // two DCT16x32s). It might be better to make them more independent,
+  // e.g. by not applying the multiplier when storing the new entropy
+  // estimates in TryMergeToACSCandidate().
+  const MergeTry kTransformsForMerge[9] = {
+      {AcStrategy::Type::DCT16X8, 2, 4, 5, entropy_mul16X8},
+      {AcStrategy::Type::DCT8X16, 2, 4, 5, entropy_mul16X8},
+      // FindBestFirstLevelDivisionForSquare looks for DCT16X16 and its
+      // subdivisions. {AcStrategy::Type::DCT16X16, 3, entropy_mul16X16},
+      {AcStrategy::Type::DCT16X32, 4, 4, 4, entropy_mul16X32},
+      {AcStrategy::Type::DCT32X16, 4, 4, 4, entropy_mul16X32},
+      // FindBestFirstLevelDivisionForSquare looks for DCT32X32 and its
+      // subdivisions. {AcStrategy::Type::DCT32X32, 5, 1, 5,
+      // 0.9822994906548809f},
+      {AcStrategy::Type::DCT64X32, 6, 1, 3, 1.29f},
+      {AcStrategy::Type::DCT32X64, 6, 1, 3, 1.29f},
+      // {AcStrategy::Type::DCT64X64, 8, 1, 3, 2.0846542128012948f},
+  };
+  /*
+  These sizes not yet included in merge heuristic:
+  set(AcStrategy::Type::DCT32X8, 0.0f, 2.261390410971102f);
+  set(AcStrategy::Type::DCT8X32, 0.0f, 2.261390410971102f);
+  set(AcStrategy::Type::DCT128X128, 0.0f, 1.0f);
+  set(AcStrategy::Type::DCT128X64, 0.0f, 0.73f);
+  set(AcStrategy::Type::DCT64X128, 0.0f, 0.73f);
+  set(AcStrategy::Type::DCT256X256, 0.0f, 1.0f);
+  set(AcStrategy::Type::DCT256X128, 0.0f, 0.73f);
+  set(AcStrategy::Type::DCT128X256, 0.0f, 0.73f);
+  */
+
+  // Priority is a tricky kludge to avoid collisions so that transforms
+  // don't overlap.
+  uint8_t priority[64] = {};
+  bool enable_32x32 = cparams.decoding_speed_tier < 4;
+  for (auto tx : kTransformsForMerge) {
+    if (tx.decoding_speed_tier_max_limit < cparams.decoding_speed_tier) {
+      continue;
+    }
+    AcStrategy acs = AcStrategy::FromRawStrategy(tx.type);
+
+    for (size_t cy = 0; cy + acs.covered_blocks_y() - 1 < rect.ysize();
+         cy += acs.covered_blocks_y()) {
+      for (size_t cx = 0; cx + acs.covered_blocks_x() - 1 < rect.xsize();
+           cx += acs.covered_blocks_x()) {
+        if (cy + 7 < rect.ysize() && cx + 7 < rect.xsize()) {
+          if (cparams.decoding_speed_tier < 4 &&
+              tx.type == AcStrategy::Type::DCT32X64) {
+            // We handle both DCT8X16 and DCT16X8 at the same time.
+            if ((cy | cx) % 8 == 0) {
+              FindBestFirstLevelDivisionForSquare(
+                  8, true, bx, by, cx, cy, config, cmap_factors, ac_strategy,
+                  tx.entropy_mul, entropy_mul64X64, entropy_estimate, block,
+                  scratch_space, quantized);
+            }
+            continue;
+          } else if (tx.type == AcStrategy::Type::DCT32X16) {
+            // We handled both DCT8X16 and DCT16X8 at the same time,
+            // and that is above. The last column and last row,
+            // when the last column or last row is odd numbered,
+            // are still handled by TryMergeAcs.
+            continue;
+          }
+        }
+        if ((tx.type == AcStrategy::Type::DCT16X32 && cy % 4 != 0) ||
+            (tx.type == AcStrategy::Type::DCT32X16 && cx % 4 != 0)) {
+          // already covered by FindBest32X32
+          continue;
+        }
+
+        if (cy + 3 < rect.ysize() && cx + 3 < rect.xsize()) {
+          if (tx.type == AcStrategy::Type::DCT16X32) {
+            // We handle both DCT8X16 and DCT16X8 at the same time.
+            if ((cy | cx) % 4 == 0) {
+              FindBestFirstLevelDivisionForSquare(
+                  4, enable_32x32, bx, by, cx, cy, config, cmap_factors,
+                  ac_strategy, tx.entropy_mul, entropy_mul32X32,
+                  entropy_estimate, block, scratch_space, quantized);
+            }
+            continue;
+          } else if (tx.type == AcStrategy::Type::DCT32X16) {
+            // We handled both DCT8X16 and DCT16X8 at the same time,
+            // and that is above. The last column and last row,
+            // when the last column or last row is odd numbered,
+            // are still handled by TryMergeAcs.
+            continue;
+          }
+        }
+        if ((tx.type == AcStrategy::Type::DCT16X32 && cy % 4 != 0) ||
+            (tx.type == AcStrategy::Type::DCT32X16 && cx % 4 != 0)) {
+          // already covered by FindBest32X32
+          continue;
+        }
+        if (cy + 1 < rect.ysize() && cx + 1 < rect.xsize()) {
+          if (tx.type == AcStrategy::Type::DCT8X16) {
+            // We handle both DCT8X16 and DCT16X8 at the same time.
+            if ((cy | cx) % 2 == 0) {
+              FindBestFirstLevelDivisionForSquare(
+                  2, true, bx, by, cx, cy, config, cmap_factors, ac_strategy,
+                  tx.entropy_mul, entropy_mul16X16, entropy_estimate, block,
+                  scratch_space, quantized);
+            }
+            continue;
+          } else if (tx.type == AcStrategy::Type::DCT16X8) {
+            // We handled both DCT8X16 and DCT16X8 at the same time,
+            // and that is above. The last column and last row,
+            // when the last column or last row is odd numbered,
+            // are still handled by TryMergeAcs.
+            continue;
+          }
+        }
+        if ((tx.type == AcStrategy::Type::DCT8X16 && cy % 2 == 1) ||
+            (tx.type == AcStrategy::Type::DCT16X8 && cx % 2 == 1)) {
+          // already covered by FindBestFirstLevelDivisionForSquare
+          continue;
+        }
+        // All other merge sizes are handled here.
+        // Some of the DCT16X8s and DCT8X16s will still leak through here
+        // when there is an odd number of 8x8 blocks, then the last row
+        // and column will get their DCT16X8s and DCT8X16s through the
+        // normal integral transform merging process.
+        TryMergeAcs(tx.type, bx, by, cx, cy, config, cmap_factors, ac_strategy,
+                    tx.entropy_mul, tx.priority, &priority[0], entropy_estimate,
+                    block, scratch_space, quantized);
+      }
+    }
+  }
+  if (cparams.speed_tier >= SpeedTier::kHare) {
+    return;
+  }
+  // Here we still try to do some non-aligned matching, find a few more
+  // 16X8, 8X16 and 16X16s between the non-2-aligned blocks.
+  for (size_t cy = 0; cy + 1 < rect.ysize(); ++cy) {
+    for (size_t cx = 0; cx + 1 < rect.xsize(); ++cx) {
+      if ((cy | cx) % 2 != 0) {
+        FindBestFirstLevelDivisionForSquare(
+            2, true, bx, by, cx, cy, config, cmap_factors, ac_strategy,
+            entropy_mul16X8, entropy_mul16X16, entropy_estimate, block,
+            scratch_space, quantized);
+      }
+    }
+  }
+  // Non-aligned matching for 32X32, 16X32 and 32X16.
+  size_t step = cparams.speed_tier >= SpeedTier::kTortoise ? 2 : 1;
+  for (size_t cy = 0; cy + 3 < rect.ysize(); cy += step) {
+    for (size_t cx = 0; cx + 3 < rect.xsize(); cx += step) {
+      if ((cy | cx) % 4 == 0) {
+        continue;  // Already tried with loop above (DCT16X32 case).
+      }
+      FindBestFirstLevelDivisionForSquare(
+          4, enable_32x32, bx, by, cx, cy, config, cmap_factors, ac_strategy,
+          entropy_mul16X32, entropy_mul32X32, entropy_estimate, block,
+          scratch_space, quantized);
+    }
+  }
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+HWY_EXPORT(ProcessRectACS);
+
+void AcStrategyHeuristics::Init(const Image3F& src,
+                                PassesEncoderState* enc_state) {
+  this->enc_state = enc_state;
+  config.dequant = &enc_state->shared.matrices;
+  const CompressParams& cparams = enc_state->cparams;
+  const float butteraugli_target = cparams.butteraugli_distance;
+
+  if (cparams.speed_tier >= SpeedTier::kCheetah) {
+    JXL_CHECK(enc_state->shared.matrices.EnsureComputed(1));  // DCT8 only
+  } else {
+    uint32_t acs_mask = 0;
+    // All transforms up to 64x64.
+    for (size_t i = 0; i < AcStrategy::DCT128X128; i++) {
+      acs_mask |= (1 << i);
+    }
+    JXL_CHECK(enc_state->shared.matrices.EnsureComputed(acs_mask));
+  }
+
+  // Image row pointers and strides.
+  config.quant_field_row = enc_state->initial_quant_field.Row(0);
+  config.quant_field_stride = enc_state->initial_quant_field.PixelsPerRow();
+  auto& mask = enc_state->initial_quant_masking;
+  if (mask.xsize() > 0 && mask.ysize() > 0) {
+    config.masking_field_row = mask.Row(0);
+    config.masking_field_stride = mask.PixelsPerRow();
+  }
+
+  config.src_rows[0] = src.ConstPlaneRow(0, 0);
+  config.src_rows[1] = src.ConstPlaneRow(1, 0);
+  config.src_rows[2] = src.ConstPlaneRow(2, 0);
+  config.src_stride = src.PixelsPerRow();
+
+  // Entropy estimate is composed of two factors:
+  //  - estimate of the number of bits that will be used by the block
+  //  - information loss due to quantization
+  // The following constant controls the relative weights of these components.
+  config.info_loss_multiplier = 138.0f;
+  config.info_loss_multiplier2 = 50.46839691767866;
+  // TODO(jyrki): explore base_entropy setting more.
+  // A small value (0?) works better at high distance, while a larger value
+  // may be more effective at low distance/high bpp.
+  config.base_entropy = 0.0;
+  config.zeros_mul = 7.565053364251793f;
+  // Lots of +1 and -1 coefficients at high quality, it is
+  // beneficial to favor them. At low qualities zeros matter more
+  // and +1 / -1 coefficients are already quite harmful.
+  float slope = std::min<float>(1.0f, butteraugli_target * (1.0f / 3));
+  config.cost1 = 1 + slope * 8.8703248061477744f;
+  config.cost2 = 4.4628149885273363f;
+  config.cost_delta = 5.3359184934516337f;
+  JXL_ASSERT(enc_state->shared.ac_strategy.xsize() ==
+             enc_state->shared.frame_dim.xsize_blocks);
+  JXL_ASSERT(enc_state->shared.ac_strategy.ysize() ==
+             enc_state->shared.frame_dim.ysize_blocks);
+}
+
+void AcStrategyHeuristics::ProcessRect(const Rect& rect) {
+  PROFILER_FUNC;
+  const CompressParams& cparams = enc_state->cparams;
+  // In Falcon mode, use DCT8 everywhere and uniform quantization.
+  if (cparams.speed_tier >= SpeedTier::kCheetah) {
+    enc_state->shared.ac_strategy.FillDCT8(rect);
+    return;
+  }
+  HWY_DYNAMIC_DISPATCH(ProcessRectACS)
+  (enc_state, config, rect);
+}
+
+void AcStrategyHeuristics::Finalize(AuxOut* aux_out) {
+  const auto& ac_strategy = enc_state->shared.ac_strategy;
+  // Accounting and debug output.
+  if (aux_out != nullptr) {
+    aux_out->num_small_blocks =
+        ac_strategy.CountBlocks(AcStrategy::Type::IDENTITY) +
+        ac_strategy.CountBlocks(AcStrategy::Type::DCT2X2) +
+        ac_strategy.CountBlocks(AcStrategy::Type::DCT4X4);
+    aux_out->num_dct4x8_blocks =
+        ac_strategy.CountBlocks(AcStrategy::Type::DCT4X8) +
+        ac_strategy.CountBlocks(AcStrategy::Type::DCT8X4);
+    aux_out->num_afv_blocks = ac_strategy.CountBlocks(AcStrategy::Type::AFV0) +
+                              ac_strategy.CountBlocks(AcStrategy::Type::AFV1) +
+                              ac_strategy.CountBlocks(AcStrategy::Type::AFV2) +
+                              ac_strategy.CountBlocks(AcStrategy::Type::AFV3);
+    aux_out->num_dct8_blocks = ac_strategy.CountBlocks(AcStrategy::Type::DCT);
+    aux_out->num_dct8x16_blocks =
+        ac_strategy.CountBlocks(AcStrategy::Type::DCT8X16) +
+        ac_strategy.CountBlocks(AcStrategy::Type::DCT16X8);
+    aux_out->num_dct8x32_blocks =
+        ac_strategy.CountBlocks(AcStrategy::Type::DCT8X32) +
+        ac_strategy.CountBlocks(AcStrategy::Type::DCT32X8);
+    aux_out->num_dct16_blocks =
+        ac_strategy.CountBlocks(AcStrategy::Type::DCT16X16);
+    aux_out->num_dct16x32_blocks =
+        ac_strategy.CountBlocks(AcStrategy::Type::DCT16X32) +
+        ac_strategy.CountBlocks(AcStrategy::Type::DCT32X16);
+    aux_out->num_dct32_blocks =
+        ac_strategy.CountBlocks(AcStrategy::Type::DCT32X32);
+    aux_out->num_dct32x64_blocks =
+        ac_strategy.CountBlocks(AcStrategy::Type::DCT32X64) +
+        ac_strategy.CountBlocks(AcStrategy::Type::DCT64X32);
+    aux_out->num_dct64_blocks =
+        ac_strategy.CountBlocks(AcStrategy::Type::DCT64X64);
+  }
+
+  if (WantDebugOutput(aux_out)) {
+    DumpAcStrategy(ac_strategy, enc_state->shared.frame_dim.xsize,
+                   enc_state->shared.frame_dim.ysize, "ac_strategy", aux_out);
+  }
+}
+
+}  // namespace jxl
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jxl/enc_ac_strategy.h b/third_party/jpeg-xl/lib/jxl/enc_ac_strategy.h
new file mode 100644
index 0000000000..1ce3442ccf
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_ac_strategy.h
@@ -0,0 +1,74 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_AC_STRATEGY_H_
+#define LIB_JXL_ENC_AC_STRATEGY_H_
+
+#include <stdint.h>
+
+#include "lib/jxl/ac_strategy.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/dec_ans.h"
+#include "lib/jxl/enc_cache.h"
+#include "lib/jxl/enc_params.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/quant_weights.h"
+
+// `FindBestAcStrategy` uses heuristics to choose which AC strategy should be
+// used in each block, as well as the initial quantization field.
+
+namespace jxl {
+
+struct AuxOut;
+
+// AC strategy selection: utility struct.
+
+struct ACSConfig {
+  const DequantMatrices* JXL_RESTRICT dequant;
+  float info_loss_multiplier;
+  float info_loss_multiplier2;
+  float* JXL_RESTRICT quant_field_row;
+  size_t quant_field_stride;
+  float* JXL_RESTRICT masking_field_row;
+  size_t masking_field_stride;
+  const float* JXL_RESTRICT src_rows[3];
+  size_t src_stride;
+  // Cost for 1 (-1), 2 (-2) explicitly, cost for others computed with cost1 +
+  // cost2 + sqrt(q) * cost_delta.
+  float cost1;
+  float cost2;
+  float cost_delta;
+  float base_entropy;
+  float zeros_mul;
+  const float& Pixel(size_t c, size_t x, size_t y) const {
+    return src_rows[c][y * src_stride + x];
+  }
+  float Masking(size_t bx, size_t by) const {
+    JXL_DASSERT(masking_field_row[by * masking_field_stride + bx] > 0);
+    return masking_field_row[by * masking_field_stride + bx];
+  }
+  float Quant(size_t bx, size_t by) const {
+    JXL_DASSERT(quant_field_row[by * quant_field_stride + bx] > 0);
+    return quant_field_row[by * quant_field_stride + bx];
+  }
+};
+
+struct AcStrategyHeuristics {
+  void Init(const Image3F& src, PassesEncoderState* enc_state);
+  void ProcessRect(const Rect& rect);
+  void Finalize(AuxOut* aux_out);
+  ACSConfig config;
+  PassesEncoderState* enc_state;
+};
+
+// Debug.
+void DumpAcStrategy(const AcStrategyImage& ac_strategy, size_t xsize,
+                    size_t ysize, const char* tag, AuxOut* aux_out);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_AC_STRATEGY_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_adaptive_quantization.cc b/third_party/jpeg-xl/lib/jxl/enc_adaptive_quantization.cc
new file mode 100644
index 0000000000..f54204b059
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_adaptive_quantization.cc
@@ -0,0 +1,1145 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_adaptive_quantization.h"
+
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <algorithm>
+#include <cmath>
+#include <string>
+#include <vector>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/enc_adaptive_quantization.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/ac_strategy.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/profiler.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/butteraugli/butteraugli.h"
+#include "lib/jxl/coeff_order_fwd.h"
+#include "lib/jxl/color_encoding_internal.h"
+#include "lib/jxl/color_management.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/convolve.h"
+#include "lib/jxl/dec_cache.h"
+#include "lib/jxl/dec_group.h"
+#include "lib/jxl/enc_aux_out.h"
+#include "lib/jxl/enc_butteraugli_comparator.h"
+#include "lib/jxl/enc_cache.h"
+#include "lib/jxl/enc_group.h"
+#include "lib/jxl/enc_modular.h"
+#include "lib/jxl/enc_params.h"
+#include "lib/jxl/enc_transforms-inl.h"
+#include "lib/jxl/epf.h"
+#include "lib/jxl/fast_math-inl.h"
+#include "lib/jxl/gauss_blur.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/image_bundle.h"
+#include "lib/jxl/image_ops.h"
+#include "lib/jxl/opsin_params.h"
+#include "lib/jxl/quant_weights.h"
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+namespace {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::AbsDiff;
+using hwy::HWY_NAMESPACE::Add;
+using hwy::HWY_NAMESPACE::And;
+using hwy::HWY_NAMESPACE::Max;
+using hwy::HWY_NAMESPACE::Rebind;
+using hwy::HWY_NAMESPACE::Sqrt;
+using hwy::HWY_NAMESPACE::ZeroIfNegative;
+
+// The following functions modulate an exponent (out_val) and return the updated
+// value. Their descriptor is limited to 8 lanes for 8x8 blocks.
+
+// Hack for mask estimation. Eventually replace this code with butteraugli's
+// masking.
+float ComputeMaskForAcStrategyUse(const float out_val) {
+  const float kMul = 1.0f;
+  const float kOffset = 0.001f;
+  return kMul / (out_val + kOffset);
+}
+
+template <class D, class V>
+V ComputeMask(const D d, const V out_val) {
+  const auto kBase = Set(d, -0.76471879237038032f);
+  const auto kMul4 = Set(d, 4.4585596705216615f);
+  const auto kMul2 = Set(d, 17.282053892620215f);
+  const auto kOffset2 = Set(d, 302.36961315317848f);
+  const auto kMul3 = Set(d, 7.0561261998705858f);
+  const auto kOffset3 = Set(d, 2.3179635626140773f);
+  const auto kOffset4 = Mul(Set(d, 0.25f), kOffset3);
+  const auto kMul0 = Set(d, 0.80061762862741759f);
+  const auto k1 = Set(d, 1.0f);
+
+  // Avoid division by zero.
+  const auto v1 = Max(Mul(out_val, kMul0), Set(d, 1e-3f));
+  const auto v2 = Div(k1, Add(v1, kOffset2));
+  const auto v3 = Div(k1, MulAdd(v1, v1, kOffset3));
+  const auto v4 = Div(k1, MulAdd(v1, v1, kOffset4));
+  // TODO(jyrki):
+  // A log or two here could make sense. In butteraugli we have effectively
+  // log(log(x + C)) for this kind of use, as a single log is used in
+  // saturating visual masking and here the modulation values are exponential,
+  // another log would counter that.
+  return Add(kBase, MulAdd(kMul4, v4, MulAdd(kMul2, v2, Mul(kMul3, v3))));
+}
+
+// mul and mul2 represent a scaling difference between jxl and butteraugli.
+static const float kSGmul = 226.77216153508914f;
+static const float kSGmul2 = 1.0f / 73.377132366608819f;
+static const float kLog2 = 0.693147181f;
+// Includes correction factor for std::log -> log2.
+static const float kSGRetMul = kSGmul2 * 18.6580932135f * kLog2;
+static const float kSGVOffset = 7.7825991679894591f;
+
+template <bool invert, typename D, typename V>
+V RatioOfDerivativesOfCubicRootToSimpleGamma(const D d, V v) {
+  // The opsin space in jxl is the cubic root of photons, i.e., v * v * v
+  // is related to the number of photons.
+  //
+  // SimpleGamma(v * v * v) is the psychovisual space in butteraugli.
+  // This ratio allows quantization to move from jxl's opsin space to
+  // butteraugli's log-gamma space.
+  float kEpsilon = 1e-2;
+  v = ZeroIfNegative(v);
+  const auto kNumMul = Set(d, kSGRetMul * 3 * kSGmul);
+  const auto kVOffset = Set(d, kSGVOffset * kLog2 + kEpsilon);
+  const auto kDenMul = Set(d, kLog2 * kSGmul);
+
+  const auto v2 = Mul(v, v);
+
+  const auto num = MulAdd(kNumMul, v2, Set(d, kEpsilon));
+  const auto den = MulAdd(Mul(kDenMul, v), v2, kVOffset);
+  return invert ? Div(num, den) : Div(den, num);
+}
+
+template <bool invert = false>
+static float RatioOfDerivativesOfCubicRootToSimpleGamma(float v) {
+  using DScalar = HWY_CAPPED(float, 1);
+  auto vscalar = Load(DScalar(), &v);
+  return GetLane(
+      RatioOfDerivativesOfCubicRootToSimpleGamma<invert>(DScalar(), vscalar));
+}
+
+// TODO(veluca): this function computes an approximation of the derivative of
+// SimpleGamma with (f(x+eps)-f(x))/eps. Consider two-sided approximation or
+// exact derivatives. For reference, SimpleGamma was:
+/*
+template <typename D, typename V>
+V SimpleGamma(const D d, V v) {
+  // A simple HDR compatible gamma function.
+  const auto mul = Set(d, kSGmul);
+  const auto kRetMul = Set(d, kSGRetMul);
+  const auto kRetAdd = Set(d, kSGmul2 * -20.2789020414f);
+  const auto kVOffset = Set(d, kSGVOffset);
+
+  v *= mul;
+
+  // This should happen rarely, but may lead to a NaN, which is rather
+  // undesirable. Since negative photons don't exist we solve the NaNs by
+  // clamping here.
+  // TODO(veluca): with FastLog2f, this no longer leads to NaNs.
+  v = ZeroIfNegative(v);
+  return kRetMul * FastLog2f(d, v + kVOffset) + kRetAdd;
+}
+*/
+
+template <class D, class V>
+V GammaModulation(const D d, const size_t x, const size_t y,
+                  const ImageF& xyb_x, const ImageF& xyb_y, const V out_val) {
+  const float kBias = 0.16f;
+  JXL_DASSERT(kBias > kOpsinAbsorbanceBias[0]);
+  JXL_DASSERT(kBias > kOpsinAbsorbanceBias[1]);
+  JXL_DASSERT(kBias > kOpsinAbsorbanceBias[2]);
+  auto overall_ratio = Zero(d);
+  auto bias = Set(d, kBias);
+  auto half = Set(d, 0.5f);
+  for (size_t dy = 0; dy < 8; ++dy) {
+    const float* const JXL_RESTRICT row_in_x = xyb_x.Row(y + dy);
+    const float* const JXL_RESTRICT row_in_y = xyb_y.Row(y + dy);
+    for (size_t dx = 0; dx < 8; dx += Lanes(d)) {
+      const auto iny = Add(Load(d, row_in_y + x + dx), bias);
+      const auto inx = Load(d, row_in_x + x + dx);
+      const auto r = Sub(iny, inx);
+      const auto g = Add(iny, inx);
+      const auto ratio_r =
+          RatioOfDerivativesOfCubicRootToSimpleGamma</*invert=*/true>(d, r);
+      const auto ratio_g =
+          RatioOfDerivativesOfCubicRootToSimpleGamma</*invert=*/true>(d, g);
+      const auto avg_ratio = Mul(half, Add(ratio_r, ratio_g));
+
+      overall_ratio = Add(overall_ratio, avg_ratio);
+    }
+  }
+  overall_ratio = Mul(SumOfLanes(d, overall_ratio), Set(d, 1.0f / 64));
+  // ideally -1.0, but likely optimal correction adds some entropy, so slightly
+  // less than that.
+  // ln(2) constant folded in because we want std::log but have FastLog2f.
+  const auto kGam = Set(d, -0.15526878023684174f * 0.693147180559945f);
+  return MulAdd(kGam, FastLog2f(d, overall_ratio), out_val);
+}
+
+template <class D, class V>
+V ColorModulation(const D d, const size_t x, const size_t y,
+                  const ImageF& xyb_x, const ImageF& xyb_y, const ImageF& xyb_b,
+                  const double butteraugli_target, V out_val) {
+  static const float kStrengthMul = 4.2456542701250122f;
+  static const float kRedRampStart = 0.18748564245760829f;
+  static const float kRedRampLength = 0.16701783842516479f;
+  static const float kBlueRampLength = 0.16117602661852037f;
+  static const float kBlueRampStart = 0.47897504338287333f;
+  const float strength = kStrengthMul * (1.0f - 0.15f * butteraugli_target);
+  if (strength < 0) {
+    return out_val;
+  }
+  // x values are smaller than y and b values, need to take the difference into
+  // account.
+  const float red_strength = strength * 6.0f;
+  const float blue_strength = strength;
+  {
+    // Reduce some bits from areas not blue or red.
+    const float offset = strength * -0.007;  // 9174542291185913f;
+    out_val = Add(out_val, Set(d, offset));
+  }
+  // Calculate how much of the 8x8 block is covered with blue or red.
+  auto blue_coverage = Zero(d);
+  auto red_coverage = Zero(d);
+  auto bias_y = Set(d, 0.2f);
+  auto bias_y_add = Set(d, 0.1f);
+  for (size_t dy = 0; dy < 8; ++dy) {
+    const float* const JXL_RESTRICT row_in_x = xyb_x.Row(y + dy);
+    const float* const JXL_RESTRICT row_in_y = xyb_y.Row(y + dy);
+    const float* const JXL_RESTRICT row_in_b = xyb_b.Row(y + dy);
+    for (size_t dx = 0; dx < 8; dx += Lanes(d)) {
+      const auto pixel_y = Load(d, row_in_y + x + dx);
+      // Estimate redness-greeness relative to the intensity.
+      const auto pixel_xpy = Div(Abs(Load(d, row_in_x + x + dx)),
+                                 Max(Add(bias_y_add, pixel_y), bias_y));
+      const auto pixel_x =
+          Max(Set(d, 0.0f), Sub(pixel_xpy, Set(d, kRedRampStart)));
+      const auto pixel_b =
+          Max(Set(d, 0.0f), Sub(Load(d, row_in_b + x + dx),
+                                Add(pixel_y, Set(d, kBlueRampStart))));
+      const auto blue_slope = Min(pixel_b, Set(d, kBlueRampLength));
+      const auto red_slope = Min(pixel_x, Set(d, kRedRampLength));
+      red_coverage = Add(red_coverage, red_slope);
+      blue_coverage = Add(blue_coverage, blue_slope);
+    }
+  }
+
+  // Saturate when the high red or high blue coverage is above a level.
+  // The idea here is that if a certain fraction of the block is red or
+  // blue we consider as if it was fully red or blue.
+  static const float ratio = 28.0f;  // out of 64 pixels.
+
+  auto overall_red_coverage = SumOfLanes(d, red_coverage);
+  overall_red_coverage =
+      Min(overall_red_coverage, Set(d, ratio * kRedRampLength));
+  overall_red_coverage =
+      Mul(overall_red_coverage, Set(d, red_strength / ratio));
+
+  auto overall_blue_coverage = SumOfLanes(d, blue_coverage);
+  overall_blue_coverage =
+      Min(overall_blue_coverage, Set(d, ratio * kBlueRampLength));
+  overall_blue_coverage =
+      Mul(overall_blue_coverage, Set(d, blue_strength / ratio));
+
+  return Add(overall_red_coverage, Add(overall_blue_coverage, out_val));
+}
+
+// Change precision in 8x8 blocks that have high frequency content.
+template <class D, class V>
+V HfModulation(const D d, const size_t x, const size_t y, const ImageF& xyb,
+               const V out_val) {
+  // Zero out the invalid differences for the rightmost value per row.
+  const Rebind<uint32_t, D> du;
+  HWY_ALIGN constexpr uint32_t kMaskRight[kBlockDim] = {~0u, ~0u, ~0u, ~0u,
+                                                        ~0u, ~0u, ~0u, 0};
+
+  auto sum = Zero(d);  // sum of absolute differences with right and below
+
+  static const float valmin = 0.52489909479039587f;
+  auto valminv = Set(d, valmin);
+  for (size_t dy = 0; dy < 8; ++dy) {
+    const float* JXL_RESTRICT row_in = xyb.Row(y + dy) + x;
+    const float* JXL_RESTRICT row_in_next =
+        dy == 7 ? row_in : xyb.Row(y + dy + 1) + x;
+
+    // In SCALAR, there is no guarantee of having extra row padding.
+    // Hence, we need to ensure we don't access pixels outside the row itself.
+    // In SIMD modes, however, rows are padded, so it's safe to access one
+    // garbage value after the row. The vector then gets masked with kMaskRight
+    // to remove the influence of that value.
+#if HWY_TARGET != HWY_SCALAR
+    for (size_t dx = 0; dx < 8; dx += Lanes(d)) {
+#else
+    for (size_t dx = 0; dx < 7; dx += Lanes(d)) {
+#endif
+      const auto p = Load(d, row_in + dx);
+      const auto pr = LoadU(d, row_in + dx + 1);
+      const auto mask = BitCast(d, Load(du, kMaskRight + dx));
+      sum = Add(sum, And(mask, Min(valminv, AbsDiff(p, pr))));
+
+      const auto pd = Load(d, row_in_next + dx);
+      sum = Add(sum, Min(valminv, AbsDiff(p, pd)));
+    }
+#if HWY_TARGET == HWY_SCALAR
+    const auto p = Load(d, row_in + 7);
+    const auto pd = Load(d, row_in_next + 7);
+    sum = Add(sum, Min(valminv, AbsDiff(p, pd)));
+#endif
+  }
+  // more negative value gives more bpp
+  static const float kOffset = -2.6545897672771526;
+  static const float kMul = -0.049868161744916512;
+
+  sum = SumOfLanes(d, sum);
+  float scalar_sum = GetLane(sum);
+  static const float maxsum = 7.9076877647025947f;
+  static const float minsum = 0.53640540945659809f;
+  scalar_sum = std::min(maxsum, scalar_sum);
+  scalar_sum = std::max(minsum, scalar_sum);
+  scalar_sum += kOffset;
+  scalar_sum *= kMul;
+  return Add(Set(d, scalar_sum), out_val);
+}
+
+void PerBlockModulations(const float butteraugli_target, const ImageF& xyb_x,
+                         const ImageF& xyb_y, const ImageF& xyb_b,
+                         const float scale, const Rect& rect, ImageF* out) {
+  JXL_ASSERT(SameSize(xyb_x, xyb_y));
+  JXL_ASSERT(DivCeil(xyb_x.xsize(), kBlockDim) == out->xsize());
+  JXL_ASSERT(DivCeil(xyb_x.ysize(), kBlockDim) == out->ysize());
+
+  float base_level = 0.48f * scale;
+  float kDampenRampStart = 2.0f;
+  float kDampenRampEnd = 14.0f;
+  float dampen = 1.0f;
+  if (butteraugli_target >= kDampenRampStart) {
+    dampen = 1.0f - ((butteraugli_target - kDampenRampStart) /
+                     (kDampenRampEnd - kDampenRampStart));
+    if (dampen < 0) {
+      dampen = 0;
+    }
+  }
+  const float mul = scale * dampen;
+  const float add = (1.0f - dampen) * base_level;
+  for (size_t iy = rect.y0(); iy < rect.y0() + rect.ysize(); iy++) {
+    const size_t y = iy * 8;
+    float* const JXL_RESTRICT row_out = out->Row(iy);
+    const HWY_CAPPED(float, kBlockDim) df;
+    for (size_t ix = rect.x0(); ix < rect.x0() + rect.xsize(); ix++) {
+      size_t x = ix * 8;
+      auto out_val = Set(df, row_out[ix]);
+      out_val = ComputeMask(df, out_val);
+      out_val = HfModulation(df, x, y, xyb_y, out_val);
+      out_val = ColorModulation(df, x, y, xyb_x, xyb_y, xyb_b,
+                                butteraugli_target, out_val);
+      out_val = GammaModulation(df, x, y, xyb_x, xyb_y, out_val);
+      // We want multiplicative quantization field, so everything
+      // until this point has been modulating the exponent.
+      row_out[ix] = FastPow2f(GetLane(out_val) * 1.442695041f) * mul + add;
+    }
+  }
+}
+
+template <typename D, typename V>
+V MaskingSqrt(const D d, V v) {
+  static const float kLogOffset = 27.97044946785558f;
+  static const float kMul = 211.53333281566171f;
+  const auto mul_v = Set(d, kMul * 1e8);
+  const auto offset_v = Set(d, kLogOffset);
+  return Mul(Set(d, 0.25f), Sqrt(MulAdd(v, Sqrt(mul_v), offset_v)));
+}
+
+float MaskingSqrt(const float v) {
+  using DScalar = HWY_CAPPED(float, 1);
+  auto vscalar = Load(DScalar(), &v);
+  return GetLane(MaskingSqrt(DScalar(), vscalar));
+}
+
+void StoreMin4(const float v, float& min0, float& min1, float& min2,
+               float& min3) {
+  if (v < min3) {
+    if (v < min0) {
+      min3 = min2;
+      min2 = min1;
+      min1 = min0;
+      min0 = v;
+    } else if (v < min1) {
+      min3 = min2;
+      min2 = min1;
+      min1 = v;
+    } else if (v < min2) {
+      min3 = min2;
+      min2 = v;
+    } else {
+      min3 = v;
+    }
+  }
+}
+
+// Look for smooth areas near the area of degradation.
+// If the areas are generally smooth, don't do masking.
+// Output is downsampled 2x.
+void FuzzyErosion(const Rect& from_rect, const ImageF& from,
+                  const Rect& to_rect, ImageF* to) {
+  const size_t xsize = from.xsize();
+  const size_t ysize = from.ysize();
+  constexpr int kStep = 1;
+  static_assert(kStep == 1, "Step must be 1");
+  JXL_ASSERT(to_rect.xsize() * 2 == from_rect.xsize());
+  JXL_ASSERT(to_rect.ysize() * 2 == from_rect.ysize());
+  for (size_t fy = 0; fy < from_rect.ysize(); ++fy) {
+    size_t y = fy + from_rect.y0();
+    size_t ym1 = y >= kStep ? y - kStep : y;
+    size_t yp1 = y + kStep < ysize ? y + kStep : y;
+    const float* rowt = from.Row(ym1);
+    const float* row = from.Row(y);
+    const float* rowb = from.Row(yp1);
+    float* row_out = to_rect.Row(to, fy / 2);
+    for (size_t fx = 0; fx < from_rect.xsize(); ++fx) {
+      size_t x = fx + from_rect.x0();
+      size_t xm1 = x >= kStep ? x - kStep : x;
+      size_t xp1 = x + kStep < xsize ? x + kStep : x;
+      float min0 = row[x];
+      float min1 = row[xm1];
+      float min2 = row[xp1];
+      float min3 = rowt[xm1];
+      // Sort the first four values.
+      if (min0 > min1) std::swap(min0, min1);
+      if (min0 > min2) std::swap(min0, min2);
+      if (min0 > min3) std::swap(min0, min3);
+      if (min1 > min2) std::swap(min1, min2);
+      if (min1 > min3) std::swap(min1, min3);
+      if (min2 > min3) std::swap(min2, min3);
+      // The remaining five values of a 3x3 neighbourhood.
+      StoreMin4(rowt[x], min0, min1, min2, min3);
+      StoreMin4(rowt[xp1], min0, min1, min2, min3);
+      StoreMin4(rowb[xm1], min0, min1, min2, min3);
+      StoreMin4(rowb[x], min0, min1, min2, min3);
+      StoreMin4(rowb[xp1], min0, min1, min2, min3);
+      static const float kMul0 = 0.125f;
+      static const float kMul1 = 0.075f;
+      static const float kMul2 = 0.06f;
+      static const float kMul3 = 0.05f;
+      float v = kMul0 * min0 + kMul1 * min1 + kMul2 * min2 + kMul3 * min3;
+      if (fx % 2 == 0 && fy % 2 == 0) {
+        row_out[fx / 2] = v;
+      } else {
+        row_out[fx / 2] += v;
+      }
+    }
+  }
+}
+
+struct AdaptiveQuantizationImpl {
+  void Init(const Image3F& xyb) {
+    JXL_DASSERT(xyb.xsize() % kBlockDim == 0);
+    JXL_DASSERT(xyb.ysize() % kBlockDim == 0);
+    const size_t xsize = xyb.xsize();
+    const size_t ysize = xyb.ysize();
+    aq_map = ImageF(xsize / kBlockDim, ysize / kBlockDim);
+  }
+  void PrepareBuffers(size_t num_threads) {
+    diff_buffer = ImageF(kEncTileDim + 8, num_threads);
+    for (size_t i = pre_erosion.size(); i < num_threads; i++) {
+      pre_erosion.emplace_back(kEncTileDimInBlocks * 2 + 2,
+                               kEncTileDimInBlocks * 2 + 2);
+    }
+  }
+
+  void ComputeTile(float butteraugli_target, float scale, const Image3F& xyb,
+                   const Rect& rect, const int thread, ImageF* mask) {
+    PROFILER_ZONE("aq DiffPrecompute");
+    const size_t xsize = xyb.xsize();
+    const size_t ysize = xyb.ysize();
+
+    // The XYB gamma is 3.0 to be able to decode faster with two muls.
+    // Butteraugli's gamma is matching the gamma of human eye, around 2.6.
+    // We approximate the gamma difference by adding one cubic root into
+    // the adaptive quantization. This gives us a total gamma of 2.6666
+    // for quantization uses.
+    const float match_gamma_offset = 0.019;
+
+    const HWY_FULL(float) df;
+
+    size_t y_start = rect.y0() * 8;
+    size_t y_end = y_start + rect.ysize() * 8;
+
+    size_t x0 = rect.x0() * 8;
+    size_t x1 = x0 + rect.xsize() * 8;
+    if (x0 != 0) x0 -= 4;
+    if (x1 != xyb.xsize()) x1 += 4;
+    if (y_start != 0) y_start -= 4;
+    if (y_end != xyb.ysize()) y_end += 4;
+    pre_erosion[thread].ShrinkTo((x1 - x0) / 4, (y_end - y_start) / 4);
+
+    static const float limit = 0.2f;
+    // Computes image (padded to multiple of 8x8) of local pixel differences.
+    // Subsample both directions by 4.
+    for (size_t y = y_start; y < y_end; ++y) {
+      size_t y2 = y + 1 < ysize ? y + 1 : y;
+      size_t y1 = y > 0 ? y - 1 : y;
+
+      const float* row_in = xyb.PlaneRow(1, y);
+      const float* row_in1 = xyb.PlaneRow(1, y1);
+      const float* row_in2 = xyb.PlaneRow(1, y2);
+      float* JXL_RESTRICT row_out = diff_buffer.Row(thread);
+
+      auto scalar_pixel = [&](size_t x) {
+        const size_t x2 = x + 1 < xsize ? x + 1 : x;
+        const size_t x1 = x > 0 ? x - 1 : x;
+        const float base =
+            0.25f * (row_in2[x] + row_in1[x] + row_in[x1] + row_in[x2]);
+        const float gammac = RatioOfDerivativesOfCubicRootToSimpleGamma(
+            row_in[x] + match_gamma_offset);
+        float diff = gammac * (row_in[x] - base);
+        diff *= diff;
+        if (diff >= limit) {
+          diff = limit;
+        }
+        diff = MaskingSqrt(diff);
+        if ((y % 4) != 0) {
+          row_out[x - x0] += diff;
+        } else {
+          row_out[x - x0] = diff;
+        }
+      };
+
+      size_t x = x0;
+      // First pixel of the row.
+      if (x0 == 0) {
+        scalar_pixel(x0);
+        ++x;
+      }
+      // SIMD
+      const auto match_gamma_offset_v = Set(df, match_gamma_offset);
+      const auto quarter = Set(df, 0.25f);
+      for (; x + 1 + Lanes(df) < x1; x += Lanes(df)) {
+        const auto in = LoadU(df, row_in + x);
+        const auto in_r = LoadU(df, row_in + x + 1);
+        const auto in_l = LoadU(df, row_in + x - 1);
+        const auto in_t = LoadU(df, row_in2 + x);
+        const auto in_b = LoadU(df, row_in1 + x);
+        auto base = Mul(quarter, Add(Add(in_r, in_l), Add(in_t, in_b)));
+        auto gammacv =
+            RatioOfDerivativesOfCubicRootToSimpleGamma</*invert=*/false>(
+                df, Add(in, match_gamma_offset_v));
+        auto diff = Mul(gammacv, Sub(in, base));
+        diff = Mul(diff, diff);
+        diff = Min(diff, Set(df, limit));
+        diff = MaskingSqrt(df, diff);
+        if ((y & 3) != 0) {
+          diff = Add(diff, LoadU(df, row_out + x - x0));
+        }
+        StoreU(diff, df, row_out + x - x0);
+      }
+      // Scalar
+      for (; x < x1; ++x) {
+        scalar_pixel(x);
+      }
+      if (y % 4 == 3) {
+        float* row_dout = pre_erosion[thread].Row((y - y_start) / 4);
+        for (size_t x = 0; x < (x1 - x0) / 4; x++) {
+          row_dout[x] = (row_out[x * 4] + row_out[x * 4 + 1] +
+                         row_out[x * 4 + 2] + row_out[x * 4 + 3]) *
+                        0.25f;
+        }
+      }
+    }
+    Rect from_rect(x0 % 8 == 0 ? 0 : 1, y_start % 8 == 0 ? 0 : 1,
+                   rect.xsize() * 2, rect.ysize() * 2);
+    FuzzyErosion(from_rect, pre_erosion[thread], rect, &aq_map);
+    for (size_t y = 0; y < rect.ysize(); ++y) {
+      const float* aq_map_row = rect.ConstRow(aq_map, y);
+      float* mask_row = rect.Row(mask, y);
+      for (size_t x = 0; x < rect.xsize(); ++x) {
+        mask_row[x] = ComputeMaskForAcStrategyUse(aq_map_row[x]);
+      }
+    }
+    PerBlockModulations(butteraugli_target, xyb.Plane(0), xyb.Plane(1),
+                        xyb.Plane(2), scale, rect, &aq_map);
+  }
+  std::vector<ImageF> pre_erosion;
+  ImageF aq_map;
+  ImageF diff_buffer;
+};
+
+ImageF AdaptiveQuantizationMap(const float butteraugli_target,
+                               const Image3F& xyb,
+                               const FrameDimensions& frame_dim, float scale,
+                               ThreadPool* pool, ImageF* mask) {
+  PROFILER_ZONE("aq AdaptiveQuantMap");
+
+  AdaptiveQuantizationImpl impl;
+  impl.Init(xyb);
+  *mask = ImageF(frame_dim.xsize_blocks, frame_dim.ysize_blocks);
+  JXL_CHECK(RunOnPool(
+      pool, 0,
+      DivCeil(frame_dim.xsize_blocks, kEncTileDimInBlocks) *
+          DivCeil(frame_dim.ysize_blocks, kEncTileDimInBlocks),
+      [&](const size_t num_threads) {
+        impl.PrepareBuffers(num_threads);
+        return true;
+      },
+      [&](const uint32_t tid, const size_t thread) {
+        size_t n_enc_tiles =
+            DivCeil(frame_dim.xsize_blocks, kEncTileDimInBlocks);
+        size_t tx = tid % n_enc_tiles;
+        size_t ty = tid / n_enc_tiles;
+        size_t by0 = ty * kEncTileDimInBlocks;
+        size_t by1 =
+            std::min((ty + 1) * kEncTileDimInBlocks, frame_dim.ysize_blocks);
+        size_t bx0 = tx * kEncTileDimInBlocks;
+        size_t bx1 =
+            std::min((tx + 1) * kEncTileDimInBlocks, frame_dim.xsize_blocks);
+        Rect r(bx0, by0, bx1 - bx0, by1 - by0);
+        impl.ComputeTile(butteraugli_target, scale, xyb, r, thread, mask);
+      },
+      "AQ DiffPrecompute"));
+
+  return std::move(impl).aq_map;
+}
+
+}  // namespace
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+HWY_EXPORT(AdaptiveQuantizationMap);
+
+namespace {
+// If true, prints the quantization maps at each iteration.
+bool FLAGS_dump_quant_state = false;
+
+void DumpHeatmap(const AuxOut* aux_out, const std::string& label,
+                 const ImageF& image, float good_threshold,
+                 float bad_threshold) {
+  Image3F heatmap = CreateHeatMapImage(image, good_threshold, bad_threshold);
+  char filename[200];
+  snprintf(filename, sizeof(filename), "%s%05d", label.c_str(),
+           aux_out->num_butteraugli_iters);
+  aux_out->DumpImage(filename, heatmap);
+}
+
+void DumpHeatmaps(const AuxOut* aux_out, float ba_target,
+                  const ImageF& quant_field, const ImageF& tile_heatmap,
+                  const ImageF& bt_diffmap) {
+  if (!WantDebugOutput(aux_out)) return;
+  ImageF inv_qmap(quant_field.xsize(), quant_field.ysize());
+  for (size_t y = 0; y < quant_field.ysize(); ++y) {
+    const float* JXL_RESTRICT row_q = quant_field.ConstRow(y);
+    float* JXL_RESTRICT row_inv_q = inv_qmap.Row(y);
+    for (size_t x = 0; x < quant_field.xsize(); ++x) {
+      row_inv_q[x] = 1.0f / row_q[x];  // never zero
+    }
+  }
+  DumpHeatmap(aux_out, "quant_heatmap", inv_qmap, 4.0f * ba_target,
+              6.0f * ba_target);
+  DumpHeatmap(aux_out, "tile_heatmap", tile_heatmap, ba_target,
+              1.5f * ba_target);
+  // matches heat maps produced by the command line tool.
+  DumpHeatmap(aux_out, "bt_diffmap", bt_diffmap, ButteraugliFuzzyInverse(1.5),
+              ButteraugliFuzzyInverse(0.5));
+}
+
+ImageF TileDistMap(const ImageF& distmap, int tile_size, int margin,
+                   const AcStrategyImage& ac_strategy) {
+  PROFILER_FUNC;
+  const int tile_xsize = (distmap.xsize() + tile_size - 1) / tile_size;
+  const int tile_ysize = (distmap.ysize() + tile_size - 1) / tile_size;
+  ImageF tile_distmap(tile_xsize, tile_ysize);
+  size_t distmap_stride = tile_distmap.PixelsPerRow();
+  for (int tile_y = 0; tile_y < tile_ysize; ++tile_y) {
+    AcStrategyRow ac_strategy_row = ac_strategy.ConstRow(tile_y);
+    float* JXL_RESTRICT dist_row = tile_distmap.Row(tile_y);
+    for (int tile_x = 0; tile_x < tile_xsize; ++tile_x) {
+      AcStrategy acs = ac_strategy_row[tile_x];
+      if (!acs.IsFirstBlock()) continue;
+      int this_tile_xsize = acs.covered_blocks_x() * tile_size;
+      int this_tile_ysize = acs.covered_blocks_y() * tile_size;
+      int y_begin = std::max<int>(0, tile_size * tile_y - margin);
+      int y_end = std::min<int>(distmap.ysize(),
+                                tile_size * tile_y + this_tile_ysize + margin);
+      int x_begin = std::max<int>(0, tile_size * tile_x - margin);
+      int x_end = std::min<int>(distmap.xsize(),
+                                tile_size * tile_x + this_tile_xsize + margin);
+      float dist_norm = 0.0;
+      double pixels = 0;
+      for (int y = y_begin; y < y_end; ++y) {
+        float ymul = 1.0;
+        constexpr float kBorderMul = 0.98f;
+        constexpr float kCornerMul = 0.7f;
+        if (margin != 0 && (y == y_begin || y == y_end - 1)) {
+          ymul = kBorderMul;
+        }
+        const float* const JXL_RESTRICT row = distmap.Row(y);
+        for (int x = x_begin; x < x_end; ++x) {
+          float xmul = ymul;
+          if (margin != 0 && (x == x_begin || x == x_end - 1)) {
+            if (xmul == 1.0) {
+              xmul = kBorderMul;
+            } else {
+              xmul = kCornerMul;
+            }
+          }
+          float v = row[x];
+          v *= v;
+          v *= v;
+          v *= v;
+          v *= v;
+          dist_norm += xmul * v;
+          pixels += xmul;
+        }
+      }
+      if (pixels == 0) pixels = 1;
+      // 16th norm is less than the max norm, we reduce the difference
+      // with this normalization factor.
+      constexpr float kTileNorm = 1.2f;
+      const float tile_dist =
+          kTileNorm * std::pow(dist_norm / pixels, 1.0f / 16.0f);
+      dist_row[tile_x] = tile_dist;
+      for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) {
+        for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) {
+          dist_row[tile_x + distmap_stride * iy + ix] = tile_dist;
+        }
+      }
+    }
+  }
+  return tile_distmap;
+}
+
+static const float kDcQuantPow = 0.83;
+static const float kDcQuant = 1.095924047623553f;
+static const float kAcQuant = 0.80751132443618624f;
+
+void FindBestQuantization(const ImageBundle& linear, const Image3F& opsin,
+                          PassesEncoderState* enc_state,
+                          const JxlCmsInterface& cms, ThreadPool* pool,
+                          AuxOut* aux_out) {
+  const CompressParams& cparams = enc_state->cparams;
+  if (cparams.resampling > 1 &&
+      cparams.original_butteraugli_distance <= 4.0 * cparams.resampling) {
+    // For downsampled opsin image, the butteraugli based adaptive quantization
+    // loop would only make the size bigger without improving the distance much,
+    // so in this case we enable it only for very high butteraugli targets.
+    return;
+  }
+  Quantizer& quantizer = enc_state->shared.quantizer;
+  ImageI& raw_quant_field = enc_state->shared.raw_quant_field;
+  ImageF& quant_field = enc_state->initial_quant_field;
+
+  // TODO(veluca): this should really be rather handled on the
+  // ButteraugliComparator side.
+  struct TemporaryShrink {
+    TemporaryShrink(ImageBundle& bundle, size_t xsize, size_t ysize)
+        : bundle(bundle),
+          orig_xsize(bundle.xsize()),
+          orig_ysize(bundle.ysize()) {
+      bundle.ShrinkTo(xsize, ysize);
+    }
+    TemporaryShrink(const TemporaryShrink&) = delete;
+    TemporaryShrink(TemporaryShrink&&) = delete;
+
+    ~TemporaryShrink() { bundle.ShrinkTo(orig_xsize, orig_ysize); }
+
+    ImageBundle& bundle;
+    size_t orig_xsize;
+    size_t orig_ysize;
+  } t(const_cast<ImageBundle&>(linear),
+      enc_state->shared.frame_header.nonserialized_metadata->xsize(),
+      enc_state->shared.frame_header.nonserialized_metadata->ysize());
+
+  const float butteraugli_target = cparams.butteraugli_distance;
+  const float original_butteraugli = cparams.original_butteraugli_distance;
+  ButteraugliParams params = cparams.ba_params;
+  params.intensity_target = linear.metadata()->IntensityTarget();
+  // Hack the default intensity target value to be 80.0, the intensity
+  // target of sRGB images and a more reasonable viewing default than
+  // JPEG XL file format's default.
+  if (fabs(params.intensity_target - 255.0f) < 1e-3) {
+    params.intensity_target = 80.0f;
+  }
+  JxlButteraugliComparator comparator(params, cms);
+  JXL_CHECK(comparator.SetReferenceImage(linear));
+  bool lower_is_better =
+      (comparator.GoodQualityScore() < comparator.BadQualityScore());
+  const float initial_quant_dc = InitialQuantDC(butteraugli_target);
+  AdjustQuantField(enc_state->shared.ac_strategy, Rect(quant_field),
+                   &quant_field);
+  ImageF tile_distmap;
+  ImageF initial_quant_field = CopyImage(quant_field);
+
+  float initial_qf_min, initial_qf_max;
+  ImageMinMax(initial_quant_field, &initial_qf_min, &initial_qf_max);
+  float initial_qf_ratio = initial_qf_max / initial_qf_min;
+  float qf_max_deviation_low = std::sqrt(250 / initial_qf_ratio);
+  float asymmetry = 2;
+  if (qf_max_deviation_low < asymmetry) asymmetry = qf_max_deviation_low;
+  float qf_lower = initial_qf_min / (asymmetry * qf_max_deviation_low);
+  float qf_higher = initial_qf_max * (qf_max_deviation_low / asymmetry);
+
+  JXL_ASSERT(qf_higher / qf_lower < 253);
+
+  constexpr int kOriginalComparisonRound = 1;
+  int iters = cparams.max_butteraugli_iters;
+  if (iters > 7) {
+    iters = 7;
+  }
+  if (cparams.speed_tier != SpeedTier::kTortoise) {
+    iters = 2;
+  }
+  for (int i = 0; i < iters + 1; ++i) {
+    if (FLAGS_dump_quant_state) {
+      printf("\nQuantization field:\n");
+      for (size_t y = 0; y < quant_field.ysize(); ++y) {
+        for (size_t x = 0; x < quant_field.xsize(); ++x) {
+          printf(" %.5f", quant_field.Row(y)[x]);
+        }
+        printf("\n");
+      }
+    }
+    quantizer.SetQuantField(initial_quant_dc, quant_field, &raw_quant_field);
+    ImageBundle dec_linear = RoundtripImage(opsin, enc_state, cms, pool);
+    PROFILER_ZONE("enc Butteraugli");
+    float score;
+    ImageF diffmap;
+    JXL_CHECK(comparator.CompareWith(dec_linear, &diffmap, &score));
+    if (!lower_is_better) {
+      score = -score;
+      diffmap = ScaleImage(-1.0f, diffmap);
+    }
+    tile_distmap = TileDistMap(diffmap, 8 * cparams.resampling, 0,
+                               enc_state->shared.ac_strategy);
+    if (WantDebugOutput(aux_out)) {
+      aux_out->DumpImage(("dec" + ToString(i)).c_str(), *dec_linear.color());
+      DumpHeatmaps(aux_out, butteraugli_target, quant_field, tile_distmap,
+                   diffmap);
+    }
+    if (aux_out != nullptr) ++aux_out->num_butteraugli_iters;
+    if (cparams.log_search_state) {
+      float minval, maxval;
+      ImageMinMax(quant_field, &minval, &maxval);
+      printf("\nButteraugli iter: %d/%d\n", i, cparams.max_butteraugli_iters);
+      printf("Butteraugli distance: %f  (target = %f)\n", score,
+             original_butteraugli);
+      printf("quant range: %f ... %f  DC quant: %f\n", minval, maxval,
+             initial_quant_dc);
+      if (FLAGS_dump_quant_state) {
+        quantizer.DumpQuantizationMap(raw_quant_field);
+      }
+    }
+
+    if (i == iters) break;
+
+    double kPow[8] = {
+        0.2, 0.2, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+    };
+    double kPowMod[8] = {
+        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+    };
+    if (i == kOriginalComparisonRound) {
+      // Don't allow optimization to make the quant field a lot worse than
+      // what the initial guess was. This allows the AC field to have enough
+      // precision to reduce the oscillations due to the dc reconstruction.
+      double kInitMul = 0.6;
+      const double kOneMinusInitMul = 1.0 - kInitMul;
+      for (size_t y = 0; y < quant_field.ysize(); ++y) {
+        float* const JXL_RESTRICT row_q = quant_field.Row(y);
+        const float* const JXL_RESTRICT row_init = initial_quant_field.Row(y);
+        for (size_t x = 0; x < quant_field.xsize(); ++x) {
+          double clamp = kOneMinusInitMul * row_q[x] + kInitMul * row_init[x];
+          if (row_q[x] < clamp) {
+            row_q[x] = clamp;
+            if (row_q[x] > qf_higher) row_q[x] = qf_higher;
+            if (row_q[x] < qf_lower) row_q[x] = qf_lower;
+          }
+        }
+      }
+    }
+
+    double cur_pow = 0.0;
+    if (i < 7) {
+      cur_pow = kPow[i] + (original_butteraugli - 1.0) * kPowMod[i];
+      if (cur_pow < 0) {
+        cur_pow = 0;
+      }
+    }
+    if (cur_pow == 0.0) {
+      for (size_t y = 0; y < quant_field.ysize(); ++y) {
+        const float* const JXL_RESTRICT row_dist = tile_distmap.Row(y);
+        float* const JXL_RESTRICT row_q = quant_field.Row(y);
+        for (size_t x = 0; x < quant_field.xsize(); ++x) {
+          const float diff = row_dist[x] / original_butteraugli;
+          if (diff > 1.0f) {
+            float old = row_q[x];
+            row_q[x] *= diff;
+            int qf_old = old * quantizer.InvGlobalScale() + 0.5;
+            int qf_new = row_q[x] * quantizer.InvGlobalScale() + 0.5;
+            if (qf_old == qf_new) {
+              row_q[x] = old + quantizer.Scale();
+            }
+          }
+          if (row_q[x] > qf_higher) row_q[x] = qf_higher;
+          if (row_q[x] < qf_lower) row_q[x] = qf_lower;
+        }
+      }
+    } else {
+      for (size_t y = 0; y < quant_field.ysize(); ++y) {
+        const float* const JXL_RESTRICT row_dist = tile_distmap.Row(y);
+        float* const JXL_RESTRICT row_q = quant_field.Row(y);
+        for (size_t x = 0; x < quant_field.xsize(); ++x) {
+          const float diff = row_dist[x] / original_butteraugli;
+          if (diff <= 1.0f) {
+            row_q[x] *= std::pow(diff, cur_pow);
+          } else {
+            float old = row_q[x];
+            row_q[x] *= diff;
+            int qf_old = old * quantizer.InvGlobalScale() + 0.5;
+            int qf_new = row_q[x] * quantizer.InvGlobalScale() + 0.5;
+            if (qf_old == qf_new) {
+              row_q[x] = old + quantizer.Scale();
+            }
+          }
+          if (row_q[x] > qf_higher) row_q[x] = qf_higher;
+          if (row_q[x] < qf_lower) row_q[x] = qf_lower;
+        }
+      }
+    }
+  }
+  quantizer.SetQuantField(initial_quant_dc, quant_field, &raw_quant_field);
+}
+
+void FindBestQuantizationMaxError(const Image3F& opsin,
+                                  PassesEncoderState* enc_state,
+                                  const JxlCmsInterface& cms, ThreadPool* pool,
+                                  AuxOut* aux_out) {
+  // TODO(szabadka): Make this work for non-opsin color spaces.
+  const CompressParams& cparams = enc_state->cparams;
+  Quantizer& quantizer = enc_state->shared.quantizer;
+  ImageI& raw_quant_field = enc_state->shared.raw_quant_field;
+  ImageF& quant_field = enc_state->initial_quant_field;
+
+  // TODO(veluca): better choice of this value.
+  const float initial_quant_dc =
+      16 * std::sqrt(0.1f / cparams.butteraugli_distance);
+  AdjustQuantField(enc_state->shared.ac_strategy, Rect(quant_field),
+                   &quant_field);
+
+  const float inv_max_err[3] = {1.0f / enc_state->cparams.max_error[0],
+                                1.0f / enc_state->cparams.max_error[1],
+                                1.0f / enc_state->cparams.max_error[2]};
+
+  for (int i = 0; i < cparams.max_butteraugli_iters + 1; ++i) {
+    quantizer.SetQuantField(initial_quant_dc, quant_field, &raw_quant_field);
+    if (aux_out) {
+      aux_out->DumpXybImage(("ops" + ToString(i)).c_str(), opsin);
+    }
+    ImageBundle decoded = RoundtripImage(opsin, enc_state, cms, pool);
+    if (aux_out) {
+      aux_out->DumpXybImage(("dec" + ToString(i)).c_str(), *decoded.color());
+    }
+
+    for (size_t by = 0; by < enc_state->shared.frame_dim.ysize_blocks; by++) {
+      AcStrategyRow ac_strategy_row =
+          enc_state->shared.ac_strategy.ConstRow(by);
+      for (size_t bx = 0; bx < enc_state->shared.frame_dim.xsize_blocks; bx++) {
+        AcStrategy acs = ac_strategy_row[bx];
+        if (!acs.IsFirstBlock()) continue;
+        float max_error = 0;
+        for (size_t c = 0; c < 3; c++) {
+          for (size_t y = by * kBlockDim;
+               y < (by + acs.covered_blocks_y()) * kBlockDim; y++) {
+            if (y >= decoded.ysize()) continue;
+            const float* JXL_RESTRICT in_row = opsin.ConstPlaneRow(c, y);
+            const float* JXL_RESTRICT dec_row =
+                decoded.color()->ConstPlaneRow(c, y);
+            for (size_t x = bx * kBlockDim;
+                 x < (bx + acs.covered_blocks_x()) * kBlockDim; x++) {
+              if (x >= decoded.xsize()) continue;
+              max_error = std::max(
+                  std::abs(in_row[x] - dec_row[x]) * inv_max_err[c], max_error);
+            }
+          }
+        }
+        // Target an error between max_error/2 and max_error.
+        // If the error in the varblock is above the target, increase the qf to
+        // compensate. If the error is below the target, decrease the qf.
+        // However, to avoid an excessive increase of the qf, only do so if the
+        // error is less than half the maximum allowed error.
+        const float qf_mul = (max_error < 0.5f)   ? max_error * 2.0f
+                             : (max_error > 1.0f) ? max_error
+                                                  : 1.0f;
+        for (size_t qy = by; qy < by + acs.covered_blocks_y(); qy++) {
+          float* JXL_RESTRICT quant_field_row = quant_field.Row(qy);
+          for (size_t qx = bx; qx < bx + acs.covered_blocks_x(); qx++) {
+            quant_field_row[qx] *= qf_mul;
+          }
+        }
+      }
+    }
+  }
+  quantizer.SetQuantField(initial_quant_dc, quant_field, &raw_quant_field);
+}
+
+}  // namespace
+
+void AdjustQuantField(const AcStrategyImage& ac_strategy, const Rect& rect,
+                      ImageF* quant_field) {
+  // Replace the whole quant_field in non-8x8 blocks with the maximum of each
+  // 8x8 block.
+  size_t stride = quant_field->PixelsPerRow();
+  for (size_t y = 0; y < rect.ysize(); ++y) {
+    AcStrategyRow ac_strategy_row = ac_strategy.ConstRow(rect, y);
+    float* JXL_RESTRICT quant_row = rect.Row(quant_field, y);
+    for (size_t x = 0; x < rect.xsize(); ++x) {
+      AcStrategy acs = ac_strategy_row[x];
+      if (!acs.IsFirstBlock()) continue;
+      JXL_ASSERT(x + acs.covered_blocks_x() <= quant_field->xsize());
+      JXL_ASSERT(y + acs.covered_blocks_y() <= quant_field->ysize());
+      float max = quant_row[x];
+      for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) {
+        for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) {
+          max = std::max(quant_row[x + ix + iy * stride], max);
+        }
+      }
+      for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) {
+        for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) {
+          quant_row[x + ix + iy * stride] = max;
+        }
+      }
+    }
+  }
+}
+
+float InitialQuantDC(float butteraugli_target) {
+  const float kDcMul = 0.3;  // Butteraugli target where non-linearity kicks in.
+  const float butteraugli_target_dc = std::max<float>(
+      0.5f * butteraugli_target,
+      std::min<float>(butteraugli_target,
+                      kDcMul * std::pow((1.0f / kDcMul) * butteraugli_target,
+                                        kDcQuantPow)));
+  // We want the maximum DC value to be at most 2**15 * kInvDCQuant / quant_dc.
+  // The maximum DC value might not be in the kXybRange because of inverse
+  // gaborish, so we add some slack to the maximum theoretical quant obtained
+  // this way (64).
+  return std::min(kDcQuant / butteraugli_target_dc, 50.f);
+}
+
+ImageF InitialQuantField(const float butteraugli_target, const Image3F& opsin,
+                         const FrameDimensions& frame_dim, ThreadPool* pool,
+                         float rescale, ImageF* mask) {
+  PROFILER_FUNC;
+  const float quant_ac = kAcQuant / butteraugli_target;
+  return HWY_DYNAMIC_DISPATCH(AdaptiveQuantizationMap)(
+      butteraugli_target, opsin, frame_dim, quant_ac * rescale, pool, mask);
+}
+
+void FindBestQuantizer(const ImageBundle* linear, const Image3F& opsin,
+                       PassesEncoderState* enc_state,
+                       const JxlCmsInterface& cms, ThreadPool* pool,
+                       AuxOut* aux_out, double rescale) {
+  const CompressParams& cparams = enc_state->cparams;
+  if (cparams.max_error_mode) {
+    PROFILER_ZONE("enc find best maxerr");
+    FindBestQuantizationMaxError(opsin, enc_state, cms, pool, aux_out);
+  } else if (cparams.speed_tier <= SpeedTier::kKitten) {
+    // Normal encoding to a butteraugli score.
+    PROFILER_ZONE("enc find best2");
+    FindBestQuantization(*linear, opsin, enc_state, cms, pool, aux_out);
+  }
+}
+
+ImageBundle RoundtripImage(const Image3F& opsin, PassesEncoderState* enc_state,
+                           const JxlCmsInterface& cms, ThreadPool* pool) {
+  PROFILER_ZONE("enc roundtrip");
+  std::unique_ptr<PassesDecoderState> dec_state =
+      jxl::make_unique<PassesDecoderState>();
+  JXL_CHECK(dec_state->output_encoding_info.SetFromMetadata(
+      *enc_state->shared.metadata));
+  dec_state->shared = &enc_state->shared;
+  JXL_ASSERT(opsin.ysize() % kBlockDim == 0);
+
+  const size_t xsize_groups = DivCeil(opsin.xsize(), kGroupDim);
+  const size_t ysize_groups = DivCeil(opsin.ysize(), kGroupDim);
+  const size_t num_groups = xsize_groups * ysize_groups;
+
+  size_t num_special_frames = enc_state->special_frames.size();
+
+  std::unique_ptr<ModularFrameEncoder> modular_frame_encoder =
+      jxl::make_unique<ModularFrameEncoder>(enc_state->shared.frame_header,
+                                            enc_state->cparams);
+  JXL_CHECK(InitializePassesEncoder(opsin, cms, pool, enc_state,
+                                    modular_frame_encoder.get(), nullptr));
+  JXL_CHECK(dec_state->Init());
+  JXL_CHECK(dec_state->InitForAC(pool));
+
+  ImageBundle decoded(&enc_state->shared.metadata->m);
+  decoded.origin = enc_state->shared.frame_header.frame_origin;
+  decoded.SetFromImage(Image3F(opsin.xsize(), opsin.ysize()),
+                       dec_state->output_encoding_info.color_encoding);
+
+  PassesDecoderState::PipelineOptions options;
+  options.use_slow_render_pipeline = false;
+  options.coalescing = true;
+  options.render_spotcolors = false;
+
+  // Same as dec_state->shared->frame_header.nonserialized_metadata->m
+  const ImageMetadata& metadata = *decoded.metadata();
+
+  JXL_CHECK(dec_state->PreparePipeline(&decoded, options));
+
+  hwy::AlignedUniquePtr<GroupDecCache[]> group_dec_caches;
+  const auto allocate_storage = [&](const size_t num_threads) -> Status {
+    JXL_RETURN_IF_ERROR(
+        dec_state->render_pipeline->PrepareForThreads(num_threads,
+                                                      /*use_group_ids=*/false));
+    group_dec_caches = hwy::MakeUniqueAlignedArray<GroupDecCache>(num_threads);
+    return true;
+  };
+  const auto process_group = [&](const uint32_t group_index,
+                                 const size_t thread) {
+    if (dec_state->shared->frame_header.loop_filter.epf_iters > 0) {
+      ComputeSigma(dec_state->shared->BlockGroupRect(group_index),
+                   dec_state.get());
+    }
+    RenderPipelineInput input =
+        dec_state->render_pipeline->GetInputBuffers(group_index, thread);
+    JXL_CHECK(DecodeGroupForRoundtrip(
+        enc_state->coeffs, group_index, dec_state.get(),
+        &group_dec_caches[thread], thread, input, &decoded, nullptr));
+    for (size_t c = 0; c < metadata.num_extra_channels; c++) {
+      std::pair<ImageF*, Rect> ri = input.GetBuffer(3 + c);
+      FillPlane(0.0f, ri.first, ri.second);
+    }
+    input.Done();
+  };
+  JXL_CHECK(RunOnPool(pool, 0, num_groups, allocate_storage, process_group,
+                      "AQ loop"));
+
+  // Ensure we don't create any new special frames.
+  enc_state->special_frames.resize(num_special_frames);
+
+  return decoded;
+}
+
+}  // namespace jxl
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jxl/enc_adaptive_quantization.h b/third_party/jpeg-xl/lib/jxl/enc_adaptive_quantization.h
new file mode 100644
index 0000000000..a63c574492
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_adaptive_quantization.h
@@ -0,0 +1,66 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_ADAPTIVE_QUANTIZATION_H_
+#define LIB_JXL_ENC_ADAPTIVE_QUANTIZATION_H_
+
+#include <stddef.h>
+
+#include "lib/jxl/ac_strategy.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/enc_cache.h"
+#include "lib/jxl/enc_params.h"
+#include "lib/jxl/frame_header.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/image_bundle.h"
+#include "lib/jxl/loop_filter.h"
+#include "lib/jxl/quant_weights.h"
+#include "lib/jxl/quantizer.h"
+#include "lib/jxl/splines.h"
+
+// Heuristics to find a good quantizer for a given image. InitialQuantField
+// produces a quantization field (i.e. relative quantization amounts for each
+// block) out of an opsin-space image. `InitialQuantField` uses heuristics,
+// `FindBestQuantizer` (in non-fast mode) will run multiple encoding-decoding
+// steps and try to improve the given quant field.
+
+namespace jxl {
+
+struct AuxOut;
+
+// Computes the decoded image for a given set of compression parameters. Mainly
+// used in the FindBestQuantization loops and in some tests.
+// TODO(veluca): this doesn't seem the best possible file for this function.
+ImageBundle RoundtripImage(const Image3F& opsin, PassesEncoderState* enc_state,
+                           const JxlCmsInterface& cms, ThreadPool* pool);
+
+// Returns an image subsampled by kBlockDim in each direction. If the value
+// at pixel (x,y) in the returned image is greater than 1.0, it means that
+// more fine-grained quantization should be used in the corresponding block
+// of the input image, while a value less than 1.0 indicates that less
+// fine-grained quantization should be enough. Returns a mask, too, which
+// can later be used to make better decisions about ac strategy.
+ImageF InitialQuantField(float butteraugli_target, const Image3F& opsin,
+                         const FrameDimensions& frame_dim, ThreadPool* pool,
+                         float rescale, ImageF* initial_quant_mask);
+
+float InitialQuantDC(float butteraugli_target);
+
+void AdjustQuantField(const AcStrategyImage& ac_strategy, const Rect& rect,
+                      ImageF* quant_field);
+
+// Returns a quantizer that uses an adjusted version of the provided
+// quant_field. Also computes the dequant_map corresponding to the given
+// dequant_float_map and chosen quantization levels.
+// `linear` is only used in Kitten mode or slower.
+void FindBestQuantizer(const ImageBundle* linear, const Image3F& opsin,
+                       PassesEncoderState* enc_state,
+                       const JxlCmsInterface& cms, ThreadPool* pool,
+                       AuxOut* aux_out, double rescale = 1.0);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_ADAPTIVE_QUANTIZATION_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_ans.cc b/third_party/jpeg-xl/lib/jxl/enc_ans.cc
new file mode 100644
index 0000000000..4249426bc9
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_ans.cc
@@ -0,0 +1,1688 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_ans.h"
+
+#include <stdint.h>
+
+#include <algorithm>
+#include <array>
+#include <cmath>
+#include <limits>
+#include <numeric>
+#include <type_traits>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "lib/jxl/ans_common.h"
+#include "lib/jxl/base/bits.h"
+#include "lib/jxl/dec_ans.h"
+#include "lib/jxl/enc_aux_out.h"
+#include "lib/jxl/enc_cluster.h"
+#include "lib/jxl/enc_context_map.h"
+#include "lib/jxl/enc_fields.h"
+#include "lib/jxl/enc_huffman.h"
+#include "lib/jxl/fast_math-inl.h"
+#include "lib/jxl/fields.h"
+
+namespace jxl {
+
+namespace {
+
+bool ans_fuzzer_friendly_ = false;
+
+static const int kMaxNumSymbolsForSmallCode = 4;
+
+void ANSBuildInfoTable(const ANSHistBin* counts, const AliasTable::Entry* table,
+                       size_t alphabet_size, size_t log_alpha_size,
+                       ANSEncSymbolInfo* info) {
+  size_t log_entry_size = ANS_LOG_TAB_SIZE - log_alpha_size;
+  size_t entry_size_minus_1 = (1 << log_entry_size) - 1;
+  // create valid alias table for empty streams.
+  for (size_t s = 0; s < std::max<size_t>(1, alphabet_size); ++s) {
+    const ANSHistBin freq = s == alphabet_size ? ANS_TAB_SIZE : counts[s];
+    info[s].freq_ = static_cast<uint16_t>(freq);
+#ifdef USE_MULT_BY_RECIPROCAL
+    if (freq != 0) {
+      info[s].ifreq_ =
+          ((1ull << RECIPROCAL_PRECISION) + info[s].freq_ - 1) / info[s].freq_;
+    } else {
+      info[s].ifreq_ = 1;  // shouldn't matter (symbol shouldn't occur), but...
+    }
+#endif
+    info[s].reverse_map_.resize(freq);
+  }
+  for (int i = 0; i < ANS_TAB_SIZE; i++) {
+    AliasTable::Symbol s =
+        AliasTable::Lookup(table, i, log_entry_size, entry_size_minus_1);
+    info[s.value].reverse_map_[s.offset] = i;
+  }
+}
+
+float EstimateDataBits(const ANSHistBin* histogram, const ANSHistBin* counts,
+                       size_t len) {
+  float sum = 0.0f;
+  int total_histogram = 0;
+  int total_counts = 0;
+  for (size_t i = 0; i < len; ++i) {
+    total_histogram += histogram[i];
+    total_counts += counts[i];
+    if (histogram[i] > 0) {
+      JXL_ASSERT(counts[i] > 0);
+      // += histogram[i] * -log(counts[i]/total_counts)
+      sum += histogram[i] *
+             std::max(0.0f, ANS_LOG_TAB_SIZE - FastLog2f(counts[i]));
+    }
+  }
+  if (total_histogram > 0) {
+    // Used only in assert.
+    (void)total_counts;
+    JXL_ASSERT(total_counts == ANS_TAB_SIZE);
+  }
+  return sum;
+}
+
+float EstimateDataBitsFlat(const ANSHistBin* histogram, size_t len) {
+  const float flat_bits = std::max(FastLog2f(len), 0.0f);
+  float total_histogram = 0;
+  for (size_t i = 0; i < len; ++i) {
+    total_histogram += histogram[i];
+  }
+  return total_histogram * flat_bits;
+}
+
+// Static Huffman code for encoding logcounts. The last symbol is used as RLE
+// sequence.
+static const uint8_t kLogCountBitLengths[ANS_LOG_TAB_SIZE + 2] = {
+    5, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 6, 7, 7,
+};
+static const uint8_t kLogCountSymbols[ANS_LOG_TAB_SIZE + 2] = {
+    17, 11, 15, 3, 9, 7, 4, 2, 5, 6, 0, 33, 1, 65,
+};
+
+// Returns the difference between largest count that can be represented and is
+// smaller than "count" and smallest representable count larger than "count".
+static int SmallestIncrement(uint32_t count, uint32_t shift) {
+  int bits = count == 0 ? -1 : FloorLog2Nonzero(count);
+  int drop_bits = bits - GetPopulationCountPrecision(bits, shift);
+  return drop_bits < 0 ? 1 : (1 << drop_bits);
+}
+
+template <bool minimize_error_of_sum>
+bool RebalanceHistogram(const float* targets, int max_symbol, int table_size,
+                        uint32_t shift, int* omit_pos, ANSHistBin* counts) {
+  int sum = 0;
+  float sum_nonrounded = 0.0;
+  int remainder_pos = 0;  // if all of them are handled in first loop
+  int remainder_log = -1;
+  for (int n = 0; n < max_symbol; ++n) {
+    if (targets[n] > 0 && targets[n] < 1.0f) {
+      counts[n] = 1;
+      sum_nonrounded += targets[n];
+      sum += counts[n];
+    }
+  }
+  const float discount_ratio =
+      (table_size - sum) / (table_size - sum_nonrounded);
+  JXL_ASSERT(discount_ratio > 0);
+  JXL_ASSERT(discount_ratio <= 1.0f);
+  // Invariant for minimize_error_of_sum == true:
+  // abs(sum - sum_nonrounded)
+  //   <= SmallestIncrement(max(targets[])) + max_symbol
+  for (int n = 0; n < max_symbol; ++n) {
+    if (targets[n] >= 1.0f) {
+      sum_nonrounded += targets[n];
+      counts[n] =
+          static_cast<ANSHistBin>(targets[n] * discount_ratio);  // truncate
+      if (counts[n] == 0) counts[n] = 1;
+      if (counts[n] == table_size) counts[n] = table_size - 1;
+      // Round the count to the closest nonzero multiple of SmallestIncrement
+      // (when minimize_error_of_sum is false) or one of two closest so as to
+      // keep the sum as close as possible to sum_nonrounded.
+      int inc = SmallestIncrement(counts[n], shift);
+      counts[n] -= counts[n] & (inc - 1);
+      // TODO(robryk): Should we rescale targets[n]?
+      const float target =
+          minimize_error_of_sum ? (sum_nonrounded - sum) : targets[n];
+      if (counts[n] == 0 ||
+          (target > counts[n] + inc / 2 && counts[n] + inc < table_size)) {
+        counts[n] += inc;
+      }
+      sum += counts[n];
+      const int count_log = FloorLog2Nonzero(static_cast<uint32_t>(counts[n]));
+      if (count_log > remainder_log) {
+        remainder_pos = n;
+        remainder_log = count_log;
+      }
+    }
+  }
+  JXL_ASSERT(remainder_pos != -1);
+  // NOTE: This is the only place where counts could go negative. We could
+  // detect that, return false and make ANSHistBin uint32_t.
+  counts[remainder_pos] -= sum - table_size;
+  *omit_pos = remainder_pos;
+  return counts[remainder_pos] > 0;
+}
+
+Status NormalizeCounts(ANSHistBin* counts, int* omit_pos, const int length,
+                       const int precision_bits, uint32_t shift,
+                       int* num_symbols, int* symbols) {
+  const int32_t table_size = 1 << precision_bits;  // target sum / table size
+  uint64_t total = 0;
+  int max_symbol = 0;
+  int symbol_count = 0;
+  for (int n = 0; n < length; ++n) {
+    total += counts[n];
+    if (counts[n] > 0) {
+      if (symbol_count < kMaxNumSymbolsForSmallCode) {
+        symbols[symbol_count] = n;
+      }
+      ++symbol_count;
+      max_symbol = n + 1;
+    }
+  }
+  *num_symbols = symbol_count;
+  if (symbol_count == 0) {
+    return true;
+  }
+  if (symbol_count == 1) {
+    counts[symbols[0]] = table_size;
+    return true;
+  }
+  if (symbol_count > table_size)
+    return JXL_FAILURE("Too many entries in an ANS histogram");
+
+  const float norm = 1.f * table_size / total;
+  std::vector<float> targets(max_symbol);
+  for (size_t n = 0; n < targets.size(); ++n) {
+    targets[n] = norm * counts[n];
+  }
+  if (!RebalanceHistogram<false>(&targets[0], max_symbol, table_size, shift,
+                                 omit_pos, counts)) {
+    // Use an alternative rebalancing mechanism if the one above failed
+    // to create a histogram that is positive wherever the original one was.
+    if (!RebalanceHistogram<true>(&targets[0], max_symbol, table_size, shift,
+                                  omit_pos, counts)) {
+      return JXL_FAILURE("Logic error: couldn't rebalance a histogram");
+    }
+  }
+  return true;
+}
+
+struct SizeWriter {
+  size_t size = 0;
+  void Write(size_t num, size_t bits) { size += num; }
+};
+
+template <typename Writer>
+void StoreVarLenUint8(size_t n, Writer* writer) {
+  JXL_DASSERT(n <= 255);
+  if (n == 0) {
+    writer->Write(1, 0);
+  } else {
+    writer->Write(1, 1);
+    size_t nbits = FloorLog2Nonzero(n);
+    writer->Write(3, nbits);
+    writer->Write(nbits, n - (1ULL << nbits));
+  }
+}
+
+template <typename Writer>
+void StoreVarLenUint16(size_t n, Writer* writer) {
+  JXL_DASSERT(n <= 65535);
+  if (n == 0) {
+    writer->Write(1, 0);
+  } else {
+    writer->Write(1, 1);
+    size_t nbits = FloorLog2Nonzero(n);
+    writer->Write(4, nbits);
+    writer->Write(nbits, n - (1ULL << nbits));
+  }
+}
+
+template <typename Writer>
+bool EncodeCounts(const ANSHistBin* counts, const int alphabet_size,
+                  const int omit_pos, const int num_symbols, uint32_t shift,
+                  const int* symbols, Writer* writer) {
+  bool ok = true;
+  if (num_symbols <= 2) {
+    // Small tree marker to encode 1-2 symbols.
+    writer->Write(1, 1);
+    if (num_symbols == 0) {
+      writer->Write(1, 0);
+      StoreVarLenUint8(0, writer);
+    } else {
+      writer->Write(1, num_symbols - 1);
+      for (int i = 0; i < num_symbols; ++i) {
+        StoreVarLenUint8(symbols[i], writer);
+      }
+    }
+    if (num_symbols == 2) {
+      writer->Write(ANS_LOG_TAB_SIZE, counts[symbols[0]]);
+    }
+  } else {
+    // Mark non-small tree.
+    writer->Write(1, 0);
+    // Mark non-flat histogram.
+    writer->Write(1, 0);
+
+    // Precompute sequences for RLE encoding. Contains the number of identical
+    // values starting at a given index. Only contains the value at the first
+    // element of the series.
+    std::vector<uint32_t> same(alphabet_size, 0);
+    int last = 0;
+    for (int i = 1; i < alphabet_size; i++) {
+      // Store the sequence length once different symbol reached, or we're at
+      // the end, or the length is longer than we can encode, or we are at
+      // the omit_pos. We don't support including the omit_pos in an RLE
+      // sequence because this value may use a different amount of log2 bits
+      // than standard, it is too complex to handle in the decoder.
+      if (counts[i] != counts[last] || i + 1 == alphabet_size ||
+          (i - last) >= 255 || i == omit_pos || i == omit_pos + 1) {
+        same[last] = (i - last);
+        last = i + 1;
+      }
+    }
+
+    int length = 0;
+    std::vector<int> logcounts(alphabet_size);
+    int omit_log = 0;
+    for (int i = 0; i < alphabet_size; ++i) {
+      JXL_ASSERT(counts[i] <= ANS_TAB_SIZE);
+      JXL_ASSERT(counts[i] >= 0);
+      if (i == omit_pos) {
+        length = i + 1;
+      } else if (counts[i] > 0) {
+        logcounts[i] = FloorLog2Nonzero(static_cast<uint32_t>(counts[i])) + 1;
+        length = i + 1;
+        if (i < omit_pos) {
+          omit_log = std::max(omit_log, logcounts[i] + 1);
+        } else {
+          omit_log = std::max(omit_log, logcounts[i]);
+        }
+      }
+    }
+    logcounts[omit_pos] = omit_log;
+
+    // Elias gamma-like code for shift. Only difference is that if the number
+    // of bits to be encoded is equal to FloorLog2(ANS_LOG_TAB_SIZE+1), we skip
+    // the terminating 0 in unary coding.
+    int upper_bound_log = FloorLog2Nonzero(ANS_LOG_TAB_SIZE + 1);
+    int log = FloorLog2Nonzero(shift + 1);
+    writer->Write(log, (1 << log) - 1);
+    if (log != upper_bound_log) writer->Write(1, 0);
+    writer->Write(log, ((1 << log) - 1) & (shift + 1));
+
+    // Since num_symbols >= 3, we know that length >= 3, therefore we encode
+    // length - 3.
+    if (length - 3 > 255) {
+      // Pretend that everything is OK, but complain about correctness later.
+      StoreVarLenUint8(255, writer);
+      ok = false;
+    } else {
+      StoreVarLenUint8(length - 3, writer);
+    }
+
+    // The logcount values are encoded with a static Huffman code.
+    static const size_t kMinReps = 4;
+    size_t rep = ANS_LOG_TAB_SIZE + 1;
+    for (int i = 0; i < length; ++i) {
+      if (i > 0 && same[i - 1] > kMinReps) {
+        // Encode the RLE symbol and skip the repeated ones.
+        writer->Write(kLogCountBitLengths[rep], kLogCountSymbols[rep]);
+        StoreVarLenUint8(same[i - 1] - kMinReps - 1, writer);
+        i += same[i - 1] - 2;
+        continue;
+      }
+      writer->Write(kLogCountBitLengths[logcounts[i]],
+                    kLogCountSymbols[logcounts[i]]);
+    }
+    for (int i = 0; i < length; ++i) {
+      if (i > 0 && same[i - 1] > kMinReps) {
+        // Skip symbols encoded by RLE.
+        i += same[i - 1] - 2;
+        continue;
+      }
+      if (logcounts[i] > 1 && i != omit_pos) {
+        int bitcount = GetPopulationCountPrecision(logcounts[i] - 1, shift);
+        int drop_bits = logcounts[i] - 1 - bitcount;
+        JXL_CHECK((counts[i] & ((1 << drop_bits) - 1)) == 0);
+        writer->Write(bitcount, (counts[i] >> drop_bits) - (1 << bitcount));
+      }
+    }
+  }
+  return ok;
+}
+
+void EncodeFlatHistogram(const int alphabet_size, BitWriter* writer) {
+  // Mark non-small tree.
+  writer->Write(1, 0);
+  // Mark uniform histogram.
+  writer->Write(1, 1);
+  JXL_ASSERT(alphabet_size > 0);
+  // Encode alphabet size.
+  StoreVarLenUint8(alphabet_size - 1, writer);
+}
+
+float ComputeHistoAndDataCost(const ANSHistBin* histogram, size_t alphabet_size,
+                              uint32_t method) {
+  if (method == 0) {  // Flat code
+    return ANS_LOG_TAB_SIZE + 2 +
+           EstimateDataBitsFlat(histogram, alphabet_size);
+  }
+  // Non-flat: shift = method-1.
+  uint32_t shift = method - 1;
+  std::vector<ANSHistBin> counts(histogram, histogram + alphabet_size);
+  int omit_pos = 0;
+  int num_symbols;
+  int symbols[kMaxNumSymbolsForSmallCode] = {};
+  JXL_CHECK(NormalizeCounts(counts.data(), &omit_pos, alphabet_size,
+                            ANS_LOG_TAB_SIZE, shift, &num_symbols, symbols));
+  SizeWriter writer;
+  // Ignore the correctness, no real encoding happens at this stage.
+  (void)EncodeCounts(counts.data(), alphabet_size, omit_pos, num_symbols, shift,
+                     symbols, &writer);
+  return writer.size +
+         EstimateDataBits(histogram, counts.data(), alphabet_size);
+}
+
+uint32_t ComputeBestMethod(
+    const ANSHistBin* histogram, size_t alphabet_size, float* cost,
+    HistogramParams::ANSHistogramStrategy ans_histogram_strategy) {
+  size_t method = 0;
+  float fcost = ComputeHistoAndDataCost(histogram, alphabet_size, 0);
+  auto try_shift = [&](size_t shift) {
+    float c = ComputeHistoAndDataCost(histogram, alphabet_size, shift + 1);
+    if (c < fcost) {
+      method = shift + 1;
+      fcost = c;
+    }
+  };
+  switch (ans_histogram_strategy) {
+    case HistogramParams::ANSHistogramStrategy::kPrecise: {
+      for (uint32_t shift = 0; shift <= ANS_LOG_TAB_SIZE; shift++) {
+        try_shift(shift);
+      }
+      break;
+    }
+    case HistogramParams::ANSHistogramStrategy::kApproximate: {
+      for (uint32_t shift = 0; shift <= ANS_LOG_TAB_SIZE; shift += 2) {
+        try_shift(shift);
+      }
+      break;
+    }
+    case HistogramParams::ANSHistogramStrategy::kFast: {
+      try_shift(0);
+      try_shift(ANS_LOG_TAB_SIZE / 2);
+      try_shift(ANS_LOG_TAB_SIZE);
+      break;
+    }
+  };
+  *cost = fcost;
+  return method;
+}
+
+}  // namespace
+
+// Returns an estimate of the cost of encoding this histogram and the
+// corresponding data.
+size_t BuildAndStoreANSEncodingData(
+    HistogramParams::ANSHistogramStrategy ans_histogram_strategy,
+    const ANSHistBin* histogram, size_t alphabet_size, size_t log_alpha_size,
+    bool use_prefix_code, ANSEncSymbolInfo* info, BitWriter* writer) {
+  if (use_prefix_code) {
+    if (alphabet_size <= 1) return 0;
+    std::vector<uint32_t> histo(alphabet_size);
+    for (size_t i = 0; i < alphabet_size; i++) {
+      histo[i] = histogram[i];
+      JXL_CHECK(histogram[i] >= 0);
+    }
+    size_t cost = 0;
+    {
+      std::vector<uint8_t> depths(alphabet_size);
+      std::vector<uint16_t> bits(alphabet_size);
+      if (writer == nullptr) {
+        BitWriter tmp_writer;
+        BitWriter::Allotment allotment(
+            &tmp_writer, 8 * alphabet_size + 8);  // safe upper bound
+        BuildAndStoreHuffmanTree(histo.data(), alphabet_size, depths.data(),
+                                 bits.data(), &tmp_writer);
+        allotment.ReclaimAndCharge(&tmp_writer, 0, /*aux_out=*/nullptr);
+        cost = tmp_writer.BitsWritten();
+      } else {
+        size_t start = writer->BitsWritten();
+        BuildAndStoreHuffmanTree(histo.data(), alphabet_size, depths.data(),
+                                 bits.data(), writer);
+        cost = writer->BitsWritten() - start;
+      }
+      for (size_t i = 0; i < alphabet_size; i++) {
+        info[i].bits = depths[i] == 0 ? 0 : bits[i];
+        info[i].depth = depths[i];
+      }
+    }
+    // Estimate data cost.
+    for (size_t i = 0; i < alphabet_size; i++) {
+      cost += histogram[i] * info[i].depth;
+    }
+    return cost;
+  }
+  JXL_ASSERT(alphabet_size <= ANS_TAB_SIZE);
+  // Ensure we ignore trailing zeros in the histogram.
+  if (alphabet_size != 0) {
+    size_t largest_symbol = 0;
+    for (size_t i = 0; i < alphabet_size; i++) {
+      if (histogram[i] != 0) largest_symbol = i;
+    }
+    alphabet_size = largest_symbol + 1;
+  }
+  float cost;
+  uint32_t method = ComputeBestMethod(histogram, alphabet_size, &cost,
+                                      ans_histogram_strategy);
+  JXL_ASSERT(cost >= 0);
+  int num_symbols;
+  int symbols[kMaxNumSymbolsForSmallCode] = {};
+  std::vector<ANSHistBin> counts(histogram, histogram + alphabet_size);
+  if (!counts.empty()) {
+    size_t sum = 0;
+    for (size_t i = 0; i < counts.size(); i++) {
+      sum += counts[i];
+    }
+    if (sum == 0) {
+      counts[0] = ANS_TAB_SIZE;
+    }
+  }
+  if (method == 0) {
+    counts = CreateFlatHistogram(alphabet_size, ANS_TAB_SIZE);
+    AliasTable::Entry a[ANS_MAX_ALPHABET_SIZE];
+    InitAliasTable(counts, ANS_TAB_SIZE, log_alpha_size, a);
+    ANSBuildInfoTable(counts.data(), a, alphabet_size, log_alpha_size, info);
+    if (writer != nullptr) {
+      EncodeFlatHistogram(alphabet_size, writer);
+    }
+    return cost;
+  }
+  int omit_pos = 0;
+  uint32_t shift = method - 1;
+  JXL_CHECK(NormalizeCounts(counts.data(), &omit_pos, alphabet_size,
+                            ANS_LOG_TAB_SIZE, shift, &num_symbols, symbols));
+  AliasTable::Entry a[ANS_MAX_ALPHABET_SIZE];
+  InitAliasTable(counts, ANS_TAB_SIZE, log_alpha_size, a);
+  ANSBuildInfoTable(counts.data(), a, alphabet_size, log_alpha_size, info);
+  if (writer != nullptr) {
+    bool ok = EncodeCounts(counts.data(), alphabet_size, omit_pos, num_symbols,
+                           shift, symbols, writer);
+    (void)ok;
+    JXL_DASSERT(ok);
+  }
+  return cost;
+}
+
+float ANSPopulationCost(const ANSHistBin* data, size_t alphabet_size) {
+  float c;
+  ComputeBestMethod(data, alphabet_size, &c,
+                    HistogramParams::ANSHistogramStrategy::kFast);
+  return c;
+}
+
+template <typename Writer>
+void EncodeUintConfig(const HybridUintConfig uint_config, Writer* writer,
+                      size_t log_alpha_size) {
+  writer->Write(CeilLog2Nonzero(log_alpha_size + 1),
+                uint_config.split_exponent);
+  if (uint_config.split_exponent == log_alpha_size) {
+    return;  // msb/lsb don't matter.
+  }
+  size_t nbits = CeilLog2Nonzero(uint_config.split_exponent + 1);
+  writer->Write(nbits, uint_config.msb_in_token);
+  nbits = CeilLog2Nonzero(uint_config.split_exponent -
+                          uint_config.msb_in_token + 1);
+  writer->Write(nbits, uint_config.lsb_in_token);
+}
+template <typename Writer>
+void EncodeUintConfigs(const std::vector<HybridUintConfig>& uint_config,
+                       Writer* writer, size_t log_alpha_size) {
+  // TODO(veluca): RLE?
+  for (size_t i = 0; i < uint_config.size(); i++) {
+    EncodeUintConfig(uint_config[i], writer, log_alpha_size);
+  }
+}
+template void EncodeUintConfigs(const std::vector<HybridUintConfig>&,
+                                BitWriter*, size_t);
+
+namespace {
+
+void ChooseUintConfigs(const HistogramParams& params,
+                       const std::vector<std::vector<Token>>& tokens,
+                       const std::vector<uint8_t>& context_map,
+                       std::vector<Histogram>* clustered_histograms,
+                       EntropyEncodingData* codes, size_t* log_alpha_size) {
+  codes->uint_config.resize(clustered_histograms->size());
+
+  if (params.uint_method == HistogramParams::HybridUintMethod::kNone) return;
+  if (params.uint_method == HistogramParams::HybridUintMethod::k000) {
+    codes->uint_config.clear();
+    codes->uint_config.resize(clustered_histograms->size(),
+                              HybridUintConfig(0, 0, 0));
+    return;
+  }
+  if (params.uint_method == HistogramParams::HybridUintMethod::kContextMap) {
+    codes->uint_config.clear();
+    codes->uint_config.resize(clustered_histograms->size(),
+                              HybridUintConfig(2, 0, 1));
+    return;
+  }
+
+  // Brute-force method that tries a few options.
+  std::vector<HybridUintConfig> configs;
+  if (params.uint_method == HistogramParams::HybridUintMethod::kBest) {
+    configs = {
+        HybridUintConfig(4, 2, 0),  // default
+        HybridUintConfig(4, 1, 0),  // less precise
+        HybridUintConfig(4, 2, 1),  // add sign
+        HybridUintConfig(4, 2, 2),  // add sign+parity
+        HybridUintConfig(4, 1, 2),  // add parity but less msb
+        // Same as above, but more direct coding.
+        HybridUintConfig(5, 2, 0), HybridUintConfig(5, 1, 0),
+        HybridUintConfig(5, 2, 1), HybridUintConfig(5, 2, 2),
+        HybridUintConfig(5, 1, 2),
+        // Same as above, but less direct coding.
+        HybridUintConfig(3, 2, 0), HybridUintConfig(3, 1, 0),
+        HybridUintConfig(3, 2, 1), HybridUintConfig(3, 1, 2),
+        // For near-lossless.
+        HybridUintConfig(4, 1, 3), HybridUintConfig(5, 1, 4),
+        HybridUintConfig(5, 2, 3), HybridUintConfig(6, 1, 5),
+        HybridUintConfig(6, 2, 4), HybridUintConfig(6, 0, 0),
+        // Other
+        HybridUintConfig(0, 0, 0),   // varlenuint
+        HybridUintConfig(2, 0, 1),   // works well for ctx map
+        HybridUintConfig(7, 0, 0),   // direct coding
+        HybridUintConfig(8, 0, 0),   // direct coding
+        HybridUintConfig(9, 0, 0),   // direct coding
+        HybridUintConfig(10, 0, 0),  // direct coding
+        HybridUintConfig(11, 0, 0),  // direct coding
+        HybridUintConfig(12, 0, 0),  // direct coding
+    };
+  } else if (params.uint_method == HistogramParams::HybridUintMethod::kFast) {
+    configs = {
+        HybridUintConfig(4, 2, 0),  // default
+        HybridUintConfig(4, 1, 2),  // add parity but less msb
+        HybridUintConfig(0, 0, 0),  // smallest histograms
+        HybridUintConfig(2, 0, 1),  // works well for ctx map
+    };
+  }
+
+  std::vector<float> costs(clustered_histograms->size(),
+                           std::numeric_limits<float>::max());
+  std::vector<uint32_t> extra_bits(clustered_histograms->size());
+  std::vector<uint8_t> is_valid(clustered_histograms->size());
+  size_t max_alpha =
+      codes->use_prefix_code ? PREFIX_MAX_ALPHABET_SIZE : ANS_MAX_ALPHABET_SIZE;
+  for (HybridUintConfig cfg : configs) {
+    std::fill(is_valid.begin(), is_valid.end(), true);
+    std::fill(extra_bits.begin(), extra_bits.end(), 0);
+
+    for (size_t i = 0; i < clustered_histograms->size(); i++) {
+      (*clustered_histograms)[i].Clear();
+    }
+    for (size_t i = 0; i < tokens.size(); ++i) {
+      for (size_t j = 0; j < tokens[i].size(); ++j) {
+        const Token token = tokens[i][j];
+        // TODO(veluca): do not ignore lz77 commands.
+        if (token.is_lz77_length) continue;
+        size_t histo = context_map[token.context];
+        uint32_t tok, nbits, bits;
+        cfg.Encode(token.value, &tok, &nbits, &bits);
+        if (tok >= max_alpha ||
+            (codes->lz77.enabled && tok >= codes->lz77.min_symbol)) {
+          is_valid[histo] = false;
+          continue;
+        }
+        extra_bits[histo] += nbits;
+        (*clustered_histograms)[histo].Add(tok);
+      }
+    }
+
+    for (size_t i = 0; i < clustered_histograms->size(); i++) {
+      if (!is_valid[i]) continue;
+      float cost = (*clustered_histograms)[i].PopulationCost() + extra_bits[i];
+      // add signaling cost of the hybriduintconfig itself
+      cost += CeilLog2Nonzero(cfg.split_exponent + 1);
+      cost += CeilLog2Nonzero(cfg.split_exponent - cfg.msb_in_token + 1);
+      if (cost < costs[i]) {
+        codes->uint_config[i] = cfg;
+        costs[i] = cost;
+      }
+    }
+  }
+
+  // Rebuild histograms.
+  for (size_t i = 0; i < clustered_histograms->size(); i++) {
+    (*clustered_histograms)[i].Clear();
+  }
+  *log_alpha_size = 4;
+  for (size_t i = 0; i < tokens.size(); ++i) {
+    for (size_t j = 0; j < tokens[i].size(); ++j) {
+      const Token token = tokens[i][j];
+      uint32_t tok, nbits, bits;
+      size_t histo = context_map[token.context];
+      (token.is_lz77_length ? codes->lz77.length_uint_config
+                            : codes->uint_config[histo])
+          .Encode(token.value, &tok, &nbits, &bits);
+      tok += token.is_lz77_length ? codes->lz77.min_symbol : 0;
+      (*clustered_histograms)[histo].Add(tok);
+      while (tok >= (1u << *log_alpha_size)) (*log_alpha_size)++;
+    }
+  }
+#if JXL_ENABLE_ASSERT
+  size_t max_log_alpha_size = codes->use_prefix_code ? PREFIX_MAX_BITS : 8;
+  JXL_ASSERT(*log_alpha_size <= max_log_alpha_size);
+#endif
+}
+
+class HistogramBuilder {
+ public:
+  explicit HistogramBuilder(const size_t num_contexts)
+      : histograms_(num_contexts) {}
+
+  void VisitSymbol(int symbol, size_t histo_idx) {
+    JXL_DASSERT(histo_idx < histograms_.size());
+    histograms_[histo_idx].Add(symbol);
+  }
+
+  // NOTE: `layer` is only for clustered_entropy; caller does ReclaimAndCharge.
+  size_t BuildAndStoreEntropyCodes(
+      const HistogramParams& params,
+      const std::vector<std::vector<Token>>& tokens, EntropyEncodingData* codes,
+      std::vector<uint8_t>* context_map, bool use_prefix_code,
+      BitWriter* writer, size_t layer, AuxOut* aux_out) const {
+    size_t cost = 0;
+    codes->encoding_info.clear();
+    std::vector<Histogram> clustered_histograms(histograms_);
+    context_map->resize(histograms_.size());
+    if (histograms_.size() > 1) {
+      if (!ans_fuzzer_friendly_) {
+        std::vector<uint32_t> histogram_symbols;
+        ClusterHistograms(params, histograms_, kClustersLimit,
+                          &clustered_histograms, &histogram_symbols);
+        for (size_t c = 0; c < histograms_.size(); ++c) {
+          (*context_map)[c] = static_cast<uint8_t>(histogram_symbols[c]);
+        }
+      } else {
+        fill(context_map->begin(), context_map->end(), 0);
+        size_t max_symbol = 0;
+        for (const Histogram& h : histograms_) {
+          max_symbol = std::max(h.data_.size(), max_symbol);
+        }
+        size_t num_symbols = 1 << CeilLog2Nonzero(max_symbol + 1);
+        clustered_histograms.resize(1);
+        clustered_histograms[0].Clear();
+        for (size_t i = 0; i < num_symbols; i++) {
+          clustered_histograms[0].Add(i);
+        }
+      }
+      if (writer != nullptr) {
+        EncodeContextMap(*context_map, clustered_histograms.size(), writer,
+                         layer, aux_out);
+      }
+    }
+    if (aux_out != nullptr) {
+      for (size_t i = 0; i < clustered_histograms.size(); ++i) {
+        aux_out->layers[layer].clustered_entropy +=
+            clustered_histograms[i].ShannonEntropy();
+      }
+    }
+    codes->use_prefix_code = use_prefix_code;
+    size_t log_alpha_size = codes->lz77.enabled ? 8 : 7;  // Sane default.
+    if (ans_fuzzer_friendly_) {
+      codes->uint_config.clear();
+      codes->uint_config.resize(1, HybridUintConfig(7, 0, 0));
+    } else {
+      ChooseUintConfigs(params, tokens, *context_map, &clustered_histograms,
+                        codes, &log_alpha_size);
+    }
+    if (log_alpha_size < 5) log_alpha_size = 5;
+    SizeWriter size_writer;  // Used if writer == nullptr to estimate costs.
+    cost += 1;
+    if (writer) writer->Write(1, use_prefix_code);
+
+    if (use_prefix_code) {
+      log_alpha_size = PREFIX_MAX_BITS;
+    } else {
+      cost += 2;
+    }
+    if (writer == nullptr) {
+      EncodeUintConfigs(codes->uint_config, &size_writer, log_alpha_size);
+    } else {
+      if (!use_prefix_code) writer->Write(2, log_alpha_size - 5);
+      EncodeUintConfigs(codes->uint_config, writer, log_alpha_size);
+    }
+    if (use_prefix_code) {
+      for (size_t c = 0; c < clustered_histograms.size(); ++c) {
+        size_t num_symbol = 1;
+        for (size_t i = 0; i < clustered_histograms[c].data_.size(); i++) {
+          if (clustered_histograms[c].data_[i]) num_symbol = i + 1;
+        }
+        if (writer) {
+          StoreVarLenUint16(num_symbol - 1, writer);
+        } else {
+          StoreVarLenUint16(num_symbol - 1, &size_writer);
+        }
+      }
+    }
+    cost += size_writer.size;
+    for (size_t c = 0; c < clustered_histograms.size(); ++c) {
+      size_t num_symbol = 1;
+      for (size_t i = 0; i < clustered_histograms[c].data_.size(); i++) {
+        if (clustered_histograms[c].data_[i]) num_symbol = i + 1;
+      }
+      codes->encoding_info.emplace_back();
+      codes->encoding_info.back().resize(std::max<size_t>(1, num_symbol));
+
+      BitWriter::Allotment allotment(writer, 256 + num_symbol * 24);
+      cost += BuildAndStoreANSEncodingData(
+          params.ans_histogram_strategy, clustered_histograms[c].data_.data(),
+          num_symbol, log_alpha_size, use_prefix_code,
+          codes->encoding_info.back().data(), writer);
+      allotment.FinishedHistogram(writer);
+      allotment.ReclaimAndCharge(writer, layer, aux_out);
+    }
+    return cost;
+  }
+
+  const Histogram& Histo(size_t i) const { return histograms_[i]; }
+
+ private:
+  std::vector<Histogram> histograms_;
+};
+
+class SymbolCostEstimator {
+ public:
+  SymbolCostEstimator(size_t num_contexts, bool force_huffman,
+                      const std::vector<std::vector<Token>>& tokens,
+                      const LZ77Params& lz77) {
+    HistogramBuilder builder(num_contexts);
+    // Build histograms for estimating lz77 savings.
+    HybridUintConfig uint_config;
+    for (size_t i = 0; i < tokens.size(); ++i) {
+      for (size_t j = 0; j < tokens[i].size(); ++j) {
+        const Token token = tokens[i][j];
+        uint32_t tok, nbits, bits;
+        (token.is_lz77_length ? lz77.length_uint_config : uint_config)
+            .Encode(token.value, &tok, &nbits, &bits);
+        tok += token.is_lz77_length ? lz77.min_symbol : 0;
+        builder.VisitSymbol(tok, token.context);
+      }
+    }
+    max_alphabet_size_ = 0;
+    for (size_t i = 0; i < num_contexts; i++) {
+      max_alphabet_size_ =
+          std::max(max_alphabet_size_, builder.Histo(i).data_.size());
+    }
+    bits_.resize(num_contexts * max_alphabet_size_);
+    // TODO(veluca): SIMD?
+    add_symbol_cost_.resize(num_contexts);
+    for (size_t i = 0; i < num_contexts; i++) {
+      float inv_total = 1.0f / (builder.Histo(i).total_count_ + 1e-8f);
+      float total_cost = 0;
+      for (size_t j = 0; j < builder.Histo(i).data_.size(); j++) {
+        size_t cnt = builder.Histo(i).data_[j];
+        float cost = 0;
+        if (cnt != 0 && cnt != builder.Histo(i).total_count_) {
+          cost = -FastLog2f(cnt * inv_total);
+          if (force_huffman) cost = std::ceil(cost);
+        } else if (cnt == 0) {
+          cost = ANS_LOG_TAB_SIZE;  // Highest possible cost.
+        }
+        bits_[i * max_alphabet_size_ + j] = cost;
+        total_cost += cost * builder.Histo(i).data_[j];
+      }
+      // Penalty for adding a lz77 symbol to this contest (only used for static
+      // cost model). Higher penalty for contexts that have a very low
+      // per-symbol entropy.
+      add_symbol_cost_[i] = std::max(0.0f, 6.0f - total_cost * inv_total);
+    }
+  }
+  float Bits(size_t ctx, size_t sym) const {
+    return bits_[ctx * max_alphabet_size_ + sym];
+  }
+  float LenCost(size_t ctx, size_t len, const LZ77Params& lz77) const {
+    uint32_t nbits, bits, tok;
+    lz77.length_uint_config.Encode(len, &tok, &nbits, &bits);
+    tok += lz77.min_symbol;
+    return nbits + Bits(ctx, tok);
+  }
+  float DistCost(size_t len, const LZ77Params& lz77) const {
+    uint32_t nbits, bits, tok;
+    HybridUintConfig().Encode(len, &tok, &nbits, &bits);
+    return nbits + Bits(lz77.nonserialized_distance_context, tok);
+  }
+  float AddSymbolCost(size_t idx) const { return add_symbol_cost_[idx]; }
+
+ private:
+  size_t max_alphabet_size_;
+  std::vector<float> bits_;
+  std::vector<float> add_symbol_cost_;
+};
+
+void ApplyLZ77_RLE(const HistogramParams& params, size_t num_contexts,
+                   const std::vector<std::vector<Token>>& tokens,
+                   LZ77Params& lz77,
+                   std::vector<std::vector<Token>>& tokens_lz77) {
+  // TODO(veluca): tune heuristics here.
+  SymbolCostEstimator sce(num_contexts, params.force_huffman, tokens, lz77);
+  float bit_decrease = 0;
+  size_t total_symbols = 0;
+  tokens_lz77.resize(tokens.size());
+  std::vector<float> sym_cost;
+  HybridUintConfig uint_config;
+  for (size_t stream = 0; stream < tokens.size(); stream++) {
+    size_t distance_multiplier =
+        params.image_widths.size() > stream ? params.image_widths[stream] : 0;
+    const auto& in = tokens[stream];
+    auto& out = tokens_lz77[stream];
+    total_symbols += in.size();
+    // Cumulative sum of bit costs.
+    sym_cost.resize(in.size() + 1);
+    for (size_t i = 0; i < in.size(); i++) {
+      uint32_t tok, nbits, unused_bits;
+      uint_config.Encode(in[i].value, &tok, &nbits, &unused_bits);
+      sym_cost[i + 1] = sce.Bits(in[i].context, tok) + nbits + sym_cost[i];
+    }
+    out.reserve(in.size());
+    for (size_t i = 0; i < in.size(); i++) {
+      size_t num_to_copy = 0;
+      size_t distance_symbol = 0;  // 1 for RLE.
+      if (distance_multiplier != 0) {
+        distance_symbol = 1;  // Special distance 1 if enabled.
+        JXL_DASSERT(kSpecialDistances[1][0] == 1);
+        JXL_DASSERT(kSpecialDistances[1][1] == 0);
+      }
+      if (i > 0) {
+        for (; i + num_to_copy < in.size(); num_to_copy++) {
+          if (in[i + num_to_copy].value != in[i - 1].value) {
+            break;
+          }
+        }
+      }
+      if (num_to_copy == 0) {
+        out.push_back(in[i]);
+        continue;
+      }
+      float cost = sym_cost[i + num_to_copy] - sym_cost[i];
+      // This subtraction might overflow, but that's OK.
+      size_t lz77_len = num_to_copy - lz77.min_length;
+      float lz77_cost = num_to_copy >= lz77.min_length
+                            ? CeilLog2Nonzero(lz77_len + 1) + 1
+                            : 0;
+      if (num_to_copy < lz77.min_length || cost <= lz77_cost) {
+        for (size_t j = 0; j < num_to_copy; j++) {
+          out.push_back(in[i + j]);
+        }
+        i += num_to_copy - 1;
+        continue;
+      }
+      // Output the LZ77 length
+      out.emplace_back(in[i].context, lz77_len);
+      out.back().is_lz77_length = true;
+      i += num_to_copy - 1;
+      bit_decrease += cost - lz77_cost;
+      // Output the LZ77 copy distance.
+      out.emplace_back(lz77.nonserialized_distance_context, distance_symbol);
+    }
+  }
+
+  if (bit_decrease > total_symbols * 0.2 + 16) {
+    lz77.enabled = true;
+  }
+}
+
+// Hash chain for LZ77 matching
+struct HashChain {
+  size_t size_;
+  std::vector<uint32_t> data_;
+
+  unsigned hash_num_values_ = 32768;
+  unsigned hash_mask_ = hash_num_values_ - 1;
+  unsigned hash_shift_ = 5;
+
+  std::vector<int> head;
+  std::vector<uint32_t> chain;
+  std::vector<int> val;
+
+  // Speed up repetitions of zero
+  std::vector<int> headz;
+  std::vector<uint32_t> chainz;
+  std::vector<uint32_t> zeros;
+  uint32_t numzeros = 0;
+
+  size_t window_size_;
+  size_t window_mask_;
+  size_t min_length_;
+  size_t max_length_;
+
+  // Map of special distance codes.
+  std::unordered_map<int, int> special_dist_table_;
+  size_t num_special_distances_ = 0;
+
+  uint32_t maxchainlength = 256;  // window_size_ to allow all
+
+  HashChain(const Token* data, size_t size, size_t window_size,
+            size_t min_length, size_t max_length, size_t distance_multiplier)
+      : size_(size),
+        window_size_(window_size),
+        window_mask_(window_size - 1),
+        min_length_(min_length),
+        max_length_(max_length) {
+    data_.resize(size);
+    for (size_t i = 0; i < size; i++) {
+      data_[i] = data[i].value;
+    }
+
+    head.resize(hash_num_values_, -1);
+    val.resize(window_size_, -1);
+    chain.resize(window_size_);
+    for (uint32_t i = 0; i < window_size_; ++i) {
+      chain[i] = i;  // same value as index indicates uninitialized
+    }
+
+    zeros.resize(window_size_);
+    headz.resize(window_size_ + 1, -1);
+    chainz.resize(window_size_);
+    for (uint32_t i = 0; i < window_size_; ++i) {
+      chainz[i] = i;
+    }
+    // Translate distance to special distance code.
+    if (distance_multiplier) {
+      // Count down, so if due to small distance multiplier multiple distances
+      // map to the same code, the smallest code will be used in the end.
+      for (int i = kNumSpecialDistances - 1; i >= 0; --i) {
+        int xi = kSpecialDistances[i][0];
+        int yi = kSpecialDistances[i][1];
+        int distance = yi * distance_multiplier + xi;
+        // Ensure that we map distance 1 to the lowest symbols.
+        if (distance < 1) distance = 1;
+        special_dist_table_[distance] = i;
+      }
+      num_special_distances_ = kNumSpecialDistances;
+    }
+  }
+
+  uint32_t GetHash(size_t pos) const {
+    uint32_t result = 0;
+    if (pos + 2 < size_) {
+      // TODO(lode): take the MSB's of the uint32_t values into account as well,
+      // given that the hash code itself is less than 32 bits.
+      result ^= (uint32_t)(data_[pos + 0] << 0u);
+      result ^= (uint32_t)(data_[pos + 1] << hash_shift_);
+      result ^= (uint32_t)(data_[pos + 2] << (hash_shift_ * 2));
+    } else {
+      // No need to compute hash of last 2 bytes, the length 2 is too short.
+      return 0;
+    }
+    return result & hash_mask_;
+  }
+
+  uint32_t CountZeros(size_t pos, uint32_t prevzeros) const {
+    size_t end = pos + window_size_;
+    if (end > size_) end = size_;
+    if (prevzeros > 0) {
+      if (prevzeros >= window_mask_ && data_[end - 1] == 0 &&
+          end == pos + window_size_) {
+        return prevzeros;
+      } else {
+        return prevzeros - 1;
+      }
+    }
+    uint32_t num = 0;
+    while (pos + num < end && data_[pos + num] == 0) num++;
+    return num;
+  }
+
+  void Update(size_t pos) {
+    uint32_t hashval = GetHash(pos);
+    uint32_t wpos = pos & window_mask_;
+
+    val[wpos] = (int)hashval;
+    if (head[hashval] != -1) chain[wpos] = head[hashval];
+    head[hashval] = wpos;
+
+    if (pos > 0 && data_[pos] != data_[pos - 1]) numzeros = 0;
+    numzeros = CountZeros(pos, numzeros);
+
+    zeros[wpos] = numzeros;
+    if (headz[numzeros] != -1) chainz[wpos] = headz[numzeros];
+    headz[numzeros] = wpos;
+  }
+
+  void Update(size_t pos, size_t len) {
+    for (size_t i = 0; i < len; i++) {
+      Update(pos + i);
+    }
+  }
+
+  template <typename CB>
+  void FindMatches(size_t pos, int max_dist, const CB& found_match) const {
+    uint32_t wpos = pos & window_mask_;
+    uint32_t hashval = GetHash(pos);
+    uint32_t hashpos = chain[wpos];
+
+    int prev_dist = 0;
+    int end = std::min<int>(pos + max_length_, size_);
+    uint32_t chainlength = 0;
+    uint32_t best_len = 0;
+    for (;;) {
+      int dist = (hashpos <= wpos) ? (wpos - hashpos)
+                                   : (wpos - hashpos + window_mask_ + 1);
+      if (dist < prev_dist) break;
+      prev_dist = dist;
+      uint32_t len = 0;
+      if (dist > 0) {
+        int i = pos;
+        int j = pos - dist;
+        if (numzeros > 3) {
+          int r = std::min<int>(numzeros - 1, zeros[hashpos]);
+          if (i + r >= end) r = end - i - 1;
+          i += r;
+          j += r;
+        }
+        while (i < end && data_[i] == data_[j]) {
+          i++;
+          j++;
+        }
+        len = i - pos;
+        // This can trigger even if the new length is slightly smaller than the
+        // best length, because it is possible for a slightly cheaper distance
+        // symbol to occur.
+        if (len >= min_length_ && len + 2 >= best_len) {
+          auto it = special_dist_table_.find(dist);
+          int dist_symbol = (it == special_dist_table_.end())
+                                ? (num_special_distances_ + dist - 1)
+                                : it->second;
+          found_match(len, dist_symbol);
+          if (len > best_len) best_len = len;
+        }
+      }
+
+      chainlength++;
+      if (chainlength >= maxchainlength) break;
+
+      if (numzeros >= 3 && len > numzeros) {
+        if (hashpos == chainz[hashpos]) break;
+        hashpos = chainz[hashpos];
+        if (zeros[hashpos] != numzeros) break;
+      } else {
+        if (hashpos == chain[hashpos]) break;
+        hashpos = chain[hashpos];
+        if (val[hashpos] != (int)hashval) break;  // outdated hash value
+      }
+    }
+  }
+  void FindMatch(size_t pos, int max_dist, size_t* result_dist_symbol,
+                 size_t* result_len) const {
+    *result_dist_symbol = 0;
+    *result_len = 1;
+    FindMatches(pos, max_dist, [&](size_t len, size_t dist_symbol) {
+      if (len > *result_len ||
+          (len == *result_len && *result_dist_symbol > dist_symbol)) {
+        *result_len = len;
+        *result_dist_symbol = dist_symbol;
+      }
+    });
+  }
+};
+
+float LenCost(size_t len) {
+  uint32_t nbits, bits, tok;
+  HybridUintConfig(1, 0, 0).Encode(len, &tok, &nbits, &bits);
+  constexpr float kCostTable[] = {
+      2.797667318563126,  3.213177690381199,  2.5706009246743737,
+      2.408392498667534,  2.829649191872326,  3.3923087753324577,
+      4.029267451554331,  4.415576699706408,  4.509357574741465,
+      9.21481543803004,   10.020590190114898, 11.858671627804766,
+      12.45853300490526,  11.713105831990857, 12.561996324849314,
+      13.775477692278367, 13.174027068768641,
+  };
+  size_t table_size = sizeof kCostTable / sizeof *kCostTable;
+  if (tok >= table_size) tok = table_size - 1;
+  return kCostTable[tok] + nbits;
+}
+
+// TODO(veluca): this does not take into account usage or non-usage of distance
+// multipliers.
+float DistCost(size_t dist) {
+  uint32_t nbits, bits, tok;
+  HybridUintConfig(7, 0, 0).Encode(dist, &tok, &nbits, &bits);
+  constexpr float kCostTable[] = {
+      6.368282626312716,  5.680793277090298,  8.347404197105247,
+      7.641619201599141,  6.914328374119438,  7.959808291537444,
+      8.70023120759855,   8.71378518934703,   9.379132523982769,
+      9.110472749092708,  9.159029569270908,  9.430936766731973,
+      7.278284055315169,  7.8278514904267755, 10.026641158289236,
+      9.976049229827066,  9.64351607048908,   9.563403863480442,
+      10.171474111762747, 10.45950155077234,  9.994813912104219,
+      10.322524683741156, 8.465808729388186,  8.756254166066853,
+      10.160930174662234, 10.247329273413435, 10.04090403724809,
+      10.129398517544082, 9.342311691539546,  9.07608009102374,
+      10.104799540677513, 10.378079384990906, 10.165828974075072,
+      10.337595322341553, 7.940557464567944,  10.575665823319431,
+      11.023344321751955, 10.736144698831827, 11.118277044595054,
+      7.468468230648442,  10.738305230932939, 10.906980780216568,
+      10.163468216353817, 10.17805759656433,  11.167283670483565,
+      11.147050200274544, 10.517921919244333, 10.651764778156886,
+      10.17074446448919,  11.217636876224745, 11.261630721139484,
+      11.403140815247259, 10.892472096873417, 11.1859607804481,
+      8.017346947551262,  7.895143720278828,  11.036577113822025,
+      11.170562110315794, 10.326988722591086, 10.40872184751056,
+      11.213498225466386, 11.30580635516863,  10.672272515665442,
+      10.768069466228063, 11.145257364153565, 11.64668307145549,
+      10.593156194627339, 11.207499484844943, 10.767517766396908,
+      10.826629811407042, 10.737764794499988, 10.6200448518045,
+      10.191315385198092, 8.468384171390085,  11.731295299170432,
+      11.824619886654398, 10.41518844301179,  10.16310536548649,
+      10.539423685097576, 10.495136599328031, 10.469112847728267,
+      11.72057686174922,  10.910326337834674, 11.378921834673758,
+      11.847759036098536, 11.92071647623854,  10.810628276345282,
+      11.008601085273893, 11.910326337834674, 11.949212023423133,
+      11.298614839104337, 11.611603659010392, 10.472930394619985,
+      11.835564720850282, 11.523267392285337, 12.01055816679611,
+      8.413029688994023,  11.895784139536406, 11.984679534970505,
+      11.220654278717394, 11.716311684833672, 10.61036646226114,
+      10.89849965960364,  10.203762898863669, 10.997560826267238,
+      11.484217379438984, 11.792836176993665, 12.24310468755171,
+      11.464858097919262, 12.212747017409377, 11.425595666074955,
+      11.572048533398757, 12.742093965163013, 11.381874288645637,
+      12.191870445817015, 11.683156920035426, 11.152442115262197,
+      11.90303691580457,  11.653292787169159, 11.938615382266098,
+      16.970641701570223, 16.853602280380002, 17.26240782594733,
+      16.644655390108507, 17.14310889757499,  16.910935455445955,
+      17.505678976959697, 17.213498225466388, 2.4162310293553024,
+      3.494587244462329,  3.5258600986408344, 3.4959806589517095,
+      3.098390886949687,  3.343454654302911,  3.588847442290287,
+      4.14614790111827,   5.152948641990529,  7.433696808092598,
+      9.716311684833672,
+  };
+  size_t table_size = sizeof kCostTable / sizeof *kCostTable;
+  if (tok >= table_size) tok = table_size - 1;
+  return kCostTable[tok] + nbits;
+}
+
+void ApplyLZ77_LZ77(const HistogramParams& params, size_t num_contexts,
+                    const std::vector<std::vector<Token>>& tokens,
+                    LZ77Params& lz77,
+                    std::vector<std::vector<Token>>& tokens_lz77) {
+  // TODO(veluca): tune heuristics here.
+  SymbolCostEstimator sce(num_contexts, params.force_huffman, tokens, lz77);
+  float bit_decrease = 0;
+  size_t total_symbols = 0;
+  tokens_lz77.resize(tokens.size());
+  HybridUintConfig uint_config;
+  std::vector<float> sym_cost;
+  for (size_t stream = 0; stream < tokens.size(); stream++) {
+    size_t distance_multiplier =
+        params.image_widths.size() > stream ? params.image_widths[stream] : 0;
+    const auto& in = tokens[stream];
+    auto& out = tokens_lz77[stream];
+    total_symbols += in.size();
+    // Cumulative sum of bit costs.
+    sym_cost.resize(in.size() + 1);
+    for (size_t i = 0; i < in.size(); i++) {
+      uint32_t tok, nbits, unused_bits;
+      uint_config.Encode(in[i].value, &tok, &nbits, &unused_bits);
+      sym_cost[i + 1] = sce.Bits(in[i].context, tok) + nbits + sym_cost[i];
+    }
+
+    out.reserve(in.size());
+    size_t max_distance = in.size();
+    size_t min_length = lz77.min_length;
+    JXL_ASSERT(min_length >= 3);
+    size_t max_length = in.size();
+
+    // Use next power of two as window size.
+    size_t window_size = 1;
+    while (window_size < max_distance && window_size < kWindowSize) {
+      window_size <<= 1;
+    }
+
+    HashChain chain(in.data(), in.size(), window_size, min_length, max_length,
+                    distance_multiplier);
+    size_t len, dist_symbol;
+
+    const size_t max_lazy_match_len = 256;  // 0 to disable lazy matching
+
+    // Whether the next symbol was already updated (to test lazy matching)
+    bool already_updated = false;
+    for (size_t i = 0; i < in.size(); i++) {
+      out.push_back(in[i]);
+      if (!already_updated) chain.Update(i);
+      already_updated = false;
+      chain.FindMatch(i, max_distance, &dist_symbol, &len);
+      if (len >= min_length) {
+        if (len < max_lazy_match_len && i + 1 < in.size()) {
+          // Try length at next symbol lazy matching
+          chain.Update(i + 1);
+          already_updated = true;
+          size_t len2, dist_symbol2;
+          chain.FindMatch(i + 1, max_distance, &dist_symbol2, &len2);
+          if (len2 > len) {
+            // Use the lazy match. Add literal, and use the next length starting
+            // from the next byte.
+            ++i;
+            already_updated = false;
+            len = len2;
+            dist_symbol = dist_symbol2;
+            out.push_back(in[i]);
+          }
+        }
+
+        float cost = sym_cost[i + len] - sym_cost[i];
+        size_t lz77_len = len - lz77.min_length;
+        float lz77_cost = LenCost(lz77_len) + DistCost(dist_symbol) +
+                          sce.AddSymbolCost(out.back().context);
+
+        if (lz77_cost <= cost) {
+          out.back().value = len - min_length;
+          out.back().is_lz77_length = true;
+          out.emplace_back(lz77.nonserialized_distance_context, dist_symbol);
+          bit_decrease += cost - lz77_cost;
+        } else {
+          // LZ77 match ignored, and symbol already pushed. Push all other
+          // symbols and skip.
+          for (size_t j = 1; j < len; j++) {
+            out.push_back(in[i + j]);
+          }
+        }
+
+        if (already_updated) {
+          chain.Update(i + 2, len - 2);
+          already_updated = false;
+        } else {
+          chain.Update(i + 1, len - 1);
+        }
+        i += len - 1;
+      } else {
+        // Literal, already pushed
+      }
+    }
+  }
+
+  if (bit_decrease > total_symbols * 0.2 + 16) {
+    lz77.enabled = true;
+  }
+}
+
+void ApplyLZ77_Optimal(const HistogramParams& params, size_t num_contexts,
+                       const std::vector<std::vector<Token>>& tokens,
+                       LZ77Params& lz77,
+                       std::vector<std::vector<Token>>& tokens_lz77) {
+  std::vector<std::vector<Token>> tokens_for_cost_estimate;
+  ApplyLZ77_LZ77(params, num_contexts, tokens, lz77, tokens_for_cost_estimate);
+  // If greedy-LZ77 does not give better compression than no-lz77, no reason to
+  // run the optimal matching.
+  if (!lz77.enabled) return;
+  SymbolCostEstimator sce(num_contexts + 1, params.force_huffman,
+                          tokens_for_cost_estimate, lz77);
+  tokens_lz77.resize(tokens.size());
+  HybridUintConfig uint_config;
+  std::vector<float> sym_cost;
+  std::vector<uint32_t> dist_symbols;
+  for (size_t stream = 0; stream < tokens.size(); stream++) {
+    size_t distance_multiplier =
+        params.image_widths.size() > stream ? params.image_widths[stream] : 0;
+    const auto& in = tokens[stream];
+    auto& out = tokens_lz77[stream];
+    // Cumulative sum of bit costs.
+    sym_cost.resize(in.size() + 1);
+    for (size_t i = 0; i < in.size(); i++) {
+      uint32_t tok, nbits, unused_bits;
+      uint_config.Encode(in[i].value, &tok, &nbits, &unused_bits);
+      sym_cost[i + 1] = sce.Bits(in[i].context, tok) + nbits + sym_cost[i];
+    }
+
+    out.reserve(in.size());
+    size_t max_distance = in.size();
+    size_t min_length = lz77.min_length;
+    JXL_ASSERT(min_length >= 3);
+    size_t max_length = in.size();
+
+    // Use next power of two as window size.
+    size_t window_size = 1;
+    while (window_size < max_distance && window_size < kWindowSize) {
+      window_size <<= 1;
+    }
+
+    HashChain chain(in.data(), in.size(), window_size, min_length, max_length,
+                    distance_multiplier);
+
+    struct MatchInfo {
+      uint32_t len;
+      uint32_t dist_symbol;
+      uint32_t ctx;
+      float total_cost = std::numeric_limits<float>::max();
+    };
+    // Total cost to encode the first N symbols.
+    std::vector<MatchInfo> prefix_costs(in.size() + 1);
+    prefix_costs[0].total_cost = 0;
+
+    size_t rle_length = 0;
+    size_t skip_lz77 = 0;
+    for (size_t i = 0; i < in.size(); i++) {
+      chain.Update(i);
+      float lit_cost =
+          prefix_costs[i].total_cost + sym_cost[i + 1] - sym_cost[i];
+      if (prefix_costs[i + 1].total_cost > lit_cost) {
+        prefix_costs[i + 1].dist_symbol = 0;
+        prefix_costs[i + 1].len = 1;
+        prefix_costs[i + 1].ctx = in[i].context;
+        prefix_costs[i + 1].total_cost = lit_cost;
+      }
+      if (skip_lz77 > 0) {
+        skip_lz77--;
+        continue;
+      }
+      dist_symbols.clear();
+      chain.FindMatches(i, max_distance,
+                        [&dist_symbols](size_t len, size_t dist_symbol) {
+                          if (dist_symbols.size() <= len) {
+                            dist_symbols.resize(len + 1, dist_symbol);
+                          }
+                          if (dist_symbol < dist_symbols[len]) {
+                            dist_symbols[len] = dist_symbol;
+                          }
+                        });
+      if (dist_symbols.size() <= min_length) continue;
+      {
+        size_t best_cost = dist_symbols.back();
+        for (size_t j = dist_symbols.size() - 1; j >= min_length; j--) {
+          if (dist_symbols[j] < best_cost) {
+            best_cost = dist_symbols[j];
+          }
+          dist_symbols[j] = best_cost;
+        }
+      }
+      for (size_t j = min_length; j < dist_symbols.size(); j++) {
+        // Cost model that uses results from lazy LZ77.
+        float lz77_cost = sce.LenCost(in[i].context, j - min_length, lz77) +
+                          sce.DistCost(dist_symbols[j], lz77);
+        float cost = prefix_costs[i].total_cost + lz77_cost;
+        if (prefix_costs[i + j].total_cost > cost) {
+          prefix_costs[i + j].len = j;
+          prefix_costs[i + j].dist_symbol = dist_symbols[j] + 1;
+          prefix_costs[i + j].ctx = in[i].context;
+          prefix_costs[i + j].total_cost = cost;
+        }
+      }
+      // We are in a RLE sequence: skip all the symbols except the first 8 and
+      // the last 8. This avoid quadratic costs for sequences with long runs of
+      // the same symbol.
+      if ((dist_symbols.back() == 0 && distance_multiplier == 0) ||
+          (dist_symbols.back() == 1 && distance_multiplier != 0)) {
+        rle_length++;
+      } else {
+        rle_length = 0;
+      }
+      if (rle_length >= 8 && dist_symbols.size() > 9) {
+        skip_lz77 = dist_symbols.size() - 10;
+        rle_length = 0;
+      }
+    }
+    size_t pos = in.size();
+    while (pos > 0) {
+      bool is_lz77_length = prefix_costs[pos].dist_symbol != 0;
+      if (is_lz77_length) {
+        size_t dist_symbol = prefix_costs[pos].dist_symbol - 1;
+        out.emplace_back(lz77.nonserialized_distance_context, dist_symbol);
+      }
+      size_t val = is_lz77_length ? prefix_costs[pos].len - min_length
+                                  : in[pos - 1].value;
+      out.emplace_back(prefix_costs[pos].ctx, val);
+      out.back().is_lz77_length = is_lz77_length;
+      pos -= prefix_costs[pos].len;
+    }
+    std::reverse(out.begin(), out.end());
+  }
+}
+
+void ApplyLZ77(const HistogramParams& params, size_t num_contexts,
+               const std::vector<std::vector<Token>>& tokens, LZ77Params& lz77,
+               std::vector<std::vector<Token>>& tokens_lz77) {
+  lz77.enabled = false;
+  if (params.force_huffman) {
+    lz77.min_symbol = std::min(PREFIX_MAX_ALPHABET_SIZE - 32, 512);
+  } else {
+    lz77.min_symbol = 224;
+  }
+  if (params.lz77_method == HistogramParams::LZ77Method::kNone) {
+    return;
+  } else if (params.lz77_method == HistogramParams::LZ77Method::kRLE) {
+    ApplyLZ77_RLE(params, num_contexts, tokens, lz77, tokens_lz77);
+  } else if (params.lz77_method == HistogramParams::LZ77Method::kLZ77) {
+    ApplyLZ77_LZ77(params, num_contexts, tokens, lz77, tokens_lz77);
+  } else if (params.lz77_method == HistogramParams::LZ77Method::kOptimal) {
+    ApplyLZ77_Optimal(params, num_contexts, tokens, lz77, tokens_lz77);
+  } else {
+    JXL_ABORT("Not implemented");
+  }
+}
+}  // namespace
+
+size_t BuildAndEncodeHistograms(const HistogramParams& params,
+                                size_t num_contexts,
+                                std::vector<std::vector<Token>>& tokens,
+                                EntropyEncodingData* codes,
+                                std::vector<uint8_t>* context_map,
+                                BitWriter* writer, size_t layer,
+                                AuxOut* aux_out) {
+  size_t total_bits = 0;
+  codes->lz77.nonserialized_distance_context = num_contexts;
+  std::vector<std::vector<Token>> tokens_lz77;
+  ApplyLZ77(params, num_contexts, tokens, codes->lz77, tokens_lz77);
+  if (ans_fuzzer_friendly_) {
+    codes->lz77.length_uint_config = HybridUintConfig(10, 0, 0);
+    codes->lz77.min_symbol = 2048;
+  }
+
+  const size_t max_contexts = std::min(num_contexts, kClustersLimit);
+  BitWriter::Allotment allotment(writer,
+                                 128 + num_contexts * 40 + max_contexts * 96);
+  if (writer) {
+    JXL_CHECK(Bundle::Write(codes->lz77, writer, layer, aux_out));
+  } else {
+    size_t ebits, bits;
+    JXL_CHECK(Bundle::CanEncode(codes->lz77, &ebits, &bits));
+    total_bits += bits;
+  }
+  if (codes->lz77.enabled) {
+    if (writer) {
+      size_t b = writer->BitsWritten();
+      EncodeUintConfig(codes->lz77.length_uint_config, writer,
+                       /*log_alpha_size=*/8);
+      total_bits += writer->BitsWritten() - b;
+    } else {
+      SizeWriter size_writer;
+      EncodeUintConfig(codes->lz77.length_uint_config, &size_writer,
+                       /*log_alpha_size=*/8);
+      total_bits += size_writer.size;
+    }
+    num_contexts += 1;
+    tokens = std::move(tokens_lz77);
+  }
+  size_t total_tokens = 0;
+  // Build histograms.
+  HistogramBuilder builder(num_contexts);
+  HybridUintConfig uint_config;  //  Default config for clustering.
+  // Unless we are using the kContextMap histogram option.
+  if (params.uint_method == HistogramParams::HybridUintMethod::kContextMap) {
+    uint_config = HybridUintConfig(2, 0, 1);
+  }
+  if (params.uint_method == HistogramParams::HybridUintMethod::k000) {
+    uint_config = HybridUintConfig(0, 0, 0);
+  }
+  if (ans_fuzzer_friendly_) {
+    uint_config = HybridUintConfig(10, 0, 0);
+  }
+  for (size_t i = 0; i < tokens.size(); ++i) {
+    if (codes->lz77.enabled) {
+      for (size_t j = 0; j < tokens[i].size(); ++j) {
+        const Token& token = tokens[i][j];
+        total_tokens++;
+        uint32_t tok, nbits, bits;
+        (token.is_lz77_length ? codes->lz77.length_uint_config : uint_config)
+            .Encode(token.value, &tok, &nbits, &bits);
+        tok += token.is_lz77_length ? codes->lz77.min_symbol : 0;
+        builder.VisitSymbol(tok, token.context);
+      }
+    } else if (num_contexts == 1) {
+      for (size_t j = 0; j < tokens[i].size(); ++j) {
+        const Token& token = tokens[i][j];
+        total_tokens++;
+        uint32_t tok, nbits, bits;
+        uint_config.Encode(token.value, &tok, &nbits, &bits);
+        builder.VisitSymbol(tok, /*token.context=*/0);
+      }
+    } else {
+      for (size_t j = 0; j < tokens[i].size(); ++j) {
+        const Token& token = tokens[i][j];
+        total_tokens++;
+        uint32_t tok, nbits, bits;
+        uint_config.Encode(token.value, &tok, &nbits, &bits);
+        builder.VisitSymbol(tok, token.context);
+      }
+    }
+  }
+
+  bool use_prefix_code =
+      params.force_huffman || total_tokens < 100 ||
+      params.clustering == HistogramParams::ClusteringType::kFastest ||
+      ans_fuzzer_friendly_;
+  if (!use_prefix_code) {
+    bool all_singleton = true;
+    for (size_t i = 0; i < num_contexts; i++) {
+      if (builder.Histo(i).ShannonEntropy() >= 1e-5) {
+        all_singleton = false;
+      }
+    }
+    if (all_singleton) {
+      use_prefix_code = true;
+    }
+  }
+
+  // Encode histograms.
+  total_bits += builder.BuildAndStoreEntropyCodes(params, tokens, codes,
+                                                  context_map, use_prefix_code,
+                                                  writer, layer, aux_out);
+  allotment.FinishedHistogram(writer);
+  allotment.ReclaimAndCharge(writer, layer, aux_out);
+
+  if (aux_out != nullptr) {
+    aux_out->layers[layer].num_clustered_histograms +=
+        codes->encoding_info.size();
+  }
+  return total_bits;
+}
+
+size_t WriteTokens(const std::vector<Token>& tokens,
+                   const EntropyEncodingData& codes,
+                   const std::vector<uint8_t>& context_map, BitWriter* writer) {
+  size_t num_extra_bits = 0;
+  if (codes.use_prefix_code) {
+    for (size_t i = 0; i < tokens.size(); i++) {
+      uint32_t tok, nbits, bits;
+      const Token& token = tokens[i];
+      size_t histo = context_map[token.context];
+      (token.is_lz77_length ? codes.lz77.length_uint_config
+                            : codes.uint_config[histo])
+          .Encode(token.value, &tok, &nbits, &bits);
+      tok += token.is_lz77_length ? codes.lz77.min_symbol : 0;
+      // Combine two calls to the BitWriter. Equivalent to:
+      // writer->Write(codes.encoding_info[histo][tok].depth,
+      //               codes.encoding_info[histo][tok].bits);
+      // writer->Write(nbits, bits);
+      uint64_t data = codes.encoding_info[histo][tok].bits;
+      data |= bits << codes.encoding_info[histo][tok].depth;
+      writer->Write(codes.encoding_info[histo][tok].depth + nbits, data);
+      num_extra_bits += nbits;
+    }
+    return num_extra_bits;
+  }
+  std::vector<uint64_t> out;
+  std::vector<uint8_t> out_nbits;
+  out.reserve(tokens.size());
+  out_nbits.reserve(tokens.size());
+  uint64_t allbits = 0;
+  size_t numallbits = 0;
+  // Writes in *reversed* order.
+  auto addbits = [&](size_t bits, size_t nbits) {
+    if (JXL_UNLIKELY(nbits)) {
+      JXL_DASSERT(bits >> nbits == 0);
+      if (JXL_UNLIKELY(numallbits + nbits > BitWriter::kMaxBitsPerCall)) {
+        out.push_back(allbits);
+        out_nbits.push_back(numallbits);
+        numallbits = allbits = 0;
+      }
+      allbits <<= nbits;
+      allbits |= bits;
+      numallbits += nbits;
+    }
+  };
+  const int end = tokens.size();
+  ANSCoder ans;
+  if (codes.lz77.enabled || context_map.size() > 1) {
+    for (int i = end - 1; i >= 0; --i) {
+      const Token token = tokens[i];
+      const uint8_t histo = context_map[token.context];
+      uint32_t tok, nbits, bits;
+      (token.is_lz77_length ? codes.lz77.length_uint_config
+                            : codes.uint_config[histo])
+          .Encode(tokens[i].value, &tok, &nbits, &bits);
+      tok += token.is_lz77_length ? codes.lz77.min_symbol : 0;
+      const ANSEncSymbolInfo& info = codes.encoding_info[histo][tok];
+      // Extra bits first as this is reversed.
+      addbits(bits, nbits);
+      num_extra_bits += nbits;
+      uint8_t ans_nbits = 0;
+      uint32_t ans_bits = ans.PutSymbol(info, &ans_nbits);
+      addbits(ans_bits, ans_nbits);
+    }
+  } else {
+    for (int i = end - 1; i >= 0; --i) {
+      uint32_t tok, nbits, bits;
+      codes.uint_config[0].Encode(tokens[i].value, &tok, &nbits, &bits);
+      const ANSEncSymbolInfo& info = codes.encoding_info[0][tok];
+      // Extra bits first as this is reversed.
+      addbits(bits, nbits);
+      num_extra_bits += nbits;
+      uint8_t ans_nbits = 0;
+      uint32_t ans_bits = ans.PutSymbol(info, &ans_nbits);
+      addbits(ans_bits, ans_nbits);
+    }
+  }
+  const uint32_t state = ans.GetState();
+  writer->Write(32, state);
+  writer->Write(numallbits, allbits);
+  for (int i = out.size(); i > 0; --i) {
+    writer->Write(out_nbits[i - 1], out[i - 1]);
+  }
+  return num_extra_bits;
+}
+
+void WriteTokens(const std::vector<Token>& tokens,
+                 const EntropyEncodingData& codes,
+                 const std::vector<uint8_t>& context_map, BitWriter* writer,
+                 size_t layer, AuxOut* aux_out) {
+  BitWriter::Allotment allotment(writer, 32 * tokens.size() + 32 * 1024 * 4);
+  size_t num_extra_bits = WriteTokens(tokens, codes, context_map, writer);
+  allotment.ReclaimAndCharge(writer, layer, aux_out);
+  if (aux_out != nullptr) {
+    aux_out->layers[layer].extra_bits += num_extra_bits;
+  }
+}
+
+void SetANSFuzzerFriendly(bool ans_fuzzer_friendly) {
+#if JXL_IS_DEBUG_BUILD  // Guard against accidental / malicious changes.
+  ans_fuzzer_friendly_ = ans_fuzzer_friendly;
+#endif
+}
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/enc_ans.h b/third_party/jpeg-xl/lib/jxl/enc_ans.h
new file mode 100644
index 0000000000..a4afb19b4e
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_ans.h
@@ -0,0 +1,143 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_ANS_H_
+#define LIB_JXL_ENC_ANS_H_
+
+// Library to encode the ANS population counts to the bit-stream and encode
+// symbols based on the respective distributions.
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <algorithm>
+#include <vector>
+
+#include "lib/jxl/ans_common.h"
+#include "lib/jxl/ans_params.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/dec_ans.h"
+#include "lib/jxl/enc_ans_params.h"
+#include "lib/jxl/enc_bit_writer.h"
+#include "lib/jxl/huffman_table.h"
+
+namespace jxl {
+
+struct AuxOut;
+
+#define USE_MULT_BY_RECIPROCAL
+
+// precision must be equal to:  #bits(state_) + #bits(freq)
+#define RECIPROCAL_PRECISION (32 + ANS_LOG_TAB_SIZE)
+
+// Data structure representing one element of the encoding table built
+// from a distribution.
+// TODO(veluca): split this up, or use an union.
+struct ANSEncSymbolInfo {
+  // ANS
+  uint16_t freq_;
+  std::vector<uint16_t> reverse_map_;
+#ifdef USE_MULT_BY_RECIPROCAL
+  uint64_t ifreq_;
+#endif
+  // Prefix coding.
+  uint8_t depth;
+  uint16_t bits;
+};
+
+class ANSCoder {
+ public:
+  ANSCoder() : state_(ANS_SIGNATURE << 16) {}
+
+  uint32_t PutSymbol(const ANSEncSymbolInfo& t, uint8_t* nbits) {
+    uint32_t bits = 0;
+    *nbits = 0;
+    if ((state_ >> (32 - ANS_LOG_TAB_SIZE)) >= t.freq_) {
+      bits = state_ & 0xffff;
+      state_ >>= 16;
+      *nbits = 16;
+    }
+#ifdef USE_MULT_BY_RECIPROCAL
+    // We use mult-by-reciprocal trick, but that requires 64b calc.
+    const uint32_t v = (state_ * t.ifreq_) >> RECIPROCAL_PRECISION;
+    const uint32_t offset = t.reverse_map_[state_ - v * t.freq_];
+    state_ = (v << ANS_LOG_TAB_SIZE) + offset;
+#else
+    state_ = ((state_ / t.freq_) << ANS_LOG_TAB_SIZE) +
+             t.reverse_map_[state_ % t.freq_];
+#endif
+    return bits;
+  }
+
+  uint32_t GetState() const { return state_; }
+
+ private:
+  uint32_t state_;
+};
+
+// RebalanceHistogram requires a signed type.
+using ANSHistBin = int32_t;
+
+struct EntropyEncodingData {
+  std::vector<std::vector<ANSEncSymbolInfo>> encoding_info;
+  bool use_prefix_code;
+  std::vector<HybridUintConfig> uint_config;
+  LZ77Params lz77;
+};
+
+// Integer to be encoded by an entropy coder, either ANS or Huffman.
+struct Token {
+  Token() {}
+  Token(uint32_t c, uint32_t value)
+      : is_lz77_length(false), context(c), value(value) {}
+  uint32_t is_lz77_length : 1;
+  uint32_t context : 31;
+  uint32_t value;
+};
+
+// Returns an estimate of the number of bits required to encode the given
+// histogram (header bits plus data bits).
+float ANSPopulationCost(const ANSHistBin* data, size_t alphabet_size);
+
+// Apply context clustering, compute histograms and encode them. Returns an
+// estimate of the total bits used for encoding the stream. If `writer` ==
+// nullptr, the bit estimate will not take into account the context map (which
+// does not get written if `num_contexts` == 1).
+size_t BuildAndEncodeHistograms(const HistogramParams& params,
+                                size_t num_contexts,
+                                std::vector<std::vector<Token>>& tokens,
+                                EntropyEncodingData* codes,
+                                std::vector<uint8_t>* context_map,
+                                BitWriter* writer, size_t layer,
+                                AuxOut* aux_out);
+
+// Write the tokens to a string.
+void WriteTokens(const std::vector<Token>& tokens,
+                 const EntropyEncodingData& codes,
+                 const std::vector<uint8_t>& context_map, BitWriter* writer,
+                 size_t layer, AuxOut* aux_out);
+
+// Same as above, but assumes allotment created by caller.
+size_t WriteTokens(const std::vector<Token>& tokens,
+                   const EntropyEncodingData& codes,
+                   const std::vector<uint8_t>& context_map, BitWriter* writer);
+
+// Exposed for tests; to be used with Writer=BitWriter only.
+template <typename Writer>
+void EncodeUintConfigs(const std::vector<HybridUintConfig>& uint_config,
+                       Writer* writer, size_t log_alpha_size);
+extern template void EncodeUintConfigs(const std::vector<HybridUintConfig>&,
+                                       BitWriter*, size_t);
+
+// Globally set the option to create fuzzer-friendly ANS streams. Negatively
+// impacts compression. Not thread-safe.
+void SetANSFuzzerFriendly(bool ans_fuzzer_friendly);
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_ANS_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_ans_params.h b/third_party/jpeg-xl/lib/jxl/enc_ans_params.h
new file mode 100644
index 0000000000..50ca31dc03
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_ans_params.h
@@ -0,0 +1,76 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_ANS_PARAMS_H_
+#define LIB_JXL_ENC_ANS_PARAMS_H_
+
+// Encoder-only parameter needed for ANS entropy encoding methods.
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#include "lib/jxl/enc_params.h"
+
+namespace jxl {
+
+struct HistogramParams {
+  enum class ClusteringType {
+    kFastest,  // Only 4 clusters.
+    kFast,
+    kBest,
+  };
+
+  enum class HybridUintMethod {
+    kNone,        // just use kHybridUint420Config.
+    k000,         // force the fastest option.
+    kFast,        // just try a couple of options.
+    kContextMap,  // fast choice for ctx map.
+    kBest,
+  };
+
+  enum class LZ77Method {
+    kNone,     // do not try lz77.
+    kRLE,      // only try doing RLE.
+    kLZ77,     // try lz77 with backward references.
+    kOptimal,  // optimal-matching LZ77 parsing.
+  };
+
+  enum class ANSHistogramStrategy {
+    kFast,         // Only try some methods, early exit.
+    kApproximate,  // Only try some methods.
+    kPrecise,      // Try all methods.
+  };
+
+  HistogramParams() = default;
+
+  HistogramParams(SpeedTier tier, size_t num_ctx) {
+    if (tier > SpeedTier::kFalcon) {
+      clustering = ClusteringType::kFastest;
+      lz77_method = LZ77Method::kNone;
+    } else if (tier > SpeedTier::kTortoise) {
+      clustering = ClusteringType::kFast;
+    } else {
+      clustering = ClusteringType::kBest;
+    }
+    if (tier > SpeedTier::kTortoise) {
+      uint_method = HybridUintMethod::kNone;
+    }
+    if (tier >= SpeedTier::kSquirrel) {
+      ans_histogram_strategy = ANSHistogramStrategy::kApproximate;
+    }
+  }
+
+  ClusteringType clustering = ClusteringType::kBest;
+  HybridUintMethod uint_method = HybridUintMethod::kBest;
+  LZ77Method lz77_method = LZ77Method::kRLE;
+  ANSHistogramStrategy ans_histogram_strategy = ANSHistogramStrategy::kPrecise;
+  std::vector<size_t> image_widths;
+  size_t max_histograms = ~0;
+  bool force_huffman = false;
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_ANS_PARAMS_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_ar_control_field.cc b/third_party/jpeg-xl/lib/jxl/enc_ar_control_field.cc
new file mode 100644
index 0000000000..9030430e2b
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_ar_control_field.cc
@@ -0,0 +1,325 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_ar_control_field.h"
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#include <algorithm>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/enc_ar_control_field.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/ac_strategy.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/chroma_from_luma.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/enc_adaptive_quantization.h"
+#include "lib/jxl/enc_params.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/image_bundle.h"
+#include "lib/jxl/image_ops.h"
+#include "lib/jxl/quant_weights.h"
+#include "lib/jxl/quantizer.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+namespace {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Add;
+using hwy::HWY_NAMESPACE::GetLane;
+using hwy::HWY_NAMESPACE::Mul;
+using hwy::HWY_NAMESPACE::MulAdd;
+using hwy::HWY_NAMESPACE::Sqrt;
+
+void ProcessTile(const Image3F& opsin, PassesEncoderState* enc_state,
+                 const Rect& rect,
+                 ArControlFieldHeuristics::TempImages* temp_image) {
+  constexpr size_t N = kBlockDim;
+  ImageB* JXL_RESTRICT epf_sharpness = &enc_state->shared.epf_sharpness;
+  ImageF* JXL_RESTRICT quant = &enc_state->initial_quant_field;
+  JXL_ASSERT(
+      epf_sharpness->xsize() == enc_state->shared.frame_dim.xsize_blocks &&
+      epf_sharpness->ysize() == enc_state->shared.frame_dim.ysize_blocks);
+
+  if (enc_state->cparams.butteraugli_distance < kMinButteraugliForDynamicAR ||
+      enc_state->cparams.speed_tier > SpeedTier::kWombat ||
+      enc_state->shared.frame_header.loop_filter.epf_iters == 0) {
+    FillPlane(static_cast<uint8_t>(4), epf_sharpness, rect);
+    return;
+  }
+
+  // Likely better to have a higher X weight, like:
+  // const float kChannelWeights[3] = {47.0f, 4.35f, 0.287f};
+  const float kChannelWeights[3] = {4.35f, 4.35f, 0.287f};
+  const float kChannelWeightsLapNeg[3] = {-0.125f * kChannelWeights[0],
+                                          -0.125f * kChannelWeights[1],
+                                          -0.125f * kChannelWeights[2]};
+  const size_t sharpness_stride =
+      static_cast<size_t>(epf_sharpness->PixelsPerRow());
+
+  size_t by0 = rect.y0();
+  size_t by1 = rect.y0() + rect.ysize();
+  size_t bx0 = rect.x0();
+  size_t bx1 = rect.x0() + rect.xsize();
+  temp_image->InitOnce();
+  ImageF& laplacian_sqrsum = temp_image->laplacian_sqrsum;
+  // Calculate the L2 of the 3x3 Laplacian in an integral transform
+  // (for example 32x32 dct). This relates to transforms ability
+  // to propagate artefacts.
+  size_t y0 = by0 == 0 ? 2 : 0;
+  size_t y1 = by1 * N + 4 <= opsin.ysize() + 2 ? (by1 - by0) * N + 4
+                                               : opsin.ysize() + 2 - by0 * N;
+  size_t x0 = bx0 == 0 ? 2 : 0;
+  size_t x1 = bx1 * N + 4 <= opsin.xsize() + 2 ? (bx1 - bx0) * N + 4
+                                               : opsin.xsize() + 2 - bx0 * N;
+  HWY_FULL(float) df;
+  for (size_t y = y0; y < y1; y++) {
+    float* JXL_RESTRICT laplacian_sqrsum_row = laplacian_sqrsum.Row(y);
+    size_t cy = y + by0 * N - 2;
+    const float* JXL_RESTRICT in_row_t[3];
+    const float* JXL_RESTRICT in_row[3];
+    const float* JXL_RESTRICT in_row_b[3];
+    for (size_t c = 0; c < 3; c++) {
+      in_row_t[c] = opsin.PlaneRow(c, cy > 0 ? cy - 1 : cy);
+      in_row[c] = opsin.PlaneRow(c, cy);
+      in_row_b[c] = opsin.PlaneRow(c, cy + 1 < opsin.ysize() ? cy + 1 : cy);
+    }
+    auto compute_laplacian_scalar = [&](size_t x) {
+      size_t cx = x + bx0 * N - 2;
+      const size_t prevX = cx >= 1 ? cx - 1 : cx;
+      const size_t nextX = cx + 1 < opsin.xsize() ? cx + 1 : cx;
+      float sumsqr = 0;
+      for (size_t c = 0; c < 3; c++) {
+        float laplacian =
+            kChannelWeights[c] * in_row[c][cx] +
+            kChannelWeightsLapNeg[c] *
+                (in_row[c][prevX] + in_row[c][nextX] + in_row_b[c][prevX] +
+                 in_row_b[c][cx] + in_row_b[c][nextX] + in_row_t[c][prevX] +
+                 in_row_t[c][cx] + in_row_t[c][nextX]);
+        sumsqr += laplacian * laplacian;
+      }
+      laplacian_sqrsum_row[x] = sumsqr;
+    };
+    size_t x = x0;
+    for (; x + bx0 * N < 3; x++) {
+      compute_laplacian_scalar(x);
+    }
+    // Interior. One extra pixel of border as the last pixel is special.
+    for (; x + Lanes(df) <= x1 && x + Lanes(df) + bx0 * N - 1 <= opsin.xsize();
+         x += Lanes(df)) {
+      size_t cx = x + bx0 * N - 2;
+      auto sumsqr = Zero(df);
+      for (size_t c = 0; c < 3; c++) {
+        auto laplacian =
+            Mul(LoadU(df, in_row[c] + cx), Set(df, kChannelWeights[c]));
+        auto sum_oth0 = LoadU(df, in_row[c] + cx - 1);
+        auto sum_oth1 = LoadU(df, in_row[c] + cx + 1);
+        auto sum_oth2 = LoadU(df, in_row_t[c] + cx - 1);
+        auto sum_oth3 = LoadU(df, in_row_t[c] + cx);
+        sum_oth0 = Add(sum_oth0, LoadU(df, in_row_t[c] + cx + 1));
+        sum_oth1 = Add(sum_oth1, LoadU(df, in_row_b[c] + cx - 1));
+        sum_oth2 = Add(sum_oth2, LoadU(df, in_row_b[c] + cx));
+        sum_oth3 = Add(sum_oth3, LoadU(df, in_row_b[c] + cx + 1));
+        sum_oth0 = Add(sum_oth0, sum_oth1);
+        sum_oth2 = Add(sum_oth2, sum_oth3);
+        sum_oth0 = Add(sum_oth0, sum_oth2);
+        laplacian =
+            MulAdd(Set(df, kChannelWeightsLapNeg[c]), sum_oth0, laplacian);
+        sumsqr = MulAdd(laplacian, laplacian, sumsqr);
+      }
+      StoreU(sumsqr, df, laplacian_sqrsum_row + x);
+    }
+    for (; x < x1; x++) {
+      compute_laplacian_scalar(x);
+    }
+  }
+  HWY_CAPPED(float, 4) df4;
+  // Calculate the L2 of the 3x3 Laplacian in 4x4 blocks within the area
+  // of the integral transform. Sample them within the integral transform
+  // with two offsets (0,0) and (-2, -2) pixels (sqrsum_00 and sqrsum_22,
+  //  respectively).
+  ImageF& sqrsum_00 = temp_image->sqrsum_00;
+  size_t sqrsum_00_stride = sqrsum_00.PixelsPerRow();
+  float* JXL_RESTRICT sqrsum_00_row = sqrsum_00.Row(0);
+  for (size_t y = 0; y < (by1 - by0) * 2; y++) {
+    const float* JXL_RESTRICT rows_in[4];
+    for (size_t iy = 0; iy < 4; iy++) {
+      rows_in[iy] = laplacian_sqrsum.ConstRow(y * 4 + iy + 2);
+    }
+    float* JXL_RESTRICT row_out = sqrsum_00_row + y * sqrsum_00_stride;
+    for (size_t x = 0; x < (bx1 - bx0) * 2; x++) {
+      auto sum = Zero(df4);
+      for (size_t iy = 0; iy < 4; iy++) {
+        for (size_t ix = 0; ix < 4; ix += Lanes(df4)) {
+          sum = Add(sum, LoadU(df4, rows_in[iy] + x * 4 + ix + 2));
+        }
+      }
+      row_out[x] = GetLane(Sqrt(SumOfLanes(df4, sum))) * (1.0f / 4.0f);
+    }
+  }
+  // Indexing iy and ix is a bit tricky as we include a 2 pixel border
+  // around the block for evenness calculations. This is similar to what
+  // we did in guetzli for the observability of artefacts, except there
+  // the element is a sliding 5x5, not sparsely sampled 4x4 box like here.
+  ImageF& sqrsum_22 = temp_image->sqrsum_22;
+  size_t sqrsum_22_stride = sqrsum_22.PixelsPerRow();
+  float* JXL_RESTRICT sqrsum_22_row = sqrsum_22.Row(0);
+  for (size_t y = 0; y < (by1 - by0) * 2 + 1; y++) {
+    const float* JXL_RESTRICT rows_in[4];
+    for (size_t iy = 0; iy < 4; iy++) {
+      rows_in[iy] = laplacian_sqrsum.ConstRow(y * 4 + iy);
+    }
+    float* JXL_RESTRICT row_out = sqrsum_22_row + y * sqrsum_22_stride;
+    // ignore pixels outside the image.
+    // Y coordinates are relative to by0*8+y*4.
+    size_t sy = y * 4 + by0 * 8 > 0 ? 0 : 2;
+    size_t ey = y * 4 + by0 * 8 + 4 <= opsin.ysize() + 2
+                    ? 4
+                    : opsin.ysize() - y * 4 - by0 * 8 + 2;
+    for (size_t x = 0; x < (bx1 - bx0) * 2 + 1; x++) {
+      // ignore pixels outside the image.
+      // X coordinates are relative to bx0*8.
+      size_t sx = x * 4 + bx0 * 8 > 0 ? x * 4 : x * 4 + 2;
+      size_t ex = x * 4 + bx0 * 8 + 4 <= opsin.xsize() + 2
+                      ? x * 4 + 4
+                      : opsin.xsize() - bx0 * 8 + 2;
+      if (ex - sx == 4 && ey - sy == 4) {
+        auto sum = Zero(df4);
+        for (size_t iy = 0; iy < 4; iy++) {
+          for (size_t ix = 0; ix < 4; ix += Lanes(df4)) {
+            sum = Add(sum, Load(df4, rows_in[iy] + sx + ix));
+          }
+        }
+        row_out[x] = GetLane(Sqrt(SumOfLanes(df4, sum))) * (1.0f / 4.0f);
+      } else {
+        float sum = 0;
+        for (size_t iy = sy; iy < ey; iy++) {
+          for (size_t ix = sx; ix < ex; ix++) {
+            sum += rows_in[iy][ix];
+          }
+        }
+        row_out[x] = std::sqrt(sum / ((ex - sx) * (ey - sy)));
+      }
+    }
+  }
+  for (size_t by = by0; by < by1; by++) {
+    AcStrategyRow acs_row = enc_state->shared.ac_strategy.ConstRow(by);
+    uint8_t* JXL_RESTRICT out_row = epf_sharpness->Row(by);
+    float* JXL_RESTRICT quant_row = quant->Row(by);
+    for (size_t bx = bx0; bx < bx1; bx++) {
+      AcStrategy acs = acs_row[bx];
+      if (!acs.IsFirstBlock()) continue;
+      // The errors are going to be linear to the quantization value in this
+      // locality. We only have access to the initial quant field here.
+      float quant_val = 1.0f / quant_row[bx];
+
+      const auto sq00 = [&](size_t y, size_t x) {
+        return sqrsum_00_row[((by - by0) * 2 + y) * sqrsum_00_stride +
+                             (bx - bx0) * 2 + x];
+      };
+      const auto sq22 = [&](size_t y, size_t x) {
+        return sqrsum_22_row[((by - by0) * 2 + y) * sqrsum_22_stride +
+                             (bx - bx0) * 2 + x];
+      };
+      float sqrsum_integral_transform = 0;
+      for (size_t iy = 0; iy < acs.covered_blocks_y() * 2; iy++) {
+        for (size_t ix = 0; ix < acs.covered_blocks_x() * 2; ix++) {
+          sqrsum_integral_transform += sq00(iy, ix) * sq00(iy, ix);
+        }
+      }
+      sqrsum_integral_transform /=
+          4 * acs.covered_blocks_x() * acs.covered_blocks_y();
+      sqrsum_integral_transform = std::sqrt(sqrsum_integral_transform);
+      // If masking is high or amplitude of the artefacts is low, then no
+      // smoothing is needed.
+      for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) {
+        for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) {
+          // Five 4x4 blocks for masking estimation, all within the
+          // 8x8 area.
+          float minval_1 = std::min(sq00(2 * iy + 0, 2 * ix + 0),
+                                    sq00(2 * iy + 0, 2 * ix + 1));
+          float minval_2 = std::min(sq00(2 * iy + 1, 2 * ix + 0),
+                                    sq00(2 * iy + 1, 2 * ix + 1));
+          float minval = std::min(minval_1, minval_2);
+          minval = std::min(minval, sq22(2 * iy + 1, 2 * ix + 1));
+          // Nine more 4x4 blocks for masking estimation, includes
+          // the 2 pixel area around the 8x8 block being controlled.
+          float minval2_1 = std::min(sq22(2 * iy + 0, 2 * ix + 0),
+                                     sq22(2 * iy + 0, 2 * ix + 1));
+          float minval2_2 = std::min(sq22(2 * iy + 0, 2 * ix + 2),
+                                     sq22(2 * iy + 1, 2 * ix + 0));
+          float minval2_3 = std::min(sq22(2 * iy + 1, 2 * ix + 1),
+                                     sq22(2 * iy + 1, 2 * ix + 2));
+          float minval2_4 = std::min(sq22(2 * iy + 2, 2 * ix + 0),
+                                     sq22(2 * iy + 2, 2 * ix + 1));
+          float minval2_5 = std::min(minval2_1, minval2_2);
+          float minval2_6 = std::min(minval2_3, minval2_4);
+          float minval2 = std::min(minval2_5, minval2_6);
+          minval2 = std::min(minval2, sq22(2 * iy + 2, 2 * ix + 2));
+          float minval3 = std::min(minval, minval2);
+          minval *= 0.125f;
+          minval += 0.625f * minval3;
+          minval +=
+              0.125f * std::min(1.5f * minval3, sq22(2 * iy + 1, 2 * ix + 1));
+          minval += 0.125f * minval2;
+          // Larger kBias, less smoothing for low intensity changes.
+          float kDeltaLimit = 3.2;
+          float bias = 0.0625f * quant_val;
+          float delta =
+              (sqrsum_integral_transform + (kDeltaLimit + 0.05) * bias) /
+              (minval + bias);
+          int out = 4;
+          if (delta > kDeltaLimit) {
+            out = 4;  // smooth
+          } else {
+            out = 0;
+          }
+          // 'threshold' is separate from 'bias' for easier tuning of these
+          // heuristics.
+          float threshold = 0.0625f * quant_val;
+          const float kSmoothLimit = 0.085f;
+          float smooth = 0.20f * (sq00(2 * iy + 0, 2 * ix + 0) +
+                                  sq00(2 * iy + 0, 2 * ix + 1) +
+                                  sq00(2 * iy + 1, 2 * ix + 0) +
+                                  sq00(2 * iy + 1, 2 * ix + 1) + minval);
+          if (smooth < kSmoothLimit * threshold) {
+            out = 4;
+          }
+          out_row[bx + sharpness_stride * iy + ix] = out;
+        }
+      }
+    }
+  }
+}
+
+}  // namespace
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+HWY_EXPORT(ProcessTile);
+
+void ArControlFieldHeuristics::RunRect(const Rect& block_rect,
+                                       const Image3F& opsin,
+                                       PassesEncoderState* enc_state,
+                                       size_t thread) {
+  HWY_DYNAMIC_DISPATCH(ProcessTile)
+  (opsin, enc_state, block_rect, &temp_images[thread]);
+}
+
+}  // namespace jxl
+
+#endif
diff --git a/third_party/jpeg-xl/lib/jxl/enc_ar_control_field.h b/third_party/jpeg-xl/lib/jxl/enc_ar_control_field.h
new file mode 100644
index 0000000000..aabe71f46f
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_ar_control_field.h
@@ -0,0 +1,49 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_AR_CONTROL_FIELD_H_
+#define LIB_JXL_ENC_AR_CONTROL_FIELD_H_
+
+#include <stddef.h>
+
+#include <vector>
+
+#include "lib/jxl/enc_params.h"
+#include "lib/jxl/image.h"
+
+namespace jxl {
+
+struct PassesEncoderState;
+
+struct ArControlFieldHeuristics {
+  struct TempImages {
+    void InitOnce() {
+      if (laplacian_sqrsum.xsize() != 0) return;
+      laplacian_sqrsum = ImageF(kEncTileDim + 4, kEncTileDim + 4);
+      sqrsum_00 = ImageF(kEncTileDim / 4, kEncTileDim / 4);
+      sqrsum_22 = ImageF(kEncTileDim / 4 + 1, kEncTileDim / 4 + 1);
+    }
+
+    ImageF laplacian_sqrsum;
+    ImageF sqrsum_00;
+    ImageF sqrsum_22;
+  };
+
+  void PrepareForThreads(size_t num_threads) {
+    temp_images.resize(num_threads);
+  }
+
+  void RunRect(const Rect& block_rect, const Image3F& opsin,
+               PassesEncoderState* enc_state, size_t thread);
+
+  std::vector<TempImages> temp_images;
+  ImageB* epf_sharpness;
+  ImageF* quant;
+  bool all_default;
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_AR_ENC_CONTROL_FIELD_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_aux_out.cc b/third_party/jpeg-xl/lib/jxl/enc_aux_out.cc
new file mode 100644
index 0000000000..1c141d1727
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_aux_out.cc
@@ -0,0 +1,205 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_aux_out.h"
+
+#include <inttypes.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#include <algorithm>
+#include <numeric>  // accumulate
+#include <sstream>
+
+#include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/color_encoding_internal.h"
+#include "lib/jxl/dec_xyb.h"
+#include "lib/jxl/image_ops.h"
+
+namespace jxl {
+
+const char* LayerName(size_t layer) {
+  switch (layer) {
+    case kLayerHeader:
+      return "Headers";
+    case kLayerTOC:
+      return "TOC";
+    case kLayerDictionary:
+      return "Patches";
+    case kLayerSplines:
+      return "Splines";
+    case kLayerNoise:
+      return "Noise";
+    case kLayerQuant:
+      return "Quantizer";
+    case kLayerModularTree:
+      return "ModularTree";
+    case kLayerModularGlobal:
+      return "ModularGlobal";
+    case kLayerDC:
+      return "DC";
+    case kLayerModularDcGroup:
+      return "ModularDcGroup";
+    case kLayerControlFields:
+      return "ControlFields";
+    case kLayerOrder:
+      return "CoeffOrder";
+    case kLayerAC:
+      return "ACHistograms";
+    case kLayerACTokens:
+      return "ACTokens";
+    case kLayerModularAcGroup:
+      return "ModularAcGroup";
+    default:
+      JXL_ABORT("Invalid layer %d\n", static_cast<int>(layer));
+  }
+}
+
+void AuxOut::LayerTotals::Print(size_t num_inputs) const {
+  printf("%10" PRId64, static_cast<int64_t>(total_bits));
+  if (histogram_bits != 0) {
+    printf("   [c/i:%6.2f | hst:%8" PRId64 " | ex:%8" PRId64 " | h+c+e:%12.3f",
+           num_clustered_histograms * 1.0 / num_inputs,
+           static_cast<int64_t>(histogram_bits >> 3),
+           static_cast<int64_t>(extra_bits >> 3),
+           (histogram_bits + clustered_entropy + extra_bits) / 8.0);
+    printf("]");
+  }
+  printf("\n");
+}
+
+void AuxOut::Assimilate(const AuxOut& victim) {
+  for (size_t i = 0; i < layers.size(); ++i) {
+    layers[i].Assimilate(victim.layers[i]);
+  }
+  num_blocks += victim.num_blocks;
+  num_small_blocks += victim.num_small_blocks;
+  num_dct4x8_blocks += victim.num_dct4x8_blocks;
+  num_afv_blocks += victim.num_afv_blocks;
+  num_dct8_blocks += victim.num_dct8_blocks;
+  num_dct8x16_blocks += victim.num_dct8x16_blocks;
+  num_dct8x32_blocks += victim.num_dct8x32_blocks;
+  num_dct16_blocks += victim.num_dct16_blocks;
+  num_dct16x32_blocks += victim.num_dct16x32_blocks;
+  num_dct32_blocks += victim.num_dct32_blocks;
+  num_dct32x64_blocks += victim.num_dct32x64_blocks;
+  num_dct64_blocks += victim.num_dct64_blocks;
+  num_butteraugli_iters += victim.num_butteraugli_iters;
+  for (size_t i = 0; i < dc_pred_usage.size(); ++i) {
+    dc_pred_usage[i] += victim.dc_pred_usage[i];
+    dc_pred_usage_xb[i] += victim.dc_pred_usage_xb[i];
+  }
+  max_quant_rescale = std::max(max_quant_rescale, victim.max_quant_rescale);
+  min_quant_rescale = std::min(min_quant_rescale, victim.min_quant_rescale);
+  max_bitrate_error = std::max(max_bitrate_error, victim.max_bitrate_error);
+  min_bitrate_error = std::min(min_bitrate_error, victim.min_bitrate_error);
+}
+
+void AuxOut::Print(size_t num_inputs) const {
+  if (num_inputs == 0) return;
+
+  LayerTotals all_layers;
+  for (size_t i = 0; i < layers.size(); ++i) {
+    all_layers.Assimilate(layers[i]);
+  }
+
+  printf("Average butteraugli iters: %10.2f\n",
+         num_butteraugli_iters * 1.0 / num_inputs);
+  if (min_quant_rescale != 1.0 || max_quant_rescale != 1.0) {
+    printf("quant rescale range: %f .. %f\n", min_quant_rescale,
+           max_quant_rescale);
+    printf("bitrate error range: %.3f%% .. %.3f%%\n",
+           100.0f * min_bitrate_error, 100.0f * max_bitrate_error);
+  }
+
+  for (size_t i = 0; i < layers.size(); ++i) {
+    if (layers[i].total_bits != 0) {
+      printf("Total layer bits %-10s\t", LayerName(i));
+      printf("%10f%%", 100.0 * layers[i].total_bits / all_layers.total_bits);
+      layers[i].Print(num_inputs);
+    }
+  }
+  printf("Total image size           ");
+  all_layers.Print(num_inputs);
+
+  const uint32_t dc_pred_total =
+      std::accumulate(dc_pred_usage.begin(), dc_pred_usage.end(), 0u);
+  const uint32_t dc_pred_total_xb =
+      std::accumulate(dc_pred_usage_xb.begin(), dc_pred_usage_xb.end(), 0u);
+  if (dc_pred_total + dc_pred_total_xb != 0) {
+    printf("\nDC pred     Y                XB:\n");
+    for (size_t i = 0; i < dc_pred_usage.size(); ++i) {
+      printf("  %6u (%5.2f%%)    %6u (%5.2f%%)\n", dc_pred_usage[i],
+             100.0 * dc_pred_usage[i] / dc_pred_total, dc_pred_usage_xb[i],
+             100.0 * dc_pred_usage_xb[i] / dc_pred_total_xb);
+    }
+  }
+
+  size_t total_blocks = 0;
+  size_t total_positions = 0;
+  if (total_blocks != 0 && total_positions != 0) {
+    printf("\n\t\t  Blocks\t\tPositions\t\t\tBlocks/Position\n");
+    printf(" Total:\t\t    %7" PRIuS "\t\t     %7" PRIuS " \t\t\t%10f%%\n\n",
+           total_blocks, total_positions,
+           100.0 * total_blocks / total_positions);
+  }
+}
+
+template <typename T>
+void AuxOut::DumpImage(const char* label, const Image3<T>& image) const {
+  if (!dump_image) return;
+  if (debug_prefix.empty()) return;
+  std::ostringstream pathname;
+  pathname << debug_prefix << label << ".png";
+  (void)dump_image(ConvertToFloat(image), ColorEncoding::SRGB(),
+                   pathname.str());
+}
+template void AuxOut::DumpImage(const char* label,
+                                const Image3<float>& image) const;
+template void AuxOut::DumpImage(const char* label,
+                                const Image3<uint8_t>& image) const;
+
+template <typename T>
+void AuxOut::DumpPlaneNormalized(const char* label,
+                                 const Plane<T>& image) const {
+  T min;
+  T max;
+  ImageMinMax(image, &min, &max);
+  Image3B normalized(image.xsize(), image.ysize());
+  for (size_t c = 0; c < 3; ++c) {
+    float mul = min == max ? 0 : (255.0f / (max - min));
+    for (size_t y = 0; y < image.ysize(); ++y) {
+      const T* JXL_RESTRICT row_in = image.ConstRow(y);
+      uint8_t* JXL_RESTRICT row_out = normalized.PlaneRow(c, y);
+      for (size_t x = 0; x < image.xsize(); ++x) {
+        row_out[x] = static_cast<uint8_t>((row_in[x] - min) * mul);
+      }
+    }
+  }
+  DumpImage(label, normalized);
+}
+template void AuxOut::DumpPlaneNormalized(const char* label,
+                                          const Plane<float>& image) const;
+template void AuxOut::DumpPlaneNormalized(const char* label,
+                                          const Plane<uint8_t>& image) const;
+
+void AuxOut::DumpXybImage(const char* label, const Image3F& image) const {
+  if (!dump_image) return;
+  if (debug_prefix.empty()) return;
+  std::ostringstream pathname;
+  pathname << debug_prefix << label << ".png";
+
+  Image3F linear(image.xsize(), image.ysize());
+  OpsinParams opsin_params;
+  opsin_params.Init(kDefaultIntensityTarget);
+  OpsinToLinear(image, Rect(linear), nullptr, &linear, opsin_params);
+
+  (void)dump_image(std::move(linear), ColorEncoding::LinearSRGB(),
+                   pathname.str());
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/enc_aux_out.h b/third_party/jpeg-xl/lib/jxl/enc_aux_out.h
new file mode 100644
index 0000000000..78222823ae
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_aux_out.h
@@ -0,0 +1,163 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_AUX_OUT_H_
+#define LIB_JXL_AUX_OUT_H_
+
+// Optional output information for debugging and analyzing size usage.
+
+#include <stddef.h>
+
+#include <array>
+#include <functional>
+#include <string>
+
+#include "lib/jxl/image.h"
+#include "lib/jxl/jxl_inspection.h"
+
+namespace jxl {
+
+struct ColorEncoding;
+
+// For LayerName and AuxOut::layers[] index. Order does not matter.
+enum {
+  kLayerHeader = 0,
+  kLayerTOC,
+  kLayerDictionary,
+  kLayerSplines,
+  kLayerNoise,
+  kLayerQuant,
+  kLayerModularTree,
+  kLayerModularGlobal,
+  kLayerDC,
+  kLayerModularDcGroup,
+  kLayerControlFields,
+  kLayerOrder,
+  kLayerAC,
+  kLayerACTokens,
+  kLayerModularAcGroup,
+  kNumImageLayers
+};
+
+const char* LayerName(size_t layer);
+
+// Statistics gathered during compression or decompression.
+struct AuxOut {
+ private:
+  struct LayerTotals {
+    void Assimilate(const LayerTotals& victim) {
+      num_clustered_histograms += victim.num_clustered_histograms;
+      histogram_bits += victim.histogram_bits;
+      extra_bits += victim.extra_bits;
+      total_bits += victim.total_bits;
+      clustered_entropy += victim.clustered_entropy;
+    }
+    void Print(size_t num_inputs) const;
+
+    size_t num_clustered_histograms = 0;
+    size_t extra_bits = 0;
+
+    // Set via BitsWritten below
+    size_t histogram_bits = 0;
+    size_t total_bits = 0;
+
+    double clustered_entropy = 0.0;
+  };
+
+ public:
+  AuxOut() = default;
+  AuxOut(const AuxOut&) = default;
+
+  void Assimilate(const AuxOut& victim);
+
+  void Print(size_t num_inputs) const;
+
+  size_t TotalBits() const {
+    size_t total = 0;
+    for (const auto& layer : layers) {
+      total += layer.total_bits;
+    }
+    return total;
+  }
+
+  template <typename T>
+  void DumpImage(const char* label, const Image3<T>& image) const;
+
+  void DumpXybImage(const char* label, const Image3F& image) const;
+
+  template <typename T>
+  void DumpPlaneNormalized(const char* label, const Plane<T>& image) const;
+
+  void SetInspectorImage3F(const jxl::InspectorImage3F& inspector) {
+    inspector_image3f_ = inspector;
+  }
+
+  // Allows hooking intermediate data inspection into various places of the
+  // processing pipeline. Returns true iff processing should proceed.
+  bool InspectImage3F(const char* label, const Image3F& image) {
+    if (inspector_image3f_ != nullptr) {
+      return inspector_image3f_(label, image);
+    }
+    return true;
+  }
+
+  std::array<LayerTotals, kNumImageLayers> layers;
+  size_t num_blocks = 0;
+
+  // Number of blocks that use larger DCT (set by ac_strategy).
+  size_t num_small_blocks = 0;
+  size_t num_dct4x8_blocks = 0;
+  size_t num_afv_blocks = 0;
+  size_t num_dct8_blocks = 0;
+  size_t num_dct8x16_blocks = 0;
+  size_t num_dct8x32_blocks = 0;
+  size_t num_dct16_blocks = 0;
+  size_t num_dct16x32_blocks = 0;
+  size_t num_dct32_blocks = 0;
+  size_t num_dct32x64_blocks = 0;
+  size_t num_dct64_blocks = 0;
+
+  std::array<uint32_t, 8> dc_pred_usage = {{0}};
+  std::array<uint32_t, 8> dc_pred_usage_xb = {{0}};
+
+  int num_butteraugli_iters = 0;
+
+  float max_quant_rescale = 1.0f;
+  float min_quant_rescale = 1.0f;
+  float min_bitrate_error = 0.0f;
+  float max_bitrate_error = 0.0f;
+
+  // If not empty, additional debugging information (e.g. debug images) is
+  // saved in files with this prefix.
+  std::string debug_prefix;
+
+  // By how much the decoded image was downsampled relative to the encoded
+  // image.
+  size_t downsampling = 1;
+
+  jxl::InspectorImage3F inspector_image3f_;
+
+  std::function<Status(Image3F&&, const ColorEncoding&, const std::string&)>
+      dump_image = nullptr;
+};
+
+extern template void AuxOut::DumpImage(const char* label,
+                                       const Image3<float>& image) const;
+extern template void AuxOut::DumpImage(const char* label,
+                                       const Image3<uint8_t>& image) const;
+extern template void AuxOut::DumpPlaneNormalized(
+    const char* label, const Plane<float>& image) const;
+extern template void AuxOut::DumpPlaneNormalized(
+    const char* label, const Plane<uint8_t>& image) const;
+
+// Used to skip image creation if they won't be written to debug directory.
+static inline bool WantDebugOutput(const AuxOut* aux_out) {
+  // Need valid pointer and filename.
+  return aux_out != nullptr && !aux_out->debug_prefix.empty();
+}
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_AUX_OUT_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_bit_writer.cc b/third_party/jpeg-xl/lib/jxl/enc_bit_writer.cc
new file mode 100644
index 0000000000..7964c28f76
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_bit_writer.cc
@@ -0,0 +1,201 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_bit_writer.h"
+
+#include <string.h>  // memcpy
+
+#include "lib/jxl/base/byte_order.h"
+#include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/enc_aux_out.h"
+
+namespace jxl {
+
+BitWriter::Allotment::Allotment(BitWriter* JXL_RESTRICT writer, size_t max_bits)
+    : max_bits_(max_bits) {
+  if (writer == nullptr) return;
+  prev_bits_written_ = writer->BitsWritten();
+  const size_t prev_bytes = writer->storage_.size();
+  const size_t next_bytes = DivCeil(max_bits, kBitsPerByte);
+  writer->storage_.resize(prev_bytes + next_bytes);
+  parent_ = writer->current_allotment_;
+  writer->current_allotment_ = this;
+}
+
+BitWriter::Allotment::~Allotment() {
+  if (!called_) {
+    // Not calling is a bug - unused storage will not be reclaimed.
+    JXL_ABORT("Did not call Allotment::ReclaimUnused");
+  }
+}
+
+void BitWriter::Allotment::FinishedHistogram(BitWriter* JXL_RESTRICT writer) {
+  if (writer == nullptr) return;
+  JXL_ASSERT(!called_);              // Call before ReclaimUnused
+  JXL_ASSERT(histogram_bits_ == 0);  // Do not call twice
+  JXL_ASSERT(writer->BitsWritten() >= prev_bits_written_);
+  histogram_bits_ = writer->BitsWritten() - prev_bits_written_;
+}
+
+void BitWriter::Allotment::ReclaimAndCharge(BitWriter* JXL_RESTRICT writer,
+                                            size_t layer,
+                                            AuxOut* JXL_RESTRICT aux_out) {
+  size_t used_bits, unused_bits;
+  PrivateReclaim(writer, &used_bits, &unused_bits);
+
+#if 0
+  printf("Layer %s bits: max %" PRIuS " used %" PRIuS " unused %" PRIuS "\n",
+         LayerName(layer), MaxBits(), used_bits, unused_bits);
+#endif
+
+  // This may be a nested call with aux_out == null. Whenever we know that
+  // aux_out is null, we can call ReclaimUnused directly.
+  if (aux_out != nullptr) {
+    aux_out->layers[layer].total_bits += used_bits;
+    aux_out->layers[layer].histogram_bits += HistogramBits();
+  }
+}
+
+void BitWriter::Allotment::PrivateReclaim(BitWriter* JXL_RESTRICT writer,
+                                          size_t* JXL_RESTRICT used_bits,
+                                          size_t* JXL_RESTRICT unused_bits) {
+  JXL_ASSERT(!called_);  // Do not call twice
+  called_ = true;
+  if (writer == nullptr) return;
+
+  JXL_ASSERT(writer->BitsWritten() >= prev_bits_written_);
+  *used_bits = writer->BitsWritten() - prev_bits_written_;
+  JXL_ASSERT(*used_bits <= max_bits_);
+  *unused_bits = max_bits_ - *used_bits;
+
+  // Reclaim unused bytes whole bytes from writer's allotment.
+  const size_t unused_bytes = *unused_bits / kBitsPerByte;  // truncate
+  JXL_ASSERT(writer->storage_.size() >= unused_bytes);
+  writer->storage_.resize(writer->storage_.size() - unused_bytes);
+  writer->current_allotment_ = parent_;
+  // Ensure we don't also charge the parent for these bits.
+  auto parent = parent_;
+  while (parent != nullptr) {
+    parent->prev_bits_written_ += *used_bits;
+    parent = parent->parent_;
+  }
+}
+
+void BitWriter::AppendByteAligned(const Span<const uint8_t>& span) {
+  if (span.empty()) return;
+  storage_.resize(storage_.size() + span.size() + 1);  // extra zero padding
+
+  // Concatenate by copying bytes because both source and destination are bytes.
+  JXL_ASSERT(BitsWritten() % kBitsPerByte == 0);
+  size_t pos = BitsWritten() / kBitsPerByte;
+  memcpy(storage_.data() + pos, span.data(), span.size());
+  pos += span.size();
+  storage_[pos++] = 0;  // for next Write
+  JXL_ASSERT(pos <= storage_.size());
+  bits_written_ += span.size() * kBitsPerByte;
+}
+
+void BitWriter::AppendByteAligned(const BitWriter& other) {
+  JXL_ASSERT(other.BitsWritten() % kBitsPerByte == 0);
+  JXL_ASSERT(other.BitsWritten() / kBitsPerByte != 0);
+
+  AppendByteAligned(other.GetSpan());
+}
+
+void BitWriter::AppendByteAligned(const std::vector<BitWriter>& others) {
+  // Total size to add so we can preallocate
+  size_t other_bytes = 0;
+  for (const BitWriter& writer : others) {
+    JXL_ASSERT(writer.BitsWritten() % kBitsPerByte == 0);
+    other_bytes += writer.BitsWritten() / kBitsPerByte;
+  }
+  if (other_bytes == 0) {
+    // No bytes to append: this happens for example when creating per-group
+    // storage for groups, but not writing anything in them for e.g. lossless
+    // images with no alpha. Do nothing.
+    return;
+  }
+  storage_.resize(storage_.size() + other_bytes + 1);  // extra zero padding
+
+  // Concatenate by copying bytes because both source and destination are bytes.
+  JXL_ASSERT(BitsWritten() % kBitsPerByte == 0);
+  size_t pos = BitsWritten() / kBitsPerByte;
+  for (const BitWriter& writer : others) {
+    const Span<const uint8_t> span = writer.GetSpan();
+    if (!span.empty()) {
+      memcpy(storage_.data() + pos, span.data(), span.size());
+      pos += span.size();
+    }
+  }
+  storage_[pos++] = 0;  // for next Write
+  JXL_ASSERT(pos <= storage_.size());
+  bits_written_ += other_bytes * kBitsPerByte;
+}
+
+// TODO(lode): avoid code duplication
+void BitWriter::AppendByteAligned(
+    const std::vector<std::unique_ptr<BitWriter>>& others) {
+  // Total size to add so we can preallocate
+  size_t other_bytes = 0;
+  for (const auto& writer : others) {
+    JXL_ASSERT(writer->BitsWritten() % kBitsPerByte == 0);
+    other_bytes += writer->BitsWritten() / kBitsPerByte;
+  }
+  if (other_bytes == 0) {
+    // No bytes to append: this happens for example when creating per-group
+    // storage for groups, but not writing anything in them for e.g. lossless
+    // images with no alpha. Do nothing.
+    return;
+  }
+  storage_.resize(storage_.size() + other_bytes + 1);  // extra zero padding
+
+  // Concatenate by copying bytes because both source and destination are bytes.
+  JXL_ASSERT(BitsWritten() % kBitsPerByte == 0);
+  size_t pos = BitsWritten() / kBitsPerByte;
+  for (const auto& writer : others) {
+    const Span<const uint8_t> span = writer->GetSpan();
+    memcpy(storage_.data() + pos, span.data(), span.size());
+    pos += span.size();
+  }
+  storage_[pos++] = 0;  // for next Write
+  JXL_ASSERT(pos <= storage_.size());
+  bits_written_ += other_bytes * kBitsPerByte;
+}
+
+// Example: let's assume that 3 bits (Rs below) have been written already:
+// BYTE+0       BYTE+1       BYTE+2
+// 0000 0RRR    ???? ????    ???? ????
+//
+// Now, we could write up to 5 bits by just shifting them left by 3 bits and
+// OR'ing to BYTE-0.
+//
+// For n > 5 bits, we write the lowest 5 bits as above, then write the next
+// lowest bits into BYTE+1 starting from its lower bits and so on.
+void BitWriter::Write(size_t n_bits, uint64_t bits) {
+  JXL_DASSERT((bits >> n_bits) == 0);
+  JXL_DASSERT(n_bits <= kMaxBitsPerCall);
+  uint8_t* p = &storage_[bits_written_ / kBitsPerByte];
+  const size_t bits_in_first_byte = bits_written_ % kBitsPerByte;
+  bits <<= bits_in_first_byte;
+#if JXL_BYTE_ORDER_LITTLE
+  uint64_t v = *p;
+  // Last (partial) or next byte to write must be zero-initialized!
+  // PaddedBytes initializes the first, and Write/Append maintain this.
+  JXL_DASSERT(v >> bits_in_first_byte == 0);
+  v |= bits;
+  memcpy(p, &v, sizeof(v));  // Write bytes: possibly more than n_bits/8
+#else
+  *p++ |= static_cast<uint8_t>(bits & 0xFF);
+  for (size_t bits_left_to_write = n_bits + bits_in_first_byte;
+       bits_left_to_write >= 9; bits_left_to_write -= 8) {
+    bits >>= 8;
+    *p++ = static_cast<uint8_t>(bits & 0xFF);
+  }
+  *p = 0;
+#endif
+  bits_written_ += n_bits;
+}
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/enc_bit_writer.h b/third_party/jpeg-xl/lib/jxl/enc_bit_writer.h
new file mode 100644
index 0000000000..d3fac15a68
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_bit_writer.h
@@ -0,0 +1,129 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_BIT_WRITER_H_
+#define LIB_JXL_ENC_BIT_WRITER_H_
+
+// BitWriter class: unbuffered writes using unaligned 64-bit stores.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <utility>
+#include <vector>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/padded_bytes.h"
+#include "lib/jxl/base/span.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/common.h"
+
+namespace jxl {
+
+struct AuxOut;
+
+struct BitWriter {
+  // Upper bound on `n_bits` in each call to Write. We shift a 64-bit word by
+  // 7 bits (max already valid bits in the last byte) and at least 1 bit is
+  // needed to zero-initialize the bit-stream ahead (i.e. if 7 bits are valid
+  // and we write 57 bits, then the next write will access a byte that was not
+  // yet zero-initialized).
+  static constexpr size_t kMaxBitsPerCall = 56;
+
+  BitWriter() : bits_written_(0) {}
+
+  // Disallow copying - may lead to bugs.
+  BitWriter(const BitWriter&) = delete;
+  BitWriter& operator=(const BitWriter&) = delete;
+  BitWriter(BitWriter&&) = default;
+  BitWriter& operator=(BitWriter&&) = default;
+
+  size_t BitsWritten() const { return bits_written_; }
+
+  Span<const uint8_t> GetSpan() const {
+    // Callers must ensure byte alignment to avoid uninitialized bits.
+    JXL_ASSERT(bits_written_ % kBitsPerByte == 0);
+    return Span<const uint8_t>(storage_.data(), bits_written_ / kBitsPerByte);
+  }
+
+  // Example usage: bytes = std::move(writer).TakeBytes(); Useful for the
+  // top-level encoder which returns PaddedBytes, not a BitWriter.
+  // *this must be an rvalue reference and is invalid afterwards.
+  PaddedBytes&& TakeBytes() && {
+    // Callers must ensure byte alignment to avoid uninitialized bits.
+    JXL_ASSERT(bits_written_ % kBitsPerByte == 0);
+    storage_.resize(bits_written_ / kBitsPerByte);
+    return std::move(storage_);
+  }
+
+ private:
+  // Must be byte-aligned before calling.
+  void AppendByteAligned(const Span<const uint8_t>& span);
+
+ public:
+  // NOTE: no allotment needed, the other BitWriters have already been charged.
+  void AppendByteAligned(const BitWriter& other);
+  void AppendByteAligned(const std::vector<std::unique_ptr<BitWriter>>& others);
+  void AppendByteAligned(const std::vector<BitWriter>& others);
+
+  class Allotment {
+   public:
+    // Expands a BitWriter's storage. Must happen before calling Write or
+    // ZeroPadToByte. Must call ReclaimUnused after writing to reclaim the
+    // unused storage so that BitWriter memory use remains tightly bounded.
+    Allotment(BitWriter* JXL_RESTRICT writer, size_t max_bits);
+    ~Allotment();
+
+    size_t MaxBits() const { return max_bits_; }
+
+    // Call after writing a histogram, but before ReclaimUnused.
+    void FinishedHistogram(BitWriter* JXL_RESTRICT writer);
+
+    size_t HistogramBits() const {
+      JXL_ASSERT(called_);
+      return histogram_bits_;
+    }
+
+    void ReclaimAndCharge(BitWriter* JXL_RESTRICT writer, size_t layer,
+                          AuxOut* JXL_RESTRICT aux_out);
+
+   private:
+    void PrivateReclaim(BitWriter* JXL_RESTRICT writer,
+                        size_t* JXL_RESTRICT used_bits,
+                        size_t* JXL_RESTRICT unused_bits);
+
+    size_t prev_bits_written_;
+    const size_t max_bits_;
+    size_t histogram_bits_ = 0;
+    bool called_ = false;
+    Allotment* parent_;
+  };
+
+  // Writes bits into bytes in increasing addresses, and within a byte
+  // least-significant-bit first.
+  //
+  // The function can write up to 56 bits in one go.
+  void Write(size_t n_bits, uint64_t bits);
+
+  // This should only rarely be used - e.g. when the current location will be
+  // referenced via byte offset (TOCs point to groups), or byte-aligned reading
+  // is required for speed.
+  void ZeroPadToByte() {
+    const size_t remainder_bits =
+        RoundUpBitsToByteMultiple(bits_written_) - bits_written_;
+    if (remainder_bits == 0) return;
+    Write(remainder_bits, 0);
+    JXL_ASSERT(bits_written_ % kBitsPerByte == 0);
+  }
+
+ private:
+  size_t bits_written_;
+  PaddedBytes storage_;
+  Allotment* current_allotment_ = nullptr;
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_BIT_WRITER_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_butteraugli_comparator.cc b/third_party/jpeg-xl/lib/jxl/enc_butteraugli_comparator.cc
new file mode 100644
index 0000000000..5711f45884
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_butteraugli_comparator.cc
@@ -0,0 +1,99 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_butteraugli_comparator.h"
+
+#include <algorithm>
+#include <vector>
+
+#include "lib/jxl/color_management.h"
+#include "lib/jxl/enc_image_bundle.h"
+
+namespace jxl {
+
+JxlButteraugliComparator::JxlButteraugliComparator(
+    const ButteraugliParams& params, const JxlCmsInterface& cms)
+    : params_(params), cms_(cms) {}
+
+Status JxlButteraugliComparator::SetReferenceImage(const ImageBundle& ref) {
+  const ImageBundle* ref_linear_srgb;
+  ImageMetadata metadata = *ref.metadata();
+  ImageBundle store(&metadata);
+  if (!TransformIfNeeded(ref, ColorEncoding::LinearSRGB(ref.IsGray()), cms_,
+                         /*pool=*/nullptr, &store, &ref_linear_srgb)) {
+    return false;
+  }
+
+  comparator_.reset(
+      new ButteraugliComparator(ref_linear_srgb->color(), params_));
+  xsize_ = ref.xsize();
+  ysize_ = ref.ysize();
+  return true;
+}
+
+Status JxlButteraugliComparator::CompareWith(const ImageBundle& actual,
+                                             ImageF* diffmap, float* score) {
+  if (!comparator_) {
+    return JXL_FAILURE("Must set reference image first");
+  }
+  if (xsize_ != actual.xsize() || ysize_ != actual.ysize()) {
+    return JXL_FAILURE("Images must have same size");
+  }
+
+  const ImageBundle* actual_linear_srgb;
+  ImageMetadata metadata = *actual.metadata();
+  ImageBundle store(&metadata);
+  if (!TransformIfNeeded(actual, ColorEncoding::LinearSRGB(actual.IsGray()),
+                         cms_,
+                         /*pool=*/nullptr, &store, &actual_linear_srgb)) {
+    return false;
+  }
+
+  ImageF temp_diffmap(xsize_, ysize_);
+  comparator_->Diffmap(actual_linear_srgb->color(), temp_diffmap);
+
+  if (score != nullptr) {
+    *score = ButteraugliScoreFromDiffmap(temp_diffmap, &params_);
+  }
+  if (diffmap != nullptr) {
+    diffmap->Swap(temp_diffmap);
+  }
+
+  return true;
+}
+
+float JxlButteraugliComparator::GoodQualityScore() const {
+  return ButteraugliFuzzyInverse(1.5);
+}
+
+float JxlButteraugliComparator::BadQualityScore() const {
+  return ButteraugliFuzzyInverse(0.5);
+}
+
+float ButteraugliDistance(const ImageBundle& rgb0, const ImageBundle& rgb1,
+                          const ButteraugliParams& params,
+                          const JxlCmsInterface& cms, ImageF* distmap,
+                          ThreadPool* pool) {
+  JxlButteraugliComparator comparator(params, cms);
+  return ComputeScore(rgb0, rgb1, &comparator, cms, distmap, pool);
+}
+
+float ButteraugliDistance(const std::vector<ImageBundle>& frames0,
+                          const std::vector<ImageBundle>& frames1,
+                          const ButteraugliParams& params,
+                          const JxlCmsInterface& cms, ImageF* distmap,
+                          ThreadPool* pool) {
+  JxlButteraugliComparator comparator(params, cms);
+  JXL_ASSERT(frames0.size() == frames1.size());
+  float max_dist = 0.0f;
+  for (size_t i = 0; i < frames0.size(); ++i) {
+    max_dist = std::max(
+        max_dist,
+        ComputeScore(frames0[i], frames1[i], &comparator, cms, distmap, pool));
+  }
+  return max_dist;
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/enc_butteraugli_comparator.h b/third_party/jpeg-xl/lib/jxl/enc_butteraugli_comparator.h
new file mode 100644
index 0000000000..6c37d1dc7d
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_butteraugli_comparator.h
@@ -0,0 +1,59 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_BUTTERAUGLI_COMPARATOR_H_
+#define LIB_JXL_ENC_BUTTERAUGLI_COMPARATOR_H_
+
+#include <jxl/cms_interface.h>
+#include <stddef.h>
+
+#include <memory>
+
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/butteraugli/butteraugli.h"
+#include "lib/jxl/enc_comparator.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/image_bundle.h"
+
+namespace jxl {
+
+class JxlButteraugliComparator : public Comparator {
+ public:
+  explicit JxlButteraugliComparator(const ButteraugliParams& params,
+                                    const JxlCmsInterface& cms);
+
+  Status SetReferenceImage(const ImageBundle& ref) override;
+
+  Status CompareWith(const ImageBundle& actual, ImageF* diffmap,
+                     float* score) override;
+
+  float GoodQualityScore() const override;
+  float BadQualityScore() const override;
+
+ private:
+  ButteraugliParams params_;
+  JxlCmsInterface cms_;
+  std::unique_ptr<ButteraugliComparator> comparator_;
+  size_t xsize_ = 0;
+  size_t ysize_ = 0;
+};
+
+// Returns the butteraugli distance between rgb0 and rgb1.
+// If distmap is not null, it must be the same size as rgb0 and rgb1.
+float ButteraugliDistance(const ImageBundle& rgb0, const ImageBundle& rgb1,
+                          const ButteraugliParams& params,
+                          const JxlCmsInterface& cms, ImageF* distmap = nullptr,
+                          ThreadPool* pool = nullptr);
+
+float ButteraugliDistance(const std::vector<ImageBundle>& frames0,
+                          const std::vector<ImageBundle>& frames1,
+                          const ButteraugliParams& params,
+                          const JxlCmsInterface& cms, ImageF* distmap = nullptr,
+                          ThreadPool* pool = nullptr);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_BUTTERAUGLI_COMPARATOR_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_butteraugli_pnorm.cc b/third_party/jpeg-xl/lib/jxl/enc_butteraugli_pnorm.cc
new file mode 100644
index 0000000000..fe5629dcda
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_butteraugli_pnorm.cc
@@ -0,0 +1,211 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_butteraugli_pnorm.h"
+
+#include <math.h>
+#include <stdlib.h>
+
+#include <atomic>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/enc_butteraugli_pnorm.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/profiler.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/color_encoding_internal.h"
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Add;
+using hwy::HWY_NAMESPACE::GetLane;
+using hwy::HWY_NAMESPACE::Mul;
+using hwy::HWY_NAMESPACE::Rebind;
+
+double ComputeDistanceP(const ImageF& distmap, const ButteraugliParams& params,
+                        double p) {
+  PROFILER_FUNC;
+
+  const double onePerPixels = 1.0 / (distmap.ysize() * distmap.xsize());
+  if (std::abs(p - 3.0) < 1E-6) {
+    double sum1[3] = {0.0};
+
+// Prefer double if possible, but otherwise use float rather than scalar.
+#if HWY_CAP_FLOAT64
+    using T = double;
+    const Rebind<float, HWY_FULL(double)> df;
+#else
+    using T = float;
+#endif
+    const HWY_FULL(T) d;
+    constexpr size_t N = MaxLanes(HWY_FULL(T)());
+    // Manually aligned storage to avoid asan crash on clang-7 due to
+    // unaligned spill.
+    HWY_ALIGN T sum_totals0[N] = {0};
+    HWY_ALIGN T sum_totals1[N] = {0};
+    HWY_ALIGN T sum_totals2[N] = {0};
+
+    for (size_t y = 0; y < distmap.ysize(); ++y) {
+      const float* JXL_RESTRICT row = distmap.ConstRow(y);
+
+      auto sums0 = Zero(d);
+      auto sums1 = Zero(d);
+      auto sums2 = Zero(d);
+
+      size_t x = 0;
+      for (; x + Lanes(d) <= distmap.xsize(); x += Lanes(d)) {
+#if HWY_CAP_FLOAT64
+        const auto d1 = PromoteTo(d, Load(df, row + x));
+#else
+        const auto d1 = Load(d, row + x);
+#endif
+        const auto d2 = Mul(d1, Mul(d1, d1));
+        sums0 = Add(sums0, d2);
+        const auto d3 = Mul(d2, d2);
+        sums1 = Add(sums1, d3);
+        const auto d4 = Mul(d3, d3);
+        sums2 = Add(sums2, d4);
+      }
+
+      Store(Add(sums0, Load(d, sum_totals0)), d, sum_totals0);
+      Store(Add(sums1, Load(d, sum_totals1)), d, sum_totals1);
+      Store(Add(sums2, Load(d, sum_totals2)), d, sum_totals2);
+
+      for (; x < distmap.xsize(); ++x) {
+        const double d1 = row[x];
+        double d2 = d1 * d1 * d1;
+        sum1[0] += d2;
+        d2 *= d2;
+        sum1[1] += d2;
+        d2 *= d2;
+        sum1[2] += d2;
+      }
+    }
+    double v = 0;
+    v += pow(
+        onePerPixels * (sum1[0] + GetLane(SumOfLanes(d, Load(d, sum_totals0)))),
+        1.0 / (p * 1.0));
+    v += pow(
+        onePerPixels * (sum1[1] + GetLane(SumOfLanes(d, Load(d, sum_totals1)))),
+        1.0 / (p * 2.0));
+    v += pow(
+        onePerPixels * (sum1[2] + GetLane(SumOfLanes(d, Load(d, sum_totals2)))),
+        1.0 / (p * 4.0));
+    v /= 3.0;
+    return v;
+  } else {
+    static std::atomic<int> once{0};
+    if (once.fetch_add(1, std::memory_order_relaxed) == 0) {
+      JXL_WARNING("WARNING: using slow ComputeDistanceP");
+    }
+    double sum1[3] = {0.0};
+    for (size_t y = 0; y < distmap.ysize(); ++y) {
+      const float* JXL_RESTRICT row = distmap.ConstRow(y);
+      for (size_t x = 0; x < distmap.xsize(); ++x) {
+        double d2 = std::pow(row[x], p);
+        sum1[0] += d2;
+        d2 *= d2;
+        sum1[1] += d2;
+        d2 *= d2;
+        sum1[2] += d2;
+      }
+    }
+    double v = 0;
+    for (int i = 0; i < 3; ++i) {
+      v += pow(onePerPixels * (sum1[i]), 1.0 / (p * (1 << i)));
+    }
+    v /= 3.0;
+    return v;
+  }
+}
+
+// TODO(lode): take alpha into account when needed
+double ComputeDistance2(const ImageBundle& ib1, const ImageBundle& ib2,
+                        const JxlCmsInterface& cms) {
+  PROFILER_FUNC;
+  // Convert to sRGB - closer to perception than linear.
+  const Image3F* srgb1 = &ib1.color();
+  Image3F copy1;
+  if (!ib1.IsSRGB()) {
+    JXL_CHECK(
+        ib1.CopyTo(Rect(ib1), ColorEncoding::SRGB(ib1.IsGray()), cms, &copy1));
+    srgb1 = &copy1;
+  }
+  const Image3F* srgb2 = &ib2.color();
+  Image3F copy2;
+  if (!ib2.IsSRGB()) {
+    JXL_CHECK(
+        ib2.CopyTo(Rect(ib2), ColorEncoding::SRGB(ib2.IsGray()), cms, &copy2));
+    srgb2 = &copy2;
+  }
+
+  JXL_CHECK(SameSize(*srgb1, *srgb2));
+
+  // TODO(veluca): SIMD.
+  float yuvmatrix[3][3] = {{0.299, 0.587, 0.114},
+                           {-0.14713, -0.28886, 0.436},
+                           {0.615, -0.51499, -0.10001}};
+  double sum_of_squares[3] = {};
+  for (size_t y = 0; y < srgb1->ysize(); ++y) {
+    const float* JXL_RESTRICT row1[3];
+    const float* JXL_RESTRICT row2[3];
+    for (size_t j = 0; j < 3; j++) {
+      row1[j] = srgb1->ConstPlaneRow(j, y);
+      row2[j] = srgb2->ConstPlaneRow(j, y);
+    }
+    for (size_t x = 0; x < srgb1->xsize(); ++x) {
+      float cdiff[3] = {};
+      // YUV conversion is linear, so we can run it on the difference.
+      for (size_t j = 0; j < 3; j++) {
+        cdiff[j] = row1[j][x] - row2[j][x];
+      }
+      float yuvdiff[3] = {};
+      for (size_t j = 0; j < 3; j++) {
+        for (size_t k = 0; k < 3; k++) {
+          yuvdiff[j] += yuvmatrix[j][k] * cdiff[k];
+        }
+      }
+      for (size_t j = 0; j < 3; j++) {
+        sum_of_squares[j] += yuvdiff[j] * yuvdiff[j];
+      }
+    }
+  }
+  // Weighted PSNR as in JPEG-XL: chroma counts 1/8.
+  const float weights[3] = {6.0f / 8, 1.0f / 8, 1.0f / 8};
+  // Avoid squaring the weight - 1/64 is too extreme.
+  double norm = 0;
+  for (size_t i = 0; i < 3; i++) {
+    norm += std::sqrt(sum_of_squares[i]) * weights[i];
+  }
+  // This function returns distance *squared*.
+  return norm * norm;
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+HWY_EXPORT(ComputeDistanceP);
+double ComputeDistanceP(const ImageF& distmap, const ButteraugliParams& params,
+                        double p) {
+  return HWY_DYNAMIC_DISPATCH(ComputeDistanceP)(distmap, params, p);
+}
+
+HWY_EXPORT(ComputeDistance2);
+double ComputeDistance2(const ImageBundle& ib1, const ImageBundle& ib2,
+                        const JxlCmsInterface& cms) {
+  return HWY_DYNAMIC_DISPATCH(ComputeDistance2)(ib1, ib2, cms);
+}
+
+}  // namespace jxl
+#endif
diff --git a/third_party/jpeg-xl/lib/jxl/enc_butteraugli_pnorm.h b/third_party/jpeg-xl/lib/jxl/enc_butteraugli_pnorm.h
new file mode 100644
index 0000000000..cf6872e5d0
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_butteraugli_pnorm.h
@@ -0,0 +1,25 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_BUTTERAUGLI_PNORM_H_
+#define LIB_JXL_ENC_BUTTERAUGLI_PNORM_H_
+
+#include <stdint.h>
+
+#include "lib/jxl/butteraugli/butteraugli.h"
+#include "lib/jxl/image_bundle.h"
+
+namespace jxl {
+
+// Computes p-norm given the butteraugli distmap.
+double ComputeDistanceP(const ImageF& distmap, const ButteraugliParams& params,
+                        double p);
+
+double ComputeDistance2(const ImageBundle& ib1, const ImageBundle& ib2,
+                        const JxlCmsInterface& cms);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_BUTTERAUGLI_PNORM_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_cache.cc b/third_party/jpeg-xl/lib/jxl/enc_cache.cc
new file mode 100644
index 0000000000..fc3e5c9f30
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_cache.cc
@@ -0,0 +1,218 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_cache.h"
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <type_traits>
+
+#include "lib/jxl/ac_strategy.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/padded_bytes.h"
+#include "lib/jxl/base/profiler.h"
+#include "lib/jxl/base/span.h"
+#include "lib/jxl/color_encoding_internal.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/compressed_dc.h"
+#include "lib/jxl/dct_scales.h"
+#include "lib/jxl/dct_util.h"
+#include "lib/jxl/dec_frame.h"
+#include "lib/jxl/enc_aux_out.h"
+#include "lib/jxl/enc_frame.h"
+#include "lib/jxl/enc_group.h"
+#include "lib/jxl/enc_modular.h"
+#include "lib/jxl/enc_quant_weights.h"
+#include "lib/jxl/frame_header.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/image_bundle.h"
+#include "lib/jxl/image_ops.h"
+#include "lib/jxl/passes_state.h"
+#include "lib/jxl/quantizer.h"
+
+namespace jxl {
+
+Status InitializePassesEncoder(const Image3F& opsin, const JxlCmsInterface& cms,
+                               ThreadPool* pool, PassesEncoderState* enc_state,
+                               ModularFrameEncoder* modular_frame_encoder,
+                               AuxOut* aux_out) {
+  PROFILER_FUNC;
+
+  PassesSharedState& JXL_RESTRICT shared = enc_state->shared;
+
+  enc_state->histogram_idx.resize(shared.frame_dim.num_groups);
+
+  enc_state->x_qm_multiplier =
+      std::pow(1.25f, shared.frame_header.x_qm_scale - 2.0f);
+  enc_state->b_qm_multiplier =
+      std::pow(1.25f, shared.frame_header.b_qm_scale - 2.0f);
+
+  if (enc_state->coeffs.size() < shared.frame_header.passes.num_passes) {
+    enc_state->coeffs.reserve(shared.frame_header.passes.num_passes);
+    for (size_t i = enc_state->coeffs.size();
+         i < shared.frame_header.passes.num_passes; i++) {
+      // Allocate enough coefficients for each group on every row.
+      enc_state->coeffs.emplace_back(make_unique<ACImageT<int32_t>>(
+          kGroupDim * kGroupDim, shared.frame_dim.num_groups));
+    }
+  }
+  while (enc_state->coeffs.size() > shared.frame_header.passes.num_passes) {
+    enc_state->coeffs.pop_back();
+  }
+
+  float scale =
+      shared.quantizer.ScaleGlobalScale(enc_state->cparams.quant_ac_rescale);
+  DequantMatricesScaleDC(&shared.matrices, scale);
+  shared.quantizer.RecomputeFromGlobalScale();
+
+  Image3F dc(shared.frame_dim.xsize_blocks, shared.frame_dim.ysize_blocks);
+  JXL_RETURN_IF_ERROR(RunOnPool(
+      pool, 0, shared.frame_dim.num_groups, ThreadPool::NoInit,
+      [&](size_t group_idx, size_t _) {
+        ComputeCoefficients(group_idx, enc_state, opsin, &dc);
+      },
+      "Compute coeffs"));
+
+  if (shared.frame_header.flags & FrameHeader::kUseDcFrame) {
+    CompressParams cparams = enc_state->cparams;
+    cparams.dots = Override::kOff;
+    cparams.noise = Override::kOff;
+    cparams.patches = Override::kOff;
+    cparams.gaborish = Override::kOff;
+    cparams.epf = 0;
+    cparams.resampling = 1;
+    cparams.ec_resampling = 1;
+    // The DC frame will have alpha=0. Don't erase its contents.
+    cparams.keep_invisible = Override::kOn;
+    JXL_ASSERT(cparams.progressive_dc > 0);
+    cparams.progressive_dc--;
+    // Use kVarDCT in max_error_mode for intermediate progressive DC,
+    // and kModular for the smallest DC (first in the bitstream)
+    if (cparams.progressive_dc == 0) {
+      cparams.modular_mode = true;
+      cparams.speed_tier =
+          SpeedTier(std::max(static_cast<int>(SpeedTier::kTortoise),
+                             static_cast<int>(cparams.speed_tier) - 1));
+      cparams.butteraugli_distance =
+          std::max(kMinButteraugliDistance,
+                   enc_state->cparams.butteraugli_distance * 0.02f);
+    } else {
+      cparams.max_error_mode = true;
+      for (size_t c = 0; c < 3; c++) {
+        cparams.max_error[c] = shared.quantizer.MulDC()[c];
+      }
+      // Guess a distance that produces good initial results.
+      cparams.butteraugli_distance =
+          std::max(kMinButteraugliDistance,
+                   enc_state->cparams.butteraugli_distance * 0.1f);
+    }
+    ImageBundle ib(&shared.metadata->m);
+    // This is a lie - dc is in XYB
+    // (but EncodeFrame will skip RGB->XYB conversion anyway)
+    ib.SetFromImage(
+        std::move(dc),
+        ColorEncoding::LinearSRGB(shared.metadata->m.color_encoding.IsGray()));
+    if (!ib.metadata()->extra_channel_info.empty()) {
+      // Add dummy extra channels to the patch image: dc_level frames do not yet
+      // support extra channels, but the codec expects that the amount of extra
+      // channels in frames matches that in the metadata of the codestream.
+      std::vector<ImageF> extra_channels;
+      extra_channels.reserve(ib.metadata()->extra_channel_info.size());
+      for (size_t i = 0; i < ib.metadata()->extra_channel_info.size(); i++) {
+        extra_channels.emplace_back(ib.xsize(), ib.ysize());
+        // Must initialize the image with data to not affect blending with
+        // uninitialized memory.
+        // TODO(lode): dc_level must copy and use the real extra channels
+        // instead.
+        ZeroFillImage(&extra_channels.back());
+      }
+      ib.SetExtraChannels(std::move(extra_channels));
+    }
+    std::unique_ptr<PassesEncoderState> state =
+        jxl::make_unique<PassesEncoderState>();
+
+    auto special_frame = std::unique_ptr<BitWriter>(new BitWriter());
+    FrameInfo dc_frame_info;
+    dc_frame_info.frame_type = FrameType::kDCFrame;
+    dc_frame_info.dc_level = shared.frame_header.dc_level + 1;
+    dc_frame_info.ib_needs_color_transform = false;
+    dc_frame_info.save_before_color_transform = true;  // Implicitly true
+    AuxOut dc_aux_out;
+    if (aux_out) {
+      dc_aux_out.debug_prefix = aux_out->debug_prefix;
+    }
+    JXL_CHECK(EncodeFrame(cparams, dc_frame_info, shared.metadata, ib,
+                          state.get(), cms, pool, special_frame.get(),
+                          aux_out ? &dc_aux_out : nullptr));
+    if (aux_out) {
+      for (const auto& l : dc_aux_out.layers) {
+        aux_out->layers[kLayerDC].Assimilate(l);
+      }
+    }
+    const Span<const uint8_t> encoded = special_frame->GetSpan();
+    enc_state->special_frames.emplace_back(std::move(special_frame));
+
+    ImageBundle decoded(&shared.metadata->m);
+    std::unique_ptr<PassesDecoderState> dec_state =
+        jxl::make_unique<PassesDecoderState>();
+    JXL_CHECK(
+        dec_state->output_encoding_info.SetFromMetadata(*shared.metadata));
+    const uint8_t* frame_start = encoded.data();
+    size_t encoded_size = encoded.size();
+    for (int i = 0; i <= cparams.progressive_dc; ++i) {
+      JXL_CHECK(DecodeFrame(dec_state.get(), pool, frame_start, encoded_size,
+                            &decoded, *shared.metadata));
+      frame_start += decoded.decoded_bytes();
+      encoded_size -= decoded.decoded_bytes();
+    }
+    // TODO(lode): shared.frame_header.dc_level should be equal to
+    // dec_state.shared->frame_header.dc_level - 1 here, since above we set
+    // dc_frame_info.dc_level = shared.frame_header.dc_level + 1, and
+    // dc_frame_info.dc_level is used by EncodeFrame. However, if EncodeFrame
+    // outputs multiple frames, this assumption could be wrong.
+    shared.dc_storage =
+        CopyImage(dec_state->shared->dc_frames[shared.frame_header.dc_level]);
+    ZeroFillImage(&shared.quant_dc);
+    shared.dc = &shared.dc_storage;
+    JXL_CHECK(encoded_size == 0);
+  } else {
+    auto compute_dc_coeffs = [&](int group_index, int /* thread */) {
+      modular_frame_encoder->AddVarDCTDC(
+          dc, group_index, enc_state->cparams.speed_tier < SpeedTier::kFalcon,
+          enc_state, /*jpeg_transcode=*/false);
+    };
+    JXL_RETURN_IF_ERROR(RunOnPool(pool, 0, shared.frame_dim.num_dc_groups,
+                                  ThreadPool::NoInit, compute_dc_coeffs,
+                                  "Compute DC coeffs"));
+    // TODO(veluca): this is only useful in tests and if inspection is enabled.
+    if (!(shared.frame_header.flags & FrameHeader::kSkipAdaptiveDCSmoothing)) {
+      AdaptiveDCSmoothing(shared.quantizer.MulDC(), &shared.dc_storage, pool);
+    }
+  }
+  auto compute_ac_meta = [&](int group_index, int /* thread */) {
+    modular_frame_encoder->AddACMetadata(group_index, /*jpeg_transcode=*/false,
+                                         enc_state);
+  };
+  JXL_RETURN_IF_ERROR(RunOnPool(pool, 0, shared.frame_dim.num_dc_groups,
+                                ThreadPool::NoInit, compute_ac_meta,
+                                "Compute AC Metadata"));
+
+  if (aux_out != nullptr) {
+    aux_out->InspectImage3F("compressed_image:InitializeFrameEncCache:dc_dec",
+                            shared.dc_storage);
+  }
+  return true;
+}
+
+void EncCache::InitOnce() {
+  PROFILER_FUNC;
+
+  if (num_nzeroes.xsize() == 0) {
+    num_nzeroes = Image3I(kGroupDimInBlocks, kGroupDimInBlocks);
+  }
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/enc_cache.h b/third_party/jpeg-xl/lib/jxl/enc_cache.h
new file mode 100644
index 0000000000..6c7870ba00
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_cache.h
@@ -0,0 +1,93 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_CACHE_H_
+#define LIB_JXL_ENC_CACHE_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <vector>
+
+#include "lib/jxl/ac_strategy.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/coeff_order.h"
+#include "lib/jxl/coeff_order_fwd.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/dct_util.h"
+#include "lib/jxl/enc_ans.h"
+#include "lib/jxl/enc_heuristics.h"
+#include "lib/jxl/enc_params.h"
+#include "lib/jxl/enc_progressive_split.h"
+#include "lib/jxl/frame_header.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/image_bundle.h"
+#include "lib/jxl/passes_state.h"
+#include "lib/jxl/quant_weights.h"
+#include "lib/jxl/quantizer.h"
+
+namespace jxl {
+
+struct AuxOut;
+
+// Contains encoder state.
+struct PassesEncoderState {
+  PassesSharedState shared;
+
+  ImageF initial_quant_field;    // Invalid in Falcon mode.
+  ImageF initial_quant_masking;  // Invalid in Falcon mode.
+
+  // Per-pass DCT coefficients for the image. One row per group.
+  std::vector<std::unique_ptr<ACImage>> coeffs;
+
+  // Raw data for special (reference+DC) frames.
+  std::vector<std::unique_ptr<BitWriter>> special_frames;
+
+  // For splitting into passes.
+  ProgressiveSplitter progressive_splitter;
+
+  CompressParams cparams;
+
+  struct PassData {
+    std::vector<std::vector<Token>> ac_tokens;
+    std::vector<uint8_t> context_map;
+    EntropyEncodingData codes;
+  };
+
+  std::vector<PassData> passes;
+  std::vector<uint8_t> histogram_idx;
+
+  // Coefficient orders that are non-default.
+  std::vector<uint32_t> used_orders;
+
+  // Multiplier to be applied to the quant matrices of the x channel.
+  float x_qm_multiplier = 1.0f;
+  float b_qm_multiplier = 1.0f;
+
+  // Heuristics to be used by the encoder.
+  std::unique_ptr<EncoderHeuristics> heuristics =
+      make_unique<DefaultEncoderHeuristics>();
+};
+
+// Initialize per-frame information.
+class ModularFrameEncoder;
+Status InitializePassesEncoder(const Image3F& opsin, const JxlCmsInterface& cms,
+                               ThreadPool* pool,
+                               PassesEncoderState* passes_enc_state,
+                               ModularFrameEncoder* modular_frame_encoder,
+                               AuxOut* aux_out);
+
+// Working area for ComputeCoefficients (per-group!)
+struct EncCache {
+  // Allocates memory when first called, shrinks images to current group size.
+  void InitOnce();
+
+  // TokenizeCoefficients
+  Image3I num_nzeroes;
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_CACHE_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_chroma_from_luma.cc b/third_party/jpeg-xl/lib/jxl/enc_chroma_from_luma.cc
new file mode 100644
index 0000000000..0cdd2a7823
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_chroma_from_luma.cc
@@ -0,0 +1,409 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_chroma_from_luma.h"
+
+#include <float.h>
+#include <stdlib.h>
+
+#include <algorithm>
+#include <array>
+#include <cmath>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/enc_chroma_from_luma.cc"
+#include <hwy/aligned_allocator.h>
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/base/bits.h"
+#include "lib/jxl/base/padded_bytes.h"
+#include "lib/jxl/base/profiler.h"
+#include "lib/jxl/base/span.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/dec_transforms-inl.h"
+#include "lib/jxl/enc_aux_out.h"
+#include "lib/jxl/enc_transforms-inl.h"
+#include "lib/jxl/entropy_coder.h"
+#include "lib/jxl/image_ops.h"
+#include "lib/jxl/modular/encoding/encoding.h"
+#include "lib/jxl/quantizer.h"
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Abs;
+using hwy::HWY_NAMESPACE::Ge;
+using hwy::HWY_NAMESPACE::GetLane;
+using hwy::HWY_NAMESPACE::IfThenElse;
+using hwy::HWY_NAMESPACE::Lt;
+
+static HWY_FULL(float) df;
+
+struct CFLFunction {
+  static constexpr float kCoeff = 1.f / 3;
+  static constexpr float kThres = 100.0f;
+  static constexpr float kInvColorFactor = 1.0f / kDefaultColorFactor;
+  CFLFunction(const float* values_m, const float* values_s, size_t num,
+              float base, float distance_mul)
+      : values_m(values_m),
+        values_s(values_s),
+        num(num),
+        base(base),
+        distance_mul(distance_mul) {}
+
+  // Returns f'(x), where f is 1/3 * sum ((|color residual| + 1)^2-1) +
+  // distance_mul * x^2 * num.
+  float Compute(float x, float eps, float* fpeps, float* fmeps) const {
+    float first_derivative = 2 * distance_mul * num * x;
+    float first_derivative_peps = 2 * distance_mul * num * (x + eps);
+    float first_derivative_meps = 2 * distance_mul * num * (x - eps);
+
+    const auto inv_color_factor = Set(df, kInvColorFactor);
+    const auto thres = Set(df, kThres);
+    const auto coeffx2 = Set(df, kCoeff * 2.0f);
+    const auto one = Set(df, 1.0f);
+    const auto zero = Set(df, 0.0f);
+    const auto base_v = Set(df, base);
+    const auto x_v = Set(df, x);
+    const auto xpe_v = Set(df, x + eps);
+    const auto xme_v = Set(df, x - eps);
+    auto fd_v = Zero(df);
+    auto fdpe_v = Zero(df);
+    auto fdme_v = Zero(df);
+    JXL_ASSERT(num % Lanes(df) == 0);
+
+    for (size_t i = 0; i < num; i += Lanes(df)) {
+      // color residual = ax + b
+      const auto a = Mul(inv_color_factor, Load(df, values_m + i));
+      const auto b =
+          Sub(Mul(base_v, Load(df, values_m + i)), Load(df, values_s + i));
+      const auto v = MulAdd(a, x_v, b);
+      const auto vpe = MulAdd(a, xpe_v, b);
+      const auto vme = MulAdd(a, xme_v, b);
+      const auto av = Abs(v);
+      const auto avpe = Abs(vpe);
+      const auto avme = Abs(vme);
+      const auto acoeffx2 = Mul(coeffx2, a);
+      auto d = Mul(acoeffx2, Add(av, one));
+      auto dpe = Mul(acoeffx2, Add(avpe, one));
+      auto dme = Mul(acoeffx2, Add(avme, one));
+      d = IfThenElse(Lt(v, zero), Sub(zero, d), d);
+      dpe = IfThenElse(Lt(vpe, zero), Sub(zero, dpe), dpe);
+      dme = IfThenElse(Lt(vme, zero), Sub(zero, dme), dme);
+      const auto above = Ge(av, thres);
+      // TODO(eustas): use IfThenElseZero
+      fd_v = Add(fd_v, IfThenElse(above, zero, d));
+      fdpe_v = Add(fdpe_v, IfThenElse(above, zero, dpe));
+      fdme_v = Add(fdme_v, IfThenElse(above, zero, dme));
+    }
+
+    *fpeps = first_derivative_peps + GetLane(SumOfLanes(df, fdpe_v));
+    *fmeps = first_derivative_meps + GetLane(SumOfLanes(df, fdme_v));
+    return first_derivative + GetLane(SumOfLanes(df, fd_v));
+  }
+
+  const float* JXL_RESTRICT values_m;
+  const float* JXL_RESTRICT values_s;
+  size_t num;
+  float base;
+  float distance_mul;
+};
+
+// Chroma-from-luma search, values_m will have luma -- and values_s chroma.
+int32_t FindBestMultiplier(const float* values_m, const float* values_s,
+                           size_t num, float base, float distance_mul,
+                           bool fast) {
+  if (num == 0) {
+    return 0;
+  }
+  float x;
+  if (fast) {
+    static constexpr float kInvColorFactor = 1.0f / kDefaultColorFactor;
+    auto ca = Zero(df);
+    auto cb = Zero(df);
+    const auto inv_color_factor = Set(df, kInvColorFactor);
+    const auto base_v = Set(df, base);
+    for (size_t i = 0; i < num; i += Lanes(df)) {
+      // color residual = ax + b
+      const auto a = Mul(inv_color_factor, Load(df, values_m + i));
+      const auto b =
+          Sub(Mul(base_v, Load(df, values_m + i)), Load(df, values_s + i));
+      ca = MulAdd(a, a, ca);
+      cb = MulAdd(a, b, cb);
+    }
+    // + distance_mul * x^2 * num
+    x = -GetLane(SumOfLanes(df, cb)) /
+        (GetLane(SumOfLanes(df, ca)) + num * distance_mul * 0.5f);
+  } else {
+    constexpr float eps = 100;
+    constexpr float kClamp = 20.0f;
+    CFLFunction fn(values_m, values_s, num, base, distance_mul);
+    x = 0;
+    // Up to 20 Newton iterations, with approximate derivatives.
+    // Derivatives are approximate due to the high amount of noise in the exact
+    // derivatives.
+    for (size_t i = 0; i < 20; i++) {
+      float dfpeps, dfmeps;
+      float df = fn.Compute(x, eps, &dfpeps, &dfmeps);
+      float ddf = (dfpeps - dfmeps) / (2 * eps);
+      float kExperimentalInsignificantStabilizer = 0.85;
+      float step = df / (ddf + kExperimentalInsignificantStabilizer);
+      x -= std::min(kClamp, std::max(-kClamp, step));
+      if (std::abs(step) < 3e-3) break;
+    }
+  }
+  // CFL seems to be tricky for larger transforms for HF components
+  // close to zero. This heuristic brings the solutions closer to zero
+  // and reduces red-green oscillations.
+  float towards_zero = 2.6;
+  if (x >= towards_zero) {
+    x -= towards_zero;
+  } else if (x <= -towards_zero) {
+    x += towards_zero;
+  } else {
+    x = 0;
+  }
+  return std::max(-128.0f, std::min(127.0f, roundf(x)));
+}
+
+void InitDCStorage(size_t num_blocks, ImageF* dc_values) {
+  // First row: Y channel
+  // Second row: X channel
+  // Third row: Y channel
+  // Fourth row: B channel
+  *dc_values = ImageF(RoundUpTo(num_blocks, Lanes(df)), 4);
+
+  JXL_ASSERT(dc_values->xsize() != 0);
+  // Zero-fill the last lanes
+  for (size_t y = 0; y < 4; y++) {
+    for (size_t x = dc_values->xsize() - Lanes(df); x < dc_values->xsize();
+         x++) {
+      dc_values->Row(y)[x] = 0;
+    }
+  }
+}
+
+void ComputeDC(const ImageF& dc_values, bool fast, int32_t* dc_x,
+               int32_t* dc_b) {
+  constexpr float kDistanceMultiplierDC = 1e-5f;
+  const float* JXL_RESTRICT dc_values_yx = dc_values.Row(0);
+  const float* JXL_RESTRICT dc_values_x = dc_values.Row(1);
+  const float* JXL_RESTRICT dc_values_yb = dc_values.Row(2);
+  const float* JXL_RESTRICT dc_values_b = dc_values.Row(3);
+  *dc_x = FindBestMultiplier(dc_values_yx, dc_values_x, dc_values.xsize(), 0.0f,
+                             kDistanceMultiplierDC, fast);
+  *dc_b = FindBestMultiplier(dc_values_yb, dc_values_b, dc_values.xsize(),
+                             kYToBRatio, kDistanceMultiplierDC, fast);
+}
+
+void ComputeTile(const Image3F& opsin, const DequantMatrices& dequant,
+                 const AcStrategyImage* ac_strategy,
+                 const ImageI* raw_quant_field, const Quantizer* quantizer,
+                 const Rect& r, bool fast, bool use_dct8, ImageSB* map_x,
+                 ImageSB* map_b, ImageF* dc_values, float* mem) {
+  static_assert(kEncTileDimInBlocks == kColorTileDimInBlocks,
+                "Invalid color tile dim");
+  size_t xsize_blocks = opsin.xsize() / kBlockDim;
+  constexpr float kDistanceMultiplierAC = 1e-9f;
+
+  const size_t y0 = r.y0();
+  const size_t x0 = r.x0();
+  const size_t x1 = r.x0() + r.xsize();
+  const size_t y1 = r.y0() + r.ysize();
+
+  int ty = y0 / kColorTileDimInBlocks;
+  int tx = x0 / kColorTileDimInBlocks;
+
+  int8_t* JXL_RESTRICT row_out_x = map_x->Row(ty);
+  int8_t* JXL_RESTRICT row_out_b = map_b->Row(ty);
+
+  float* JXL_RESTRICT dc_values_yx = dc_values->Row(0);
+  float* JXL_RESTRICT dc_values_x = dc_values->Row(1);
+  float* JXL_RESTRICT dc_values_yb = dc_values->Row(2);
+  float* JXL_RESTRICT dc_values_b = dc_values->Row(3);
+
+  // All are aligned.
+  float* HWY_RESTRICT block_y = mem;
+  float* HWY_RESTRICT block_x = block_y + AcStrategy::kMaxCoeffArea;
+  float* HWY_RESTRICT block_b = block_x + AcStrategy::kMaxCoeffArea;
+  float* HWY_RESTRICT coeffs_yx = block_b + AcStrategy::kMaxCoeffArea;
+  float* HWY_RESTRICT coeffs_x = coeffs_yx + kColorTileDim * kColorTileDim;
+  float* HWY_RESTRICT coeffs_yb = coeffs_x + kColorTileDim * kColorTileDim;
+  float* HWY_RESTRICT coeffs_b = coeffs_yb + kColorTileDim * kColorTileDim;
+  float* HWY_RESTRICT scratch_space = coeffs_b + kColorTileDim * kColorTileDim;
+  JXL_DASSERT(scratch_space + 2 * AcStrategy::kMaxCoeffArea ==
+              block_y + CfLHeuristics::kItemsPerThread);
+
+  // Small (~256 bytes each)
+  HWY_ALIGN_MAX float
+      dc_y[AcStrategy::kMaxCoeffBlocks * AcStrategy::kMaxCoeffBlocks] = {};
+  HWY_ALIGN_MAX float
+      dc_x[AcStrategy::kMaxCoeffBlocks * AcStrategy::kMaxCoeffBlocks] = {};
+  HWY_ALIGN_MAX float
+      dc_b[AcStrategy::kMaxCoeffBlocks * AcStrategy::kMaxCoeffBlocks] = {};
+  size_t num_ac = 0;
+
+  for (size_t y = y0; y < y1; ++y) {
+    const float* JXL_RESTRICT row_y = opsin.ConstPlaneRow(1, y * kBlockDim);
+    const float* JXL_RESTRICT row_x = opsin.ConstPlaneRow(0, y * kBlockDim);
+    const float* JXL_RESTRICT row_b = opsin.ConstPlaneRow(2, y * kBlockDim);
+    size_t stride = opsin.PixelsPerRow();
+
+    for (size_t x = x0; x < x1; x++) {
+      AcStrategy acs = use_dct8
+                           ? AcStrategy::FromRawStrategy(AcStrategy::Type::DCT)
+                           : ac_strategy->ConstRow(y)[x];
+      if (!acs.IsFirstBlock()) continue;
+      size_t xs = acs.covered_blocks_x();
+      TransformFromPixels(acs.Strategy(), row_y + x * kBlockDim, stride,
+                          block_y, scratch_space);
+      DCFromLowestFrequencies(acs.Strategy(), block_y, dc_y, xs);
+      TransformFromPixels(acs.Strategy(), row_x + x * kBlockDim, stride,
+                          block_x, scratch_space);
+      DCFromLowestFrequencies(acs.Strategy(), block_x, dc_x, xs);
+      TransformFromPixels(acs.Strategy(), row_b + x * kBlockDim, stride,
+                          block_b, scratch_space);
+      DCFromLowestFrequencies(acs.Strategy(), block_b, dc_b, xs);
+      const float* const JXL_RESTRICT qm_x =
+          dequant.InvMatrix(acs.Strategy(), 0);
+      const float* const JXL_RESTRICT qm_b =
+          dequant.InvMatrix(acs.Strategy(), 2);
+      float q_dc_x = use_dct8 ? 1 : 1.0f / quantizer->GetInvDcStep(0);
+      float q_dc_b = use_dct8 ? 1 : 1.0f / quantizer->GetInvDcStep(2);
+
+      // Copy DCs in dc_values.
+      for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) {
+        for (size_t ix = 0; ix < xs; ix++) {
+          dc_values_yx[(iy + y) * xsize_blocks + ix + x] =
+              dc_y[iy * xs + ix] * q_dc_x;
+          dc_values_x[(iy + y) * xsize_blocks + ix + x] =
+              dc_x[iy * xs + ix] * q_dc_x;
+          dc_values_yb[(iy + y) * xsize_blocks + ix + x] =
+              dc_y[iy * xs + ix] * q_dc_b;
+          dc_values_b[(iy + y) * xsize_blocks + ix + x] =
+              dc_b[iy * xs + ix] * q_dc_b;
+        }
+      }
+
+      // Do not use this block for computing AC CfL.
+      if (acs.covered_blocks_x() + x0 > x1 ||
+          acs.covered_blocks_y() + y0 > y1) {
+        continue;
+      }
+
+      // Copy AC coefficients in the local block. The order in which
+      // coefficients get stored does not matter.
+      size_t cx = acs.covered_blocks_x();
+      size_t cy = acs.covered_blocks_y();
+      CoefficientLayout(&cy, &cx);
+      // Zero out LFs. This introduces terms in the optimization loop that
+      // don't affect the result, as they are all 0, but allow for simpler
+      // SIMDfication.
+      for (size_t iy = 0; iy < cy; iy++) {
+        for (size_t ix = 0; ix < cx; ix++) {
+          block_y[cx * kBlockDim * iy + ix] = 0;
+          block_x[cx * kBlockDim * iy + ix] = 0;
+          block_b[cx * kBlockDim * iy + ix] = 0;
+        }
+      }
+      // Unclear why this is like it is. (This works slightly better
+      // than the previous approach which was also a hack.)
+      const float qq =
+          (raw_quant_field == nullptr) ? 1.0f : raw_quant_field->Row(y)[x];
+      // Experimentally values 128-130 seem best -- I don't know why we
+      // need this multiplier.
+      const float kStrangeMultiplier = 128;
+      float q = use_dct8 ? 1 : quantizer->Scale() * kStrangeMultiplier * qq;
+      const auto qv = Set(df, q);
+      for (size_t i = 0; i < cx * cy * 64; i += Lanes(df)) {
+        const auto b_y = Load(df, block_y + i);
+        const auto b_x = Load(df, block_x + i);
+        const auto b_b = Load(df, block_b + i);
+        const auto qqm_x = Mul(qv, Load(df, qm_x + i));
+        const auto qqm_b = Mul(qv, Load(df, qm_b + i));
+        Store(Mul(b_y, qqm_x), df, coeffs_yx + num_ac);
+        Store(Mul(b_x, qqm_x), df, coeffs_x + num_ac);
+        Store(Mul(b_y, qqm_b), df, coeffs_yb + num_ac);
+        Store(Mul(b_b, qqm_b), df, coeffs_b + num_ac);
+        num_ac += Lanes(df);
+      }
+    }
+  }
+  JXL_CHECK(num_ac % Lanes(df) == 0);
+  row_out_x[tx] = FindBestMultiplier(coeffs_yx, coeffs_x, num_ac, 0.0f,
+                                     kDistanceMultiplierAC, fast);
+  row_out_b[tx] = FindBestMultiplier(coeffs_yb, coeffs_b, num_ac, kYToBRatio,
+                                     kDistanceMultiplierAC, fast);
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+HWY_EXPORT(InitDCStorage);
+HWY_EXPORT(ComputeDC);
+HWY_EXPORT(ComputeTile);
+
+void CfLHeuristics::Init(const Image3F& opsin) {
+  size_t xsize_blocks = opsin.xsize() / kBlockDim;
+  size_t ysize_blocks = opsin.ysize() / kBlockDim;
+  HWY_DYNAMIC_DISPATCH(InitDCStorage)
+  (xsize_blocks * ysize_blocks, &dc_values);
+}
+
+void CfLHeuristics::ComputeTile(const Rect& r, const Image3F& opsin,
+                                const DequantMatrices& dequant,
+                                const AcStrategyImage* ac_strategy,
+                                const ImageI* raw_quant_field,
+                                const Quantizer* quantizer, bool fast,
+                                size_t thread, ColorCorrelationMap* cmap) {
+  bool use_dct8 = ac_strategy == nullptr;
+  HWY_DYNAMIC_DISPATCH(ComputeTile)
+  (opsin, dequant, ac_strategy, raw_quant_field, quantizer, r, fast, use_dct8,
+   &cmap->ytox_map, &cmap->ytob_map, &dc_values,
+   mem.get() + thread * kItemsPerThread);
+}
+
+void CfLHeuristics::ComputeDC(bool fast, ColorCorrelationMap* cmap) {
+  int32_t ytob_dc = 0;
+  int32_t ytox_dc = 0;
+  HWY_DYNAMIC_DISPATCH(ComputeDC)(dc_values, fast, &ytox_dc, &ytob_dc);
+  cmap->SetYToBDC(ytob_dc);
+  cmap->SetYToXDC(ytox_dc);
+}
+
+void ColorCorrelationMapEncodeDC(ColorCorrelationMap* map, BitWriter* writer,
+                                 size_t layer, AuxOut* aux_out) {
+  float color_factor = map->GetColorFactor();
+  float base_correlation_x = map->GetBaseCorrelationX();
+  float base_correlation_b = map->GetBaseCorrelationB();
+  int32_t ytox_dc = map->GetYToXDC();
+  int32_t ytob_dc = map->GetYToBDC();
+
+  BitWriter::Allotment allotment(writer, 1 + 2 * kBitsPerByte + 12 + 32);
+  if (ytox_dc == 0 && ytob_dc == 0 && color_factor == kDefaultColorFactor &&
+      base_correlation_x == 0.0f && base_correlation_b == kYToBRatio) {
+    writer->Write(1, 1);
+    allotment.ReclaimAndCharge(writer, layer, aux_out);
+    return;
+  }
+  writer->Write(1, 0);
+  JXL_CHECK(U32Coder::Write(kColorFactorDist, color_factor, writer));
+  JXL_CHECK(F16Coder::Write(base_correlation_x, writer));
+  JXL_CHECK(F16Coder::Write(base_correlation_b, writer));
+  writer->Write(kBitsPerByte, ytox_dc - std::numeric_limits<int8_t>::min());
+  writer->Write(kBitsPerByte, ytob_dc - std::numeric_limits<int8_t>::min());
+  allotment.ReclaimAndCharge(writer, layer, aux_out);
+}
+
+}  // namespace jxl
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jxl/enc_chroma_from_luma.h b/third_party/jpeg-xl/lib/jxl/enc_chroma_from_luma.h
new file mode 100644
index 0000000000..899b91b041
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_chroma_from_luma.h
@@ -0,0 +1,68 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_CHROMA_FROM_LUMA_H_
+#define LIB_JXL_ENC_CHROMA_FROM_LUMA_H_
+
+// Chroma-from-luma, computed using heuristics to determine the best linear
+// model for the X and B channels from the Y channel.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <vector>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/chroma_from_luma.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/dec_ans.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/enc_ans.h"
+#include "lib/jxl/enc_bit_writer.h"
+#include "lib/jxl/entropy_coder.h"
+#include "lib/jxl/field_encodings.h"
+#include "lib/jxl/fields.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/opsin_params.h"
+#include "lib/jxl/quant_weights.h"
+
+namespace jxl {
+
+struct AuxOut;
+class Quantizer;
+
+void ColorCorrelationMapEncodeDC(ColorCorrelationMap* map, BitWriter* writer,
+                                 size_t layer, AuxOut* aux_out);
+
+struct CfLHeuristics {
+  void Init(const Image3F& opsin);
+
+  void PrepareForThreads(size_t num_threads) {
+    mem = hwy::AllocateAligned<float>(num_threads * kItemsPerThread);
+  }
+
+  void ComputeTile(const Rect& r, const Image3F& opsin,
+                   const DequantMatrices& dequant,
+                   const AcStrategyImage* ac_strategy,
+                   const ImageI* raw_quant_field, const Quantizer* quantizer,
+                   bool fast, size_t thread, ColorCorrelationMap* cmap);
+
+  void ComputeDC(bool fast, ColorCorrelationMap* cmap);
+
+  ImageF dc_values;
+  hwy::AlignedFreeUniquePtr<float[]> mem;
+
+  // Working set is too large for stack; allocate dynamically.
+  constexpr static size_t kItemsPerThread =
+      AcStrategy::kMaxCoeffArea * 3        // Blocks
+      + kColorTileDim * kColorTileDim * 4  // AC coeff storage
+      + AcStrategy::kMaxCoeffArea * 2;     // Scratch space
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_CHROMA_FROM_LUMA_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_cluster.cc b/third_party/jpeg-xl/lib/jxl/enc_cluster.cc
new file mode 100644
index 0000000000..c79b3ac834
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_cluster.cc
@@ -0,0 +1,295 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_cluster.h"
+
+#include <algorithm>
+#include <cmath>
+#include <limits>
+#include <map>
+#include <memory>
+#include <numeric>
+#include <queue>
+#include <tuple>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/enc_cluster.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/ac_context.h"
+#include "lib/jxl/base/profiler.h"
+#include "lib/jxl/fast_math-inl.h"
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Eq;
+using hwy::HWY_NAMESPACE::IfThenZeroElse;
+
+template <class V>
+V Entropy(V count, V inv_total, V total) {
+  const HWY_CAPPED(float, Histogram::kRounding) d;
+  const auto zero = Set(d, 0.0f);
+  // TODO(eustas): why (0 - x) instead of Neg(x)?
+  return IfThenZeroElse(
+      Eq(count, total),
+      Sub(zero, Mul(count, FastLog2f(d, Mul(inv_total, count)))));
+}
+
+void HistogramEntropy(const Histogram& a) {
+  a.entropy_ = 0.0f;
+  if (a.total_count_ == 0) return;
+
+  const HWY_CAPPED(float, Histogram::kRounding) df;
+  const HWY_CAPPED(int32_t, Histogram::kRounding) di;
+
+  const auto inv_tot = Set(df, 1.0f / a.total_count_);
+  auto entropy_lanes = Zero(df);
+  auto total = Set(df, a.total_count_);
+
+  for (size_t i = 0; i < a.data_.size(); i += Lanes(di)) {
+    const auto counts = LoadU(di, &a.data_[i]);
+    entropy_lanes =
+        Add(entropy_lanes, Entropy(ConvertTo(df, counts), inv_tot, total));
+  }
+  a.entropy_ += GetLane(SumOfLanes(df, entropy_lanes));
+}
+
+float HistogramDistance(const Histogram& a, const Histogram& b) {
+  if (a.total_count_ == 0 || b.total_count_ == 0) return 0;
+
+  const HWY_CAPPED(float, Histogram::kRounding) df;
+  const HWY_CAPPED(int32_t, Histogram::kRounding) di;
+
+  const auto inv_tot = Set(df, 1.0f / (a.total_count_ + b.total_count_));
+  auto distance_lanes = Zero(df);
+  auto total = Set(df, a.total_count_ + b.total_count_);
+
+  for (size_t i = 0; i < std::max(a.data_.size(), b.data_.size());
+       i += Lanes(di)) {
+    const auto a_counts =
+        a.data_.size() > i ? LoadU(di, &a.data_[i]) : Zero(di);
+    const auto b_counts =
+        b.data_.size() > i ? LoadU(di, &b.data_[i]) : Zero(di);
+    const auto counts = ConvertTo(df, Add(a_counts, b_counts));
+    distance_lanes = Add(distance_lanes, Entropy(counts, inv_tot, total));
+  }
+  const float total_distance = GetLane(SumOfLanes(df, distance_lanes));
+  return total_distance - a.entropy_ - b.entropy_;
+}
+
+// First step of a k-means clustering with a fancy distance metric.
+void FastClusterHistograms(const std::vector<Histogram>& in,
+                           size_t max_histograms, std::vector<Histogram>* out,
+                           std::vector<uint32_t>* histogram_symbols) {
+  PROFILER_FUNC;
+  out->clear();
+  out->reserve(max_histograms);
+  histogram_symbols->clear();
+  histogram_symbols->resize(in.size(), max_histograms);
+
+  std::vector<float> dists(in.size(), std::numeric_limits<float>::max());
+  size_t largest_idx = 0;
+  for (size_t i = 0; i < in.size(); i++) {
+    if (in[i].total_count_ == 0) {
+      (*histogram_symbols)[i] = 0;
+      dists[i] = 0.0f;
+      continue;
+    }
+    HistogramEntropy(in[i]);
+    if (in[i].total_count_ > in[largest_idx].total_count_) {
+      largest_idx = i;
+    }
+  }
+
+  constexpr float kMinDistanceForDistinct = 48.0f;
+  while (out->size() < max_histograms) {
+    (*histogram_symbols)[largest_idx] = out->size();
+    out->push_back(in[largest_idx]);
+    dists[largest_idx] = 0.0f;
+    largest_idx = 0;
+    for (size_t i = 0; i < in.size(); i++) {
+      if (dists[i] == 0.0f) continue;
+      dists[i] = std::min(HistogramDistance(in[i], out->back()), dists[i]);
+      if (dists[i] > dists[largest_idx]) largest_idx = i;
+    }
+    if (dists[largest_idx] < kMinDistanceForDistinct) break;
+  }
+
+  for (size_t i = 0; i < in.size(); i++) {
+    if ((*histogram_symbols)[i] != max_histograms) continue;
+    size_t best = 0;
+    float best_dist = HistogramDistance(in[i], (*out)[best]);
+    for (size_t j = 1; j < out->size(); j++) {
+      float dist = HistogramDistance(in[i], (*out)[j]);
+      if (dist < best_dist) {
+        best = j;
+        best_dist = dist;
+      }
+    }
+    (*out)[best].AddHistogram(in[i]);
+    HistogramEntropy((*out)[best]);
+    (*histogram_symbols)[i] = best;
+  }
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+HWY_EXPORT(FastClusterHistograms);  // Local function
+HWY_EXPORT(HistogramEntropy);       // Local function
+
+float Histogram::ShannonEntropy() const {
+  HWY_DYNAMIC_DISPATCH(HistogramEntropy)(*this);
+  return entropy_;
+}
+
+namespace {
+// -----------------------------------------------------------------------------
+// Histogram refinement
+
+// Reorder histograms in *out so that the new symbols in *symbols come in
+// increasing order.
+void HistogramReindex(std::vector<Histogram>* out,
+                      std::vector<uint32_t>* symbols) {
+  std::vector<Histogram> tmp(*out);
+  std::map<int, int> new_index;
+  int next_index = 0;
+  for (uint32_t symbol : *symbols) {
+    if (new_index.find(symbol) == new_index.end()) {
+      new_index[symbol] = next_index;
+      (*out)[next_index] = tmp[symbol];
+      ++next_index;
+    }
+  }
+  out->resize(next_index);
+  for (uint32_t& symbol : *symbols) {
+    symbol = new_index[symbol];
+  }
+}
+
+}  // namespace
+
+// Clusters similar histograms in 'in' together, the selected histograms are
+// placed in 'out', and for each index in 'in', *histogram_symbols will
+// indicate which of the 'out' histograms is the best approximation.
+void ClusterHistograms(const HistogramParams params,
+                       const std::vector<Histogram>& in, size_t max_histograms,
+                       std::vector<Histogram>* out,
+                       std::vector<uint32_t>* histogram_symbols) {
+  max_histograms = std::min(max_histograms, params.max_histograms);
+  max_histograms = std::min(max_histograms, in.size());
+  if (params.clustering == HistogramParams::ClusteringType::kFastest) {
+    max_histograms = std::min(max_histograms, static_cast<size_t>(4));
+  }
+
+  HWY_DYNAMIC_DISPATCH(FastClusterHistograms)
+  (in, max_histograms, out, histogram_symbols);
+
+  if (params.clustering == HistogramParams::ClusteringType::kBest) {
+    for (size_t i = 0; i < out->size(); i++) {
+      (*out)[i].entropy_ =
+          ANSPopulationCost((*out)[i].data_.data(), (*out)[i].data_.size());
+    }
+    uint32_t next_version = 2;
+    std::vector<uint32_t> version(out->size(), 1);
+    std::vector<uint32_t> renumbering(out->size());
+    std::iota(renumbering.begin(), renumbering.end(), 0);
+
+    // Try to pair up clusters if doing so reduces the total cost.
+
+    struct HistogramPair {
+      // validity of a pair: p.version == max(version[i], version[j])
+      float cost;
+      uint32_t first;
+      uint32_t second;
+      uint32_t version;
+      // We use > because priority queues sort in *decreasing* order, but we
+      // want lower cost elements to appear first.
+      bool operator<(const HistogramPair& other) const {
+        return std::make_tuple(cost, first, second, version) >
+               std::make_tuple(other.cost, other.first, other.second,
+                               other.version);
+      }
+    };
+
+    // Create list of all pairs by increasing merging cost.
+    std::priority_queue<HistogramPair> pairs_to_merge;
+    for (uint32_t i = 0; i < out->size(); i++) {
+      for (uint32_t j = i + 1; j < out->size(); j++) {
+        Histogram histo;
+        histo.AddHistogram((*out)[i]);
+        histo.AddHistogram((*out)[j]);
+        float cost = ANSPopulationCost(histo.data_.data(), histo.data_.size()) -
+                     (*out)[i].entropy_ - (*out)[j].entropy_;
+        // Avoid enqueueing pairs that are not advantageous to merge.
+        if (cost >= 0) continue;
+        pairs_to_merge.push(
+            HistogramPair{cost, i, j, std::max(version[i], version[j])});
+      }
+    }
+
+    // Merge the best pair to merge, add new pairs that get formed as a
+    // consequence.
+    while (!pairs_to_merge.empty()) {
+      uint32_t first = pairs_to_merge.top().first;
+      uint32_t second = pairs_to_merge.top().second;
+      uint32_t ver = pairs_to_merge.top().version;
+      pairs_to_merge.pop();
+      if (ver != std::max(version[first], version[second]) ||
+          version[first] == 0 || version[second] == 0) {
+        continue;
+      }
+      (*out)[first].AddHistogram((*out)[second]);
+      (*out)[first].entropy_ = ANSPopulationCost((*out)[first].data_.data(),
+                                                 (*out)[first].data_.size());
+      for (size_t i = 0; i < renumbering.size(); i++) {
+        if (renumbering[i] == second) {
+          renumbering[i] = first;
+        }
+      }
+      version[second] = 0;
+      version[first] = next_version++;
+      for (uint32_t j = 0; j < out->size(); j++) {
+        if (j == first) continue;
+        if (version[j] == 0) continue;
+        Histogram histo;
+        histo.AddHistogram((*out)[first]);
+        histo.AddHistogram((*out)[j]);
+        float cost = ANSPopulationCost(histo.data_.data(), histo.data_.size()) -
+                     (*out)[first].entropy_ - (*out)[j].entropy_;
+        // Avoid enqueueing pairs that are not advantageous to merge.
+        if (cost >= 0) continue;
+        pairs_to_merge.push(
+            HistogramPair{cost, std::min(first, j), std::max(first, j),
+                          std::max(version[first], version[j])});
+      }
+    }
+    std::vector<uint32_t> reverse_renumbering(out->size(), -1);
+    size_t num_alive = 0;
+    for (size_t i = 0; i < out->size(); i++) {
+      if (version[i] == 0) continue;
+      (*out)[num_alive++] = (*out)[i];
+      reverse_renumbering[i] = num_alive - 1;
+    }
+    out->resize(num_alive);
+    for (size_t i = 0; i < histogram_symbols->size(); i++) {
+      (*histogram_symbols)[i] =
+          reverse_renumbering[renumbering[(*histogram_symbols)[i]]];
+    }
+  }
+
+  // Convert the context map to a canonical form.
+  HistogramReindex(out, histogram_symbols);
+}
+
+}  // namespace jxl
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jxl/enc_cluster.h b/third_party/jpeg-xl/lib/jxl/enc_cluster.h
new file mode 100644
index 0000000000..4b062e820c
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_cluster.h
@@ -0,0 +1,63 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Functions for clustering similar histograms together.
+
+#ifndef LIB_JXL_ENC_CLUSTER_H_
+#define LIB_JXL_ENC_CLUSTER_H_
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include <vector>
+
+#include "lib/jxl/ans_params.h"
+#include "lib/jxl/enc_ans.h"
+
+namespace jxl {
+
+struct Histogram {
+  Histogram() {
+    total_count_ = 0;
+    entropy_ = 0.0;
+  }
+  void Clear() {
+    data_.clear();
+    total_count_ = 0;
+  }
+  void Add(size_t symbol) {
+    if (data_.size() <= symbol) {
+      data_.resize(DivCeil(symbol + 1, kRounding) * kRounding);
+    }
+    ++data_[symbol];
+    ++total_count_;
+  }
+  void AddHistogram(const Histogram& other) {
+    if (other.data_.size() > data_.size()) {
+      data_.resize(other.data_.size());
+    }
+    for (size_t i = 0; i < other.data_.size(); ++i) {
+      data_[i] += other.data_[i];
+    }
+    total_count_ += other.total_count_;
+  }
+  float PopulationCost() const {
+    return ANSPopulationCost(data_.data(), data_.size());
+  }
+  float ShannonEntropy() const;
+
+  std::vector<ANSHistBin> data_;
+  size_t total_count_;
+  mutable float entropy_;  // WARNING: not kept up-to-date.
+  static constexpr size_t kRounding = 8;
+};
+
+void ClusterHistograms(HistogramParams params, const std::vector<Histogram>& in,
+                       size_t max_histograms, std::vector<Histogram>* out,
+                       std::vector<uint32_t>* histogram_symbols);
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_CLUSTER_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_coeff_order.cc b/third_party/jpeg-xl/lib/jxl/enc_coeff_order.cc
new file mode 100644
index 0000000000..389b53598a
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_coeff_order.cc
@@ -0,0 +1,291 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <stdint.h>
+
+#include <algorithm>
+#include <hwy/aligned_allocator.h>
+#include <vector>
+
+#include "lib/jxl/ans_params.h"
+#include "lib/jxl/base/padded_bytes.h"
+#include "lib/jxl/base/profiler.h"
+#include "lib/jxl/base/span.h"
+#include "lib/jxl/coeff_order.h"
+#include "lib/jxl/coeff_order_fwd.h"
+#include "lib/jxl/dec_ans.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/enc_ans.h"
+#include "lib/jxl/enc_bit_writer.h"
+#include "lib/jxl/entropy_coder.h"
+#include "lib/jxl/lehmer_code.h"
+#include "lib/jxl/modular/encoding/encoding.h"
+#include "lib/jxl/modular/modular_image.h"
+
+namespace jxl {
+
+struct AuxOut;
+
+std::pair<uint32_t, uint32_t> ComputeUsedOrders(
+    const SpeedTier speed, const AcStrategyImage& ac_strategy,
+    const Rect& rect) {
+  // Only uses DCT8 = 0, so bitfield = 1.
+  if (speed >= SpeedTier::kFalcon) return {1, 1};
+
+  uint32_t ret = 0;
+  uint32_t ret_customize = 0;
+  size_t xsize_blocks = rect.xsize();
+  size_t ysize_blocks = rect.ysize();
+  // TODO(veluca): precompute when doing DCT.
+  for (size_t by = 0; by < ysize_blocks; ++by) {
+    AcStrategyRow acs_row = ac_strategy.ConstRow(rect, by);
+    for (size_t bx = 0; bx < xsize_blocks; ++bx) {
+      int ord = kStrategyOrder[acs_row[bx].RawStrategy()];
+      // Do not customize coefficient orders for blocks bigger than 32x32.
+      ret |= 1u << ord;
+      if (ord > 6) {
+        continue;
+      }
+      ret_customize |= 1u << ord;
+    }
+  }
+  // Use default orders for small images.
+  if (ac_strategy.xsize() < 5 && ac_strategy.ysize() < 5) return {ret, 0};
+  return {ret, ret_customize};
+}
+
+void ComputeCoeffOrder(SpeedTier speed, const ACImage& acs,
+                       const AcStrategyImage& ac_strategy,
+                       const FrameDimensions& frame_dim, uint32_t& used_orders,
+                       uint16_t used_acs, coeff_order_t* JXL_RESTRICT order) {
+  std::vector<int32_t> num_zeros(kCoeffOrderMaxSize);
+  // If compressing at high speed and only using 8x8 DCTs, only consider a
+  // subset of blocks.
+  double block_fraction = 1.0f;
+  // TODO(veluca): figure out why sampling blocks if non-8x8s are used makes
+  // encoding significantly less dense.
+  if (speed >= SpeedTier::kSquirrel && used_orders == 1) {
+    block_fraction = 0.5f;
+  }
+  // No need to compute number of zero coefficients if all orders are the
+  // default.
+  if (used_orders != 0) {
+    uint64_t threshold =
+        (std::numeric_limits<uint64_t>::max() >> 32) * block_fraction;
+    uint64_t s[2] = {static_cast<uint64_t>(0x94D049BB133111EBull),
+                     static_cast<uint64_t>(0xBF58476D1CE4E5B9ull)};
+    // Xorshift128+ adapted from xorshift128+-inl.h
+    auto use_sample = [&]() {
+      auto s1 = s[0];
+      const auto s0 = s[1];
+      const auto bits = s1 + s0;  // b, c
+      s[0] = s0;
+      s1 ^= s1 << 23;
+      s1 ^= s0 ^ (s1 >> 18) ^ (s0 >> 5);
+      s[1] = s1;
+      return (bits >> 32) <= threshold;
+    };
+
+    // Count number of zero coefficients, separately for each DCT band.
+    // TODO(veluca): precompute when doing DCT.
+    for (size_t group_index = 0; group_index < frame_dim.num_groups;
+         group_index++) {
+      const size_t gx = group_index % frame_dim.xsize_groups;
+      const size_t gy = group_index / frame_dim.xsize_groups;
+      const Rect rect(gx * kGroupDimInBlocks, gy * kGroupDimInBlocks,
+                      kGroupDimInBlocks, kGroupDimInBlocks,
+                      frame_dim.xsize_blocks, frame_dim.ysize_blocks);
+      ConstACPtr rows[3];
+      ACType type = acs.Type();
+      for (size_t c = 0; c < 3; c++) {
+        rows[c] = acs.PlaneRow(c, group_index, 0);
+      }
+      size_t ac_offset = 0;
+
+      // TODO(veluca): SIMDfy.
+      for (size_t by = 0; by < rect.ysize(); ++by) {
+        AcStrategyRow acs_row = ac_strategy.ConstRow(rect, by);
+        for (size_t bx = 0; bx < rect.xsize(); ++bx) {
+          AcStrategy acs = acs_row[bx];
+          if (!acs.IsFirstBlock()) continue;
+          if (!use_sample()) continue;
+          size_t size = kDCTBlockSize << acs.log2_covered_blocks();
+          for (size_t c = 0; c < 3; ++c) {
+            const size_t order_offset =
+                CoeffOrderOffset(kStrategyOrder[acs.RawStrategy()], c);
+            if (type == ACType::k16) {
+              for (size_t k = 0; k < size; k++) {
+                bool is_zero = rows[c].ptr16[ac_offset + k] == 0;
+                num_zeros[order_offset + k] += is_zero ? 1 : 0;
+              }
+            } else {
+              for (size_t k = 0; k < size; k++) {
+                bool is_zero = rows[c].ptr32[ac_offset + k] == 0;
+                num_zeros[order_offset + k] += is_zero ? 1 : 0;
+              }
+            }
+            // Ensure LLFs are first in the order.
+            size_t cx = acs.covered_blocks_x();
+            size_t cy = acs.covered_blocks_y();
+            CoefficientLayout(&cy, &cx);
+            for (size_t iy = 0; iy < cy; iy++) {
+              for (size_t ix = 0; ix < cx; ix++) {
+                num_zeros[order_offset + iy * kBlockDim * cx + ix] = -1;
+              }
+            }
+          }
+          ac_offset += size;
+        }
+      }
+    }
+  }
+  struct PosAndCount {
+    uint32_t pos;
+    uint32_t count;
+  };
+  auto mem = hwy::AllocateAligned<PosAndCount>(AcStrategy::kMaxCoeffArea);
+
+  std::vector<coeff_order_t> natural_order_buffer;
+
+  uint16_t computed = 0;
+  for (uint8_t o = 0; o < AcStrategy::kNumValidStrategies; ++o) {
+    uint8_t ord = kStrategyOrder[o];
+    if (computed & (1 << ord)) continue;
+    computed |= 1 << ord;
+    AcStrategy acs = AcStrategy::FromRawStrategy(o);
+    size_t sz = kDCTBlockSize * acs.covered_blocks_x() * acs.covered_blocks_y();
+
+    // Do nothing for transforms that don't appear.
+    if ((1 << ord) & ~used_acs) continue;
+
+    if (natural_order_buffer.size() < sz) natural_order_buffer.resize(sz);
+    acs.ComputeNaturalCoeffOrder(natural_order_buffer.data());
+
+    // Ensure natural coefficient order is not permuted if the order is
+    // not transmitted.
+    if ((1 << ord) & ~used_orders) {
+      for (size_t c = 0; c < 3; c++) {
+        size_t offset = CoeffOrderOffset(ord, c);
+        JXL_DASSERT(CoeffOrderOffset(ord, c + 1) - offset == sz);
+        memcpy(&order[offset], natural_order_buffer.data(),
+               sz * sizeof(*order));
+      }
+      continue;
+    }
+
+    bool is_nondefault = false;
+    for (uint8_t c = 0; c < 3; c++) {
+      // Apply zig-zag order.
+      PosAndCount* pos_and_val = mem.get();
+      size_t offset = CoeffOrderOffset(ord, c);
+      JXL_DASSERT(CoeffOrderOffset(ord, c + 1) - offset == sz);
+      float inv_sqrt_sz = 1.0f / std::sqrt(sz);
+      for (size_t i = 0; i < sz; ++i) {
+        size_t pos = natural_order_buffer[i];
+        pos_and_val[i].pos = pos;
+        // We don't care for the exact number -> quantize number of zeros,
+        // to get less permuted order.
+        pos_and_val[i].count = num_zeros[offset + pos] * inv_sqrt_sz + 0.1f;
+      }
+
+      // Stable-sort -> elements with same number of zeros will preserve their
+      // order.
+      auto comparator = [](const PosAndCount& a, const PosAndCount& b) -> bool {
+        return a.count < b.count;
+      };
+      std::stable_sort(pos_and_val, pos_and_val + sz, comparator);
+
+      // Grab indices.
+      for (size_t i = 0; i < sz; ++i) {
+        order[offset + i] = pos_and_val[i].pos;
+        is_nondefault |= natural_order_buffer[i] != pos_and_val[i].pos;
+      }
+    }
+    if (!is_nondefault) {
+      used_orders &= ~(1 << ord);
+    }
+  }
+}
+
+namespace {
+
+void TokenizePermutation(const coeff_order_t* JXL_RESTRICT order, size_t skip,
+                         size_t size, std::vector<Token>* tokens) {
+  std::vector<LehmerT> lehmer(size);
+  std::vector<uint32_t> temp(size + 1);
+  ComputeLehmerCode(order, temp.data(), size, lehmer.data());
+  size_t end = size;
+  while (end > skip && lehmer[end - 1] == 0) {
+    --end;
+  }
+  tokens->emplace_back(CoeffOrderContext(size), end - skip);
+  uint32_t last = 0;
+  for (size_t i = skip; i < end; ++i) {
+    tokens->emplace_back(CoeffOrderContext(last), lehmer[i]);
+    last = lehmer[i];
+  }
+}
+
+}  // namespace
+
+void EncodePermutation(const coeff_order_t* JXL_RESTRICT order, size_t skip,
+                       size_t size, BitWriter* writer, int layer,
+                       AuxOut* aux_out) {
+  std::vector<std::vector<Token>> tokens(1);
+  TokenizePermutation(order, skip, size, &tokens[0]);
+  std::vector<uint8_t> context_map;
+  EntropyEncodingData codes;
+  BuildAndEncodeHistograms(HistogramParams(), kPermutationContexts, tokens,
+                           &codes, &context_map, writer, layer, aux_out);
+  WriteTokens(tokens[0], codes, context_map, writer, layer, aux_out);
+}
+
+namespace {
+void EncodeCoeffOrder(const coeff_order_t* JXL_RESTRICT order, AcStrategy acs,
+                      std::vector<Token>* tokens, coeff_order_t* order_zigzag,
+                      std::vector<coeff_order_t>& natural_order_lut) {
+  const size_t llf = acs.covered_blocks_x() * acs.covered_blocks_y();
+  const size_t size = kDCTBlockSize * llf;
+  for (size_t i = 0; i < size; ++i) {
+    order_zigzag[i] = natural_order_lut[order[i]];
+  }
+  TokenizePermutation(order_zigzag, llf, size, tokens);
+}
+}  // namespace
+
+void EncodeCoeffOrders(uint16_t used_orders,
+                       const coeff_order_t* JXL_RESTRICT order,
+                       BitWriter* writer, size_t layer,
+                       AuxOut* JXL_RESTRICT aux_out) {
+  auto mem = hwy::AllocateAligned<coeff_order_t>(AcStrategy::kMaxCoeffArea);
+  uint16_t computed = 0;
+  std::vector<std::vector<Token>> tokens(1);
+  std::vector<coeff_order_t> natural_order_lut;
+  for (uint8_t o = 0; o < AcStrategy::kNumValidStrategies; ++o) {
+    uint8_t ord = kStrategyOrder[o];
+    if (computed & (1 << ord)) continue;
+    computed |= 1 << ord;
+    if ((used_orders & (1 << ord)) == 0) continue;
+    AcStrategy acs = AcStrategy::FromRawStrategy(o);
+    const size_t llf = acs.covered_blocks_x() * acs.covered_blocks_y();
+    const size_t size = kDCTBlockSize * llf;
+    if (natural_order_lut.size() < size) natural_order_lut.resize(size);
+    acs.ComputeNaturalCoeffOrderLut(natural_order_lut.data());
+    for (size_t c = 0; c < 3; c++) {
+      EncodeCoeffOrder(&order[CoeffOrderOffset(ord, c)], acs, &tokens[0],
+                       mem.get(), natural_order_lut);
+    }
+  }
+  // Do not write anything if no order is used.
+  if (used_orders != 0) {
+    std::vector<uint8_t> context_map;
+    EntropyEncodingData codes;
+    BuildAndEncodeHistograms(HistogramParams(), kPermutationContexts, tokens,
+                             &codes, &context_map, writer, layer, aux_out);
+    WriteTokens(tokens[0], codes, context_map, writer, layer, aux_out);
+  }
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/enc_coeff_order.h b/third_party/jpeg-xl/lib/jxl/enc_coeff_order.h
new file mode 100644
index 0000000000..3a43f4f986
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_coeff_order.h
@@ -0,0 +1,54 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_COEFF_ORDER_H_
+#define LIB_JXL_ENC_COEFF_ORDER_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "lib/jxl/ac_strategy.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/coeff_order.h"
+#include "lib/jxl/coeff_order_fwd.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/dct_util.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/enc_bit_writer.h"
+#include "lib/jxl/enc_params.h"
+
+namespace jxl {
+
+struct AuxOut;
+
+// Orders that are actually used in part of image. `rect` is in block units.
+// Returns {orders that are used, orders that might be made non-default}.
+std::pair<uint32_t, uint32_t> ComputeUsedOrders(
+    SpeedTier speed, const AcStrategyImage& ac_strategy, const Rect& rect);
+
+// Modify zig-zag order, so that DCT bands with more zeros go later.
+// Order of DCT bands with same number of zeros is untouched, so
+// permutation will be cheaper to encode.
+void ComputeCoeffOrder(SpeedTier speed, const ACImage& acs,
+                       const AcStrategyImage& ac_strategy,
+                       const FrameDimensions& frame_dim, uint32_t& used_orders,
+                       uint16_t used_acs, coeff_order_t* JXL_RESTRICT order);
+
+void EncodeCoeffOrders(uint16_t used_orders,
+                       const coeff_order_t* JXL_RESTRICT order,
+                       BitWriter* writer, size_t layer,
+                       AuxOut* JXL_RESTRICT aux_out);
+
+// Encoding/decoding of a single permutation. `size`: number of elements in the
+// permutation. `skip`: number of elements to skip from the *beginning* of the
+// permutation.
+void EncodePermutation(const coeff_order_t* JXL_RESTRICT order, size_t skip,
+                       size_t size, BitWriter* writer, int layer,
+                       AuxOut* aux_out);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_COEFF_ORDER_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_color_management.cc b/third_party/jpeg-xl/lib/jxl/enc_color_management.cc
new file mode 100644
index 0000000000..8a23ead473
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_color_management.cc
@@ -0,0 +1,1293 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_color_management.h"
+
+#ifndef JPEGXL_ENABLE_SKCMS
+#define JPEGXL_ENABLE_SKCMS 0
+#endif
+
+#include <math.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <algorithm>
+#include <array>
+#include <atomic>
+#include <memory>
+#include <string>
+#include <utility>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/enc_color_management.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/field_encodings.h"
+#include "lib/jxl/matrix_ops.h"
+#include "lib/jxl/transfer_functions-inl.h"
+#if JPEGXL_ENABLE_SKCMS
+#include "lib/jxl/enc_jxl_skcms.h"
+#else  // JPEGXL_ENABLE_SKCMS
+#include "lcms2.h"
+#include "lcms2_plugin.h"
+#endif  // JPEGXL_ENABLE_SKCMS
+
+#define JXL_CMS_VERBOSE 0
+
+// Define these only once. We can't use HWY_ONCE here because it is defined as
+// 1 only on the last pass.
+#ifndef LIB_JXL_ENC_COLOR_MANAGEMENT_CC_
+#define LIB_JXL_ENC_COLOR_MANAGEMENT_CC_
+
+namespace jxl {
+namespace {
+struct JxlCms {
+#if JPEGXL_ENABLE_SKCMS
+  PaddedBytes icc_src, icc_dst;
+  skcms_ICCProfile profile_src, profile_dst;
+#else
+  void* lcms_transform;
+#endif
+
+  // These fields are used when the HLG OOTF or inverse OOTF must be applied.
+  bool apply_hlg_ootf;
+  size_t hlg_ootf_num_channels;
+  // Y component of the primaries.
+  std::array<float, 3> hlg_ootf_luminances;
+
+  size_t channels_src;
+  size_t channels_dst;
+  ImageF buf_src;
+  ImageF buf_dst;
+  float intensity_target;
+  bool skip_lcms = false;
+  ExtraTF preprocess = ExtraTF::kNone;
+  ExtraTF postprocess = ExtraTF::kNone;
+};
+
+Status ApplyHlgOotf(JxlCms* t, float* JXL_RESTRICT buf, size_t xsize,
+                    bool forward);
+}  // namespace
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_COLOR_MANAGEMENT_CC_
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+#if JXL_CMS_VERBOSE >= 2
+const size_t kX = 0;  // pixel index, multiplied by 3 for RGB
+#endif
+
+// xform_src = UndoGammaCompression(buf_src).
+Status BeforeTransform(JxlCms* t, const float* buf_src, float* xform_src,
+                       size_t buf_size) {
+  switch (t->preprocess) {
+    case ExtraTF::kNone:
+      JXL_DASSERT(false);  // unreachable
+      break;
+
+    case ExtraTF::kPQ: {
+      // By default, PQ content has an intensity target of 10000, stored
+      // exactly.
+      HWY_FULL(float) df;
+      const auto multiplier = Set(df, t->intensity_target == 10000.f
+                                          ? 1.0f
+                                          : 10000.f / t->intensity_target);
+      for (size_t i = 0; i < buf_size; i += Lanes(df)) {
+        const auto val = Load(df, buf_src + i);
+        const auto result =
+            Mul(multiplier, TF_PQ().DisplayFromEncoded(df, val));
+        Store(result, df, xform_src + i);
+      }
+#if JXL_CMS_VERBOSE >= 2
+      printf("pre in %.4f %.4f %.4f undoPQ %.4f %.4f %.4f\n", buf_src[3 * kX],
+             buf_src[3 * kX + 1], buf_src[3 * kX + 2], xform_src[3 * kX],
+             xform_src[3 * kX + 1], xform_src[3 * kX + 2]);
+#endif
+      break;
+    }
+
+    case ExtraTF::kHLG:
+      for (size_t i = 0; i < buf_size; ++i) {
+        xform_src[i] = static_cast<float>(
+            TF_HLG().DisplayFromEncoded(static_cast<double>(buf_src[i])));
+      }
+      if (t->apply_hlg_ootf) {
+        JXL_RETURN_IF_ERROR(
+            ApplyHlgOotf(t, xform_src, buf_size, /*forward=*/true));
+      }
+#if JXL_CMS_VERBOSE >= 2
+      printf("pre in %.4f %.4f %.4f undoHLG %.4f %.4f %.4f\n", buf_src[3 * kX],
+             buf_src[3 * kX + 1], buf_src[3 * kX + 2], xform_src[3 * kX],
+             xform_src[3 * kX + 1], xform_src[3 * kX + 2]);
+#endif
+      break;
+
+    case ExtraTF::kSRGB:
+      HWY_FULL(float) df;
+      for (size_t i = 0; i < buf_size; i += Lanes(df)) {
+        const auto val = Load(df, buf_src + i);
+        const auto result = TF_SRGB().DisplayFromEncoded(val);
+        Store(result, df, xform_src + i);
+      }
+#if JXL_CMS_VERBOSE >= 2
+      printf("pre in %.4f %.4f %.4f undoSRGB %.4f %.4f %.4f\n", buf_src[3 * kX],
+             buf_src[3 * kX + 1], buf_src[3 * kX + 2], xform_src[3 * kX],
+             xform_src[3 * kX + 1], xform_src[3 * kX + 2]);
+#endif
+      break;
+  }
+  return true;
+}
+
+// Applies gamma compression in-place.
+Status AfterTransform(JxlCms* t, float* JXL_RESTRICT buf_dst, size_t buf_size) {
+  switch (t->postprocess) {
+    case ExtraTF::kNone:
+      JXL_DASSERT(false);  // unreachable
+      break;
+    case ExtraTF::kPQ: {
+      HWY_FULL(float) df;
+      const auto multiplier =
+          Set(df, t->intensity_target == 10000.f ? 1.0f
+                                                 : t->intensity_target * 1e-4f);
+      for (size_t i = 0; i < buf_size; i += Lanes(df)) {
+        const auto val = Load(df, buf_dst + i);
+        const auto result =
+            TF_PQ().EncodedFromDisplay(df, Mul(multiplier, val));
+        Store(result, df, buf_dst + i);
+      }
+#if JXL_CMS_VERBOSE >= 2
+      printf("after PQ enc %.4f %.4f %.4f\n", buf_dst[3 * kX],
+             buf_dst[3 * kX + 1], buf_dst[3 * kX + 2]);
+#endif
+      break;
+    }
+    case ExtraTF::kHLG:
+      if (t->apply_hlg_ootf) {
+        JXL_RETURN_IF_ERROR(
+            ApplyHlgOotf(t, buf_dst, buf_size, /*forward=*/false));
+      }
+      for (size_t i = 0; i < buf_size; ++i) {
+        buf_dst[i] = static_cast<float>(
+            TF_HLG().EncodedFromDisplay(static_cast<double>(buf_dst[i])));
+      }
+#if JXL_CMS_VERBOSE >= 2
+      printf("after HLG enc %.4f %.4f %.4f\n", buf_dst[3 * kX],
+             buf_dst[3 * kX + 1], buf_dst[3 * kX + 2]);
+#endif
+      break;
+    case ExtraTF::kSRGB:
+      HWY_FULL(float) df;
+      for (size_t i = 0; i < buf_size; i += Lanes(df)) {
+        const auto val = Load(df, buf_dst + i);
+        const auto result =
+            TF_SRGB().EncodedFromDisplay(HWY_FULL(float)(), val);
+        Store(result, df, buf_dst + i);
+      }
+#if JXL_CMS_VERBOSE >= 2
+      printf("after SRGB enc %.4f %.4f %.4f\n", buf_dst[3 * kX],
+             buf_dst[3 * kX + 1], buf_dst[3 * kX + 2]);
+#endif
+      break;
+  }
+  return true;
+}
+
+Status DoColorSpaceTransform(void* cms_data, const size_t thread,
+                             const float* buf_src, float* buf_dst,
+                             size_t xsize) {
+  // No lock needed.
+  JxlCms* t = reinterpret_cast<JxlCms*>(cms_data);
+
+  const float* xform_src = buf_src;  // Read-only.
+  if (t->preprocess != ExtraTF::kNone) {
+    float* mutable_xform_src = t->buf_src.Row(thread);  // Writable buffer.
+    JXL_RETURN_IF_ERROR(BeforeTransform(t, buf_src, mutable_xform_src,
+                                        xsize * t->channels_src));
+    xform_src = mutable_xform_src;
+  }
+
+#if JPEGXL_ENABLE_SKCMS
+  if (t->channels_src == 1 && !t->skip_lcms) {
+    // Expand from 1 to 3 channels, starting from the end in case
+    // xform_src == t->buf_src.Row(thread).
+    float* mutable_xform_src = t->buf_src.Row(thread);
+    for (size_t i = 0; i < xsize; ++i) {
+      const size_t x = xsize - i - 1;
+      mutable_xform_src[x * 3] = mutable_xform_src[x * 3 + 1] =
+          mutable_xform_src[x * 3 + 2] = xform_src[x];
+    }
+    xform_src = mutable_xform_src;
+  }
+#else
+  if (t->channels_src == 4 && !t->skip_lcms) {
+    // LCMS does CMYK in a weird way: 0 = white, 100 = max ink
+    float* mutable_xform_src = t->buf_src.Row(thread);
+    for (size_t x = 0; x < xsize * 4; ++x) {
+      mutable_xform_src[x] = 100.f - 100.f * mutable_xform_src[x];
+    }
+    xform_src = mutable_xform_src;
+  }
+#endif
+
+#if JXL_CMS_VERBOSE >= 2
+  // Save inputs for printing before in-place transforms overwrite them.
+  const float in0 = xform_src[3 * kX + 0];
+  const float in1 = xform_src[3 * kX + 1];
+  const float in2 = xform_src[3 * kX + 2];
+#endif
+
+  if (t->skip_lcms) {
+    if (buf_dst != xform_src) {
+      memcpy(buf_dst, xform_src, xsize * t->channels_src * sizeof(*buf_dst));
+    }  // else: in-place, no need to copy
+  } else {
+#if JPEGXL_ENABLE_SKCMS
+    JXL_CHECK(
+        skcms_Transform(xform_src,
+                        (t->channels_src == 4 ? skcms_PixelFormat_RGBA_ffff
+                                              : skcms_PixelFormat_RGB_fff),
+                        skcms_AlphaFormat_Opaque, &t->profile_src, buf_dst,
+                        skcms_PixelFormat_RGB_fff, skcms_AlphaFormat_Opaque,
+                        &t->profile_dst, xsize));
+#else   // JPEGXL_ENABLE_SKCMS
+    cmsDoTransform(t->lcms_transform, xform_src, buf_dst,
+                   static_cast<cmsUInt32Number>(xsize));
+#endif  // JPEGXL_ENABLE_SKCMS
+  }
+#if JXL_CMS_VERBOSE >= 2
+  printf("xform skip%d: %.4f %.4f %.4f (%p) -> (%p) %.4f %.4f %.4f\n",
+         t->skip_lcms, in0, in1, in2, xform_src, buf_dst, buf_dst[3 * kX],
+         buf_dst[3 * kX + 1], buf_dst[3 * kX + 2]);
+#endif
+
+#if JPEGXL_ENABLE_SKCMS
+  if (t->channels_dst == 1 && !t->skip_lcms) {
+    // Contract back from 3 to 1 channel, this time forward.
+    float* grayscale_buf_dst = t->buf_dst.Row(thread);
+    for (size_t x = 0; x < xsize; ++x) {
+      grayscale_buf_dst[x] = buf_dst[x * 3];
+    }
+    buf_dst = grayscale_buf_dst;
+  }
+#endif
+
+  if (t->postprocess != ExtraTF::kNone) {
+    JXL_RETURN_IF_ERROR(AfterTransform(t, buf_dst, xsize * t->channels_dst));
+  }
+  return true;
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+namespace {
+
+HWY_EXPORT(DoColorSpaceTransform);
+int DoColorSpaceTransform(void* t, size_t thread, const float* buf_src,
+                          float* buf_dst, size_t xsize) {
+  return HWY_DYNAMIC_DISPATCH(DoColorSpaceTransform)(t, thread, buf_src,
+                                                     buf_dst, xsize);
+}
+
+// Define to 1 on OS X as a workaround for older LCMS lacking MD5.
+#define JXL_CMS_OLD_VERSION 0
+
+#if JPEGXL_ENABLE_SKCMS
+
+JXL_MUST_USE_RESULT CIExy CIExyFromXYZ(const float XYZ[3]) {
+  const float factor = 1.f / (XYZ[0] + XYZ[1] + XYZ[2]);
+  CIExy xy;
+  xy.x = XYZ[0] * factor;
+  xy.y = XYZ[1] * factor;
+  return xy;
+}
+
+#else  // JPEGXL_ENABLE_SKCMS
+// (LCMS interface requires xyY but we omit the Y for white points/primaries.)
+
+JXL_MUST_USE_RESULT CIExy CIExyFromxyY(const cmsCIExyY& xyY) {
+  CIExy xy;
+  xy.x = xyY.x;
+  xy.y = xyY.y;
+  return xy;
+}
+
+JXL_MUST_USE_RESULT CIExy CIExyFromXYZ(const cmsCIEXYZ& XYZ) {
+  cmsCIExyY xyY;
+  cmsXYZ2xyY(/*Dest=*/&xyY, /*Source=*/&XYZ);
+  return CIExyFromxyY(xyY);
+}
+
+JXL_MUST_USE_RESULT cmsCIEXYZ D50_XYZ() {
+  // Quantized D50 as stored in ICC profiles.
+  return {0.96420288, 1.0, 0.82490540};
+}
+
+// RAII
+
+struct ProfileDeleter {
+  void operator()(void* p) { cmsCloseProfile(p); }
+};
+using Profile = std::unique_ptr<void, ProfileDeleter>;
+
+struct TransformDeleter {
+  void operator()(void* p) { cmsDeleteTransform(p); }
+};
+using Transform = std::unique_ptr<void, TransformDeleter>;
+
+struct CurveDeleter {
+  void operator()(cmsToneCurve* p) { cmsFreeToneCurve(p); }
+};
+using Curve = std::unique_ptr<cmsToneCurve, CurveDeleter>;
+
+Status CreateProfileXYZ(const cmsContext context,
+                        Profile* JXL_RESTRICT profile) {
+  profile->reset(cmsCreateXYZProfileTHR(context));
+  if (profile->get() == nullptr) return JXL_FAILURE("Failed to create XYZ");
+  return true;
+}
+
+#endif  // !JPEGXL_ENABLE_SKCMS
+
+#if JPEGXL_ENABLE_SKCMS
+// IMPORTANT: icc must outlive profile.
+Status DecodeProfile(const uint8_t* icc, size_t size,
+                     skcms_ICCProfile* const profile) {
+  if (!skcms_Parse(icc, size, profile)) {
+    return JXL_FAILURE("Failed to parse ICC profile with %" PRIuS " bytes",
+                       size);
+  }
+  return true;
+}
+#else  // JPEGXL_ENABLE_SKCMS
+Status DecodeProfile(const cmsContext context, const PaddedBytes& icc,
+                     Profile* profile) {
+  profile->reset(cmsOpenProfileFromMemTHR(context, icc.data(), icc.size()));
+  if (profile->get() == nullptr) {
+    return JXL_FAILURE("Failed to decode profile");
+  }
+
+  // WARNING: due to the LCMS MD5 issue mentioned above, many existing
+  // profiles have incorrect MD5, so do not even bother checking them nor
+  // generating warning clutter.
+
+  return true;
+}
+#endif  // JPEGXL_ENABLE_SKCMS
+
+#if JPEGXL_ENABLE_SKCMS
+
+ColorSpace ColorSpaceFromProfile(const skcms_ICCProfile& profile) {
+  switch (profile.data_color_space) {
+    case skcms_Signature_RGB:
+    case skcms_Signature_CMYK:
+      // spec says CMYK is encoded as RGB (the kBlack extra channel signals that
+      // it is actually CMYK)
+      return ColorSpace::kRGB;
+    case skcms_Signature_Gray:
+      return ColorSpace::kGray;
+    default:
+      return ColorSpace::kUnknown;
+  }
+}
+
+// vector_out := matmul(matrix, vector_in)
+void MatrixProduct(const skcms_Matrix3x3& matrix, const float vector_in[3],
+                   float vector_out[3]) {
+  for (int i = 0; i < 3; ++i) {
+    vector_out[i] = 0;
+    for (int j = 0; j < 3; ++j) {
+      vector_out[i] += matrix.vals[i][j] * vector_in[j];
+    }
+  }
+}
+
+// Returns white point that was specified when creating the profile.
+JXL_MUST_USE_RESULT Status UnadaptedWhitePoint(const skcms_ICCProfile& profile,
+                                               CIExy* out) {
+  float media_white_point_XYZ[3];
+  if (!skcms_GetWTPT(&profile, media_white_point_XYZ)) {
+    return JXL_FAILURE("ICC profile does not contain WhitePoint tag");
+  }
+  skcms_Matrix3x3 CHAD;
+  if (!skcms_GetCHAD(&profile, &CHAD)) {
+    // If there is no chromatic adaptation matrix, it means that the white point
+    // is already unadapted.
+    *out = CIExyFromXYZ(media_white_point_XYZ);
+    return true;
+  }
+  // Otherwise, it has been adapted to the PCS white point using said matrix,
+  // and the adaptation needs to be undone.
+  skcms_Matrix3x3 inverse_CHAD;
+  if (!skcms_Matrix3x3_invert(&CHAD, &inverse_CHAD)) {
+    return JXL_FAILURE("Non-invertible ChromaticAdaptation matrix");
+  }
+  float unadapted_white_point_XYZ[3];
+  MatrixProduct(inverse_CHAD, media_white_point_XYZ, unadapted_white_point_XYZ);
+  *out = CIExyFromXYZ(unadapted_white_point_XYZ);
+  return true;
+}
+
+Status IdentifyPrimaries(const skcms_ICCProfile& profile,
+                         const CIExy& wp_unadapted, ColorEncoding* c) {
+  if (!c->HasPrimaries()) return true;
+
+  skcms_Matrix3x3 CHAD, inverse_CHAD;
+  if (skcms_GetCHAD(&profile, &CHAD)) {
+    JXL_RETURN_IF_ERROR(skcms_Matrix3x3_invert(&CHAD, &inverse_CHAD));
+  } else {
+    static constexpr skcms_Matrix3x3 kLMSFromXYZ = {
+        {{0.8951, 0.2664, -0.1614},
+         {-0.7502, 1.7135, 0.0367},
+         {0.0389, -0.0685, 1.0296}}};
+    static constexpr skcms_Matrix3x3 kXYZFromLMS = {
+        {{0.9869929, -0.1470543, 0.1599627},
+         {0.4323053, 0.5183603, 0.0492912},
+         {-0.0085287, 0.0400428, 0.9684867}}};
+    static constexpr float kWpD50XYZ[3] = {0.96420288, 1.0, 0.82490540};
+    float wp_unadapted_XYZ[3];
+    JXL_RETURN_IF_ERROR(CIEXYZFromWhiteCIExy(wp_unadapted, wp_unadapted_XYZ));
+    float wp_D50_LMS[3], wp_unadapted_LMS[3];
+    MatrixProduct(kLMSFromXYZ, kWpD50XYZ, wp_D50_LMS);
+    MatrixProduct(kLMSFromXYZ, wp_unadapted_XYZ, wp_unadapted_LMS);
+    inverse_CHAD = {{{wp_unadapted_LMS[0] / wp_D50_LMS[0], 0, 0},
+                     {0, wp_unadapted_LMS[1] / wp_D50_LMS[1], 0},
+                     {0, 0, wp_unadapted_LMS[2] / wp_D50_LMS[2]}}};
+    inverse_CHAD = skcms_Matrix3x3_concat(&kXYZFromLMS, &inverse_CHAD);
+    inverse_CHAD = skcms_Matrix3x3_concat(&inverse_CHAD, &kLMSFromXYZ);
+  }
+
+  float XYZ[3];
+  PrimariesCIExy primaries;
+  CIExy* const chromaticities[] = {&primaries.r, &primaries.g, &primaries.b};
+  for (int i = 0; i < 3; ++i) {
+    float RGB[3] = {};
+    RGB[i] = 1;
+    skcms_Transform(RGB, skcms_PixelFormat_RGB_fff, skcms_AlphaFormat_Opaque,
+                    &profile, XYZ, skcms_PixelFormat_RGB_fff,
+                    skcms_AlphaFormat_Opaque, skcms_XYZD50_profile(), 1);
+    float unadapted_XYZ[3];
+    MatrixProduct(inverse_CHAD, XYZ, unadapted_XYZ);
+    *chromaticities[i] = CIExyFromXYZ(unadapted_XYZ);
+  }
+  return c->SetPrimaries(primaries);
+}
+
+void DetectTransferFunction(const skcms_ICCProfile& profile,
+                            ColorEncoding* JXL_RESTRICT c) {
+  if (c->tf.SetImplicit()) return;
+
+  float gamma[3] = {};
+  if (profile.has_trc) {
+    const auto IsGamma = [](const skcms_TransferFunction& tf) {
+      return tf.a == 1 && tf.b == 0 &&
+             /* if b and d are zero, it is fine for c not to be */ tf.d == 0 &&
+             tf.e == 0 && tf.f == 0;
+    };
+    for (int i = 0; i < 3; ++i) {
+      if (profile.trc[i].table_entries == 0 &&
+          IsGamma(profile.trc->parametric)) {
+        gamma[i] = 1.f / profile.trc->parametric.g;
+      } else {
+        skcms_TransferFunction approximate_tf;
+        float max_error;
+        if (skcms_ApproximateCurve(&profile.trc[i], &approximate_tf,
+                                   &max_error)) {
+          if (IsGamma(approximate_tf)) {
+            gamma[i] = 1.f / approximate_tf.g;
+          }
+        }
+      }
+    }
+  }
+  if (gamma[0] != 0 && std::abs(gamma[0] - gamma[1]) < 1e-4f &&
+      std::abs(gamma[1] - gamma[2]) < 1e-4f) {
+    if (c->tf.SetGamma(gamma[0])) {
+      skcms_ICCProfile profile_test;
+      PaddedBytes bytes;
+      if (MaybeCreateProfile(*c, &bytes) &&
+          DecodeProfile(bytes.data(), bytes.size(), &profile_test) &&
+          skcms_ApproximatelyEqualProfiles(&profile, &profile_test)) {
+        return;
+      }
+    }
+  }
+
+  for (TransferFunction tf : Values<TransferFunction>()) {
+    // Can only create profile from known transfer function.
+    if (tf == TransferFunction::kUnknown) continue;
+
+    c->tf.SetTransferFunction(tf);
+
+    skcms_ICCProfile profile_test;
+    PaddedBytes bytes;
+    if (MaybeCreateProfile(*c, &bytes) &&
+        DecodeProfile(bytes.data(), bytes.size(), &profile_test) &&
+        skcms_ApproximatelyEqualProfiles(&profile, &profile_test)) {
+      return;
+    }
+  }
+
+  c->tf.SetTransferFunction(TransferFunction::kUnknown);
+}
+
+#else  // JPEGXL_ENABLE_SKCMS
+
+uint32_t Type32(const ColorEncoding& c, bool cmyk) {
+  if (cmyk) return TYPE_CMYK_FLT;
+  if (c.IsGray()) return TYPE_GRAY_FLT;
+  return TYPE_RGB_FLT;
+}
+
+uint32_t Type64(const ColorEncoding& c) {
+  if (c.IsGray()) return TYPE_GRAY_DBL;
+  return TYPE_RGB_DBL;
+}
+
+ColorSpace ColorSpaceFromProfile(const Profile& profile) {
+  switch (cmsGetColorSpace(profile.get())) {
+    case cmsSigRgbData:
+    case cmsSigCmykData:
+      return ColorSpace::kRGB;
+    case cmsSigGrayData:
+      return ColorSpace::kGray;
+    default:
+      return ColorSpace::kUnknown;
+  }
+}
+
+// "profile1" is pre-decoded to save time in DetectTransferFunction.
+Status ProfileEquivalentToICC(const cmsContext context, const Profile& profile1,
+                              const PaddedBytes& icc, const ColorEncoding& c) {
+  const uint32_t type_src = Type64(c);
+
+  Profile profile2;
+  JXL_RETURN_IF_ERROR(DecodeProfile(context, icc, &profile2));
+
+  Profile profile_xyz;
+  JXL_RETURN_IF_ERROR(CreateProfileXYZ(context, &profile_xyz));
+
+  const uint32_t intent = INTENT_RELATIVE_COLORIMETRIC;
+  const uint32_t flags = cmsFLAGS_NOOPTIMIZE | cmsFLAGS_BLACKPOINTCOMPENSATION |
+                         cmsFLAGS_HIGHRESPRECALC;
+  Transform xform1(cmsCreateTransformTHR(context, profile1.get(), type_src,
+                                         profile_xyz.get(), TYPE_XYZ_DBL,
+                                         intent, flags));
+  Transform xform2(cmsCreateTransformTHR(context, profile2.get(), type_src,
+                                         profile_xyz.get(), TYPE_XYZ_DBL,
+                                         intent, flags));
+  if (xform1 == nullptr || xform2 == nullptr) {
+    return JXL_FAILURE("Failed to create transform");
+  }
+
+  double in[3];
+  double out1[3];
+  double out2[3];
+
+  // Uniformly spaced samples from very dark to almost fully bright.
+  const double init = 1E-3;
+  const double step = 0.2;
+
+  if (c.IsGray()) {
+    // Finer sampling and replicate each component.
+    for (in[0] = init; in[0] < 1.0; in[0] += step / 8) {
+      cmsDoTransform(xform1.get(), in, out1, 1);
+      cmsDoTransform(xform2.get(), in, out2, 1);
+      if (!ApproxEq(out1[0], out2[0], 2E-4)) {
+        return false;
+      }
+    }
+  } else {
+    for (in[0] = init; in[0] < 1.0; in[0] += step) {
+      for (in[1] = init; in[1] < 1.0; in[1] += step) {
+        for (in[2] = init; in[2] < 1.0; in[2] += step) {
+          cmsDoTransform(xform1.get(), in, out1, 1);
+          cmsDoTransform(xform2.get(), in, out2, 1);
+          for (size_t i = 0; i < 3; ++i) {
+            if (!ApproxEq(out1[i], out2[i], 2E-4)) {
+              return false;
+            }
+          }
+        }
+      }
+    }
+  }
+
+  return true;
+}
+
+// Returns white point that was specified when creating the profile.
+// NOTE: we can't just use cmsSigMediaWhitePointTag because its interpretation
+// differs between ICC versions.
+JXL_MUST_USE_RESULT cmsCIEXYZ UnadaptedWhitePoint(const cmsContext context,
+                                                  const Profile& profile,
+                                                  const ColorEncoding& c) {
+  const cmsCIEXYZ* white_point = static_cast<const cmsCIEXYZ*>(
+      cmsReadTag(profile.get(), cmsSigMediaWhitePointTag));
+  if (white_point != nullptr &&
+      cmsReadTag(profile.get(), cmsSigChromaticAdaptationTag) == nullptr) {
+    // No chromatic adaptation matrix: the white point is already unadapted.
+    return *white_point;
+  }
+
+  cmsCIEXYZ XYZ = {1.0, 1.0, 1.0};
+  Profile profile_xyz;
+  if (!CreateProfileXYZ(context, &profile_xyz)) return XYZ;
+  // Array arguments are one per profile.
+  cmsHPROFILE profiles[2] = {profile.get(), profile_xyz.get()};
+  // Leave white point unchanged - that is what we're trying to extract.
+  cmsUInt32Number intents[2] = {INTENT_ABSOLUTE_COLORIMETRIC,
+                                INTENT_ABSOLUTE_COLORIMETRIC};
+  cmsBool black_compensation[2] = {0, 0};
+  cmsFloat64Number adaption[2] = {0.0, 0.0};
+  // Only transforming a single pixel, so skip expensive optimizations.
+  cmsUInt32Number flags = cmsFLAGS_NOOPTIMIZE | cmsFLAGS_HIGHRESPRECALC;
+  Transform xform(cmsCreateExtendedTransform(
+      context, 2, profiles, black_compensation, intents, adaption, nullptr, 0,
+      Type64(c), TYPE_XYZ_DBL, flags));
+  if (!xform) return XYZ;  // TODO(lode): return error
+
+  // xy are relative, so magnitude does not matter if we ignore output Y.
+  const cmsFloat64Number in[3] = {1.0, 1.0, 1.0};
+  cmsDoTransform(xform.get(), in, &XYZ.X, 1);
+  return XYZ;
+}
+
+Status IdentifyPrimaries(const cmsContext context, const Profile& profile,
+                         const cmsCIEXYZ& wp_unadapted, ColorEncoding* c) {
+  if (!c->HasPrimaries()) return true;
+  if (ColorSpaceFromProfile(profile) == ColorSpace::kUnknown) return true;
+
+  // These were adapted to the profile illuminant before storing in the profile.
+  const cmsCIEXYZ* adapted_r = static_cast<const cmsCIEXYZ*>(
+      cmsReadTag(profile.get(), cmsSigRedColorantTag));
+  const cmsCIEXYZ* adapted_g = static_cast<const cmsCIEXYZ*>(
+      cmsReadTag(profile.get(), cmsSigGreenColorantTag));
+  const cmsCIEXYZ* adapted_b = static_cast<const cmsCIEXYZ*>(
+      cmsReadTag(profile.get(), cmsSigBlueColorantTag));
+
+  cmsCIEXYZ converted_rgb[3];
+  if (adapted_r == nullptr || adapted_g == nullptr || adapted_b == nullptr) {
+    // No colorant tag, determine the XYZ coordinates of the primaries by
+    // converting from the colorspace.
+    Profile profile_xyz;
+    if (!CreateProfileXYZ(context, &profile_xyz)) {
+      return JXL_FAILURE("Failed to retrieve colorants");
+    }
+    // Array arguments are one per profile.
+    cmsHPROFILE profiles[2] = {profile.get(), profile_xyz.get()};
+    cmsUInt32Number intents[2] = {INTENT_RELATIVE_COLORIMETRIC,
+                                  INTENT_RELATIVE_COLORIMETRIC};
+    cmsBool black_compensation[2] = {0, 0};
+    cmsFloat64Number adaption[2] = {0.0, 0.0};
+    // Only transforming three pixels, so skip expensive optimizations.
+    cmsUInt32Number flags = cmsFLAGS_NOOPTIMIZE | cmsFLAGS_HIGHRESPRECALC;
+    Transform xform(cmsCreateExtendedTransform(
+        context, 2, profiles, black_compensation, intents, adaption, nullptr, 0,
+        Type64(*c), TYPE_XYZ_DBL, flags));
+    if (!xform) return JXL_FAILURE("Failed to retrieve colorants");
+
+    const cmsFloat64Number in[9] = {1.0, 0.0, 0.0, 0.0, 1.0,
+                                    0.0, 0.0, 0.0, 1.0};
+    cmsDoTransform(xform.get(), in, &converted_rgb->X, 3);
+    adapted_r = &converted_rgb[0];
+    adapted_g = &converted_rgb[1];
+    adapted_b = &converted_rgb[2];
+  }
+
+  // TODO(janwas): no longer assume Bradford and D50.
+  // Undo the chromatic adaptation.
+  const cmsCIEXYZ d50 = D50_XYZ();
+
+  cmsCIEXYZ r, g, b;
+  cmsAdaptToIlluminant(&r, &d50, &wp_unadapted, adapted_r);
+  cmsAdaptToIlluminant(&g, &d50, &wp_unadapted, adapted_g);
+  cmsAdaptToIlluminant(&b, &d50, &wp_unadapted, adapted_b);
+
+  const PrimariesCIExy rgb = {CIExyFromXYZ(r), CIExyFromXYZ(g),
+                              CIExyFromXYZ(b)};
+  return c->SetPrimaries(rgb);
+}
+
+void DetectTransferFunction(const cmsContext context, const Profile& profile,
+                            ColorEncoding* JXL_RESTRICT c) {
+  if (c->tf.SetImplicit()) return;
+
+  float gamma = 0;
+  if (const auto* gray_trc = reinterpret_cast<const cmsToneCurve*>(
+          cmsReadTag(profile.get(), cmsSigGrayTRCTag))) {
+    const double estimated_gamma =
+        cmsEstimateGamma(gray_trc, /*precision=*/1e-4);
+    if (estimated_gamma > 0) {
+      gamma = 1. / estimated_gamma;
+    }
+  } else {
+    float rgb_gamma[3] = {};
+    int i = 0;
+    for (const auto tag :
+         {cmsSigRedTRCTag, cmsSigGreenTRCTag, cmsSigBlueTRCTag}) {
+      if (const auto* trc = reinterpret_cast<const cmsToneCurve*>(
+              cmsReadTag(profile.get(), tag))) {
+        const double estimated_gamma =
+            cmsEstimateGamma(trc, /*precision=*/1e-4);
+        if (estimated_gamma > 0) {
+          rgb_gamma[i] = 1. / estimated_gamma;
+        }
+      }
+      ++i;
+    }
+    if (rgb_gamma[0] != 0 && std::abs(rgb_gamma[0] - rgb_gamma[1]) < 1e-4f &&
+        std::abs(rgb_gamma[1] - rgb_gamma[2]) < 1e-4f) {
+      gamma = rgb_gamma[0];
+    }
+  }
+
+  if (gamma != 0 && c->tf.SetGamma(gamma)) {
+    PaddedBytes icc_test;
+    if (MaybeCreateProfile(*c, &icc_test) &&
+        ProfileEquivalentToICC(context, profile, icc_test, *c)) {
+      return;
+    }
+  }
+
+  for (TransferFunction tf : Values<TransferFunction>()) {
+    // Can only create profile from known transfer function.
+    if (tf == TransferFunction::kUnknown) continue;
+
+    c->tf.SetTransferFunction(tf);
+
+    PaddedBytes icc_test;
+    if (MaybeCreateProfile(*c, &icc_test) &&
+        ProfileEquivalentToICC(context, profile, icc_test, *c)) {
+      return;
+    }
+  }
+
+  c->tf.SetTransferFunction(TransferFunction::kUnknown);
+}
+
+void ErrorHandler(cmsContext context, cmsUInt32Number code, const char* text) {
+  JXL_WARNING("LCMS error %u: %s", code, text);
+}
+
+// Returns a context for the current thread, creating it if necessary.
+cmsContext GetContext() {
+  static thread_local void* context_;
+  if (context_ == nullptr) {
+    context_ = cmsCreateContext(nullptr, nullptr);
+    JXL_ASSERT(context_ != nullptr);
+
+    cmsSetLogErrorHandlerTHR(static_cast<cmsContext>(context_), &ErrorHandler);
+  }
+  return static_cast<cmsContext>(context_);
+}
+
+#endif  // JPEGXL_ENABLE_SKCMS
+
+Status GetPrimariesLuminances(const ColorEncoding& encoding,
+                              float luminances[3]) {
+  // Explanation:
+  // We know that the three primaries must sum to white:
+  //
+  // [Xr, Xg, Xb;     [1;     [Xw;
+  //  Yr, Yg, Yb;  ×   1;  =   Yw;
+  //  Zr, Zg, Zb]      1]      Zw]
+  //
+  // By noting that X = x·(X+Y+Z), Y = y·(X+Y+Z) and Z = z·(X+Y+Z) (note the
+  // lower case indicating chromaticity), and factoring the totals (X+Y+Z) out
+  // of the left matrix and into the all-ones vector, we get:
+  //
+  // [xr, xg, xb;     [Xr + Yr + Zr;     [Xw;
+  //  yr, yg, yb;  ×   Xg + Yg + Zg;  =   Yw;
+  //  zr, zg, zb]      Xb + Yb + Zb]      Zw]
+  //
+  // Which makes it apparent that we can compute those totals as:
+  //
+  //                  [Xr + Yr + Zr;     inv([xr, xg, xb;      [Xw;
+  //                   Xg + Yg + Zg;  =       yr, yg, yb;   ×   Yw;
+  //                   Xb + Yb + Zb]          zr, zg, zb])      Zw]
+  //
+  // From there, by multiplying each total by its corresponding y, we get Y for
+  // that primary.
+
+  float white_XYZ[3];
+  JXL_RETURN_IF_ERROR(
+      CIEXYZFromWhiteCIExy(encoding.GetWhitePoint(), white_XYZ));
+
+  const PrimariesCIExy primaries = encoding.GetPrimaries();
+  double chromaticities[3][3] = {
+      {primaries.r.x, primaries.g.x, primaries.b.x},
+      {primaries.r.y, primaries.g.y, primaries.b.y},
+      {1 - primaries.r.x - primaries.r.y, 1 - primaries.g.x - primaries.g.y,
+       1 - primaries.b.x - primaries.b.y}};
+  JXL_RETURN_IF_ERROR(Inv3x3Matrix(&chromaticities[0][0]));
+  const double ys[3] = {primaries.r.y, primaries.g.y, primaries.b.y};
+  for (size_t i = 0; i < 3; ++i) {
+    luminances[i] = ys[i] * (chromaticities[i][0] * white_XYZ[0] +
+                             chromaticities[i][1] * white_XYZ[1] +
+                             chromaticities[i][2] * white_XYZ[2]);
+  }
+  return true;
+}
+
+Status ApplyHlgOotf(JxlCms* t, float* JXL_RESTRICT buf, size_t xsize,
+                    bool forward) {
+  if (295 <= t->intensity_target && t->intensity_target <= 305) {
+    // The gamma is approximately 1 so this can essentially be skipped.
+    return true;
+  }
+  float gamma = 1.2f * std::pow(1.111f, std::log2(t->intensity_target * 1e-3f));
+  if (!forward) gamma = 1.f / gamma;
+
+  switch (t->hlg_ootf_num_channels) {
+    case 1:
+      for (size_t x = 0; x < xsize; ++x) {
+        buf[x] = std::pow(buf[x], gamma);
+      }
+      break;
+
+    case 3:
+      for (size_t x = 0; x < xsize; x += 3) {
+        const float luminance = buf[x] * t->hlg_ootf_luminances[0] +
+                                buf[x + 1] * t->hlg_ootf_luminances[1] +
+                                buf[x + 2] * t->hlg_ootf_luminances[2];
+        const float ratio = std::pow(luminance, gamma - 1);
+        if (std::isfinite(ratio)) {
+          buf[x] *= ratio;
+          buf[x + 1] *= ratio;
+          buf[x + 2] *= ratio;
+          if (forward && gamma < 1) {
+            // If gamma < 1, the ratio above will be > 1 which can push bright
+            // saturated highlights out of gamut. There are several possible
+            // ways to bring them back in-gamut; this one preserves hue and
+            // saturation at the slight expense of luminance. If !forward, the
+            // previously-applied forward OOTF with gamma > 1 already pushed
+            // those highlights down and we are simply putting them back where
+            // they were so this is not necessary.
+            const float maximum =
+                std::max(buf[x], std::max(buf[x + 1], buf[x + 2]));
+            if (maximum > 1) {
+              const float normalizer = 1.f / maximum;
+              buf[x] *= normalizer;
+              buf[x + 1] *= normalizer;
+              buf[x + 2] *= normalizer;
+            }
+          }
+        }
+      }
+      break;
+
+    default:
+      return JXL_FAILURE("HLG OOTF not implemented for %" PRIuS " channels",
+                         t->hlg_ootf_num_channels);
+  }
+  return true;
+}
+
+bool ApplyCICP(const uint8_t color_primaries,
+               const uint8_t transfer_characteristics,
+               const uint8_t matrix_coefficients, const uint8_t full_range,
+               ColorEncoding* JXL_RESTRICT c) {
+  if (matrix_coefficients != 0) return false;
+  if (full_range != 1) return false;
+
+  const auto primaries = static_cast<Primaries>(color_primaries);
+  const auto tf = static_cast<TransferFunction>(transfer_characteristics);
+  if (tf == TransferFunction::kUnknown || !EnumValid(tf)) return false;
+  if (primaries == Primaries::kCustom ||
+      !(color_primaries == 12 || EnumValid(primaries))) {
+    return false;
+  }
+  c->SetColorSpace(ColorSpace::kRGB);
+  c->tf.SetTransferFunction(tf);
+  if (primaries == Primaries::kP3) {
+    c->white_point = WhitePoint::kDCI;
+    c->primaries = Primaries::kP3;
+  } else if (color_primaries == 12) {
+    c->white_point = WhitePoint::kD65;
+    c->primaries = Primaries::kP3;
+  } else {
+    c->white_point = WhitePoint::kD65;
+    c->primaries = primaries;
+  }
+  return true;
+}
+
+}  // namespace
+
+Status ColorEncoding::SetFieldsFromICC() {
+  // In case parsing fails, mark the ColorEncoding as invalid.
+  SetColorSpace(ColorSpace::kUnknown);
+  tf.SetTransferFunction(TransferFunction::kUnknown);
+
+  if (icc_.empty()) return JXL_FAILURE("Empty ICC profile");
+
+#if JPEGXL_ENABLE_SKCMS
+  if (icc_.size() < 128) {
+    return JXL_FAILURE("ICC file too small");
+  }
+
+  skcms_ICCProfile profile;
+  JXL_RETURN_IF_ERROR(skcms_Parse(icc_.data(), icc_.size(), &profile));
+
+  // skcms does not return the rendering intent, so get it from the file. It
+  // is encoded as big-endian 32-bit integer in bytes 60..63.
+  uint32_t rendering_intent32 = icc_[67];
+  if (rendering_intent32 > 3 || icc_[64] != 0 || icc_[65] != 0 ||
+      icc_[66] != 0) {
+    return JXL_FAILURE("Invalid rendering intent %u\n", rendering_intent32);
+  }
+  // ICC and RenderingIntent have the same values (0..3).
+  rendering_intent = static_cast<RenderingIntent>(rendering_intent32);
+
+  if (profile.has_CICP && ApplyCICP(profile.CICP.color_primaries,
+                                    profile.CICP.transfer_characteristics,
+                                    profile.CICP.matrix_coefficients,
+                                    profile.CICP.video_full_range_flag, this)) {
+    return true;
+  }
+
+  SetColorSpace(ColorSpaceFromProfile(profile));
+  cmyk_ = (profile.data_color_space == skcms_Signature_CMYK);
+
+  CIExy wp_unadapted;
+  JXL_RETURN_IF_ERROR(UnadaptedWhitePoint(profile, &wp_unadapted));
+  JXL_RETURN_IF_ERROR(SetWhitePoint(wp_unadapted));
+
+  // Relies on color_space.
+  JXL_RETURN_IF_ERROR(IdentifyPrimaries(profile, wp_unadapted, this));
+
+  // Relies on color_space/white point/primaries being set already.
+  DetectTransferFunction(profile, this);
+#else  // JPEGXL_ENABLE_SKCMS
+
+  const cmsContext context = GetContext();
+
+  Profile profile;
+  JXL_RETURN_IF_ERROR(DecodeProfile(context, icc_, &profile));
+
+  static constexpr size_t kCICPSize = 12;
+  static constexpr auto kCICPSignature =
+      static_cast<cmsTagSignature>(0x63696370);
+  uint8_t cicp_buffer[kCICPSize];
+  if (cmsReadRawTag(profile.get(), kCICPSignature, cicp_buffer, kCICPSize) ==
+          kCICPSize &&
+      ApplyCICP(cicp_buffer[8], cicp_buffer[9], cicp_buffer[10],
+                cicp_buffer[11], this)) {
+    return true;
+  }
+
+  const cmsUInt32Number rendering_intent32 =
+      cmsGetHeaderRenderingIntent(profile.get());
+  if (rendering_intent32 > 3) {
+    return JXL_FAILURE("Invalid rendering intent %u\n", rendering_intent32);
+  }
+  // ICC and RenderingIntent have the same values (0..3).
+  rendering_intent = static_cast<RenderingIntent>(rendering_intent32);
+
+  SetColorSpace(ColorSpaceFromProfile(profile));
+  if (cmsGetColorSpace(profile.get()) == cmsSigCmykData) {
+    cmyk_ = true;
+    return true;
+  }
+
+  const cmsCIEXYZ wp_unadapted = UnadaptedWhitePoint(context, profile, *this);
+  JXL_RETURN_IF_ERROR(SetWhitePoint(CIExyFromXYZ(wp_unadapted)));
+
+  // Relies on color_space.
+  JXL_RETURN_IF_ERROR(IdentifyPrimaries(context, profile, wp_unadapted, this));
+
+  // Relies on color_space/white point/primaries being set already.
+  DetectTransferFunction(context, profile, this);
+
+#endif  // JPEGXL_ENABLE_SKCMS
+
+  return true;
+}
+
+void ColorEncoding::DecideIfWantICC() {
+  PaddedBytes icc_new;
+#if JPEGXL_ENABLE_SKCMS
+  skcms_ICCProfile profile;
+  if (!DecodeProfile(ICC().data(), ICC().size(), &profile)) return;
+  if (!MaybeCreateProfile(*this, &icc_new)) return;
+#else   // JPEGXL_ENABLE_SKCMS
+  const cmsContext context = GetContext();
+  Profile profile;
+  if (!DecodeProfile(context, ICC(), &profile)) return;
+  if (cmsGetColorSpace(profile.get()) == cmsSigCmykData) return;
+  if (!MaybeCreateProfile(*this, &icc_new)) return;
+#endif  // JPEGXL_ENABLE_SKCMS
+
+  want_icc_ = false;
+}
+
+namespace {
+
+void JxlCmsDestroy(void* cms_data) {
+  if (cms_data == nullptr) return;
+  JxlCms* t = reinterpret_cast<JxlCms*>(cms_data);
+#if !JPEGXL_ENABLE_SKCMS
+  TransformDeleter()(t->lcms_transform);
+#endif
+  delete t;
+}
+
+void* JxlCmsInit(void* init_data, size_t num_threads, size_t xsize,
+                 const JxlColorProfile* input, const JxlColorProfile* output,
+                 float intensity_target) {
+  auto t = jxl::make_unique<JxlCms>();
+  PaddedBytes icc_src, icc_dst;
+  icc_src.assign(input->icc.data, input->icc.data + input->icc.size);
+  ColorEncoding c_src;
+  if (!c_src.SetICC(std::move(icc_src))) {
+    JXL_NOTIFY_ERROR("JxlCmsInit: failed to parse input ICC");
+    return nullptr;
+  }
+  icc_dst.assign(output->icc.data, output->icc.data + output->icc.size);
+  ColorEncoding c_dst;
+  if (!c_dst.SetICC(std::move(icc_dst))) {
+    JXL_NOTIFY_ERROR("JxlCmsInit: failed to parse output ICC");
+    return nullptr;
+  }
+#if JXL_CMS_VERBOSE
+  printf("%s -> %s\n", Description(c_src).c_str(), Description(c_dst).c_str());
+#endif
+
+#if JPEGXL_ENABLE_SKCMS
+  if (!DecodeProfile(input->icc.data, input->icc.size, &t->profile_src)) {
+    JXL_NOTIFY_ERROR("JxlCmsInit: skcms failed to parse input ICC");
+    return nullptr;
+  }
+  if (!DecodeProfile(output->icc.data, output->icc.size, &t->profile_dst)) {
+    JXL_NOTIFY_ERROR("JxlCmsInit: skcms failed to parse output ICC");
+    return nullptr;
+  }
+#else   // JPEGXL_ENABLE_SKCMS
+  const cmsContext context = GetContext();
+  Profile profile_src, profile_dst;
+  if (!DecodeProfile(context, c_src.ICC(), &profile_src)) {
+    JXL_NOTIFY_ERROR("JxlCmsInit: lcms failed to parse input ICC");
+    return nullptr;
+  }
+  if (!DecodeProfile(context, c_dst.ICC(), &profile_dst)) {
+    JXL_NOTIFY_ERROR("JxlCmsInit: lcms failed to parse output ICC");
+    return nullptr;
+  }
+#endif  // JPEGXL_ENABLE_SKCMS
+
+  t->skip_lcms = false;
+  if (c_src.SameColorEncoding(c_dst)) {
+    t->skip_lcms = true;
+#if JXL_CMS_VERBOSE
+    printf("Skip CMS\n");
+#endif
+  }
+
+  t->apply_hlg_ootf = c_src.tf.IsHLG() != c_dst.tf.IsHLG();
+  if (t->apply_hlg_ootf) {
+    const ColorEncoding* c_hlg = c_src.tf.IsHLG() ? &c_src : &c_dst;
+    t->hlg_ootf_num_channels = c_hlg->Channels();
+    if (t->hlg_ootf_num_channels == 3 &&
+        !GetPrimariesLuminances(*c_hlg, t->hlg_ootf_luminances.data())) {
+      JXL_NOTIFY_ERROR(
+          "JxlCmsInit: failed to compute the luminances of primaries");
+      return nullptr;
+    }
+  }
+
+  // Special-case SRGB <=> linear if the primaries / white point are the same,
+  // or any conversion where PQ or HLG is involved:
+  bool src_linear = c_src.tf.IsLinear();
+  const bool dst_linear = c_dst.tf.IsLinear();
+
+  if (c_src.tf.IsPQ() || c_src.tf.IsHLG() ||
+      (c_src.tf.IsSRGB() && dst_linear && c_src.SameColorSpace(c_dst))) {
+    // Construct new profile as if the data were already/still linear.
+    ColorEncoding c_linear_src = c_src;
+    c_linear_src.tf.SetTransferFunction(TransferFunction::kLinear);
+#if JPEGXL_ENABLE_SKCMS
+    skcms_ICCProfile new_src;
+#else  // JPEGXL_ENABLE_SKCMS
+    Profile new_src;
+#endif  // JPEGXL_ENABLE_SKCMS
+        // Only enable ExtraTF if profile creation succeeded.
+    if (MaybeCreateProfile(c_linear_src, &icc_src) &&
+#if JPEGXL_ENABLE_SKCMS
+        DecodeProfile(icc_src.data(), icc_src.size(), &new_src)) {
+#else   // JPEGXL_ENABLE_SKCMS
+        DecodeProfile(context, icc_src, &new_src)) {
+#endif  // JPEGXL_ENABLE_SKCMS
+#if JXL_CMS_VERBOSE
+      printf("Special HLG/PQ/sRGB -> linear\n");
+#endif
+#if JPEGXL_ENABLE_SKCMS
+      t->icc_src = std::move(icc_src);
+      t->profile_src = new_src;
+#else   // JPEGXL_ENABLE_SKCMS
+      profile_src.swap(new_src);
+#endif  // JPEGXL_ENABLE_SKCMS
+      t->preprocess = c_src.tf.IsSRGB()
+                          ? ExtraTF::kSRGB
+                          : (c_src.tf.IsPQ() ? ExtraTF::kPQ : ExtraTF::kHLG);
+      c_src = c_linear_src;
+      src_linear = true;
+    } else {
+      if (t->apply_hlg_ootf) {
+        JXL_NOTIFY_ERROR(
+            "Failed to create extra linear source profile, and HLG OOTF "
+            "required");
+        return nullptr;
+      }
+      JXL_WARNING("Failed to create extra linear destination profile");
+    }
+  }
+
+  if (c_dst.tf.IsPQ() || c_dst.tf.IsHLG() ||
+      (c_dst.tf.IsSRGB() && src_linear && c_src.SameColorSpace(c_dst))) {
+    ColorEncoding c_linear_dst = c_dst;
+    c_linear_dst.tf.SetTransferFunction(TransferFunction::kLinear);
+#if JPEGXL_ENABLE_SKCMS
+    skcms_ICCProfile new_dst;
+#else   // JPEGXL_ENABLE_SKCMS
+    Profile new_dst;
+#endif  // JPEGXL_ENABLE_SKCMS
+    // Only enable ExtraTF if profile creation succeeded.
+    if (MaybeCreateProfile(c_linear_dst, &icc_dst) &&
+#if JPEGXL_ENABLE_SKCMS
+        DecodeProfile(icc_dst.data(), icc_dst.size(), &new_dst)) {
+#else   // JPEGXL_ENABLE_SKCMS
+        DecodeProfile(context, icc_dst, &new_dst)) {
+#endif  // JPEGXL_ENABLE_SKCMS
+#if JXL_CMS_VERBOSE
+      printf("Special linear -> HLG/PQ/sRGB\n");
+#endif
+#if JPEGXL_ENABLE_SKCMS
+      t->icc_dst = std::move(icc_dst);
+      t->profile_dst = new_dst;
+#else   // JPEGXL_ENABLE_SKCMS
+      profile_dst.swap(new_dst);
+#endif  // JPEGXL_ENABLE_SKCMS
+      t->postprocess = c_dst.tf.IsSRGB()
+                           ? ExtraTF::kSRGB
+                           : (c_dst.tf.IsPQ() ? ExtraTF::kPQ : ExtraTF::kHLG);
+      c_dst = c_linear_dst;
+    } else {
+      if (t->apply_hlg_ootf) {
+        JXL_NOTIFY_ERROR(
+            "Failed to create extra linear destination profile, and inverse "
+            "HLG OOTF required");
+        return nullptr;
+      }
+      JXL_WARNING("Failed to create extra linear destination profile");
+    }
+  }
+
+  if (c_src.SameColorEncoding(c_dst)) {
+#if JXL_CMS_VERBOSE
+    printf("Same intermediary linear profiles, skipping CMS\n");
+#endif
+    t->skip_lcms = true;
+  }
+
+#if JPEGXL_ENABLE_SKCMS
+  if (!skcms_MakeUsableAsDestination(&t->profile_dst)) {
+    JXL_NOTIFY_ERROR(
+        "Failed to make %s usable as a color transform destination",
+        Description(c_dst).c_str());
+    return nullptr;
+  }
+#endif  // JPEGXL_ENABLE_SKCMS
+
+  // Not including alpha channel (copied separately).
+  const size_t channels_src = (c_src.IsCMYK() ? 4 : c_src.Channels());
+  const size_t channels_dst = c_dst.Channels();
+  JXL_CHECK(channels_src == channels_dst ||
+            (channels_src == 4 && channels_dst == 3));
+#if JXL_CMS_VERBOSE
+  printf("Channels: %" PRIuS "; Threads: %" PRIuS "\n", channels_src,
+         num_threads);
+#endif
+
+#if !JPEGXL_ENABLE_SKCMS
+  // Type includes color space (XYZ vs RGB), so can be different.
+  const uint32_t type_src = Type32(c_src, channels_src == 4);
+  const uint32_t type_dst = Type32(c_dst, false);
+  const uint32_t intent = static_cast<uint32_t>(c_dst.rendering_intent);
+  // Use cmsFLAGS_NOCACHE to disable the 1-pixel cache and make calling
+  // cmsDoTransform() thread-safe.
+  const uint32_t flags = cmsFLAGS_NOCACHE | cmsFLAGS_BLACKPOINTCOMPENSATION |
+                         cmsFLAGS_HIGHRESPRECALC;
+  t->lcms_transform =
+      cmsCreateTransformTHR(context, profile_src.get(), type_src,
+                            profile_dst.get(), type_dst, intent, flags);
+  if (t->lcms_transform == nullptr) {
+    JXL_NOTIFY_ERROR("Failed to create transform");
+    return nullptr;
+  }
+#endif  // !JPEGXL_ENABLE_SKCMS
+
+  // Ideally LCMS would convert directly from External to Image3. However,
+  // cmsDoTransformLineStride only accepts 32-bit BytesPerPlaneIn, whereas our
+  // planes can be more than 4 GiB apart. Hence, transform inputs/outputs must
+  // be interleaved. Calling cmsDoTransform for each pixel is expensive
+  // (indirect call). We therefore transform rows, which requires per-thread
+  // buffers. To avoid separate allocations, we use the rows of an image.
+  // Because LCMS apparently also cannot handle <= 16 bit inputs and 32-bit
+  // outputs (or vice versa), we use floating point input/output.
+  t->channels_src = channels_src;
+  t->channels_dst = channels_dst;
+#if JPEGXL_ENABLE_SKCMS
+  // SkiaCMS doesn't support grayscale float buffers, so we create space for RGB
+  // float buffers anyway.
+  t->buf_src = ImageF(xsize * (channels_src == 4 ? 4 : 3), num_threads);
+  t->buf_dst = ImageF(xsize * 3, num_threads);
+#else
+  t->buf_src = ImageF(xsize * channels_src, num_threads);
+  t->buf_dst = ImageF(xsize * channels_dst, num_threads);
+#endif
+  t->intensity_target = intensity_target;
+  return t.release();
+}
+
+float* JxlCmsGetSrcBuf(void* cms_data, size_t thread) {
+  JxlCms* t = reinterpret_cast<JxlCms*>(cms_data);
+  return t->buf_src.Row(thread);
+}
+
+float* JxlCmsGetDstBuf(void* cms_data, size_t thread) {
+  JxlCms* t = reinterpret_cast<JxlCms*>(cms_data);
+  return t->buf_dst.Row(thread);
+}
+
+}  // namespace
+
+const JxlCmsInterface& GetJxlCms() {
+  static constexpr JxlCmsInterface kInterface = {
+      /*init_data=*/nullptr,
+      /*init=*/&JxlCmsInit,
+      /*get_src_buf=*/&JxlCmsGetSrcBuf,
+      /*get_dst_buf=*/&JxlCmsGetDstBuf,
+      /*run=*/&DoColorSpaceTransform,
+      /*destroy=*/&JxlCmsDestroy};
+  return kInterface;
+}
+
+}  // namespace jxl
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jxl/enc_color_management.h b/third_party/jpeg-xl/lib/jxl/enc_color_management.h
new file mode 100644
index 0000000000..6f6e9023a6
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_color_management.h
@@ -0,0 +1,90 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_COLOR_MANAGEMENT_H_
+#define LIB_JXL_ENC_COLOR_MANAGEMENT_H_
+
+// ICC profiles and color space conversions.
+
+#include <jxl/cms_interface.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include <vector>
+
+#include "lib/jxl/base/padded_bytes.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/color_encoding_internal.h"
+#include "lib/jxl/color_management.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/image.h"
+
+namespace jxl {
+
+// Internal C++ wrapper for a JxlCmsInterface.
+class ColorSpaceTransform {
+ public:
+  explicit ColorSpaceTransform(const JxlCmsInterface& cms) : cms_(cms) {}
+  ~ColorSpaceTransform() {
+    if (cms_data_ != nullptr) {
+      cms_.destroy(cms_data_);
+    }
+  }
+
+  // Cannot copy.
+  ColorSpaceTransform(const ColorSpaceTransform&) = delete;
+  ColorSpaceTransform& operator=(const ColorSpaceTransform&) = delete;
+
+  Status Init(const ColorEncoding& c_src, const ColorEncoding& c_dst,
+              float intensity_target, size_t xsize, size_t num_threads) {
+    xsize_ = xsize;
+    JxlColorProfile input_profile;
+    icc_src_ = c_src.ICC();
+    input_profile.icc.data = icc_src_.data();
+    input_profile.icc.size = icc_src_.size();
+    ConvertInternalToExternalColorEncoding(c_src,
+                                           &input_profile.color_encoding);
+    input_profile.num_channels = c_src.IsCMYK() ? 4 : c_src.Channels();
+    JxlColorProfile output_profile;
+    icc_dst_ = c_dst.ICC();
+    output_profile.icc.data = icc_dst_.data();
+    output_profile.icc.size = icc_dst_.size();
+    ConvertInternalToExternalColorEncoding(c_dst,
+                                           &output_profile.color_encoding);
+    if (c_dst.IsCMYK())
+      return JXL_FAILURE("Conversion to CMYK is not supported");
+    output_profile.num_channels = c_dst.Channels();
+    cms_data_ = cms_.init(cms_.init_data, num_threads, xsize, &input_profile,
+                          &output_profile, intensity_target);
+    JXL_RETURN_IF_ERROR(cms_data_ != nullptr);
+    return true;
+  }
+
+  float* BufSrc(const size_t thread) const {
+    return cms_.get_src_buf(cms_data_, thread);
+  }
+
+  float* BufDst(const size_t thread) const {
+    return cms_.get_dst_buf(cms_data_, thread);
+  }
+
+  Status Run(const size_t thread, const float* buf_src, float* buf_dst) {
+    return cms_.run(cms_data_, thread, buf_src, buf_dst, xsize_);
+  }
+
+ private:
+  JxlCmsInterface cms_;
+  void* cms_data_ = nullptr;
+  // The interface may retain pointers into these.
+  PaddedBytes icc_src_;
+  PaddedBytes icc_dst_;
+  size_t xsize_;
+};
+
+const JxlCmsInterface& GetJxlCms();
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_COLOR_MANAGEMENT_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_comparator.cc b/third_party/jpeg-xl/lib/jxl/enc_comparator.cc
new file mode 100644
index 0000000000..cbdd0f78d9
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_comparator.cc
@@ -0,0 +1,130 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_comparator.h"
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <algorithm>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/profiler.h"
+#include "lib/jxl/color_management.h"
+#include "lib/jxl/enc_gamma_correct.h"
+#include "lib/jxl/enc_image_bundle.h"
+
+namespace jxl {
+namespace {
+
+// color is linear, but blending happens in gamma-compressed space using
+// (gamma-compressed) grayscale background color, alpha image represents
+// weights of the sRGB colors in the [0 .. (1 << bit_depth) - 1] interval,
+// output image is in linear space.
+void AlphaBlend(const Image3F& in, const size_t c, float background_linear,
+                const ImageF& alpha, Image3F* out) {
+  const float background = LinearToSrgb8Direct(background_linear);
+
+  for (size_t y = 0; y < out->ysize(); ++y) {
+    const float* JXL_RESTRICT row_a = alpha.ConstRow(y);
+    const float* JXL_RESTRICT row_i = in.ConstPlaneRow(c, y);
+    float* JXL_RESTRICT row_o = out->PlaneRow(c, y);
+    for (size_t x = 0; x < out->xsize(); ++x) {
+      const float a = row_a[x];
+      if (a <= 0.f) {
+        row_o[x] = background_linear;
+      } else if (a >= 1.f) {
+        row_o[x] = row_i[x];
+      } else {
+        const float w_fg = a;
+        const float w_bg = 1.0f - w_fg;
+        const float fg = w_fg * LinearToSrgb8Direct(row_i[x]);
+        const float bg = w_bg * background;
+        row_o[x] = Srgb8ToLinearDirect(fg + bg);
+      }
+    }
+  }
+}
+
+void AlphaBlend(float background_linear, ImageBundle* io_linear_srgb) {
+  // No alpha => all opaque.
+  if (!io_linear_srgb->HasAlpha()) return;
+
+  for (size_t c = 0; c < 3; ++c) {
+    AlphaBlend(*io_linear_srgb->color(), c, background_linear,
+               *io_linear_srgb->alpha(), io_linear_srgb->color());
+  }
+}
+
+float ComputeScoreImpl(const ImageBundle& rgb0, const ImageBundle& rgb1,
+                       Comparator* comparator, ImageF* distmap) {
+  JXL_CHECK(comparator->SetReferenceImage(rgb0));
+  float score;
+  JXL_CHECK(comparator->CompareWith(rgb1, distmap, &score));
+  return score;
+}
+
+}  // namespace
+
+float ComputeScore(const ImageBundle& rgb0, const ImageBundle& rgb1,
+                   Comparator* comparator, const JxlCmsInterface& cms,
+                   ImageF* diffmap, ThreadPool* pool) {
+  PROFILER_FUNC;
+  // Convert to linear sRGB (unless already in that space)
+  ImageMetadata metadata0 = *rgb0.metadata();
+  ImageBundle store0(&metadata0);
+  const ImageBundle* linear_srgb0;
+  JXL_CHECK(TransformIfNeeded(rgb0, ColorEncoding::LinearSRGB(rgb0.IsGray()),
+                              cms, pool, &store0, &linear_srgb0));
+  ImageMetadata metadata1 = *rgb1.metadata();
+  ImageBundle store1(&metadata1);
+  const ImageBundle* linear_srgb1;
+  JXL_CHECK(TransformIfNeeded(rgb1, ColorEncoding::LinearSRGB(rgb1.IsGray()),
+                              cms, pool, &store1, &linear_srgb1));
+
+  // No alpha: skip blending, only need a single call to Butteraugli.
+  if (!rgb0.HasAlpha() && !rgb1.HasAlpha()) {
+    return ComputeScoreImpl(*linear_srgb0, *linear_srgb1, comparator, diffmap);
+  }
+
+  // Blend on black and white backgrounds
+
+  const float black = 0.0f;
+  ImageBundle blended_black0 = linear_srgb0->Copy();
+  ImageBundle blended_black1 = linear_srgb1->Copy();
+  AlphaBlend(black, &blended_black0);
+  AlphaBlend(black, &blended_black1);
+
+  const float white = 1.0f;
+  ImageBundle blended_white0 = linear_srgb0->Copy();
+  ImageBundle blended_white1 = linear_srgb1->Copy();
+
+  AlphaBlend(white, &blended_white0);
+  AlphaBlend(white, &blended_white1);
+
+  ImageF diffmap_black, diffmap_white;
+  const float dist_black = ComputeScoreImpl(blended_black0, blended_black1,
+                                            comparator, &diffmap_black);
+  const float dist_white = ComputeScoreImpl(blended_white0, blended_white1,
+                                            comparator, &diffmap_white);
+
+  // diffmap and return values are the max of diffmap_black/white.
+  if (diffmap != nullptr) {
+    const size_t xsize = rgb0.xsize();
+    const size_t ysize = rgb0.ysize();
+    *diffmap = ImageF(xsize, ysize);
+    for (size_t y = 0; y < ysize; ++y) {
+      const float* JXL_RESTRICT row_black = diffmap_black.ConstRow(y);
+      const float* JXL_RESTRICT row_white = diffmap_white.ConstRow(y);
+      float* JXL_RESTRICT row_out = diffmap->Row(y);
+      for (size_t x = 0; x < xsize; ++x) {
+        row_out[x] = std::max(row_black[x], row_white[x]);
+      }
+    }
+  }
+  return std::max(dist_black, dist_white);
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/enc_comparator.h b/third_party/jpeg-xl/lib/jxl/enc_comparator.h
new file mode 100644
index 0000000000..0ac4df8296
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_comparator.h
@@ -0,0 +1,52 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_COMPARATOR_H_
+#define LIB_JXL_ENC_COMPARATOR_H_
+
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/image_bundle.h"
+
+namespace jxl {
+
+class Comparator {
+ public:
+  virtual ~Comparator() = default;
+
+  // Sets the reference image, the first to compare
+  // Image must be in linear sRGB (gamma expanded) in range 0.0f-1.0f as
+  // the range from standard black point to standard white point, but values
+  // outside permitted.
+  virtual Status SetReferenceImage(const ImageBundle& ref) = 0;
+
+  // Sets the actual image (with loss), the second to compare
+  // Image must be in linear sRGB (gamma expanded) in range 0.0f-1.0f as
+  // the range from standard black point to standard white point, but values
+  // outside permitted.
+  // In diffmap it outputs the local score per pixel, while in score it outputs
+  // a single score. Any one may be set to nullptr to not compute it.
+  virtual Status CompareWith(const ImageBundle& actual, ImageF* diffmap,
+                             float* score) = 0;
+
+  // Quality thresholds for diffmap and score values.
+  // The good score must represent a value where the images are considered to
+  // be perceptually indistinguishable (but not identical)
+  // The bad value must be larger than good to indicate "lower means better"
+  // and smaller than good to indicate "higher means better"
+  virtual float GoodQualityScore() const = 0;
+  virtual float BadQualityScore() const = 0;
+};
+
+// Computes the score given images in any RGB color model, optionally with
+// alpha channel.
+float ComputeScore(const ImageBundle& rgb0, const ImageBundle& rgb1,
+                   Comparator* comparator, const JxlCmsInterface& cms,
+                   ImageF* diffmap = nullptr, ThreadPool* pool = nullptr);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_COMPARATOR_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_context_map.cc b/third_party/jpeg-xl/lib/jxl/enc_context_map.cc
new file mode 100644
index 0000000000..842dd12423
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_context_map.cc
@@ -0,0 +1,141 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Library to encode the context map.
+
+#include "lib/jxl/enc_context_map.h"
+
+#include <stdint.h>
+
+#include <algorithm>
+#include <cstddef>
+#include <vector>
+
+#include "lib/jxl/base/bits.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/enc_ans.h"
+#include "lib/jxl/enc_aux_out.h"
+#include "lib/jxl/entropy_coder.h"
+
+namespace jxl {
+
+namespace {
+
+size_t IndexOf(const std::vector<uint8_t>& v, uint8_t value) {
+  size_t i = 0;
+  for (; i < v.size(); ++i) {
+    if (v[i] == value) return i;
+  }
+  return i;
+}
+
+void MoveToFront(std::vector<uint8_t>* v, size_t index) {
+  uint8_t value = (*v)[index];
+  for (size_t i = index; i != 0; --i) {
+    (*v)[i] = (*v)[i - 1];
+  }
+  (*v)[0] = value;
+}
+
+std::vector<uint8_t> MoveToFrontTransform(const std::vector<uint8_t>& v) {
+  if (v.empty()) return v;
+  uint8_t max_value = *std::max_element(v.begin(), v.end());
+  std::vector<uint8_t> mtf(max_value + 1);
+  for (size_t i = 0; i <= max_value; ++i) mtf[i] = i;
+  std::vector<uint8_t> result(v.size());
+  for (size_t i = 0; i < v.size(); ++i) {
+    size_t index = IndexOf(mtf, v[i]);
+    JXL_ASSERT(index < mtf.size());
+    result[i] = static_cast<uint8_t>(index);
+    MoveToFront(&mtf, index);
+  }
+  return result;
+}
+
+}  // namespace
+
+void EncodeContextMap(const std::vector<uint8_t>& context_map,
+                      size_t num_histograms, BitWriter* writer, size_t layer,
+                      AuxOut* aux_out) {
+  if (num_histograms == 1) {
+    // Simple code
+    writer->Write(1, 1);
+    // 0 bits per entry.
+    writer->Write(2, 0);
+    return;
+  }
+
+  std::vector<uint8_t> transformed_symbols = MoveToFrontTransform(context_map);
+  std::vector<std::vector<Token>> tokens(1), mtf_tokens(1);
+  EntropyEncodingData codes;
+  std::vector<uint8_t> dummy_context_map;
+  for (size_t i = 0; i < context_map.size(); i++) {
+    tokens[0].emplace_back(0, context_map[i]);
+  }
+  for (size_t i = 0; i < transformed_symbols.size(); i++) {
+    mtf_tokens[0].emplace_back(0, transformed_symbols[i]);
+  }
+  HistogramParams params;
+  params.uint_method = HistogramParams::HybridUintMethod::kContextMap;
+  size_t ans_cost = BuildAndEncodeHistograms(
+      params, 1, tokens, &codes, &dummy_context_map, nullptr, 0, nullptr);
+  size_t mtf_cost = BuildAndEncodeHistograms(
+      params, 1, mtf_tokens, &codes, &dummy_context_map, nullptr, 0, nullptr);
+  bool use_mtf = mtf_cost < ans_cost;
+  // Rebuild token list.
+  tokens[0].clear();
+  for (size_t i = 0; i < transformed_symbols.size(); i++) {
+    tokens[0].emplace_back(0,
+                           use_mtf ? transformed_symbols[i] : context_map[i]);
+  }
+  size_t entry_bits = CeilLog2Nonzero(num_histograms);
+  size_t simple_cost = entry_bits * context_map.size();
+  if (entry_bits < 4 && simple_cost < ans_cost && simple_cost < mtf_cost) {
+    writer->Write(1, 1);
+    writer->Write(2, entry_bits);
+    for (size_t i = 0; i < context_map.size(); i++) {
+      writer->Write(entry_bits, context_map[i]);
+    }
+  } else {
+    writer->Write(1, 0);
+    writer->Write(1, use_mtf);  // Use/don't use MTF.
+    BuildAndEncodeHistograms(params, 1, tokens, &codes, &dummy_context_map,
+                             writer, layer, aux_out);
+    WriteTokens(tokens[0], codes, dummy_context_map, writer);
+  }
+}
+
+void EncodeBlockCtxMap(const BlockCtxMap& block_ctx_map, BitWriter* writer,
+                       AuxOut* aux_out) {
+  auto& dct = block_ctx_map.dc_thresholds;
+  auto& qft = block_ctx_map.qf_thresholds;
+  auto& ctx_map = block_ctx_map.ctx_map;
+  BitWriter::Allotment allotment(
+      writer,
+      (dct[0].size() + dct[1].size() + dct[2].size() + qft.size()) * 34 + 1 +
+          4 + 4 + ctx_map.size() * 10 + 1024);
+  if (dct[0].empty() && dct[1].empty() && dct[2].empty() && qft.empty() &&
+      ctx_map.size() == 21 &&
+      std::equal(ctx_map.begin(), ctx_map.end(), BlockCtxMap::kDefaultCtxMap)) {
+    writer->Write(1, 1);  // default
+    allotment.ReclaimAndCharge(writer, kLayerAC, aux_out);
+    return;
+  }
+  writer->Write(1, 0);
+  for (int j : {0, 1, 2}) {
+    writer->Write(4, dct[j].size());
+    for (int i : dct[j]) {
+      JXL_CHECK(U32Coder::Write(kDCThresholdDist, PackSigned(i), writer));
+    }
+  }
+  writer->Write(4, qft.size());
+  for (uint32_t i : qft) {
+    JXL_CHECK(U32Coder::Write(kQFThresholdDist, i - 1, writer));
+  }
+  EncodeContextMap(ctx_map, block_ctx_map.num_ctxs, writer, kLayerAC, aux_out);
+  allotment.ReclaimAndCharge(writer, kLayerAC, aux_out);
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/enc_context_map.h b/third_party/jpeg-xl/lib/jxl/enc_context_map.h
new file mode 100644
index 0000000000..041e71de7a
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_context_map.h
@@ -0,0 +1,35 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_CONTEXT_MAP_H_
+#define LIB_JXL_ENC_CONTEXT_MAP_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <vector>
+
+#include "lib/jxl/ac_context.h"
+#include "lib/jxl/enc_bit_writer.h"
+
+namespace jxl {
+
+struct AuxOut;
+
+// Max limit is 255 because encoding assumes numbers < 255
+// More clusters can help compression, but makes encode/decode somewhat slower
+static const size_t kClustersLimit = 128;
+
+// Encodes the given context map to the bit stream. The number of different
+// histogram ids is given by num_histograms.
+void EncodeContextMap(const std::vector<uint8_t>& context_map,
+                      size_t num_histograms, BitWriter* writer, size_t layer,
+                      AuxOut* aux_out);
+
+void EncodeBlockCtxMap(const BlockCtxMap& block_ctx_map, BitWriter* writer,
+                       AuxOut* aux_out);
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_CONTEXT_MAP_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_detect_dots.cc b/third_party/jpeg-xl/lib/jxl/enc_detect_dots.cc
new file mode 100644
index 0000000000..5819036987
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_detect_dots.cc
@@ -0,0 +1,626 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_detect_dots.h"
+
+#include <stdint.h>
+
+#include <algorithm>
+#include <array>
+#include <cmath>
+#include <cstdio>
+#include <utility>
+#include <vector>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/enc_detect_dots.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/base/profiler.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/convolve.h"
+#include "lib/jxl/enc_linalg.h"
+#include "lib/jxl/enc_optimize.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/image_ops.h"
+
+// Set JXL_DEBUG_DOT_DETECT to 1 to enable debugging.
+#ifndef JXL_DEBUG_DOT_DETECT
+#define JXL_DEBUG_DOT_DETECT 0
+#endif
+
+#if JXL_DEBUG_DOT_DETECT
+#include "lib/jxl/enc_aux_out.h"
+#endif
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Add;
+using hwy::HWY_NAMESPACE::Mul;
+using hwy::HWY_NAMESPACE::Sub;
+
+ImageF SumOfSquareDifferences(const Image3F& forig, const Image3F& smooth,
+                              ThreadPool* pool) {
+  const HWY_FULL(float) d;
+  const auto color_coef0 = Set(d, 0.0f);
+  const auto color_coef1 = Set(d, 10.0f);
+  const auto color_coef2 = Set(d, 0.0f);
+
+  ImageF sum_of_squares(forig.xsize(), forig.ysize());
+  JXL_CHECK(RunOnPool(
+      pool, 0, forig.ysize(), ThreadPool::NoInit,
+      [&](const uint32_t task, size_t thread) {
+        const size_t y = static_cast<size_t>(task);
+        const float* JXL_RESTRICT orig_row0 = forig.Plane(0).ConstRow(y);
+        const float* JXL_RESTRICT orig_row1 = forig.Plane(1).ConstRow(y);
+        const float* JXL_RESTRICT orig_row2 = forig.Plane(2).ConstRow(y);
+        const float* JXL_RESTRICT smooth_row0 = smooth.Plane(0).ConstRow(y);
+        const float* JXL_RESTRICT smooth_row1 = smooth.Plane(1).ConstRow(y);
+        const float* JXL_RESTRICT smooth_row2 = smooth.Plane(2).ConstRow(y);
+        float* JXL_RESTRICT sos_row = sum_of_squares.Row(y);
+
+        for (size_t x = 0; x < forig.xsize(); x += Lanes(d)) {
+          auto v0 = Sub(Load(d, orig_row0 + x), Load(d, smooth_row0 + x));
+          auto v1 = Sub(Load(d, orig_row1 + x), Load(d, smooth_row1 + x));
+          auto v2 = Sub(Load(d, orig_row2 + x), Load(d, smooth_row2 + x));
+          v0 = Mul(Mul(v0, v0), color_coef0);
+          v1 = Mul(Mul(v1, v1), color_coef1);
+          v2 = Mul(Mul(v2, v2), color_coef2);
+          const auto sos =
+              Add(v0, Add(v1, v2));  // weighted sum of square diffs
+          Store(sos, d, sos_row + x);
+        }
+      },
+      "ComputeEnergyImage"));
+  return sum_of_squares;
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+HWY_EXPORT(SumOfSquareDifferences);  // Local function
+
+const int kEllipseWindowSize = 5;
+
+namespace {
+struct GaussianEllipse {
+  double x;                         // position in x
+  double y;                         // position in y
+  double sigma_x;                   // scale in x
+  double sigma_y;                   // scale in y
+  double angle;                     // ellipse rotation in radians
+  std::array<double, 3> intensity;  // intensity in each channel
+
+  // The following variables do not need to be encoded
+  double l2_loss;  // error after the Gaussian was fit
+  double l1_loss;
+  double ridge_loss;              // the l2_loss plus regularization term
+  double custom_loss;             // experimental custom loss
+  std::array<double, 3> bgColor;  // best background color
+  size_t neg_pixels;  // number of negative pixels when subtracting dot
+  std::array<double, 3> neg_value;  // debt due to channel truncation
+};
+double DotGaussianModel(double dx, double dy, double ct, double st,
+                        double sigma_x, double sigma_y, double intensity) {
+  double rx = ct * dx + st * dy;
+  double ry = -st * dx + ct * dy;
+  double md = (rx * rx / sigma_x) + (ry * ry / sigma_y);
+  double value = intensity * exp(-0.5 * md);
+  return value;
+}
+
+constexpr bool kOptimizeBackground = true;
+
+// Gaussian that smooths noise but preserves dots
+const WeightsSeparable5& WeightsSeparable5Gaussian0_65() {
+  constexpr float w0 = 0.558311f;
+  constexpr float w1 = 0.210395f;
+  constexpr float w2 = 0.010449f;
+  static constexpr WeightsSeparable5 weights = {
+      {HWY_REP4(w0), HWY_REP4(w1), HWY_REP4(w2)},
+      {HWY_REP4(w0), HWY_REP4(w1), HWY_REP4(w2)}};
+  return weights;
+}
+
+// (Iterated) Gaussian that removes dots.
+const WeightsSeparable5& WeightsSeparable5Gaussian3() {
+  constexpr float w0 = 0.222338f;
+  constexpr float w1 = 0.210431f;
+  constexpr float w2 = 0.1784f;
+  static constexpr WeightsSeparable5 weights = {
+      {HWY_REP4(w0), HWY_REP4(w1), HWY_REP4(w2)},
+      {HWY_REP4(w0), HWY_REP4(w1), HWY_REP4(w2)}};
+  return weights;
+}
+
+ImageF ComputeEnergyImage(const Image3F& orig, Image3F* smooth,
+                          ThreadPool* pool) {
+  PROFILER_FUNC;
+
+  // Prepare guidance images for dot selection.
+  Image3F forig(orig.xsize(), orig.ysize());
+  *smooth = Image3F(orig.xsize(), orig.ysize());
+  Rect rect(orig);
+
+  const auto& weights1 = WeightsSeparable5Gaussian0_65();
+  const auto& weights3 = WeightsSeparable5Gaussian3();
+
+  for (size_t c = 0; c < 3; ++c) {
+    // Use forig as temporary storage to reduce memory and keep it warmer.
+    Separable5(orig.Plane(c), rect, weights3, pool, &forig.Plane(c));
+    Separable5(forig.Plane(c), rect, weights3, pool, &smooth->Plane(c));
+    Separable5(orig.Plane(c), rect, weights1, pool, &forig.Plane(c));
+  }
+
+#if JXL_DEBUG_DOT_DETECT
+  AuxOut aux;
+  aux.debug_prefix = "/tmp/sebastian/";
+  aux.DumpImage("filtered", forig);
+  aux.DumpImage("sm", *smooth);
+#endif
+
+  return HWY_DYNAMIC_DISPATCH(SumOfSquareDifferences)(forig, *smooth, pool);
+}
+
+struct Pixel {
+  int x;
+  int y;
+};
+
+Pixel operator+(const Pixel& a, const Pixel& b) {
+  return Pixel{a.x + b.x, a.y + b.y};
+}
+
+// Maximum area in pixels of a ellipse
+const size_t kMaxCCSize = 1000;
+
+// Extracts a connected component from a Binary image where seed is part
+// of the component
+bool ExtractComponent(ImageF* img, std::vector<Pixel>* pixels,
+                      const Pixel& seed, double threshold) {
+  PROFILER_FUNC;
+  static const std::vector<Pixel> neighbors{{1, -1}, {1, 0},   {1, 1},  {0, -1},
+                                            {0, 1},  {-1, -1}, {-1, 1}, {1, 0}};
+  std::vector<Pixel> q{seed};
+  while (!q.empty()) {
+    Pixel current = q.back();
+    q.pop_back();
+    pixels->push_back(current);
+    if (pixels->size() > kMaxCCSize) return false;
+    for (const Pixel& delta : neighbors) {
+      Pixel child = current + delta;
+      if (child.x >= 0 && static_cast<size_t>(child.x) < img->xsize() &&
+          child.y >= 0 && static_cast<size_t>(child.y) < img->ysize()) {
+        float* value = &img->Row(child.y)[child.x];
+        if (*value > threshold) {
+          *value = 0.0;
+          q.push_back(child);
+        }
+      }
+    }
+  }
+  return true;
+}
+
+inline bool PointInRect(const Rect& r, const Pixel& p) {
+  return (static_cast<size_t>(p.x) >= r.x0() &&
+          static_cast<size_t>(p.x) < (r.x0() + r.xsize()) &&
+          static_cast<size_t>(p.y) >= r.y0() &&
+          static_cast<size_t>(p.y) < (r.y0() + r.ysize()));
+}
+
+struct ConnectedComponent {
+  ConnectedComponent(const Rect& bounds, const std::vector<Pixel>&& pixels)
+      : bounds(bounds), pixels(pixels) {}
+  Rect bounds;
+  std::vector<Pixel> pixels;
+  float maxEnergy;
+  float meanEnergy;
+  float varEnergy;
+  float meanBg;
+  float varBg;
+  float score;
+  Pixel mode;
+
+  void CompStats(const ImageF& energy, int extra) {
+    PROFILER_FUNC;
+    maxEnergy = 0.0;
+    meanEnergy = 0.0;
+    varEnergy = 0.0;
+    meanBg = 0.0;
+    varBg = 0.0;
+    int nIn = 0;
+    int nOut = 0;
+    mode.x = 0;
+    mode.y = 0;
+    for (int sy = -extra; sy < (static_cast<int>(bounds.ysize()) + extra);
+         sy++) {
+      int y = sy + static_cast<int>(bounds.y0());
+      if (y < 0 || static_cast<size_t>(y) >= energy.ysize()) continue;
+      const float* JXL_RESTRICT erow = energy.ConstRow(y);
+      for (int sx = -extra; sx < (static_cast<int>(bounds.xsize()) + extra);
+           sx++) {
+        int x = sx + static_cast<int>(bounds.x0());
+        if (x < 0 || static_cast<size_t>(x) >= energy.xsize()) continue;
+        if (erow[x] > maxEnergy) {
+          maxEnergy = erow[x];
+          mode.x = x;
+          mode.y = y;
+        }
+        if (PointInRect(bounds, Pixel{x, y})) {
+          meanEnergy += erow[x];
+          varEnergy += erow[x] * erow[x];
+          nIn++;
+        } else {
+          meanBg += erow[x];
+          varBg += erow[x] * erow[x];
+          nOut++;
+        }
+      }
+    }
+    meanEnergy = meanEnergy / nIn;
+    meanBg = meanBg / nOut;
+    varEnergy = (varEnergy / nIn) - meanEnergy * meanEnergy;
+    varBg = (varBg / nOut) - meanBg * meanBg;
+    score = (meanEnergy - meanBg) / std::sqrt(varBg);
+  }
+};
+
+Rect BoundingRectangle(const std::vector<Pixel>& pixels) {
+  PROFILER_FUNC;
+  JXL_ASSERT(!pixels.empty());
+  int low_x, high_x, low_y, high_y;
+  low_x = high_x = pixels[0].x;
+  low_y = high_y = pixels[0].y;
+  for (const Pixel& p : pixels) {
+    low_x = std::min(low_x, p.x);
+    high_x = std::max(high_x, p.x);
+    low_y = std::min(low_y, p.y);
+    high_y = std::max(high_y, p.y);
+  }
+  return Rect(low_x, low_y, high_x - low_x + 1, high_y - low_y + 1);
+}
+
+std::vector<ConnectedComponent> FindCC(const ImageF& energy, double t_low,
+                                       double t_high, uint32_t maxWindow,
+                                       double minScore) {
+  PROFILER_FUNC;
+  const int kExtraRect = 4;
+  ImageF img = CopyImage(energy);
+  std::vector<ConnectedComponent> ans;
+  for (size_t y = 0; y < img.ysize(); y++) {
+    float* JXL_RESTRICT row = img.Row(y);
+    for (size_t x = 0; x < img.xsize(); x++) {
+      if (row[x] > t_high) {
+        std::vector<Pixel> pixels;
+        row[x] = 0.0;
+        bool success = ExtractComponent(
+            &img, &pixels, Pixel{static_cast<int>(x), static_cast<int>(y)},
+            t_low);
+        if (!success) continue;
+#if JXL_DEBUG_DOT_DETECT
+        for (size_t i = 0; i < pixels.size(); i++) {
+          fprintf(stderr, "(%d,%d) ", pixels[i].x, pixels[i].y);
+        }
+        fprintf(stderr, "\n");
+#endif  // JXL_DEBUG_DOT_DETECT
+        Rect bounds = BoundingRectangle(pixels);
+        if (bounds.xsize() < maxWindow && bounds.ysize() < maxWindow) {
+          ConnectedComponent cc{bounds, std::move(pixels)};
+          cc.CompStats(energy, kExtraRect);
+          if (cc.score < minScore) continue;
+          JXL_DEBUG(JXL_DEBUG_DOT_DETECT,
+                    "cc mode: (%d,%d), max: %f, bgMean: %f bgVar: "
+                    "%f bound:(%" PRIuS ",%" PRIuS ",%" PRIuS ",%" PRIuS ")\n",
+                    cc.mode.x, cc.mode.y, cc.maxEnergy, cc.meanEnergy,
+                    cc.varEnergy, cc.bounds.x0(), cc.bounds.y0(),
+                    cc.bounds.xsize(), cc.bounds.ysize());
+          ans.push_back(cc);
+        }
+      }
+    }
+  }
+  return ans;
+}
+
+// TODO (sggonzalez): Adapt this function for the different color spaces or
+// remove it if the color space with the best performance does not need it
+void ComputeDotLosses(GaussianEllipse* ellipse, const ConnectedComponent& cc,
+                      const Image3F& img, const Image3F& background) {
+  PROFILER_FUNC;
+  const int rectBounds = 2;
+  const double kIntensityR = 0.0;   // 0.015;
+  const double kSigmaR = 0.0;       // 0.01;
+  const double kZeroEpsilon = 0.1;  // Tolerance to consider a value negative
+  double ct = cos(ellipse->angle), st = sin(ellipse->angle);
+  const std::array<double, 3> channelGains{{1.0, 1.0, 1.0}};
+  int N = 0;
+  ellipse->l1_loss = 0.0;
+  ellipse->l2_loss = 0.0;
+  ellipse->neg_pixels = 0;
+  ellipse->neg_value.fill(0.0);
+  double distMeanModeSq = (cc.mode.x - ellipse->x) * (cc.mode.x - ellipse->x) +
+                          (cc.mode.y - ellipse->y) * (cc.mode.y - ellipse->y);
+  ellipse->custom_loss = 0.0;
+  for (int c = 0; c < 3; c++) {
+    for (int sy = -rectBounds;
+         sy < (static_cast<int>(cc.bounds.ysize()) + rectBounds); sy++) {
+      int y = sy + cc.bounds.y0();
+      if (y < 0 || static_cast<size_t>(y) >= img.ysize()) continue;
+      const float* JXL_RESTRICT row = img.ConstPlaneRow(c, y);
+      // bgrow is only used if kOptimizeBackground is false.
+      // NOLINTNEXTLINE(clang-analyzer-deadcode.DeadStores)
+      const float* JXL_RESTRICT bgrow = background.ConstPlaneRow(c, y);
+      for (int sx = -rectBounds;
+           sx < (static_cast<int>(cc.bounds.xsize()) + rectBounds); sx++) {
+        int x = sx + cc.bounds.x0();
+        if (x < 0 || static_cast<size_t>(x) >= img.xsize()) continue;
+        double target = row[x];
+        double dotDelta = DotGaussianModel(
+            x - ellipse->x, y - ellipse->y, ct, st, ellipse->sigma_x,
+            ellipse->sigma_y, ellipse->intensity[c]);
+        if (dotDelta > target + kZeroEpsilon) {
+          ellipse->neg_pixels++;
+          ellipse->neg_value[c] += dotDelta - target;
+        }
+        double bkg = kOptimizeBackground ? ellipse->bgColor[c] : bgrow[x];
+        double pred = bkg + dotDelta;
+        double diff = target - pred;
+        double l2 = channelGains[c] * diff * diff;
+        double l1 = channelGains[c] * std::fabs(diff);
+        ellipse->l2_loss += l2;
+        ellipse->l1_loss += l1;
+        double w = DotGaussianModel(x - cc.mode.x, y - cc.mode.y, 1.0, 0.0,
+                                    1.0 + ellipse->sigma_x,
+                                    1.0 + ellipse->sigma_y, 1.0);
+        ellipse->custom_loss += w * l2;
+        N++;
+      }
+    }
+  }
+  ellipse->l2_loss /= N;
+  ellipse->custom_loss /= N;
+  ellipse->custom_loss += 20.0 * distMeanModeSq + ellipse->neg_value[1];
+  ellipse->l1_loss /= N;
+  double ridgeTerm = kSigmaR * ellipse->sigma_x + kSigmaR * ellipse->sigma_y;
+  for (int c = 0; c < 3; c++) {
+    ridgeTerm += kIntensityR * ellipse->intensity[c] * ellipse->intensity[c];
+  }
+  ellipse->ridge_loss = ellipse->l2_loss + ridgeTerm;
+}
+
+GaussianEllipse FitGaussianFast(const ConnectedComponent& cc,
+                                const ImageF& energy, const Image3F& img,
+                                const Image3F& background) {
+  PROFILER_FUNC;
+  constexpr bool leastSqIntensity = true;
+  constexpr double kEpsilon = 1e-6;
+  GaussianEllipse ans;
+  constexpr int kRectBounds = (kEllipseWindowSize >> 1);
+
+  // Compute the 1st and 2nd moments of the CC
+  double sum = 0.0;
+  int N = 0;
+  std::array<double, 3> m1{{0.0, 0.0, 0.0}};
+  std::array<double, 3> m2{{0.0, 0.0, 0.0}};
+  std::array<double, 3> color{{0.0, 0.0, 0.0}};
+  std::array<double, 3> bgColor{{0.0, 0.0, 0.0}};
+
+  JXL_DEBUG(JXL_DEBUG_DOT_DETECT,
+            "%" PRIuS " %" PRIuS " %" PRIuS " %" PRIuS "\n", cc.bounds.x0(),
+            cc.bounds.y0(), cc.bounds.xsize(), cc.bounds.ysize());
+  for (int c = 0; c < 3; c++) {
+    color[c] = img.ConstPlaneRow(c, cc.mode.y)[cc.mode.x] -
+               background.ConstPlaneRow(c, cc.mode.y)[cc.mode.x];
+  }
+  double sign = (color[1] > 0) ? 1 : -1;
+  for (int sy = -kRectBounds; sy <= kRectBounds; sy++) {
+    int y = sy + cc.mode.y;
+    if (y < 0 || static_cast<size_t>(y) >= energy.ysize()) continue;
+    const float* JXL_RESTRICT row = img.ConstPlaneRow(1, y);
+    const float* JXL_RESTRICT bgrow = background.ConstPlaneRow(1, y);
+    for (int sx = -kRectBounds; sx <= kRectBounds; sx++) {
+      int x = sx + cc.mode.x;
+      if (x < 0 || static_cast<size_t>(x) >= energy.xsize()) continue;
+      double w = std::max(kEpsilon, sign * (row[x] - bgrow[x]));
+      sum += w;
+
+      m1[0] += w * x;
+      m1[1] += w * y;
+      m2[0] += w * x * x;
+      m2[1] += w * x * y;
+      m2[2] += w * y * y;
+      for (int c = 0; c < 3; c++) {
+        bgColor[c] += background.ConstPlaneRow(c, y)[x];
+      }
+      N++;
+    }
+  }
+  JXL_CHECK(N > 0);
+
+  for (int i = 0; i < 3; i++) {
+    m1[i] /= sum;
+    m2[i] /= sum;
+    bgColor[i] /= N;
+  }
+
+  // Some magic constants
+  constexpr double kSigmaMult = 1.0;
+  constexpr std::array<double, 3> kScaleMult{{1.1, 1.1, 1.1}};
+
+  // Now set the parameters of the Gaussian
+  ans.x = m1[0];
+  ans.y = m1[1];
+  for (int j = 0; j < 3; j++) {
+    ans.intensity[j] = kScaleMult[j] * color[j];
+  }
+
+  ImageD Sigma(2, 2), D(1, 2), U(2, 2);
+  Sigma.Row(0)[0] = m2[0] - m1[0] * m1[0];
+  Sigma.Row(1)[1] = m2[2] - m1[1] * m1[1];
+  Sigma.Row(0)[1] = Sigma.Row(1)[0] = m2[1] - m1[0] * m1[1];
+  ConvertToDiagonal(Sigma, &D, &U);
+  const double* JXL_RESTRICT d = D.ConstRow(0);
+  const double* JXL_RESTRICT u = U.ConstRow(1);
+  int p1 = 0, p2 = 1;
+  if (d[0] < d[1]) std::swap(p1, p2);
+  ans.sigma_x = kSigmaMult * d[p1];
+  ans.sigma_y = kSigmaMult * d[p2];
+  ans.angle = std::atan2(u[p1], u[p2]);
+  ans.l2_loss = 0.0;
+  ans.bgColor = bgColor;
+  if (leastSqIntensity) {
+    GaussianEllipse* ellipse = &ans;
+    double ct = cos(ans.angle), st = sin(ans.angle);
+    // Estimate intensity with least squares (fixed background)
+    for (int c = 0; c < 3; c++) {
+      double gg = 0.0;
+      double gd = 0.0;
+      int yc = static_cast<int>(cc.mode.y);
+      int xc = static_cast<int>(cc.mode.x);
+      for (int y = yc - kRectBounds; y <= yc + kRectBounds; y++) {
+        if (y < 0 || static_cast<size_t>(y) >= img.ysize()) continue;
+        const float* JXL_RESTRICT row = img.ConstPlaneRow(c, y);
+        const float* JXL_RESTRICT bgrow = background.ConstPlaneRow(c, y);
+        for (int x = xc - kRectBounds; x <= xc + kRectBounds; x++) {
+          if (x < 0 || static_cast<size_t>(x) >= img.xsize()) continue;
+          double target = row[x] - bgrow[x];
+          double gaussian =
+              DotGaussianModel(x - ellipse->x, y - ellipse->y, ct, st,
+                               ellipse->sigma_x, ellipse->sigma_y, 1.0);
+          gg += gaussian * gaussian;
+          gd += gaussian * target;
+        }
+      }
+      ans.intensity[c] = gd / (gg + 1e-6);  // Regularized least squares
+    }
+  }
+  ComputeDotLosses(&ans, cc, img, background);
+  return ans;
+}
+
+GaussianEllipse FitGaussian(const ConnectedComponent& cc, const ImageF& energy,
+                            const Image3F& img, const Image3F& background) {
+  auto ellipse = FitGaussianFast(cc, energy, img, background);
+  if (ellipse.sigma_x < ellipse.sigma_y) {
+    std::swap(ellipse.sigma_x, ellipse.sigma_y);
+    ellipse.angle += kPi / 2.0;
+  }
+  ellipse.angle -= kPi * std::floor(ellipse.angle / kPi);
+  if (fabs(ellipse.angle - kPi) < 1e-6 || fabs(ellipse.angle) < 1e-6) {
+    ellipse.angle = 0.0;
+  }
+  JXL_CHECK(ellipse.angle >= 0 && ellipse.angle <= kPi &&
+            ellipse.sigma_x >= ellipse.sigma_y);
+  JXL_DEBUG(JXL_DEBUG_DOT_DETECT,
+            "Ellipse mu=(%lf,%lf) sigma=(%lf,%lf) angle=%lf "
+            "intensity=(%lf,%lf,%lf) bg=(%lf,%lf,%lf) l2_loss=%lf "
+            "custom_loss=%lf, neg_pix=%" PRIuS ", neg_v=(%lf,%lf,%lf)\n",
+            ellipse.x, ellipse.y, ellipse.sigma_x, ellipse.sigma_y,
+            ellipse.angle, ellipse.intensity[0], ellipse.intensity[1],
+            ellipse.intensity[2], ellipse.bgColor[0], ellipse.bgColor[1],
+            ellipse.bgColor[2], ellipse.l2_loss, ellipse.custom_loss,
+            ellipse.neg_pixels, ellipse.neg_value[0], ellipse.neg_value[1],
+            ellipse.neg_value[2]);
+  return ellipse;
+}
+
+}  // namespace
+
+std::vector<PatchInfo> DetectGaussianEllipses(
+    const Image3F& opsin, const GaussianDetectParams& params,
+    const EllipseQuantParams& qParams, ThreadPool* pool) {
+  PROFILER_FUNC;
+  std::vector<PatchInfo> dots;
+  Image3F smooth(opsin.xsize(), opsin.ysize());
+  ImageF energy = ComputeEnergyImage(opsin, &smooth, pool);
+#if JXL_DEBUG_DOT_DETECT
+  AuxOut aux;
+  aux.debug_prefix = "/tmp/sebastian/";
+  aux.DumpXybImage("smooth", smooth);
+  aux.DumpPlaneNormalized("energy", energy);
+#endif  // JXL_DEBUG_DOT_DETECT
+  std::vector<ConnectedComponent> components = FindCC(
+      energy, params.t_low, params.t_high, params.maxWinSize, params.minScore);
+  size_t numCC =
+      std::min(params.maxCC, (components.size() * params.percCC) / 100);
+  if (components.size() > numCC) {
+    std::sort(
+        components.begin(), components.end(),
+        [](const ConnectedComponent& a, const ConnectedComponent& b) -> bool {
+          return a.score > b.score;
+        });
+    components.erase(components.begin() + numCC, components.end());
+  }
+  for (const auto& cc : components) {
+    GaussianEllipse ellipse = FitGaussian(cc, energy, opsin, smooth);
+    if (ellipse.x < 0.0 ||
+        std::ceil(ellipse.x) >= static_cast<double>(opsin.xsize()) ||
+        ellipse.y < 0.0 ||
+        std::ceil(ellipse.y) >= static_cast<double>(opsin.ysize())) {
+      continue;
+    }
+    if (ellipse.neg_pixels > params.maxNegPixels) continue;
+    double intensity = 0.21 * ellipse.intensity[0] +
+                       0.72 * ellipse.intensity[1] +
+                       0.07 * ellipse.intensity[2];
+    double intensitySq = intensity * intensity;
+    // for (int c = 0; c < 3; c++) {
+    //  intensitySq += ellipse.intensity[c] * ellipse.intensity[c];
+    //}
+    double sqDistMeanMode = (ellipse.x - cc.mode.x) * (ellipse.x - cc.mode.x) +
+                            (ellipse.y - cc.mode.y) * (ellipse.y - cc.mode.y);
+    if (ellipse.l2_loss < params.maxL2Loss &&
+        ellipse.custom_loss < params.maxCustomLoss &&
+        intensitySq > (params.minIntensity * params.minIntensity) &&
+        sqDistMeanMode < params.maxDistMeanMode * params.maxDistMeanMode) {
+      size_t x0 = cc.bounds.x0();
+      size_t y0 = cc.bounds.y0();
+      dots.emplace_back();
+      dots.back().second.emplace_back(x0, y0);
+      QuantizedPatch& patch = dots.back().first;
+      patch.xsize = cc.bounds.xsize();
+      patch.ysize = cc.bounds.ysize();
+      for (size_t y = 0; y < patch.ysize; y++) {
+        for (size_t x = 0; x < patch.xsize; x++) {
+          for (size_t c = 0; c < 3; c++) {
+            patch.fpixels[c][y * patch.xsize + x] =
+                opsin.ConstPlaneRow(c, y0 + y)[x0 + x] -
+                smooth.ConstPlaneRow(c, y0 + y)[x0 + x];
+          }
+        }
+      }
+    }
+  }
+#if JXL_DEBUG_DOT_DETECT
+  JXL_DEBUG(JXL_DEBUG_DOT_DETECT, "Candidates: %" PRIuS ", Dots: %" PRIuS "\n",
+            components.size(), dots.size());
+  ApplyGaussianEllipses(&smooth, dots, 1.0);
+  aux.DumpXybImage("draw", smooth);
+  ApplyGaussianEllipses(&smooth, dots, -1.0);
+
+  auto qdots = QuantizeGaussianEllipses(dots, qParams);
+  auto deq = DequantizeGaussianEllipses(qdots, qParams);
+  ApplyGaussianEllipses(&smooth, deq, 1.0);
+  aux.DumpXybImage("qdraw", smooth);
+  ApplyGaussianEllipses(&smooth, deq, -1.0);
+#endif  // JXL_DEBUG_DOT_DETECT
+  return dots;
+}
+
+}  // namespace jxl
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jxl/enc_detect_dots.h b/third_party/jpeg-xl/lib/jxl/enc_detect_dots.h
new file mode 100644
index 0000000000..c3071d9a2f
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_detect_dots.h
@@ -0,0 +1,67 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// We attempt to remove dots, or speckle from images using Gaussian blur.
+#ifndef LIB_JXL_ENC_DETECT_DOTS_H_
+#define LIB_JXL_ENC_DETECT_DOTS_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <array>
+#include <vector>
+
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/dec_patch_dictionary.h"
+#include "lib/jxl/enc_patch_dictionary.h"
+#include "lib/jxl/image.h"
+
+namespace jxl {
+
+struct GaussianDetectParams {
+  double t_high = 0;  // at least one pixel must have larger energy than t_high
+  double t_low = 0;   // all pixels must have a larger energy than tLow
+  uint32_t maxWinSize = 0;  // discard dots larger than this containing window
+  double maxL2Loss = 0;
+  double maxCustomLoss = 0;
+  double minIntensity = 0;     // If the intensity is too low, discard it
+  double maxDistMeanMode = 0;  // The mean and the mode must be close
+  size_t maxNegPixels = 0;     // Maximum number of negative pixel
+  size_t minScore = 0;
+  size_t maxCC = 50;   // Maximum number of CC to keep
+  size_t percCC = 15;  // Percentage in [0,100] of CC to keep
+};
+
+// Ellipse Quantization Params
+struct EllipseQuantParams {
+  size_t xsize;      // Image size in x
+  size_t ysize;      // Image size in y
+  size_t qPosition;  // Position quantization delta
+  // Quantization for the Gaussian sigma parameters
+  double minSigma;
+  double maxSigma;
+  size_t qSigma;  // number of quantization levels
+  // Quantization for the rotation angle (between -pi and pi)
+  size_t qAngle;
+  // Quantization for the intensity
+  std::array<double, 3> minIntensity;
+  std::array<double, 3> maxIntensity;
+  std::array<size_t, 3> qIntensity;  // number of quantization levels
+  // Extra parameters for the encoding
+  bool subtractQuantized;  // Should we subtract quantized or detected dots?
+  float ytox;
+  float ytob;
+
+  void QuantPositionSize(size_t* xsize, size_t* ysize) const;
+};
+
+// Detects dots in XYB image.
+std::vector<PatchInfo> DetectGaussianEllipses(
+    const Image3F& opsin, const GaussianDetectParams& params,
+    const EllipseQuantParams& qParams, ThreadPool* pool);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_DETECT_DOTS_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_dot_dictionary.cc b/third_party/jpeg-xl/lib/jxl/enc_dot_dictionary.cc
new file mode 100644
index 0000000000..2d22c1edb8
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_dot_dictionary.cc
@@ -0,0 +1,71 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_dot_dictionary.h"
+
+#include <stddef.h>
+#include <string.h>
+
+#include <array>
+#include <utility>
+
+#include "lib/jxl/base/override.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/chroma_from_luma.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/dec_xyb.h"
+#include "lib/jxl/enc_bit_writer.h"
+#include "lib/jxl/enc_detect_dots.h"
+#include "lib/jxl/enc_params.h"
+#include "lib/jxl/enc_xyb.h"
+#include "lib/jxl/image.h"
+
+namespace jxl {
+
+// Private implementation of Dictionary Encode/Decode
+namespace {
+
+/* Quantization constants for Ellipse dots */
+const size_t kEllipsePosQ = 2;        // Quantization level for the position
+const double kEllipseMinSigma = 0.1;  // Minimum sigma value
+const double kEllipseMaxSigma = 3.1;  // Maximum Sigma value
+const size_t kEllipseSigmaQ = 16;     // Number of quantization levels for sigma
+const size_t kEllipseAngleQ = 8;      // Quantization level for the angle
+// TODO: fix these values.
+const std::array<double, 3> kEllipseMinIntensity{{-0.05, 0.0, -0.5}};
+const std::array<double, 3> kEllipseMaxIntensity{{0.05, 1.0, 0.4}};
+const std::array<size_t, 3> kEllipseIntensityQ{{10, 36, 10}};
+}  // namespace
+
+std::vector<PatchInfo> FindDotDictionary(const CompressParams& cparams,
+                                         const Image3F& opsin,
+                                         const ColorCorrelationMap& cmap,
+                                         ThreadPool* pool) {
+  if (ApplyOverride(cparams.dots,
+                    cparams.butteraugli_distance >= kMinButteraugliForDots)) {
+    GaussianDetectParams ellipse_params;
+    ellipse_params.t_high = 0.04;
+    ellipse_params.t_low = 0.02;
+    ellipse_params.maxWinSize = 5;
+    ellipse_params.maxL2Loss = 0.005;
+    ellipse_params.maxCustomLoss = 300;
+    ellipse_params.minIntensity = 0.12;
+    ellipse_params.maxDistMeanMode = 1.0;
+    ellipse_params.maxNegPixels = 0;
+    ellipse_params.minScore = 12.0;
+    ellipse_params.maxCC = 100;
+    ellipse_params.percCC = 100;
+    EllipseQuantParams qParams{
+        opsin.xsize(),      opsin.ysize(),        kEllipsePosQ,
+        kEllipseMinSigma,   kEllipseMaxSigma,     kEllipseSigmaQ,
+        kEllipseAngleQ,     kEllipseMinIntensity, kEllipseMaxIntensity,
+        kEllipseIntensityQ, kEllipsePosQ <= 5,    cmap.YtoXRatio(0),
+        cmap.YtoBRatio(0)};
+
+    return DetectGaussianEllipses(opsin, ellipse_params, qParams, pool);
+  }
+  return {};
+}
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/enc_dot_dictionary.h b/third_party/jpeg-xl/lib/jxl/enc_dot_dictionary.h
new file mode 100644
index 0000000000..2ba4393f30
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_dot_dictionary.h
@@ -0,0 +1,34 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_DOT_DICTIONARY_H_
+#define LIB_JXL_ENC_DOT_DICTIONARY_H_
+
+// Dots are stored in a dictionary to avoid storing similar dots multiple
+// times.
+
+#include <stddef.h>
+
+#include <vector>
+
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/chroma_from_luma.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/dec_patch_dictionary.h"
+#include "lib/jxl/enc_bit_writer.h"
+#include "lib/jxl/enc_params.h"
+#include "lib/jxl/enc_patch_dictionary.h"
+#include "lib/jxl/image.h"
+
+namespace jxl {
+
+std::vector<PatchInfo> FindDotDictionary(const CompressParams& cparams,
+                                         const Image3F& opsin,
+                                         const ColorCorrelationMap& cmap,
+                                         ThreadPool* pool);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_DOT_DICTIONARY_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_entropy_coder.cc b/third_party/jpeg-xl/lib/jxl/enc_entropy_coder.cc
new file mode 100644
index 0000000000..c634445e83
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_entropy_coder.cc
@@ -0,0 +1,274 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_entropy_coder.h"
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <algorithm>
+#include <utility>
+#include <vector>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/enc_entropy_coder.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/ac_context.h"
+#include "lib/jxl/ac_strategy.h"
+#include "lib/jxl/base/bits.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/profiler.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/coeff_order.h"
+#include "lib/jxl/coeff_order_fwd.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/dec_ans.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/dec_context_map.h"
+#include "lib/jxl/entropy_coder.h"
+#include "lib/jxl/epf.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/image_ops.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Add;
+using hwy::HWY_NAMESPACE::AndNot;
+using hwy::HWY_NAMESPACE::Eq;
+using hwy::HWY_NAMESPACE::GetLane;
+
+// Returns number of non-zero coefficients (but skip LLF).
+// We cannot rely on block[] being all-zero bits, so first truncate to integer.
+// Also writes the per-8x8 block nzeros starting at nzeros_pos.
+int32_t NumNonZeroExceptLLF(const size_t cx, const size_t cy,
+                            const AcStrategy acs, const size_t covered_blocks,
+                            const size_t log2_covered_blocks,
+                            const int32_t* JXL_RESTRICT block,
+                            const size_t nzeros_stride,
+                            int32_t* JXL_RESTRICT nzeros_pos) {
+  const HWY_CAPPED(int32_t, kBlockDim) di;
+
+  const auto zero = Zero(di);
+  // Add FF..FF for every zero coefficient, negate to get #zeros.
+  auto neg_sum_zero = zero;
+
+  {
+    // Mask sufficient for one row of coefficients.
+    HWY_ALIGN const int32_t
+        llf_mask_lanes[AcStrategy::kMaxCoeffBlocks * (1 + kBlockDim)] = {
+            -1, -1, -1, -1};
+    // First cx=1,2,4 elements are FF..FF, others 0.
+    const int32_t* llf_mask_pos =
+        llf_mask_lanes + AcStrategy::kMaxCoeffBlocks - cx;
+
+    // Rows with LLF: mask out the LLF
+    for (size_t y = 0; y < cy; y++) {
+      for (size_t x = 0; x < cx * kBlockDim; x += Lanes(di)) {
+        const auto llf_mask = LoadU(di, llf_mask_pos + x);
+
+        // LLF counts as zero so we don't include it in nzeros.
+        const auto coef =
+            AndNot(llf_mask, Load(di, &block[y * cx * kBlockDim + x]));
+
+        neg_sum_zero = Add(neg_sum_zero, VecFromMask(di, Eq(coef, zero)));
+      }
+    }
+  }
+
+  // Remaining rows: no mask
+  for (size_t y = cy; y < cy * kBlockDim; y++) {
+    for (size_t x = 0; x < cx * kBlockDim; x += Lanes(di)) {
+      const auto coef = Load(di, &block[y * cx * kBlockDim + x]);
+      neg_sum_zero = Add(neg_sum_zero, VecFromMask(di, Eq(coef, zero)));
+    }
+  }
+
+  // We want area - sum_zero, add because neg_sum_zero is already negated.
+  const int32_t nzeros =
+      int32_t(cx * cy * kDCTBlockSize) + GetLane(SumOfLanes(di, neg_sum_zero));
+
+  const int32_t shifted_nzeros = static_cast<int32_t>(
+      (nzeros + covered_blocks - 1) >> log2_covered_blocks);
+  // Need non-canonicalized dimensions!
+  for (size_t y = 0; y < acs.covered_blocks_y(); y++) {
+    for (size_t x = 0; x < acs.covered_blocks_x(); x++) {
+      nzeros_pos[x + y * nzeros_stride] = shifted_nzeros;
+    }
+  }
+
+  return nzeros;
+}
+
+// Specialization for 8x8, where only top-left is LLF/DC.
+// About 1% overall speedup vs. NumNonZeroExceptLLF.
+int32_t NumNonZero8x8ExceptDC(const int32_t* JXL_RESTRICT block,
+                              int32_t* JXL_RESTRICT nzeros_pos) {
+  const HWY_CAPPED(int32_t, kBlockDim) di;
+
+  const auto zero = Zero(di);
+  // Add FF..FF for every zero coefficient, negate to get #zeros.
+  auto neg_sum_zero = zero;
+
+  {
+    // First row has DC, so mask
+    const size_t y = 0;
+    HWY_ALIGN const int32_t dc_mask_lanes[kBlockDim] = {-1};
+
+    for (size_t x = 0; x < kBlockDim; x += Lanes(di)) {
+      const auto dc_mask = Load(di, dc_mask_lanes + x);
+
+      // DC counts as zero so we don't include it in nzeros.
+      const auto coef = AndNot(dc_mask, Load(di, &block[y * kBlockDim + x]));
+
+      neg_sum_zero = Add(neg_sum_zero, VecFromMask(di, Eq(coef, zero)));
+    }
+  }
+
+  // Remaining rows: no mask
+  for (size_t y = 1; y < kBlockDim; y++) {
+    for (size_t x = 0; x < kBlockDim; x += Lanes(di)) {
+      const auto coef = Load(di, &block[y * kBlockDim + x]);
+      neg_sum_zero = Add(neg_sum_zero, VecFromMask(di, Eq(coef, zero)));
+    }
+  }
+
+  // We want 64 - sum_zero, add because neg_sum_zero is already negated.
+  const int32_t nzeros =
+      int32_t(kDCTBlockSize) + GetLane(SumOfLanes(di, neg_sum_zero));
+
+  *nzeros_pos = nzeros;
+
+  return nzeros;
+}
+
+// The number of nonzeros of each block is predicted from the top and the left
+// blocks, with opportune scaling to take into account the number of blocks of
+// each strategy.  The predicted number of nonzeros divided by two is used as a
+// context; if this number is above 63, a specific context is used.  If the
+// number of nonzeros of a strategy is above 63, it is written directly using a
+// fixed number of bits (that depends on the size of the strategy).
+void TokenizeCoefficients(const coeff_order_t* JXL_RESTRICT orders,
+                          const Rect& rect,
+                          const int32_t* JXL_RESTRICT* JXL_RESTRICT ac_rows,
+                          const AcStrategyImage& ac_strategy,
+                          YCbCrChromaSubsampling cs,
+                          Image3I* JXL_RESTRICT tmp_num_nzeroes,
+                          std::vector<Token>* JXL_RESTRICT output,
+                          const ImageB& qdc, const ImageI& qf,
+                          const BlockCtxMap& block_ctx_map) {
+  const size_t xsize_blocks = rect.xsize();
+  const size_t ysize_blocks = rect.ysize();
+
+  // TODO(user): update the estimate: usually less coefficients are used.
+  output->reserve(output->size() +
+                  3 * xsize_blocks * ysize_blocks * kDCTBlockSize);
+
+  size_t offset[3] = {};
+  const size_t nzeros_stride = tmp_num_nzeroes->PixelsPerRow();
+  for (size_t by = 0; by < ysize_blocks; ++by) {
+    size_t sby[3] = {by >> cs.VShift(0), by >> cs.VShift(1),
+                     by >> cs.VShift(2)};
+    int32_t* JXL_RESTRICT row_nzeros[3] = {
+        tmp_num_nzeroes->PlaneRow(0, sby[0]),
+        tmp_num_nzeroes->PlaneRow(1, sby[1]),
+        tmp_num_nzeroes->PlaneRow(2, sby[2]),
+    };
+    const int32_t* JXL_RESTRICT row_nzeros_top[3] = {
+        sby[0] == 0 ? nullptr : tmp_num_nzeroes->ConstPlaneRow(0, sby[0] - 1),
+        sby[1] == 0 ? nullptr : tmp_num_nzeroes->ConstPlaneRow(1, sby[1] - 1),
+        sby[2] == 0 ? nullptr : tmp_num_nzeroes->ConstPlaneRow(2, sby[2] - 1),
+    };
+    const uint8_t* JXL_RESTRICT row_qdc =
+        qdc.ConstRow(rect.y0() + by) + rect.x0();
+    const int32_t* JXL_RESTRICT row_qf = rect.ConstRow(qf, by);
+    AcStrategyRow acs_row = ac_strategy.ConstRow(rect, by);
+    for (size_t bx = 0; bx < xsize_blocks; ++bx) {
+      AcStrategy acs = acs_row[bx];
+      if (!acs.IsFirstBlock()) continue;
+      size_t sbx[3] = {bx >> cs.HShift(0), bx >> cs.HShift(1),
+                       bx >> cs.HShift(2)};
+      size_t cx = acs.covered_blocks_x();
+      size_t cy = acs.covered_blocks_y();
+      const size_t covered_blocks = cx * cy;  // = #LLF coefficients
+      const size_t log2_covered_blocks =
+          Num0BitsBelowLS1Bit_Nonzero(covered_blocks);
+      const size_t size = covered_blocks * kDCTBlockSize;
+
+      CoefficientLayout(&cy, &cx);  // swap cx/cy to canonical order
+
+      for (int c : {1, 0, 2}) {
+        if (sbx[c] << cs.HShift(c) != bx) continue;
+        if (sby[c] << cs.VShift(c) != by) continue;
+        const int32_t* JXL_RESTRICT block = ac_rows[c] + offset[c];
+
+        int32_t nzeros =
+            (covered_blocks == 1)
+                ? NumNonZero8x8ExceptDC(block, row_nzeros[c] + sbx[c])
+                : NumNonZeroExceptLLF(cx, cy, acs, covered_blocks,
+                                      log2_covered_blocks, block, nzeros_stride,
+                                      row_nzeros[c] + sbx[c]);
+
+        int ord = kStrategyOrder[acs.RawStrategy()];
+        const coeff_order_t* JXL_RESTRICT order =
+            &orders[CoeffOrderOffset(ord, c)];
+
+        int32_t predicted_nzeros =
+            PredictFromTopAndLeft(row_nzeros_top[c], row_nzeros[c], sbx[c], 32);
+        size_t block_ctx =
+            block_ctx_map.Context(row_qdc[bx], row_qf[sbx[c]], ord, c);
+        const int32_t nzero_ctx =
+            block_ctx_map.NonZeroContext(predicted_nzeros, block_ctx);
+
+        output->emplace_back(nzero_ctx, nzeros);
+        const size_t histo_offset =
+            block_ctx_map.ZeroDensityContextsOffset(block_ctx);
+        // Skip LLF.
+        size_t prev = (nzeros > static_cast<ssize_t>(size / 16) ? 0 : 1);
+        for (size_t k = covered_blocks; k < size && nzeros != 0; ++k) {
+          int32_t coeff = block[order[k]];
+          size_t ctx =
+              histo_offset + ZeroDensityContext(nzeros, k, covered_blocks,
+                                                log2_covered_blocks, prev);
+          uint32_t u_coeff = PackSigned(coeff);
+          output->emplace_back(ctx, u_coeff);
+          prev = coeff != 0;
+          nzeros -= prev;
+        }
+        JXL_DASSERT(nzeros == 0);
+        offset[c] += size;
+      }
+    }
+  }
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+HWY_EXPORT(TokenizeCoefficients);
+void TokenizeCoefficients(const coeff_order_t* JXL_RESTRICT orders,
+                          const Rect& rect,
+                          const int32_t* JXL_RESTRICT* JXL_RESTRICT ac_rows,
+                          const AcStrategyImage& ac_strategy,
+                          YCbCrChromaSubsampling cs,
+                          Image3I* JXL_RESTRICT tmp_num_nzeroes,
+                          std::vector<Token>* JXL_RESTRICT output,
+                          const ImageB& qdc, const ImageI& qf,
+                          const BlockCtxMap& block_ctx_map) {
+  return HWY_DYNAMIC_DISPATCH(TokenizeCoefficients)(
+      orders, rect, ac_rows, ac_strategy, cs, tmp_num_nzeroes, output, qdc, qf,
+      block_ctx_map);
+}
+
+}  // namespace jxl
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jxl/enc_entropy_coder.h b/third_party/jpeg-xl/lib/jxl/enc_entropy_coder.h
new file mode 100644
index 0000000000..7dfc71c726
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_entropy_coder.h
@@ -0,0 +1,46 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_ENTROPY_CODER_H_
+#define LIB_JXL_ENC_ENTROPY_CODER_H_
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include "lib/jxl/ac_context.h"  // BlockCtxMap
+#include "lib/jxl/ac_strategy.h"
+#include "lib/jxl/enc_ans.h"
+#include "lib/jxl/field_encodings.h"
+#include "lib/jxl/frame_header.h"  // YCbCrChromaSubsampling
+#include "lib/jxl/image.h"
+
+// Entropy coding and context modeling of DC and AC coefficients, as well as AC
+// strategy and quantization field.
+
+namespace jxl {
+
+// Generate DCT NxN quantized AC values tokens.
+// Only the subset "rect" [in units of blocks] within all images.
+// See also DecodeACVarBlock.
+void TokenizeCoefficients(const coeff_order_t* JXL_RESTRICT orders,
+                          const Rect& rect,
+                          const int32_t* JXL_RESTRICT* JXL_RESTRICT ac_rows,
+                          const AcStrategyImage& ac_strategy,
+                          YCbCrChromaSubsampling cs,
+                          Image3I* JXL_RESTRICT tmp_num_nzeroes,
+                          std::vector<Token>* JXL_RESTRICT output,
+                          const ImageB& qdc, const ImageI& qf,
+                          const BlockCtxMap& block_ctx_map);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_ENTROPY_CODER_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_external_image.cc b/third_party/jpeg-xl/lib/jxl/enc_external_image.cc
new file mode 100644
index 0000000000..1408746476
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_external_image.cc
@@ -0,0 +1,183 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_external_image.h"
+
+#include <jxl/types.h>
+#include <string.h>
+
+#include <algorithm>
+#include <array>
+#include <atomic>
+#include <functional>
+#include <utility>
+#include <vector>
+
+#include "lib/jxl/alpha.h"
+#include "lib/jxl/base/byte_order.h"
+#include "lib/jxl/base/float.h"
+#include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/color_management.h"
+#include "lib/jxl/common.h"
+
+namespace jxl {
+namespace {
+
+size_t JxlDataTypeBytes(JxlDataType data_type) {
+  switch (data_type) {
+    case JXL_TYPE_UINT8:
+      return 1;
+    case JXL_TYPE_UINT16:
+      return 2;
+    case JXL_TYPE_FLOAT16:
+      return 2;
+    case JXL_TYPE_FLOAT:
+      return 4;
+    default:
+      return 0;
+  }
+}
+
+}  // namespace
+
+Status ConvertFromExternal(Span<const uint8_t> bytes, size_t xsize,
+                           size_t ysize, size_t bits_per_sample,
+                           JxlPixelFormat format, size_t c, ThreadPool* pool,
+                           ImageF* channel) {
+  if (format.data_type == JXL_TYPE_UINT8) {
+    JXL_RETURN_IF_ERROR(bits_per_sample > 0 && bits_per_sample <= 8);
+  } else if (format.data_type == JXL_TYPE_UINT16) {
+    JXL_RETURN_IF_ERROR(bits_per_sample > 8 && bits_per_sample <= 16);
+  } else if (format.data_type == JXL_TYPE_FLOAT16) {
+    JXL_RETURN_IF_ERROR(bits_per_sample == 16);
+  } else if (format.data_type == JXL_TYPE_FLOAT) {
+    JXL_RETURN_IF_ERROR(bits_per_sample == 32);
+  } else {
+    JXL_FAILURE("unsupported pixel format data type %d", format.data_type);
+  }
+  size_t bytes_per_channel = JxlDataTypeBytes(format.data_type);
+  size_t bytes_per_pixel = format.num_channels * bytes_per_channel;
+  size_t pixel_offset = c * bytes_per_channel;
+  // Only for uint8/16.
+  float scale = 1. / ((1ull << bits_per_sample) - 1);
+
+  const size_t last_row_size = xsize * bytes_per_pixel;
+  const size_t align = format.align;
+  const size_t row_size =
+      (align > 1 ? jxl::DivCeil(last_row_size, align) * align : last_row_size);
+  const size_t bytes_to_read = row_size * (ysize - 1) + last_row_size;
+  if (xsize == 0 || ysize == 0) return JXL_FAILURE("Empty image");
+  if (bytes.size() < bytes_to_read) {
+    return JXL_FAILURE("Buffer size is too small, expected: %" PRIuS
+                       " got: %" PRIuS " (Image: %" PRIuS "x%" PRIuS
+                       "x%u, bytes_per_channel: %" PRIuS ")",
+                       bytes_to_read, bytes.size(), xsize, ysize,
+                       format.num_channels, bytes_per_channel);
+  }
+  JXL_ASSERT(channel->xsize() == xsize);
+  JXL_ASSERT(channel->ysize() == ysize);
+  // Too large buffer is likely an application bug, so also fail for that.
+  // Do allow padding to stride in last row though.
+  if (bytes.size() > row_size * ysize) {
+    return JXL_FAILURE("Buffer size is too large");
+  }
+
+  const bool little_endian =
+      format.endianness == JXL_LITTLE_ENDIAN ||
+      (format.endianness == JXL_NATIVE_ENDIAN && IsLittleEndian());
+
+  const uint8_t* const in = bytes.data();
+
+  std::atomic<size_t> error_count = {0};
+
+  const auto convert_row = [&](const uint32_t task, size_t /*thread*/) {
+    const size_t y = task;
+    size_t offset = row_size * task + pixel_offset;
+    float* JXL_RESTRICT row_out = channel->Row(y);
+    const auto save_value = [&](size_t index, float value) {
+      row_out[index] = value;
+    };
+    if (!LoadFloatRow(in + offset, xsize, bytes_per_pixel, format.data_type,
+                      little_endian, scale, save_value)) {
+      error_count++;
+    }
+  };
+  JXL_RETURN_IF_ERROR(RunOnPool(pool, 0, static_cast<uint32_t>(ysize),
+                                ThreadPool::NoInit, convert_row,
+                                "ConvertExtraChannel"));
+
+  if (error_count) {
+    JXL_FAILURE("unsupported pixel format data type");
+  }
+
+  return true;
+}
+Status ConvertFromExternal(Span<const uint8_t> bytes, size_t xsize,
+                           size_t ysize, const ColorEncoding& c_current,
+                           size_t bits_per_sample, JxlPixelFormat format,
+                           ThreadPool* pool, ImageBundle* ib) {
+  const size_t color_channels = c_current.Channels();
+  bool has_alpha = format.num_channels == 2 || format.num_channels == 4;
+  if (format.num_channels < color_channels) {
+    return JXL_FAILURE("Expected %" PRIuS
+                       " color channels, received only %u channels",
+                       color_channels, format.num_channels);
+  }
+
+  Image3F color(xsize, ysize);
+  for (size_t c = 0; c < color_channels; ++c) {
+    JXL_RETURN_IF_ERROR(ConvertFromExternal(bytes, xsize, ysize,
+                                            bits_per_sample, format, c, pool,
+                                            &color.Plane(c)));
+  }
+  if (color_channels == 1) {
+    CopyImageTo(color.Plane(0), &color.Plane(1));
+    CopyImageTo(color.Plane(0), &color.Plane(2));
+  }
+  ib->SetFromImage(std::move(color), c_current);
+
+  // Passing an interleaved image with an alpha channel to an image that doesn't
+  // have alpha channel just discards the passed alpha channel.
+  if (has_alpha && ib->HasAlpha()) {
+    ImageF alpha(xsize, ysize);
+    JXL_RETURN_IF_ERROR(
+        ConvertFromExternal(bytes, xsize, ysize, bits_per_sample, format,
+                            format.num_channels - 1, pool, &alpha));
+    ib->SetAlpha(std::move(alpha));
+  } else if (!has_alpha && ib->HasAlpha()) {
+    // if alpha is not passed, but it is expected, then assume
+    // it is all-opaque
+    ImageF alpha(xsize, ysize);
+    FillImage(1.0f, &alpha);
+    ib->SetAlpha(std::move(alpha));
+  }
+
+  return true;
+}
+
+Status BufferToImageF(const JxlPixelFormat& pixel_format, size_t xsize,
+                      size_t ysize, const void* buffer, size_t size,
+                      ThreadPool* pool, ImageF* channel) {
+  size_t bitdepth = JxlDataTypeBytes(pixel_format.data_type) * kBitsPerByte;
+  return ConvertFromExternal(
+      jxl::Span<const uint8_t>(static_cast<const uint8_t*>(buffer), size),
+      xsize, ysize, bitdepth, pixel_format, 0, pool, channel);
+}
+
+Status BufferToImageBundle(const JxlPixelFormat& pixel_format, uint32_t xsize,
+                           uint32_t ysize, const void* buffer, size_t size,
+                           jxl::ThreadPool* pool,
+                           const jxl::ColorEncoding& c_current,
+                           jxl::ImageBundle* ib) {
+  size_t bitdepth = JxlDataTypeBytes(pixel_format.data_type) * kBitsPerByte;
+  JXL_RETURN_IF_ERROR(ConvertFromExternal(
+      jxl::Span<const uint8_t>(static_cast<const uint8_t*>(buffer), size),
+      xsize, ysize, c_current, bitdepth, pixel_format, pool, ib));
+  ib->VerifyMetadata();
+
+  return true;
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/enc_external_image.h b/third_party/jpeg-xl/lib/jxl/enc_external_image.h
new file mode 100644
index 0000000000..3b2b295076
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_external_image.h
@@ -0,0 +1,45 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_EXTERNAL_IMAGE_H_
+#define LIB_JXL_ENC_EXTERNAL_IMAGE_H_
+
+// Interleaved image for color transforms and Codec.
+
+#include <jxl/types.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/padded_bytes.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/color_encoding_internal.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/image_bundle.h"
+
+namespace jxl {
+Status ConvertFromExternal(Span<const uint8_t> bytes, size_t xsize,
+                           size_t ysize, size_t bits_per_sample,
+                           JxlPixelFormat format, size_t c, ThreadPool* pool,
+                           ImageF* channel);
+
+// Convert an interleaved pixel buffer to the internal ImageBundle
+// representation. This is the opposite of ConvertToExternal().
+Status ConvertFromExternal(Span<const uint8_t> bytes, size_t xsize,
+                           size_t ysize, const ColorEncoding& c_current,
+                           size_t bits_per_sample, JxlPixelFormat format,
+                           ThreadPool* pool, ImageBundle* ib);
+Status BufferToImageF(const JxlPixelFormat& pixel_format, size_t xsize,
+                      size_t ysize, const void* buffer, size_t size,
+                      ThreadPool* pool, ImageF* channel);
+Status BufferToImageBundle(const JxlPixelFormat& pixel_format, uint32_t xsize,
+                           uint32_t ysize, const void* buffer, size_t size,
+                           jxl::ThreadPool* pool,
+                           const jxl::ColorEncoding& c_current,
+                           jxl::ImageBundle* ib);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_EXTERNAL_IMAGE_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_external_image_gbench.cc b/third_party/jpeg-xl/lib/jxl/enc_external_image_gbench.cc
new file mode 100644
index 0000000000..4b7147817a
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_external_image_gbench.cc
@@ -0,0 +1,46 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "benchmark/benchmark.h"
+#include "lib/jxl/enc_external_image.h"
+#include "lib/jxl/image_ops.h"
+
+namespace jxl {
+namespace {
+
+// Encoder case, deinterleaves a buffer.
+void BM_EncExternalImage_ConvertImageRGBA(benchmark::State& state) {
+  const size_t kNumIter = 5;
+  size_t xsize = state.range();
+  size_t ysize = state.range();
+
+  ImageMetadata im;
+  im.SetAlphaBits(8);
+  ImageBundle ib(&im);
+
+  std::vector<uint8_t> interleaved(xsize * ysize * 4);
+  JxlPixelFormat format = {4, JXL_TYPE_UINT8, JXL_NATIVE_ENDIAN, 0};
+  for (auto _ : state) {
+    for (size_t i = 0; i < kNumIter; ++i) {
+      JXL_CHECK(ConvertFromExternal(
+          Span<const uint8_t>(interleaved.data(), interleaved.size()), xsize,
+          ysize,
+          /*c_current=*/ColorEncoding::SRGB(),
+          /*bits_per_sample=*/8, format,
+          /*pool=*/nullptr, &ib));
+    }
+  }
+
+  // Pixels per second.
+  state.SetItemsProcessed(kNumIter * state.iterations() * xsize * ysize);
+  state.SetBytesProcessed(kNumIter * state.iterations() * interleaved.size());
+}
+
+BENCHMARK(BM_EncExternalImage_ConvertImageRGBA)
+    ->RangeMultiplier(2)
+    ->Range(256, 2048);
+
+}  // namespace
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/enc_external_image_test.cc b/third_party/jpeg-xl/lib/jxl/enc_external_image_test.cc
new file mode 100644
index 0000000000..7be8d45f2d
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_external_image_test.cc
@@ -0,0 +1,79 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_external_image.h"
+
+#include <array>
+#include <new>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/color_encoding_internal.h"
+#include "lib/jxl/image_ops.h"
+#include "lib/jxl/image_test_utils.h"
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+namespace {
+
+#if !defined(JXL_CRASH_ON_ERROR)
+TEST(ExternalImageTest, InvalidSize) {
+  ImageMetadata im;
+  im.SetAlphaBits(8);
+  ImageBundle ib(&im);
+
+  JxlPixelFormat format = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+  const uint8_t buf[10 * 100 * 8] = {};
+  EXPECT_FALSE(ConvertFromExternal(
+      Span<const uint8_t>(buf, 10), /*xsize=*/10, /*ysize=*/100,
+      /*c_current=*/ColorEncoding::SRGB(),
+      /*bits_per_sample=*/16, format, nullptr, &ib));
+  EXPECT_FALSE(ConvertFromExternal(
+      Span<const uint8_t>(buf, sizeof(buf) - 1), /*xsize=*/10, /*ysize=*/100,
+      /*c_current=*/ColorEncoding::SRGB(),
+      /*bits_per_sample=*/16, format, nullptr, &ib));
+  EXPECT_TRUE(
+      ConvertFromExternal(Span<const uint8_t>(buf, sizeof(buf)), /*xsize=*/10,
+                          /*ysize=*/100, /*c_current=*/ColorEncoding::SRGB(),
+                          /*bits_per_sample=*/16, format, nullptr, &ib));
+}
+#endif
+
+TEST(ExternalImageTest, AlphaMissing) {
+  ImageMetadata im;
+  im.SetAlphaBits(0);  // No alpha
+  ImageBundle ib(&im);
+
+  const size_t xsize = 10;
+  const size_t ysize = 20;
+  const uint8_t buf[xsize * ysize * 4] = {};
+
+  JxlPixelFormat format = {4, JXL_TYPE_UINT8, JXL_BIG_ENDIAN, 0};
+  // has_alpha is true but the ImageBundle has no alpha. Alpha channel should
+  // be ignored.
+  EXPECT_TRUE(ConvertFromExternal(Span<const uint8_t>(buf, sizeof(buf)), xsize,
+                                  ysize,
+                                  /*c_current=*/ColorEncoding::SRGB(),
+                                  /*bits_per_sample=*/8, format, nullptr, &ib));
+  EXPECT_FALSE(ib.HasAlpha());
+}
+
+TEST(ExternalImageTest, AlphaPremultiplied) {
+  ImageMetadata im;
+  im.SetAlphaBits(8, true);
+
+  ImageBundle ib(&im);
+  const size_t xsize = 10;
+  const size_t ysize = 20;
+  const size_t size = xsize * ysize * 8;
+  const uint8_t buf[size] = {};
+
+  JxlPixelFormat format = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+  EXPECT_TRUE(BufferToImageBundle(format, xsize, ysize, buf, size, nullptr,
+                                  ColorEncoding::SRGB(), &ib));
+}
+
+}  // namespace
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/enc_fast_lossless.cc b/third_party/jpeg-xl/lib/jxl/enc_fast_lossless.cc
new file mode 100644
index 0000000000..286990ee8a
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_fast_lossless.cc
@@ -0,0 +1,3860 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef FJXL_SELF_INCLUDE
+
+#include "lib/jxl/enc_fast_lossless.h"
+
+#include <assert.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <algorithm>
+#include <array>
+#include <limits>
+#include <memory>
+#include <vector>
+
+// Enable NEON and AVX2/AVX512 if not asked to do otherwise and the compilers
+// support it.
+#if defined(__aarch64__) || defined(_M_ARM64)
+#include <arm_neon.h>
+
+#ifndef FJXL_ENABLE_NEON
+#define FJXL_ENABLE_NEON 1
+#endif
+
+#elif (defined(__x86_64__) || defined(_M_X64)) && !defined(_MSC_VER)
+#include <immintrin.h>
+
+// manually add _mm512_cvtsi512_si32 definition if missing
+// (e.g. with Xcode on macOS Mojave)
+// copied from gcc 11.1.0 include/avx512fintrin.h line 14367-14373
+#if defined(__clang__) &&                                           \
+    ((!defined(__apple_build_version__) && __clang_major__ < 10) || \
+     (defined(__apple_build_version__) && __apple_build_version__ < 12000032))
+inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtsi512_si32(__m512i __A) {
+  __v16si __B = (__v16si)__A;
+  return __B[0];
+}
+#endif
+
+// TODO(veluca): MSVC support for dynamic dispatch.
+#if defined(__clang__) || defined(__GNUC__)
+
+#ifndef FJXL_ENABLE_AVX2
+#define FJXL_ENABLE_AVX2 1
+#endif
+
+#ifndef FJXL_ENABLE_AVX512
+// On clang-7 or earlier, and gcc-10 or earlier, AVX512 seems broken.
+#if (defined(__clang__) &&                                             \
+         (!defined(__apple_build_version__) && __clang_major__ > 7) || \
+     (defined(__apple_build_version__) &&                              \
+      __apple_build_version__ > 10010046)) ||                          \
+    (defined(__GNUC__) && __GNUC__ > 10)
+#define FJXL_ENABLE_AVX512 1
+#endif
+#endif
+
+#endif
+
+#endif
+
+#ifndef FJXL_ENABLE_NEON
+#define FJXL_ENABLE_NEON 0
+#endif
+
+#ifndef FJXL_ENABLE_AVX2
+#define FJXL_ENABLE_AVX2 0
+#endif
+
+#ifndef FJXL_ENABLE_AVX512
+#define FJXL_ENABLE_AVX512 0
+#endif
+
+namespace {
+#if defined(_MSC_VER) && !defined(__clang__)
+#define FJXL_INLINE __forceinline
+FJXL_INLINE uint32_t FloorLog2(uint32_t v) {
+  unsigned long index;
+  _BitScanReverse(&index, v);
+  return index;
+}
+FJXL_INLINE uint32_t CtzNonZero(uint64_t v) {
+  unsigned long index;
+  _BitScanForward(&index, v);
+  return index;
+}
+#else
+#define FJXL_INLINE inline __attribute__((always_inline))
+FJXL_INLINE uint32_t FloorLog2(uint32_t v) {
+  return v ? 31 - __builtin_clz(v) : 0;
+}
+FJXL_INLINE uint32_t CtzNonZero(uint64_t v) { return __builtin_ctzll(v); }
+#endif
+
+// Compiles to a memcpy on little-endian systems.
+FJXL_INLINE void StoreLE64(uint8_t* tgt, uint64_t data) {
+#if (!defined(__BYTE_ORDER__) || (__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__))
+  for (int i = 0; i < 8; i++) {
+    tgt[i] = (data >> (i * 8)) & 0xFF;
+  }
+#else
+  memcpy(tgt, &data, 8);
+#endif
+}
+
+FJXL_INLINE size_t AddBits(uint32_t count, uint64_t bits, uint8_t* data_buf,
+                           size_t& bits_in_buffer, uint64_t& bit_buffer) {
+  bit_buffer |= bits << bits_in_buffer;
+  bits_in_buffer += count;
+  StoreLE64(data_buf, bit_buffer);
+  size_t bytes_in_buffer = bits_in_buffer / 8;
+  bits_in_buffer -= bytes_in_buffer * 8;
+  bit_buffer >>= bytes_in_buffer * 8;
+  return bytes_in_buffer;
+}
+
+struct BitWriter {
+  void Allocate(size_t maximum_bit_size) {
+    assert(data == nullptr);
+    // Leave some padding.
+    data.reset(static_cast<uint8_t*>(malloc(maximum_bit_size / 8 + 64)));
+  }
+
+  void Write(uint32_t count, uint64_t bits) {
+    bytes_written += AddBits(count, bits, data.get() + bytes_written,
+                             bits_in_buffer, buffer);
+  }
+
+  void ZeroPadToByte() {
+    if (bits_in_buffer != 0) {
+      Write(8 - bits_in_buffer, 0);
+    }
+  }
+
+  FJXL_INLINE void WriteMultiple(const uint64_t* nbits, const uint64_t* bits,
+                                 size_t n) {
+    // Necessary because Write() is only guaranteed to work with <=56 bits.
+    // Trying to SIMD-fy this code results in lower speed (and definitely less
+    // clarity).
+    {
+      for (size_t i = 0; i < n; i++) {
+        this->buffer |= bits[i] << this->bits_in_buffer;
+        memcpy(this->data.get() + this->bytes_written, &this->buffer, 8);
+        uint64_t shift = 64 - this->bits_in_buffer;
+        this->bits_in_buffer += nbits[i];
+        // This `if` seems to be faster than using ternaries.
+        if (this->bits_in_buffer >= 64) {
+          uint64_t next_buffer = bits[i] >> shift;
+          this->buffer = next_buffer;
+          this->bits_in_buffer -= 64;
+          this->bytes_written += 8;
+        }
+      }
+      memcpy(this->data.get() + this->bytes_written, &this->buffer, 8);
+      size_t bytes_in_buffer = this->bits_in_buffer / 8;
+      this->bits_in_buffer -= bytes_in_buffer * 8;
+      this->buffer >>= bytes_in_buffer * 8;
+      this->bytes_written += bytes_in_buffer;
+    }
+  }
+
+  std::unique_ptr<uint8_t[], void (*)(void*)> data = {nullptr, free};
+  size_t bytes_written = 0;
+  size_t bits_in_buffer = 0;
+  uint64_t buffer = 0;
+};
+
+}  // namespace
+
+extern "C" {
+
+struct JxlFastLosslessFrameState {
+  size_t width;
+  size_t height;
+  size_t nb_chans;
+  size_t bitdepth;
+  BitWriter header;
+  std::vector<std::array<BitWriter, 4>> group_data;
+  size_t current_bit_writer = 0;
+  size_t bit_writer_byte_pos = 0;
+  size_t bits_in_buffer = 0;
+  uint64_t bit_buffer = 0;
+};
+
+size_t JxlFastLosslessOutputSize(const JxlFastLosslessFrameState* frame) {
+  size_t total_size_groups = 0;
+  for (size_t i = 0; i < frame->group_data.size(); i++) {
+    size_t sz = 0;
+    for (size_t j = 0; j < frame->nb_chans; j++) {
+      const auto& writer = frame->group_data[i][j];
+      sz += writer.bytes_written * 8 + writer.bits_in_buffer;
+    }
+    sz = (sz + 7) / 8;
+    total_size_groups += sz;
+  }
+  return frame->header.bytes_written + total_size_groups;
+}
+
+size_t JxlFastLosslessMaxRequiredOutput(
+    const JxlFastLosslessFrameState* frame) {
+  return JxlFastLosslessOutputSize(frame) + 32;
+}
+
+void JxlFastLosslessPrepareHeader(JxlFastLosslessFrameState* frame,
+                                  int add_image_header, int is_last) {
+  BitWriter* output = &frame->header;
+  output->Allocate(1000 + frame->group_data.size() * 32);
+
+  std::vector<size_t> group_sizes(frame->group_data.size());
+  for (size_t i = 0; i < frame->group_data.size(); i++) {
+    size_t sz = 0;
+    for (size_t j = 0; j < frame->nb_chans; j++) {
+      const auto& writer = frame->group_data[i][j];
+      sz += writer.bytes_written * 8 + writer.bits_in_buffer;
+    }
+    sz = (sz + 7) / 8;
+    group_sizes[i] = sz;
+  }
+
+  bool have_alpha = (frame->nb_chans == 2 || frame->nb_chans == 4);
+
+  if (add_image_header) {
+    // Signature
+    output->Write(16, 0x0AFF);
+
+    // Size header, hand-crafted.
+    // Not small
+    output->Write(1, 0);
+
+    auto wsz = [output](size_t size) {
+      if (size - 1 < (1 << 9)) {
+        output->Write(2, 0b00);
+        output->Write(9, size - 1);
+      } else if (size - 1 < (1 << 13)) {
+        output->Write(2, 0b01);
+        output->Write(13, size - 1);
+      } else if (size - 1 < (1 << 18)) {
+        output->Write(2, 0b10);
+        output->Write(18, size - 1);
+      } else {
+        output->Write(2, 0b11);
+        output->Write(30, size - 1);
+      }
+    };
+
+    wsz(frame->height);
+
+    // No special ratio.
+    output->Write(3, 0);
+
+    wsz(frame->width);
+
+    // Hand-crafted ImageMetadata.
+    output->Write(1, 0);  // all_default
+    output->Write(1, 0);  // extra_fields
+    output->Write(1, 0);  // bit_depth.floating_point_sample
+    if (frame->bitdepth == 8) {
+      output->Write(2, 0b00);  // bit_depth.bits_per_sample = 8
+    } else if (frame->bitdepth == 10) {
+      output->Write(2, 0b01);  // bit_depth.bits_per_sample = 10
+    } else if (frame->bitdepth == 12) {
+      output->Write(2, 0b10);  // bit_depth.bits_per_sample = 12
+    } else {
+      output->Write(2, 0b11);  // 1 + u(6)
+      output->Write(6, frame->bitdepth - 1);
+    }
+    if (frame->bitdepth <= 14) {
+      output->Write(1, 1);  // 16-bit-buffer sufficient
+    } else {
+      output->Write(1, 0);  // 16-bit-buffer NOT sufficient
+    }
+    if (have_alpha) {
+      output->Write(2, 0b01);  // One extra channel
+      output->Write(1, 1);     // ... all_default (ie. 8-bit alpha)
+    } else {
+      output->Write(2, 0b00);  // No extra channel
+    }
+    output->Write(1, 0);  // Not XYB
+    if (frame->nb_chans > 2) {
+      output->Write(1, 1);  // color_encoding.all_default (sRGB)
+    } else {
+      output->Write(1, 0);     // color_encoding.all_default false
+      output->Write(1, 0);     // color_encoding.want_icc false
+      output->Write(2, 1);     // grayscale
+      output->Write(2, 1);     // D65
+      output->Write(1, 0);     // no gamma transfer function
+      output->Write(2, 0b10);  // tf: 2 + u(4)
+      output->Write(4, 11);    // tf of sRGB
+      output->Write(2, 1);     // relative rendering intent
+    }
+    output->Write(2, 0b00);  // No extensions.
+
+    output->Write(1, 1);  // all_default transform data
+
+    // No ICC, no preview. Frame should start at byte boundery.
+    output->ZeroPadToByte();
+  }
+
+  // Handcrafted frame header.
+  output->Write(1, 0);     // all_default
+  output->Write(2, 0b00);  // regular frame
+  output->Write(1, 1);     // modular
+  output->Write(2, 0b00);  // default flags
+  output->Write(1, 0);     // not YCbCr
+  output->Write(2, 0b00);  // no upsampling
+  if (have_alpha) {
+    output->Write(2, 0b00);  // no alpha upsampling
+  }
+  output->Write(2, 0b01);  // default group size
+  output->Write(2, 0b00);  // exactly one pass
+  output->Write(1, 0);     // no custom size or origin
+  output->Write(2, 0b00);  // kReplace blending mode
+  if (have_alpha) {
+    output->Write(2, 0b00);  // kReplace blending mode for alpha channel
+  }
+  output->Write(1, is_last);  // is_last
+  output->Write(2, 0b00);     // a frame has no name
+  output->Write(1, 0);        // loop filter is not all_default
+  output->Write(1, 0);        // no gaborish
+  output->Write(2, 0);        // 0 EPF iters
+  output->Write(2, 0b00);     // No LF extensions
+  output->Write(2, 0b00);     // No FH extensions
+
+  output->Write(1, 0);      // No TOC permutation
+  output->ZeroPadToByte();  // TOC is byte-aligned.
+  for (size_t i = 0; i < frame->group_data.size(); i++) {
+    size_t sz = group_sizes[i];
+    if (sz < (1 << 10)) {
+      output->Write(2, 0b00);
+      output->Write(10, sz);
+    } else if (sz - 1024 < (1 << 14)) {
+      output->Write(2, 0b01);
+      output->Write(14, sz - 1024);
+    } else if (sz - 17408 < (1 << 22)) {
+      output->Write(2, 0b10);
+      output->Write(22, sz - 17408);
+    } else {
+      output->Write(2, 0b11);
+      output->Write(30, sz - 4211712);
+    }
+  }
+  output->ZeroPadToByte();  // Groups are byte-aligned.
+}
+
+#if FJXL_ENABLE_AVX512
+__attribute__((target("avx512vbmi2"))) static size_t AppendBytesWithBitOffset(
+    const uint8_t* data, size_t n, size_t bit_buffer_nbits,
+    unsigned char* output, uint64_t& bit_buffer) {
+  if (n < 128) {
+    return 0;
+  }
+
+  size_t i = 0;
+  __m512i shift = _mm512_set1_epi64(64 - bit_buffer_nbits);
+  __m512i carry = _mm512_set1_epi64(bit_buffer << (64 - bit_buffer_nbits));
+
+  for (; i + 64 <= n; i += 64) {
+    __m512i current = _mm512_loadu_si512(data + i);
+    __m512i previous_u64 = _mm512_alignr_epi64(current, carry, 7);
+    carry = current;
+    __m512i out = _mm512_shrdv_epi64(previous_u64, current, shift);
+    _mm512_storeu_si512(output + i, out);
+  }
+
+  bit_buffer = data[i - 1] >> (8 - bit_buffer_nbits);
+
+  return i;
+}
+#endif
+
+size_t JxlFastLosslessWriteOutput(JxlFastLosslessFrameState* frame,
+                                  unsigned char* output, size_t output_size) {
+  assert(output_size >= 32);
+  unsigned char* initial_output = output;
+  size_t (*append_bytes_with_bit_offset)(const uint8_t*, size_t, size_t,
+                                         unsigned char*, uint64_t&) = nullptr;
+
+#if FJXL_ENABLE_AVX512
+  if (__builtin_cpu_supports("avx512vbmi2")) {
+    append_bytes_with_bit_offset = AppendBytesWithBitOffset;
+  }
+#endif
+
+  while (true) {
+    size_t& cur = frame->current_bit_writer;
+    size_t& bw_pos = frame->bit_writer_byte_pos;
+    if (cur >= 1 + frame->group_data.size() * frame->nb_chans) {
+      return output - initial_output;
+    }
+    if (output_size <= 8) {
+      return output - initial_output;
+    }
+    size_t nbc = frame->nb_chans;
+    const BitWriter& writer =
+        cur == 0 ? frame->header
+                 : frame->group_data[(cur - 1) / nbc][(cur - 1) % nbc];
+    size_t full_byte_count =
+        std::min(output_size - 8, writer.bytes_written - bw_pos);
+    if (frame->bits_in_buffer == 0) {
+      memcpy(output, writer.data.get() + bw_pos, full_byte_count);
+    } else {
+      size_t i = 0;
+      if (append_bytes_with_bit_offset) {
+        i += append_bytes_with_bit_offset(
+            writer.data.get() + bw_pos, full_byte_count, frame->bits_in_buffer,
+            output, frame->bit_buffer);
+      }
+#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+      // Copy 8 bytes at a time until we reach the border.
+      for (; i + 8 < full_byte_count; i += 8) {
+        uint64_t chunk;
+        memcpy(&chunk, writer.data.get() + bw_pos + i, 8);
+        uint64_t out = frame->bit_buffer | (chunk << frame->bits_in_buffer);
+        memcpy(output + i, &out, 8);
+        frame->bit_buffer = chunk >> (64 - frame->bits_in_buffer);
+      }
+#endif
+      for (; i < full_byte_count; i++) {
+        AddBits(8, writer.data.get()[bw_pos + i], output + i,
+                frame->bits_in_buffer, frame->bit_buffer);
+      }
+    }
+    output += full_byte_count;
+    output_size -= full_byte_count;
+    bw_pos += full_byte_count;
+    if (bw_pos == writer.bytes_written) {
+      auto write = [&](size_t num, uint64_t bits) {
+        size_t n = AddBits(num, bits, output, frame->bits_in_buffer,
+                           frame->bit_buffer);
+        output += n;
+        output_size -= n;
+      };
+      if (writer.bits_in_buffer) {
+        write(writer.bits_in_buffer, writer.buffer);
+      }
+      bw_pos = 0;
+      cur++;
+      if ((cur - 1) % nbc == 0 && frame->bits_in_buffer != 0) {
+        write(8 - frame->bits_in_buffer, 0);
+      }
+    }
+  }
+}
+
+void JxlFastLosslessFreeFrameState(JxlFastLosslessFrameState* frame) {
+  delete frame;
+}
+
+}  // extern "C"
+
+#endif
+
+#ifdef FJXL_SELF_INCLUDE
+
+namespace {
+
+constexpr size_t kNumRawSymbols = 19;
+constexpr size_t kNumLZ77 = 33;
+constexpr size_t kLZ77CacheSize = 32;
+
+constexpr size_t kLZ77Offset = 224;
+constexpr size_t kLZ77MinLength = 7;
+
+void EncodeHybridUintLZ77(uint32_t value, uint32_t* token, uint32_t* nbits,
+                          uint32_t* bits) {
+  // 400 config
+  uint32_t n = FloorLog2(value);
+  *token = value < 16 ? value : 16 + n - 4;
+  *nbits = value < 16 ? 0 : n;
+  *bits = value < 16 ? 0 : value - (1 << *nbits);
+}
+
+struct PrefixCode {
+  uint8_t raw_nbits[kNumRawSymbols] = {};
+  uint8_t raw_bits[kNumRawSymbols] = {};
+
+  alignas(64) uint8_t raw_nbits_simd[16] = {};
+  alignas(64) uint8_t raw_bits_simd[16] = {};
+
+  uint8_t lz77_nbits[kNumLZ77] = {};
+  uint16_t lz77_bits[kNumLZ77] = {};
+
+  uint64_t lz77_cache_bits[kLZ77CacheSize] = {};
+  uint8_t lz77_cache_nbits[kLZ77CacheSize] = {};
+
+  static uint16_t BitReverse(size_t nbits, uint16_t bits) {
+    constexpr uint16_t kNibbleLookup[16] = {
+        0b0000, 0b1000, 0b0100, 0b1100, 0b0010, 0b1010, 0b0110, 0b1110,
+        0b0001, 0b1001, 0b0101, 0b1101, 0b0011, 0b1011, 0b0111, 0b1111,
+    };
+    uint16_t rev16 = (kNibbleLookup[bits & 0xF] << 12) |
+                     (kNibbleLookup[(bits >> 4) & 0xF] << 8) |
+                     (kNibbleLookup[(bits >> 8) & 0xF] << 4) |
+                     (kNibbleLookup[bits >> 12]);
+    return rev16 >> (16 - nbits);
+  }
+
+  // Create the prefix codes given the code lengths.
+  // Supports the code lengths being split into two halves.
+  static void ComputeCanonicalCode(const uint8_t* first_chunk_nbits,
+                                   uint8_t* first_chunk_bits,
+                                   size_t first_chunk_size,
+                                   const uint8_t* second_chunk_nbits,
+                                   uint16_t* second_chunk_bits,
+                                   size_t second_chunk_size) {
+    constexpr size_t kMaxCodeLength = 15;
+    uint8_t code_length_counts[kMaxCodeLength + 1] = {};
+    for (size_t i = 0; i < first_chunk_size; i++) {
+      code_length_counts[first_chunk_nbits[i]]++;
+      assert(first_chunk_nbits[i] <= kMaxCodeLength);
+      assert(first_chunk_nbits[i] <= 8);
+      assert(first_chunk_nbits[i] > 0);
+    }
+    for (size_t i = 0; i < second_chunk_size; i++) {
+      code_length_counts[second_chunk_nbits[i]]++;
+      assert(second_chunk_nbits[i] <= kMaxCodeLength);
+    }
+
+    uint16_t next_code[kMaxCodeLength + 1] = {};
+
+    uint16_t code = 0;
+    for (size_t i = 1; i < kMaxCodeLength + 1; i++) {
+      code = (code + code_length_counts[i - 1]) << 1;
+      next_code[i] = code;
+    }
+
+    for (size_t i = 0; i < first_chunk_size; i++) {
+      first_chunk_bits[i] =
+          BitReverse(first_chunk_nbits[i], next_code[first_chunk_nbits[i]]++);
+    }
+    for (size_t i = 0; i < second_chunk_size; i++) {
+      second_chunk_bits[i] =
+          BitReverse(second_chunk_nbits[i], next_code[second_chunk_nbits[i]]++);
+    }
+  }
+
+  template <typename T>
+  static void ComputeCodeLengthsNonZeroImpl(const uint64_t* freqs, size_t n,
+                                            size_t precision, T infty,
+                                            uint8_t* min_limit,
+                                            uint8_t* max_limit,
+                                            uint8_t* nbits) {
+    std::vector<T> dynp(((1U << precision) + 1) * (n + 1), infty);
+    auto d = [&](size_t sym, size_t off) -> T& {
+      return dynp[sym * ((1 << precision) + 1) + off];
+    };
+    d(0, 0) = 0;
+    for (size_t sym = 0; sym < n; sym++) {
+      for (T bits = min_limit[sym]; bits <= max_limit[sym]; bits++) {
+        size_t off_delta = 1U << (precision - bits);
+        for (size_t off = 0; off + off_delta <= (1U << precision); off++) {
+          d(sym + 1, off + off_delta) =
+              std::min(d(sym, off) + static_cast<T>(freqs[sym]) * bits,
+                       d(sym + 1, off + off_delta));
+        }
+      }
+    }
+
+    size_t sym = n;
+    size_t off = 1U << precision;
+
+    assert(d(sym, off) != infty);
+
+    while (sym-- > 0) {
+      assert(off > 0);
+      for (size_t bits = min_limit[sym]; bits <= max_limit[sym]; bits++) {
+        size_t off_delta = 1U << (precision - bits);
+        if (off_delta <= off &&
+            d(sym + 1, off) == d(sym, off - off_delta) + freqs[sym] * bits) {
+          off -= off_delta;
+          nbits[sym] = bits;
+          break;
+        }
+      }
+    }
+  }
+
+  // Computes nbits[i] for i <= n, subject to min_limit[i] <= nbits[i] <=
+  // max_limit[i] and sum 2**-nbits[i] == 1, so to minimize sum(nbits[i] *
+  // freqs[i]).
+  static void ComputeCodeLengthsNonZero(const uint64_t* freqs, size_t n,
+                                        uint8_t* min_limit, uint8_t* max_limit,
+                                        uint8_t* nbits) {
+    size_t precision = 0;
+    size_t shortest_length = 255;
+    uint64_t freqsum = 0;
+    for (size_t i = 0; i < n; i++) {
+      assert(freqs[i] != 0);
+      freqsum += freqs[i];
+      if (min_limit[i] < 1) min_limit[i] = 1;
+      assert(min_limit[i] <= max_limit[i]);
+      precision = std::max<size_t>(max_limit[i], precision);
+      shortest_length = std::min<size_t>(min_limit[i], shortest_length);
+    }
+    // If all the minimum limits are greater than 1, shift precision so that we
+    // behave as if the shortest was 1.
+    precision -= shortest_length - 1;
+    uint64_t infty = freqsum * precision;
+    if (infty < std::numeric_limits<uint32_t>::max() / 2) {
+      ComputeCodeLengthsNonZeroImpl(freqs, n, precision,
+                                    static_cast<uint32_t>(infty), min_limit,
+                                    max_limit, nbits);
+    } else {
+      ComputeCodeLengthsNonZeroImpl(freqs, n, precision, infty, min_limit,
+                                    max_limit, nbits);
+    }
+  }
+
+  static constexpr size_t kMaxNumSymbols =
+      kNumRawSymbols + 1 < kNumLZ77 ? kNumLZ77 : kNumRawSymbols + 1;
+  static void ComputeCodeLengths(const uint64_t* freqs, size_t n,
+                                 const uint8_t* min_limit_in,
+                                 const uint8_t* max_limit_in, uint8_t* nbits) {
+    assert(n <= kMaxNumSymbols);
+    uint64_t compact_freqs[kMaxNumSymbols];
+    uint8_t min_limit[kMaxNumSymbols];
+    uint8_t max_limit[kMaxNumSymbols];
+    size_t ni = 0;
+    for (size_t i = 0; i < n; i++) {
+      if (freqs[i]) {
+        compact_freqs[ni] = freqs[i];
+        min_limit[ni] = min_limit_in[i];
+        max_limit[ni] = max_limit_in[i];
+        ni++;
+      }
+    }
+    uint8_t num_bits[kMaxNumSymbols] = {};
+    ComputeCodeLengthsNonZero(compact_freqs, ni, min_limit, max_limit,
+                              num_bits);
+    ni = 0;
+    for (size_t i = 0; i < n; i++) {
+      nbits[i] = 0;
+      if (freqs[i]) {
+        nbits[i] = num_bits[ni++];
+      }
+    }
+  }
+
+  // Invalid code, used to construct arrays.
+  PrefixCode() {}
+
+  template <typename BitDepth>
+  PrefixCode(BitDepth, uint64_t* raw_counts, uint64_t* lz77_counts) {
+    // "merge" together all the lz77 counts in a single symbol for the level 1
+    // table (containing just the raw symbols, up to length 7).
+    uint64_t level1_counts[kNumRawSymbols + 1];
+    memcpy(level1_counts, raw_counts, kNumRawSymbols * sizeof(uint64_t));
+    size_t numraw = kNumRawSymbols;
+    while (numraw > 0 && level1_counts[numraw - 1] == 0) numraw--;
+
+    level1_counts[numraw] = 0;
+    for (size_t i = 0; i < kNumLZ77; i++) {
+      level1_counts[numraw] += lz77_counts[i];
+    }
+    uint8_t level1_nbits[kNumRawSymbols + 1] = {};
+    ComputeCodeLengths(level1_counts, numraw + 1, BitDepth::kMinRawLength,
+                       BitDepth::kMaxRawLength, level1_nbits);
+
+    uint8_t level2_nbits[kNumLZ77] = {};
+    uint8_t min_lengths[kNumLZ77] = {};
+    uint8_t l = 15 - level1_nbits[numraw];
+    uint8_t max_lengths[kNumLZ77];
+    for (size_t i = 0; i < kNumLZ77; i++) {
+      max_lengths[i] = l;
+    }
+    size_t num_lz77 = kNumLZ77;
+    while (num_lz77 > 0 && lz77_counts[num_lz77 - 1] == 0) num_lz77--;
+    ComputeCodeLengths(lz77_counts, num_lz77, min_lengths, max_lengths,
+                       level2_nbits);
+    for (size_t i = 0; i < numraw; i++) {
+      raw_nbits[i] = level1_nbits[i];
+    }
+    for (size_t i = 0; i < num_lz77; i++) {
+      lz77_nbits[i] =
+          level2_nbits[i] ? level1_nbits[numraw] + level2_nbits[i] : 0;
+    }
+
+    ComputeCanonicalCode(raw_nbits, raw_bits, numraw, lz77_nbits, lz77_bits,
+                         kNumLZ77);
+    BitDepth::PrepareForSimd(raw_nbits, raw_bits, numraw, raw_nbits_simd,
+                             raw_bits_simd);
+
+    // Prepare lz77 cache
+    for (size_t count = 0; count < kLZ77CacheSize; count++) {
+      unsigned token, nbits, bits;
+      EncodeHybridUintLZ77(count, &token, &nbits, &bits);
+      lz77_cache_nbits[count] = lz77_nbits[token] + nbits + raw_nbits[0];
+      lz77_cache_bits[count] =
+          (((bits << lz77_nbits[token]) | lz77_bits[token]) << raw_nbits[0]) |
+          raw_bits[0];
+    }
+  }
+
+  void WriteTo(BitWriter* writer) const {
+    uint64_t code_length_counts[18] = {};
+    code_length_counts[17] = 3 + 2 * (kNumLZ77 - 1);
+    for (size_t i = 0; i < kNumRawSymbols; i++) {
+      code_length_counts[raw_nbits[i]]++;
+    }
+    for (size_t i = 0; i < kNumLZ77; i++) {
+      code_length_counts[lz77_nbits[i]]++;
+    }
+    uint8_t code_length_nbits[18] = {};
+    uint8_t code_length_nbits_min[18] = {};
+    uint8_t code_length_nbits_max[18] = {
+        5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+    };
+    ComputeCodeLengths(code_length_counts, 18, code_length_nbits_min,
+                       code_length_nbits_max, code_length_nbits);
+    writer->Write(2, 0b00);  // HSKIP = 0, i.e. don't skip code lengths.
+
+    // As per Brotli RFC.
+    uint8_t code_length_order[18] = {1, 2, 3, 4,  0,  5,  17, 6,  16,
+                                     7, 8, 9, 10, 11, 12, 13, 14, 15};
+    uint8_t code_length_length_nbits[] = {2, 4, 3, 2, 2, 4};
+    uint8_t code_length_length_bits[] = {0, 7, 3, 2, 1, 15};
+
+    // Encode lengths of code lengths.
+    size_t num_code_lengths = 18;
+    while (code_length_nbits[code_length_order[num_code_lengths - 1]] == 0) {
+      num_code_lengths--;
+    }
+    for (size_t i = 0; i < num_code_lengths; i++) {
+      int symbol = code_length_nbits[code_length_order[i]];
+      writer->Write(code_length_length_nbits[symbol],
+                    code_length_length_bits[symbol]);
+    }
+
+    // Compute the canonical codes for the codes that represent the lengths of
+    // the actual codes for data.
+    uint16_t code_length_bits[18] = {};
+    ComputeCanonicalCode(nullptr, nullptr, 0, code_length_nbits,
+                         code_length_bits, 18);
+    // Encode raw bit code lengths.
+    for (size_t i = 0; i < kNumRawSymbols; i++) {
+      writer->Write(code_length_nbits[raw_nbits[i]],
+                    code_length_bits[raw_nbits[i]]);
+    }
+    size_t num_lz77 = kNumLZ77;
+    while (lz77_nbits[num_lz77 - 1] == 0) {
+      num_lz77--;
+    }
+    // Encode 0s until 224 (start of LZ77 symbols). This is in total 224-19 =
+    // 205.
+    static_assert(kLZ77Offset == 224, "");
+    static_assert(kNumRawSymbols == 19, "");
+    writer->Write(code_length_nbits[17], code_length_bits[17]);
+    writer->Write(3, 0b010);  // 5
+    writer->Write(code_length_nbits[17], code_length_bits[17]);
+    writer->Write(3, 0b000);  // (5-2)*8 + 3 = 27
+    writer->Write(code_length_nbits[17], code_length_bits[17]);
+    writer->Write(3, 0b010);  // (27-2)*8 + 5 = 205
+    // Encode LZ77 symbols, with values 224+i.
+    for (size_t i = 0; i < num_lz77; i++) {
+      writer->Write(code_length_nbits[lz77_nbits[i]],
+                    code_length_bits[lz77_nbits[i]]);
+    }
+  }
+};
+
+template <typename T>
+struct VecPair {
+  T low;
+  T hi;
+};
+
+#ifdef FJXL_GENERIC_SIMD
+#undef FJXL_GENERIC_SIMD
+#endif
+
+#ifdef FJXL_AVX512
+#define FJXL_GENERIC_SIMD
+struct SIMDVec32;
+struct Mask32 {
+  __mmask16 mask;
+  SIMDVec32 IfThenElse(const SIMDVec32& if_true, const SIMDVec32& if_false);
+  size_t CountPrefix() const {
+    return CtzNonZero(~uint64_t{_cvtmask16_u32(mask)});
+  }
+};
+
+struct SIMDVec32 {
+  __m512i vec;
+
+  static constexpr size_t kLanes = 16;
+
+  FJXL_INLINE static SIMDVec32 Load(const uint32_t* data) {
+    return SIMDVec32{_mm512_loadu_si512((__m512i*)data)};
+  }
+  FJXL_INLINE void Store(uint32_t* data) {
+    _mm512_storeu_si512((__m512i*)data, vec);
+  }
+  FJXL_INLINE static SIMDVec32 Val(uint32_t v) {
+    return SIMDVec32{_mm512_set1_epi32(v)};
+  }
+  FJXL_INLINE SIMDVec32 ValToToken() const {
+    return SIMDVec32{
+        _mm512_sub_epi32(_mm512_set1_epi32(32), _mm512_lzcnt_epi32(vec))};
+  }
+  FJXL_INLINE SIMDVec32 SatSubU(const SIMDVec32& to_subtract) const {
+    return SIMDVec32{_mm512_sub_epi32(_mm512_max_epu32(vec, to_subtract.vec),
+                                      to_subtract.vec)};
+  }
+  FJXL_INLINE SIMDVec32 Sub(const SIMDVec32& to_subtract) const {
+    return SIMDVec32{_mm512_sub_epi32(vec, to_subtract.vec)};
+  }
+  FJXL_INLINE SIMDVec32 Add(const SIMDVec32& oth) const {
+    return SIMDVec32{_mm512_add_epi32(vec, oth.vec)};
+  }
+  FJXL_INLINE SIMDVec32 Xor(const SIMDVec32& oth) const {
+    return SIMDVec32{_mm512_xor_epi32(vec, oth.vec)};
+  }
+  FJXL_INLINE Mask32 Eq(const SIMDVec32& oth) const {
+    return Mask32{_mm512_cmpeq_epi32_mask(vec, oth.vec)};
+  }
+  FJXL_INLINE Mask32 Gt(const SIMDVec32& oth) const {
+    return Mask32{_mm512_cmpgt_epi32_mask(vec, oth.vec)};
+  }
+  FJXL_INLINE SIMDVec32 Pow2() const {
+    return SIMDVec32{_mm512_sllv_epi32(_mm512_set1_epi32(1), vec)};
+  }
+  template <size_t i>
+  FJXL_INLINE SIMDVec32 SignedShiftRight() const {
+    return SIMDVec32{_mm512_srai_epi32(vec, i)};
+  }
+};
+
+struct SIMDVec16;
+
+struct Mask16 {
+  __mmask32 mask;
+  SIMDVec16 IfThenElse(const SIMDVec16& if_true, const SIMDVec16& if_false);
+  Mask16 And(const Mask16& oth) const {
+    return Mask16{_kand_mask32(mask, oth.mask)};
+  }
+  size_t CountPrefix() const {
+    return CtzNonZero(~uint64_t{_cvtmask32_u32(mask)});
+  }
+};
+
+struct SIMDVec16 {
+  __m512i vec;
+
+  static constexpr size_t kLanes = 32;
+
+  FJXL_INLINE static SIMDVec16 Load(const uint16_t* data) {
+    return SIMDVec16{_mm512_loadu_si512((__m512i*)data)};
+  }
+  FJXL_INLINE void Store(uint16_t* data) {
+    _mm512_storeu_si512((__m512i*)data, vec);
+  }
+  FJXL_INLINE static SIMDVec16 Val(uint16_t v) {
+    return SIMDVec16{_mm512_set1_epi16(v)};
+  }
+  FJXL_INLINE static SIMDVec16 FromTwo32(const SIMDVec32& lo,
+                                         const SIMDVec32& hi) {
+    auto tmp = _mm512_packus_epi32(lo.vec, hi.vec);
+    alignas(64) uint64_t perm[8] = {0, 2, 4, 6, 1, 3, 5, 7};
+    return SIMDVec16{
+        _mm512_permutex2var_epi64(tmp, _mm512_load_si512((__m512i*)perm), tmp)};
+  }
+
+  FJXL_INLINE SIMDVec16 ValToToken() const {
+    auto c16 = _mm512_set1_epi32(16);
+    auto c32 = _mm512_set1_epi32(32);
+    auto low16bit = _mm512_set1_epi32(0x0000FFFF);
+    auto lzhi =
+        _mm512_sub_epi32(c16, _mm512_min_epu32(c16, _mm512_lzcnt_epi32(vec)));
+    auto lzlo = _mm512_sub_epi32(
+        c32, _mm512_lzcnt_epi32(_mm512_and_si512(low16bit, vec)));
+    return SIMDVec16{_mm512_or_si512(lzlo, _mm512_slli_epi32(lzhi, 16))};
+  }
+
+  FJXL_INLINE SIMDVec16 SatSubU(const SIMDVec16& to_subtract) const {
+    return SIMDVec16{_mm512_subs_epu16(vec, to_subtract.vec)};
+  }
+  FJXL_INLINE SIMDVec16 Sub(const SIMDVec16& to_subtract) const {
+    return SIMDVec16{_mm512_sub_epi16(vec, to_subtract.vec)};
+  }
+  FJXL_INLINE SIMDVec16 Add(const SIMDVec16& oth) const {
+    return SIMDVec16{_mm512_add_epi16(vec, oth.vec)};
+  }
+  FJXL_INLINE SIMDVec16 Min(const SIMDVec16& oth) const {
+    return SIMDVec16{_mm512_min_epu16(vec, oth.vec)};
+  }
+  FJXL_INLINE Mask16 Eq(const SIMDVec16& oth) const {
+    return Mask16{_mm512_cmpeq_epi16_mask(vec, oth.vec)};
+  }
+  FJXL_INLINE Mask16 Gt(const SIMDVec16& oth) const {
+    return Mask16{_mm512_cmpgt_epi16_mask(vec, oth.vec)};
+  }
+  FJXL_INLINE SIMDVec16 Pow2() const {
+    return SIMDVec16{_mm512_sllv_epi16(_mm512_set1_epi16(1), vec)};
+  }
+  FJXL_INLINE SIMDVec16 Or(const SIMDVec16& oth) const {
+    return SIMDVec16{_mm512_or_si512(vec, oth.vec)};
+  }
+  FJXL_INLINE SIMDVec16 Xor(const SIMDVec16& oth) const {
+    return SIMDVec16{_mm512_xor_si512(vec, oth.vec)};
+  }
+  FJXL_INLINE SIMDVec16 And(const SIMDVec16& oth) const {
+    return SIMDVec16{_mm512_and_si512(vec, oth.vec)};
+  }
+  FJXL_INLINE SIMDVec16 HAdd(const SIMDVec16& oth) const {
+    return SIMDVec16{_mm512_srai_epi16(_mm512_add_epi16(vec, oth.vec), 1)};
+  }
+  FJXL_INLINE SIMDVec16 PrepareForU8Lookup() const {
+    return SIMDVec16{_mm512_or_si512(vec, _mm512_set1_epi16(0xFF00))};
+  }
+  FJXL_INLINE SIMDVec16 U8Lookup(const uint8_t* table) const {
+    return SIMDVec16{_mm512_shuffle_epi8(
+        _mm512_broadcast_i32x4(_mm_loadu_si128((__m128i*)table)), vec)};
+  }
+  FJXL_INLINE VecPair<SIMDVec16> Interleave(const SIMDVec16& low) const {
+    auto lo = _mm512_unpacklo_epi16(low.vec, vec);
+    auto hi = _mm512_unpackhi_epi16(low.vec, vec);
+    alignas(64) uint64_t perm1[8] = {0, 1, 8, 9, 2, 3, 10, 11};
+    alignas(64) uint64_t perm2[8] = {4, 5, 12, 13, 6, 7, 14, 15};
+    return {SIMDVec16{_mm512_permutex2var_epi64(
+                lo, _mm512_load_si512((__m512i*)perm1), hi)},
+            SIMDVec16{_mm512_permutex2var_epi64(
+                lo, _mm512_load_si512((__m512i*)perm2), hi)}};
+  }
+  FJXL_INLINE VecPair<SIMDVec32> Upcast() const {
+    auto lo = _mm512_unpacklo_epi16(vec, _mm512_setzero_si512());
+    auto hi = _mm512_unpackhi_epi16(vec, _mm512_setzero_si512());
+    alignas(64) uint64_t perm1[8] = {0, 1, 8, 9, 2, 3, 10, 11};
+    alignas(64) uint64_t perm2[8] = {4, 5, 12, 13, 6, 7, 14, 15};
+    return {SIMDVec32{_mm512_permutex2var_epi64(
+                lo, _mm512_load_si512((__m512i*)perm1), hi)},
+            SIMDVec32{_mm512_permutex2var_epi64(
+                lo, _mm512_load_si512((__m512i*)perm2), hi)}};
+  }
+  template <size_t i>
+  FJXL_INLINE SIMDVec16 SignedShiftRight() const {
+    return SIMDVec16{_mm512_srai_epi16(vec, i)};
+  }
+
+  static std::array<SIMDVec16, 1> LoadG8(const unsigned char* data) {
+    __m256i bytes = _mm256_loadu_si256((__m256i*)data);
+    return {SIMDVec16{_mm512_cvtepu8_epi16(bytes)}};
+  }
+  static std::array<SIMDVec16, 1> LoadG16(const unsigned char* data) {
+    return {Load((const uint16_t*)data)};
+  }
+
+  static std::array<SIMDVec16, 2> LoadGA8(const unsigned char* data) {
+    __m512i bytes = _mm512_loadu_si512((__m512i*)data);
+    __m512i gray = _mm512_and_si512(bytes, _mm512_set1_epi16(0xFF));
+    __m512i alpha = _mm512_srli_epi16(bytes, 8);
+    return {SIMDVec16{gray}, SIMDVec16{alpha}};
+  }
+  static std::array<SIMDVec16, 2> LoadGA16(const unsigned char* data) {
+    __m512i bytes1 = _mm512_loadu_si512((__m512i*)data);
+    __m512i bytes2 = _mm512_loadu_si512((__m512i*)(data + 64));
+    __m512i g_mask = _mm512_set1_epi32(0xFFFF);
+    __m512i permuteidx = _mm512_set_epi64(7, 5, 3, 1, 6, 4, 2, 0);
+    __m512i g = _mm512_permutexvar_epi64(
+        permuteidx, _mm512_packus_epi32(_mm512_and_si512(bytes1, g_mask),
+                                        _mm512_and_si512(bytes2, g_mask)));
+    __m512i a = _mm512_permutexvar_epi64(
+        permuteidx, _mm512_packus_epi32(_mm512_srli_epi32(bytes1, 16),
+                                        _mm512_srli_epi32(bytes2, 16)));
+    return {SIMDVec16{g}, SIMDVec16{a}};
+  }
+
+  static std::array<SIMDVec16, 3> LoadRGB8(const unsigned char* data) {
+    __m512i bytes0 = _mm512_loadu_si512((__m512i*)data);
+    __m512i bytes1 =
+        _mm512_zextsi256_si512(_mm256_loadu_si256((__m256i*)(data + 64)));
+
+    // 0x7A = element of upper half of second vector = 0 after lookup; still in
+    // the upper half once we add 1 or 2.
+    uint8_t z = 0x7A;
+    __m512i ridx =
+        _mm512_set_epi8(z, 93, z, 90, z, 87, z, 84, z, 81, z, 78, z, 75, z, 72,
+                        z, 69, z, 66, z, 63, z, 60, z, 57, z, 54, z, 51, z, 48,
+                        z, 45, z, 42, z, 39, z, 36, z, 33, z, 30, z, 27, z, 24,
+                        z, 21, z, 18, z, 15, z, 12, z, 9, z, 6, z, 3, z, 0);
+    __m512i gidx = _mm512_add_epi8(ridx, _mm512_set1_epi8(1));
+    __m512i bidx = _mm512_add_epi8(gidx, _mm512_set1_epi8(1));
+    __m512i r = _mm512_permutex2var_epi8(bytes0, ridx, bytes1);
+    __m512i g = _mm512_permutex2var_epi8(bytes0, gidx, bytes1);
+    __m512i b = _mm512_permutex2var_epi8(bytes0, bidx, bytes1);
+    return {SIMDVec16{r}, SIMDVec16{g}, SIMDVec16{b}};
+  }
+  static std::array<SIMDVec16, 3> LoadRGB16(const unsigned char* data) {
+    __m512i bytes0 = _mm512_loadu_si512((__m512i*)data);
+    __m512i bytes1 = _mm512_loadu_si512((__m512i*)(data + 64));
+    __m512i bytes2 = _mm512_loadu_si512((__m512i*)(data + 128));
+
+    __m512i ridx_lo = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 63, 60, 57,
+                                       54, 51, 48, 45, 42, 39, 36, 33, 30, 27,
+                                       24, 21, 18, 15, 12, 9, 6, 3, 0);
+    // -1 is such that when adding 1 or 2, we get the correct index for
+    // green/blue.
+    __m512i ridx_hi =
+        _mm512_set_epi16(29, 26, 23, 20, 17, 14, 11, 8, 5, 2, -1, 0, 0, 0, 0, 0,
+                         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+    __m512i gidx_lo = _mm512_add_epi16(ridx_lo, _mm512_set1_epi16(1));
+    __m512i gidx_hi = _mm512_add_epi16(ridx_hi, _mm512_set1_epi16(1));
+    __m512i bidx_lo = _mm512_add_epi16(gidx_lo, _mm512_set1_epi16(1));
+    __m512i bidx_hi = _mm512_add_epi16(gidx_hi, _mm512_set1_epi16(1));
+
+    __mmask32 rmask = _cvtu32_mask32(0b11111111110000000000000000000000);
+    __mmask32 gbmask = _cvtu32_mask32(0b11111111111000000000000000000000);
+
+    __m512i rlo = _mm512_permutex2var_epi16(bytes0, ridx_lo, bytes1);
+    __m512i glo = _mm512_permutex2var_epi16(bytes0, gidx_lo, bytes1);
+    __m512i blo = _mm512_permutex2var_epi16(bytes0, bidx_lo, bytes1);
+    __m512i r = _mm512_mask_permutexvar_epi16(rlo, rmask, ridx_hi, bytes2);
+    __m512i g = _mm512_mask_permutexvar_epi16(glo, gbmask, gidx_hi, bytes2);
+    __m512i b = _mm512_mask_permutexvar_epi16(blo, gbmask, bidx_hi, bytes2);
+    return {SIMDVec16{r}, SIMDVec16{g}, SIMDVec16{b}};
+  }
+
+  static std::array<SIMDVec16, 4> LoadRGBA8(const unsigned char* data) {
+    __m512i bytes1 = _mm512_loadu_si512((__m512i*)data);
+    __m512i bytes2 = _mm512_loadu_si512((__m512i*)(data + 64));
+    __m512i rg_mask = _mm512_set1_epi32(0xFFFF);
+    __m512i permuteidx = _mm512_set_epi64(7, 5, 3, 1, 6, 4, 2, 0);
+    __m512i rg = _mm512_permutexvar_epi64(
+        permuteidx, _mm512_packus_epi32(_mm512_and_si512(bytes1, rg_mask),
+                                        _mm512_and_si512(bytes2, rg_mask)));
+    __m512i ba = _mm512_permutexvar_epi64(
+        permuteidx, _mm512_packus_epi32(_mm512_srli_epi32(bytes1, 16),
+                                        _mm512_srli_epi32(bytes2, 16)));
+    __m512i r = _mm512_and_si512(rg, _mm512_set1_epi16(0xFF));
+    __m512i g = _mm512_srli_epi16(rg, 8);
+    __m512i b = _mm512_and_si512(ba, _mm512_set1_epi16(0xFF));
+    __m512i a = _mm512_srli_epi16(ba, 8);
+    return {SIMDVec16{r}, SIMDVec16{g}, SIMDVec16{b}, SIMDVec16{a}};
+  }
+  static std::array<SIMDVec16, 4> LoadRGBA16(const unsigned char* data) {
+    __m512i bytes0 = _mm512_loadu_si512((__m512i*)data);
+    __m512i bytes1 = _mm512_loadu_si512((__m512i*)(data + 64));
+    __m512i bytes2 = _mm512_loadu_si512((__m512i*)(data + 128));
+    __m512i bytes3 = _mm512_loadu_si512((__m512i*)(data + 192));
+
+    auto pack32 = [](__m512i a, __m512i b) {
+      __m512i permuteidx = _mm512_set_epi64(7, 5, 3, 1, 6, 4, 2, 0);
+      return _mm512_permutexvar_epi64(permuteidx, _mm512_packus_epi32(a, b));
+    };
+    auto packlow32 = [&pack32](__m512i a, __m512i b) {
+      __m512i mask = _mm512_set1_epi32(0xFFFF);
+      return pack32(_mm512_and_si512(a, mask), _mm512_and_si512(b, mask));
+    };
+    auto packhi32 = [&pack32](__m512i a, __m512i b) {
+      return pack32(_mm512_srli_epi32(a, 16), _mm512_srli_epi32(b, 16));
+    };
+
+    __m512i rb0 = packlow32(bytes0, bytes1);
+    __m512i rb1 = packlow32(bytes2, bytes3);
+    __m512i ga0 = packhi32(bytes0, bytes1);
+    __m512i ga1 = packhi32(bytes2, bytes3);
+
+    __m512i r = packlow32(rb0, rb1);
+    __m512i g = packlow32(ga0, ga1);
+    __m512i b = packhi32(rb0, rb1);
+    __m512i a = packhi32(ga0, ga1);
+    return {SIMDVec16{r}, SIMDVec16{g}, SIMDVec16{b}, SIMDVec16{a}};
+  }
+
+  void SwapEndian() {
+    auto indices = _mm512_broadcast_i32x4(
+        _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14));
+    vec = _mm512_shuffle_epi8(vec, indices);
+  }
+};
+
+SIMDVec16 Mask16::IfThenElse(const SIMDVec16& if_true,
+                             const SIMDVec16& if_false) {
+  return SIMDVec16{_mm512_mask_blend_epi16(mask, if_false.vec, if_true.vec)};
+}
+
+SIMDVec32 Mask32::IfThenElse(const SIMDVec32& if_true,
+                             const SIMDVec32& if_false) {
+  return SIMDVec32{_mm512_mask_blend_epi32(mask, if_false.vec, if_true.vec)};
+}
+
+struct Bits64 {
+  static constexpr size_t kLanes = 8;
+
+  __m512i nbits;
+  __m512i bits;
+
+  FJXL_INLINE void Store(uint64_t* nbits_out, uint64_t* bits_out) {
+    _mm512_storeu_si512((__m512i*)nbits_out, nbits);
+    _mm512_storeu_si512((__m512i*)bits_out, bits);
+  }
+};
+
+struct Bits32 {
+  __m512i nbits;
+  __m512i bits;
+
+  static Bits32 FromRaw(SIMDVec32 nbits, SIMDVec32 bits) {
+    return Bits32{nbits.vec, bits.vec};
+  }
+
+  Bits64 Merge() const {
+    auto nbits_hi32 = _mm512_srli_epi64(nbits, 32);
+    auto nbits_lo32 = _mm512_and_si512(nbits, _mm512_set1_epi64(0xFFFFFFFF));
+    auto bits_hi32 = _mm512_srli_epi64(bits, 32);
+    auto bits_lo32 = _mm512_and_si512(bits, _mm512_set1_epi64(0xFFFFFFFF));
+
+    auto nbits64 = _mm512_add_epi64(nbits_hi32, nbits_lo32);
+    auto bits64 =
+        _mm512_or_si512(_mm512_sllv_epi64(bits_hi32, nbits_lo32), bits_lo32);
+    return Bits64{nbits64, bits64};
+  }
+
+  void Interleave(const Bits32& low) {
+    bits = _mm512_or_si512(_mm512_sllv_epi32(bits, low.nbits), low.bits);
+    nbits = _mm512_add_epi32(nbits, low.nbits);
+  }
+
+  void ClipTo(size_t n) {
+    n = std::min<size_t>(n, 16);
+    constexpr uint32_t kMask[32] = {
+        ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u,
+        ~0u, ~0u, ~0u, ~0u, ~0u, 0,   0,   0,   0,   0,   0,
+        0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    };
+    __m512i mask = _mm512_loadu_si512((__m512i*)(kMask + 16 - n));
+    nbits = _mm512_and_si512(mask, nbits);
+    bits = _mm512_and_si512(mask, bits);
+  }
+  void Skip(size_t n) {
+    n = std::min<size_t>(n, 16);
+    constexpr uint32_t kMask[32] = {
+        0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+        0,   0,   0,   0,   0,   ~0u, ~0u, ~0u, ~0u, ~0u, ~0u,
+        ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u,
+    };
+    __m512i mask = _mm512_loadu_si512((__m512i*)(kMask + 16 - n));
+    nbits = _mm512_and_si512(mask, nbits);
+    bits = _mm512_and_si512(mask, bits);
+  }
+};
+
+struct Bits16 {
+  __m512i nbits;
+  __m512i bits;
+
+  static Bits16 FromRaw(SIMDVec16 nbits, SIMDVec16 bits) {
+    return Bits16{nbits.vec, bits.vec};
+  }
+
+  Bits32 Merge() const {
+    auto nbits_hi16 = _mm512_srli_epi32(nbits, 16);
+    auto nbits_lo16 = _mm512_and_si512(nbits, _mm512_set1_epi32(0xFFFF));
+    auto bits_hi16 = _mm512_srli_epi32(bits, 16);
+    auto bits_lo16 = _mm512_and_si512(bits, _mm512_set1_epi32(0xFFFF));
+
+    auto nbits32 = _mm512_add_epi32(nbits_hi16, nbits_lo16);
+    auto bits32 =
+        _mm512_or_si512(_mm512_sllv_epi32(bits_hi16, nbits_lo16), bits_lo16);
+    return Bits32{nbits32, bits32};
+  }
+
+  void Interleave(const Bits16& low) {
+    bits = _mm512_or_si512(_mm512_sllv_epi16(bits, low.nbits), low.bits);
+    nbits = _mm512_add_epi16(nbits, low.nbits);
+  }
+
+  void ClipTo(size_t n) {
+    n = std::min<size_t>(n, 32);
+    constexpr uint16_t kMask[64] = {
+        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+        0,      0,      0,      0,      0,      0,      0,      0,
+        0,      0,      0,      0,      0,      0,      0,      0,
+        0,      0,      0,      0,      0,      0,      0,      0,
+        0,      0,      0,      0,      0,      0,      0,      0,
+    };
+    __m512i mask = _mm512_loadu_si512((__m512i*)(kMask + 32 - n));
+    nbits = _mm512_and_si512(mask, nbits);
+    bits = _mm512_and_si512(mask, bits);
+  }
+  void Skip(size_t n) {
+    n = std::min<size_t>(n, 32);
+    constexpr uint16_t kMask[64] = {
+        0,      0,      0,      0,      0,      0,      0,      0,
+        0,      0,      0,      0,      0,      0,      0,      0,
+        0,      0,      0,      0,      0,      0,      0,      0,
+        0,      0,      0,      0,      0,      0,      0,      0,
+        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+    };
+    __m512i mask = _mm512_loadu_si512((__m512i*)(kMask + 32 - n));
+    nbits = _mm512_and_si512(mask, nbits);
+    bits = _mm512_and_si512(mask, bits);
+  }
+};
+
+#endif
+
+#ifdef FJXL_AVX2
+#define FJXL_GENERIC_SIMD
+
+struct SIMDVec32;
+
+struct Mask32 {
+  __m256i mask;
+  SIMDVec32 IfThenElse(const SIMDVec32& if_true, const SIMDVec32& if_false);
+  size_t CountPrefix() const {
+    return CtzNonZero(~static_cast<uint64_t>(
+        (uint8_t)_mm256_movemask_ps(_mm256_castsi256_ps(mask))));
+  }
+};
+
+struct SIMDVec32 {
+  __m256i vec;
+
+  static constexpr size_t kLanes = 8;
+
+  FJXL_INLINE static SIMDVec32 Load(const uint32_t* data) {
+    return SIMDVec32{_mm256_loadu_si256((__m256i*)data)};
+  }
+  FJXL_INLINE void Store(uint32_t* data) {
+    _mm256_storeu_si256((__m256i*)data, vec);
+  }
+  FJXL_INLINE static SIMDVec32 Val(uint32_t v) {
+    return SIMDVec32{_mm256_set1_epi32(v)};
+  }
+  FJXL_INLINE SIMDVec32 ValToToken() const {
+    // we know that each value has at most 20 bits, so we just need 5 nibbles
+    // and don't need to mask the fifth. However we do need to set the higher
+    // bytes to 0xFF, which will make table lookups return 0.
+    auto nibble0 =
+        _mm256_or_si256(_mm256_and_si256(vec, _mm256_set1_epi32(0xF)),
+                        _mm256_set1_epi32(0xFFFFFF00));
+    auto nibble1 = _mm256_or_si256(
+        _mm256_and_si256(_mm256_srli_epi32(vec, 4), _mm256_set1_epi32(0xF)),
+        _mm256_set1_epi32(0xFFFFFF00));
+    auto nibble2 = _mm256_or_si256(
+        _mm256_and_si256(_mm256_srli_epi32(vec, 8), _mm256_set1_epi32(0xF)),
+        _mm256_set1_epi32(0xFFFFFF00));
+    auto nibble3 = _mm256_or_si256(
+        _mm256_and_si256(_mm256_srli_epi32(vec, 12), _mm256_set1_epi32(0xF)),
+        _mm256_set1_epi32(0xFFFFFF00));
+    auto nibble4 = _mm256_or_si256(_mm256_srli_epi32(vec, 16),
+                                   _mm256_set1_epi32(0xFFFFFF00));
+
+    auto lut0 = _mm256_broadcastsi128_si256(
+        _mm_setr_epi8(0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4));
+    auto lut1 = _mm256_broadcastsi128_si256(
+        _mm_setr_epi8(0, 5, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8));
+    auto lut2 = _mm256_broadcastsi128_si256(_mm_setr_epi8(
+        0, 9, 10, 10, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12));
+    auto lut3 = _mm256_broadcastsi128_si256(_mm_setr_epi8(
+        0, 13, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16));
+    auto lut4 = _mm256_broadcastsi128_si256(_mm_setr_epi8(
+        0, 17, 18, 18, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20));
+
+    auto token0 = _mm256_shuffle_epi8(lut0, nibble0);
+    auto token1 = _mm256_shuffle_epi8(lut1, nibble1);
+    auto token2 = _mm256_shuffle_epi8(lut2, nibble2);
+    auto token3 = _mm256_shuffle_epi8(lut3, nibble3);
+    auto token4 = _mm256_shuffle_epi8(lut4, nibble4);
+
+    auto token =
+        _mm256_max_epi32(_mm256_max_epi32(_mm256_max_epi32(token0, token1),
+                                          _mm256_max_epi32(token2, token3)),
+                         token4);
+    return SIMDVec32{token};
+  }
+  FJXL_INLINE SIMDVec32 SatSubU(const SIMDVec32& to_subtract) const {
+    return SIMDVec32{_mm256_sub_epi32(_mm256_max_epu32(vec, to_subtract.vec),
+                                      to_subtract.vec)};
+  }
+  FJXL_INLINE SIMDVec32 Sub(const SIMDVec32& to_subtract) const {
+    return SIMDVec32{_mm256_sub_epi32(vec, to_subtract.vec)};
+  }
+  FJXL_INLINE SIMDVec32 Add(const SIMDVec32& oth) const {
+    return SIMDVec32{_mm256_add_epi32(vec, oth.vec)};
+  }
+  FJXL_INLINE SIMDVec32 Xor(const SIMDVec32& oth) const {
+    return SIMDVec32{_mm256_xor_si256(vec, oth.vec)};
+  }
+  FJXL_INLINE SIMDVec32 Pow2() const {
+    return SIMDVec32{_mm256_sllv_epi32(_mm256_set1_epi32(1), vec)};
+  }
+  FJXL_INLINE Mask32 Eq(const SIMDVec32& oth) const {
+    return Mask32{_mm256_cmpeq_epi32(vec, oth.vec)};
+  }
+  FJXL_INLINE Mask32 Gt(const SIMDVec32& oth) const {
+    return Mask32{_mm256_cmpgt_epi32(vec, oth.vec)};
+  }
+  template <size_t i>
+  FJXL_INLINE SIMDVec32 SignedShiftRight() const {
+    return SIMDVec32{_mm256_srai_epi32(vec, i)};
+  }
+};
+
+struct SIMDVec16;
+
+struct Mask16 {
+  __m256i mask;
+  SIMDVec16 IfThenElse(const SIMDVec16& if_true, const SIMDVec16& if_false);
+  Mask16 And(const Mask16& oth) const {
+    return Mask16{_mm256_and_si256(mask, oth.mask)};
+  }
+  size_t CountPrefix() const {
+    return CtzNonZero(
+               ~static_cast<uint64_t>((uint32_t)_mm256_movemask_epi8(mask))) /
+           2;
+  }
+};
+
+struct SIMDVec16 {
+  __m256i vec;
+
+  static constexpr size_t kLanes = 16;
+
+  FJXL_INLINE static SIMDVec16 Load(const uint16_t* data) {
+    return SIMDVec16{_mm256_loadu_si256((__m256i*)data)};
+  }
+  FJXL_INLINE void Store(uint16_t* data) {
+    _mm256_storeu_si256((__m256i*)data, vec);
+  }
+  FJXL_INLINE static SIMDVec16 Val(uint16_t v) {
+    return SIMDVec16{_mm256_set1_epi16(v)};
+  }
+  FJXL_INLINE static SIMDVec16 FromTwo32(const SIMDVec32& lo,
+                                         const SIMDVec32& hi) {
+    auto tmp = _mm256_packus_epi32(lo.vec, hi.vec);
+    return SIMDVec16{_mm256_permute4x64_epi64(tmp, 0b11011000)};
+  }
+
+  FJXL_INLINE SIMDVec16 ValToToken() const {
+    auto nibble0 =
+        _mm256_or_si256(_mm256_and_si256(vec, _mm256_set1_epi16(0xF)),
+                        _mm256_set1_epi16(0xFF00));
+    auto nibble1 = _mm256_or_si256(
+        _mm256_and_si256(_mm256_srli_epi16(vec, 4), _mm256_set1_epi16(0xF)),
+        _mm256_set1_epi16(0xFF00));
+    auto nibble2 = _mm256_or_si256(
+        _mm256_and_si256(_mm256_srli_epi16(vec, 8), _mm256_set1_epi16(0xF)),
+        _mm256_set1_epi16(0xFF00));
+    auto nibble3 =
+        _mm256_or_si256(_mm256_srli_epi16(vec, 12), _mm256_set1_epi16(0xFF00));
+
+    auto lut0 = _mm256_broadcastsi128_si256(
+        _mm_setr_epi8(0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4));
+    auto lut1 = _mm256_broadcastsi128_si256(
+        _mm_setr_epi8(0, 5, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8));
+    auto lut2 = _mm256_broadcastsi128_si256(_mm_setr_epi8(
+        0, 9, 10, 10, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12));
+    auto lut3 = _mm256_broadcastsi128_si256(_mm_setr_epi8(
+        0, 13, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16));
+
+    auto token0 = _mm256_shuffle_epi8(lut0, nibble0);
+    auto token1 = _mm256_shuffle_epi8(lut1, nibble1);
+    auto token2 = _mm256_shuffle_epi8(lut2, nibble2);
+    auto token3 = _mm256_shuffle_epi8(lut3, nibble3);
+
+    auto token = _mm256_max_epi16(_mm256_max_epi16(token0, token1),
+                                  _mm256_max_epi16(token2, token3));
+    return SIMDVec16{token};
+  }
+
+  FJXL_INLINE SIMDVec16 SatSubU(const SIMDVec16& to_subtract) const {
+    return SIMDVec16{_mm256_subs_epu16(vec, to_subtract.vec)};
+  }
+  FJXL_INLINE SIMDVec16 Sub(const SIMDVec16& to_subtract) const {
+    return SIMDVec16{_mm256_sub_epi16(vec, to_subtract.vec)};
+  }
+  FJXL_INLINE SIMDVec16 Add(const SIMDVec16& oth) const {
+    return SIMDVec16{_mm256_add_epi16(vec, oth.vec)};
+  }
+  FJXL_INLINE SIMDVec16 Min(const SIMDVec16& oth) const {
+    return SIMDVec16{_mm256_min_epu16(vec, oth.vec)};
+  }
+  FJXL_INLINE Mask16 Eq(const SIMDVec16& oth) const {
+    return Mask16{_mm256_cmpeq_epi16(vec, oth.vec)};
+  }
+  FJXL_INLINE Mask16 Gt(const SIMDVec16& oth) const {
+    return Mask16{_mm256_cmpgt_epi16(vec, oth.vec)};
+  }
+  FJXL_INLINE SIMDVec16 Pow2() const {
+    auto pow2_lo_lut = _mm256_broadcastsi128_si256(
+        _mm_setr_epi8(1 << 0, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6,
+                      1u << 7, 0, 0, 0, 0, 0, 0, 0, 0));
+    auto pow2_hi_lut = _mm256_broadcastsi128_si256(
+        _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1 << 0, 1 << 1, 1 << 2, 1 << 3,
+                      1 << 4, 1 << 5, 1 << 6, 1u << 7));
+
+    auto masked = _mm256_or_si256(vec, _mm256_set1_epi16(0xFF00));
+
+    auto pow2_lo = _mm256_shuffle_epi8(pow2_lo_lut, masked);
+    auto pow2_hi = _mm256_shuffle_epi8(pow2_hi_lut, masked);
+
+    auto pow2 = _mm256_or_si256(_mm256_slli_epi16(pow2_hi, 8), pow2_lo);
+    return SIMDVec16{pow2};
+  }
+  FJXL_INLINE SIMDVec16 Or(const SIMDVec16& oth) const {
+    return SIMDVec16{_mm256_or_si256(vec, oth.vec)};
+  }
+  FJXL_INLINE SIMDVec16 Xor(const SIMDVec16& oth) const {
+    return SIMDVec16{_mm256_xor_si256(vec, oth.vec)};
+  }
+  FJXL_INLINE SIMDVec16 And(const SIMDVec16& oth) const {
+    return SIMDVec16{_mm256_and_si256(vec, oth.vec)};
+  }
+  FJXL_INLINE SIMDVec16 HAdd(const SIMDVec16& oth) const {
+    return SIMDVec16{_mm256_srai_epi16(_mm256_add_epi16(vec, oth.vec), 1)};
+  }
+  FJXL_INLINE SIMDVec16 PrepareForU8Lookup() const {
+    return SIMDVec16{_mm256_or_si256(vec, _mm256_set1_epi16(0xFF00))};
+  }
+  FJXL_INLINE SIMDVec16 U8Lookup(const uint8_t* table) const {
+    return SIMDVec16{_mm256_shuffle_epi8(
+        _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)table)), vec)};
+  }
+  FJXL_INLINE VecPair<SIMDVec16> Interleave(const SIMDVec16& low) const {
+    auto v02 = _mm256_unpacklo_epi16(low.vec, vec);
+    auto v13 = _mm256_unpackhi_epi16(low.vec, vec);
+    return {SIMDVec16{_mm256_permute2x128_si256(v02, v13, 0x20)},
+            SIMDVec16{_mm256_permute2x128_si256(v02, v13, 0x31)}};
+  }
+  FJXL_INLINE VecPair<SIMDVec32> Upcast() const {
+    auto v02 = _mm256_unpacklo_epi16(vec, _mm256_setzero_si256());
+    auto v13 = _mm256_unpackhi_epi16(vec, _mm256_setzero_si256());
+    return {SIMDVec32{_mm256_permute2x128_si256(v02, v13, 0x20)},
+            SIMDVec32{_mm256_permute2x128_si256(v02, v13, 0x31)}};
+  }
+  template <size_t i>
+  FJXL_INLINE SIMDVec16 SignedShiftRight() const {
+    return SIMDVec16{_mm256_srai_epi16(vec, i)};
+  }
+
+  static std::array<SIMDVec16, 1> LoadG8(const unsigned char* data) {
+    __m128i bytes = _mm_loadu_si128((__m128i*)data);
+    return {SIMDVec16{_mm256_cvtepu8_epi16(bytes)}};
+  }
+  static std::array<SIMDVec16, 1> LoadG16(const unsigned char* data) {
+    return {Load((const uint16_t*)data)};
+  }
+
+  static std::array<SIMDVec16, 2> LoadGA8(const unsigned char* data) {
+    __m256i bytes = _mm256_loadu_si256((__m256i*)data);
+    __m256i gray = _mm256_and_si256(bytes, _mm256_set1_epi16(0xFF));
+    __m256i alpha = _mm256_srli_epi16(bytes, 8);
+    return {SIMDVec16{gray}, SIMDVec16{alpha}};
+  }
+  static std::array<SIMDVec16, 2> LoadGA16(const unsigned char* data) {
+    __m256i bytes1 = _mm256_loadu_si256((__m256i*)data);
+    __m256i bytes2 = _mm256_loadu_si256((__m256i*)(data + 32));
+    __m256i g_mask = _mm256_set1_epi32(0xFFFF);
+    __m256i g = _mm256_permute4x64_epi64(
+        _mm256_packus_epi32(_mm256_and_si256(bytes1, g_mask),
+                            _mm256_and_si256(bytes2, g_mask)),
+        0b11011000);
+    __m256i a = _mm256_permute4x64_epi64(
+        _mm256_packus_epi32(_mm256_srli_epi32(bytes1, 16),
+                            _mm256_srli_epi32(bytes2, 16)),
+        0b11011000);
+    return {SIMDVec16{g}, SIMDVec16{a}};
+  }
+
+  static std::array<SIMDVec16, 3> LoadRGB8(const unsigned char* data) {
+    __m128i bytes0 = _mm_loadu_si128((__m128i*)data);
+    __m128i bytes1 = _mm_loadu_si128((__m128i*)(data + 16));
+    __m128i bytes2 = _mm_loadu_si128((__m128i*)(data + 32));
+
+    __m128i idx =
+        _mm_setr_epi8(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13);
+
+    __m128i r6b5g5_0 = _mm_shuffle_epi8(bytes0, idx);
+    __m128i g6r5b5_1 = _mm_shuffle_epi8(bytes1, idx);
+    __m128i b6g5r5_2 = _mm_shuffle_epi8(bytes2, idx);
+
+    __m128i mask010 = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0xFF, 0xFF, 0xFF, 0xFF,
+                                    0xFF, 0, 0, 0, 0, 0);
+    __m128i mask001 = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xFF, 0xFF,
+                                    0xFF, 0xFF, 0xFF);
+
+    __m128i b2g2b1 = _mm_blendv_epi8(b6g5r5_2, g6r5b5_1, mask001);
+    __m128i b2b0b1 = _mm_blendv_epi8(b2g2b1, r6b5g5_0, mask010);
+
+    __m128i r0r1b1 = _mm_blendv_epi8(r6b5g5_0, g6r5b5_1, mask010);
+    __m128i r0r1r2 = _mm_blendv_epi8(r0r1b1, b6g5r5_2, mask001);
+
+    __m128i g1r1g0 = _mm_blendv_epi8(g6r5b5_1, r6b5g5_0, mask001);
+    __m128i g1g2g0 = _mm_blendv_epi8(g1r1g0, b6g5r5_2, mask010);
+
+    __m128i g0g1g2 = _mm_alignr_epi8(g1g2g0, g1g2g0, 11);
+    __m128i b0b1b2 = _mm_alignr_epi8(b2b0b1, b2b0b1, 6);
+
+    return {SIMDVec16{_mm256_cvtepu8_epi16(r0r1r2)},
+            SIMDVec16{_mm256_cvtepu8_epi16(g0g1g2)},
+            SIMDVec16{_mm256_cvtepu8_epi16(b0b1b2)}};
+  }
+  static std::array<SIMDVec16, 3> LoadRGB16(const unsigned char* data) {
+    auto load_and_split_lohi = [](const unsigned char* data) {
+      // LHLHLH...
+      __m256i bytes = _mm256_loadu_si256((__m256i*)data);
+      // L0L0L0...
+      __m256i lo = _mm256_and_si256(bytes, _mm256_set1_epi16(0xFF));
+      // H0H0H0...
+      __m256i hi = _mm256_srli_epi16(bytes, 8);
+      // LLLLLLLLHHHHHHHHLLLLLLLLHHHHHHHH
+      __m256i packed = _mm256_packus_epi16(lo, hi);
+      return _mm256_permute4x64_epi64(packed, 0b11011000);
+    };
+    __m256i bytes0 = load_and_split_lohi(data);
+    __m256i bytes1 = load_and_split_lohi(data + 32);
+    __m256i bytes2 = load_and_split_lohi(data + 64);
+
+    __m256i idx = _mm256_broadcastsi128_si256(
+        _mm_setr_epi8(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13));
+
+    __m256i r6b5g5_0 = _mm256_shuffle_epi8(bytes0, idx);
+    __m256i g6r5b5_1 = _mm256_shuffle_epi8(bytes1, idx);
+    __m256i b6g5r5_2 = _mm256_shuffle_epi8(bytes2, idx);
+
+    __m256i mask010 = _mm256_broadcastsi128_si256(_mm_setr_epi8(
+        0, 0, 0, 0, 0, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 0, 0, 0, 0));
+    __m256i mask001 = _mm256_broadcastsi128_si256(_mm_setr_epi8(
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF));
+
+    __m256i b2g2b1 = _mm256_blendv_epi8(b6g5r5_2, g6r5b5_1, mask001);
+    __m256i b2b0b1 = _mm256_blendv_epi8(b2g2b1, r6b5g5_0, mask010);
+
+    __m256i r0r1b1 = _mm256_blendv_epi8(r6b5g5_0, g6r5b5_1, mask010);
+    __m256i r0r1r2 = _mm256_blendv_epi8(r0r1b1, b6g5r5_2, mask001);
+
+    __m256i g1r1g0 = _mm256_blendv_epi8(g6r5b5_1, r6b5g5_0, mask001);
+    __m256i g1g2g0 = _mm256_blendv_epi8(g1r1g0, b6g5r5_2, mask010);
+
+    __m256i g0g1g2 = _mm256_alignr_epi8(g1g2g0, g1g2g0, 11);
+    __m256i b0b1b2 = _mm256_alignr_epi8(b2b0b1, b2b0b1, 6);
+
+    // Now r0r1r2, g0g1g2, b0b1b2 have the low bytes of the RGB pixels in their
+    // lower half, and the high bytes in their upper half.
+
+    auto combine_low_hi = [](__m256i v) {
+      __m128i low = _mm256_extracti128_si256(v, 0);
+      __m128i hi = _mm256_extracti128_si256(v, 1);
+      __m256i low16 = _mm256_cvtepu8_epi16(low);
+      __m256i hi16 = _mm256_cvtepu8_epi16(hi);
+      return _mm256_or_si256(_mm256_slli_epi16(hi16, 8), low16);
+    };
+
+    return {SIMDVec16{combine_low_hi(r0r1r2)},
+            SIMDVec16{combine_low_hi(g0g1g2)},
+            SIMDVec16{combine_low_hi(b0b1b2)}};
+  }
+
+  static std::array<SIMDVec16, 4> LoadRGBA8(const unsigned char* data) {
+    __m256i bytes1 = _mm256_loadu_si256((__m256i*)data);
+    __m256i bytes2 = _mm256_loadu_si256((__m256i*)(data + 32));
+    __m256i rg_mask = _mm256_set1_epi32(0xFFFF);
+    __m256i rg = _mm256_permute4x64_epi64(
+        _mm256_packus_epi32(_mm256_and_si256(bytes1, rg_mask),
+                            _mm256_and_si256(bytes2, rg_mask)),
+        0b11011000);
+    __m256i ba = _mm256_permute4x64_epi64(
+        _mm256_packus_epi32(_mm256_srli_epi32(bytes1, 16),
+                            _mm256_srli_epi32(bytes2, 16)),
+        0b11011000);
+    __m256i r = _mm256_and_si256(rg, _mm256_set1_epi16(0xFF));
+    __m256i g = _mm256_srli_epi16(rg, 8);
+    __m256i b = _mm256_and_si256(ba, _mm256_set1_epi16(0xFF));
+    __m256i a = _mm256_srli_epi16(ba, 8);
+    return {SIMDVec16{r}, SIMDVec16{g}, SIMDVec16{b}, SIMDVec16{a}};
+  }
+  static std::array<SIMDVec16, 4> LoadRGBA16(const unsigned char* data) {
+    __m256i bytes0 = _mm256_loadu_si256((__m256i*)data);
+    __m256i bytes1 = _mm256_loadu_si256((__m256i*)(data + 32));
+    __m256i bytes2 = _mm256_loadu_si256((__m256i*)(data + 64));
+    __m256i bytes3 = _mm256_loadu_si256((__m256i*)(data + 96));
+
+    auto pack32 = [](__m256i a, __m256i b) {
+      return _mm256_permute4x64_epi64(_mm256_packus_epi32(a, b), 0b11011000);
+    };
+    auto packlow32 = [&pack32](__m256i a, __m256i b) {
+      __m256i mask = _mm256_set1_epi32(0xFFFF);
+      return pack32(_mm256_and_si256(a, mask), _mm256_and_si256(b, mask));
+    };
+    auto packhi32 = [&pack32](__m256i a, __m256i b) {
+      return pack32(_mm256_srli_epi32(a, 16), _mm256_srli_epi32(b, 16));
+    };
+
+    __m256i rb0 = packlow32(bytes0, bytes1);
+    __m256i rb1 = packlow32(bytes2, bytes3);
+    __m256i ga0 = packhi32(bytes0, bytes1);
+    __m256i ga1 = packhi32(bytes2, bytes3);
+
+    __m256i r = packlow32(rb0, rb1);
+    __m256i g = packlow32(ga0, ga1);
+    __m256i b = packhi32(rb0, rb1);
+    __m256i a = packhi32(ga0, ga1);
+    return {SIMDVec16{r}, SIMDVec16{g}, SIMDVec16{b}, SIMDVec16{a}};
+  }
+
+  void SwapEndian() {
+    auto indices = _mm256_broadcastsi128_si256(
+        _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14));
+    vec = _mm256_shuffle_epi8(vec, indices);
+  }
+};
+
+SIMDVec16 Mask16::IfThenElse(const SIMDVec16& if_true,
+                             const SIMDVec16& if_false) {
+  return SIMDVec16{_mm256_blendv_epi8(if_false.vec, if_true.vec, mask)};
+}
+
+SIMDVec32 Mask32::IfThenElse(const SIMDVec32& if_true,
+                             const SIMDVec32& if_false) {
+  return SIMDVec32{_mm256_blendv_epi8(if_false.vec, if_true.vec, mask)};
+}
+
+struct Bits64 {
+  static constexpr size_t kLanes = 4;
+
+  __m256i nbits;
+  __m256i bits;
+
+  FJXL_INLINE void Store(uint64_t* nbits_out, uint64_t* bits_out) {
+    _mm256_storeu_si256((__m256i*)nbits_out, nbits);
+    _mm256_storeu_si256((__m256i*)bits_out, bits);
+  }
+};
+
+struct Bits32 {
+  __m256i nbits;
+  __m256i bits;
+
+  static Bits32 FromRaw(SIMDVec32 nbits, SIMDVec32 bits) {
+    return Bits32{nbits.vec, bits.vec};
+  }
+
+  Bits64 Merge() const {
+    auto nbits_hi32 = _mm256_srli_epi64(nbits, 32);
+    auto nbits_lo32 = _mm256_and_si256(nbits, _mm256_set1_epi64x(0xFFFFFFFF));
+    auto bits_hi32 = _mm256_srli_epi64(bits, 32);
+    auto bits_lo32 = _mm256_and_si256(bits, _mm256_set1_epi64x(0xFFFFFFFF));
+
+    auto nbits64 = _mm256_add_epi64(nbits_hi32, nbits_lo32);
+    auto bits64 =
+        _mm256_or_si256(_mm256_sllv_epi64(bits_hi32, nbits_lo32), bits_lo32);
+    return Bits64{nbits64, bits64};
+  }
+
+  void Interleave(const Bits32& low) {
+    bits = _mm256_or_si256(_mm256_sllv_epi32(bits, low.nbits), low.bits);
+    nbits = _mm256_add_epi32(nbits, low.nbits);
+  }
+
+  void ClipTo(size_t n) {
+    n = std::min<size_t>(n, 8);
+    constexpr uint32_t kMask[16] = {
+        ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, 0, 0, 0, 0, 0, 0, 0, 0,
+    };
+    __m256i mask = _mm256_loadu_si256((__m256i*)(kMask + 8 - n));
+    nbits = _mm256_and_si256(mask, nbits);
+    bits = _mm256_and_si256(mask, bits);
+  }
+  void Skip(size_t n) {
+    n = std::min<size_t>(n, 8);
+    constexpr uint32_t kMask[16] = {
+        0, 0, 0, 0, 0, 0, 0, 0, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u,
+    };
+    __m256i mask = _mm256_loadu_si256((__m256i*)(kMask + 8 - n));
+    nbits = _mm256_and_si256(mask, nbits);
+    bits = _mm256_and_si256(mask, bits);
+  }
+};
+
+struct Bits16 {
+  __m256i nbits;
+  __m256i bits;
+
+  static Bits16 FromRaw(SIMDVec16 nbits, SIMDVec16 bits) {
+    return Bits16{nbits.vec, bits.vec};
+  }
+
+  Bits32 Merge() const {
+    auto nbits_hi16 = _mm256_srli_epi32(nbits, 16);
+    auto nbits_lo16 = _mm256_and_si256(nbits, _mm256_set1_epi32(0xFFFF));
+    auto bits_hi16 = _mm256_srli_epi32(bits, 16);
+    auto bits_lo16 = _mm256_and_si256(bits, _mm256_set1_epi32(0xFFFF));
+
+    auto nbits32 = _mm256_add_epi32(nbits_hi16, nbits_lo16);
+    auto bits32 =
+        _mm256_or_si256(_mm256_sllv_epi32(bits_hi16, nbits_lo16), bits_lo16);
+    return Bits32{nbits32, bits32};
+  }
+
+  void Interleave(const Bits16& low) {
+    auto pow2_lo_lut = _mm256_broadcastsi128_si256(
+        _mm_setr_epi8(1 << 0, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6,
+                      1u << 7, 0, 0, 0, 0, 0, 0, 0, 0));
+    auto low_nbits_masked =
+        _mm256_or_si256(low.nbits, _mm256_set1_epi16(0xFF00));
+
+    auto bits_shifted = _mm256_mullo_epi16(
+        bits, _mm256_shuffle_epi8(pow2_lo_lut, low_nbits_masked));
+
+    nbits = _mm256_add_epi16(nbits, low.nbits);
+    bits = _mm256_or_si256(bits_shifted, low.bits);
+  }
+
+  void ClipTo(size_t n) {
+    n = std::min<size_t>(n, 16);
+    constexpr uint16_t kMask[32] = {
+        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+        0,      0,      0,      0,      0,      0,      0,      0,
+        0,      0,      0,      0,      0,      0,      0,      0,
+    };
+    __m256i mask = _mm256_loadu_si256((__m256i*)(kMask + 16 - n));
+    nbits = _mm256_and_si256(mask, nbits);
+    bits = _mm256_and_si256(mask, bits);
+  }
+
+  void Skip(size_t n) {
+    n = std::min<size_t>(n, 16);
+    constexpr uint16_t kMask[32] = {
+        0,      0,      0,      0,      0,      0,      0,      0,
+        0,      0,      0,      0,      0,      0,      0,      0,
+        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+    };
+    __m256i mask = _mm256_loadu_si256((__m256i*)(kMask + 16 - n));
+    nbits = _mm256_and_si256(mask, nbits);
+    bits = _mm256_and_si256(mask, bits);
+  }
+};
+
+#endif
+
+#ifdef FJXL_NEON
+#define FJXL_GENERIC_SIMD
+
+struct SIMDVec32;
+
+struct Mask32 {
+  uint32x4_t mask;
+  SIMDVec32 IfThenElse(const SIMDVec32& if_true, const SIMDVec32& if_false);
+  Mask32 And(const Mask32& oth) const {
+    return Mask32{vandq_u32(mask, oth.mask)};
+  }
+  size_t CountPrefix() const {
+    uint32_t val_unset[4] = {0, 1, 2, 3};
+    uint32_t val_set[4] = {4, 4, 4, 4};
+    uint32x4_t val = vbslq_u32(mask, vld1q_u32(val_set), vld1q_u32(val_unset));
+    return vminvq_u32(val);
+  }
+};
+
+struct SIMDVec32 {
+  uint32x4_t vec;
+
+  static constexpr size_t kLanes = 4;
+
+  FJXL_INLINE static SIMDVec32 Load(const uint32_t* data) {
+    return SIMDVec32{vld1q_u32(data)};
+  }
+  FJXL_INLINE void Store(uint32_t* data) { vst1q_u32(data, vec); }
+  FJXL_INLINE static SIMDVec32 Val(uint32_t v) {
+    return SIMDVec32{vdupq_n_u32(v)};
+  }
+  FJXL_INLINE SIMDVec32 ValToToken() const {
+    return SIMDVec32{vsubq_u32(vdupq_n_u32(32), vclzq_u32(vec))};
+  }
+  FJXL_INLINE SIMDVec32 SatSubU(const SIMDVec32& to_subtract) const {
+    return SIMDVec32{vqsubq_u32(vec, to_subtract.vec)};
+  }
+  FJXL_INLINE SIMDVec32 Sub(const SIMDVec32& to_subtract) const {
+    return SIMDVec32{vsubq_u32(vec, to_subtract.vec)};
+  }
+  FJXL_INLINE SIMDVec32 Add(const SIMDVec32& oth) const {
+    return SIMDVec32{vaddq_u32(vec, oth.vec)};
+  }
+  FJXL_INLINE SIMDVec32 Xor(const SIMDVec32& oth) const {
+    return SIMDVec32{veorq_u32(vec, oth.vec)};
+  }
+  FJXL_INLINE SIMDVec32 Pow2() const {
+    return SIMDVec32{vshlq_u32(vdupq_n_u32(1), vreinterpretq_s32_u32(vec))};
+  }
+  FJXL_INLINE Mask32 Eq(const SIMDVec32& oth) const {
+    return Mask32{vceqq_u32(vec, oth.vec)};
+  }
+  FJXL_INLINE Mask32 Gt(const SIMDVec32& oth) const {
+    return Mask32{
+        vcgtq_s32(vreinterpretq_s32_u32(vec), vreinterpretq_s32_u32(oth.vec))};
+  }
+  template <size_t i>
+  FJXL_INLINE SIMDVec32 SignedShiftRight() const {
+    return SIMDVec32{
+        vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_u32(vec), i))};
+  }
+};
+
+struct SIMDVec16;
+
+struct Mask16 {
+  uint16x8_t mask;
+  SIMDVec16 IfThenElse(const SIMDVec16& if_true, const SIMDVec16& if_false);
+  Mask16 And(const Mask16& oth) const {
+    return Mask16{vandq_u16(mask, oth.mask)};
+  }
+  size_t CountPrefix() const {
+    uint16_t val_unset[8] = {0, 1, 2, 3, 4, 5, 6, 7};
+    uint16_t val_set[8] = {8, 8, 8, 8, 8, 8, 8, 8};
+    uint16x8_t val = vbslq_u16(mask, vld1q_u16(val_set), vld1q_u16(val_unset));
+    return vminvq_u16(val);
+  }
+};
+
+struct SIMDVec16 {
+  uint16x8_t vec;
+
+  static constexpr size_t kLanes = 8;
+
+  FJXL_INLINE static SIMDVec16 Load(const uint16_t* data) {
+    return SIMDVec16{vld1q_u16(data)};
+  }
+  FJXL_INLINE void Store(uint16_t* data) { vst1q_u16(data, vec); }
+  FJXL_INLINE static SIMDVec16 Val(uint16_t v) {
+    return SIMDVec16{vdupq_n_u16(v)};
+  }
+  FJXL_INLINE static SIMDVec16 FromTwo32(const SIMDVec32& lo,
+                                         const SIMDVec32& hi) {
+    return SIMDVec16{vmovn_high_u32(vmovn_u32(lo.vec), hi.vec)};
+  }
+
+  FJXL_INLINE SIMDVec16 ValToToken() const {
+    return SIMDVec16{vsubq_u16(vdupq_n_u16(16), vclzq_u16(vec))};
+  }
+  FJXL_INLINE SIMDVec16 SatSubU(const SIMDVec16& to_subtract) const {
+    return SIMDVec16{vqsubq_u16(vec, to_subtract.vec)};
+  }
+  FJXL_INLINE SIMDVec16 Sub(const SIMDVec16& to_subtract) const {
+    return SIMDVec16{vsubq_u16(vec, to_subtract.vec)};
+  }
+  FJXL_INLINE SIMDVec16 Add(const SIMDVec16& oth) const {
+    return SIMDVec16{vaddq_u16(vec, oth.vec)};
+  }
+  FJXL_INLINE SIMDVec16 Min(const SIMDVec16& oth) const {
+    return SIMDVec16{vminq_u16(vec, oth.vec)};
+  }
+  FJXL_INLINE Mask16 Eq(const SIMDVec16& oth) const {
+    return Mask16{vceqq_u16(vec, oth.vec)};
+  }
+  FJXL_INLINE Mask16 Gt(const SIMDVec16& oth) const {
+    return Mask16{
+        vcgtq_s16(vreinterpretq_s16_u16(vec), vreinterpretq_s16_u16(oth.vec))};
+  }
+  FJXL_INLINE SIMDVec16 Pow2() const {
+    return SIMDVec16{vshlq_u16(vdupq_n_u16(1), vreinterpretq_s16_u16(vec))};
+  }
+  FJXL_INLINE SIMDVec16 Or(const SIMDVec16& oth) const {
+    return SIMDVec16{vorrq_u16(vec, oth.vec)};
+  }
+  FJXL_INLINE SIMDVec16 Xor(const SIMDVec16& oth) const {
+    return SIMDVec16{veorq_u16(vec, oth.vec)};
+  }
+  FJXL_INLINE SIMDVec16 And(const SIMDVec16& oth) const {
+    return SIMDVec16{vandq_u16(vec, oth.vec)};
+  }
+  FJXL_INLINE SIMDVec16 HAdd(const SIMDVec16& oth) const {
+    return SIMDVec16{vhaddq_u16(vec, oth.vec)};
+  }
+  FJXL_INLINE SIMDVec16 PrepareForU8Lookup() const {
+    return SIMDVec16{vorrq_u16(vec, vdupq_n_u16(0xFF00))};
+  }
+  FJXL_INLINE SIMDVec16 U8Lookup(const uint8_t* table) const {
+    uint8x16_t tbl = vld1q_u8(table);
+    uint8x16_t indices = vreinterpretq_u8_u16(vec);
+    return SIMDVec16{vreinterpretq_u16_u8(vqtbl1q_u8(tbl, indices))};
+  }
+  FJXL_INLINE VecPair<SIMDVec16> Interleave(const SIMDVec16& low) const {
+    return {SIMDVec16{vzip1q_u16(low.vec, vec)},
+            SIMDVec16{vzip2q_u16(low.vec, vec)}};
+  }
+  FJXL_INLINE VecPair<SIMDVec32> Upcast() const {
+    uint32x4_t lo = vmovl_u16(vget_low_u16(vec));
+    uint32x4_t hi = vmovl_high_u16(vec);
+    return {SIMDVec32{lo}, SIMDVec32{hi}};
+  }
+  template <size_t i>
+  FJXL_INLINE SIMDVec16 SignedShiftRight() const {
+    return SIMDVec16{
+        vreinterpretq_u16_s16(vshrq_n_s16(vreinterpretq_s16_u16(vec), i))};
+  }
+
+  static std::array<SIMDVec16, 1> LoadG8(const unsigned char* data) {
+    uint8x8_t v = vld1_u8(data);
+    return {SIMDVec16{vmovl_u8(v)}};
+  }
+  static std::array<SIMDVec16, 1> LoadG16(const unsigned char* data) {
+    return {Load((const uint16_t*)data)};
+  }
+
+  static std::array<SIMDVec16, 2> LoadGA8(const unsigned char* data) {
+    uint8x8x2_t v = vld2_u8(data);
+    return {SIMDVec16{vmovl_u8(v.val[0])}, SIMDVec16{vmovl_u8(v.val[1])}};
+  }
+  static std::array<SIMDVec16, 2> LoadGA16(const unsigned char* data) {
+    uint16x8x2_t v = vld2q_u16((const uint16_t*)data);
+    return {SIMDVec16{v.val[0]}, SIMDVec16{v.val[1]}};
+  }
+
+  static std::array<SIMDVec16, 3> LoadRGB8(const unsigned char* data) {
+    uint8x8x3_t v = vld3_u8(data);
+    return {SIMDVec16{vmovl_u8(v.val[0])}, SIMDVec16{vmovl_u8(v.val[1])},
+            SIMDVec16{vmovl_u8(v.val[2])}};
+  }
+  static std::array<SIMDVec16, 3> LoadRGB16(const unsigned char* data) {
+    uint16x8x3_t v = vld3q_u16((const uint16_t*)data);
+    return {SIMDVec16{v.val[0]}, SIMDVec16{v.val[1]}, SIMDVec16{v.val[2]}};
+  }
+
+  static std::array<SIMDVec16, 4> LoadRGBA8(const unsigned char* data) {
+    uint8x8x4_t v = vld4_u8(data);
+    return {SIMDVec16{vmovl_u8(v.val[0])}, SIMDVec16{vmovl_u8(v.val[1])},
+            SIMDVec16{vmovl_u8(v.val[2])}, SIMDVec16{vmovl_u8(v.val[3])}};
+  }
+  static std::array<SIMDVec16, 4> LoadRGBA16(const unsigned char* data) {
+    uint16x8x4_t v = vld4q_u16((const uint16_t*)data);
+    return {SIMDVec16{v.val[0]}, SIMDVec16{v.val[1]}, SIMDVec16{v.val[2]},
+            SIMDVec16{v.val[3]}};
+  }
+
+  void SwapEndian() {
+    vec = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(vec)));
+  }
+};
+
+SIMDVec16 Mask16::IfThenElse(const SIMDVec16& if_true,
+                             const SIMDVec16& if_false) {
+  return SIMDVec16{vbslq_u16(mask, if_true.vec, if_false.vec)};
+}
+
+SIMDVec32 Mask32::IfThenElse(const SIMDVec32& if_true,
+                             const SIMDVec32& if_false) {
+  return SIMDVec32{vbslq_u32(mask, if_true.vec, if_false.vec)};
+}
+
+struct Bits64 {
+  static constexpr size_t kLanes = 2;
+
+  uint64x2_t nbits;
+  uint64x2_t bits;
+
+  FJXL_INLINE void Store(uint64_t* nbits_out, uint64_t* bits_out) {
+    vst1q_u64(nbits_out, nbits);
+    vst1q_u64(bits_out, bits);
+  }
+};
+
+struct Bits32 {
+  uint32x4_t nbits;
+  uint32x4_t bits;
+
+  static Bits32 FromRaw(SIMDVec32 nbits, SIMDVec32 bits) {
+    return Bits32{nbits.vec, bits.vec};
+  }
+
+  Bits64 Merge() const {
+    // TODO(veluca): can probably be optimized.
+    uint64x2_t nbits_lo32 =
+        vandq_u64(vreinterpretq_u64_u32(nbits), vdupq_n_u64(0xFFFFFFFF));
+    uint64x2_t bits_hi32 =
+        vshlq_u64(vshrq_n_u64(vreinterpretq_u64_u32(bits), 32),
+                  vreinterpretq_s64_u64(nbits_lo32));
+    uint64x2_t bits_lo32 =
+        vandq_u64(vreinterpretq_u64_u32(bits), vdupq_n_u64(0xFFFFFFFF));
+    uint64x2_t nbits64 =
+        vsraq_n_u64(nbits_lo32, vreinterpretq_u64_u32(nbits), 32);
+    uint64x2_t bits64 = vorrq_u64(bits_hi32, bits_lo32);
+    return Bits64{nbits64, bits64};
+  }
+
+  void Interleave(const Bits32& low) {
+    bits =
+        vorrq_u32(vshlq_u32(bits, vreinterpretq_s32_u32(low.nbits)), low.bits);
+    nbits = vaddq_u32(nbits, low.nbits);
+  }
+
+  void ClipTo(size_t n) {
+    n = std::min<size_t>(n, 4);
+    constexpr uint32_t kMask[8] = {
+        ~0u, ~0u, ~0u, ~0u, 0, 0, 0, 0,
+    };
+    uint32x4_t mask = vld1q_u32(kMask + 4 - n);
+    nbits = vandq_u32(mask, nbits);
+    bits = vandq_u32(mask, bits);
+  }
+  void Skip(size_t n) {
+    n = std::min<size_t>(n, 4);
+    constexpr uint32_t kMask[8] = {
+        0, 0, 0, 0, ~0u, ~0u, ~0u, ~0u,
+    };
+    uint32x4_t mask = vld1q_u32(kMask + 4 - n);
+    nbits = vandq_u32(mask, nbits);
+    bits = vandq_u32(mask, bits);
+  }
+};
+
+struct Bits16 {
+  uint16x8_t nbits;
+  uint16x8_t bits;
+
+  static Bits16 FromRaw(SIMDVec16 nbits, SIMDVec16 bits) {
+    return Bits16{nbits.vec, bits.vec};
+  }
+
+  Bits32 Merge() const {
+    // TODO(veluca): can probably be optimized.
+    uint32x4_t nbits_lo16 =
+        vandq_u32(vreinterpretq_u32_u16(nbits), vdupq_n_u32(0xFFFF));
+    uint32x4_t bits_hi16 =
+        vshlq_u32(vshrq_n_u32(vreinterpretq_u32_u16(bits), 16),
+                  vreinterpretq_s32_u32(nbits_lo16));
+    uint32x4_t bits_lo16 =
+        vandq_u32(vreinterpretq_u32_u16(bits), vdupq_n_u32(0xFFFF));
+    uint32x4_t nbits32 =
+        vsraq_n_u32(nbits_lo16, vreinterpretq_u32_u16(nbits), 16);
+    uint32x4_t bits32 = vorrq_u32(bits_hi16, bits_lo16);
+    return Bits32{nbits32, bits32};
+  }
+
+  void Interleave(const Bits16& low) {
+    bits =
+        vorrq_u16(vshlq_u16(bits, vreinterpretq_s16_u16(low.nbits)), low.bits);
+    nbits = vaddq_u16(nbits, low.nbits);
+  }
+
+  void ClipTo(size_t n) {
+    n = std::min<size_t>(n, 8);
+    constexpr uint16_t kMask[16] = {
+        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+        0,      0,      0,      0,      0,      0,      0,      0,
+    };
+    uint16x8_t mask = vld1q_u16(kMask + 8 - n);
+    nbits = vandq_u16(mask, nbits);
+    bits = vandq_u16(mask, bits);
+  }
+  void Skip(size_t n) {
+    n = std::min<size_t>(n, 8);
+    constexpr uint16_t kMask[16] = {
+        0,      0,      0,      0,      0,      0,      0,      0,
+        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+    };
+    uint16x8_t mask = vld1q_u16(kMask + 8 - n);
+    nbits = vandq_u16(mask, nbits);
+    bits = vandq_u16(mask, bits);
+  }
+};
+
+#endif
+
+#ifdef FJXL_GENERIC_SIMD
+constexpr size_t SIMDVec32::kLanes;
+constexpr size_t SIMDVec16::kLanes;
+
+//  Each of these functions will process SIMDVec16::kLanes worth of values.
+
+FJXL_INLINE void TokenizeSIMD(const uint16_t* residuals, uint16_t* token_out,
+                              uint16_t* nbits_out, uint16_t* bits_out) {
+  SIMDVec16 res = SIMDVec16::Load(residuals);
+  SIMDVec16 token = res.ValToToken();
+  SIMDVec16 nbits = token.SatSubU(SIMDVec16::Val(1));
+  SIMDVec16 bits = res.SatSubU(nbits.Pow2());
+  token.Store(token_out);
+  nbits.Store(nbits_out);
+  bits.Store(bits_out);
+}
+
+FJXL_INLINE void TokenizeSIMD(const uint32_t* residuals, uint16_t* token_out,
+                              uint32_t* nbits_out, uint32_t* bits_out) {
+  static_assert(SIMDVec16::kLanes == 2 * SIMDVec32::kLanes, "");
+  SIMDVec32 res_lo = SIMDVec32::Load(residuals);
+  SIMDVec32 res_hi = SIMDVec32::Load(residuals + SIMDVec32::kLanes);
+  SIMDVec32 token_lo = res_lo.ValToToken();
+  SIMDVec32 token_hi = res_hi.ValToToken();
+  SIMDVec32 nbits_lo = token_lo.SatSubU(SIMDVec32::Val(1));
+  SIMDVec32 nbits_hi = token_hi.SatSubU(SIMDVec32::Val(1));
+  SIMDVec32 bits_lo = res_lo.SatSubU(nbits_lo.Pow2());
+  SIMDVec32 bits_hi = res_hi.SatSubU(nbits_hi.Pow2());
+  SIMDVec16 token = SIMDVec16::FromTwo32(token_lo, token_hi);
+  token.Store(token_out);
+  nbits_lo.Store(nbits_out);
+  nbits_hi.Store(nbits_out + SIMDVec32::kLanes);
+  bits_lo.Store(bits_out);
+  bits_hi.Store(bits_out + SIMDVec32::kLanes);
+}
+
+FJXL_INLINE void HuffmanSIMDUpTo13(const uint16_t* tokens,
+                                   const PrefixCode& code, uint16_t* nbits_out,
+                                   uint16_t* bits_out) {
+  SIMDVec16 tok = SIMDVec16::Load(tokens).PrepareForU8Lookup();
+  tok.U8Lookup(code.raw_nbits_simd).Store(nbits_out);
+  tok.U8Lookup(code.raw_bits_simd).Store(bits_out);
+}
+
+FJXL_INLINE void HuffmanSIMD14(const uint16_t* tokens, const PrefixCode& code,
+                               uint16_t* nbits_out, uint16_t* bits_out) {
+  SIMDVec16 token_cap = SIMDVec16::Val(15);
+  SIMDVec16 tok = SIMDVec16::Load(tokens);
+  SIMDVec16 tok_index = tok.Min(token_cap).PrepareForU8Lookup();
+  SIMDVec16 huff_bits_pre = tok_index.U8Lookup(code.raw_bits_simd);
+  // Set the highest bit when token == 16; the Huffman code is constructed in
+  // such a way that the code for token 15 is the same as the code for 16,
+  // except for the highest bit.
+  Mask16 needs_high_bit = tok.Eq(SIMDVec16::Val(16));
+  SIMDVec16 huff_bits = needs_high_bit.IfThenElse(
+      huff_bits_pre.Or(SIMDVec16::Val(128)), huff_bits_pre);
+  huff_bits.Store(bits_out);
+  tok_index.U8Lookup(code.raw_nbits_simd).Store(nbits_out);
+}
+
+FJXL_INLINE void HuffmanSIMDAbove14(const uint16_t* tokens,
+                                    const PrefixCode& code, uint16_t* nbits_out,
+                                    uint16_t* bits_out) {
+  SIMDVec16 tok = SIMDVec16::Load(tokens);
+  // We assume `tok` fits in a *signed* 16-bit integer.
+  Mask16 above = tok.Gt(SIMDVec16::Val(12));
+  // 13, 14 -> 13
+  // 15, 16 -> 14
+  // 17, 18 -> 15
+  SIMDVec16 remap_tok = above.IfThenElse(tok.HAdd(SIMDVec16::Val(13)), tok);
+  SIMDVec16 tok_index = remap_tok.PrepareForU8Lookup();
+  SIMDVec16 huff_bits_pre = tok_index.U8Lookup(code.raw_bits_simd);
+  // Set the highest bit when token == 14, 16, 18.
+  Mask16 needs_high_bit = above.And(tok.Eq(tok.And(SIMDVec16::Val(0xFFFE))));
+  SIMDVec16 huff_bits = needs_high_bit.IfThenElse(
+      huff_bits_pre.Or(SIMDVec16::Val(128)), huff_bits_pre);
+  huff_bits.Store(bits_out);
+  tok_index.U8Lookup(code.raw_nbits_simd).Store(nbits_out);
+}
+
+FJXL_INLINE void StoreSIMDUpTo8(const uint16_t* nbits_tok,
+                                const uint16_t* bits_tok,
+                                const uint16_t* nbits_huff,
+                                const uint16_t* bits_huff, size_t n,
+                                size_t skip, Bits32* bits_out) {
+  Bits16 bits =
+      Bits16::FromRaw(SIMDVec16::Load(nbits_tok), SIMDVec16::Load(bits_tok));
+  Bits16 huff_bits =
+      Bits16::FromRaw(SIMDVec16::Load(nbits_huff), SIMDVec16::Load(bits_huff));
+  bits.Interleave(huff_bits);
+  bits.ClipTo(n);
+  bits.Skip(skip);
+  bits_out[0] = bits.Merge();
+}
+
+// Huffman and raw bits don't necessarily fit in a single u16 here.
+FJXL_INLINE void StoreSIMDUpTo14(const uint16_t* nbits_tok,
+                                 const uint16_t* bits_tok,
+                                 const uint16_t* nbits_huff,
+                                 const uint16_t* bits_huff, size_t n,
+                                 size_t skip, Bits32* bits_out) {
+  VecPair<SIMDVec16> bits =
+      SIMDVec16::Load(bits_tok).Interleave(SIMDVec16::Load(bits_huff));
+  VecPair<SIMDVec16> nbits =
+      SIMDVec16::Load(nbits_tok).Interleave(SIMDVec16::Load(nbits_huff));
+  Bits16 low = Bits16::FromRaw(nbits.low, bits.low);
+  Bits16 hi = Bits16::FromRaw(nbits.hi, bits.hi);
+  low.ClipTo(2 * n);
+  low.Skip(2 * skip);
+  hi.ClipTo(std::max(2 * n, SIMDVec16::kLanes) - SIMDVec16::kLanes);
+  hi.Skip(std::max(2 * skip, SIMDVec16::kLanes) - SIMDVec16::kLanes);
+
+  bits_out[0] = low.Merge();
+  bits_out[1] = hi.Merge();
+}
+
+FJXL_INLINE void StoreSIMDAbove14(const uint32_t* nbits_tok,
+                                  const uint32_t* bits_tok,
+                                  const uint16_t* nbits_huff,
+                                  const uint16_t* bits_huff, size_t n,
+                                  size_t skip, Bits32* bits_out) {
+  static_assert(SIMDVec16::kLanes == 2 * SIMDVec32::kLanes, "");
+  Bits32 bits_low =
+      Bits32::FromRaw(SIMDVec32::Load(nbits_tok), SIMDVec32::Load(bits_tok));
+  Bits32 bits_hi =
+      Bits32::FromRaw(SIMDVec32::Load(nbits_tok + SIMDVec32::kLanes),
+                      SIMDVec32::Load(bits_tok + SIMDVec32::kLanes));
+
+  VecPair<SIMDVec32> huff_bits = SIMDVec16::Load(bits_huff).Upcast();
+  VecPair<SIMDVec32> huff_nbits = SIMDVec16::Load(nbits_huff).Upcast();
+
+  Bits32 huff_low = Bits32::FromRaw(huff_nbits.low, huff_bits.low);
+  Bits32 huff_hi = Bits32::FromRaw(huff_nbits.hi, huff_bits.hi);
+
+  bits_low.Interleave(huff_low);
+  bits_low.ClipTo(n);
+  bits_low.Skip(skip);
+  bits_out[0] = bits_low;
+  bits_hi.Interleave(huff_hi);
+  bits_hi.ClipTo(std::max(n, SIMDVec32::kLanes) - SIMDVec32::kLanes);
+  bits_hi.Skip(std::max(skip, SIMDVec32::kLanes) - SIMDVec32::kLanes);
+  bits_out[1] = bits_hi;
+}
+
+#ifdef FJXL_AVX512
+FJXL_INLINE void StoreToWriterAVX512(const Bits32& bits32, BitWriter& output) {
+  __m512i bits = bits32.bits;
+  __m512i nbits = bits32.nbits;
+
+  // Insert the leftover bits from the bit buffer at the bottom of the vector
+  // and extract the top of the vector.
+  uint64_t trail_bits =
+      _mm512_cvtsi512_si32(_mm512_alignr_epi32(bits, bits, 15));
+  uint64_t trail_nbits =
+      _mm512_cvtsi512_si32(_mm512_alignr_epi32(nbits, nbits, 15));
+  __m512i lead_bits = _mm512_set1_epi32(output.buffer);
+  __m512i lead_nbits = _mm512_set1_epi32(output.bits_in_buffer);
+  bits = _mm512_alignr_epi32(bits, lead_bits, 15);
+  nbits = _mm512_alignr_epi32(nbits, lead_nbits, 15);
+
+  // Merge 32 -> 64 bits.
+  Bits32 b{nbits, bits};
+  Bits64 b64 = b.Merge();
+  bits = b64.bits;
+  nbits = b64.nbits;
+
+  __m512i zero = _mm512_setzero_si512();
+
+  auto sh1 = [zero](__m512i vec) { return _mm512_alignr_epi64(vec, zero, 7); };
+  auto sh2 = [zero](__m512i vec) { return _mm512_alignr_epi64(vec, zero, 6); };
+  auto sh4 = [zero](__m512i vec) { return _mm512_alignr_epi64(vec, zero, 4); };
+
+  // Compute first-past-end-bit-position.
+  __m512i end_interm0 = _mm512_add_epi64(nbits, sh1(nbits));
+  __m512i end_interm1 = _mm512_add_epi64(end_interm0, sh2(end_interm0));
+  __m512i end = _mm512_add_epi64(end_interm1, sh4(end_interm1));
+
+  uint64_t simd_nbits = _mm512_cvtsi512_si32(_mm512_alignr_epi64(end, end, 7));
+
+  // Compute begin-bit-position.
+  __m512i begin = _mm512_sub_epi64(end, nbits);
+
+  // Index of the last bit in the chunk, or the end bit if nbits==0.
+  __m512i last = _mm512_mask_sub_epi64(
+      end, _mm512_cmpneq_epi64_mask(nbits, zero), end, _mm512_set1_epi64(1));
+
+  __m512i lane_offset_mask = _mm512_set1_epi64(63);
+
+  // Starting position of the chunk that each lane will ultimately belong to.
+  __m512i chunk_start = _mm512_andnot_si512(lane_offset_mask, last);
+
+  // For all lanes that contain bits belonging to two different 64-bit chunks,
+  // compute the number of bits that belong to the first chunk.
+  // total # of bits fit in a u16, so we can satsub_u16 here.
+  __m512i first_chunk_nbits = _mm512_subs_epu16(chunk_start, begin);
+
+  // Move all the previous-chunk-bits to the previous lane.
+  __m512i negnbits = _mm512_sub_epi64(_mm512_set1_epi64(64), first_chunk_nbits);
+  __m512i first_chunk_bits =
+      _mm512_srlv_epi64(_mm512_sllv_epi64(bits, negnbits), negnbits);
+  __m512i first_chunk_bits_down =
+      _mm512_alignr_epi32(zero, first_chunk_bits, 2);
+  bits = _mm512_srlv_epi64(bits, first_chunk_nbits);
+  nbits = _mm512_sub_epi64(nbits, first_chunk_nbits);
+  bits = _mm512_or_si512(bits, _mm512_sllv_epi64(first_chunk_bits_down, nbits));
+  begin = _mm512_add_epi64(begin, first_chunk_nbits);
+
+  // We now know that every lane should give bits to only one chunk. We can
+  // shift the bits and then horizontally-or-reduce them within the same chunk.
+  __m512i offset = _mm512_and_si512(begin, lane_offset_mask);
+  __m512i aligned_bits = _mm512_sllv_epi64(bits, offset);
+  // h-or-reduce within same chunk
+  __m512i red0 = _mm512_mask_or_epi64(
+      aligned_bits, _mm512_cmpeq_epi64_mask(sh1(chunk_start), chunk_start),
+      sh1(aligned_bits), aligned_bits);
+  __m512i red1 = _mm512_mask_or_epi64(
+      red0, _mm512_cmpeq_epi64_mask(sh2(chunk_start), chunk_start), sh2(red0),
+      red0);
+  __m512i reduced = _mm512_mask_or_epi64(
+      red1, _mm512_cmpeq_epi64_mask(sh4(chunk_start), chunk_start), sh4(red1),
+      red1);
+  // Extract the highest lane that belongs to each chunk (the lane that ends up
+  // with the OR-ed value of all the other lanes of that chunk).
+  __m512i next_chunk_start =
+      _mm512_alignr_epi32(_mm512_set1_epi64(~0), chunk_start, 2);
+  __m512i result = _mm512_maskz_compress_epi64(
+      _mm512_cmpneq_epi64_mask(chunk_start, next_chunk_start), reduced);
+
+  _mm512_storeu_si512((__m512i*)(output.data.get() + output.bytes_written),
+                      result);
+
+  // Update the bit writer and add the last 32-bit lane.
+  // Note that since trail_nbits was at most 32 to begin with, operating on
+  // trail_bits does not risk overflowing.
+  output.bytes_written += simd_nbits / 8;
+  // Here we are implicitly relying on the fact that simd_nbits < 512 to know
+  // that the byte of bitreader data we access is initialized. This is
+  // guaranteed because the remaining bits in the bitreader buffer are at most
+  // 7, so simd_nbits <= 505 always.
+  trail_bits = (trail_bits << (simd_nbits % 8)) +
+               output.data.get()[output.bytes_written];
+  trail_nbits += simd_nbits % 8;
+  StoreLE64(output.data.get() + output.bytes_written, trail_bits);
+  size_t trail_bytes = trail_nbits / 8;
+  output.bits_in_buffer = trail_nbits % 8;
+  output.buffer = trail_bits >> (trail_bytes * 8);
+  output.bytes_written += trail_bytes;
+}
+
+#endif
+
+template <size_t n>
+FJXL_INLINE void StoreToWriter(const Bits32* bits, BitWriter& output) {
+#ifdef FJXL_AVX512
+  static_assert(n <= 2, "");
+  StoreToWriterAVX512(bits[0], output);
+  if (n == 2) {
+    StoreToWriterAVX512(bits[1], output);
+  }
+  return;
+#endif
+  static_assert(n <= 4, "");
+  alignas(64) uint64_t nbits64[Bits64::kLanes * n];
+  alignas(64) uint64_t bits64[Bits64::kLanes * n];
+  bits[0].Merge().Store(nbits64, bits64);
+  if (n > 1) {
+    bits[1].Merge().Store(nbits64 + Bits64::kLanes, bits64 + Bits64::kLanes);
+  }
+  if (n > 2) {
+    bits[2].Merge().Store(nbits64 + 2 * Bits64::kLanes,
+                          bits64 + 2 * Bits64::kLanes);
+  }
+  if (n > 3) {
+    bits[3].Merge().Store(nbits64 + 3 * Bits64::kLanes,
+                          bits64 + 3 * Bits64::kLanes);
+  }
+  output.WriteMultiple(nbits64, bits64, Bits64::kLanes * n);
+}
+
+namespace detail {
+template <typename T>
+struct IntegerTypes;
+
+template <>
+struct IntegerTypes<SIMDVec16> {
+  using signed_ = int16_t;
+  using unsigned_ = uint16_t;
+};
+
+template <>
+struct IntegerTypes<SIMDVec32> {
+  using signed_ = int32_t;
+  using unsigned_ = uint32_t;
+};
+
+template <typename T>
+struct SIMDType;
+
+template <>
+struct SIMDType<int16_t> {
+  using type = SIMDVec16;
+};
+
+template <>
+struct SIMDType<int32_t> {
+  using type = SIMDVec32;
+};
+
+}  // namespace detail
+
+template <typename T>
+using signed_t = typename detail::IntegerTypes<T>::signed_;
+
+template <typename T>
+using unsigned_t = typename detail::IntegerTypes<T>::unsigned_;
+
+template <typename T>
+using simd_t = typename detail::SIMDType<T>::type;
+
+// This function will process exactly one vector worth of pixels.
+
+template <typename T>
+size_t PredictPixels(const signed_t<T>* pixels, const signed_t<T>* pixels_left,
+                     const signed_t<T>* pixels_top,
+                     const signed_t<T>* pixels_topleft,
+                     unsigned_t<T>* residuals) {
+  T px = T::Load((unsigned_t<T>*)pixels);
+  T left = T::Load((unsigned_t<T>*)pixels_left);
+  T top = T::Load((unsigned_t<T>*)pixels_top);
+  T topleft = T::Load((unsigned_t<T>*)pixels_topleft);
+  T ac = left.Sub(topleft);
+  T ab = left.Sub(top);
+  T bc = top.Sub(topleft);
+  T grad = ac.Add(top);
+  T d = ab.Xor(bc);
+  T zero = T::Val(0);
+  T clamp = zero.Gt(d).IfThenElse(top, left);
+  T s = ac.Xor(bc);
+  T pred = zero.Gt(s).IfThenElse(grad, clamp);
+  T res = px.Sub(pred);
+  T res_times_2 = res.Add(res);
+  res = zero.Gt(res).IfThenElse(T::Val(-1).Sub(res_times_2), res_times_2);
+  res.Store(residuals);
+  return res.Eq(T::Val(0)).CountPrefix();
+}
+
+#endif
+
+void EncodeHybridUint000(uint32_t value, uint32_t* token, uint32_t* nbits,
+                         uint32_t* bits) {
+  uint32_t n = FloorLog2(value);
+  *token = value ? n + 1 : 0;
+  *nbits = value ? n : 0;
+  *bits = value ? value - (1 << n) : 0;
+}
+
+#ifdef FJXL_AVX512
+constexpr static size_t kLogChunkSize = 5;
+#elif defined(FJXL_AVX2) || defined(FJXL_NEON)
+// Even if NEON only has 128-bit lanes, it is still significantly (~1.3x) faster
+// to process two vectors at a time.
+constexpr static size_t kLogChunkSize = 4;
+#else
+constexpr static size_t kLogChunkSize = 3;
+#endif
+
+constexpr static size_t kChunkSize = 1 << kLogChunkSize;
+
+template <typename Residual>
+void GenericEncodeChunk(const Residual* residuals, size_t n, size_t skip,
+                        const PrefixCode& code, BitWriter& output) {
+  for (size_t ix = skip; ix < n; ix++) {
+    unsigned token, nbits, bits;
+    EncodeHybridUint000(residuals[ix], &token, &nbits, &bits);
+    output.Write(code.raw_nbits[token] + nbits,
+                 code.raw_bits[token] | bits << code.raw_nbits[token]);
+  }
+}
+
+struct UpTo8Bits {
+  size_t bitdepth;
+  explicit UpTo8Bits(size_t bitdepth) : bitdepth(bitdepth) {
+    assert(bitdepth <= 8);
+  }
+  // Here we can fit up to 9 extra bits + 7 Huffman bits in a u16; for all other
+  // symbols, we could actually go up to 8 Huffman bits as we have at most 8
+  // extra bits; however, the SIMD bit merging logic for AVX2 assumes that no
+  // Huffman length is 8 or more, so we cap at 8 anyway. Last symbol is used for
+  // LZ77 lengths and has no limitations except allowing to represent 32 symbols
+  // in total.
+  static constexpr uint8_t kMinRawLength[12] = {};
+  static constexpr uint8_t kMaxRawLength[12] = {
+      7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 10,
+  };
+  static size_t MaxEncodedBitsPerSample() { return 16; }
+  static constexpr size_t kInputBytes = 1;
+  using pixel_t = int16_t;
+  using upixel_t = uint16_t;
+
+  static void PrepareForSimd(const uint8_t* nbits, const uint8_t* bits,
+                             size_t n, uint8_t* nbits_simd,
+                             uint8_t* bits_simd) {
+    assert(n <= 16);
+    memcpy(nbits_simd, nbits, 16);
+    memcpy(bits_simd, bits, 16);
+  }
+
+  static void EncodeChunk(upixel_t* residuals, size_t n, size_t skip,
+                          const PrefixCode& code, BitWriter& output) {
+#ifdef FJXL_GENERIC_SIMD
+    Bits32 bits32[kChunkSize / SIMDVec16::kLanes];
+    alignas(64) uint16_t bits[SIMDVec16::kLanes];
+    alignas(64) uint16_t nbits[SIMDVec16::kLanes];
+    alignas(64) uint16_t bits_huff[SIMDVec16::kLanes];
+    alignas(64) uint16_t nbits_huff[SIMDVec16::kLanes];
+    alignas(64) uint16_t token[SIMDVec16::kLanes];
+    for (size_t i = 0; i < kChunkSize; i += SIMDVec16::kLanes) {
+      TokenizeSIMD(residuals + i, token, nbits, bits);
+      HuffmanSIMDUpTo13(token, code, nbits_huff, bits_huff);
+      StoreSIMDUpTo8(nbits, bits, nbits_huff, bits_huff, std::max(n, i) - i,
+                     std::max(skip, i) - i, bits32 + i / SIMDVec16::kLanes);
+    }
+    StoreToWriter<kChunkSize / SIMDVec16::kLanes>(bits32, output);
+    return;
+#endif
+    GenericEncodeChunk(residuals, n, skip, code, output);
+  }
+
+  size_t NumSymbols(bool doing_ycocg) const {
+    // values gain 1 bit for YCoCg, 1 bit for prediction.
+    // Maximum symbol is 1 + effective bit depth of residuals.
+    if (doing_ycocg) {
+      return bitdepth + 3;
+    } else {
+      return bitdepth + 2;
+    }
+  }
+};
+constexpr uint8_t UpTo8Bits::kMinRawLength[];
+constexpr uint8_t UpTo8Bits::kMaxRawLength[];
+
+struct From9To13Bits {
+  size_t bitdepth;
+  explicit From9To13Bits(size_t bitdepth) : bitdepth(bitdepth) {
+    assert(bitdepth <= 13 && bitdepth >= 9);
+  }
+  // Last symbol is used for LZ77 lengths and has no limitations except allowing
+  // to represent 32 symbols in total.
+  // We cannot fit all the bits in a u16, so do not even try and use up to 8
+  // bits per raw symbol.
+  // There are at most 16 raw symbols, so Huffman coding can be SIMDfied without
+  // any special tricks.
+  static constexpr uint8_t kMinRawLength[17] = {};
+  static constexpr uint8_t kMaxRawLength[17] = {
+      8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 10,
+  };
+  static size_t MaxEncodedBitsPerSample() { return 21; }
+  static constexpr size_t kInputBytes = 2;
+  using pixel_t = int16_t;
+  using upixel_t = uint16_t;
+
+  static void PrepareForSimd(const uint8_t* nbits, const uint8_t* bits,
+                             size_t n, uint8_t* nbits_simd,
+                             uint8_t* bits_simd) {
+    assert(n <= 16);
+    memcpy(nbits_simd, nbits, 16);
+    memcpy(bits_simd, bits, 16);
+  }
+
+  static void EncodeChunk(upixel_t* residuals, size_t n, size_t skip,
+                          const PrefixCode& code, BitWriter& output) {
+#ifdef FJXL_GENERIC_SIMD
+    Bits32 bits32[2 * kChunkSize / SIMDVec16::kLanes];
+    alignas(64) uint16_t bits[SIMDVec16::kLanes];
+    alignas(64) uint16_t nbits[SIMDVec16::kLanes];
+    alignas(64) uint16_t bits_huff[SIMDVec16::kLanes];
+    alignas(64) uint16_t nbits_huff[SIMDVec16::kLanes];
+    alignas(64) uint16_t token[SIMDVec16::kLanes];
+    for (size_t i = 0; i < kChunkSize; i += SIMDVec16::kLanes) {
+      TokenizeSIMD(residuals + i, token, nbits, bits);
+      HuffmanSIMDUpTo13(token, code, nbits_huff, bits_huff);
+      StoreSIMDUpTo14(nbits, bits, nbits_huff, bits_huff, std::max(n, i) - i,
+                      std::max(skip, i) - i,
+                      bits32 + 2 * i / SIMDVec16::kLanes);
+    }
+    StoreToWriter<2 * kChunkSize / SIMDVec16::kLanes>(bits32, output);
+    return;
+#endif
+    GenericEncodeChunk(residuals, n, skip, code, output);
+  }
+
+  size_t NumSymbols(bool doing_ycocg) const {
+    // values gain 1 bit for YCoCg, 1 bit for prediction.
+    // Maximum symbol is 1 + effective bit depth of residuals.
+    if (doing_ycocg) {
+      return bitdepth + 3;
+    } else {
+      return bitdepth + 2;
+    }
+  }
+};
+constexpr uint8_t From9To13Bits::kMinRawLength[];
+constexpr uint8_t From9To13Bits::kMaxRawLength[];
+
+void CheckHuffmanBitsSIMD(int bits1, int nbits1, int bits2, int nbits2) {
+  assert(nbits1 == 8);
+  assert(nbits2 == 8);
+  assert(bits2 == (bits1 | 128));
+}
+
+struct Exactly14Bits {
+  explicit Exactly14Bits(size_t bitdepth) { assert(bitdepth == 14); }
+  // Force LZ77 symbols to have at least 8 bits, and raw symbols 15 and 16 to
+  // have exactly 8, and no other symbol to have 8 or more. This ensures that
+  // the representation for 15 and 16 is identical up to one bit.
+  static constexpr uint8_t kMinRawLength[18] = {
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 7,
+  };
+  static constexpr uint8_t kMaxRawLength[18] = {
+      7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 10,
+  };
+  static constexpr size_t bitdepth = 14;
+  static size_t MaxEncodedBitsPerSample() { return 22; }
+  static constexpr size_t kInputBytes = 2;
+  using pixel_t = int16_t;
+  using upixel_t = uint16_t;
+
+  static void PrepareForSimd(const uint8_t* nbits, const uint8_t* bits,
+                             size_t n, uint8_t* nbits_simd,
+                             uint8_t* bits_simd) {
+    assert(n == 17);
+    CheckHuffmanBitsSIMD(bits[15], nbits[15], bits[16], nbits[16]);
+    memcpy(nbits_simd, nbits, 16);
+    memcpy(bits_simd, bits, 16);
+  }
+
+  static void EncodeChunk(upixel_t* residuals, size_t n, size_t skip,
+                          const PrefixCode& code, BitWriter& output) {
+#ifdef FJXL_GENERIC_SIMD
+    Bits32 bits32[2 * kChunkSize / SIMDVec16::kLanes];
+    alignas(64) uint16_t bits[SIMDVec16::kLanes];
+    alignas(64) uint16_t nbits[SIMDVec16::kLanes];
+    alignas(64) uint16_t bits_huff[SIMDVec16::kLanes];
+    alignas(64) uint16_t nbits_huff[SIMDVec16::kLanes];
+    alignas(64) uint16_t token[SIMDVec16::kLanes];
+    for (size_t i = 0; i < kChunkSize; i += SIMDVec16::kLanes) {
+      TokenizeSIMD(residuals + i, token, nbits, bits);
+      HuffmanSIMD14(token, code, nbits_huff, bits_huff);
+      StoreSIMDUpTo14(nbits, bits, nbits_huff, bits_huff, std::max(n, i) - i,
+                      std::max(skip, i) - i,
+                      bits32 + 2 * i / SIMDVec16::kLanes);
+    }
+    StoreToWriter<2 * kChunkSize / SIMDVec16::kLanes>(bits32, output);
+    return;
+#endif
+    GenericEncodeChunk(residuals, n, skip, code, output);
+  }
+
+  size_t NumSymbols(bool) const { return 17; }
+};
+constexpr uint8_t Exactly14Bits::kMinRawLength[];
+constexpr uint8_t Exactly14Bits::kMaxRawLength[];
+
+struct MoreThan14Bits {
+  size_t bitdepth;
+  explicit MoreThan14Bits(size_t bitdepth) : bitdepth(bitdepth) {
+    assert(bitdepth > 14);
+    assert(bitdepth <= 16);
+  }
+  // Force LZ77 symbols to have at least 8 bits, and raw symbols 13 to 18 to
+  // have exactly 8, and no other symbol to have 8 or more. This ensures that
+  // the representation for (13, 14), (15, 16), (17, 18) is identical up to one
+  // bit.
+  static constexpr uint8_t kMinRawLength[20] = {
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 7,
+  };
+  static constexpr uint8_t kMaxRawLength[20] = {
+      7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 10,
+  };
+  static size_t MaxEncodedBitsPerSample() { return 24; }
+  static constexpr size_t kInputBytes = 2;
+  using pixel_t = int32_t;
+  using upixel_t = uint32_t;
+
+  static void PrepareForSimd(const uint8_t* nbits, const uint8_t* bits,
+                             size_t n, uint8_t* nbits_simd,
+                             uint8_t* bits_simd) {
+    assert(n == 19);
+    CheckHuffmanBitsSIMD(bits[13], nbits[13], bits[14], nbits[14]);
+    CheckHuffmanBitsSIMD(bits[15], nbits[15], bits[16], nbits[16]);
+    CheckHuffmanBitsSIMD(bits[17], nbits[17], bits[18], nbits[18]);
+    for (size_t i = 0; i < 14; i++) {
+      nbits_simd[i] = nbits[i];
+      bits_simd[i] = bits[i];
+    }
+    nbits_simd[14] = nbits[15];
+    bits_simd[14] = bits[15];
+    nbits_simd[15] = nbits[17];
+    bits_simd[15] = bits[17];
+  }
+
+  static void EncodeChunk(upixel_t* residuals, size_t n, size_t skip,
+                          const PrefixCode& code, BitWriter& output) {
+#ifdef FJXL_GENERIC_SIMD
+    Bits32 bits32[2 * kChunkSize / SIMDVec16::kLanes];
+    alignas(64) uint32_t bits[SIMDVec16::kLanes];
+    alignas(64) uint32_t nbits[SIMDVec16::kLanes];
+    alignas(64) uint16_t bits_huff[SIMDVec16::kLanes];
+    alignas(64) uint16_t nbits_huff[SIMDVec16::kLanes];
+    alignas(64) uint16_t token[SIMDVec16::kLanes];
+    for (size_t i = 0; i < kChunkSize; i += SIMDVec16::kLanes) {
+      TokenizeSIMD(residuals + i, token, nbits, bits);
+      HuffmanSIMDAbove14(token, code, nbits_huff, bits_huff);
+      StoreSIMDAbove14(nbits, bits, nbits_huff, bits_huff, std::max(n, i) - i,
+                       std::max(skip, i) - i,
+                       bits32 + 2 * i / SIMDVec16::kLanes);
+    }
+    StoreToWriter<2 * kChunkSize / SIMDVec16::kLanes>(bits32, output);
+    return;
+#endif
+    GenericEncodeChunk(residuals, n, skip, code, output);
+  }
+  size_t NumSymbols(bool) const { return 19; }
+};
+constexpr uint8_t MoreThan14Bits::kMinRawLength[];
+constexpr uint8_t MoreThan14Bits::kMaxRawLength[];
+
+void PrepareDCGlobalCommon(bool is_single_group, size_t width, size_t height,
+                           const PrefixCode code[4], BitWriter* output) {
+  output->Allocate(100000 + (is_single_group ? width * height * 16 : 0));
+  // No patches, spline or noise.
+  output->Write(1, 1);  // default DC dequantization factors (?)
+  output->Write(1, 1);  // use global tree / histograms
+  output->Write(1, 0);  // no lz77 for the tree
+
+  output->Write(1, 1);         // simple code for the tree's context map
+  output->Write(2, 0);         // all contexts clustered together
+  output->Write(1, 1);         // use prefix code for tree
+  output->Write(4, 0);         // 000 hybrid uint
+  output->Write(6, 0b100011);  // Alphabet size is 4 (var16)
+  output->Write(2, 1);         // simple prefix code
+  output->Write(2, 3);         // with 4 symbols
+  output->Write(2, 0);
+  output->Write(2, 1);
+  output->Write(2, 2);
+  output->Write(2, 3);
+  output->Write(1, 0);  // First tree encoding option
+  // Huffman table + extra bits for the tree.
+  uint8_t symbol_bits[6] = {0b00, 0b10, 0b001, 0b101, 0b0011, 0b0111};
+  uint8_t symbol_nbits[6] = {2, 2, 3, 3, 4, 4};
+  // Write a tree with a leaf per channel, and gradient predictor for every
+  // leaf.
+  for (auto v : {1, 2, 1, 4, 1, 0, 0, 5, 0, 0, 0, 0, 5,
+                 0, 0, 0, 0, 5, 0, 0, 0, 0, 5, 0, 0, 0}) {
+    output->Write(symbol_nbits[v], symbol_bits[v]);
+  }
+
+  output->Write(1, 1);     // Enable lz77 for the main bitstream
+  output->Write(2, 0b00);  // lz77 offset 224
+  static_assert(kLZ77Offset == 224, "");
+  output->Write(4, 0b1010);  // lz77 min length 7
+  // 400 hybrid uint config for lz77
+  output->Write(4, 4);
+  output->Write(3, 0);
+  output->Write(3, 0);
+
+  output->Write(1, 1);  // simple code for the context map
+  output->Write(2, 3);  // 3 bits per entry
+  output->Write(3, 4);  // channel 3
+  output->Write(3, 3);  // channel 2
+  output->Write(3, 2);  // channel 1
+  output->Write(3, 1);  // channel 0
+  output->Write(3, 0);  // distance histogram first
+
+  output->Write(1, 1);  // use prefix codes
+  output->Write(4, 0);  // 000 hybrid uint config for distances (only need 0)
+  for (size_t i = 0; i < 4; i++) {
+    output->Write(4, 0);  // 000 hybrid uint config for symbols (only <= 10)
+  }
+
+  // Distance alphabet size:
+  output->Write(5, 0b00001);  // 2: just need 1 for RLE (i.e. distance 1)
+  // Symbol + LZ77 alphabet size:
+  for (size_t i = 0; i < 4; i++) {
+    output->Write(1, 1);    // > 1
+    output->Write(4, 8);    // <= 512
+    output->Write(8, 256);  // == 512
+  }
+
+  // Distance histogram:
+  output->Write(2, 1);  // simple prefix code
+  output->Write(2, 0);  // with one symbol
+  output->Write(1, 1);  // 1
+
+  // Symbol + lz77 histogram:
+  for (size_t i = 0; i < 4; i++) {
+    code[i].WriteTo(output);
+  }
+
+  // Group header for global modular image.
+  output->Write(1, 1);  // Global tree
+  output->Write(1, 1);  // All default wp
+}
+
+void PrepareDCGlobal(bool is_single_group, size_t width, size_t height,
+                     size_t nb_chans, const PrefixCode code[4],
+                     BitWriter* output) {
+  PrepareDCGlobalCommon(is_single_group, width, height, code, output);
+  if (nb_chans > 2) {
+    output->Write(2, 0b01);     // 1 transform
+    output->Write(2, 0b00);     // RCT
+    output->Write(5, 0b00000);  // Starting from ch 0
+    output->Write(2, 0b00);     // YCoCg
+  } else {
+    output->Write(2, 0b00);  // no transforms
+  }
+  if (!is_single_group) {
+    output->ZeroPadToByte();
+  }
+}
+
+template <typename BitDepth>
+struct ChunkEncoder {
+  FJXL_INLINE static void EncodeRle(size_t count, const PrefixCode& code,
+                                    BitWriter& output) {
+    if (count == 0) return;
+    count -= kLZ77MinLength + 1;
+    if (count < kLZ77CacheSize) {
+      output.Write(code.lz77_cache_nbits[count], code.lz77_cache_bits[count]);
+    } else {
+      unsigned token, nbits, bits;
+      EncodeHybridUintLZ77(count, &token, &nbits, &bits);
+      uint64_t wbits = bits;
+      wbits = (wbits << code.lz77_nbits[token]) | code.lz77_bits[token];
+      wbits = (wbits << code.raw_nbits[0]) | code.raw_bits[0];
+      output.Write(code.lz77_nbits[token] + nbits + code.raw_nbits[0], wbits);
+    }
+  }
+
+  FJXL_INLINE void Chunk(size_t run, typename BitDepth::upixel_t* residuals,
+                         size_t skip, size_t n) {
+    EncodeRle(run, *code, *output);
+    BitDepth::EncodeChunk(residuals, n, skip, *code, *output);
+  }
+
+  inline void Finalize(size_t run) { EncodeRle(run, *code, *output); }
+
+  const PrefixCode* code;
+  BitWriter* output;
+};
+
+template <typename BitDepth>
+struct ChunkSampleCollector {
+  FJXL_INLINE void Rle(size_t count, uint64_t* lz77_counts) {
+    if (count == 0) return;
+    raw_counts[0] += 1;
+    count -= kLZ77MinLength + 1;
+    unsigned token, nbits, bits;
+    EncodeHybridUintLZ77(count, &token, &nbits, &bits);
+    lz77_counts[token]++;
+  }
+
+  FJXL_INLINE void Chunk(size_t run, typename BitDepth::upixel_t* residuals,
+                         size_t skip, size_t n) {
+    // Run is broken. Encode the run and encode the individual vector.
+    Rle(run, lz77_counts);
+    for (size_t ix = skip; ix < n; ix++) {
+      unsigned token, nbits, bits;
+      EncodeHybridUint000(residuals[ix], &token, &nbits, &bits);
+      raw_counts[token]++;
+    }
+  }
+
+  // don't count final run since we don't know how long it really is
+  void Finalize(size_t run) {}
+
+  uint64_t* raw_counts;
+  uint64_t* lz77_counts;
+};
+
+constexpr uint32_t PackSigned(int32_t value) {
+  return (static_cast<uint32_t>(value) << 1) ^
+         ((static_cast<uint32_t>(~value) >> 31) - 1);
+}
+
+template <typename T, typename BitDepth>
+struct ChannelRowProcessor {
+  using upixel_t = typename BitDepth::upixel_t;
+  using pixel_t = typename BitDepth::pixel_t;
+  T* t;
+  void ProcessChunk(const pixel_t* row, const pixel_t* row_left,
+                    const pixel_t* row_top, const pixel_t* row_topleft,
+                    size_t n) {
+    alignas(64) upixel_t residuals[kChunkSize] = {};
+    size_t prefix_size = 0;
+    size_t required_prefix_size = 0;
+#ifdef FJXL_GENERIC_SIMD
+    constexpr size_t kNum =
+        sizeof(pixel_t) == 2 ? SIMDVec16::kLanes : SIMDVec32::kLanes;
+    for (size_t ix = 0; ix < kChunkSize; ix += kNum) {
+      size_t c =
+          PredictPixels<simd_t<pixel_t>>(row + ix, row_left + ix, row_top + ix,
+                                         row_topleft + ix, residuals + ix);
+      prefix_size =
+          prefix_size == required_prefix_size ? prefix_size + c : prefix_size;
+      required_prefix_size += kNum;
+    }
+#else
+    for (size_t ix = 0; ix < kChunkSize; ix++) {
+      pixel_t px = row[ix];
+      pixel_t left = row_left[ix];
+      pixel_t top = row_top[ix];
+      pixel_t topleft = row_topleft[ix];
+      pixel_t ac = left - topleft;
+      pixel_t ab = left - top;
+      pixel_t bc = top - topleft;
+      pixel_t grad = static_cast<pixel_t>(static_cast<upixel_t>(ac) +
+                                          static_cast<upixel_t>(top));
+      pixel_t d = ab ^ bc;
+      pixel_t clamp = d < 0 ? top : left;
+      pixel_t s = ac ^ bc;
+      pixel_t pred = s < 0 ? grad : clamp;
+      residuals[ix] = PackSigned(px - pred);
+      prefix_size = prefix_size == required_prefix_size
+                        ? prefix_size + (residuals[ix] == 0)
+                        : prefix_size;
+      required_prefix_size += 1;
+    }
+#endif
+    prefix_size = std::min(n, prefix_size);
+    if (prefix_size == n && (run > 0 || prefix_size > kLZ77MinLength)) {
+      // Run continues, nothing to do.
+      run += prefix_size;
+    } else if (prefix_size + run > kLZ77MinLength) {
+      // Run is broken. Encode the run and encode the individual vector.
+      t->Chunk(run + prefix_size, residuals, prefix_size, n);
+      run = 0;
+    } else {
+      // There was no run to begin with.
+      t->Chunk(0, residuals, 0, n);
+    }
+  }
+
+  void ProcessRow(const pixel_t* row, const pixel_t* row_left,
+                  const pixel_t* row_top, const pixel_t* row_topleft,
+                  size_t xs) {
+    for (size_t x = 0; x < xs; x += kChunkSize) {
+      ProcessChunk(row + x, row_left + x, row_top + x, row_topleft + x,
+                   std::min(kChunkSize, xs - x));
+    }
+  }
+
+  void Finalize() { t->Finalize(run); }
+  // Invariant: run == 0 or run > kLZ77MinLength.
+  size_t run = 0;
+};
+
+uint16_t LoadLE16(const unsigned char* ptr) {
+  return uint16_t{ptr[0]} | (uint16_t{ptr[1]} << 8);
+}
+
+uint16_t SwapEndian(uint16_t in) { return (in >> 8) | (in << 8); }
+
+#ifdef FJXL_GENERIC_SIMD
+void StorePixels(SIMDVec16 p, int16_t* dest) { p.Store((uint16_t*)dest); }
+
+void StorePixels(SIMDVec16 p, int32_t* dest) {
+  VecPair<SIMDVec32> p_up = p.Upcast();
+  p_up.low.Store((uint32_t*)dest);
+  p_up.hi.Store((uint32_t*)dest + SIMDVec32::kLanes);
+}
+#endif
+
+template <typename pixel_t>
+void FillRowG8(const unsigned char* rgba, size_t oxs, pixel_t* luma) {
+  size_t x = 0;
+#ifdef FJXL_GENERIC_SIMD
+  for (; x + SIMDVec16::kLanes <= oxs; x += SIMDVec16::kLanes) {
+    auto rgb = SIMDVec16::LoadG8(rgba + x);
+    StorePixels(rgb[0], luma + x);
+  }
+#endif
+  for (; x < oxs; x++) {
+    luma[x] = rgba[x];
+  }
+}
+
+template <bool big_endian, typename pixel_t>
+void FillRowG16(const unsigned char* rgba, size_t oxs, pixel_t* luma) {
+  size_t x = 0;
+#ifdef FJXL_GENERIC_SIMD
+  for (; x + SIMDVec16::kLanes <= oxs; x += SIMDVec16::kLanes) {
+    auto rgb = SIMDVec16::LoadG16(rgba + 2 * x);
+    if (big_endian) {
+      rgb[0].SwapEndian();
+    }
+    StorePixels(rgb[0], luma + x);
+  }
+#endif
+  for (; x < oxs; x++) {
+    uint16_t val = LoadLE16(rgba + 2 * x);
+    if (big_endian) {
+      val = SwapEndian(val);
+    }
+    luma[x] = val;
+  }
+}
+
+template <typename pixel_t>
+void FillRowGA8(const unsigned char* rgba, size_t oxs, pixel_t* luma,
+                pixel_t* alpha) {
+  size_t x = 0;
+#ifdef FJXL_GENERIC_SIMD
+  for (; x + SIMDVec16::kLanes <= oxs; x += SIMDVec16::kLanes) {
+    auto rgb = SIMDVec16::LoadGA8(rgba + 2 * x);
+    StorePixels(rgb[0], luma + x);
+    StorePixels(rgb[1], alpha + x);
+  }
+#endif
+  for (; x < oxs; x++) {
+    luma[x] = rgba[2 * x];
+    alpha[x] = rgba[2 * x + 1];
+  }
+}
+
+template <bool big_endian, typename pixel_t>
+void FillRowGA16(const unsigned char* rgba, size_t oxs, pixel_t* luma,
+                 pixel_t* alpha) {
+  size_t x = 0;
+#ifdef FJXL_GENERIC_SIMD
+  for (; x + SIMDVec16::kLanes <= oxs; x += SIMDVec16::kLanes) {
+    auto rgb = SIMDVec16::LoadGA16(rgba + 4 * x);
+    if (big_endian) {
+      rgb[0].SwapEndian();
+      rgb[1].SwapEndian();
+    }
+    StorePixels(rgb[0], luma + x);
+    StorePixels(rgb[1], alpha + x);
+  }
+#endif
+  for (; x < oxs; x++) {
+    uint16_t l = LoadLE16(rgba + 4 * x);
+    uint16_t a = LoadLE16(rgba + 4 * x + 2);
+    if (big_endian) {
+      l = SwapEndian(l);
+      a = SwapEndian(a);
+    }
+    luma[x] = l;
+    alpha[x] = a;
+  }
+}
+
+template <typename pixel_t>
+void StoreYCoCg(pixel_t r, pixel_t g, pixel_t b, pixel_t* y, pixel_t* co,
+                pixel_t* cg) {
+  *co = r - b;
+  pixel_t tmp = b + (*co >> 1);
+  *cg = g - tmp;
+  *y = tmp + (*cg >> 1);
+}
+
+#ifdef FJXL_GENERIC_SIMD
+void StoreYCoCg(SIMDVec16 r, SIMDVec16 g, SIMDVec16 b, int16_t* y, int16_t* co,
+                int16_t* cg) {
+  SIMDVec16 co_v = r.Sub(b);
+  SIMDVec16 tmp = b.Add(co_v.SignedShiftRight<1>());
+  SIMDVec16 cg_v = g.Sub(tmp);
+  SIMDVec16 y_v = tmp.Add(cg_v.SignedShiftRight<1>());
+  y_v.Store((uint16_t*)y);
+  co_v.Store((uint16_t*)co);
+  cg_v.Store((uint16_t*)cg);
+}
+
+void StoreYCoCg(SIMDVec16 r, SIMDVec16 g, SIMDVec16 b, int32_t* y, int32_t* co,
+                int32_t* cg) {
+  VecPair<SIMDVec32> r_up = r.Upcast();
+  VecPair<SIMDVec32> g_up = g.Upcast();
+  VecPair<SIMDVec32> b_up = b.Upcast();
+  SIMDVec32 co_lo_v = r_up.low.Sub(b_up.low);
+  SIMDVec32 tmp_lo = b_up.low.Add(co_lo_v.SignedShiftRight<1>());
+  SIMDVec32 cg_lo_v = g_up.low.Sub(tmp_lo);
+  SIMDVec32 y_lo_v = tmp_lo.Add(cg_lo_v.SignedShiftRight<1>());
+  SIMDVec32 co_hi_v = r_up.hi.Sub(b_up.hi);
+  SIMDVec32 tmp_hi = b_up.hi.Add(co_hi_v.SignedShiftRight<1>());
+  SIMDVec32 cg_hi_v = g_up.hi.Sub(tmp_hi);
+  SIMDVec32 y_hi_v = tmp_hi.Add(cg_hi_v.SignedShiftRight<1>());
+  y_lo_v.Store((uint32_t*)y);
+  co_lo_v.Store((uint32_t*)co);
+  cg_lo_v.Store((uint32_t*)cg);
+  y_hi_v.Store((uint32_t*)y + SIMDVec32::kLanes);
+  co_hi_v.Store((uint32_t*)co + SIMDVec32::kLanes);
+  cg_hi_v.Store((uint32_t*)cg + SIMDVec32::kLanes);
+}
+#endif
+
+template <typename pixel_t>
+void FillRowRGB8(const unsigned char* rgba, size_t oxs, pixel_t* y, pixel_t* co,
+                 pixel_t* cg) {
+  size_t x = 0;
+#ifdef FJXL_GENERIC_SIMD
+  for (; x + SIMDVec16::kLanes <= oxs; x += SIMDVec16::kLanes) {
+    auto rgb = SIMDVec16::LoadRGB8(rgba + 3 * x);
+    StoreYCoCg(rgb[0], rgb[1], rgb[2], y + x, co + x, cg + x);
+  }
+#endif
+  for (; x < oxs; x++) {
+    uint16_t r = rgba[3 * x];
+    uint16_t g = rgba[3 * x + 1];
+    uint16_t b = rgba[3 * x + 2];
+    StoreYCoCg<pixel_t>(r, g, b, y + x, co + x, cg + x);
+  }
+}
+
+template <bool big_endian, typename pixel_t>
+void FillRowRGB16(const unsigned char* rgba, size_t oxs, pixel_t* y,
+                  pixel_t* co, pixel_t* cg) {
+  size_t x = 0;
+#ifdef FJXL_GENERIC_SIMD
+  for (; x + SIMDVec16::kLanes <= oxs; x += SIMDVec16::kLanes) {
+    auto rgb = SIMDVec16::LoadRGB16(rgba + 6 * x);
+    if (big_endian) {
+      rgb[0].SwapEndian();
+      rgb[1].SwapEndian();
+      rgb[2].SwapEndian();
+    }
+    StoreYCoCg(rgb[0], rgb[1], rgb[2], y + x, co + x, cg + x);
+  }
+#endif
+  for (; x < oxs; x++) {
+    uint16_t r = LoadLE16(rgba + 6 * x);
+    uint16_t g = LoadLE16(rgba + 6 * x + 2);
+    uint16_t b = LoadLE16(rgba + 6 * x + 4);
+    if (big_endian) {
+      r = SwapEndian(r);
+      g = SwapEndian(g);
+      b = SwapEndian(b);
+    }
+    StoreYCoCg<pixel_t>(r, g, b, y + x, co + x, cg + x);
+  }
+}
+
+template <typename pixel_t>
+void FillRowRGBA8(const unsigned char* rgba, size_t oxs, pixel_t* y,
+                  pixel_t* co, pixel_t* cg, pixel_t* alpha) {
+  size_t x = 0;
+#ifdef FJXL_GENERIC_SIMD
+  for (; x + SIMDVec16::kLanes <= oxs; x += SIMDVec16::kLanes) {
+    auto rgb = SIMDVec16::LoadRGBA8(rgba + 4 * x);
+    StoreYCoCg(rgb[0], rgb[1], rgb[2], y + x, co + x, cg + x);
+    StorePixels(rgb[3], alpha + x);
+  }
+#endif
+  for (; x < oxs; x++) {
+    uint16_t r = rgba[4 * x];
+    uint16_t g = rgba[4 * x + 1];
+    uint16_t b = rgba[4 * x + 2];
+    uint16_t a = rgba[4 * x + 3];
+    StoreYCoCg<pixel_t>(r, g, b, y + x, co + x, cg + x);
+    alpha[x] = a;
+  }
+}
+
+template <bool big_endian, typename pixel_t>
+void FillRowRGBA16(const unsigned char* rgba, size_t oxs, pixel_t* y,
+                   pixel_t* co, pixel_t* cg, pixel_t* alpha) {
+  size_t x = 0;
+#ifdef FJXL_GENERIC_SIMD
+  for (; x + SIMDVec16::kLanes <= oxs; x += SIMDVec16::kLanes) {
+    auto rgb = SIMDVec16::LoadRGBA16(rgba + 8 * x);
+    if (big_endian) {
+      rgb[0].SwapEndian();
+      rgb[1].SwapEndian();
+      rgb[2].SwapEndian();
+      rgb[3].SwapEndian();
+    }
+    StoreYCoCg(rgb[0], rgb[1], rgb[2], y + x, co + x, cg + x);
+    StorePixels(rgb[3], alpha + x);
+  }
+#endif
+  for (; x < oxs; x++) {
+    uint16_t r = LoadLE16(rgba + 8 * x);
+    uint16_t g = LoadLE16(rgba + 8 * x + 2);
+    uint16_t b = LoadLE16(rgba + 8 * x + 4);
+    uint16_t a = LoadLE16(rgba + 8 * x + 6);
+    if (big_endian) {
+      r = SwapEndian(r);
+      g = SwapEndian(g);
+      b = SwapEndian(b);
+      a = SwapEndian(a);
+    }
+    StoreYCoCg<pixel_t>(r, g, b, y + x, co + x, cg + x);
+    alpha[x] = a;
+  }
+}
+
+template <typename Processor, typename BitDepth>
+void ProcessImageArea(const unsigned char* rgba, size_t x0, size_t y0,
+                      size_t xs, size_t yskip, size_t ys, size_t row_stride,
+                      BitDepth bitdepth, size_t nb_chans, bool big_endian,
+                      Processor* processors) {
+  constexpr size_t kPadding = 32;
+
+  using pixel_t = typename BitDepth::pixel_t;
+
+  constexpr size_t kAlign = 64;
+  constexpr size_t kAlignPixels = kAlign / sizeof(pixel_t);
+
+  auto align = [=](pixel_t* ptr) {
+    size_t offset = reinterpret_cast<uintptr_t>(ptr) % kAlign;
+    if (offset) {
+      ptr += offset / sizeof(pixel_t);
+    }
+    return ptr;
+  };
+
+  constexpr size_t kNumPx =
+      (256 + kPadding * 2 + kAlignPixels + kAlignPixels - 1) / kAlignPixels *
+      kAlignPixels;
+
+  std::vector<std::array<std::array<pixel_t, kNumPx>, 2>> group_data(nb_chans);
+
+  for (size_t y = 0; y < ys; y++) {
+    const auto rgba_row =
+        rgba + row_stride * (y0 + y) + x0 * nb_chans * BitDepth::kInputBytes;
+    pixel_t* crow[4] = {};
+    pixel_t* prow[4] = {};
+    for (size_t i = 0; i < nb_chans; i++) {
+      crow[i] = align(&group_data[i][y & 1][kPadding]);
+      prow[i] = align(&group_data[i][(y - 1) & 1][kPadding]);
+    }
+
+    // Pre-fill rows with YCoCg converted pixels.
+    if (nb_chans == 1) {
+      if (BitDepth::kInputBytes == 1) {
+        FillRowG8(rgba_row, xs, crow[0]);
+      } else if (big_endian) {
+        FillRowG16</*big_endian=*/true>(rgba_row, xs, crow[0]);
+      } else {
+        FillRowG16</*big_endian=*/false>(rgba_row, xs, crow[0]);
+      }
+    } else if (nb_chans == 2) {
+      if (BitDepth::kInputBytes == 1) {
+        FillRowGA8(rgba_row, xs, crow[0], crow[1]);
+      } else if (big_endian) {
+        FillRowGA16</*big_endian=*/true>(rgba_row, xs, crow[0], crow[1]);
+      } else {
+        FillRowGA16</*big_endian=*/false>(rgba_row, xs, crow[0], crow[1]);
+      }
+    } else if (nb_chans == 3) {
+      if (BitDepth::kInputBytes == 1) {
+        FillRowRGB8(rgba_row, xs, crow[0], crow[1], crow[2]);
+      } else if (big_endian) {
+        FillRowRGB16</*big_endian=*/true>(rgba_row, xs, crow[0], crow[1],
+                                          crow[2]);
+      } else {
+        FillRowRGB16</*big_endian=*/false>(rgba_row, xs, crow[0], crow[1],
+                                           crow[2]);
+      }
+    } else {
+      if (BitDepth::kInputBytes == 1) {
+        FillRowRGBA8(rgba_row, xs, crow[0], crow[1], crow[2], crow[3]);
+      } else if (big_endian) {
+        FillRowRGBA16</*big_endian=*/true>(rgba_row, xs, crow[0], crow[1],
+                                           crow[2], crow[3]);
+      } else {
+        FillRowRGBA16</*big_endian=*/false>(rgba_row, xs, crow[0], crow[1],
+                                            crow[2], crow[3]);
+      }
+    }
+    // Deal with x == 0.
+    for (size_t c = 0; c < nb_chans; c++) {
+      *(crow[c] - 1) = y > 0 ? *(prow[c]) : 0;
+      // Fix topleft.
+      *(prow[c] - 1) = y > 0 ? *(prow[c]) : 0;
+    }
+    if (y < yskip) continue;
+    for (size_t c = 0; c < nb_chans; c++) {
+      // Get pointers to px/left/top/topleft data to speedup loop.
+      const pixel_t* row = crow[c];
+      const pixel_t* row_left = crow[c] - 1;
+      const pixel_t* row_top = y == 0 ? row_left : prow[c];
+      const pixel_t* row_topleft = y == 0 ? row_left : prow[c] - 1;
+
+      processors[c].ProcessRow(row, row_left, row_top, row_topleft, xs);
+    }
+  }
+  for (size_t c = 0; c < nb_chans; c++) {
+    processors[c].Finalize();
+  }
+}
+
+template <typename BitDepth>
+void WriteACSection(const unsigned char* rgba, size_t x0, size_t y0, size_t xs,
+                    size_t ys, size_t row_stride, bool is_single_group,
+                    BitDepth bitdepth, size_t nb_chans, bool big_endian,
+                    const PrefixCode code[4],
+                    std::array<BitWriter, 4>& output) {
+  for (size_t i = 0; i < nb_chans; i++) {
+    if (is_single_group && i == 0) continue;
+    output[i].Allocate(xs * ys * bitdepth.MaxEncodedBitsPerSample() + 4);
+  }
+  if (!is_single_group) {
+    // Group header for modular image.
+    // When the image is single-group, the global modular image is the one
+    // that contains the pixel data, and there is no group header.
+    output[0].Write(1, 1);     // Global tree
+    output[0].Write(1, 1);     // All default wp
+    output[0].Write(2, 0b00);  // 0 transforms
+  }
+
+  ChunkEncoder<BitDepth> encoders[4];
+  ChannelRowProcessor<ChunkEncoder<BitDepth>, BitDepth> row_encoders[4];
+  for (size_t c = 0; c < nb_chans; c++) {
+    row_encoders[c].t = &encoders[c];
+    encoders[c].output = &output[c];
+    encoders[c].code = &code[c];
+  }
+  ProcessImageArea<ChannelRowProcessor<ChunkEncoder<BitDepth>, BitDepth>>(
+      rgba, x0, y0, xs, 0, ys, row_stride, bitdepth, nb_chans, big_endian,
+      row_encoders);
+}
+
+constexpr int kHashExp = 16;
+constexpr uint32_t kHashSize = 1 << kHashExp;
+constexpr uint32_t kHashMultiplier = 2654435761;
+constexpr int kMaxColors = 512;
+
+// can be any function that returns a value in 0 .. kHashSize-1
+// has to map 0 to 0
+inline uint32_t pixel_hash(uint32_t p) {
+  return (p * kHashMultiplier) >> (32 - kHashExp);
+}
+
+template <size_t nb_chans>
+void FillRowPalette(const unsigned char* inrow, size_t xs,
+                    const int16_t* lookup, int16_t* out) {
+  for (size_t x = 0; x < xs; x++) {
+    uint32_t p = 0;
+    memcpy(&p, inrow + x * nb_chans, nb_chans);
+    out[x] = lookup[pixel_hash(p)];
+  }
+}
+
+template <typename Processor>
+void ProcessImageAreaPalette(const unsigned char* rgba, size_t x0, size_t y0,
+                             size_t xs, size_t yskip, size_t ys,
+                             size_t row_stride, const int16_t* lookup,
+                             size_t nb_chans, Processor* processors) {
+  constexpr size_t kPadding = 32;
+
+  std::vector<std::array<int16_t, 256 + kPadding * 2>> group_data(2);
+  Processor& row_encoder = processors[0];
+
+  for (size_t y = 0; y < ys; y++) {
+    // Pre-fill rows with palette converted pixels.
+    const unsigned char* inrow = rgba + row_stride * (y0 + y) + x0 * nb_chans;
+    int16_t* outrow = &group_data[y & 1][kPadding];
+    if (nb_chans == 1) {
+      FillRowPalette<1>(inrow, xs, lookup, outrow);
+    } else if (nb_chans == 2) {
+      FillRowPalette<2>(inrow, xs, lookup, outrow);
+    } else if (nb_chans == 3) {
+      FillRowPalette<3>(inrow, xs, lookup, outrow);
+    } else if (nb_chans == 4) {
+      FillRowPalette<4>(inrow, xs, lookup, outrow);
+    }
+    // Deal with x == 0.
+    group_data[y & 1][kPadding - 1] =
+        y > 0 ? group_data[(y - 1) & 1][kPadding] : 0;
+    // Fix topleft.
+    group_data[(y - 1) & 1][kPadding - 1] =
+        y > 0 ? group_data[(y - 1) & 1][kPadding] : 0;
+    // Get pointers to px/left/top/topleft data to speedup loop.
+    const int16_t* row = &group_data[y & 1][kPadding];
+    const int16_t* row_left = &group_data[y & 1][kPadding - 1];
+    const int16_t* row_top =
+        y == 0 ? row_left : &group_data[(y - 1) & 1][kPadding];
+    const int16_t* row_topleft =
+        y == 0 ? row_left : &group_data[(y - 1) & 1][kPadding - 1];
+
+    row_encoder.ProcessRow(row, row_left, row_top, row_topleft, xs);
+  }
+  row_encoder.Finalize();
+}
+
+void WriteACSectionPalette(const unsigned char* rgba, size_t x0, size_t y0,
+                           size_t xs, size_t ys, size_t row_stride,
+                           bool is_single_group, const PrefixCode code[4],
+                           const int16_t* lookup, size_t nb_chans,
+                           BitWriter& output) {
+  if (!is_single_group) {
+    output.Allocate(16 * xs * ys + 4);
+    // Group header for modular image.
+    // When the image is single-group, the global modular image is the one
+    // that contains the pixel data, and there is no group header.
+    output.Write(1, 1);     // Global tree
+    output.Write(1, 1);     // All default wp
+    output.Write(2, 0b00);  // 0 transforms
+  }
+
+  ChunkEncoder<UpTo8Bits> encoder;
+  ChannelRowProcessor<ChunkEncoder<UpTo8Bits>, UpTo8Bits> row_encoder;
+
+  row_encoder.t = &encoder;
+  encoder.output = &output;
+  encoder.code = &code[is_single_group ? 1 : 0];
+  ProcessImageAreaPalette<
+      ChannelRowProcessor<ChunkEncoder<UpTo8Bits>, UpTo8Bits>>(
+      rgba, x0, y0, xs, 0, ys, row_stride, lookup, nb_chans, &row_encoder);
+}
+
+template <typename BitDepth>
+void CollectSamples(const unsigned char* rgba, size_t x0, size_t y0, size_t xs,
+                    size_t row_stride, size_t row_count,
+                    uint64_t raw_counts[4][kNumRawSymbols],
+                    uint64_t lz77_counts[4][kNumLZ77], bool is_single_group,
+                    bool palette, BitDepth bitdepth, size_t nb_chans,
+                    bool big_endian, const int16_t* lookup) {
+  if (palette) {
+    ChunkSampleCollector<UpTo8Bits> sample_collectors[4];
+    ChannelRowProcessor<ChunkSampleCollector<UpTo8Bits>, UpTo8Bits>
+        row_sample_collectors[4];
+    for (size_t c = 0; c < nb_chans; c++) {
+      row_sample_collectors[c].t = &sample_collectors[c];
+      sample_collectors[c].raw_counts = raw_counts[is_single_group ? 1 : 0];
+      sample_collectors[c].lz77_counts = lz77_counts[is_single_group ? 1 : 0];
+    }
+    ProcessImageAreaPalette<
+        ChannelRowProcessor<ChunkSampleCollector<UpTo8Bits>, UpTo8Bits>>(
+        rgba, x0, y0, xs, 1, 1 + row_count, row_stride, lookup, nb_chans,
+        row_sample_collectors);
+  } else {
+    ChunkSampleCollector<BitDepth> sample_collectors[4];
+    ChannelRowProcessor<ChunkSampleCollector<BitDepth>, BitDepth>
+        row_sample_collectors[4];
+    for (size_t c = 0; c < nb_chans; c++) {
+      row_sample_collectors[c].t = &sample_collectors[c];
+      sample_collectors[c].raw_counts = raw_counts[c];
+      sample_collectors[c].lz77_counts = lz77_counts[c];
+    }
+    ProcessImageArea<
+        ChannelRowProcessor<ChunkSampleCollector<BitDepth>, BitDepth>>(
+        rgba, x0, y0, xs, 1, 1 + row_count, row_stride, bitdepth, nb_chans,
+        big_endian, row_sample_collectors);
+  }
+}
+
+void PrepareDCGlobalPalette(bool is_single_group, size_t width, size_t height,
+                            const PrefixCode code[4],
+                            const std::vector<uint32_t>& palette,
+                            size_t pcolors, BitWriter* output) {
+  PrepareDCGlobalCommon(is_single_group, width, height, code, output);
+  output->Write(2, 0b01);     // 1 transform
+  output->Write(2, 0b01);     // Palette
+  output->Write(5, 0b00000);  // Starting from ch 0
+  output->Write(2, 0b10);     // 4-channel palette (RGBA)
+  // pcolors <= kMaxColors + kChunkSize - 1
+  static_assert(kMaxColors + kChunkSize < 1281,
+                "add code to signal larger palette sizes");
+  if (pcolors < 256) {
+    output->Write(2, 0b00);
+    output->Write(8, pcolors);
+  } else {
+    output->Write(2, 0b01);
+    output->Write(10, pcolors - 256);
+  }
+
+  output->Write(2, 0b00);  // nb_deltas == 0
+  output->Write(4, 0);     // Zero predictor for delta palette
+  // Encode palette
+  ChunkEncoder<UpTo8Bits> encoder;
+  ChannelRowProcessor<ChunkEncoder<UpTo8Bits>, UpTo8Bits> row_encoder;
+  row_encoder.t = &encoder;
+  encoder.output = output;
+  encoder.code = &code[0];
+  int16_t p[4][32 + 1024] = {};
+  uint8_t prgba[4];
+  size_t i = 0;
+  size_t have_zero = 0;
+  if (palette[pcolors - 1] == 0) have_zero = 1;
+  for (; i < pcolors; i++) {
+    memcpy(prgba, &palette[i], 4);
+    p[0][16 + i + have_zero] = prgba[0];
+    p[1][16 + i + have_zero] = prgba[1];
+    p[2][16 + i + have_zero] = prgba[2];
+    p[3][16 + i + have_zero] = prgba[3];
+  }
+  p[0][15] = 0;
+  row_encoder.ProcessRow(p[0] + 16, p[0] + 15, p[0] + 15, p[0] + 15, pcolors);
+  p[1][15] = p[0][16];
+  p[0][15] = p[0][16];
+  row_encoder.ProcessRow(p[1] + 16, p[1] + 15, p[0] + 16, p[0] + 15, pcolors);
+  p[2][15] = p[1][16];
+  p[1][15] = p[1][16];
+  row_encoder.ProcessRow(p[2] + 16, p[2] + 15, p[1] + 16, p[1] + 15, pcolors);
+  p[3][15] = p[2][16];
+  p[2][15] = p[2][16];
+  row_encoder.ProcessRow(p[3] + 16, p[3] + 15, p[2] + 16, p[2] + 15, pcolors);
+  row_encoder.Finalize();
+
+  if (!is_single_group) {
+    output->ZeroPadToByte();
+  }
+}
+
+template <typename BitDepth>
+JxlFastLosslessFrameState* LLEnc(const unsigned char* rgba, size_t width,
+                                 size_t stride, size_t height,
+                                 BitDepth bitdepth, size_t nb_chans,
+                                 bool big_endian, int effort,
+                                 void* runner_opaque,
+                                 FJxlParallelRunner runner) {
+  assert(width != 0);
+  assert(height != 0);
+  assert(stride >= nb_chans * BitDepth::kInputBytes * width);
+
+  // Count colors to try palette
+  std::vector<uint32_t> palette(kHashSize);
+  palette[0] = 1;
+  std::vector<int16_t> lookup(kHashSize);
+  lookup[0] = 0;
+  int pcolors = 0;
+  bool collided = effort < 2 || bitdepth.bitdepth != 8 ||
+                  nb_chans < 4;  // todo: also do rgb palette
+  for (size_t y = 0; y < height && !collided; y++) {
+    const unsigned char* r = rgba + stride * y;
+    size_t x = 0;
+    if (nb_chans == 4) {
+      // this is just an unrolling of the next loop
+      for (; x + 7 < width; x += 8) {
+        uint32_t p[8], index[8];
+        memcpy(p, r + x * 4, 32);
+        for (int i = 0; i < 8; i++) index[i] = pixel_hash(p[i]);
+        for (int i = 0; i < 8; i++) {
+          uint32_t init_entry = index[i] ? 0 : 1;
+          if (init_entry != palette[index[i]] && p[i] != palette[index[i]]) {
+            collided = true;
+          }
+        }
+        for (int i = 0; i < 8; i++) palette[index[i]] = p[i];
+      }
+      for (; x < width; x++) {
+        uint32_t p;
+        memcpy(&p, r + x * 4, 4);
+        uint32_t index = pixel_hash(p);
+        uint32_t init_entry = index ? 0 : 1;
+        if (init_entry != palette[index] && p != palette[index]) {
+          collided = true;
+        }
+        palette[index] = p;
+      }
+    } else {
+      for (; x < width; x++) {
+        uint32_t p = 0;
+        memcpy(&p, r + x * nb_chans, nb_chans);
+        uint32_t index = pixel_hash(p);
+        uint32_t init_entry = index ? 0 : 1;
+        if (init_entry != palette[index] && p != palette[index]) {
+          collided = true;
+        }
+        palette[index] = p;
+      }
+    }
+  }
+
+  int nb_entries = 0;
+  if (!collided) {
+    if (palette[0] == 0) pcolors = 1;
+    if (palette[0] == 1) palette[0] = 0;
+    bool have_color = false;
+    uint8_t minG = 255, maxG = 0;
+    for (uint32_t k = 0; k < kHashSize; k++) {
+      if (palette[k] == 0) continue;
+      uint8_t p[4];
+      memcpy(p, &palette[k], 4);
+      // move entries to front so sort has less work
+      palette[nb_entries] = palette[k];
+      if (p[0] != p[1] || p[0] != p[2]) have_color = true;
+      if (p[1] < minG) minG = p[1];
+      if (p[1] > maxG) maxG = p[1];
+      nb_entries++;
+      // don't do palette if too many colors are needed
+      if (nb_entries + pcolors > kMaxColors) {
+        collided = true;
+        break;
+      }
+    }
+    if (!have_color) {
+      // don't do palette if it's just grayscale without many holes
+      if (maxG - minG < nb_entries * 1.4f) collided = true;
+    }
+  }
+  if (!collided) {
+    std::sort(
+        palette.begin(), palette.begin() + nb_entries,
+        [](uint32_t ap, uint32_t bp) {
+          if (ap == 0) return false;
+          if (bp == 0) return true;
+          uint8_t a[4], b[4];
+          memcpy(a, &ap, 4);
+          memcpy(b, &bp, 4);
+          float ay, by;
+          ay = (0.299f * a[0] + 0.587f * a[1] + 0.114f * a[2] + 0.01f) * a[3];
+          by = (0.299f * b[0] + 0.587f * b[1] + 0.114f * b[2] + 0.01f) * b[3];
+          return ay < by;  // sort on alpha*luma
+        });
+    for (int k = 0; k < nb_entries; k++) {
+      if (palette[k] == 0) break;
+      lookup[pixel_hash(palette[k])] = pcolors++;
+    }
+  }
+
+  size_t num_groups_x = (width + 255) / 256;
+  size_t num_groups_y = (height + 255) / 256;
+  size_t num_dc_groups_x = (width + 2047) / 2048;
+  size_t num_dc_groups_y = (height + 2047) / 2048;
+
+  uint64_t raw_counts[4][kNumRawSymbols] = {};
+  uint64_t lz77_counts[4][kNumLZ77] = {};
+
+  bool onegroup = num_groups_x == 1 && num_groups_y == 1;
+
+  // sample the middle (effort * 2) rows of every group
+  for (size_t g = 0; g < num_groups_y * num_groups_x; g++) {
+    size_t xg = g % num_groups_x;
+    size_t yg = g / num_groups_x;
+    int y_offset = yg * 256;
+    int y_max = std::min<size_t>(height - yg * 256, 256);
+    int y_begin = y_offset + std::max<int>(0, y_max - 2 * effort) / 2;
+    int y_count =
+        std::min<int>(2 * effort * y_max / 256, y_offset + y_max - y_begin - 1);
+    int x_max =
+        std::min<size_t>(width - xg * 256, 256) / kChunkSize * kChunkSize;
+    CollectSamples(rgba, xg * 256, y_begin, x_max, stride, y_count, raw_counts,
+                   lz77_counts, onegroup, !collided, bitdepth, nb_chans,
+                   big_endian, lookup.data());
+  }
+
+  // TODO(veluca): can probably improve this and make it bitdepth-dependent.
+  uint64_t base_raw_counts[kNumRawSymbols] = {
+      3843, 852, 1270, 1214, 1014, 727, 481, 300, 159, 51,
+      5,    1,   1,    1,    1,    1,   1,   1,   1};
+
+  bool doing_ycocg = nb_chans > 2 && collided;
+  for (size_t i = bitdepth.NumSymbols(doing_ycocg); i < kNumRawSymbols; i++) {
+    base_raw_counts[i] = 0;
+  }
+
+  for (size_t c = 0; c < 4; c++) {
+    for (size_t i = 0; i < kNumRawSymbols; i++) {
+      raw_counts[c][i] = (raw_counts[c][i] << 8) + base_raw_counts[i];
+    }
+  }
+
+  if (!collided) {
+    unsigned token, nbits, bits;
+    EncodeHybridUint000(PackSigned(pcolors - 1), &token, &nbits, &bits);
+    // ensure all palette indices can actually be encoded
+    for (size_t i = 0; i < token + 1; i++)
+      raw_counts[0][i] = std::max<uint64_t>(raw_counts[0][i], 1);
+    // these tokens are only used for the palette itself so they can get a bad
+    // code
+    for (size_t i = token + 1; i < 10; i++) raw_counts[0][i] = 1;
+  }
+
+  uint64_t base_lz77_counts[kNumLZ77] = {
+      29, 27, 25,  23, 21, 21, 19, 18, 21, 17, 16, 15, 15, 14,
+      13, 13, 137, 98, 61, 34, 1,  1,  1,  1,  1,  1,  1,  1,
+  };
+
+  for (size_t c = 0; c < 4; c++) {
+    for (size_t i = 0; i < kNumLZ77; i++) {
+      lz77_counts[c][i] = (lz77_counts[c][i] << 8) + base_lz77_counts[i];
+    }
+  }
+
+  alignas(64) PrefixCode hcode[4];
+  for (size_t i = 0; i < 4; i++) {
+    hcode[i] = PrefixCode(bitdepth, raw_counts[i], lz77_counts[i]);
+  }
+
+  size_t num_groups = onegroup ? 1
+                               : (2 + num_dc_groups_x * num_dc_groups_y +
+                                  num_groups_x * num_groups_y);
+
+  JxlFastLosslessFrameState* frame_state = new JxlFastLosslessFrameState();
+
+  frame_state->width = width;
+  frame_state->height = height;
+  frame_state->nb_chans = nb_chans;
+  frame_state->bitdepth = bitdepth.bitdepth;
+
+  frame_state->group_data = std::vector<std::array<BitWriter, 4>>(num_groups);
+  if (collided) {
+    PrepareDCGlobal(onegroup, width, height, nb_chans, hcode,
+                    &frame_state->group_data[0][0]);
+  } else {
+    PrepareDCGlobalPalette(onegroup, width, height, hcode, palette, pcolors,
+                           &frame_state->group_data[0][0]);
+  }
+
+  auto run_one = [&](size_t g) {
+    size_t xg = g % num_groups_x;
+    size_t yg = g / num_groups_x;
+    size_t group_id =
+        onegroup ? 0 : (2 + num_dc_groups_x * num_dc_groups_y + g);
+    size_t xs = std::min<size_t>(width - xg * 256, 256);
+    size_t ys = std::min<size_t>(height - yg * 256, 256);
+    size_t x0 = xg * 256;
+    size_t y0 = yg * 256;
+    auto& gd = frame_state->group_data[group_id];
+    if (collided) {
+      WriteACSection(rgba, x0, y0, xs, ys, stride, onegroup, bitdepth, nb_chans,
+                     big_endian, hcode, gd);
+
+    } else {
+      WriteACSectionPalette(rgba, x0, y0, xs, ys, stride, onegroup, hcode,
+                            lookup.data(), nb_chans, gd[0]);
+    }
+  };
+
+  runner(
+      runner_opaque, &run_one,
+      +[](void* r, size_t i) { (*reinterpret_cast<decltype(&run_one)>(r))(i); },
+      num_groups_x * num_groups_y);
+
+  return frame_state;
+}
+
+JxlFastLosslessFrameState* JxlFastLosslessEncodeImpl(
+    const unsigned char* rgba, size_t width, size_t stride, size_t height,
+    size_t nb_chans, size_t bitdepth, bool big_endian, int effort,
+    void* runner_opaque, FJxlParallelRunner runner) {
+  assert(bitdepth > 0);
+  assert(nb_chans <= 4);
+  assert(nb_chans != 0);
+  if (bitdepth <= 8) {
+    return LLEnc(rgba, width, stride, height, UpTo8Bits(bitdepth), nb_chans,
+                 big_endian, effort, runner_opaque, runner);
+  }
+  if (bitdepth <= 13) {
+    return LLEnc(rgba, width, stride, height, From9To13Bits(bitdepth), nb_chans,
+                 big_endian, effort, runner_opaque, runner);
+  }
+  if (bitdepth == 14) {
+    return LLEnc(rgba, width, stride, height, Exactly14Bits(bitdepth), nb_chans,
+                 big_endian, effort, runner_opaque, runner);
+  }
+  return LLEnc(rgba, width, stride, height, MoreThan14Bits(bitdepth), nb_chans,
+               big_endian, effort, runner_opaque, runner);
+}
+
+}  // namespace
+
+#endif  // FJXL_SELF_INCLUDE
+
+#ifndef FJXL_SELF_INCLUDE
+
+#define FJXL_SELF_INCLUDE
+
+// If we have NEON enabled, it is the default target.
+#if FJXL_ENABLE_NEON
+
+namespace default_implementation {
+#define FJXL_NEON
+#include "lib/jxl/enc_fast_lossless.cc"
+#undef FJXL_NEON
+}  // namespace default_implementation
+
+#else  // FJXL_ENABLE_NEON
+
+namespace default_implementation {
+#include "lib/jxl/enc_fast_lossless.cc"
+}
+
+#if FJXL_ENABLE_AVX2
+#ifdef __clang__
+#pragma clang attribute push(__attribute__((target("avx,avx2"))), \
+                             apply_to = function)
+// Causes spurious warnings on clang5.
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wmissing-braces"
+#elif defined(__GNUC__)
+#pragma GCC push_options
+// Seems to cause spurious errors on GCC8.
+#pragma GCC diagnostic ignored "-Wpsabi"
+#pragma GCC target "avx,avx2"
+#endif
+
+namespace AVX2 {
+#define FJXL_AVX2
+#include "lib/jxl/enc_fast_lossless.cc"
+#undef FJXL_AVX2
+}  // namespace AVX2
+
+#ifdef __clang__
+#pragma clang attribute pop
+#pragma clang diagnostic pop
+#elif defined(__GNUC__)
+#pragma GCC pop_options
+#endif
+#endif  // FJXL_ENABLE_AVX2
+
+#if FJXL_ENABLE_AVX512
+#ifdef __clang__
+#pragma clang attribute push(                                                 \
+    __attribute__((target("avx512cd,avx512bw,avx512vl,avx512f,avx512vbmi"))), \
+    apply_to = function)
+#elif defined(__GNUC__)
+#pragma GCC push_options
+#pragma GCC target "avx512cd,avx512bw,avx512vl,avx512f,avx512vbmi"
+#endif
+
+namespace AVX512 {
+#define FJXL_AVX512
+#include "lib/jxl/enc_fast_lossless.cc"
+#undef FJXL_AVX512
+}  // namespace AVX512
+
+#ifdef __clang__
+#pragma clang attribute pop
+#elif defined(__GNUC__)
+#pragma GCC pop_options
+#endif
+#endif  // FJXL_ENABLE_AVX512
+
+#endif
+
+extern "C" {
+
+size_t JxlFastLosslessEncode(const unsigned char* rgba, size_t width,
+                             size_t row_stride, size_t height, size_t nb_chans,
+                             size_t bitdepth, int big_endian, int effort,
+                             unsigned char** output, void* runner_opaque,
+                             FJxlParallelRunner runner) {
+  auto frame_state = JxlFastLosslessPrepareFrame(
+      rgba, width, row_stride, height, nb_chans, bitdepth, big_endian, effort,
+      runner_opaque, runner);
+  JxlFastLosslessPrepareHeader(frame_state, /*add_image_header=*/1,
+                               /*is_last=*/1);
+  size_t output_size = JxlFastLosslessMaxRequiredOutput(frame_state);
+  *output = (unsigned char*)malloc(output_size);
+  size_t written = 0;
+  size_t total = 0;
+  while ((written = JxlFastLosslessWriteOutput(frame_state, *output + total,
+                                               output_size - total)) != 0) {
+    total += written;
+  }
+  return total;
+}
+
+JxlFastLosslessFrameState* JxlFastLosslessPrepareFrame(
+    const unsigned char* rgba, size_t width, size_t row_stride, size_t height,
+    size_t nb_chans, size_t bitdepth, int big_endian, int effort,
+    void* runner_opaque, FJxlParallelRunner runner) {
+  auto trivial_runner =
+      +[](void*, void* opaque, void fun(void*, size_t), size_t count) {
+        for (size_t i = 0; i < count; i++) {
+          fun(opaque, i);
+        }
+      };
+
+  if (runner == nullptr) {
+    runner = trivial_runner;
+  }
+
+#if FJXL_ENABLE_AVX512
+  if (__builtin_cpu_supports("avx512cd") &&
+      __builtin_cpu_supports("avx512vbmi") &&
+      __builtin_cpu_supports("avx512bw") && __builtin_cpu_supports("avx512f") &&
+      __builtin_cpu_supports("avx512vl")) {
+    return AVX512::JxlFastLosslessEncodeImpl(rgba, width, row_stride, height,
+                                             nb_chans, bitdepth, big_endian,
+                                             effort, runner_opaque, runner);
+  }
+#endif
+#if FJXL_ENABLE_AVX2
+  if (__builtin_cpu_supports("avx2")) {
+    return AVX2::JxlFastLosslessEncodeImpl(rgba, width, row_stride, height,
+                                           nb_chans, bitdepth, big_endian,
+                                           effort, runner_opaque, runner);
+  }
+#endif
+
+  return default_implementation::JxlFastLosslessEncodeImpl(
+      rgba, width, row_stride, height, nb_chans, bitdepth, big_endian, effort,
+      runner_opaque, runner);
+}
+
+}  // extern "C"
+
+#endif  // FJXL_SELF_INCLUDE
diff --git a/third_party/jpeg-xl/lib/jxl/enc_fast_lossless.h b/third_party/jpeg-xl/lib/jxl/enc_fast_lossless.h
new file mode 100644
index 0000000000..4ea1d4f69b
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_fast_lossless.h
@@ -0,0 +1,72 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_FAST_LOSSLESS_H_
+#define LIB_JXL_ENC_FAST_LOSSLESS_H_
+#include <stdlib.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Simple encoding API.
+
+// A FJxlParallelRunner must call fun(opaque, i) for all i from 0 to count. It
+// may do so in parallel.
+typedef void(FJxlParallelRunner)(void* runner_opaque, void* opaque,
+                                 void fun(void*, size_t), size_t count);
+
+// You may pass `nullptr` as a runner: encoding will be sequential.
+size_t JxlFastLosslessEncode(const unsigned char* rgba, size_t width,
+                             size_t row_stride, size_t height, size_t nb_chans,
+                             size_t bitdepth, int big_endian, int effort,
+                             unsigned char** output, void* runner_opaque,
+                             FJxlParallelRunner runner);
+
+// More complex API for cases in which you may want to allocate your own buffer
+// and other advanced use cases.
+
+// Opaque struct that represents an intermediate state of the computation.
+struct JxlFastLosslessFrameState;
+
+// Returned JxlFastLosslessFrameState must be freed by calling
+// JxlFastLosslessFreeFrameState.
+JxlFastLosslessFrameState* JxlFastLosslessPrepareFrame(
+    const unsigned char* rgba, size_t width, size_t row_stride, size_t height,
+    size_t nb_chans, size_t bitdepth, int big_endian, int effort,
+    void* runner_opaque, FJxlParallelRunner runner);
+
+// Prepare the (image/frame) header. You may encode animations by concatenating
+// the output of multiple frames, of which the first one has add_image_header =
+// 1 and subsequent ones have add_image_header = 0, and all frames but the last
+// one have is_last = 0.
+void JxlFastLosslessPrepareHeader(JxlFastLosslessFrameState* frame,
+                                  int add_image_header, int is_last);
+
+// Upper bound on the required output size, including any padding that may be
+// required by JxlFastLosslessWriteOutput. Cannot be called before
+// JxlFastLosslessPrepareHeader.
+size_t JxlFastLosslessMaxRequiredOutput(const JxlFastLosslessFrameState* frame);
+
+// Actual size of the frame once it is encoded. This is not identical to
+// JxlFastLosslessMaxRequiredOutput because JxlFastLosslessWriteOutput may
+// require extra padding.
+size_t JxlFastLosslessOutputSize(const JxlFastLosslessFrameState* frame);
+
+// Writes the frame to the given output buffer. Returns the number of bytes that
+// were written, which is at least 1 unless the entire output has been written
+// already. It is required that `output_size >= 32` when calling this function.
+// This function must be called repeatedly until it returns 0.
+size_t JxlFastLosslessWriteOutput(JxlFastLosslessFrameState* frame,
+                                  unsigned char* output, size_t output_size);
+
+// Frees the provided frame state.
+void JxlFastLosslessFreeFrameState(JxlFastLosslessFrameState* frame);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // LIB_JXL_ENC_FAST_LOSSLESS_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_fields.cc b/third_party/jpeg-xl/lib/jxl/enc_fields.cc
new file mode 100644
index 0000000000..22c763e13f
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_fields.cc
@@ -0,0 +1,239 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_fields.h"
+
+#include "lib/jxl/enc_aux_out.h"
+#include "lib/jxl/fields.h"
+
+namespace jxl {
+
+namespace {
+using ::jxl::fields_internal::VisitorBase;
+class WriteVisitor : public VisitorBase {
+ public:
+  WriteVisitor(const size_t extension_bits, BitWriter* JXL_RESTRICT writer)
+      : extension_bits_(extension_bits), writer_(writer) {}
+
+  Status Bits(const size_t bits, const uint32_t /*default_value*/,
+              uint32_t* JXL_RESTRICT value) override {
+    ok_ &= BitsCoder::Write(bits, *value, writer_);
+    return true;
+  }
+  Status U32(const U32Enc enc, const uint32_t /*default_value*/,
+             uint32_t* JXL_RESTRICT value) override {
+    ok_ &= U32Coder::Write(enc, *value, writer_);
+    return true;
+  }
+
+  Status U64(const uint64_t /*default_value*/,
+             uint64_t* JXL_RESTRICT value) override {
+    ok_ &= U64Coder::Write(*value, writer_);
+    return true;
+  }
+
+  Status F16(const float /*default_value*/,
+             float* JXL_RESTRICT value) override {
+    ok_ &= F16Coder::Write(*value, writer_);
+    return true;
+  }
+
+  Status BeginExtensions(uint64_t* JXL_RESTRICT extensions) override {
+    JXL_QUIET_RETURN_IF_ERROR(VisitorBase::BeginExtensions(extensions));
+    if (*extensions == 0) {
+      JXL_ASSERT(extension_bits_ == 0);
+      return true;
+    }
+    // TODO(janwas): extend API to pass in array of extension_bits, one per
+    // extension. We currently ascribe all bits to the first extension, but
+    // this is only an encoder limitation. NOTE: extension_bits_ can be zero
+    // if an extension does not require any additional fields.
+    ok_ &= U64Coder::Write(extension_bits_, writer_);
+    // For each nonzero bit except the lowest/first (already written):
+    for (uint64_t remaining_extensions = *extensions & (*extensions - 1);
+         remaining_extensions != 0;
+         remaining_extensions &= remaining_extensions - 1) {
+      ok_ &= U64Coder::Write(0, writer_);
+    }
+    return true;
+  }
+  // EndExtensions = default.
+
+  Status OK() const { return ok_; }
+
+ private:
+  const size_t extension_bits_;
+  BitWriter* JXL_RESTRICT writer_;
+  bool ok_ = true;
+};
+}  // namespace
+
+Status Bundle::Write(const Fields& fields, BitWriter* writer, size_t layer,
+                     AuxOut* aux_out) {
+  size_t extension_bits, total_bits;
+  JXL_RETURN_IF_ERROR(Bundle::CanEncode(fields, &extension_bits, &total_bits));
+
+  BitWriter::Allotment allotment(writer, total_bits);
+  WriteVisitor visitor(extension_bits, writer);
+  JXL_RETURN_IF_ERROR(visitor.VisitConst(fields));
+  JXL_RETURN_IF_ERROR(visitor.OK());
+  allotment.ReclaimAndCharge(writer, layer, aux_out);
+  return true;
+}
+
+// Returns false if the value is too large to encode.
+Status BitsCoder::Write(const size_t bits, const uint32_t value,
+                        BitWriter* JXL_RESTRICT writer) {
+  if (value >= (1ULL << bits)) {
+    return JXL_FAILURE("Value %d too large to encode in %" PRIu64 " bits",
+                       value, static_cast<uint64_t>(bits));
+  }
+  writer->Write(bits, value);
+  return true;
+}
+
+// Returns false if the value is too large to encode.
+Status U32Coder::Write(const U32Enc enc, const uint32_t value,
+                       BitWriter* JXL_RESTRICT writer) {
+  uint32_t selector;
+  size_t total_bits;
+  JXL_RETURN_IF_ERROR(ChooseSelector(enc, value, &selector, &total_bits));
+
+  writer->Write(2, selector);
+
+  const U32Distr d = enc.GetDistr(selector);
+  if (!d.IsDirect()) {  // Nothing more to write for direct encoding
+    const uint32_t offset = d.Offset();
+    JXL_ASSERT(value >= offset);
+    writer->Write(total_bits - 2, value - offset);
+  }
+
+  return true;
+}
+
+// Returns false if the value is too large to encode.
+Status U64Coder::Write(uint64_t value, BitWriter* JXL_RESTRICT writer) {
+  if (value == 0) {
+    // Selector: use 0 bits, value 0
+    writer->Write(2, 0);
+  } else if (value <= 16) {
+    // Selector: use 4 bits, value 1..16
+    writer->Write(2, 1);
+    writer->Write(4, value - 1);
+  } else if (value <= 272) {
+    // Selector: use 8 bits, value 17..272
+    writer->Write(2, 2);
+    writer->Write(8, value - 17);
+  } else {
+    // Selector: varint, first a 12-bit group, after that per 8-bit group.
+    writer->Write(2, 3);
+    writer->Write(12, value & 4095);
+    value >>= 12;
+    int shift = 12;
+    while (value > 0 && shift < 60) {
+      // Indicate varint not done
+      writer->Write(1, 1);
+      writer->Write(8, value & 255);
+      value >>= 8;
+      shift += 8;
+    }
+    if (value > 0) {
+      // This only could happen if shift == N - 4.
+      writer->Write(1, 1);
+      writer->Write(4, value & 15);
+      // Implicitly closed sequence, no extra stop bit is required.
+    } else {
+      // Indicate end of varint
+      writer->Write(1, 0);
+    }
+  }
+
+  return true;
+}
+
+Status F16Coder::Write(float value, BitWriter* JXL_RESTRICT writer) {
+  uint32_t bits32;
+  memcpy(&bits32, &value, sizeof(bits32));
+  const uint32_t sign = bits32 >> 31;
+  const uint32_t biased_exp32 = (bits32 >> 23) & 0xFF;
+  const uint32_t mantissa32 = bits32 & 0x7FFFFF;
+
+  const int32_t exp = static_cast<int32_t>(biased_exp32) - 127;
+  if (JXL_UNLIKELY(exp > 15)) {
+    return JXL_FAILURE("Too big to encode, CanEncode should return false");
+  }
+
+  // Tiny or zero => zero.
+  if (exp < -24) {
+    writer->Write(16, 0);
+    return true;
+  }
+
+  uint32_t biased_exp16, mantissa16;
+
+  // exp = [-24, -15] => subnormal
+  if (JXL_UNLIKELY(exp < -14)) {
+    biased_exp16 = 0;
+    const uint32_t sub_exp = static_cast<uint32_t>(-14 - exp);
+    JXL_ASSERT(1 <= sub_exp && sub_exp < 11);
+    mantissa16 = (1 << (10 - sub_exp)) + (mantissa32 >> (13 + sub_exp));
+  } else {
+    // exp = [-14, 15]
+    biased_exp16 = static_cast<uint32_t>(exp + 15);
+    JXL_ASSERT(1 <= biased_exp16 && biased_exp16 < 31);
+    mantissa16 = mantissa32 >> 13;
+  }
+
+  JXL_ASSERT(mantissa16 < 1024);
+  const uint32_t bits16 = (sign << 15) | (biased_exp16 << 10) | mantissa16;
+  JXL_ASSERT(bits16 < 0x10000);
+  writer->Write(16, bits16);
+  return true;
+}
+
+Status WriteCodestreamHeaders(CodecMetadata* metadata, BitWriter* writer,
+                              AuxOut* aux_out) {
+  // Marker/signature
+  BitWriter::Allotment allotment(writer, 16);
+  writer->Write(8, 0xFF);
+  writer->Write(8, kCodestreamMarker);
+  allotment.ReclaimAndCharge(writer, kLayerHeader, aux_out);
+
+  JXL_RETURN_IF_ERROR(
+      WriteSizeHeader(metadata->size, writer, kLayerHeader, aux_out));
+
+  JXL_RETURN_IF_ERROR(
+      WriteImageMetadata(metadata->m, writer, kLayerHeader, aux_out));
+
+  metadata->transform_data.nonserialized_xyb_encoded = metadata->m.xyb_encoded;
+  JXL_RETURN_IF_ERROR(
+      Bundle::Write(metadata->transform_data, writer, kLayerHeader, aux_out));
+
+  return true;
+}
+
+Status WriteFrameHeader(const FrameHeader& frame,
+                        BitWriter* JXL_RESTRICT writer, AuxOut* aux_out) {
+  return Bundle::Write(frame, writer, kLayerHeader, aux_out);
+}
+
+Status WriteImageMetadata(const ImageMetadata& metadata,
+                          BitWriter* JXL_RESTRICT writer, size_t layer,
+                          AuxOut* aux_out) {
+  return Bundle::Write(metadata, writer, layer, aux_out);
+}
+
+Status WriteQuantizerParams(const QuantizerParams& params,
+                            BitWriter* JXL_RESTRICT writer, size_t layer,
+                            AuxOut* aux_out) {
+  return Bundle::Write(params, writer, layer, aux_out);
+}
+
+Status WriteSizeHeader(const SizeHeader& size, BitWriter* JXL_RESTRICT writer,
+                       size_t layer, AuxOut* aux_out) {
+  return Bundle::Write(size, writer, layer, aux_out);
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/enc_fields.h b/third_party/jpeg-xl/lib/jxl/enc_fields.h
new file mode 100644
index 0000000000..5bb179a719
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_fields.h
@@ -0,0 +1,37 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_FIELDS_H_
+#define LIB_JXL_ENC_FIELDS_H_
+
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/enc_bit_writer.h"
+#include "lib/jxl/frame_header.h"
+#include "lib/jxl/headers.h"
+#include "lib/jxl/image_metadata.h"
+#include "lib/jxl/quantizer.h"
+
+namespace jxl {
+
+struct AuxOut;
+
+// Write headers from the CodecMetadata. Also may modify nonserialized_...
+// fields of the metadata.
+Status WriteCodestreamHeaders(CodecMetadata* metadata, BitWriter* writer,
+                              AuxOut* aux_out);
+
+Status WriteFrameHeader(const FrameHeader& frame,
+                        BitWriter* JXL_RESTRICT writer, AuxOut* aux_out);
+
+Status WriteQuantizerParams(const QuantizerParams& params,
+                            BitWriter* JXL_RESTRICT writer, size_t layer,
+                            AuxOut* aux_out);
+
+Status WriteSizeHeader(const SizeHeader& size, BitWriter* JXL_RESTRICT writer,
+                       size_t layer, AuxOut* aux_out);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_FIELDS_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_file.cc b/third_party/jpeg-xl/lib/jxl/enc_file.cc
new file mode 100644
index 0000000000..b1f1442cc2
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_file.cc
@@ -0,0 +1,141 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_file.h"
+
+#include <stddef.h>
+
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/codec_in_out.h"
+#include "lib/jxl/coeff_order_fwd.h"
+#include "lib/jxl/color_encoding_internal.h"
+#include "lib/jxl/enc_aux_out.h"
+#include "lib/jxl/enc_bit_writer.h"
+#include "lib/jxl/enc_cache.h"
+#include "lib/jxl/enc_fields.h"
+#include "lib/jxl/enc_frame.h"
+#include "lib/jxl/enc_icc_codec.h"
+#include "lib/jxl/frame_header.h"
+#include "lib/jxl/headers.h"
+#include "lib/jxl/image_bundle.h"
+
+namespace jxl {
+
+namespace {
+
+Status PrepareCodecMetadataFromIO(const CompressParams& cparams,
+                                  const CodecInOut* io,
+                                  CodecMetadata* metadata) {
+  *metadata = io->metadata;
+  size_t ups = 1;
+  if (cparams.already_downsampled) ups = cparams.resampling;
+
+  JXL_RETURN_IF_ERROR(metadata->size.Set(io->xsize() * ups, io->ysize() * ups));
+
+  // Keep ICC profile in lossless modes because a reconstructed profile may be
+  // slightly different (quantization).
+  // Also keep ICC in JPEG reconstruction mode as we need byte-exact profiles.
+  if (!cparams.IsLossless() && !io->Main().IsJPEG()) {
+    metadata->m.color_encoding.DecideIfWantICC();
+  }
+
+  metadata->m.xyb_encoded =
+      cparams.color_transform == ColorTransform::kXYB ? true : false;
+
+  // TODO(firsching): move this EncodeFile to test_utils / re-implement this
+  // using API functions
+  return true;
+}
+
+}  // namespace
+
+Status EncodePreview(const CompressParams& cparams, const ImageBundle& ib,
+                     const CodecMetadata* metadata, const JxlCmsInterface& cms,
+                     ThreadPool* pool, BitWriter* JXL_RESTRICT writer) {
+  BitWriter preview_writer;
+  // TODO(janwas): also support generating preview by downsampling
+  if (ib.HasColor()) {
+    AuxOut aux_out;
+    PassesEncoderState passes_enc_state;
+    // TODO(lode): check if we want all extra channels and matching xyb_encoded
+    // for the preview, such that using the main ImageMetadata object for
+    // encoding this frame is warrented.
+    FrameInfo frame_info;
+    frame_info.is_preview = true;
+    JXL_RETURN_IF_ERROR(EncodeFrame(cparams, frame_info, metadata, ib,
+                                    &passes_enc_state, cms, pool,
+                                    &preview_writer, &aux_out));
+    preview_writer.ZeroPadToByte();
+  }
+
+  if (preview_writer.BitsWritten() != 0) {
+    writer->ZeroPadToByte();
+    writer->AppendByteAligned(preview_writer);
+  }
+
+  return true;
+}
+
+Status EncodeFile(const CompressParams& params, const CodecInOut* io,
+                  PassesEncoderState* passes_enc_state, PaddedBytes* compressed,
+                  const JxlCmsInterface& cms, AuxOut* aux_out,
+                  ThreadPool* pool) {
+  io->CheckMetadata();
+  BitWriter writer;
+
+  CompressParams cparams = params;
+  if (io->Main().color_transform != ColorTransform::kNone) {
+    // Set the color transform to YCbCr or XYB if the original image is such.
+    cparams.color_transform = io->Main().color_transform;
+  }
+
+  JXL_RETURN_IF_ERROR(ParamsPostInit(&cparams));
+
+  std::unique_ptr<CodecMetadata> metadata = jxl::make_unique<CodecMetadata>();
+  JXL_RETURN_IF_ERROR(PrepareCodecMetadataFromIO(cparams, io, metadata.get()));
+  JXL_RETURN_IF_ERROR(WriteCodestreamHeaders(metadata.get(), &writer, aux_out));
+
+  // Only send ICC (at least several hundred bytes) if fields aren't enough.
+  if (metadata->m.color_encoding.WantICC()) {
+    JXL_RETURN_IF_ERROR(WriteICC(metadata->m.color_encoding.ICC(), &writer,
+                                 kLayerHeader, aux_out));
+  }
+
+  if (metadata->m.have_preview) {
+    JXL_RETURN_IF_ERROR(EncodePreview(cparams, io->preview_frame,
+                                      metadata.get(), cms, pool, &writer));
+  }
+
+  // Each frame should start on byte boundaries.
+  BitWriter::Allotment allotment(&writer, 8);
+  writer.ZeroPadToByte();
+  allotment.ReclaimAndCharge(&writer, kLayerHeader, aux_out);
+
+  for (size_t i = 0; i < io->frames.size(); i++) {
+    FrameInfo info;
+    info.is_last = i == io->frames.size() - 1;
+    if (io->frames[i].use_for_next_frame) {
+      info.save_as_reference = 1;
+    }
+    JXL_RETURN_IF_ERROR(EncodeFrame(cparams, info, metadata.get(),
+                                    io->frames[i], passes_enc_state, cms, pool,
+                                    &writer, aux_out));
+  }
+
+  // Clean up passes_enc_state in case it gets reused.
+  for (size_t i = 0; i < 4; i++) {
+    passes_enc_state->shared.dc_frames[i] = Image3F();
+    passes_enc_state->shared.reference_frames[i].frame = ImageBundle();
+  }
+
+  *compressed = std::move(writer).TakeBytes();
+  return true;
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/enc_file.h b/third_party/jpeg-xl/lib/jxl/enc_file.h
new file mode 100644
index 0000000000..ff3ad1233d
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_file.h
@@ -0,0 +1,31 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_FILE_H_
+#define LIB_JXL_ENC_FILE_H_
+
+// Facade for JXL encoding.
+
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/padded_bytes.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/enc_cache.h"
+#include "lib/jxl/enc_params.h"
+
+namespace jxl {
+
+struct AuxOut;
+class CodecInOut;
+
+// Compresses pixels from `io` (given in any ColorEncoding).
+// `io->metadata.m.original` must be set.
+Status EncodeFile(const CompressParams& params, const CodecInOut* io,
+                  PassesEncoderState* passes_enc_state, PaddedBytes* compressed,
+                  const JxlCmsInterface& cms, AuxOut* aux_out = nullptr,
+                  ThreadPool* pool = nullptr);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_FILE_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_frame.cc b/third_party/jpeg-xl/lib/jxl/enc_frame.cc
new file mode 100644
index 0000000000..ed4088120e
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_frame.cc
@@ -0,0 +1,1745 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_frame.h"
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <algorithm>
+#include <array>
+#include <atomic>
+#include <cmath>
+#include <limits>
+#include <numeric>
+#include <vector>
+
+#include "lib/jxl/ac_context.h"
+#include "lib/jxl/ac_strategy.h"
+#include "lib/jxl/ans_params.h"
+#include "lib/jxl/base/bits.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/override.h"
+#include "lib/jxl/base/padded_bytes.h"
+#include "lib/jxl/base/profiler.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/chroma_from_luma.h"
+#include "lib/jxl/coeff_order.h"
+#include "lib/jxl/coeff_order_fwd.h"
+#include "lib/jxl/color_encoding_internal.h"
+#include "lib/jxl/color_management.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/compressed_dc.h"
+#include "lib/jxl/dct_util.h"
+#include "lib/jxl/enc_adaptive_quantization.h"
+#include "lib/jxl/enc_ans.h"
+#include "lib/jxl/enc_aux_out.h"
+#include "lib/jxl/enc_bit_writer.h"
+#include "lib/jxl/enc_cache.h"
+#include "lib/jxl/enc_chroma_from_luma.h"
+#include "lib/jxl/enc_coeff_order.h"
+#include "lib/jxl/enc_context_map.h"
+#include "lib/jxl/enc_entropy_coder.h"
+#include "lib/jxl/enc_fields.h"
+#include "lib/jxl/enc_gaborish.h"
+#include "lib/jxl/enc_group.h"
+#include "lib/jxl/enc_modular.h"
+#include "lib/jxl/enc_noise.h"
+#include "lib/jxl/enc_params.h"
+#include "lib/jxl/enc_patch_dictionary.h"
+#include "lib/jxl/enc_photon_noise.h"
+#include "lib/jxl/enc_quant_weights.h"
+#include "lib/jxl/enc_splines.h"
+#include "lib/jxl/enc_toc.h"
+#include "lib/jxl/enc_xyb.h"
+#include "lib/jxl/fields.h"
+#include "lib/jxl/frame_header.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/image_bundle.h"
+#include "lib/jxl/image_ops.h"
+#include "lib/jxl/loop_filter.h"
+#include "lib/jxl/modular/options.h"
+#include "lib/jxl/quant_weights.h"
+#include "lib/jxl/quantizer.h"
+#include "lib/jxl/splines.h"
+#include "lib/jxl/toc.h"
+
+namespace jxl {
+namespace {
+
+PassDefinition progressive_passes_dc_vlf_lf_full_ac[] = {
+    {/*num_coefficients=*/2, /*shift=*/0,
+     /*suitable_for_downsampling_of_at_least=*/4},
+    {/*num_coefficients=*/3, /*shift=*/0,
+     /*suitable_for_downsampling_of_at_least=*/2},
+    {/*num_coefficients=*/8, /*shift=*/0,
+     /*suitable_for_downsampling_of_at_least=*/0},
+};
+
+PassDefinition progressive_passes_dc_quant_ac_full_ac[] = {
+    {/*num_coefficients=*/8, /*shift=*/1,
+     /*suitable_for_downsampling_of_at_least=*/2},
+    {/*num_coefficients=*/8, /*shift=*/0,
+     /*suitable_for_downsampling_of_at_least=*/0},
+};
+
+void ClusterGroups(PassesEncoderState* enc_state) {
+  if (enc_state->shared.frame_header.passes.num_passes > 1) {
+    // TODO(veluca): implement this for progressive modes.
+    return;
+  }
+  // This only considers pass 0 for now.
+  std::vector<uint8_t> context_map;
+  EntropyEncodingData codes;
+  auto& ac = enc_state->passes[0].ac_tokens;
+  size_t limit = std::ceil(std::sqrt(ac.size()));
+  if (limit == 1) return;
+  size_t num_contexts = enc_state->shared.block_ctx_map.NumACContexts();
+  std::vector<float> costs(ac.size());
+  HistogramParams params;
+  params.uint_method = HistogramParams::HybridUintMethod::kNone;
+  params.lz77_method = HistogramParams::LZ77Method::kNone;
+  params.ans_histogram_strategy =
+      HistogramParams::ANSHistogramStrategy::kApproximate;
+  size_t max = 0;
+  auto token_cost = [&](std::vector<std::vector<Token>>& tokens, size_t num_ctx,
+                        bool estimate = true) {
+    // TODO(veluca): not estimating is very expensive.
+    BitWriter writer;
+    size_t c = BuildAndEncodeHistograms(
+        params, num_ctx, tokens, &codes, &context_map,
+        estimate ? nullptr : &writer, 0, /*aux_out=*/0);
+    if (estimate) return c;
+    for (size_t i = 0; i < tokens.size(); i++) {
+      WriteTokens(tokens[i], codes, context_map, &writer, 0, nullptr);
+    }
+    return writer.BitsWritten();
+  };
+  for (size_t i = 0; i < ac.size(); i++) {
+    std::vector<std::vector<Token>> tokens{ac[i]};
+    costs[i] =
+        token_cost(tokens, enc_state->shared.block_ctx_map.NumACContexts());
+    if (costs[i] > costs[max]) {
+      max = i;
+    }
+  }
+  auto dist = [&](int i, int j) {
+    std::vector<std::vector<Token>> tokens{ac[i], ac[j]};
+    return token_cost(tokens, num_contexts) - costs[i] - costs[j];
+  };
+  std::vector<size_t> out{max};
+  std::vector<float> dists(ac.size());
+  size_t farthest = 0;
+  for (size_t i = 0; i < ac.size(); i++) {
+    if (i == max) continue;
+    dists[i] = dist(max, i);
+    if (dists[i] > dists[farthest]) {
+      farthest = i;
+    }
+  }
+
+  while (dists[farthest] > 0 && out.size() < limit) {
+    out.push_back(farthest);
+    dists[farthest] = 0;
+    enc_state->histogram_idx[farthest] = out.size() - 1;
+    for (size_t i = 0; i < ac.size(); i++) {
+      float d = dist(out.back(), i);
+      if (d < dists[i]) {
+        dists[i] = d;
+        enc_state->histogram_idx[i] = out.size() - 1;
+      }
+      if (dists[i] > dists[farthest]) {
+        farthest = i;
+      }
+    }
+  }
+
+  std::vector<size_t> remap(out.size());
+  std::iota(remap.begin(), remap.end(), 0);
+  for (size_t i = 0; i < enc_state->histogram_idx.size(); i++) {
+    enc_state->histogram_idx[i] = remap[enc_state->histogram_idx[i]];
+  }
+  auto remap_cost = [&](std::vector<size_t> remap) {
+    std::vector<size_t> re_remap(remap.size(), remap.size());
+    size_t r = 0;
+    for (size_t i = 0; i < remap.size(); i++) {
+      if (re_remap[remap[i]] == remap.size()) {
+        re_remap[remap[i]] = r++;
+      }
+      remap[i] = re_remap[remap[i]];
+    }
+    auto tokens = ac;
+    size_t max_hist = 0;
+    for (size_t i = 0; i < tokens.size(); i++) {
+      for (size_t j = 0; j < tokens[i].size(); j++) {
+        size_t hist = remap[enc_state->histogram_idx[i]];
+        tokens[i][j].context += hist * num_contexts;
+        max_hist = std::max(hist + 1, max_hist);
+      }
+    }
+    return token_cost(tokens, max_hist * num_contexts, /*estimate=*/false);
+  };
+
+  for (size_t src = 0; src < out.size(); src++) {
+    float cost = remap_cost(remap);
+    size_t best = src;
+    for (size_t j = src + 1; j < out.size(); j++) {
+      if (remap[src] == remap[j]) continue;
+      auto remap_c = remap;
+      std::replace(remap_c.begin(), remap_c.end(), remap[src], remap[j]);
+      float c = remap_cost(remap_c);
+      if (c < cost) {
+        best = j;
+        cost = c;
+      }
+    }
+    if (src != best) {
+      std::replace(remap.begin(), remap.end(), remap[src], remap[best]);
+    }
+  }
+  std::vector<size_t> re_remap(remap.size(), remap.size());
+  size_t r = 0;
+  for (size_t i = 0; i < remap.size(); i++) {
+    if (re_remap[remap[i]] == remap.size()) {
+      re_remap[remap[i]] = r++;
+    }
+    remap[i] = re_remap[remap[i]];
+  }
+
+  enc_state->shared.num_histograms =
+      *std::max_element(remap.begin(), remap.end()) + 1;
+  for (size_t i = 0; i < enc_state->histogram_idx.size(); i++) {
+    enc_state->histogram_idx[i] = remap[enc_state->histogram_idx[i]];
+  }
+  for (size_t i = 0; i < ac.size(); i++) {
+    for (size_t j = 0; j < ac[i].size(); j++) {
+      ac[i][j].context += enc_state->histogram_idx[i] * num_contexts;
+    }
+  }
+}
+
+uint64_t FrameFlagsFromParams(const CompressParams& cparams) {
+  uint64_t flags = 0;
+
+  const float dist = cparams.butteraugli_distance;
+
+  // We don't add noise at low butteraugli distances because the original
+  // noise is stored within the compressed image and adding noise makes things
+  // worse.
+  if (ApplyOverride(cparams.noise, dist >= kMinButteraugliForNoise) ||
+      cparams.photon_noise_iso > 0 ||
+      cparams.manual_noise.size() == NoiseParams::kNumNoisePoints) {
+    flags |= FrameHeader::kNoise;
+  }
+
+  if (cparams.progressive_dc > 0 && cparams.modular_mode == false) {
+    flags |= FrameHeader::kUseDcFrame;
+  }
+
+  return flags;
+}
+
+Status LoopFilterFromParams(const CompressParams& cparams,
+                            FrameHeader* JXL_RESTRICT frame_header) {
+  LoopFilter* loop_filter = &frame_header->loop_filter;
+
+  // Gaborish defaults to enabled in Hare or slower.
+  loop_filter->gab = ApplyOverride(
+      cparams.gaborish, cparams.speed_tier <= SpeedTier::kHare &&
+                            frame_header->encoding == FrameEncoding::kVarDCT &&
+                            cparams.decoding_speed_tier < 4);
+
+  if (cparams.epf != -1) {
+    loop_filter->epf_iters = cparams.epf;
+  } else {
+    if (frame_header->encoding == FrameEncoding::kModular) {
+      loop_filter->epf_iters = 0;
+    } else {
+      constexpr float kThresholds[3] = {0.7, 1.5, 4.0};
+      loop_filter->epf_iters = 0;
+      if (cparams.decoding_speed_tier < 3) {
+        for (size_t i = cparams.decoding_speed_tier == 2 ? 1 : 0; i < 3; i++) {
+          if (cparams.butteraugli_distance >= kThresholds[i]) {
+            loop_filter->epf_iters++;
+          }
+        }
+      }
+    }
+  }
+  // Strength of EPF in modular mode.
+  if (frame_header->encoding == FrameEncoding::kModular &&
+      !cparams.IsLossless()) {
+    // TODO(veluca): this formula is nonsense.
+    loop_filter->epf_sigma_for_modular = cparams.butteraugli_distance;
+  }
+  if (frame_header->encoding == FrameEncoding::kModular &&
+      cparams.lossy_palette) {
+    loop_filter->epf_sigma_for_modular = 1.0f;
+  }
+
+  return true;
+}
+
+Status MakeFrameHeader(const CompressParams& cparams,
+                       const ProgressiveSplitter& progressive_splitter,
+                       const FrameInfo& frame_info, const ImageBundle& ib,
+                       FrameHeader* JXL_RESTRICT frame_header) {
+  frame_header->nonserialized_is_preview = frame_info.is_preview;
+  frame_header->is_last = frame_info.is_last;
+  frame_header->save_before_color_transform =
+      frame_info.save_before_color_transform;
+  frame_header->frame_type = frame_info.frame_type;
+  frame_header->name = ib.name;
+
+  progressive_splitter.InitPasses(&frame_header->passes);
+
+  if (cparams.modular_mode) {
+    frame_header->encoding = FrameEncoding::kModular;
+    frame_header->group_size_shift = cparams.modular_group_size_shift;
+  }
+
+  frame_header->chroma_subsampling = ib.chroma_subsampling;
+  if (ib.IsJPEG()) {
+    // we are transcoding a JPEG, so we don't get to choose
+    frame_header->encoding = FrameEncoding::kVarDCT;
+    frame_header->color_transform = ib.color_transform;
+  } else {
+    frame_header->color_transform = cparams.color_transform;
+    if (!cparams.modular_mode &&
+        (frame_header->chroma_subsampling.MaxHShift() != 0 ||
+         frame_header->chroma_subsampling.MaxVShift() != 0)) {
+      return JXL_FAILURE(
+          "Chroma subsampling is not supported in VarDCT mode when not "
+          "recompressing JPEGs");
+    }
+  }
+  if (frame_header->color_transform != ColorTransform::kYCbCr &&
+      (frame_header->chroma_subsampling.MaxHShift() != 0 ||
+       frame_header->chroma_subsampling.MaxVShift() != 0)) {
+    return JXL_FAILURE(
+        "Chroma subsampling is not supported when color transform is not "
+        "YCbCr");
+  }
+
+  frame_header->flags = FrameFlagsFromParams(cparams);
+  // Non-photon noise is not supported in the Modular encoder for now.
+  if (frame_header->encoding != FrameEncoding::kVarDCT &&
+      cparams.photon_noise_iso == 0 && cparams.manual_noise.empty()) {
+    frame_header->UpdateFlag(false, FrameHeader::Flags::kNoise);
+  }
+
+  JXL_RETURN_IF_ERROR(LoopFilterFromParams(cparams, frame_header));
+
+  frame_header->dc_level = frame_info.dc_level;
+  if (frame_header->dc_level > 2) {
+    // With 3 or more progressive_dc frames, the implementation does not yet
+    // work, see enc_cache.cc.
+    return JXL_FAILURE("progressive_dc > 2 is not yet supported");
+  }
+  if (cparams.progressive_dc > 0 &&
+      (cparams.ec_resampling != 1 || cparams.resampling != 1)) {
+    return JXL_FAILURE("Resampling not supported with DC frames");
+  }
+  if (cparams.resampling != 1 && cparams.resampling != 2 &&
+      cparams.resampling != 4 && cparams.resampling != 8) {
+    return JXL_FAILURE("Invalid resampling factor");
+  }
+  if (cparams.ec_resampling != 1 && cparams.ec_resampling != 2 &&
+      cparams.ec_resampling != 4 && cparams.ec_resampling != 8) {
+    return JXL_FAILURE("Invalid ec_resampling factor");
+  }
+  // Resized frames.
+  if (frame_info.frame_type != FrameType::kDCFrame) {
+    frame_header->frame_origin = ib.origin;
+    size_t ups = 1;
+    if (cparams.already_downsampled) ups = cparams.resampling;
+
+    // TODO(lode): this is not correct in case of odd original image sizes in
+    // combination with cparams.already_downsampled. Likely these values should
+    // be set to respectively frame_header->default_xsize() and
+    // frame_header->default_ysize() instead, the original (non downsampled)
+    // intended decoded image dimensions. But it may be more subtle than that
+    // if combined with crop. This issue causes custom_size_or_origin to be
+    // incorrectly set to true in case of already_downsampled with odd output
+    // image size when no cropping is used.
+    frame_header->frame_size.xsize = ib.xsize() * ups;
+    frame_header->frame_size.ysize = ib.ysize() * ups;
+    if (ib.origin.x0 != 0 || ib.origin.y0 != 0 ||
+        frame_header->frame_size.xsize != frame_header->default_xsize() ||
+        frame_header->frame_size.ysize != frame_header->default_ysize()) {
+      frame_header->custom_size_or_origin = true;
+    }
+  }
+  // Upsampling.
+  frame_header->upsampling = cparams.resampling;
+  const std::vector<ExtraChannelInfo>& extra_channels =
+      frame_header->nonserialized_metadata->m.extra_channel_info;
+  frame_header->extra_channel_upsampling.clear();
+  frame_header->extra_channel_upsampling.resize(extra_channels.size(),
+                                                cparams.ec_resampling);
+  frame_header->save_as_reference = frame_info.save_as_reference;
+
+  // Set blending-related information.
+  if (ib.blend || frame_header->custom_size_or_origin) {
+    // Set blend_channel to the first alpha channel. These values are only
+    // encoded in case a blend mode involving alpha is used and there are more
+    // than one extra channels.
+    size_t index = 0;
+    if (frame_info.alpha_channel == -1) {
+      if (extra_channels.size() > 1) {
+        for (size_t i = 0; i < extra_channels.size(); i++) {
+          if (extra_channels[i].type == ExtraChannel::kAlpha) {
+            index = i;
+            break;
+          }
+        }
+      }
+    } else {
+      index = static_cast<size_t>(frame_info.alpha_channel);
+      JXL_ASSERT(index == 0 || index < extra_channels.size());
+    }
+    frame_header->blending_info.alpha_channel = index;
+    frame_header->blending_info.mode =
+        ib.blend ? ib.blendmode : BlendMode::kReplace;
+    frame_header->blending_info.source = frame_info.source;
+    frame_header->blending_info.clamp = frame_info.clamp;
+    const auto& extra_channel_info = frame_info.extra_channel_blending_info;
+    for (size_t i = 0; i < extra_channels.size(); i++) {
+      if (i < extra_channel_info.size()) {
+        frame_header->extra_channel_blending_info[i] = extra_channel_info[i];
+      } else {
+        frame_header->extra_channel_blending_info[i].alpha_channel = index;
+        BlendMode default_blend = ib.blendmode;
+        if (extra_channels[i].type != ExtraChannel::kBlack && i != index) {
+          // K needs to be blended, spot colors and other stuff gets added
+          default_blend = BlendMode::kAdd;
+        }
+        frame_header->extra_channel_blending_info[i].mode =
+            ib.blend ? default_blend : BlendMode::kReplace;
+        frame_header->extra_channel_blending_info[i].source = 1;
+      }
+    }
+  }
+
+  frame_header->animation_frame.duration = ib.duration;
+  frame_header->animation_frame.timecode = ib.timecode;
+
+  return true;
+}
+
+// Invisible (alpha = 0) pixels tend to be a mess in optimized PNGs.
+// Since they have no visual impact whatsoever, we can replace them with
+// something that compresses better and reduces artifacts near the edges. This
+// does some kind of smooth stuff that seems to work.
+// Replace invisible pixels with a weighted average of the pixel to the left,
+// the pixel to the topright, and non-invisible neighbours.
+// Produces downward-blurry smears, with in the upwards direction only a 1px
+// edge duplication but not more. It would probably be better to smear in all
+// directions. That requires an alpha-weighed convolution with a large enough
+// kernel though, which might be overkill...
+void SimplifyInvisible(Image3F* image, const ImageF& alpha, bool lossless) {
+  for (size_t c = 0; c < 3; ++c) {
+    for (size_t y = 0; y < image->ysize(); ++y) {
+      float* JXL_RESTRICT row = image->PlaneRow(c, y);
+      const float* JXL_RESTRICT prow =
+          (y > 0 ? image->PlaneRow(c, y - 1) : nullptr);
+      const float* JXL_RESTRICT nrow =
+          (y + 1 < image->ysize() ? image->PlaneRow(c, y + 1) : nullptr);
+      const float* JXL_RESTRICT a = alpha.Row(y);
+      const float* JXL_RESTRICT pa = (y > 0 ? alpha.Row(y - 1) : nullptr);
+      const float* JXL_RESTRICT na =
+          (y + 1 < image->ysize() ? alpha.Row(y + 1) : nullptr);
+      for (size_t x = 0; x < image->xsize(); ++x) {
+        if (a[x] == 0) {
+          if (lossless) {
+            row[x] = 0;
+            continue;
+          }
+          float d = 0.f;
+          row[x] = 0;
+          if (x > 0) {
+            row[x] += row[x - 1];
+            d++;
+            if (a[x - 1] > 0.f) {
+              row[x] += row[x - 1];
+              d++;
+            }
+          }
+          if (x + 1 < image->xsize()) {
+            if (y > 0) {
+              row[x] += prow[x + 1];
+              d++;
+            }
+            if (a[x + 1] > 0.f) {
+              row[x] += 2.f * row[x + 1];
+              d += 2.f;
+            }
+            if (y > 0 && pa[x + 1] > 0.f) {
+              row[x] += 2.f * prow[x + 1];
+              d += 2.f;
+            }
+            if (y + 1 < image->ysize() && na[x + 1] > 0.f) {
+              row[x] += 2.f * nrow[x + 1];
+              d += 2.f;
+            }
+          }
+          if (y > 0 && pa[x] > 0.f) {
+            row[x] += 2.f * prow[x];
+            d += 2.f;
+          }
+          if (y + 1 < image->ysize() && na[x] > 0.f) {
+            row[x] += 2.f * nrow[x];
+            d += 2.f;
+          }
+          if (d > 1.f) row[x] /= d;
+        }
+      }
+    }
+  }
+}
+
+struct PixelStatsForChromacityAdjustment {
+  float dx = 0;
+  float db = 0;
+  float exposed_blue = 0;
+  float CalcPlane(const ImageF* JXL_RESTRICT plane) const {
+    float xmax = 0;
+    float ymax = 0;
+    for (size_t ty = 1; ty < plane->ysize(); ++ty) {
+      for (size_t tx = 1; tx < plane->xsize(); ++tx) {
+        float cur = plane->Row(ty)[tx];
+        float prev_row = plane->Row(ty - 1)[tx];
+        float prev = plane->Row(ty)[tx - 1];
+        xmax = std::max(xmax, std::abs(cur - prev));
+        ymax = std::max(ymax, std::abs(cur - prev_row));
+      }
+    }
+    return std::max(xmax, ymax);
+  }
+  void CalcExposedBlue(const ImageF* JXL_RESTRICT plane_y,
+                       const ImageF* JXL_RESTRICT plane_b) {
+    float eb = 0;
+    float xmax = 0;
+    float ymax = 0;
+    for (size_t ty = 1; ty < plane_y->ysize(); ++ty) {
+      for (size_t tx = 1; tx < plane_y->xsize(); ++tx) {
+        float cur_y = plane_y->Row(ty)[tx];
+        float cur_b = plane_b->Row(ty)[tx];
+        float exposed_b = cur_b - cur_y * 1.2;
+        float diff_b = cur_b - cur_y;
+        float prev_row = plane_b->Row(ty - 1)[tx];
+        float prev = plane_b->Row(ty)[tx - 1];
+        float diff_prev_row = prev_row - plane_y->Row(ty - 1)[tx];
+        float diff_prev = prev - plane_y->Row(ty)[tx - 1];
+        xmax = std::max(xmax, std::abs(diff_b - diff_prev));
+        ymax = std::max(ymax, std::abs(diff_b - diff_prev_row));
+        if (exposed_b >= 0) {
+          exposed_b *= fabs(cur_b - prev) + fabs(cur_b - prev_row);
+          eb = std::max(eb, exposed_b);
+        }
+      }
+    }
+    exposed_blue = eb;
+    db = std::max(xmax, ymax);
+  }
+  void Calc(const Image3F* JXL_RESTRICT opsin) {
+    dx = CalcPlane(&opsin->Plane(0));
+    CalcExposedBlue(&opsin->Plane(1), &opsin->Plane(2));
+  }
+  int HowMuchIsXChannelPixelized() {
+    if (dx >= 0.03) {
+      return 2;
+    }
+    if (dx >= 0.017) {
+      return 1;
+    }
+    return 0;
+  }
+  int HowMuchIsBChannelPixelized() {
+    int add = exposed_blue >= 0.13 ? 1 : 0;
+    if (db > 0.38) {
+      return 2 + add;
+    }
+    if (db > 0.33) {
+      return 1 + add;
+    }
+    if (db > 0.28) {
+      return add;
+    }
+    return 0;
+  }
+};
+
+}  // namespace
+
+class LossyFrameEncoder {
+ public:
+  LossyFrameEncoder(const CompressParams& cparams,
+                    const FrameHeader& frame_header,
+                    PassesEncoderState* JXL_RESTRICT enc_state,
+                    const JxlCmsInterface& cms, ThreadPool* pool,
+                    AuxOut* aux_out)
+      : enc_state_(enc_state), cms_(cms), pool_(pool), aux_out_(aux_out) {
+    JXL_CHECK(InitializePassesSharedState(frame_header, &enc_state_->shared,
+                                          /*encoder=*/true));
+    enc_state_->cparams = cparams;
+    enc_state_->passes.clear();
+  }
+
+  Status ComputeEncodingData(const ImageBundle* linear,
+                             Image3F* JXL_RESTRICT opsin,
+                             const JxlCmsInterface& cms, ThreadPool* pool,
+                             ModularFrameEncoder* modular_frame_encoder,
+                             FrameHeader* frame_header) {
+    PROFILER_ZONE("ComputeEncodingData uninstrumented");
+    JXL_ASSERT((opsin->xsize() % kBlockDim) == 0 &&
+               (opsin->ysize() % kBlockDim) == 0);
+    PassesSharedState& shared = enc_state_->shared;
+
+    if (!enc_state_->cparams.max_error_mode) {
+      // Compute chromacity adjustments using two approaches.
+      // 1) Distance based approach for chromacity adjustment:
+      float x_qm_scale_steps[4] = {1.25f, 7.0f, 15.0f, 24.0f};
+      shared.frame_header.x_qm_scale = 2;
+      for (float x_qm_scale_step : x_qm_scale_steps) {
+        if (enc_state_->cparams.original_butteraugli_distance >
+            x_qm_scale_step) {
+          shared.frame_header.x_qm_scale++;
+        }
+      }
+      if (enc_state_->cparams.butteraugli_distance < 0.299f) {
+        // Favor chromacity preservation for making images appear more
+        // faithful to original even with extreme (5-10x) zooming.
+        shared.frame_header.x_qm_scale++;
+      }
+      // 2) Pixel-based approach for chromacity adjustment:
+      // look at the individual pixels and make a guess how difficult
+      // the image would be based on the worst case pixel.
+      PixelStatsForChromacityAdjustment pixel_stats;
+      if (enc_state_->cparams.speed_tier <= SpeedTier::kWombat) {
+        pixel_stats.Calc(opsin);
+      }
+      // For X take the most severe adjustment.
+      shared.frame_header.x_qm_scale =
+          std::max<int>(shared.frame_header.x_qm_scale,
+                        2 + pixel_stats.HowMuchIsXChannelPixelized());
+      // B only ajudsted by pixel-based approach.
+      shared.frame_header.b_qm_scale =
+          2 + pixel_stats.HowMuchIsBChannelPixelized();
+    }
+
+    JXL_RETURN_IF_ERROR(enc_state_->heuristics->LossyFrameHeuristics(
+        enc_state_, modular_frame_encoder, linear, opsin, cms_, pool_,
+        aux_out_));
+
+    JXL_RETURN_IF_ERROR(InitializePassesEncoder(
+        *opsin, cms, pool_, enc_state_, modular_frame_encoder, aux_out_));
+
+    enc_state_->passes.resize(enc_state_->progressive_splitter.GetNumPasses());
+    for (PassesEncoderState::PassData& pass : enc_state_->passes) {
+      pass.ac_tokens.resize(shared.frame_dim.num_groups);
+    }
+
+    ComputeAllCoeffOrders(shared.frame_dim);
+    shared.num_histograms = 1;
+
+    const auto tokenize_group_init = [&](const size_t num_threads) {
+      group_caches_.resize(num_threads);
+      return true;
+    };
+    const auto tokenize_group = [&](const uint32_t group_index,
+                                    const size_t thread) {
+      // Tokenize coefficients.
+      const Rect rect = shared.BlockGroupRect(group_index);
+      for (size_t idx_pass = 0; idx_pass < enc_state_->passes.size();
+           idx_pass++) {
+        JXL_ASSERT(enc_state_->coeffs[idx_pass]->Type() == ACType::k32);
+        const int32_t* JXL_RESTRICT ac_rows[3] = {
+            enc_state_->coeffs[idx_pass]->PlaneRow(0, group_index, 0).ptr32,
+            enc_state_->coeffs[idx_pass]->PlaneRow(1, group_index, 0).ptr32,
+            enc_state_->coeffs[idx_pass]->PlaneRow(2, group_index, 0).ptr32,
+        };
+        // Ensure group cache is initialized.
+        group_caches_[thread].InitOnce();
+        TokenizeCoefficients(
+            &shared.coeff_orders[idx_pass * shared.coeff_order_size], rect,
+            ac_rows, shared.ac_strategy, frame_header->chroma_subsampling,
+            &group_caches_[thread].num_nzeroes,
+            &enc_state_->passes[idx_pass].ac_tokens[group_index],
+            enc_state_->shared.quant_dc, enc_state_->shared.raw_quant_field,
+            enc_state_->shared.block_ctx_map);
+      }
+    };
+    JXL_RETURN_IF_ERROR(RunOnPool(pool_, 0, shared.frame_dim.num_groups,
+                                  tokenize_group_init, tokenize_group,
+                                  "TokenizeGroup"));
+
+    *frame_header = shared.frame_header;
+    return true;
+  }
+
+  Status ComputeJPEGTranscodingData(const jpeg::JPEGData& jpeg_data,
+                                    ModularFrameEncoder* modular_frame_encoder,
+                                    FrameHeader* frame_header) {
+    PROFILER_ZONE("ComputeJPEGTranscodingData uninstrumented");
+    PassesSharedState& shared = enc_state_->shared;
+
+    frame_header->x_qm_scale = 2;
+    frame_header->b_qm_scale = 2;
+
+    FrameDimensions frame_dim = frame_header->ToFrameDimensions();
+
+    const size_t xsize = frame_dim.xsize_padded;
+    const size_t ysize = frame_dim.ysize_padded;
+    const size_t xsize_blocks = frame_dim.xsize_blocks;
+    const size_t ysize_blocks = frame_dim.ysize_blocks;
+
+    // no-op chroma from luma
+    shared.cmap = ColorCorrelationMap(xsize, ysize, false);
+    shared.ac_strategy.FillDCT8();
+    FillImage(uint8_t(0), &shared.epf_sharpness);
+
+    enc_state_->passes.resize(enc_state_->progressive_splitter.GetNumPasses());
+    for (PassesEncoderState::PassData& pass : enc_state_->passes) {
+      pass.ac_tokens.resize(shared.frame_dim.num_groups);
+    }
+
+    enc_state_->coeffs.clear();
+    while (enc_state_->coeffs.size() < enc_state_->passes.size()) {
+      enc_state_->coeffs.emplace_back(make_unique<ACImageT<int32_t>>(
+          kGroupDim * kGroupDim, frame_dim.num_groups));
+    }
+
+    // convert JPEG quantization table to a Quantizer object
+    float dcquantization[3];
+    std::vector<QuantEncoding> qe(DequantMatrices::kNum,
+                                  QuantEncoding::Library(0));
+
+    auto jpeg_c_map = JpegOrder(frame_header->color_transform,
+                                jpeg_data.components.size() == 1);
+
+    std::vector<int> qt(192);
+    for (size_t c = 0; c < 3; c++) {
+      size_t jpeg_c = jpeg_c_map[c];
+      const int32_t* quant =
+          jpeg_data.quant[jpeg_data.components[jpeg_c].quant_idx].values.data();
+
+      dcquantization[c] = 255 * 8.0f / quant[0];
+      for (size_t y = 0; y < 8; y++) {
+        for (size_t x = 0; x < 8; x++) {
+          // JPEG XL transposes the DCT, JPEG doesn't.
+          qt[c * 64 + 8 * x + y] = quant[8 * y + x];
+        }
+      }
+    }
+    DequantMatricesSetCustomDC(&shared.matrices, dcquantization);
+    float dcquantization_r[3] = {1.0f / dcquantization[0],
+                                 1.0f / dcquantization[1],
+                                 1.0f / dcquantization[2]};
+
+    qe[AcStrategy::Type::DCT] = QuantEncoding::RAW(qt);
+    DequantMatricesSetCustom(&shared.matrices, qe, modular_frame_encoder);
+
+    // Ensure that InvGlobalScale() is 1.
+    shared.quantizer = Quantizer(&shared.matrices, 1, kGlobalScaleDenom);
+    // Recompute MulDC() and InvMulDC().
+    shared.quantizer.RecomputeFromGlobalScale();
+
+    // Per-block dequant scaling should be 1.
+    FillImage(static_cast<int32_t>(shared.quantizer.InvGlobalScale()),
+              &shared.raw_quant_field);
+
+    std::vector<int32_t> scaled_qtable(192);
+    for (size_t c = 0; c < 3; c++) {
+      for (size_t i = 0; i < 64; i++) {
+        scaled_qtable[64 * c + i] =
+            (1 << kCFLFixedPointPrecision) * qt[64 + i] / qt[64 * c + i];
+      }
+    }
+
+    auto jpeg_row = [&](size_t c, size_t y) {
+      return jpeg_data.components[jpeg_c_map[c]].coeffs.data() +
+             jpeg_data.components[jpeg_c_map[c]].width_in_blocks *
+                 kDCTBlockSize * y;
+    };
+
+    Image3F dc = Image3F(xsize_blocks, ysize_blocks);
+    bool DCzero =
+        (shared.frame_header.color_transform == ColorTransform::kYCbCr);
+    // Compute chroma-from-luma for AC (doesn't seem to be useful for DC)
+    if (frame_header->chroma_subsampling.Is444() &&
+        enc_state_->cparams.force_cfl_jpeg_recompression &&
+        jpeg_data.components.size() == 3) {
+      for (size_t c : {0, 2}) {
+        ImageSB* map = (c == 0 ? &shared.cmap.ytox_map : &shared.cmap.ytob_map);
+        const float kScale = kDefaultColorFactor;
+        const int kOffset = 127;
+        const float kBase =
+            c == 0 ? shared.cmap.YtoXRatio(0) : shared.cmap.YtoBRatio(0);
+        const float kZeroThresh =
+            kScale * kZeroBiasDefault[c] *
+            0.9999f;  // just epsilon less for better rounding
+
+        auto process_row = [&](const uint32_t task, const size_t thread) {
+          size_t ty = task;
+          int8_t* JXL_RESTRICT row_out = map->Row(ty);
+          for (size_t tx = 0; tx < map->xsize(); ++tx) {
+            const size_t y0 = ty * kColorTileDimInBlocks;
+            const size_t x0 = tx * kColorTileDimInBlocks;
+            const size_t y1 = std::min(frame_dim.ysize_blocks,
+                                       (ty + 1) * kColorTileDimInBlocks);
+            const size_t x1 = std::min(frame_dim.xsize_blocks,
+                                       (tx + 1) * kColorTileDimInBlocks);
+            int32_t d_num_zeros[257] = {0};
+            // TODO(veluca): this needs SIMD + fixed point adaptation, and/or
+            // conversion to the new CfL algorithm.
+            for (size_t y = y0; y < y1; ++y) {
+              const int16_t* JXL_RESTRICT row_m = jpeg_row(1, y);
+              const int16_t* JXL_RESTRICT row_s = jpeg_row(c, y);
+              for (size_t x = x0; x < x1; ++x) {
+                for (size_t coeffpos = 1; coeffpos < kDCTBlockSize;
+                     coeffpos++) {
+                  const float scaled_m =
+                      row_m[x * kDCTBlockSize + coeffpos] *
+                      scaled_qtable[64 * c + coeffpos] *
+                      (1.0f / (1 << kCFLFixedPointPrecision));
+                  const float scaled_s =
+                      kScale * row_s[x * kDCTBlockSize + coeffpos] +
+                      (kOffset - kBase * kScale) * scaled_m;
+                  if (std::abs(scaled_m) > 1e-8f) {
+                    float from, to;
+                    if (scaled_m > 0) {
+                      from = (scaled_s - kZeroThresh) / scaled_m;
+                      to = (scaled_s + kZeroThresh) / scaled_m;
+                    } else {
+                      from = (scaled_s + kZeroThresh) / scaled_m;
+                      to = (scaled_s - kZeroThresh) / scaled_m;
+                    }
+                    if (from < 0.0f) {
+                      from = 0.0f;
+                    }
+                    if (to > 255.0f) {
+                      to = 255.0f;
+                    }
+                    // Instead of clamping the both values
+                    // we just check that range is sane.
+                    if (from <= to) {
+                      d_num_zeros[static_cast<int>(std::ceil(from))]++;
+                      d_num_zeros[static_cast<int>(std::floor(to + 1))]--;
+                    }
+                  }
+                }
+              }
+            }
+            int best = 0;
+            int32_t best_sum = 0;
+            FindIndexOfSumMaximum(d_num_zeros, 256, &best, &best_sum);
+            int32_t offset_sum = 0;
+            for (int i = 0; i < 256; ++i) {
+              if (i <= kOffset) {
+                offset_sum += d_num_zeros[i];
+              }
+            }
+            row_out[tx] = 0;
+            if (best_sum > offset_sum + 1) {
+              row_out[tx] = best - kOffset;
+            }
+          }
+        };
+
+        JXL_RETURN_IF_ERROR(RunOnPool(pool_, 0, map->ysize(),
+                                      ThreadPool::NoInit, process_row,
+                                      "FindCorrelation"));
+      }
+    }
+
+    if (!frame_header->chroma_subsampling.Is444()) {
+      ZeroFillImage(&dc);
+      for (auto& coeff : enc_state_->coeffs) {
+        coeff->ZeroFill();
+      }
+    }
+    // JPEG DC is from -1024 to 1023.
+    std::vector<size_t> dc_counts[3] = {};
+    dc_counts[0].resize(2048);
+    dc_counts[1].resize(2048);
+    dc_counts[2].resize(2048);
+    size_t total_dc[3] = {};
+    for (size_t c : {1, 0, 2}) {
+      if (jpeg_data.components.size() == 1 && c != 1) {
+        for (auto& coeff : enc_state_->coeffs) {
+          coeff->ZeroFillPlane(c);
+        }
+        ZeroFillImage(&dc.Plane(c));
+        // Ensure no division by 0.
+        dc_counts[c][1024] = 1;
+        total_dc[c] = 1;
+        continue;
+      }
+      size_t hshift = frame_header->chroma_subsampling.HShift(c);
+      size_t vshift = frame_header->chroma_subsampling.VShift(c);
+      ImageSB& map = (c == 0 ? shared.cmap.ytox_map : shared.cmap.ytob_map);
+      for (size_t group_index = 0; group_index < frame_dim.num_groups;
+           group_index++) {
+        const size_t gx = group_index % frame_dim.xsize_groups;
+        const size_t gy = group_index / frame_dim.xsize_groups;
+        int32_t* coeffs[kMaxNumPasses];
+        for (size_t i = 0; i < enc_state_->coeffs.size(); i++) {
+          coeffs[i] = enc_state_->coeffs[i]->PlaneRow(c, group_index, 0).ptr32;
+        }
+        int32_t block[64];
+        for (size_t by = gy * kGroupDimInBlocks;
+             by < ysize_blocks && by < (gy + 1) * kGroupDimInBlocks; ++by) {
+          if ((by >> vshift) << vshift != by) continue;
+          const int16_t* JXL_RESTRICT inputjpeg = jpeg_row(c, by >> vshift);
+          const int16_t* JXL_RESTRICT inputjpegY = jpeg_row(1, by);
+          float* JXL_RESTRICT fdc = dc.PlaneRow(c, by >> vshift);
+          const int8_t* JXL_RESTRICT cm =
+              map.ConstRow(by / kColorTileDimInBlocks);
+          for (size_t bx = gx * kGroupDimInBlocks;
+               bx < xsize_blocks && bx < (gx + 1) * kGroupDimInBlocks; ++bx) {
+            if ((bx >> hshift) << hshift != bx) continue;
+            size_t base = (bx >> hshift) * kDCTBlockSize;
+            int idc;
+            if (DCzero) {
+              idc = inputjpeg[base];
+            } else {
+              idc = inputjpeg[base] + 1024 / qt[c * 64];
+            }
+            dc_counts[c][std::min(static_cast<uint32_t>(idc + 1024),
+                                  uint32_t(2047))]++;
+            total_dc[c]++;
+            fdc[bx >> hshift] = idc * dcquantization_r[c];
+            if (c == 1 || !enc_state_->cparams.force_cfl_jpeg_recompression ||
+                !frame_header->chroma_subsampling.Is444()) {
+              for (size_t y = 0; y < 8; y++) {
+                for (size_t x = 0; x < 8; x++) {
+                  block[y * 8 + x] = inputjpeg[base + x * 8 + y];
+                }
+              }
+            } else {
+              const int32_t scale =
+                  shared.cmap.RatioJPEG(cm[bx / kColorTileDimInBlocks]);
+
+              for (size_t y = 0; y < 8; y++) {
+                for (size_t x = 0; x < 8; x++) {
+                  int Y = inputjpegY[kDCTBlockSize * bx + x * 8 + y];
+                  int QChroma = inputjpeg[kDCTBlockSize * bx + x * 8 + y];
+                  // Fixed-point multiply of CfL scale with quant table ratio
+                  // first, and Y value second.
+                  int coeff_scale = (scale * scaled_qtable[64 * c + y * 8 + x] +
+                                     (1 << (kCFLFixedPointPrecision - 1))) >>
+                                    kCFLFixedPointPrecision;
+                  int cfl_factor = (Y * coeff_scale +
+                                    (1 << (kCFLFixedPointPrecision - 1))) >>
+                                   kCFLFixedPointPrecision;
+                  int QCR = QChroma - cfl_factor;
+                  block[y * 8 + x] = QCR;
+                }
+              }
+            }
+            enc_state_->progressive_splitter.SplitACCoefficients(
+                block, AcStrategy::FromRawStrategy(AcStrategy::Type::DCT), bx,
+                by, coeffs);
+            for (size_t i = 0; i < enc_state_->coeffs.size(); i++) {
+              coeffs[i] += kDCTBlockSize;
+            }
+          }
+        }
+      }
+    }
+
+    auto& dct = enc_state_->shared.block_ctx_map.dc_thresholds;
+    auto& num_dc_ctxs = enc_state_->shared.block_ctx_map.num_dc_ctxs;
+    num_dc_ctxs = 1;
+    for (size_t i = 0; i < 3; i++) {
+      dct[i].clear();
+      int num_thresholds = (CeilLog2Nonzero(total_dc[i]) - 12) / 2;
+      // up to 3 buckets per channel:
+      // dark/medium/bright, yellow/unsat/blue, green/unsat/red
+      num_thresholds = std::min(std::max(num_thresholds, 0), 2);
+      size_t cumsum = 0;
+      size_t cut = total_dc[i] / (num_thresholds + 1);
+      for (int j = 0; j < 2048; j++) {
+        cumsum += dc_counts[i][j];
+        if (cumsum > cut) {
+          dct[i].push_back(j - 1025);
+          cut = total_dc[i] * (dct[i].size() + 1) / (num_thresholds + 1);
+        }
+      }
+      num_dc_ctxs *= dct[i].size() + 1;
+    }
+
+    auto& ctx_map = enc_state_->shared.block_ctx_map.ctx_map;
+    ctx_map.clear();
+    ctx_map.resize(3 * kNumOrders * num_dc_ctxs, 0);
+
+    int lbuckets = (dct[1].size() + 1);
+    for (size_t i = 0; i < num_dc_ctxs; i++) {
+      // up to 9 contexts for luma
+      ctx_map[i] = i / lbuckets;
+      // up to 3 contexts for chroma
+      ctx_map[kNumOrders * num_dc_ctxs + i] =
+          ctx_map[2 * kNumOrders * num_dc_ctxs + i] =
+              num_dc_ctxs / lbuckets + (i % lbuckets);
+    }
+    enc_state_->shared.block_ctx_map.num_ctxs =
+        *std::max_element(ctx_map.begin(), ctx_map.end()) + 1;
+
+    enc_state_->histogram_idx.resize(shared.frame_dim.num_groups);
+
+    // disable DC frame for now
+    shared.frame_header.UpdateFlag(false, FrameHeader::kUseDcFrame);
+    auto compute_dc_coeffs = [&](const uint32_t group_index,
+                                 size_t /* thread */) {
+      modular_frame_encoder->AddVarDCTDC(dc, group_index, /*nl_dc=*/false,
+                                         enc_state_, /*jpeg_transcode=*/true);
+      modular_frame_encoder->AddACMetadata(group_index, /*jpeg_transcode=*/true,
+                                           enc_state_);
+    };
+    JXL_RETURN_IF_ERROR(RunOnPool(pool_, 0, shared.frame_dim.num_dc_groups,
+                                  ThreadPool::NoInit, compute_dc_coeffs,
+                                  "Compute DC coeffs"));
+
+    // Must happen before WriteFrameHeader!
+    shared.frame_header.UpdateFlag(true, FrameHeader::kSkipAdaptiveDCSmoothing);
+
+    ComputeAllCoeffOrders(frame_dim);
+    shared.num_histograms = 1;
+
+    const auto tokenize_group_init = [&](const size_t num_threads) {
+      group_caches_.resize(num_threads);
+      return true;
+    };
+    const auto tokenize_group = [&](const uint32_t group_index,
+                                    const size_t thread) {
+      // Tokenize coefficients.
+      const Rect rect = shared.BlockGroupRect(group_index);
+      for (size_t idx_pass = 0; idx_pass < enc_state_->passes.size();
+           idx_pass++) {
+        JXL_ASSERT(enc_state_->coeffs[idx_pass]->Type() == ACType::k32);
+        const int32_t* JXL_RESTRICT ac_rows[3] = {
+            enc_state_->coeffs[idx_pass]->PlaneRow(0, group_index, 0).ptr32,
+            enc_state_->coeffs[idx_pass]->PlaneRow(1, group_index, 0).ptr32,
+            enc_state_->coeffs[idx_pass]->PlaneRow(2, group_index, 0).ptr32,
+        };
+        // Ensure group cache is initialized.
+        group_caches_[thread].InitOnce();
+        TokenizeCoefficients(
+            &shared.coeff_orders[idx_pass * shared.coeff_order_size], rect,
+            ac_rows, shared.ac_strategy, frame_header->chroma_subsampling,
+            &group_caches_[thread].num_nzeroes,
+            &enc_state_->passes[idx_pass].ac_tokens[group_index],
+            enc_state_->shared.quant_dc, enc_state_->shared.raw_quant_field,
+            enc_state_->shared.block_ctx_map);
+      }
+    };
+    JXL_RETURN_IF_ERROR(RunOnPool(pool_, 0, shared.frame_dim.num_groups,
+                                  tokenize_group_init, tokenize_group,
+                                  "TokenizeGroup"));
+    *frame_header = shared.frame_header;
+    doing_jpeg_recompression = true;
+    return true;
+  }
+
+  Status EncodeGlobalDCInfo(const FrameHeader& frame_header,
+                            BitWriter* writer) const {
+    // Encode quantizer DC and global scale.
+    QuantizerParams params = enc_state_->shared.quantizer.GetParams();
+    JXL_RETURN_IF_ERROR(
+        WriteQuantizerParams(params, writer, kLayerQuant, aux_out_));
+    EncodeBlockCtxMap(enc_state_->shared.block_ctx_map, writer, aux_out_);
+    ColorCorrelationMapEncodeDC(&enc_state_->shared.cmap, writer, kLayerDC,
+                                aux_out_);
+    return true;
+  }
+
+  Status EncodeGlobalACInfo(BitWriter* writer,
+                            ModularFrameEncoder* modular_frame_encoder) {
+    JXL_RETURN_IF_ERROR(DequantMatricesEncode(&enc_state_->shared.matrices,
+                                              writer, kLayerQuant, aux_out_,
+                                              modular_frame_encoder));
+    if (enc_state_->cparams.speed_tier <= SpeedTier::kTortoise) {
+      if (!doing_jpeg_recompression) ClusterGroups(enc_state_);
+    }
+    size_t num_histo_bits =
+        CeilLog2Nonzero(enc_state_->shared.frame_dim.num_groups);
+    if (num_histo_bits != 0) {
+      BitWriter::Allotment allotment(writer, num_histo_bits);
+      writer->Write(num_histo_bits, enc_state_->shared.num_histograms - 1);
+      allotment.ReclaimAndCharge(writer, kLayerAC, aux_out_);
+    }
+
+    for (size_t i = 0; i < enc_state_->progressive_splitter.GetNumPasses();
+         i++) {
+      // Encode coefficient orders.
+      size_t order_bits = 0;
+      JXL_RETURN_IF_ERROR(U32Coder::CanEncode(
+          kOrderEnc, enc_state_->used_orders[i], &order_bits));
+      BitWriter::Allotment allotment(writer, order_bits);
+      JXL_CHECK(U32Coder::Write(kOrderEnc, enc_state_->used_orders[i], writer));
+      allotment.ReclaimAndCharge(writer, kLayerOrder, aux_out_);
+      EncodeCoeffOrders(
+          enc_state_->used_orders[i],
+          &enc_state_->shared
+               .coeff_orders[i * enc_state_->shared.coeff_order_size],
+          writer, kLayerOrder, aux_out_);
+
+      // Encode histograms.
+      HistogramParams hist_params(
+          enc_state_->cparams.speed_tier,
+          enc_state_->shared.block_ctx_map.NumACContexts());
+      if (enc_state_->cparams.speed_tier > SpeedTier::kTortoise) {
+        hist_params.lz77_method = HistogramParams::LZ77Method::kNone;
+      }
+      if (enc_state_->cparams.decoding_speed_tier >= 1) {
+        hist_params.max_histograms = 6;
+      }
+      BuildAndEncodeHistograms(
+          hist_params,
+          enc_state_->shared.num_histograms *
+              enc_state_->shared.block_ctx_map.NumACContexts(),
+          enc_state_->passes[i].ac_tokens, &enc_state_->passes[i].codes,
+          &enc_state_->passes[i].context_map, writer, kLayerAC, aux_out_);
+    }
+
+    return true;
+  }
+
+  Status EncodeACGroup(size_t pass, size_t group_index, BitWriter* group_code,
+                       AuxOut* local_aux_out) {
+    return EncodeGroupTokenizedCoefficients(
+        group_index, pass, enc_state_->histogram_idx[group_index], *enc_state_,
+        group_code, local_aux_out);
+  }
+
+  PassesEncoderState* State() { return enc_state_; }
+
+ private:
+  void ComputeAllCoeffOrders(const FrameDimensions& frame_dim) {
+    PROFILER_FUNC;
+    // No coefficient reordering in Falcon or faster.
+    auto used_orders_info = ComputeUsedOrders(
+        enc_state_->cparams.speed_tier, enc_state_->shared.ac_strategy,
+        Rect(enc_state_->shared.raw_quant_field));
+    enc_state_->used_orders.clear();
+    enc_state_->used_orders.resize(
+        enc_state_->progressive_splitter.GetNumPasses(),
+        used_orders_info.second);
+    for (size_t i = 0; i < enc_state_->progressive_splitter.GetNumPasses();
+         i++) {
+      ComputeCoeffOrder(
+          enc_state_->cparams.speed_tier, *enc_state_->coeffs[i],
+          enc_state_->shared.ac_strategy, frame_dim, enc_state_->used_orders[i],
+          used_orders_info.first,
+          &enc_state_->shared
+               .coeff_orders[i * enc_state_->shared.coeff_order_size]);
+    }
+  }
+
+  template <typename V, typename R>
+  static inline void FindIndexOfSumMaximum(const V* array, const size_t len,
+                                           R* idx, V* sum) {
+    JXL_ASSERT(len > 0);
+    V maxval = 0;
+    V val = 0;
+    R maxidx = 0;
+    for (size_t i = 0; i < len; ++i) {
+      val += array[i];
+      if (val > maxval) {
+        maxval = val;
+        maxidx = i;
+      }
+    }
+    *idx = maxidx;
+    *sum = maxval;
+  }
+
+  PassesEncoderState* JXL_RESTRICT enc_state_;
+  JxlCmsInterface cms_;
+  ThreadPool* pool_;
+  AuxOut* aux_out_;
+  std::vector<EncCache> group_caches_;
+  bool doing_jpeg_recompression = false;
+};
+
+Status ParamsPostInit(CompressParams* p) {
+  if (!p->manual_noise.empty() &&
+      p->manual_noise.size() != NoiseParams::kNumNoisePoints) {
+    return JXL_FAILURE("Invalid number of noise lut entries");
+  }
+  if (!p->manual_xyb_factors.empty() && p->manual_xyb_factors.size() != 3) {
+    return JXL_FAILURE("Invalid number of XYB quantization factors");
+  }
+  if (!p->modular_mode && p->butteraugli_distance == 0.0) {
+    p->butteraugli_distance = kMinButteraugliDistance;
+  }
+  if (p->original_butteraugli_distance == -1.0) {
+    p->original_butteraugli_distance = p->butteraugli_distance;
+  }
+  if (p->resampling <= 0) {
+    p->resampling = 1;
+    // For very low bit rates, using 2x2 resampling gives better results on
+    // most photographic images, with an adjusted butteraugli score chosen to
+    // give roughly the same amount of bits per pixel.
+    if (!p->already_downsampled && p->butteraugli_distance >= 20) {
+      p->resampling = 2;
+      p->butteraugli_distance = 6 + ((p->butteraugli_distance - 20) * 0.25);
+    }
+  }
+  if (p->ec_resampling <= 0) {
+    p->ec_resampling = p->resampling;
+  }
+  return true;
+}
+
+Status EncodeFrame(const CompressParams& cparams_orig,
+                   const FrameInfo& frame_info, const CodecMetadata* metadata,
+                   const ImageBundle& ib, PassesEncoderState* passes_enc_state,
+                   const JxlCmsInterface& cms, ThreadPool* pool,
+                   BitWriter* writer, AuxOut* aux_out) {
+  CompressParams cparams = cparams_orig;
+  if (cparams.speed_tier == SpeedTier::kGlacier && !cparams.IsLossless()) {
+    cparams.speed_tier = SpeedTier::kTortoise;
+  }
+  if (cparams.speed_tier == SpeedTier::kGlacier) {
+    std::vector<CompressParams> all_params;
+    std::vector<size_t> size;
+
+    CompressParams cparams_attempt = cparams_orig;
+    cparams_attempt.speed_tier = SpeedTier::kTortoise;
+    cparams_attempt.options.max_properties = 4;
+
+    for (float x : {0.0f, 80.f}) {
+      cparams_attempt.channel_colors_percent = x;
+      for (float y : {0.0f, 95.0f}) {
+        cparams_attempt.channel_colors_pre_transform_percent = y;
+        // 70000 ensures that the number of palette colors is representable in
+        // modular headers.
+        for (int K : {0, 1 << 10, 70000}) {
+          cparams_attempt.palette_colors = K;
+          for (int tree_mode : {-1, (int)ModularOptions::TreeMode::kNoWP,
+                                (int)ModularOptions::TreeMode::kDefault}) {
+            if (tree_mode == -1) {
+              // LZ77 only
+              cparams_attempt.options.nb_repeats = 0;
+            } else {
+              cparams_attempt.options.nb_repeats = 1;
+              cparams_attempt.options.wp_tree_mode =
+                  static_cast<ModularOptions::TreeMode>(tree_mode);
+            }
+            for (Predictor pred : {Predictor::Zero, Predictor::Variable}) {
+              cparams_attempt.options.predictor = pred;
+              for (int g : {0, 1, 3}) {
+                cparams_attempt.modular_group_size_shift = g;
+                for (Override patches : {Override::kDefault, Override::kOff}) {
+                  cparams_attempt.patches = patches;
+                  all_params.push_back(cparams_attempt);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+
+    size.resize(all_params.size());
+
+    std::atomic<int> num_errors{0};
+
+    JXL_RETURN_IF_ERROR(RunOnPool(
+        pool, 0, all_params.size(), ThreadPool::NoInit,
+        [&](size_t task, size_t) {
+          BitWriter w;
+          PassesEncoderState state;
+          if (!EncodeFrame(all_params[task], frame_info, metadata, ib, &state,
+                           cms, nullptr, &w, aux_out)) {
+            num_errors.fetch_add(1, std::memory_order_relaxed);
+            return;
+          }
+          size[task] = w.BitsWritten();
+        },
+        "Compress kGlacier"));
+    JXL_RETURN_IF_ERROR(num_errors.load(std::memory_order_relaxed) == 0);
+
+    size_t best_idx = 0;
+    for (size_t i = 1; i < all_params.size(); i++) {
+      if (size[best_idx] > size[i]) {
+        best_idx = i;
+      }
+    }
+    cparams = all_params[best_idx];
+  }
+
+  if (cparams_orig.target_bitrate > 0.0f &&
+      frame_info.frame_type == FrameType::kRegularFrame) {
+    cparams.target_bitrate = 0.0f;
+    const float target_bitrate = cparams_orig.target_bitrate;
+    float bitrate = 0.0f;
+    float prev_bitrate = 0.0f;
+    float rescale = 1.0f;
+    size_t prev_bits = 0;
+    float error = 0.0f;
+    float best_error = 100.0f;
+    float best_rescale = 1.0f;
+    for (size_t i = 0; i < 10; ++i) {
+      std::unique_ptr<PassesEncoderState> state =
+          jxl::make_unique<PassesEncoderState>();
+      BitWriter bw;
+      JXL_CHECK(EncodeFrame(cparams, frame_info, metadata, ib, state.get(), cms,
+                            pool, &bw, nullptr));
+      bitrate = bw.BitsWritten() * 1.0 / (ib.xsize() * ib.ysize());
+      error = target_bitrate / bitrate - 1.0f;
+      if (std::abs(error) < std::abs(best_error)) {
+        best_error = error;
+        best_rescale = cparams.quant_ac_rescale;
+      }
+      if (bw.BitsWritten() == prev_bits || std::abs(error) < 0.0005f) {
+        break;
+      }
+      float lambda = 1.0f;
+      if (i > 0) {
+        lambda = (((bitrate / prev_bitrate) - 1.0f) / (rescale - 1.0f));
+      }
+      rescale = (1.0f + ((target_bitrate / bitrate) - 1.0f) / lambda);
+      if (rescale < 0.0f) {
+        break;
+      }
+      cparams.quant_ac_rescale *= rescale;
+      prev_bitrate = bitrate;
+      prev_bits = bw.BitsWritten();
+    }
+    if (aux_out) {
+      aux_out->max_quant_rescale = best_rescale;
+      aux_out->min_quant_rescale = best_rescale;
+      aux_out->min_bitrate_error = best_error;
+      aux_out->max_bitrate_error = best_error;
+    }
+    cparams.quant_ac_rescale = best_rescale;
+  }
+  ib.VerifyMetadata();
+
+  passes_enc_state->special_frames.clear();
+
+  if (cparams.qprogressive_mode) {
+    passes_enc_state->progressive_splitter.SetProgressiveMode(
+        ProgressiveMode{progressive_passes_dc_quant_ac_full_ac});
+  } else if (cparams.progressive_mode) {
+    passes_enc_state->progressive_splitter.SetProgressiveMode(
+        ProgressiveMode{progressive_passes_dc_vlf_lf_full_ac});
+  }
+
+  JXL_RETURN_IF_ERROR(ParamsPostInit(&cparams));
+
+  if (cparams.progressive_dc < 0) {
+    if (cparams.progressive_dc != -1) {
+      return JXL_FAILURE("Invalid progressive DC setting value (%d)",
+                         cparams.progressive_dc);
+    }
+    cparams.progressive_dc = 0;
+  }
+  if (cparams.ec_resampling < cparams.resampling) {
+    cparams.ec_resampling = cparams.resampling;
+  }
+  if (cparams.resampling > 1 || frame_info.is_preview) {
+    cparams.progressive_dc = 0;
+  }
+
+  if (frame_info.dc_level + cparams.progressive_dc > 4) {
+    return JXL_FAILURE("Too many levels of progressive DC");
+  }
+
+  if (cparams.butteraugli_distance != 0 &&
+      cparams.butteraugli_distance < kMinButteraugliDistance) {
+    return JXL_FAILURE("Butteraugli distance is too low (%f)",
+                       cparams.butteraugli_distance);
+  }
+
+  if (ib.IsJPEG()) {
+    cparams.gaborish = Override::kOff;
+    cparams.epf = 0;
+    cparams.modular_mode = false;
+  }
+
+  if (ib.xsize() == 0 || ib.ysize() == 0) return JXL_FAILURE("Empty image");
+
+  // Assert that this metadata is correctly set up for the compression params,
+  // this should have been done by enc_file.cc
+  JXL_ASSERT(metadata->m.xyb_encoded ==
+             (cparams.color_transform == ColorTransform::kXYB));
+  std::unique_ptr<FrameHeader> frame_header =
+      jxl::make_unique<FrameHeader>(metadata);
+  JXL_RETURN_IF_ERROR(MakeFrameHeader(cparams,
+                                      passes_enc_state->progressive_splitter,
+                                      frame_info, ib, frame_header.get()));
+  // Check that if the codestream header says xyb_encoded, the color_transform
+  // matches the requirement. This is checked from the cparams here, even though
+  // optimally we'd be able to check this against what has actually been written
+  // in the main codestream header, but since ib is a const object and the data
+  // written to the main codestream header is (in modified form) in ib, the
+  // encoder cannot indicate this fact in the ib's metadata.
+  if (cparams_orig.color_transform == ColorTransform::kXYB) {
+    if (frame_header->color_transform != ColorTransform::kXYB) {
+      return JXL_FAILURE(
+          "The color transform of frames must be xyb if the codestream is xyb "
+          "encoded");
+    }
+  } else {
+    if (frame_header->color_transform == ColorTransform::kXYB) {
+      return JXL_FAILURE(
+          "The color transform of frames cannot be xyb if the codestream is "
+          "not xyb encoded");
+    }
+  }
+
+  FrameDimensions frame_dim = frame_header->ToFrameDimensions();
+
+  const size_t num_groups = frame_dim.num_groups;
+
+  Image3F opsin;
+  const ColorEncoding& c_linear = ColorEncoding::LinearSRGB(ib.IsGray());
+  std::unique_ptr<ImageMetadata> metadata_linear =
+      jxl::make_unique<ImageMetadata>();
+  metadata_linear->xyb_encoded =
+      (cparams.color_transform == ColorTransform::kXYB);
+  metadata_linear->color_encoding = c_linear;
+  ImageBundle linear_storage(metadata_linear.get());
+
+  std::vector<AuxOut> aux_outs;
+  // LossyFrameEncoder stores a reference to a std::function<Status(size_t)>
+  // so we need to keep the std::function<Status(size_t)> being referenced
+  // alive while lossy_frame_encoder is used. We could make resize_aux_outs a
+  // lambda type by making LossyFrameEncoder a template instead, but this is
+  // simpler.
+  const std::function<Status(size_t)> resize_aux_outs =
+      [&aux_outs, aux_out](const size_t num_threads) -> Status {
+    if (aux_out != nullptr) {
+      size_t old_size = aux_outs.size();
+      for (size_t i = num_threads; i < old_size; i++) {
+        aux_out->Assimilate(aux_outs[i]);
+      }
+      aux_outs.resize(num_threads);
+      // Each thread needs these INPUTS. Don't copy the entire AuxOut
+      // because it may contain stats which would be Assimilated multiple
+      // times below.
+      for (size_t i = old_size; i < aux_outs.size(); i++) {
+        aux_outs[i].dump_image = aux_out->dump_image;
+        aux_outs[i].debug_prefix = aux_out->debug_prefix;
+      }
+    }
+    return true;
+  };
+
+  LossyFrameEncoder lossy_frame_encoder(cparams, *frame_header,
+                                        passes_enc_state, cms, pool, aux_out);
+  std::unique_ptr<ModularFrameEncoder> modular_frame_encoder =
+      jxl::make_unique<ModularFrameEncoder>(*frame_header, cparams);
+
+  const std::vector<ImageF>* extra_channels = &ib.extra_channels();
+  std::vector<ImageF> extra_channels_storage;
+  // Clear patches
+  passes_enc_state->shared.image_features.patches = PatchDictionary();
+  passes_enc_state->shared.image_features.patches.SetPassesSharedState(
+      &passes_enc_state->shared);
+
+  if (ib.IsJPEG()) {
+    JXL_RETURN_IF_ERROR(lossy_frame_encoder.ComputeJPEGTranscodingData(
+        *ib.jpeg_data, modular_frame_encoder.get(), frame_header.get()));
+  } else if (!lossy_frame_encoder.State()->heuristics->HandlesColorConversion(
+                 cparams, ib) ||
+             frame_header->encoding != FrameEncoding::kVarDCT) {
+    // Allocating a large enough image avoids a copy when padding.
+    opsin =
+        Image3F(RoundUpToBlockDim(ib.xsize()), RoundUpToBlockDim(ib.ysize()));
+    opsin.ShrinkTo(ib.xsize(), ib.ysize());
+
+    const bool want_linear = frame_header->encoding == FrameEncoding::kVarDCT &&
+                             cparams.speed_tier <= SpeedTier::kKitten;
+    const ImageBundle* JXL_RESTRICT ib_or_linear = &ib;
+
+    if (frame_header->color_transform == ColorTransform::kXYB &&
+        frame_info.ib_needs_color_transform) {
+      // linear_storage would only be used by the Butteraugli loop (passing
+      // linear sRGB avoids a color conversion there). Otherwise, don't
+      // fill it to reduce memory usage.
+      ib_or_linear =
+          ToXYB(ib, pool, &opsin, cms, want_linear ? &linear_storage : nullptr);
+    } else {  // RGB or YCbCr: don't do anything (forward YCbCr is not
+              // implemented, this is only used when the input is already in
+              // YCbCr)
+              // If encoding a special DC or reference frame, don't do anything:
+              // input is already in XYB.
+      CopyImageTo(ib.color(), &opsin);
+    }
+    bool lossless = cparams.IsLossless();
+    if (ib.HasAlpha() && !ib.AlphaIsPremultiplied() &&
+        frame_header->frame_type == FrameType::kRegularFrame &&
+        !ApplyOverride(cparams.keep_invisible, lossless) &&
+        cparams.ec_resampling == cparams.resampling) {
+      // simplify invisible pixels
+      SimplifyInvisible(&opsin, ib.alpha(), lossless);
+      if (want_linear) {
+        SimplifyInvisible(const_cast<Image3F*>(&ib_or_linear->color()),
+                          ib.alpha(), lossless);
+      }
+    }
+    if (aux_out != nullptr) {
+      JXL_RETURN_IF_ERROR(
+          aux_out->InspectImage3F("enc_frame:OpsinDynamicsImage", opsin));
+    }
+    if (frame_header->encoding == FrameEncoding::kVarDCT) {
+      PadImageToBlockMultipleInPlace(&opsin);
+      JXL_RETURN_IF_ERROR(lossy_frame_encoder.ComputeEncodingData(
+          ib_or_linear, &opsin, cms, pool, modular_frame_encoder.get(),
+          frame_header.get()));
+    } else if (frame_header->upsampling != 1 && !cparams.already_downsampled) {
+      // In VarDCT mode, LossyFrameHeuristics takes care of running downsampling
+      // after noise, if necessary.
+      DownsampleImage(&opsin, frame_header->upsampling);
+    }
+  } else {
+    JXL_RETURN_IF_ERROR(lossy_frame_encoder.ComputeEncodingData(
+        &ib, &opsin, cms, pool, modular_frame_encoder.get(),
+        frame_header.get()));
+  }
+  if (cparams.ec_resampling != 1 && !cparams.already_downsampled) {
+    extra_channels = &extra_channels_storage;
+    for (size_t i = 0; i < ib.extra_channels().size(); i++) {
+      extra_channels_storage.emplace_back(CopyImage(ib.extra_channels()[i]));
+      DownsampleImage(&extra_channels_storage.back(), cparams.ec_resampling);
+    }
+  }
+  // needs to happen *AFTER* VarDCT-ComputeEncodingData.
+  JXL_RETURN_IF_ERROR(modular_frame_encoder->ComputeEncodingData(
+      *frame_header, *ib.metadata(), &opsin, *extra_channels,
+      lossy_frame_encoder.State(), cms, pool, aux_out,
+      /* do_color=*/frame_header->encoding == FrameEncoding::kModular));
+
+  writer->AppendByteAligned(lossy_frame_encoder.State()->special_frames);
+  frame_header->UpdateFlag(
+      lossy_frame_encoder.State()->shared.image_features.patches.HasAny(),
+      FrameHeader::kPatches);
+  frame_header->UpdateFlag(
+      lossy_frame_encoder.State()->shared.image_features.splines.HasAny(),
+      FrameHeader::kSplines);
+  JXL_RETURN_IF_ERROR(WriteFrameHeader(*frame_header, writer, aux_out));
+
+  const size_t num_passes =
+      passes_enc_state->progressive_splitter.GetNumPasses();
+
+  // DC global info + DC groups + AC global info + AC groups *
+  // num_passes.
+  const bool has_ac_global = true;
+  std::vector<BitWriter> group_codes(NumTocEntries(frame_dim.num_groups,
+                                                   frame_dim.num_dc_groups,
+                                                   num_passes, has_ac_global));
+  const size_t global_ac_index = frame_dim.num_dc_groups + 1;
+  const bool is_small_image = frame_dim.num_groups == 1 && num_passes == 1;
+  const auto get_output = [&](const size_t index) {
+    return &group_codes[is_small_image ? 0 : index];
+  };
+  auto ac_group_code = [&](size_t pass, size_t group) {
+    return get_output(AcGroupIndex(pass, group, frame_dim.num_groups,
+                                   frame_dim.num_dc_groups, has_ac_global));
+  };
+
+  if (frame_header->flags & FrameHeader::kPatches) {
+    PatchDictionaryEncoder::Encode(
+        lossy_frame_encoder.State()->shared.image_features.patches,
+        get_output(0), kLayerDictionary, aux_out);
+  }
+
+  if (frame_header->flags & FrameHeader::kSplines) {
+    EncodeSplines(lossy_frame_encoder.State()->shared.image_features.splines,
+                  get_output(0), kLayerSplines, HistogramParams(), aux_out);
+  }
+
+  if (cparams.photon_noise_iso > 0) {
+    lossy_frame_encoder.State()->shared.image_features.noise_params =
+        SimulatePhotonNoise(ib.xsize(), ib.ysize(), cparams.photon_noise_iso);
+  }
+  if (cparams.manual_noise.size() == NoiseParams::kNumNoisePoints) {
+    for (size_t i = 0; i < NoiseParams::kNumNoisePoints; i++) {
+      lossy_frame_encoder.State()->shared.image_features.noise_params.lut[i] =
+          cparams.manual_noise[i];
+    }
+  }
+  if (frame_header->flags & FrameHeader::kNoise) {
+    EncodeNoise(lossy_frame_encoder.State()->shared.image_features.noise_params,
+                get_output(0), kLayerNoise, aux_out);
+  }
+
+  JXL_RETURN_IF_ERROR(
+      DequantMatricesEncodeDC(&lossy_frame_encoder.State()->shared.matrices,
+                              get_output(0), kLayerQuant, aux_out));
+  if (frame_header->encoding == FrameEncoding::kVarDCT) {
+    JXL_RETURN_IF_ERROR(
+        lossy_frame_encoder.EncodeGlobalDCInfo(*frame_header, get_output(0)));
+  }
+  JXL_RETURN_IF_ERROR(
+      modular_frame_encoder->EncodeGlobalInfo(get_output(0), aux_out));
+  JXL_RETURN_IF_ERROR(modular_frame_encoder->EncodeStream(
+      get_output(0), aux_out, kLayerModularGlobal, ModularStreamId::Global()));
+
+  const auto process_dc_group = [&](const uint32_t group_index,
+                                    const size_t thread) {
+    AuxOut* my_aux_out = aux_out ? &aux_outs[thread] : nullptr;
+    BitWriter* output = get_output(group_index + 1);
+    if (frame_header->encoding == FrameEncoding::kVarDCT &&
+        !(frame_header->flags & FrameHeader::kUseDcFrame)) {
+      BitWriter::Allotment allotment(output, 2);
+      output->Write(2, modular_frame_encoder->extra_dc_precision[group_index]);
+      allotment.ReclaimAndCharge(output, kLayerDC, my_aux_out);
+      JXL_CHECK(modular_frame_encoder->EncodeStream(
+          output, my_aux_out, kLayerDC,
+          ModularStreamId::VarDCTDC(group_index)));
+    }
+    JXL_CHECK(modular_frame_encoder->EncodeStream(
+        output, my_aux_out, kLayerModularDcGroup,
+        ModularStreamId::ModularDC(group_index)));
+    if (frame_header->encoding == FrameEncoding::kVarDCT) {
+      const Rect& rect =
+          lossy_frame_encoder.State()->shared.DCGroupRect(group_index);
+      size_t nb_bits = CeilLog2Nonzero(rect.xsize() * rect.ysize());
+      if (nb_bits != 0) {
+        BitWriter::Allotment allotment(output, nb_bits);
+        output->Write(nb_bits,
+                      modular_frame_encoder->ac_metadata_size[group_index] - 1);
+        allotment.ReclaimAndCharge(output, kLayerControlFields, my_aux_out);
+      }
+      JXL_CHECK(modular_frame_encoder->EncodeStream(
+          output, my_aux_out, kLayerControlFields,
+          ModularStreamId::ACMetadata(group_index)));
+    }
+  };
+  JXL_RETURN_IF_ERROR(RunOnPool(pool, 0, frame_dim.num_dc_groups,
+                                resize_aux_outs, process_dc_group,
+                                "EncodeDCGroup"));
+
+  if (frame_header->encoding == FrameEncoding::kVarDCT) {
+    JXL_RETURN_IF_ERROR(lossy_frame_encoder.EncodeGlobalACInfo(
+        get_output(global_ac_index), modular_frame_encoder.get()));
+  }
+
+  std::atomic<int> num_errors{0};
+  const auto process_group = [&](const uint32_t group_index,
+                                 const size_t thread) {
+    AuxOut* my_aux_out = aux_out ? &aux_outs[thread] : nullptr;
+
+    for (size_t i = 0; i < num_passes; i++) {
+      if (frame_header->encoding == FrameEncoding::kVarDCT) {
+        if (!lossy_frame_encoder.EncodeACGroup(
+                i, group_index, ac_group_code(i, group_index), my_aux_out)) {
+          num_errors.fetch_add(1, std::memory_order_relaxed);
+          return;
+        }
+      }
+      // Write all modular encoded data (color?, alpha, depth, extra channels)
+      if (!modular_frame_encoder->EncodeStream(
+              ac_group_code(i, group_index), my_aux_out, kLayerModularAcGroup,
+              ModularStreamId::ModularAC(group_index, i))) {
+        num_errors.fetch_add(1, std::memory_order_relaxed);
+        return;
+      }
+    }
+  };
+  JXL_RETURN_IF_ERROR(RunOnPool(pool, 0, num_groups, resize_aux_outs,
+                                process_group, "EncodeGroupCoefficients"));
+
+  // Resizing aux_outs to 0 also Assimilates the array.
+  static_cast<void>(resize_aux_outs(0));
+  JXL_RETURN_IF_ERROR(num_errors.load(std::memory_order_relaxed) == 0);
+
+  for (BitWriter& bw : group_codes) {
+    BitWriter::Allotment allotment(&bw, 8);
+    bw.ZeroPadToByte();  // end of group.
+    allotment.ReclaimAndCharge(&bw, kLayerAC, aux_out);
+  }
+
+  std::vector<coeff_order_t>* permutation_ptr = nullptr;
+  std::vector<coeff_order_t> permutation;
+  if (cparams.centerfirst && !(num_passes == 1 && num_groups == 1)) {
+    permutation_ptr = &permutation;
+    // Don't permute global DC/AC or DC.
+    permutation.resize(global_ac_index + 1);
+    std::iota(permutation.begin(), permutation.end(), 0);
+    std::vector<coeff_order_t> ac_group_order(num_groups);
+    std::iota(ac_group_order.begin(), ac_group_order.end(), 0);
+    size_t group_dim = frame_dim.group_dim;
+
+    // The center of the image is either given by parameters or chosen
+    // to be the middle of the image by default if center_x, center_y resp.
+    // are not provided.
+
+    int64_t imag_cx;
+    if (cparams.center_x != static_cast<size_t>(-1)) {
+      JXL_RETURN_IF_ERROR(cparams.center_x < ib.xsize());
+      imag_cx = cparams.center_x;
+    } else {
+      imag_cx = ib.xsize() / 2;
+    }
+
+    int64_t imag_cy;
+    if (cparams.center_y != static_cast<size_t>(-1)) {
+      JXL_RETURN_IF_ERROR(cparams.center_y < ib.ysize());
+      imag_cy = cparams.center_y;
+    } else {
+      imag_cy = ib.ysize() / 2;
+    }
+
+    // The center of the group containing the center of the image.
+    int64_t cx = (imag_cx / group_dim) * group_dim + group_dim / 2;
+    int64_t cy = (imag_cy / group_dim) * group_dim + group_dim / 2;
+    // This identifies in what area of the central group the center of the image
+    // lies in.
+    double direction = -std::atan2(imag_cy - cy, imag_cx - cx);
+    // This identifies the side of the central group the center of the image
+    // lies closest to. This can take values 0, 1, 2, 3 corresponding to left,
+    // bottom, right, top.
+    int64_t side = std::fmod((direction + 5 * kPi / 4), 2 * kPi) * 2 / kPi;
+    auto get_distance_from_center = [&](size_t gid) {
+      Rect r = passes_enc_state->shared.GroupRect(gid);
+      int64_t gcx = r.x0() + group_dim / 2;
+      int64_t gcy = r.y0() + group_dim / 2;
+      int64_t dx = gcx - cx;
+      int64_t dy = gcy - cy;
+      // The angle is determined by taking atan2 and adding an appropriate
+      // starting point depending on the side we want to start on.
+      double angle = std::remainder(
+          std::atan2(dy, dx) + kPi / 4 + side * (kPi / 2), 2 * kPi);
+      // Concentric squares in clockwise order.
+      return std::make_pair(std::max(std::abs(dx), std::abs(dy)), angle);
+    };
+    std::sort(ac_group_order.begin(), ac_group_order.end(),
+              [&](coeff_order_t a, coeff_order_t b) {
+                return get_distance_from_center(a) <
+                       get_distance_from_center(b);
+              });
+    std::vector<coeff_order_t> inv_ac_group_order(ac_group_order.size(), 0);
+    for (size_t i = 0; i < ac_group_order.size(); i++) {
+      inv_ac_group_order[ac_group_order[i]] = i;
+    }
+    for (size_t i = 0; i < num_passes; i++) {
+      size_t pass_start = permutation.size();
+      for (coeff_order_t v : inv_ac_group_order) {
+        permutation.push_back(pass_start + v);
+      }
+    }
+    std::vector<BitWriter> new_group_codes(group_codes.size());
+    for (size_t i = 0; i < permutation.size(); i++) {
+      new_group_codes[permutation[i]] = std::move(group_codes[i]);
+    }
+    group_codes = std::move(new_group_codes);
+  }
+
+  JXL_RETURN_IF_ERROR(
+      WriteGroupOffsets(group_codes, permutation_ptr, writer, aux_out));
+  writer->AppendByteAligned(group_codes);
+
+  return true;
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/enc_frame.h b/third_party/jpeg-xl/lib/jxl/enc_frame.h
new file mode 100644
index 0000000000..b1dc637eb0
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_frame.h
@@ -0,0 +1,78 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_FRAME_H_
+#define LIB_JXL_ENC_FRAME_H_
+
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/enc_bit_writer.h"
+#include "lib/jxl/enc_cache.h"
+#include "lib/jxl/enc_params.h"
+#include "lib/jxl/frame_header.h"
+#include "lib/jxl/image_bundle.h"
+
+namespace jxl {
+
+struct AuxOut;
+
+// Information needed for encoding a frame that is not contained elsewhere and
+// does not belong to `cparams`.
+// TODO(lode): if possible, it might be better to replace FrameInfo and several
+// fields from ImageBundle (such as frame name and duration) by direct usage of
+// jxl::FrameHeader itself.
+struct FrameInfo {
+  // TODO(veluca): consider adding more parameters, such as custom patches.
+  bool save_before_color_transform = false;
+  // Whether or not the input image bundle is already in the codestream
+  // colorspace (as deduced by cparams).
+  // TODO(veluca): this is a hack - ImageBundle doesn't have a simple way to say
+  // "this is already in XYB".
+  bool ib_needs_color_transform = true;
+  FrameType frame_type = FrameType::kRegularFrame;
+  size_t dc_level = 0;
+  // Only used for kRegularFrame.
+  bool is_last = true;
+  bool is_preview = false;
+  // Information for storing this frame for future use (only for non-DC frames).
+  size_t save_as_reference = 0;
+  // The source frame for blending of a next frame, matching the
+  // save_as_reference value of a previous frame. Animated frames can use
+  // save_as_reference values 1, 2 and 3, while composite still frames can use
+  // save_as_reference values 0, 1, 2 and 3. The current C++ encoder
+  // implementation is assuming and using 1 for all frames of animations, so
+  // using that as the default value here.
+  // Corresponds to BlendingInfo::source from the FrameHeader.
+  size_t source = 1;
+  // Corresponds to BlendingInfo::clamp from the FrameHeader.
+  size_t clamp = 1;
+  // Corresponds to BlendingInfo::alpha_channel from the FrameHeader, or set to
+  // -1 to automatically choose it as the index of the first extra channel of
+  // type alpha.
+  int alpha_channel = -1;
+
+  // If non-empty, uses this blending info for the extra channels, otherwise
+  // automatically chooses it. The encoder API will fill this vector with the
+  // extra channel info and allows more options. The non-API cjxl leaves it
+  // empty and relies on the default behavior.
+  std::vector<BlendingInfo> extra_channel_blending_info;
+};
+
+// Checks and adjusts CompressParams when they are all initialized.
+Status ParamsPostInit(CompressParams* p);
+
+// Encodes a single frame (including its header) into a byte stream.  Groups may
+// be processed in parallel by `pool`. metadata is the ImageMetadata encoded in
+// the codestream, and must be used for the FrameHeaders, do not use
+// ib.metadata.
+Status EncodeFrame(const CompressParams& cparams_orig,
+                   const FrameInfo& frame_info, const CodecMetadata* metadata,
+                   const ImageBundle& ib, PassesEncoderState* passes_enc_state,
+                   const JxlCmsInterface& cms, ThreadPool* pool,
+                   BitWriter* writer, AuxOut* aux_out);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_FRAME_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_gaborish.cc b/third_party/jpeg-xl/lib/jxl/enc_gaborish.cc
new file mode 100644
index 0000000000..d57bb68b7f
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_gaborish.cc
@@ -0,0 +1,61 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_gaborish.h"
+
+#include <stddef.h>
+
+#include <hwy/base.h>
+
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/convolve.h"
+#include "lib/jxl/image_ops.h"
+
+namespace jxl {
+
+void GaborishInverse(Image3F* in_out, float mul[3], ThreadPool* pool) {
+  WeightsSymmetric5 weights[3];
+  // Only an approximation. One or even two 3x3, and rank-1 (separable) 5x5
+  // are insufficient. The numbers here have been obtained by butteraugli
+  // based optimizing the whole system and the errors produced are likely
+  // more favorable for good rate-distortion compromises rather than
+  // just using mathematical optimization to find the inverse.
+  static const float kGaborish[5] = {
+      -0.090881924078487886f, -0.043663953593472138f, 0.01392497846646211f,
+      0.0036189602184591141f, 0.0030557936884763499f};
+  for (int i = 0; i < 3; ++i) {
+    double sum = 1.0 + mul[i] * 4 *
+                           (kGaborish[0] + kGaborish[1] + kGaborish[2] +
+                            kGaborish[4] + 2 * kGaborish[3]);
+    if (sum < 1e-5) {
+      sum = 1e-5;
+    }
+    const float normalize = static_cast<float>(1.0 / sum);
+    const float normalize_mul = mul[i] * normalize;
+    weights[i] = WeightsSymmetric5{{HWY_REP4(normalize)},
+                                   {HWY_REP4(normalize_mul * kGaborish[0])},
+                                   {HWY_REP4(normalize_mul * kGaborish[2])},
+                                   {HWY_REP4(normalize_mul * kGaborish[1])},
+                                   {HWY_REP4(normalize_mul * kGaborish[4])},
+                                   {HWY_REP4(normalize_mul * kGaborish[3])}};
+  }
+  // Reduce memory footprint by only allocating a single plane and swapping it
+  // into the output Image3F. Better still would be tiling.
+  // Note that we cannot *allocate* a plane, as doing so might cause Image3F to
+  // have planes of different stride. Instead, we copy one plane in a temporary
+  // image and reuse the existing planes of the in/out image.
+  ImageF temp = CopyImage(in_out->Plane(2));
+  Symmetric5(in_out->Plane(0), Rect(*in_out), weights[0], pool,
+             &in_out->Plane(2));
+  Symmetric5(in_out->Plane(1), Rect(*in_out), weights[1], pool,
+             &in_out->Plane(0));
+  Symmetric5(temp, Rect(*in_out), weights[2], pool, &in_out->Plane(1));
+  // Now planes are 1, 2, 0.
+  in_out->Plane(0).Swap(in_out->Plane(1));
+  // 2 1 0
+  in_out->Plane(0).Swap(in_out->Plane(2));
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/enc_gaborish.h b/third_party/jpeg-xl/lib/jxl/enc_gaborish.h
new file mode 100644
index 0000000000..102064f9a2
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_gaborish.h
@@ -0,0 +1,26 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_GABORISH_H_
+#define LIB_JXL_GABORISH_H_
+
+// Linear smoothing (3x3 convolution) for deblocking without too much blur.
+
+#include <stdint.h>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/image.h"
+
+namespace jxl {
+
+// Used in encoder to reduce the impact of the decoder's smoothing.
+// This is not exact. Works in-place to reduce memory use.
+// The input is typically in XYB space.
+void GaborishInverse(Image3F* in_out, float mul[3], ThreadPool* pool);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_GABORISH_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_gaborish_test.cc b/third_party/jpeg-xl/lib/jxl/enc_gaborish_test.cc
new file mode 100644
index 0000000000..57a18e3338
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_gaborish_test.cc
@@ -0,0 +1,77 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_gaborish.h"
+
+#include <hwy/base.h>
+
+#include "lib/jxl/convolve.h"
+#include "lib/jxl/image_ops.h"
+#include "lib/jxl/image_test_utils.h"
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+namespace {
+
+// weight1,2 need not be normalized.
+WeightsSymmetric3 GaborishKernel(float weight1, float weight2) {
+  constexpr float weight0 = 1.0f;
+
+  // Normalize
+  const float mul = 1.0f / (weight0 + 4 * (weight1 + weight2));
+  const float w0 = weight0 * mul;
+  const float w1 = weight1 * mul;
+  const float w2 = weight2 * mul;
+
+  const WeightsSymmetric3 w = {{HWY_REP4(w0)}, {HWY_REP4(w1)}, {HWY_REP4(w2)}};
+  return w;
+}
+
+void ConvolveGaborish(const ImageF& in, float weight1, float weight2,
+                      ThreadPool* pool, ImageF* JXL_RESTRICT out) {
+  JXL_CHECK(SameSize(in, *out));
+  Symmetric3(in, Rect(in), GaborishKernel(weight1, weight2), pool, out);
+}
+
+void TestRoundTrip(const Image3F& in, float max_l1) {
+  Image3F fwd(in.xsize(), in.ysize());
+  ThreadPool* null_pool = nullptr;
+  ConvolveGaborish(in.Plane(0), 0, 0, null_pool, &fwd.Plane(0));
+  ConvolveGaborish(in.Plane(1), 0, 0, null_pool, &fwd.Plane(1));
+  ConvolveGaborish(in.Plane(2), 0, 0, null_pool, &fwd.Plane(2));
+  float w = 0.92718927264540152f;
+  float weights[3] = {
+      w,
+      w,
+      w,
+  };
+  GaborishInverse(&fwd, weights, null_pool);
+  JXL_ASSERT_OK(VerifyRelativeError(in, fwd, max_l1, 1E-4f, _));
+}
+
+TEST(GaborishTest, TestZero) {
+  Image3F in(20, 20);
+  ZeroFillImage(&in);
+  TestRoundTrip(in, 0.0f);
+}
+
+// Disabled: large difference.
+#if 0
+TEST(GaborishTest, TestDirac) {
+  Image3F in(20, 20);
+  ZeroFillImage(&in);
+  in.PlaneRow(1, 10)[10] = 10.0f;
+  TestRoundTrip(in, 0.26f);
+}
+#endif
+
+TEST(GaborishTest, TestFlat) {
+  Image3F in(20, 20);
+  FillImage(1.0f, &in);
+  TestRoundTrip(in, 1E-5f);
+}
+
+}  // namespace
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/enc_gamma_correct.h b/third_party/jpeg-xl/lib/jxl/enc_gamma_correct.h
new file mode 100644
index 0000000000..0db7012bbe
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_gamma_correct.h
@@ -0,0 +1,36 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_GAMMA_CORRECT_H_
+#define LIB_JXL_ENC_GAMMA_CORRECT_H_
+
+// Deprecated: sRGB transfer function. Use color_management.h instead.
+
+#include <cmath>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/transfer_functions-inl.h"
+
+namespace jxl {
+
+// Values are in [0, 1].
+static JXL_INLINE double Srgb8ToLinearDirect(double srgb) {
+  if (srgb <= 0.0) return 0.0;
+  if (srgb <= 0.04045) return srgb / 12.92;
+  if (srgb >= 1.0) return 1.0;
+  return std::pow((srgb + 0.055) / 1.055, 2.4);
+}
+
+// Values are in [0, 1].
+static JXL_INLINE double LinearToSrgb8Direct(double linear) {
+  if (linear <= 0.0) return 0.0;
+  if (linear >= 1.0) return 1.0;
+  if (linear <= 0.0031308) return linear * 12.92;
+  return std::pow(linear, 1.0 / 2.4) * 1.055 - 0.055;
+}
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_GAMMA_CORRECT_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_group.cc b/third_party/jpeg-xl/lib/jxl/enc_group.cc
new file mode 100644
index 0000000000..074cf1553a
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_group.cc
@@ -0,0 +1,426 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_group.h"
+
+#include <hwy/aligned_allocator.h>
+#include <utility>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/enc_group.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/ac_strategy.h"
+#include "lib/jxl/base/bits.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/profiler.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/dct_util.h"
+#include "lib/jxl/dec_transforms-inl.h"
+#include "lib/jxl/enc_aux_out.h"
+#include "lib/jxl/enc_cache.h"
+#include "lib/jxl/enc_params.h"
+#include "lib/jxl/enc_transforms-inl.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/quantizer-inl.h"
+#include "lib/jxl/quantizer.h"
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Abs;
+using hwy::HWY_NAMESPACE::Ge;
+using hwy::HWY_NAMESPACE::IfThenElse;
+using hwy::HWY_NAMESPACE::IfThenElseZero;
+using hwy::HWY_NAMESPACE::MaskFromVec;
+using hwy::HWY_NAMESPACE::Round;
+
+// NOTE: caller takes care of extracting quant from rect of RawQuantField.
+void QuantizeBlockAC(const Quantizer& quantizer, const bool error_diffusion,
+                     size_t c, float qm_multiplier, size_t quant_kind,
+                     size_t xsize, size_t ysize, float* thresholds,
+                     const float* JXL_RESTRICT block_in, int32_t* quant,
+                     int32_t* JXL_RESTRICT block_out) {
+  PROFILER_FUNC;
+  const float* JXL_RESTRICT qm = quantizer.InvDequantMatrix(quant_kind, c);
+  float qac = quantizer.Scale() * (*quant);
+  // Not SIMD-fied for now.
+  if (c != 1 && (xsize > 1 || ysize > 1)) {
+    for (int i = 0; i < 4; ++i) {
+      thresholds[i] -= Clamp1(0.003f * xsize * ysize, 0.f, 0.08f);
+      if (thresholds[i] < 0.54) {
+        thresholds[i] = 0.54;
+      }
+    }
+  }
+  HWY_CAPPED(float, kBlockDim) df;
+  HWY_CAPPED(int32_t, kBlockDim) di;
+  HWY_CAPPED(uint32_t, kBlockDim) du;
+  const auto quantv = Set(df, qac * qm_multiplier);
+  for (size_t y = 0; y < ysize * kBlockDim; y++) {
+    size_t yfix = static_cast<size_t>(y >= ysize * kBlockDim / 2) * 2;
+    const size_t off = y * kBlockDim * xsize;
+    for (size_t x = 0; x < xsize * kBlockDim; x += Lanes(df)) {
+      auto thr = Zero(df);
+      if (xsize == 1) {
+        HWY_ALIGN uint32_t kMask[kBlockDim] = {0, 0, 0, 0, ~0u, ~0u, ~0u, ~0u};
+        const auto mask = MaskFromVec(BitCast(df, Load(du, kMask + x)));
+        thr = IfThenElse(mask, Set(df, thresholds[yfix + 1]),
+                         Set(df, thresholds[yfix]));
+      } else {
+        // Same for all lanes in the vector.
+        thr = Set(
+            df,
+            thresholds[yfix + static_cast<size_t>(x >= xsize * kBlockDim / 2)]);
+      }
+      const auto q = Mul(Load(df, qm + off + x), quantv);
+      const auto in = Load(df, block_in + off + x);
+      const auto val = Mul(q, in);
+      const auto nzero_mask = Ge(Abs(val), thr);
+      const auto v = ConvertTo(di, IfThenElseZero(nzero_mask, Round(val)));
+      Store(v, di, block_out + off + x);
+    }
+  }
+}
+
+void AdjustQuantBlockAC(const Quantizer& quantizer, size_t c,
+                        float qm_multiplier, size_t quant_kind, size_t xsize,
+                        size_t ysize, float* thresholds,
+                        const float* JXL_RESTRICT block_in, int32_t* quant) {
+  // No quantization adjusting for these small blocks.
+  // Quantization adjusting attempts to fix some known issues
+  // with larger blocks and on the 8x8 dct's emerging 8x8 blockiness
+  // when there are not many non-zeros.
+  constexpr size_t kPartialBlockKinds =
+      (1 << AcStrategy::Type::IDENTITY) | (1 << AcStrategy::Type::DCT2X2) |
+      (1 << AcStrategy::Type::DCT4X4) | (1 << AcStrategy::Type::DCT4X8) |
+      (1 << AcStrategy::Type::DCT8X4) | (1 << AcStrategy::Type::AFV0) |
+      (1 << AcStrategy::Type::AFV1) | (1 << AcStrategy::Type::AFV2) |
+      (1 << AcStrategy::Type::AFV3);
+  if ((1 << quant_kind) & kPartialBlockKinds) return;
+
+  const float* JXL_RESTRICT qm = quantizer.InvDequantMatrix(quant_kind, c);
+  float qac = quantizer.Scale() * (*quant);
+  if (xsize > 1 || ysize > 1) {
+    for (int i = 0; i < 4; ++i) {
+      thresholds[i] -= Clamp1(0.003f * xsize * ysize, 0.f, 0.08f);
+      if (thresholds[i] < 0.54) {
+        thresholds[i] = 0.54;
+      }
+    }
+  }
+  float sum_of_highest_freq_row_and_column = 0;
+  float hfNonZeros[4] = {};
+  float hfMaxError[4] = {};
+
+  for (size_t y = 0; y < ysize * kBlockDim; y++) {
+    for (size_t x = 0; x < xsize * kBlockDim; x++) {
+      const size_t pos = y * kBlockDim * xsize + x;
+      if (x < xsize && y < ysize) {
+        continue;
+      }
+      const size_t hfix = (static_cast<size_t>(y >= ysize * kBlockDim / 2) * 2 +
+                           static_cast<size_t>(x >= xsize * kBlockDim / 2));
+      const float val = block_in[pos] * (qm[pos] * qac * qm_multiplier);
+      const float v = (std::abs(val) < thresholds[hfix]) ? 0 : rintf(val);
+      if (c == 1 && v == 0) {
+        const float error = std::abs(val);
+        if (hfMaxError[hfix] < error) {
+          hfMaxError[hfix] = error;
+        }
+      }
+      if (v != 0.0f) {
+        hfNonZeros[hfix] += std::abs(v);
+        if ((y == ysize * kBlockDim - 1 || x == xsize * kBlockDim - 1) &&
+            (x >= xsize * 4 && y >= ysize * 4)) {
+          sum_of_highest_freq_row_and_column += std::abs(val);
+        }
+      }
+    }
+  }
+  if (c == 1) {
+    static const double kLimit = 0.49f;
+    for (int i = 1; i < 4; ++i) {
+      if (hfNonZeros[i] == 0.0 && hfMaxError[i] > kLimit) {
+        thresholds[i] = 0.9999 * hfMaxError[i];
+      }
+    }
+  }
+  // Heuristic for improving accuracy of high-frequency patterns
+  // occurring in an environment with no medium-frequency masking
+  // patterns. This should be improved later to be done in X and B
+  // planes too as 32x32 and larger transforms become rather ugly
+  // when this is not compensated for.
+  if (15 * sum_of_highest_freq_row_and_column >= hfNonZeros[0] + 1) {
+    constexpr int inc = 5;
+    *quant += inc;
+    if (8 * sum_of_highest_freq_row_and_column >= hfNonZeros[0] + 1) {
+      *quant += inc;
+    }
+    if (5 * sum_of_highest_freq_row_and_column >= hfNonZeros[0] + 1) {
+      *quant += inc;
+    }
+    if (3 * sum_of_highest_freq_row_and_column >= hfNonZeros[0] + 1) {
+      *quant += inc;
+    }
+    if (*quant >= Quantizer::kQuantMax) {
+      *quant = Quantizer::kQuantMax - 1;
+    }
+  }
+  if (quant_kind == AcStrategy::Type::DCT) {
+    // If this 8x8 block is too flat, increase the adaptive quantization level
+    // a bit to reduce visible block boundaries and requantize the block.
+    if (hfNonZeros[0] + hfNonZeros[1] + hfNonZeros[2] + hfNonZeros[3] < 11) {
+      *quant += 1;
+      if (*quant >= Quantizer::kQuantMax) {
+        *quant = Quantizer::kQuantMax - 1;
+      }
+    }
+  }
+  {
+    // Reduce quant in highly active areas.
+    int32_t div = (xsize + ysize) / 2;
+    int32_t activity = (hfNonZeros[0] + div / 2) / div;
+    int32_t orig_qp_limit = std::max(4, *quant / 2);
+    for (int i = 1; i < 4; ++i) {
+      activity = std::min<int32_t>(activity, (hfNonZeros[i] + div / 2) / div);
+    }
+    if (activity >= 15) {
+      activity = 15;
+    }
+    int32_t qp = *quant - activity;
+    if (qp < orig_qp_limit) {
+      qp = orig_qp_limit;
+    }
+    *quant = qp;
+  }
+}
+
+// NOTE: caller takes care of extracting quant from rect of RawQuantField.
+void QuantizeRoundtripYBlockAC(PassesEncoderState* enc_state, const size_t size,
+                               const Quantizer& quantizer,
+                               const bool error_diffusion, size_t quant_kind,
+                               size_t xsize, size_t ysize,
+                               const float* JXL_RESTRICT biases, int32_t* quant,
+                               float* JXL_RESTRICT inout,
+                               int32_t* JXL_RESTRICT quantized) {
+  float thres_y[4] = {0.58f, 0.64f, 0.64f, 0.64f};
+  {
+    int32_t max_quant = 0;
+    int quant_orig = *quant;
+    float val[3] = {enc_state->x_qm_multiplier, 1.0f,
+                    enc_state->b_qm_multiplier};
+    int clut[3] = {1, 0, 2};
+    for (int ii = 0; ii < 3; ++ii) {
+      float thres[4] = {0.58f, 0.64f, 0.64f, 0.64f};
+      int c = clut[ii];
+      *quant = quant_orig;
+      AdjustQuantBlockAC(quantizer, c, val[c], quant_kind, xsize, ysize,
+                         &thres[0], inout + c * size, quant);
+      // Dead zone adjustment
+      if (c == 1) {
+        for (int k = 0; k < 4; ++k) {
+          thres_y[k] = thres[k];
+        }
+      }
+      max_quant = std::max(*quant, max_quant);
+    }
+    *quant = max_quant;
+  }
+
+  QuantizeBlockAC(quantizer, error_diffusion, 1, 1.0f, quant_kind, xsize, ysize,
+                  &thres_y[0], inout + size, quant, quantized + size);
+
+  PROFILER_ZONE("enc quant adjust bias");
+  const float* JXL_RESTRICT dequant_matrix =
+      quantizer.DequantMatrix(quant_kind, 1);
+
+  HWY_CAPPED(float, kDCTBlockSize) df;
+  HWY_CAPPED(int32_t, kDCTBlockSize) di;
+  const auto inv_qac = Set(df, quantizer.inv_quant_ac(*quant));
+  for (size_t k = 0; k < kDCTBlockSize * xsize * ysize; k += Lanes(df)) {
+    const auto quant = Load(di, quantized + size + k);
+    const auto adj_quant = AdjustQuantBias(di, 1, quant, biases);
+    const auto dequantm = Load(df, dequant_matrix + k);
+    Store(Mul(Mul(adj_quant, dequantm), inv_qac), df, inout + size + k);
+  }
+}
+
+void ComputeCoefficients(size_t group_idx, PassesEncoderState* enc_state,
+                         const Image3F& opsin, Image3F* dc) {
+  PROFILER_FUNC;
+  const Rect block_group_rect = enc_state->shared.BlockGroupRect(group_idx);
+  const Rect group_rect = enc_state->shared.GroupRect(group_idx);
+  const Rect cmap_rect(
+      block_group_rect.x0() / kColorTileDimInBlocks,
+      block_group_rect.y0() / kColorTileDimInBlocks,
+      DivCeil(block_group_rect.xsize(), kColorTileDimInBlocks),
+      DivCeil(block_group_rect.ysize(), kColorTileDimInBlocks));
+
+  const size_t xsize_blocks = block_group_rect.xsize();
+  const size_t ysize_blocks = block_group_rect.ysize();
+
+  const size_t dc_stride = static_cast<size_t>(dc->PixelsPerRow());
+  const size_t opsin_stride = static_cast<size_t>(opsin.PixelsPerRow());
+
+  ImageI& full_quant_field = enc_state->shared.raw_quant_field;
+  const CompressParams& cparams = enc_state->cparams;
+
+  // TODO(veluca): consider strategies to reduce this memory.
+  auto mem = hwy::AllocateAligned<int32_t>(3 * AcStrategy::kMaxCoeffArea);
+  auto fmem = hwy::AllocateAligned<float>(5 * AcStrategy::kMaxCoeffArea);
+  float* JXL_RESTRICT scratch_space =
+      fmem.get() + 3 * AcStrategy::kMaxCoeffArea;
+  {
+    // Only use error diffusion in Squirrel mode or slower.
+    const bool error_diffusion = cparams.speed_tier <= SpeedTier::kSquirrel;
+    constexpr HWY_CAPPED(float, kDCTBlockSize) d;
+
+    int32_t* JXL_RESTRICT coeffs[3][kMaxNumPasses] = {};
+    size_t num_passes = enc_state->progressive_splitter.GetNumPasses();
+    JXL_DASSERT(num_passes > 0);
+    for (size_t i = 0; i < num_passes; i++) {
+      // TODO(veluca): 16-bit quantized coeffs are not implemented yet.
+      JXL_ASSERT(enc_state->coeffs[i]->Type() == ACType::k32);
+      for (size_t c = 0; c < 3; c++) {
+        coeffs[c][i] = enc_state->coeffs[i]->PlaneRow(c, group_idx, 0).ptr32;
+      }
+    }
+
+    HWY_ALIGN float* coeffs_in = fmem.get();
+    HWY_ALIGN int32_t* quantized = mem.get();
+
+    for (size_t by = 0; by < ysize_blocks; ++by) {
+      int32_t* JXL_RESTRICT row_quant_ac =
+          block_group_rect.Row(&full_quant_field, by);
+      size_t ty = by / kColorTileDimInBlocks;
+      const int8_t* JXL_RESTRICT row_cmap[3] = {
+          cmap_rect.ConstRow(enc_state->shared.cmap.ytox_map, ty),
+          nullptr,
+          cmap_rect.ConstRow(enc_state->shared.cmap.ytob_map, ty),
+      };
+      const float* JXL_RESTRICT opsin_rows[3] = {
+          group_rect.ConstPlaneRow(opsin, 0, by * kBlockDim),
+          group_rect.ConstPlaneRow(opsin, 1, by * kBlockDim),
+          group_rect.ConstPlaneRow(opsin, 2, by * kBlockDim),
+      };
+      float* JXL_RESTRICT dc_rows[3] = {
+          block_group_rect.PlaneRow(dc, 0, by),
+          block_group_rect.PlaneRow(dc, 1, by),
+          block_group_rect.PlaneRow(dc, 2, by),
+      };
+      AcStrategyRow ac_strategy_row =
+          enc_state->shared.ac_strategy.ConstRow(block_group_rect, by);
+      for (size_t tx = 0; tx < DivCeil(xsize_blocks, kColorTileDimInBlocks);
+           tx++) {
+        const auto x_factor =
+            Set(d, enc_state->shared.cmap.YtoXRatio(row_cmap[0][tx]));
+        const auto b_factor =
+            Set(d, enc_state->shared.cmap.YtoBRatio(row_cmap[2][tx]));
+        for (size_t bx = tx * kColorTileDimInBlocks;
+             bx < xsize_blocks && bx < (tx + 1) * kColorTileDimInBlocks; ++bx) {
+          const AcStrategy acs = ac_strategy_row[bx];
+          if (!acs.IsFirstBlock()) continue;
+
+          size_t xblocks = acs.covered_blocks_x();
+          size_t yblocks = acs.covered_blocks_y();
+
+          CoefficientLayout(&yblocks, &xblocks);
+
+          size_t size = kDCTBlockSize * xblocks * yblocks;
+
+          // DCT Y channel, roundtrip-quantize it and set DC.
+          int32_t quant_ac = row_quant_ac[bx];
+          for (size_t c : {0, 1, 2}) {
+            TransformFromPixels(acs.Strategy(), opsin_rows[c] + bx * kBlockDim,
+                                opsin_stride, coeffs_in + c * size,
+                                scratch_space);
+          }
+          DCFromLowestFrequencies(acs.Strategy(), coeffs_in + size,
+                                  dc_rows[1] + bx, dc_stride);
+
+          QuantizeRoundtripYBlockAC(
+              enc_state, size, enc_state->shared.quantizer, error_diffusion,
+              acs.RawStrategy(), xblocks, yblocks, kDefaultQuantBias, &quant_ac,
+              coeffs_in, quantized);
+
+          // Unapply color correlation
+          for (size_t k = 0; k < size; k += Lanes(d)) {
+            const auto in_x = Load(d, coeffs_in + k);
+            const auto in_y = Load(d, coeffs_in + size + k);
+            const auto in_b = Load(d, coeffs_in + 2 * size + k);
+            const auto out_x = NegMulAdd(x_factor, in_y, in_x);
+            const auto out_b = NegMulAdd(b_factor, in_y, in_b);
+            Store(out_x, d, coeffs_in + k);
+            Store(out_b, d, coeffs_in + 2 * size + k);
+          }
+
+          // Quantize X and B channels and set DC.
+          for (size_t c : {0, 2}) {
+            float thres[4] = {0.58f, 0.62f, 0.62f, 0.62f};
+            QuantizeBlockAC(enc_state->shared.quantizer, error_diffusion, c,
+                            c == 0 ? enc_state->x_qm_multiplier
+                                   : enc_state->b_qm_multiplier,
+                            acs.RawStrategy(), xblocks, yblocks, &thres[0],
+                            coeffs_in + c * size, &quant_ac,
+                            quantized + c * size);
+            DCFromLowestFrequencies(acs.Strategy(), coeffs_in + c * size,
+                                    dc_rows[c] + bx, dc_stride);
+          }
+          row_quant_ac[bx] = quant_ac;
+          for (size_t c = 0; c < 3; c++) {
+            enc_state->progressive_splitter.SplitACCoefficients(
+                quantized + c * size, acs, bx, by, coeffs[c]);
+            for (size_t p = 0; p < num_passes; p++) {
+              coeffs[c][p] += size;
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+HWY_EXPORT(ComputeCoefficients);
+void ComputeCoefficients(size_t group_idx, PassesEncoderState* enc_state,
+                         const Image3F& opsin, Image3F* dc) {
+  return HWY_DYNAMIC_DISPATCH(ComputeCoefficients)(group_idx, enc_state, opsin,
+                                                   dc);
+}
+
+Status EncodeGroupTokenizedCoefficients(size_t group_idx, size_t pass_idx,
+                                        size_t histogram_idx,
+                                        const PassesEncoderState& enc_state,
+                                        BitWriter* writer, AuxOut* aux_out) {
+  // Select which histogram to use among those of the current pass.
+  const size_t num_histograms = enc_state.shared.num_histograms;
+  // num_histograms is 0 only for lossless.
+  JXL_ASSERT(num_histograms == 0 || histogram_idx < num_histograms);
+  size_t histo_selector_bits = CeilLog2Nonzero(num_histograms);
+
+  if (histo_selector_bits != 0) {
+    BitWriter::Allotment allotment(writer, histo_selector_bits);
+    writer->Write(histo_selector_bits, histogram_idx);
+    allotment.ReclaimAndCharge(writer, kLayerAC, aux_out);
+  }
+  WriteTokens(enc_state.passes[pass_idx].ac_tokens[group_idx],
+              enc_state.passes[pass_idx].codes,
+              enc_state.passes[pass_idx].context_map, writer, kLayerACTokens,
+              aux_out);
+
+  return true;
+}
+
+}  // namespace jxl
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jxl/enc_group.h b/third_party/jpeg-xl/lib/jxl/enc_group.h
new file mode 100644
index 0000000000..0caf408a03
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_group.h
@@ -0,0 +1,32 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_GROUP_H_
+#define LIB_JXL_ENC_GROUP_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/enc_bit_writer.h"
+#include "lib/jxl/image.h"
+
+namespace jxl {
+
+struct AuxOut;
+struct PassesEncoderState;
+
+// Fills DC
+void ComputeCoefficients(size_t group_idx, PassesEncoderState* enc_state,
+                         const Image3F& opsin, Image3F* dc);
+
+Status EncodeGroupTokenizedCoefficients(size_t group_idx, size_t pass_idx,
+                                        size_t histogram_idx,
+                                        const PassesEncoderState& enc_state,
+                                        BitWriter* writer, AuxOut* aux_out);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_GROUP_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_heuristics.cc b/third_party/jpeg-xl/lib/jxl/enc_heuristics.cc
new file mode 100644
index 0000000000..18122fa769
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_heuristics.cc
@@ -0,0 +1,948 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_heuristics.h"
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <algorithm>
+#include <numeric>
+#include <string>
+
+#include "lib/jxl/enc_ac_strategy.h"
+#include "lib/jxl/enc_adaptive_quantization.h"
+#include "lib/jxl/enc_ar_control_field.h"
+#include "lib/jxl/enc_cache.h"
+#include "lib/jxl/enc_chroma_from_luma.h"
+#include "lib/jxl/enc_gaborish.h"
+#include "lib/jxl/enc_modular.h"
+#include "lib/jxl/enc_noise.h"
+#include "lib/jxl/enc_patch_dictionary.h"
+#include "lib/jxl/enc_photon_noise.h"
+#include "lib/jxl/enc_quant_weights.h"
+#include "lib/jxl/enc_splines.h"
+#include "lib/jxl/enc_xyb.h"
+
+namespace jxl {
+
+struct AuxOut;
+
+namespace {
+void FindBestBlockEntropyModel(PassesEncoderState& enc_state) {
+  if (enc_state.cparams.decoding_speed_tier >= 1) {
+    static constexpr uint8_t kSimpleCtxMap[] = {
+        // Cluster all blocks together
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  //
+        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  //
+        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  //
+    };
+    static_assert(
+        3 * kNumOrders == sizeof(kSimpleCtxMap) / sizeof *kSimpleCtxMap,
+        "Update simple context map");
+
+    auto bcm = enc_state.shared.block_ctx_map;
+    bcm.ctx_map.assign(std::begin(kSimpleCtxMap), std::end(kSimpleCtxMap));
+    bcm.num_ctxs = 2;
+    bcm.num_dc_ctxs = 1;
+    return;
+  }
+  if (enc_state.cparams.speed_tier >= SpeedTier::kFalcon) {
+    return;
+  }
+  const ImageI& rqf = enc_state.shared.raw_quant_field;
+  // No need to change context modeling for small images.
+  size_t tot = rqf.xsize() * rqf.ysize();
+  size_t size_for_ctx_model =
+      (1 << 10) * enc_state.cparams.butteraugli_distance;
+  if (tot < size_for_ctx_model) return;
+
+  struct OccCounters {
+    // count the occurrences of each qf value and each strategy type.
+    OccCounters(const ImageI& rqf, const AcStrategyImage& ac_strategy) {
+      for (size_t y = 0; y < rqf.ysize(); y++) {
+        const int32_t* qf_row = rqf.Row(y);
+        AcStrategyRow acs_row = ac_strategy.ConstRow(y);
+        for (size_t x = 0; x < rqf.xsize(); x++) {
+          int ord = kStrategyOrder[acs_row[x].RawStrategy()];
+          int qf = qf_row[x] - 1;
+          qf_counts[qf]++;
+          qf_ord_counts[ord][qf]++;
+          ord_counts[ord]++;
+        }
+      }
+    }
+
+    size_t qf_counts[256] = {};
+    size_t qf_ord_counts[kNumOrders][256] = {};
+    size_t ord_counts[kNumOrders] = {};
+  };
+  // The OccCounters struct is too big to allocate on the stack.
+  std::unique_ptr<OccCounters> counters(
+      new OccCounters(rqf, enc_state.shared.ac_strategy));
+
+  // Splitting the context model according to the quantization field seems to
+  // mostly benefit only large images.
+  size_t size_for_qf_split = (1 << 13) * enc_state.cparams.butteraugli_distance;
+  size_t num_qf_segments = tot < size_for_qf_split ? 1 : 2;
+  std::vector<uint32_t>& qft = enc_state.shared.block_ctx_map.qf_thresholds;
+  qft.clear();
+  // Divide the quant field in up to num_qf_segments segments.
+  size_t cumsum = 0;
+  size_t next = 1;
+  size_t last_cut = 256;
+  size_t cut = tot * next / num_qf_segments;
+  for (uint32_t j = 0; j < 256; j++) {
+    cumsum += counters->qf_counts[j];
+    if (cumsum > cut) {
+      if (j != 0) {
+        qft.push_back(j);
+      }
+      last_cut = j;
+      while (cumsum > cut) {
+        next++;
+        cut = tot * next / num_qf_segments;
+      }
+    } else if (next > qft.size() + 1) {
+      if (j - 1 == last_cut && j != 0) {
+        qft.push_back(j);
+      }
+    }
+  }
+
+  // Count the occurrences of each segment.
+  std::vector<size_t> counts(kNumOrders * (qft.size() + 1));
+  size_t qft_pos = 0;
+  for (size_t j = 0; j < 256; j++) {
+    if (qft_pos < qft.size() && j == qft[qft_pos]) {
+      qft_pos++;
+    }
+    for (size_t i = 0; i < kNumOrders; i++) {
+      counts[qft_pos + i * (qft.size() + 1)] += counters->qf_ord_counts[i][j];
+    }
+  }
+
+  // Repeatedly merge the lowest-count pair.
+  std::vector<uint8_t> remap((qft.size() + 1) * kNumOrders);
+  std::iota(remap.begin(), remap.end(), 0);
+  std::vector<uint8_t> clusters(remap);
+  size_t nb_clusters = Clamp1((int)(tot / size_for_ctx_model / 2), 2, 9);
+  size_t nb_clusters_chroma = Clamp1((int)(tot / size_for_ctx_model / 3), 1, 5);
+  // This is O(n^2 log n), but n is small.
+  while (clusters.size() > nb_clusters) {
+    std::sort(clusters.begin(), clusters.end(),
+              [&](int a, int b) { return counts[a] > counts[b]; });
+    counts[clusters[clusters.size() - 2]] += counts[clusters.back()];
+    counts[clusters.back()] = 0;
+    remap[clusters.back()] = clusters[clusters.size() - 2];
+    clusters.pop_back();
+  }
+  for (size_t i = 0; i < remap.size(); i++) {
+    while (remap[remap[i]] != remap[i]) {
+      remap[i] = remap[remap[i]];
+    }
+  }
+  // Relabel starting from 0.
+  std::vector<uint8_t> remap_remap(remap.size(), remap.size());
+  size_t num = 0;
+  for (size_t i = 0; i < remap.size(); i++) {
+    if (remap_remap[remap[i]] == remap.size()) {
+      remap_remap[remap[i]] = num++;
+    }
+    remap[i] = remap_remap[remap[i]];
+  }
+  // Write the block context map.
+  auto& ctx_map = enc_state.shared.block_ctx_map.ctx_map;
+  ctx_map = remap;
+  ctx_map.resize(remap.size() * 3);
+  // for chroma, only use up to nb_clusters_chroma separate block contexts
+  // (those for the biggest clusters)
+  for (size_t i = remap.size(); i < remap.size() * 3; i++) {
+    ctx_map[i] = num + Clamp1((int)remap[i % remap.size()], 0,
+                              (int)nb_clusters_chroma - 1);
+  }
+  enc_state.shared.block_ctx_map.num_ctxs =
+      *std::max_element(ctx_map.begin(), ctx_map.end()) + 1;
+}
+
+}  // namespace
+
+void FindBestDequantMatrices(const CompressParams& cparams,
+                             const Image3F& opsin,
+                             ModularFrameEncoder* modular_frame_encoder,
+                             DequantMatrices* dequant_matrices) {
+  // TODO(veluca): quant matrices for no-gaborish.
+  // TODO(veluca): heuristics for in-bitstream quant tables.
+  *dequant_matrices = DequantMatrices();
+  if (cparams.max_error_mode) {
+    // Set numerators of all quantization matrices to constant values.
+    float weights[3][1] = {{1.0f / cparams.max_error[0]},
+                           {1.0f / cparams.max_error[1]},
+                           {1.0f / cparams.max_error[2]}};
+    DctQuantWeightParams dct_params(weights);
+    std::vector<QuantEncoding> encodings(DequantMatrices::kNum,
+                                         QuantEncoding::DCT(dct_params));
+    DequantMatricesSetCustom(dequant_matrices, encodings,
+                             modular_frame_encoder);
+    float dc_weights[3] = {1.0f / cparams.max_error[0],
+                           1.0f / cparams.max_error[1],
+                           1.0f / cparams.max_error[2]};
+    DequantMatricesSetCustomDC(dequant_matrices, dc_weights);
+  }
+}
+
+bool DefaultEncoderHeuristics::HandlesColorConversion(
+    const CompressParams& cparams, const ImageBundle& ib) {
+  return cparams.noise != Override::kOn && cparams.patches != Override::kOn &&
+         cparams.speed_tier >= SpeedTier::kWombat && cparams.resampling == 1 &&
+         cparams.color_transform == ColorTransform::kXYB &&
+         !cparams.modular_mode && !ib.HasAlpha();
+}
+
+namespace {
+
+void StoreMin2(const float v, float& min1, float& min2) {
+  if (v < min2) {
+    if (v < min1) {
+      min2 = min1;
+      min1 = v;
+    } else {
+      min2 = v;
+    }
+  }
+}
+
+void CreateMask(const ImageF& image, ImageF& mask) {
+  for (size_t y = 0; y < image.ysize(); y++) {
+    auto* row_n = y > 0 ? image.Row(y - 1) : image.Row(y);
+    auto* row_in = image.Row(y);
+    auto* row_s = y + 1 < image.ysize() ? image.Row(y + 1) : image.Row(y);
+    auto* row_out = mask.Row(y);
+    for (size_t x = 0; x < image.xsize(); x++) {
+      // Center, west, east, north, south values and their absolute difference
+      float c = row_in[x];
+      float w = x > 0 ? row_in[x - 1] : row_in[x];
+      float e = x + 1 < image.xsize() ? row_in[x + 1] : row_in[x];
+      float n = row_n[x];
+      float s = row_s[x];
+      float dw = std::abs(c - w);
+      float de = std::abs(c - e);
+      float dn = std::abs(c - n);
+      float ds = std::abs(c - s);
+      float min = std::numeric_limits<float>::max();
+      float min2 = std::numeric_limits<float>::max();
+      StoreMin2(dw, min, min2);
+      StoreMin2(de, min, min2);
+      StoreMin2(dn, min, min2);
+      StoreMin2(ds, min, min2);
+      row_out[x] = min2;
+    }
+  }
+}
+
+// Downsamples the image by a factor of 2 with a kernel that's sharper than
+// the standard 2x2 box kernel used by DownsampleImage.
+// The kernel is optimized against the result of the 2x2 upsampling kernel used
+// by the decoder. Ringing is slightly reduced by clamping the values of the
+// resulting pixels within certain bounds of a small region in the original
+// image.
+void DownsampleImage2_Sharper(const ImageF& input, ImageF* output) {
+  const int64_t kernelx = 12;
+  const int64_t kernely = 12;
+
+  static const float kernel[144] = {
+      -0.000314256996835, -0.000314256996835, -0.000897597057705,
+      -0.000562751488849, -0.000176807273646, 0.001864627368902,
+      0.001864627368902,  -0.000176807273646, -0.000562751488849,
+      -0.000897597057705, -0.000314256996835, -0.000314256996835,
+      -0.000314256996835, -0.001527942804748, -0.000121760530512,
+      0.000191123989093,  0.010193185932466,  0.058637519197110,
+      0.058637519197110,  0.010193185932466,  0.000191123989093,
+      -0.000121760530512, -0.001527942804748, -0.000314256996835,
+      -0.000897597057705, -0.000121760530512, 0.000946363683751,
+      0.007113577630288,  0.000437956841058,  -0.000372823835211,
+      -0.000372823835211, 0.000437956841058,  0.007113577630288,
+      0.000946363683751,  -0.000121760530512, -0.000897597057705,
+      -0.000562751488849, 0.000191123989093,  0.007113577630288,
+      0.044592622228814,  0.000222278879007,  -0.162864473015945,
+      -0.162864473015945, 0.000222278879007,  0.044592622228814,
+      0.007113577630288,  0.000191123989093,  -0.000562751488849,
+      -0.000176807273646, 0.010193185932466,  0.000437956841058,
+      0.000222278879007,  -0.000913092543974, -0.017071696107902,
+      -0.017071696107902, -0.000913092543974, 0.000222278879007,
+      0.000437956841058,  0.010193185932466,  -0.000176807273646,
+      0.001864627368902,  0.058637519197110,  -0.000372823835211,
+      -0.162864473015945, -0.017071696107902, 0.414660099370354,
+      0.414660099370354,  -0.017071696107902, -0.162864473015945,
+      -0.000372823835211, 0.058637519197110,  0.001864627368902,
+      0.001864627368902,  0.058637519197110,  -0.000372823835211,
+      -0.162864473015945, -0.017071696107902, 0.414660099370354,
+      0.414660099370354,  -0.017071696107902, -0.162864473015945,
+      -0.000372823835211, 0.058637519197110,  0.001864627368902,
+      -0.000176807273646, 0.010193185932466,  0.000437956841058,
+      0.000222278879007,  -0.000913092543974, -0.017071696107902,
+      -0.017071696107902, -0.000913092543974, 0.000222278879007,
+      0.000437956841058,  0.010193185932466,  -0.000176807273646,
+      -0.000562751488849, 0.000191123989093,  0.007113577630288,
+      0.044592622228814,  0.000222278879007,  -0.162864473015945,
+      -0.162864473015945, 0.000222278879007,  0.044592622228814,
+      0.007113577630288,  0.000191123989093,  -0.000562751488849,
+      -0.000897597057705, -0.000121760530512, 0.000946363683751,
+      0.007113577630288,  0.000437956841058,  -0.000372823835211,
+      -0.000372823835211, 0.000437956841058,  0.007113577630288,
+      0.000946363683751,  -0.000121760530512, -0.000897597057705,
+      -0.000314256996835, -0.001527942804748, -0.000121760530512,
+      0.000191123989093,  0.010193185932466,  0.058637519197110,
+      0.058637519197110,  0.010193185932466,  0.000191123989093,
+      -0.000121760530512, -0.001527942804748, -0.000314256996835,
+      -0.000314256996835, -0.000314256996835, -0.000897597057705,
+      -0.000562751488849, -0.000176807273646, 0.001864627368902,
+      0.001864627368902,  -0.000176807273646, -0.000562751488849,
+      -0.000897597057705, -0.000314256996835, -0.000314256996835};
+
+  int64_t xsize = input.xsize();
+  int64_t ysize = input.ysize();
+
+  ImageF box_downsample = CopyImage(input);
+  DownsampleImage(&box_downsample, 2);
+
+  ImageF mask(box_downsample.xsize(), box_downsample.ysize());
+  CreateMask(box_downsample, mask);
+
+  for (size_t y = 0; y < output->ysize(); y++) {
+    float* row_out = output->Row(y);
+    const float* row_in[kernely];
+    const float* row_mask = mask.Row(y);
+    // get the rows in the support
+    for (size_t ky = 0; ky < kernely; ky++) {
+      int64_t iy = y * 2 + ky - (kernely - 1) / 2;
+      if (iy < 0) iy = 0;
+      if (iy >= ysize) iy = ysize - 1;
+      row_in[ky] = input.Row(iy);
+    }
+
+    for (size_t x = 0; x < output->xsize(); x++) {
+      // get min and max values of the original image in the support
+      float min = std::numeric_limits<float>::max();
+      float max = std::numeric_limits<float>::min();
+      // kernelx - R and kernely - R are the radius of a rectangular region in
+      // which the values of a pixel are bounded to reduce ringing.
+      static constexpr int64_t R = 5;
+      for (int64_t ky = R; ky + R < kernely; ky++) {
+        for (int64_t kx = R; kx + R < kernelx; kx++) {
+          int64_t ix = x * 2 + kx - (kernelx - 1) / 2;
+          if (ix < 0) ix = 0;
+          if (ix >= xsize) ix = xsize - 1;
+          min = std::min<float>(min, row_in[ky][ix]);
+          max = std::max<float>(max, row_in[ky][ix]);
+        }
+      }
+
+      float sum = 0;
+      for (int64_t ky = 0; ky < kernely; ky++) {
+        for (int64_t kx = 0; kx < kernelx; kx++) {
+          int64_t ix = x * 2 + kx - (kernelx - 1) / 2;
+          if (ix < 0) ix = 0;
+          if (ix >= xsize) ix = xsize - 1;
+          sum += row_in[ky][ix] * kernel[ky * kernelx + kx];
+        }
+      }
+
+      row_out[x] = sum;
+
+      // Clamp the pixel within the value  of a small area to prevent ringning.
+      // The mask determines how much to clamp, clamp more to reduce more
+      // ringing in smooth areas, clamp less in noisy areas to get more
+      // sharpness. Higher mask_multiplier gives less clamping, so less
+      // ringing reduction.
+      const constexpr float mask_multiplier = 1;
+      float a = row_mask[x] * mask_multiplier;
+      float clip_min = min - a;
+      float clip_max = max + a;
+      if (row_out[x] < clip_min) {
+        row_out[x] = clip_min;
+      } else if (row_out[x] > clip_max) {
+        row_out[x] = clip_max;
+      }
+    }
+  }
+}
+
+void DownsampleImage2_Sharper(Image3F* opsin) {
+  // Allocate extra space to avoid a reallocation when padding.
+  Image3F downsampled(DivCeil(opsin->xsize(), 2) + kBlockDim,
+                      DivCeil(opsin->ysize(), 2) + kBlockDim);
+  downsampled.ShrinkTo(downsampled.xsize() - kBlockDim,
+                       downsampled.ysize() - kBlockDim);
+
+  for (size_t c = 0; c < 3; c++) {
+    DownsampleImage2_Sharper(opsin->Plane(c), &downsampled.Plane(c));
+  }
+  *opsin = std::move(downsampled);
+}
+
+// The default upsampling kernels used by Upsampler in the decoder.
+static const constexpr int64_t kSize = 5;
+
+static const float kernel00[25] = {
+    -0.01716200f, -0.03452303f, -0.04022174f, -0.02921014f, -0.00624645f,
+    -0.03452303f, 0.14111091f,  0.28896755f,  0.00278718f,  -0.01610267f,
+    -0.04022174f, 0.28896755f,  0.56661550f,  0.03777607f,  -0.01986694f,
+    -0.02921014f, 0.00278718f,  0.03777607f,  -0.03144731f, -0.01185068f,
+    -0.00624645f, -0.01610267f, -0.01986694f, -0.01185068f, -0.00213539f,
+};
+static const float kernel01[25] = {
+    -0.00624645f, -0.01610267f, -0.01986694f, -0.01185068f, -0.00213539f,
+    -0.02921014f, 0.00278718f,  0.03777607f,  -0.03144731f, -0.01185068f,
+    -0.04022174f, 0.28896755f,  0.56661550f,  0.03777607f,  -0.01986694f,
+    -0.03452303f, 0.14111091f,  0.28896755f,  0.00278718f,  -0.01610267f,
+    -0.01716200f, -0.03452303f, -0.04022174f, -0.02921014f, -0.00624645f,
+};
+static const float kernel10[25] = {
+    -0.00624645f, -0.02921014f, -0.04022174f, -0.03452303f, -0.01716200f,
+    -0.01610267f, 0.00278718f,  0.28896755f,  0.14111091f,  -0.03452303f,
+    -0.01986694f, 0.03777607f,  0.56661550f,  0.28896755f,  -0.04022174f,
+    -0.01185068f, -0.03144731f, 0.03777607f,  0.00278718f,  -0.02921014f,
+    -0.00213539f, -0.01185068f, -0.01986694f, -0.01610267f, -0.00624645f,
+};
+static const float kernel11[25] = {
+    -0.00213539f, -0.01185068f, -0.01986694f, -0.01610267f, -0.00624645f,
+    -0.01185068f, -0.03144731f, 0.03777607f,  0.00278718f,  -0.02921014f,
+    -0.01986694f, 0.03777607f,  0.56661550f,  0.28896755f,  -0.04022174f,
+    -0.01610267f, 0.00278718f,  0.28896755f,  0.14111091f,  -0.03452303f,
+    -0.00624645f, -0.02921014f, -0.04022174f, -0.03452303f, -0.01716200f,
+};
+
+// Does exactly the same as the Upsampler in dec_upsampler for 2x2 pixels, with
+// default CustomTransformData.
+// TODO(lode): use Upsampler instead. However, it requires pre-initialization
+// and padding on the left side of the image which requires refactoring the
+// other code using this.
+static void UpsampleImage(const ImageF& input, ImageF* output) {
+  int64_t xsize = input.xsize();
+  int64_t ysize = input.ysize();
+  int64_t xsize2 = output->xsize();
+  int64_t ysize2 = output->ysize();
+  for (int64_t y = 0; y < ysize2; y++) {
+    for (int64_t x = 0; x < xsize2; x++) {
+      auto kernel = kernel00;
+      if ((x & 1) && (y & 1)) {
+        kernel = kernel11;
+      } else if (x & 1) {
+        kernel = kernel10;
+      } else if (y & 1) {
+        kernel = kernel01;
+      }
+      float sum = 0;
+      int64_t x2 = x / 2;
+      int64_t y2 = y / 2;
+
+      // get min and max values of the original image in the support
+      float min = std::numeric_limits<float>::max();
+      float max = std::numeric_limits<float>::min();
+
+      for (int64_t ky = 0; ky < kSize; ky++) {
+        for (int64_t kx = 0; kx < kSize; kx++) {
+          int64_t xi = x2 - kSize / 2 + kx;
+          int64_t yi = y2 - kSize / 2 + ky;
+          if (xi < 0) xi = 0;
+          if (xi >= xsize) xi = input.xsize() - 1;
+          if (yi < 0) yi = 0;
+          if (yi >= ysize) yi = input.ysize() - 1;
+          min = std::min<float>(min, input.Row(yi)[xi]);
+          max = std::max<float>(max, input.Row(yi)[xi]);
+        }
+      }
+
+      for (int64_t ky = 0; ky < kSize; ky++) {
+        for (int64_t kx = 0; kx < kSize; kx++) {
+          int64_t xi = x2 - kSize / 2 + kx;
+          int64_t yi = y2 - kSize / 2 + ky;
+          if (xi < 0) xi = 0;
+          if (xi >= xsize) xi = input.xsize() - 1;
+          if (yi < 0) yi = 0;
+          if (yi >= ysize) yi = input.ysize() - 1;
+          sum += input.Row(yi)[xi] * kernel[ky * kSize + kx];
+        }
+      }
+      output->Row(y)[x] = sum;
+      if (output->Row(y)[x] < min) output->Row(y)[x] = min;
+      if (output->Row(y)[x] > max) output->Row(y)[x] = max;
+    }
+  }
+}
+
+// Returns the derivative of Upsampler, with respect to input pixel x2, y2, to
+// output pixel x, y (ignoring the clamping).
+float UpsamplerDeriv(int64_t x2, int64_t y2, int64_t x, int64_t y) {
+  auto kernel = kernel00;
+  if ((x & 1) && (y & 1)) {
+    kernel = kernel11;
+  } else if (x & 1) {
+    kernel = kernel10;
+  } else if (y & 1) {
+    kernel = kernel01;
+  }
+
+  int64_t ix = x / 2;
+  int64_t iy = y / 2;
+  int64_t kx = x2 - ix + kSize / 2;
+  int64_t ky = y2 - iy + kSize / 2;
+
+  // This should not happen.
+  if (kx < 0 || kx >= kSize || ky < 0 || ky >= kSize) return 0;
+
+  return kernel[ky * kSize + kx];
+}
+
+// Apply the derivative of the Upsampler to the input, reversing the effect of
+// its coefficients. The output image is 2x2 times smaller than the input.
+void AntiUpsample(const ImageF& input, ImageF* d) {
+  int64_t xsize = input.xsize();
+  int64_t ysize = input.ysize();
+  int64_t xsize2 = d->xsize();
+  int64_t ysize2 = d->ysize();
+  int64_t k0 = kSize - 1;
+  int64_t k1 = kSize;
+  for (int64_t y2 = 0; y2 < ysize2; ++y2) {
+    auto* row = d->Row(y2);
+    for (int64_t x2 = 0; x2 < xsize2; ++x2) {
+      int64_t x0 = x2 * 2 - k0;
+      if (x0 < 0) x0 = 0;
+      int64_t x1 = x2 * 2 + k1 + 1;
+      if (x1 > xsize) x1 = xsize;
+      int64_t y0 = y2 * 2 - k0;
+      if (y0 < 0) y0 = 0;
+      int64_t y1 = y2 * 2 + k1 + 1;
+      if (y1 > ysize) y1 = ysize;
+
+      float sum = 0;
+      for (int64_t y = y0; y < y1; ++y) {
+        const auto* row_in = input.Row(y);
+        for (int64_t x = x0; x < x1; ++x) {
+          double deriv = UpsamplerDeriv(x2, y2, x, y);
+          sum += deriv * row_in[x];
+        }
+      }
+      row[x2] = sum;
+    }
+  }
+}
+
+// Element-wise multiplies two images.
+template <typename T>
+void ElwiseMul(const Plane<T>& image1, const Plane<T>& image2, Plane<T>* out) {
+  const size_t xsize = image1.xsize();
+  const size_t ysize = image1.ysize();
+  JXL_CHECK(xsize == image2.xsize());
+  JXL_CHECK(ysize == image2.ysize());
+  JXL_CHECK(xsize == out->xsize());
+  JXL_CHECK(ysize == out->ysize());
+  for (size_t y = 0; y < ysize; ++y) {
+    const T* const JXL_RESTRICT row1 = image1.Row(y);
+    const T* const JXL_RESTRICT row2 = image2.Row(y);
+    T* const JXL_RESTRICT row_out = out->Row(y);
+    for (size_t x = 0; x < xsize; ++x) {
+      row_out[x] = row1[x] * row2[x];
+    }
+  }
+}
+
+// Element-wise divides two images.
+template <typename T>
+void ElwiseDiv(const Plane<T>& image1, const Plane<T>& image2, Plane<T>* out) {
+  const size_t xsize = image1.xsize();
+  const size_t ysize = image1.ysize();
+  JXL_CHECK(xsize == image2.xsize());
+  JXL_CHECK(ysize == image2.ysize());
+  JXL_CHECK(xsize == out->xsize());
+  JXL_CHECK(ysize == out->ysize());
+  for (size_t y = 0; y < ysize; ++y) {
+    const T* const JXL_RESTRICT row1 = image1.Row(y);
+    const T* const JXL_RESTRICT row2 = image2.Row(y);
+    T* const JXL_RESTRICT row_out = out->Row(y);
+    for (size_t x = 0; x < xsize; ++x) {
+      row_out[x] = row1[x] / row2[x];
+    }
+  }
+}
+
+void ReduceRinging(const ImageF& initial, const ImageF& mask, ImageF& down) {
+  int64_t xsize2 = down.xsize();
+  int64_t ysize2 = down.ysize();
+
+  for (size_t y = 0; y < down.ysize(); y++) {
+    const float* row_mask = mask.Row(y);
+    float* row_out = down.Row(y);
+    for (size_t x = 0; x < down.xsize(); x++) {
+      float v = down.Row(y)[x];
+      float min = initial.Row(y)[x];
+      float max = initial.Row(y)[x];
+      for (int64_t yi = -1; yi < 2; yi++) {
+        for (int64_t xi = -1; xi < 2; xi++) {
+          int64_t x2 = (int64_t)x + xi;
+          int64_t y2 = (int64_t)y + yi;
+          if (x2 < 0 || y2 < 0 || x2 >= (int64_t)xsize2 ||
+              y2 >= (int64_t)ysize2)
+            continue;
+          min = std::min<float>(min, initial.Row(y2)[x2]);
+          max = std::max<float>(max, initial.Row(y2)[x2]);
+        }
+      }
+
+      row_out[x] = v;
+
+      // Clamp the pixel within the value  of a small area to prevent ringning.
+      // The mask determines how much to clamp, clamp more to reduce more
+      // ringing in smooth areas, clamp less in noisy areas to get more
+      // sharpness. Higher mask_multiplier gives less clamping, so less
+      // ringing reduction.
+      const constexpr float mask_multiplier = 2;
+      float a = row_mask[x] * mask_multiplier;
+      float clip_min = min - a;
+      float clip_max = max + a;
+      if (row_out[x] < clip_min) row_out[x] = clip_min;
+      if (row_out[x] > clip_max) row_out[x] = clip_max;
+    }
+  }
+}
+
+// TODO(lode): move this to a separate file enc_downsample.cc
+void DownsampleImage2_Iterative(const ImageF& orig, ImageF* output) {
+  int64_t xsize = orig.xsize();
+  int64_t ysize = orig.ysize();
+  int64_t xsize2 = DivCeil(orig.xsize(), 2);
+  int64_t ysize2 = DivCeil(orig.ysize(), 2);
+
+  ImageF box_downsample = CopyImage(orig);
+  DownsampleImage(&box_downsample, 2);
+  ImageF mask(box_downsample.xsize(), box_downsample.ysize());
+  CreateMask(box_downsample, mask);
+
+  output->ShrinkTo(xsize2, ysize2);
+
+  // Initial result image using the sharper downsampling.
+  // Allocate extra space to avoid a reallocation when padding.
+  ImageF initial(DivCeil(orig.xsize(), 2) + kBlockDim,
+                 DivCeil(orig.ysize(), 2) + kBlockDim);
+  initial.ShrinkTo(initial.xsize() - kBlockDim, initial.ysize() - kBlockDim);
+  DownsampleImage2_Sharper(orig, &initial);
+
+  ImageF down = CopyImage(initial);
+  ImageF up(xsize, ysize);
+  ImageF corr(xsize, ysize);
+  ImageF corr2(xsize2, ysize2);
+
+  // In the weights map, relatively higher values will allow less ringing but
+  // also less sharpness. With all constant values, it optimizes equally
+  // everywhere. Even in this case, the weights2 computed from
+  // this is still used and differs at the borders of the image.
+  // TODO(lode): Make use of the weights field for anti-ringing and clamping,
+  // the values are all set to 1 for now, but it is intended to be used for
+  // reducing ringing based on the mask, and taking clamping into account.
+  ImageF weights(xsize, ysize);
+  for (size_t y = 0; y < weights.ysize(); y++) {
+    auto* row = weights.Row(y);
+    for (size_t x = 0; x < weights.xsize(); x++) {
+      row[x] = 1;
+    }
+  }
+  ImageF weights2(xsize2, ysize2);
+  AntiUpsample(weights, &weights2);
+
+  const size_t num_it = 3;
+  for (size_t it = 0; it < num_it; ++it) {
+    UpsampleImage(down, &up);
+    corr = LinComb<float>(1, orig, -1, up);
+    ElwiseMul(corr, weights, &corr);
+    AntiUpsample(corr, &corr2);
+    ElwiseDiv(corr2, weights2, &corr2);
+
+    down = LinComb<float>(1, down, 1, corr2);
+  }
+
+  ReduceRinging(initial, mask, down);
+
+  // can't just use CopyImage, because the output image was prepared with
+  // padding.
+  for (size_t y = 0; y < down.ysize(); y++) {
+    for (size_t x = 0; x < down.xsize(); x++) {
+      float v = down.Row(y)[x];
+      output->Row(y)[x] = v;
+    }
+  }
+}
+
+void DownsampleImage2_Iterative(Image3F* opsin) {
+  // Allocate extra space to avoid a reallocation when padding.
+  Image3F downsampled(DivCeil(opsin->xsize(), 2) + kBlockDim,
+                      DivCeil(opsin->ysize(), 2) + kBlockDim);
+  downsampled.ShrinkTo(downsampled.xsize() - kBlockDim,
+                       downsampled.ysize() - kBlockDim);
+
+  Image3F rgb(opsin->xsize(), opsin->ysize());
+  OpsinParams opsin_params;  // TODO: use the ones that are actually used
+  opsin_params.Init(kDefaultIntensityTarget);
+  OpsinToLinear(*opsin, Rect(rgb), nullptr, &rgb, opsin_params);
+
+  ImageF mask(opsin->xsize(), opsin->ysize());
+  ButteraugliParams butter_params;
+  ButteraugliComparator butter(rgb, butter_params);
+  butter.Mask(&mask);
+  ImageF mask_fuzzy(opsin->xsize(), opsin->ysize());
+
+  for (size_t c = 0; c < 3; c++) {
+    DownsampleImage2_Iterative(opsin->Plane(c), &downsampled.Plane(c));
+  }
+  *opsin = std::move(downsampled);
+}
+}  // namespace
+
+Status DefaultEncoderHeuristics::LossyFrameHeuristics(
+    PassesEncoderState* enc_state, ModularFrameEncoder* modular_frame_encoder,
+    const ImageBundle* original_pixels, Image3F* opsin,
+    const JxlCmsInterface& cms, ThreadPool* pool, AuxOut* aux_out) {
+  PROFILER_ZONE("JxlLossyFrameHeuristics uninstrumented");
+
+  CompressParams& cparams = enc_state->cparams;
+  PassesSharedState& shared = enc_state->shared;
+
+  // Compute parameters for noise synthesis.
+  if (shared.frame_header.flags & FrameHeader::kNoise) {
+    PROFILER_ZONE("enc GetNoiseParam");
+    if (cparams.photon_noise_iso == 0) {
+      // Don't start at zero amplitude since adding noise is expensive -- it
+      // significantly slows down decoding, and this is unlikely to
+      // completely go away even with advanced optimizations. After the
+      // kNoiseModelingRampUpDistanceRange we have reached the full level,
+      // i.e. noise is no longer represented by the compressed image, so we
+      // can add full noise by the noise modeling itself.
+      static const float kNoiseModelingRampUpDistanceRange = 0.6;
+      static const float kNoiseLevelAtStartOfRampUp = 0.25;
+      static const float kNoiseRampupStart = 1.0;
+      // TODO(user) test and properly select quality_coef with smooth
+      // filter
+      float quality_coef = 1.0f;
+      const float rampup = (cparams.butteraugli_distance - kNoiseRampupStart) /
+                           kNoiseModelingRampUpDistanceRange;
+      if (rampup < 1.0f) {
+        quality_coef = kNoiseLevelAtStartOfRampUp +
+                       (1.0f - kNoiseLevelAtStartOfRampUp) * rampup;
+      }
+      if (rampup < 0.0f) {
+        quality_coef = kNoiseRampupStart;
+      }
+      if (!GetNoiseParameter(*opsin, &shared.image_features.noise_params,
+                             quality_coef)) {
+        shared.frame_header.flags &= ~FrameHeader::kNoise;
+      }
+    }
+  }
+  if (enc_state->shared.frame_header.upsampling != 1 &&
+      !cparams.already_downsampled) {
+    // In VarDCT mode, LossyFrameHeuristics takes care of running downsampling
+    // after noise, if necessary.
+    if (cparams.resampling == 2) {
+      // TODO(lode): use the regular DownsampleImage, or adapt to the custom
+      // coefficients, if there is are custom upscaling coefficients in
+      // CustomTransformData
+      if (cparams.speed_tier <= SpeedTier::kSquirrel) {
+        // TODO(lode): DownsampleImage2_Iterative is currently too slow to
+        // be used for squirrel, make it faster, and / or enable it only for
+        // kitten.
+        DownsampleImage2_Iterative(opsin);
+      } else {
+        DownsampleImage2_Sharper(opsin);
+      }
+    } else {
+      DownsampleImage(opsin, cparams.resampling);
+    }
+    PadImageToBlockMultipleInPlace(opsin);
+  }
+
+  if (cparams.butteraugli_distance < 0) {
+    return JXL_FAILURE("Expected non-negative distance");
+  }
+
+  // Find and subtract splines.
+  if (cparams.speed_tier <= SpeedTier::kSquirrel) {
+    // If we do already have them, they were passed upstream to EncodeFile.
+    if (!shared.image_features.splines.HasAny()) {
+      shared.image_features.splines = FindSplines(*opsin);
+    }
+    JXL_RETURN_IF_ERROR(shared.image_features.splines.InitializeDrawCache(
+        opsin->xsize(), opsin->ysize(), shared.cmap));
+    shared.image_features.splines.SubtractFrom(opsin);
+  }
+
+  // Find and subtract patches/dots.
+  if (ApplyOverride(cparams.patches,
+                    cparams.speed_tier <= SpeedTier::kSquirrel)) {
+    FindBestPatchDictionary(*opsin, enc_state, cms, pool, aux_out);
+    PatchDictionaryEncoder::SubtractFrom(shared.image_features.patches, opsin);
+  }
+
+  static const float kAcQuant = 0.79f;
+  const float quant_dc = InitialQuantDC(cparams.butteraugli_distance);
+  Quantizer& quantizer = enc_state->shared.quantizer;
+  // We don't know the quant field yet, but for computing the global scale
+  // assuming that it will be the same as for Falcon mode is good enough.
+  quantizer.ComputeGlobalScaleAndQuant(
+      quant_dc, kAcQuant / cparams.butteraugli_distance, 0);
+
+  // TODO(veluca): we can now run all the code from here to FindBestQuantizer
+  // (excluded) one rect at a time. Do that.
+
+  // Dependency graph:
+  //
+  // input: either XYB or input image
+  //
+  // input image -> XYB [optional]
+  // XYB -> initial quant field
+  // XYB -> Gaborished XYB
+  // Gaborished XYB -> CfL1
+  // initial quant field, Gaborished XYB, CfL1 -> ACS
+  // initial quant field, ACS, Gaborished XYB -> EPF control field
+  // initial quant field -> adjusted initial quant field
+  // adjusted initial quant field, ACS -> raw quant field
+  // raw quant field, ACS, Gaborished XYB -> CfL2
+  //
+  // output: Gaborished XYB, CfL, ACS, raw quant field, EPF control field.
+
+  ArControlFieldHeuristics ar_heuristics;
+  AcStrategyHeuristics acs_heuristics;
+  CfLHeuristics cfl_heuristics;
+
+  if (!opsin->xsize()) {
+    JXL_ASSERT(HandlesColorConversion(cparams, *original_pixels));
+    *opsin = Image3F(RoundUpToBlockDim(original_pixels->xsize()),
+                     RoundUpToBlockDim(original_pixels->ysize()));
+    opsin->ShrinkTo(original_pixels->xsize(), original_pixels->ysize());
+    ToXYB(*original_pixels, pool, opsin, cms, /*linear=*/nullptr);
+    PadImageToBlockMultipleInPlace(opsin);
+  }
+
+  // Compute an initial estimate of the quantization field.
+  // Call InitialQuantField only in Hare mode or slower. Otherwise, rely
+  // on simple heuristics in FindBestAcStrategy, or set a constant for Falcon
+  // mode.
+  if (cparams.speed_tier > SpeedTier::kHare || cparams.uniform_quant > 0) {
+    enc_state->initial_quant_field =
+        ImageF(shared.frame_dim.xsize_blocks, shared.frame_dim.ysize_blocks);
+    enc_state->initial_quant_masking =
+        ImageF(shared.frame_dim.xsize_blocks, shared.frame_dim.ysize_blocks);
+    float q = cparams.uniform_quant > 0
+                  ? cparams.uniform_quant
+                  : kAcQuant / cparams.butteraugli_distance;
+    FillImage(q, &enc_state->initial_quant_field);
+    FillImage(1.0f / (q + 0.001f), &enc_state->initial_quant_masking);
+  } else {
+    // Call this here, as it relies on pre-gaborish values.
+    float butteraugli_distance_for_iqf = cparams.butteraugli_distance;
+    if (!shared.frame_header.loop_filter.gab) {
+      butteraugli_distance_for_iqf *= 0.73f;
+    }
+    enc_state->initial_quant_field = InitialQuantField(
+        butteraugli_distance_for_iqf, *opsin, shared.frame_dim, pool, 1.0f,
+        &enc_state->initial_quant_masking);
+    quantizer.SetQuantField(quant_dc, enc_state->initial_quant_field, nullptr);
+  }
+
+  // TODO(veluca): do something about animations.
+
+  // Apply inverse-gaborish.
+  if (shared.frame_header.loop_filter.gab) {
+    // Unsure why better to do some more gaborish on X and B than Y.
+    float weight[3] = {
+        1.0036278514398933f,
+        0.99406123118127299f,
+        0.99719338015886894f,
+    };
+    GaborishInverse(opsin, weight, pool);
+  }
+
+  FindBestDequantMatrices(cparams, *opsin, modular_frame_encoder,
+                          &enc_state->shared.matrices);
+
+  cfl_heuristics.Init(*opsin);
+  acs_heuristics.Init(*opsin, enc_state);
+
+  auto process_tile = [&](const uint32_t tid, const size_t thread) {
+    size_t n_enc_tiles =
+        DivCeil(enc_state->shared.frame_dim.xsize_blocks, kEncTileDimInBlocks);
+    size_t tx = tid % n_enc_tiles;
+    size_t ty = tid / n_enc_tiles;
+    size_t by0 = ty * kEncTileDimInBlocks;
+    size_t by1 = std::min((ty + 1) * kEncTileDimInBlocks,
+                          enc_state->shared.frame_dim.ysize_blocks);
+    size_t bx0 = tx * kEncTileDimInBlocks;
+    size_t bx1 = std::min((tx + 1) * kEncTileDimInBlocks,
+                          enc_state->shared.frame_dim.xsize_blocks);
+    Rect r(bx0, by0, bx1 - bx0, by1 - by0);
+
+    // For speeds up to Wombat, we only compute the color correlation map
+    // once we know the transform type and the quantization map.
+    if (cparams.speed_tier <= SpeedTier::kSquirrel) {
+      cfl_heuristics.ComputeTile(r, *opsin, enc_state->shared.matrices,
+                                 /*ac_strategy=*/nullptr,
+                                 /*raw_quant_field=*/nullptr,
+                                 /*quantizer=*/nullptr, /*fast=*/false, thread,
+                                 &enc_state->shared.cmap);
+    }
+
+    // Choose block sizes.
+    acs_heuristics.ProcessRect(r);
+
+    // Choose amount of post-processing smoothing.
+    // TODO(veluca): should this go *after* AdjustQuantField?
+    ar_heuristics.RunRect(r, *opsin, enc_state, thread);
+
+    // Always set the initial quant field, so we can compute the CfL map with
+    // more accuracy. The initial quant field might change in slower modes, but
+    // adjusting the quant field with butteraugli when all the other encoding
+    // parameters are fixed is likely a more reliable choice anyway.
+    AdjustQuantField(enc_state->shared.ac_strategy, r,
+                     &enc_state->initial_quant_field);
+    quantizer.SetQuantFieldRect(enc_state->initial_quant_field, r,
+                                &enc_state->shared.raw_quant_field);
+
+    // Compute a non-default CfL map if we are at Hare speed, or slower.
+    if (cparams.speed_tier <= SpeedTier::kHare) {
+      cfl_heuristics.ComputeTile(
+          r, *opsin, enc_state->shared.matrices, &enc_state->shared.ac_strategy,
+          &enc_state->shared.raw_quant_field, &enc_state->shared.quantizer,
+          /*fast=*/cparams.speed_tier >= SpeedTier::kWombat, thread,
+          &enc_state->shared.cmap);
+    }
+  };
+  JXL_RETURN_IF_ERROR(RunOnPool(
+      pool, 0,
+      DivCeil(enc_state->shared.frame_dim.xsize_blocks, kEncTileDimInBlocks) *
+          DivCeil(enc_state->shared.frame_dim.ysize_blocks,
+                  kEncTileDimInBlocks),
+      [&](const size_t num_threads) {
+        ar_heuristics.PrepareForThreads(num_threads);
+        cfl_heuristics.PrepareForThreads(num_threads);
+        return true;
+      },
+      process_tile, "Enc Heuristics"));
+
+  acs_heuristics.Finalize(aux_out);
+  if (cparams.speed_tier <= SpeedTier::kHare) {
+    cfl_heuristics.ComputeDC(/*fast=*/cparams.speed_tier >= SpeedTier::kWombat,
+                             &enc_state->shared.cmap);
+  }
+
+  // Refine quantization levels.
+  FindBestQuantizer(original_pixels, *opsin, enc_state, cms, pool, aux_out);
+
+  // Choose a context model that depends on the amount of quantization for AC.
+  if (cparams.speed_tier < SpeedTier::kFalcon) {
+    FindBestBlockEntropyModel(*enc_state);
+  }
+  return true;
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/enc_heuristics.h b/third_party/jpeg-xl/lib/jxl/enc_heuristics.h
new file mode 100644
index 0000000000..3cb9b506a6
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_heuristics.h
@@ -0,0 +1,81 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_HEURISTICS_H_
+#define LIB_JXL_ENC_HEURISTICS_H_
+
+// Hook for custom encoder heuristics (VarDCT only for now).
+
+#include <jxl/cms_interface.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include <string>
+
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/modular/encoding/enc_ma.h"
+
+namespace jxl {
+
+struct AuxOut;
+struct PassesEncoderState;
+class DequantMatrices;
+class ImageBundle;
+class ModularFrameEncoder;
+
+class EncoderHeuristics {
+ public:
+  virtual ~EncoderHeuristics() = default;
+  // Initializes encoder structures in `enc_state` using the original image data
+  // in `original_pixels`, and the XYB image data in `opsin`. Also modifies the
+  // `opsin` image by applying Gaborish, and doing other modifications if
+  // necessary. `pool` is used for running the computations on multiple threads.
+  // `aux_out` collects statistics and can be used to print debug images.
+  virtual Status LossyFrameHeuristics(
+      PassesEncoderState* enc_state, ModularFrameEncoder* modular_frame_encoder,
+      const ImageBundle* original_pixels, Image3F* opsin,
+      const JxlCmsInterface& cms, ThreadPool* pool, AuxOut* aux_out) = 0;
+
+  // Custom fixed tree for lossless mode. Must set `tree` to a valid tree if
+  // the function returns true.
+  virtual bool CustomFixedTreeLossless(const FrameDimensions& frame_dim,
+                                       Tree* tree) {
+    return false;
+  }
+
+  // If this method returns `true`, the `opsin` parameter to
+  // LossyFrameHeuristics will not be initialized, and should be initialized
+  // during the call. Moreover, `original_pixels` may not be in a linear
+  // colorspace (but will be the same as the `ib` value passed to this
+  // function).
+  virtual bool HandlesColorConversion(const CompressParams& cparams,
+                                      const ImageBundle& ib) {
+    return false;
+  }
+};
+
+class DefaultEncoderHeuristics : public EncoderHeuristics {
+ public:
+  Status LossyFrameHeuristics(PassesEncoderState* enc_state,
+                              ModularFrameEncoder* modular_frame_encoder,
+                              const ImageBundle* original_pixels,
+                              Image3F* opsin, const JxlCmsInterface& cms,
+                              ThreadPool* pool, AuxOut* aux_out) override;
+  bool HandlesColorConversion(const CompressParams& cparams,
+                              const ImageBundle& ib) override;
+};
+
+// Exposed here since it may be used by other EncoderHeuristics implementations
+// outside this project.
+void FindBestDequantMatrices(const CompressParams& cparams,
+                             const Image3F& opsin,
+                             ModularFrameEncoder* modular_frame_encoder,
+                             DequantMatrices* dequant_matrices);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_HEURISTICS_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_huffman.cc b/third_party/jpeg-xl/lib/jxl/enc_huffman.cc
new file mode 100644
index 0000000000..3eab2c218a
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_huffman.cc
@@ -0,0 +1,214 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_huffman.h"
+
+#include <algorithm>
+#include <memory>
+
+#include "lib/jxl/enc_huffman_tree.h"
+
+namespace jxl {
+
+namespace {
+
+constexpr int kCodeLengthCodes = 18;
+
+void StoreHuffmanTreeOfHuffmanTreeToBitMask(const int num_codes,
+                                            const uint8_t* code_length_bitdepth,
+                                            BitWriter* writer) {
+  static const uint8_t kStorageOrder[kCodeLengthCodes] = {
+      1, 2, 3, 4, 0, 5, 17, 6, 16, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+  // The bit lengths of the Huffman code over the code length alphabet
+  // are compressed with the following static Huffman code:
+  //   Symbol   Code
+  //   ------   ----
+  //   0          00
+  //   1        1110
+  //   2         110
+  //   3          01
+  //   4          10
+  //   5        1111
+  static const uint8_t kHuffmanBitLengthHuffmanCodeSymbols[6] = {0, 7, 3,
+                                                                 2, 1, 15};
+  static const uint8_t kHuffmanBitLengthHuffmanCodeBitLengths[6] = {2, 4, 3,
+                                                                    2, 2, 4};
+
+  // Throw away trailing zeros:
+  size_t codes_to_store = kCodeLengthCodes;
+  if (num_codes > 1) {
+    for (; codes_to_store > 0; --codes_to_store) {
+      if (code_length_bitdepth[kStorageOrder[codes_to_store - 1]] != 0) {
+        break;
+      }
+    }
+  }
+  size_t skip_some = 0;  // skips none.
+  if (code_length_bitdepth[kStorageOrder[0]] == 0 &&
+      code_length_bitdepth[kStorageOrder[1]] == 0) {
+    skip_some = 2;  // skips two.
+    if (code_length_bitdepth[kStorageOrder[2]] == 0) {
+      skip_some = 3;  // skips three.
+    }
+  }
+  writer->Write(2, skip_some);
+  for (size_t i = skip_some; i < codes_to_store; ++i) {
+    size_t l = code_length_bitdepth[kStorageOrder[i]];
+    writer->Write(kHuffmanBitLengthHuffmanCodeBitLengths[l],
+                  kHuffmanBitLengthHuffmanCodeSymbols[l]);
+  }
+}
+
+void StoreHuffmanTreeToBitMask(const size_t huffman_tree_size,
+                               const uint8_t* huffman_tree,
+                               const uint8_t* huffman_tree_extra_bits,
+                               const uint8_t* code_length_bitdepth,
+                               const uint16_t* code_length_bitdepth_symbols,
+                               BitWriter* writer) {
+  for (size_t i = 0; i < huffman_tree_size; ++i) {
+    size_t ix = huffman_tree[i];
+    writer->Write(code_length_bitdepth[ix], code_length_bitdepth_symbols[ix]);
+    // Extra bits
+    switch (ix) {
+      case 16:
+        writer->Write(2, huffman_tree_extra_bits[i]);
+        break;
+      case 17:
+        writer->Write(3, huffman_tree_extra_bits[i]);
+        break;
+    }
+  }
+}
+
+void StoreSimpleHuffmanTree(const uint8_t* depths, size_t symbols[4],
+                            size_t num_symbols, size_t max_bits,
+                            BitWriter* writer) {
+  // value of 1 indicates a simple Huffman code
+  writer->Write(2, 1);
+  writer->Write(2, num_symbols - 1);  // NSYM - 1
+
+  // Sort
+  for (size_t i = 0; i < num_symbols; i++) {
+    for (size_t j = i + 1; j < num_symbols; j++) {
+      if (depths[symbols[j]] < depths[symbols[i]]) {
+        std::swap(symbols[j], symbols[i]);
+      }
+    }
+  }
+
+  if (num_symbols == 2) {
+    writer->Write(max_bits, symbols[0]);
+    writer->Write(max_bits, symbols[1]);
+  } else if (num_symbols == 3) {
+    writer->Write(max_bits, symbols[0]);
+    writer->Write(max_bits, symbols[1]);
+    writer->Write(max_bits, symbols[2]);
+  } else {
+    writer->Write(max_bits, symbols[0]);
+    writer->Write(max_bits, symbols[1]);
+    writer->Write(max_bits, symbols[2]);
+    writer->Write(max_bits, symbols[3]);
+    // tree-select
+    writer->Write(1, depths[symbols[0]] == 1 ? 1 : 0);
+  }
+}
+
+// num = alphabet size
+// depths = symbol depths
+void StoreHuffmanTree(const uint8_t* depths, size_t num, BitWriter* writer) {
+  // Write the Huffman tree into the compact representation.
+  std::unique_ptr<uint8_t[]> arena(new uint8_t[2 * num]);
+  uint8_t* huffman_tree = arena.get();
+  uint8_t* huffman_tree_extra_bits = arena.get() + num;
+  size_t huffman_tree_size = 0;
+  WriteHuffmanTree(depths, num, &huffman_tree_size, huffman_tree,
+                   huffman_tree_extra_bits);
+
+  // Calculate the statistics of the Huffman tree in the compact representation.
+  uint32_t huffman_tree_histogram[kCodeLengthCodes] = {0};
+  for (size_t i = 0; i < huffman_tree_size; ++i) {
+    ++huffman_tree_histogram[huffman_tree[i]];
+  }
+
+  int num_codes = 0;
+  int code = 0;
+  for (int i = 0; i < kCodeLengthCodes; ++i) {
+    if (huffman_tree_histogram[i]) {
+      if (num_codes == 0) {
+        code = i;
+        num_codes = 1;
+      } else if (num_codes == 1) {
+        num_codes = 2;
+        break;
+      }
+    }
+  }
+
+  // Calculate another Huffman tree to use for compressing both the
+  // earlier Huffman tree with.
+  uint8_t code_length_bitdepth[kCodeLengthCodes] = {0};
+  uint16_t code_length_bitdepth_symbols[kCodeLengthCodes] = {0};
+  CreateHuffmanTree(&huffman_tree_histogram[0], kCodeLengthCodes, 5,
+                    &code_length_bitdepth[0]);
+  ConvertBitDepthsToSymbols(code_length_bitdepth, kCodeLengthCodes,
+                            &code_length_bitdepth_symbols[0]);
+
+  // Now, we have all the data, let's start storing it
+  StoreHuffmanTreeOfHuffmanTreeToBitMask(num_codes, code_length_bitdepth,
+                                         writer);
+
+  if (num_codes == 1) {
+    code_length_bitdepth[code] = 0;
+  }
+
+  // Store the real huffman tree now.
+  StoreHuffmanTreeToBitMask(huffman_tree_size, huffman_tree,
+                            huffman_tree_extra_bits, &code_length_bitdepth[0],
+                            code_length_bitdepth_symbols, writer);
+}
+
+}  // namespace
+
+void BuildAndStoreHuffmanTree(const uint32_t* histogram, const size_t length,
+                              uint8_t* depth, uint16_t* bits,
+                              BitWriter* writer) {
+  size_t count = 0;
+  size_t s4[4] = {0};
+  for (size_t i = 0; i < length; i++) {
+    if (histogram[i]) {
+      if (count < 4) {
+        s4[count] = i;
+      } else if (count > 4) {
+        break;
+      }
+      count++;
+    }
+  }
+
+  size_t max_bits_counter = length - 1;
+  size_t max_bits = 0;
+  while (max_bits_counter) {
+    max_bits_counter >>= 1;
+    ++max_bits;
+  }
+
+  if (count <= 1) {
+    // Output symbol bits and depths are initialized with 0, nothing to do.
+    writer->Write(4, 1);
+    writer->Write(max_bits, s4[0]);
+    return;
+  }
+
+  CreateHuffmanTree(histogram, length, 15, depth);
+  ConvertBitDepthsToSymbols(depth, length, bits);
+
+  if (count <= 4) {
+    StoreSimpleHuffmanTree(depth, s4, count, max_bits, writer);
+  } else {
+    StoreHuffmanTree(depth, length, writer);
+  }
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/enc_huffman.h b/third_party/jpeg-xl/lib/jxl/enc_huffman.h
new file mode 100644
index 0000000000..d7a66584e8
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_huffman.h
@@ -0,0 +1,22 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_HUFFMAN_H_
+#define LIB_JXL_ENC_HUFFMAN_H_
+
+#include "lib/jxl/enc_bit_writer.h"
+
+namespace jxl {
+
+// Builds a Huffman tree for the given histogram, and encodes it into writer
+// in a format that can be read by HuffmanDecodingData::ReadFromBitstream.
+// An allotment for `writer` must already have been created by the caller.
+void BuildAndStoreHuffmanTree(const uint32_t* histogram, size_t length,
+                              uint8_t* depth, uint16_t* bits,
+                              BitWriter* writer);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_HUFFMAN_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_huffman_tree.cc b/third_party/jpeg-xl/lib/jxl/enc_huffman_tree.cc
new file mode 100644
index 0000000000..5c40dea770
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_huffman_tree.cc
@@ -0,0 +1,328 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_huffman_tree.h"
+
+#include <algorithm>
+#include <limits>
+#include <vector>
+
+#include "lib/jxl/base/status.h"
+
+namespace jxl {
+
+void SetDepth(const HuffmanTree& p, HuffmanTree* pool, uint8_t* depth,
+              uint8_t level) {
+  if (p.index_left >= 0) {
+    ++level;
+    SetDepth(pool[p.index_left], pool, depth, level);
+    SetDepth(pool[p.index_right_or_value], pool, depth, level);
+  } else {
+    depth[p.index_right_or_value] = level;
+  }
+}
+
+// Sort the root nodes, least popular first.
+static JXL_INLINE bool Compare(const HuffmanTree& v0, const HuffmanTree& v1) {
+  return v0.total_count < v1.total_count;
+}
+
+// This function will create a Huffman tree.
+//
+// The catch here is that the tree cannot be arbitrarily deep.
+// Brotli specifies a maximum depth of 15 bits for "code trees"
+// and 7 bits for "code length code trees."
+//
+// count_limit is the value that is to be faked as the minimum value
+// and this minimum value is raised until the tree matches the
+// maximum length requirement.
+//
+// This algorithm is not of excellent performance for very long data blocks,
+// especially when population counts are longer than 2**tree_limit, but
+// we are not planning to use this with extremely long blocks.
+//
+// See http://en.wikipedia.org/wiki/Huffman_coding
+void CreateHuffmanTree(const uint32_t* data, const size_t length,
+                       const int tree_limit, uint8_t* depth) {
+  // For block sizes below 64 kB, we never need to do a second iteration
+  // of this loop. Probably all of our block sizes will be smaller than
+  // that, so this loop is mostly of academic interest. If we actually
+  // would need this, we would be better off with the Katajainen algorithm.
+  for (uint32_t count_limit = 1;; count_limit *= 2) {
+    std::vector<HuffmanTree> tree;
+    tree.reserve(2 * length + 1);
+
+    for (size_t i = length; i != 0;) {
+      --i;
+      if (data[i]) {
+        const uint32_t count = std::max(data[i], count_limit - 1);
+        tree.emplace_back(count, -1, static_cast<int16_t>(i));
+      }
+    }
+
+    const size_t n = tree.size();
+    if (n == 1) {
+      // Fake value; will be fixed on upper level.
+      depth[tree[0].index_right_or_value] = 1;
+      break;
+    }
+
+    std::stable_sort(tree.begin(), tree.end(), Compare);
+
+    // The nodes are:
+    // [0, n): the sorted leaf nodes that we start with.
+    // [n]: we add a sentinel here.
+    // [n + 1, 2n): new parent nodes are added here, starting from
+    //              (n+1). These are naturally in ascending order.
+    // [2n]: we add a sentinel at the end as well.
+    // There will be (2n+1) elements at the end.
+    const HuffmanTree sentinel(std::numeric_limits<uint32_t>::max(), -1, -1);
+    tree.push_back(sentinel);
+    tree.push_back(sentinel);
+
+    size_t i = 0;      // Points to the next leaf node.
+    size_t j = n + 1;  // Points to the next non-leaf node.
+    for (size_t k = n - 1; k != 0; --k) {
+      size_t left, right;
+      if (tree[i].total_count <= tree[j].total_count) {
+        left = i;
+        ++i;
+      } else {
+        left = j;
+        ++j;
+      }
+      if (tree[i].total_count <= tree[j].total_count) {
+        right = i;
+        ++i;
+      } else {
+        right = j;
+        ++j;
+      }
+
+      // The sentinel node becomes the parent node.
+      size_t j_end = tree.size() - 1;
+      tree[j_end].total_count =
+          tree[left].total_count + tree[right].total_count;
+      tree[j_end].index_left = static_cast<int16_t>(left);
+      tree[j_end].index_right_or_value = static_cast<int16_t>(right);
+
+      // Add back the last sentinel node.
+      tree.push_back(sentinel);
+    }
+    JXL_DASSERT(tree.size() == 2 * n + 1);
+    SetDepth(tree[2 * n - 1], &tree[0], depth, 0);
+
+    // We need to pack the Huffman tree in tree_limit bits.
+    // If this was not successful, add fake entities to the lowest values
+    // and retry.
+    if (*std::max_element(&depth[0], &depth[length]) <= tree_limit) {
+      break;
+    }
+  }
+}
+
+void Reverse(uint8_t* v, size_t start, size_t end) {
+  --end;
+  while (start < end) {
+    uint8_t tmp = v[start];
+    v[start] = v[end];
+    v[end] = tmp;
+    ++start;
+    --end;
+  }
+}
+
+void WriteHuffmanTreeRepetitions(const uint8_t previous_value,
+                                 const uint8_t value, size_t repetitions,
+                                 size_t* tree_size, uint8_t* tree,
+                                 uint8_t* extra_bits_data) {
+  JXL_DASSERT(repetitions > 0);
+  if (previous_value != value) {
+    tree[*tree_size] = value;
+    extra_bits_data[*tree_size] = 0;
+    ++(*tree_size);
+    --repetitions;
+  }
+  if (repetitions == 7) {
+    tree[*tree_size] = value;
+    extra_bits_data[*tree_size] = 0;
+    ++(*tree_size);
+    --repetitions;
+  }
+  if (repetitions < 3) {
+    for (size_t i = 0; i < repetitions; ++i) {
+      tree[*tree_size] = value;
+      extra_bits_data[*tree_size] = 0;
+      ++(*tree_size);
+    }
+  } else {
+    repetitions -= 3;
+    size_t start = *tree_size;
+    while (true) {
+      tree[*tree_size] = 16;
+      extra_bits_data[*tree_size] = repetitions & 0x3;
+      ++(*tree_size);
+      repetitions >>= 2;
+      if (repetitions == 0) {
+        break;
+      }
+      --repetitions;
+    }
+    Reverse(tree, start, *tree_size);
+    Reverse(extra_bits_data, start, *tree_size);
+  }
+}
+
+void WriteHuffmanTreeRepetitionsZeros(size_t repetitions, size_t* tree_size,
+                                      uint8_t* tree, uint8_t* extra_bits_data) {
+  if (repetitions == 11) {
+    tree[*tree_size] = 0;
+    extra_bits_data[*tree_size] = 0;
+    ++(*tree_size);
+    --repetitions;
+  }
+  if (repetitions < 3) {
+    for (size_t i = 0; i < repetitions; ++i) {
+      tree[*tree_size] = 0;
+      extra_bits_data[*tree_size] = 0;
+      ++(*tree_size);
+    }
+  } else {
+    repetitions -= 3;
+    size_t start = *tree_size;
+    while (true) {
+      tree[*tree_size] = 17;
+      extra_bits_data[*tree_size] = repetitions & 0x7;
+      ++(*tree_size);
+      repetitions >>= 3;
+      if (repetitions == 0) {
+        break;
+      }
+      --repetitions;
+    }
+    Reverse(tree, start, *tree_size);
+    Reverse(extra_bits_data, start, *tree_size);
+  }
+}
+
+static void DecideOverRleUse(const uint8_t* depth, const size_t length,
+                             bool* use_rle_for_non_zero,
+                             bool* use_rle_for_zero) {
+  size_t total_reps_zero = 0;
+  size_t total_reps_non_zero = 0;
+  size_t count_reps_zero = 1;
+  size_t count_reps_non_zero = 1;
+  for (size_t i = 0; i < length;) {
+    const uint8_t value = depth[i];
+    size_t reps = 1;
+    for (size_t k = i + 1; k < length && depth[k] == value; ++k) {
+      ++reps;
+    }
+    if (reps >= 3 && value == 0) {
+      total_reps_zero += reps;
+      ++count_reps_zero;
+    }
+    if (reps >= 4 && value != 0) {
+      total_reps_non_zero += reps;
+      ++count_reps_non_zero;
+    }
+    i += reps;
+  }
+  *use_rle_for_non_zero = total_reps_non_zero > count_reps_non_zero * 2;
+  *use_rle_for_zero = total_reps_zero > count_reps_zero * 2;
+}
+
+void WriteHuffmanTree(const uint8_t* depth, size_t length, size_t* tree_size,
+                      uint8_t* tree, uint8_t* extra_bits_data) {
+  uint8_t previous_value = 8;
+
+  // Throw away trailing zeros.
+  size_t new_length = length;
+  for (size_t i = 0; i < length; ++i) {
+    if (depth[length - i - 1] == 0) {
+      --new_length;
+    } else {
+      break;
+    }
+  }
+
+  // First gather statistics on if it is a good idea to do rle.
+  bool use_rle_for_non_zero = false;
+  bool use_rle_for_zero = false;
+  if (length > 50) {
+    // Find rle coding for longer codes.
+    // Shorter codes seem not to benefit from rle.
+    DecideOverRleUse(depth, new_length, &use_rle_for_non_zero,
+                     &use_rle_for_zero);
+  }
+
+  // Actual rle coding.
+  for (size_t i = 0; i < new_length;) {
+    const uint8_t value = depth[i];
+    size_t reps = 1;
+    if ((value != 0 && use_rle_for_non_zero) ||
+        (value == 0 && use_rle_for_zero)) {
+      for (size_t k = i + 1; k < new_length && depth[k] == value; ++k) {
+        ++reps;
+      }
+    }
+    if (value == 0) {
+      WriteHuffmanTreeRepetitionsZeros(reps, tree_size, tree, extra_bits_data);
+    } else {
+      WriteHuffmanTreeRepetitions(previous_value, value, reps, tree_size, tree,
+                                  extra_bits_data);
+      previous_value = value;
+    }
+    i += reps;
+  }
+}
+
+namespace {
+
+uint16_t ReverseBits(int num_bits, uint16_t bits) {
+  static const size_t kLut[16] = {// Pre-reversed 4-bit values.
+                                  0x0, 0x8, 0x4, 0xc, 0x2, 0xa, 0x6, 0xe,
+                                  0x1, 0x9, 0x5, 0xd, 0x3, 0xb, 0x7, 0xf};
+  size_t retval = kLut[bits & 0xf];
+  for (int i = 4; i < num_bits; i += 4) {
+    retval <<= 4;
+    bits = static_cast<uint16_t>(bits >> 4);
+    retval |= kLut[bits & 0xf];
+  }
+  retval >>= (-num_bits & 0x3);
+  return static_cast<uint16_t>(retval);
+}
+
+}  // namespace
+
+void ConvertBitDepthsToSymbols(const uint8_t* depth, size_t len,
+                               uint16_t* bits) {
+  // In Brotli, all bit depths are [1..15]
+  // 0 bit depth means that the symbol does not exist.
+  const int kMaxBits = 16;  // 0..15 are values for bits
+  uint16_t bl_count[kMaxBits] = {0};
+  {
+    for (size_t i = 0; i < len; ++i) {
+      ++bl_count[depth[i]];
+    }
+    bl_count[0] = 0;
+  }
+  uint16_t next_code[kMaxBits];
+  next_code[0] = 0;
+  {
+    int code = 0;
+    for (size_t i = 1; i < kMaxBits; ++i) {
+      code = (code + bl_count[i - 1]) << 1;
+      next_code[i] = static_cast<uint16_t>(code);
+    }
+  }
+  for (size_t i = 0; i < len; ++i) {
+    if (depth[i]) {
+      bits[i] = ReverseBits(depth[i], next_code[depth[i]]++);
+    }
+  }
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/enc_huffman_tree.h b/third_party/jpeg-xl/lib/jxl/enc_huffman_tree.h
new file mode 100644
index 0000000000..7d716cd3b5
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_huffman_tree.h
@@ -0,0 +1,52 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Library for creating Huffman codes from population counts.
+
+#ifndef LIB_JXL_HUFFMAN_TREE_H_
+#define LIB_JXL_HUFFMAN_TREE_H_
+
+#include <stdint.h>
+#include <stdlib.h>
+
+namespace jxl {
+
+// A node of a Huffman tree.
+struct HuffmanTree {
+  HuffmanTree(uint32_t count, int16_t left, int16_t right)
+      : total_count(count), index_left(left), index_right_or_value(right) {}
+  uint32_t total_count;
+  int16_t index_left;
+  int16_t index_right_or_value;
+};
+
+void SetDepth(const HuffmanTree& p, HuffmanTree* pool, uint8_t* depth,
+              uint8_t level);
+
+// This function will create a Huffman tree.
+//
+// The (data,length) contains the population counts.
+// The tree_limit is the maximum bit depth of the Huffman codes.
+//
+// The depth contains the tree, i.e., how many bits are used for
+// the symbol.
+//
+// See http://en.wikipedia.org/wiki/Huffman_coding
+void CreateHuffmanTree(const uint32_t* data, size_t length, int tree_limit,
+                       uint8_t* depth);
+
+// Write a Huffman tree from bit depths into the bitstream representation
+// of a Huffman tree. The generated Huffman tree is to be compressed once
+// more using a Huffman tree
+void WriteHuffmanTree(const uint8_t* depth, size_t length, size_t* tree_size,
+                      uint8_t* tree, uint8_t* extra_bits_data);
+
+// Get the actual bit values for a tree of bit depths.
+void ConvertBitDepthsToSymbols(const uint8_t* depth, size_t len,
+                               uint16_t* bits);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_HUFFMAN_TREE_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_icc_codec.cc b/third_party/jpeg-xl/lib/jxl/enc_icc_codec.cc
new file mode 100644
index 0000000000..a6782f6a45
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_icc_codec.cc
@@ -0,0 +1,406 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_icc_codec.h"
+
+#include <stdint.h>
+
+#include <map>
+#include <string>
+#include <vector>
+
+#include "lib/jxl/base/byte_order.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/enc_ans.h"
+#include "lib/jxl/enc_aux_out.h"
+#include "lib/jxl/fields.h"
+#include "lib/jxl/icc_codec_common.h"
+
+namespace jxl {
+namespace {
+
+// Unshuffles or de-interleaves bytes, for example with width 2, turns
+// "AaBbCcDc" into "ABCDabcd", this for example de-interleaves UTF-16 bytes into
+// first all the high order bytes, then all the low order bytes.
+// Transposes a matrix of width columns and ceil(size / width) rows. There are
+// size elements, size may be < width * height, if so the
+// last elements of the bottom row are missing, the missing spots are
+// transposed along with the filled spots, and the result has the missing
+// elements at the bottom of the rightmost column. The input is the input matrix
+// in scanline order, the output is the result matrix in scanline order, with
+// missing elements skipped over (this may occur at multiple positions).
+void Unshuffle(uint8_t* data, size_t size, size_t width) {
+  size_t height = (size + width - 1) / width;  // amount of rows of input
+  PaddedBytes result(size);
+  // i = input index, j output index
+  size_t s = 0, j = 0;
+  for (size_t i = 0; i < size; i++) {
+    result[j] = data[i];
+    j += height;
+    if (j >= size) j = ++s;
+  }
+
+  for (size_t i = 0; i < size; i++) {
+    data[i] = result[i];
+  }
+}
+
+// This is performed by the encoder, the encoder must be able to encode any
+// random byte stream (not just byte streams that are a valid ICC profile), so
+// an error returned by this function is an implementation error.
+Status PredictAndShuffle(size_t stride, size_t width, int order, size_t num,
+                         const uint8_t* data, size_t size, size_t* pos,
+                         PaddedBytes* result) {
+  JXL_RETURN_IF_ERROR(CheckOutOfBounds(*pos, num, size));
+  // Required by the specification, see decoder. stride * 4 must be < *pos.
+  if (!*pos || ((*pos - 1u) >> 2u) < stride) {
+    return JXL_FAILURE("Invalid stride");
+  }
+  if (*pos < stride * 4) return JXL_FAILURE("Too large stride");
+  size_t start = result->size();
+  for (size_t i = 0; i < num; i++) {
+    uint8_t predicted =
+        LinearPredictICCValue(data, *pos, i, stride, width, order);
+    result->push_back(data[*pos + i] - predicted);
+  }
+  *pos += num;
+  if (width > 1) Unshuffle(result->data() + start, num, width);
+  return true;
+}
+}  // namespace
+
+// Outputs a transformed form of the given icc profile. The result itself is
+// not particularly smaller than the input data in bytes, but it will be in a
+// form that is easier to compress (more zeroes, ...) and will compress better
+// with brotli.
+Status PredictICC(const uint8_t* icc, size_t size, PaddedBytes* result) {
+  PaddedBytes commands;
+  PaddedBytes data;
+
+  EncodeVarInt(size, result);
+
+  // Header
+  PaddedBytes header = ICCInitialHeaderPrediction();
+  EncodeUint32(0, size, &header);
+  for (size_t i = 0; i < kICCHeaderSize && i < size; i++) {
+    ICCPredictHeader(icc, size, header.data(), i);
+    data.push_back(icc[i] - header[i]);
+  }
+  if (size <= kICCHeaderSize) {
+    EncodeVarInt(0, result);  // 0 commands
+    for (size_t i = 0; i < data.size(); i++) {
+      result->push_back(data[i]);
+    }
+    return true;
+  }
+
+  std::vector<Tag> tags;
+  std::vector<size_t> tagstarts;
+  std::vector<size_t> tagsizes;
+  std::map<size_t, size_t> tagmap;
+
+  // Tag list
+  size_t pos = kICCHeaderSize;
+  if (pos + 4 <= size) {
+    uint64_t numtags = DecodeUint32(icc, size, pos);
+    pos += 4;
+    EncodeVarInt(numtags + 1, &commands);
+    uint64_t prevtagstart = kICCHeaderSize + numtags * 12;
+    uint32_t prevtagsize = 0;
+    for (size_t i = 0; i < numtags; i++) {
+      if (pos + 12 > size) break;
+
+      Tag tag = DecodeKeyword(icc, size, pos + 0);
+      uint32_t tagstart = DecodeUint32(icc, size, pos + 4);
+      uint32_t tagsize = DecodeUint32(icc, size, pos + 8);
+      pos += 12;
+
+      tags.push_back(tag);
+      tagstarts.push_back(tagstart);
+      tagsizes.push_back(tagsize);
+      tagmap[tagstart] = tags.size() - 1;
+
+      uint8_t tagcode = kCommandTagUnknown;
+      for (size_t j = 0; j < kNumTagStrings; j++) {
+        if (tag == *kTagStrings[j]) {
+          tagcode = j + kCommandTagStringFirst;
+          break;
+        }
+      }
+
+      if (tag == kRtrcTag && pos + 24 < size) {
+        bool ok = true;
+        ok &= DecodeKeyword(icc, size, pos + 0) == kGtrcTag;
+        ok &= DecodeKeyword(icc, size, pos + 12) == kBtrcTag;
+        if (ok) {
+          for (size_t kk = 0; kk < 8; kk++) {
+            if (icc[pos - 8 + kk] != icc[pos + 4 + kk]) ok = false;
+            if (icc[pos - 8 + kk] != icc[pos + 16 + kk]) ok = false;
+          }
+        }
+        if (ok) {
+          tagcode = kCommandTagTRC;
+          pos += 24;
+          i += 2;
+        }
+      }
+
+      if (tag == kRxyzTag && pos + 24 < size) {
+        bool ok = true;
+        ok &= DecodeKeyword(icc, size, pos + 0) == kGxyzTag;
+        ok &= DecodeKeyword(icc, size, pos + 12) == kBxyzTag;
+        uint32_t offsetr = tagstart;
+        uint32_t offsetg = DecodeUint32(icc, size, pos + 4);
+        uint32_t offsetb = DecodeUint32(icc, size, pos + 16);
+        uint32_t sizer = tagsize;
+        uint32_t sizeg = DecodeUint32(icc, size, pos + 8);
+        uint32_t sizeb = DecodeUint32(icc, size, pos + 20);
+        ok &= sizer == 20;
+        ok &= sizeg == 20;
+        ok &= sizeb == 20;
+        ok &= (offsetg == offsetr + 20);
+        ok &= (offsetb == offsetr + 40);
+        if (ok) {
+          tagcode = kCommandTagXYZ;
+          pos += 24;
+          i += 2;
+        }
+      }
+
+      uint8_t command = tagcode;
+      uint64_t predicted_tagstart = prevtagstart + prevtagsize;
+      if (predicted_tagstart != tagstart) command |= kFlagBitOffset;
+      size_t predicted_tagsize = prevtagsize;
+      if (tag == kRxyzTag || tag == kGxyzTag || tag == kBxyzTag ||
+          tag == kKxyzTag || tag == kWtptTag || tag == kBkptTag ||
+          tag == kLumiTag) {
+        predicted_tagsize = 20;
+      }
+      if (predicted_tagsize != tagsize) command |= kFlagBitSize;
+      commands.push_back(command);
+      if (tagcode == 1) {
+        AppendKeyword(tag, &data);
+      }
+      if (command & kFlagBitOffset) EncodeVarInt(tagstart, &commands);
+      if (command & kFlagBitSize) EncodeVarInt(tagsize, &commands);
+
+      prevtagstart = tagstart;
+      prevtagsize = tagsize;
+    }
+  }
+  // Indicate end of tag list or varint indicating there's none
+  commands.push_back(0);
+
+  // Main content
+  // The main content in a valid ICC profile contains tagged elements, with the
+  // tag types (4 letter names) given by the tag list above, and the tag list
+  // pointing to the start and indicating the size of each tagged element. It is
+  // allowed for tagged elements to overlap, e.g. the curve for R, G and B could
+  // all point to the same one.
+  Tag tag;
+  size_t tagstart = 0, tagsize = 0, clutstart = 0;
+
+  size_t last0 = pos;
+  // This loop appends commands to the output, processing some sub-section of a
+  // current tagged element each time. We need to keep track of the tagtype of
+  // the current element, and update it when we encounter the boundary of a
+  // next one.
+  // It is not required that the input data is a valid ICC profile, if the
+  // encoder does not recognize the data it will still be able to output bytes
+  // but will not predict as well.
+  while (pos <= size) {
+    size_t last1 = pos;
+    PaddedBytes commands_add;
+    PaddedBytes data_add;
+
+    // This means the loop brought the position beyond the tag end.
+    if (pos > tagstart + tagsize) {
+      tag = {{0, 0, 0, 0}};  // nonsensical value
+    }
+
+    if (commands_add.empty() && data_add.empty() && tagmap.count(pos) &&
+        pos + 4 <= size) {
+      size_t index = tagmap[pos];
+      tag = DecodeKeyword(icc, size, pos);
+      tagstart = tagstarts[index];
+      tagsize = tagsizes[index];
+
+      if (tag == kMlucTag && pos + tagsize <= size && tagsize > 8 &&
+          icc[pos + 4] == 0 && icc[pos + 5] == 0 && icc[pos + 6] == 0 &&
+          icc[pos + 7] == 0) {
+        size_t num = tagsize - 8;
+        commands_add.push_back(kCommandTypeStartFirst + 3);
+        pos += 8;
+        commands_add.push_back(kCommandShuffle2);
+        EncodeVarInt(num, &commands_add);
+        size_t start = data_add.size();
+        for (size_t i = 0; i < num; i++) {
+          data_add.push_back(icc[pos]);
+          pos++;
+        }
+        Unshuffle(data_add.data() + start, num, 2);
+      }
+
+      if (tag == kCurvTag && pos + tagsize <= size && tagsize > 8 &&
+          icc[pos + 4] == 0 && icc[pos + 5] == 0 && icc[pos + 6] == 0 &&
+          icc[pos + 7] == 0) {
+        size_t num = tagsize - 8;
+        if (num > 16 && num < (1 << 28) && pos + num <= size && pos > 0) {
+          commands_add.push_back(kCommandTypeStartFirst + 5);
+          pos += 8;
+          commands_add.push_back(kCommandPredict);
+          int order = 1, width = 2, stride = width;
+          commands_add.push_back((order << 2) | (width - 1));
+          EncodeVarInt(num, &commands_add);
+          JXL_RETURN_IF_ERROR(PredictAndShuffle(stride, width, order, num, icc,
+                                                size, &pos, &data_add));
+        }
+      }
+    }
+
+    if (tag == kMab_Tag || tag == kMba_Tag) {
+      Tag subTag = DecodeKeyword(icc, size, pos);
+      if (pos + 12 < size && (subTag == kCurvTag || subTag == kVcgtTag) &&
+          DecodeUint32(icc, size, pos + 4) == 0) {
+        uint32_t num = DecodeUint32(icc, size, pos + 8) * 2;
+        if (num > 16 && num < (1 << 28) && pos + 12 + num <= size) {
+          pos += 12;
+          last1 = pos;
+          commands_add.push_back(kCommandPredict);
+          int order = 1, width = 2, stride = width;
+          commands_add.push_back((order << 2) | (width - 1));
+          EncodeVarInt(num, &commands_add);
+          JXL_RETURN_IF_ERROR(PredictAndShuffle(stride, width, order, num, icc,
+                                                size, &pos, &data_add));
+        }
+      }
+
+      if (pos == tagstart + 24 && pos + 4 < size) {
+        // Note that this value can be remembered for next iterations of the
+        // loop, so the "pos == clutstart" if below can trigger during a later
+        // iteration.
+        clutstart = tagstart + DecodeUint32(icc, size, pos);
+      }
+
+      if (pos == clutstart && clutstart + 16 < size) {
+        size_t numi = icc[tagstart + 8];
+        size_t numo = icc[tagstart + 9];
+        size_t width = icc[clutstart + 16];
+        size_t stride = width * numo;
+        size_t num = width * numo;
+        for (size_t i = 0; i < numi && clutstart + i < size; i++) {
+          num *= icc[clutstart + i];
+        }
+        if ((width == 1 || width == 2) && num > 64 && num < (1 << 28) &&
+            pos + num <= size && pos > stride * 4) {
+          commands_add.push_back(kCommandPredict);
+          int order = 1;
+          uint8_t flags =
+              (order << 2) | (width - 1) | (stride == width ? 0 : 16);
+          commands_add.push_back(flags);
+          if (flags & 16) EncodeVarInt(stride, &commands_add);
+          EncodeVarInt(num, &commands_add);
+          JXL_RETURN_IF_ERROR(PredictAndShuffle(stride, width, order, num, icc,
+                                                size, &pos, &data_add));
+        }
+      }
+    }
+
+    if (commands_add.empty() && data_add.empty() && tag == kGbd_Tag &&
+        pos == tagstart + 8 && pos + tagsize - 8 <= size && pos > 16 &&
+        tagsize > 8) {
+      size_t width = 4, order = 0, stride = width;
+      size_t num = tagsize - 8;
+      uint8_t flags = (order << 2) | (width - 1) | (stride == width ? 0 : 16);
+      commands_add.push_back(kCommandPredict);
+      commands_add.push_back(flags);
+      if (flags & 16) EncodeVarInt(stride, &commands_add);
+      EncodeVarInt(num, &commands_add);
+      JXL_RETURN_IF_ERROR(PredictAndShuffle(stride, width, order, num, icc,
+                                            size, &pos, &data_add));
+    }
+
+    if (commands_add.empty() && data_add.empty() && pos + 20 <= size) {
+      Tag subTag = DecodeKeyword(icc, size, pos);
+      if (subTag == kXyz_Tag && DecodeUint32(icc, size, pos + 4) == 0) {
+        commands_add.push_back(kCommandXYZ);
+        pos += 8;
+        for (size_t j = 0; j < 12; j++) data_add.push_back(icc[pos++]);
+      }
+    }
+
+    if (commands_add.empty() && data_add.empty() && pos + 8 <= size) {
+      if (DecodeUint32(icc, size, pos + 4) == 0) {
+        Tag subTag = DecodeKeyword(icc, size, pos);
+        for (size_t i = 0; i < kNumTypeStrings; i++) {
+          if (subTag == *kTypeStrings[i]) {
+            commands_add.push_back(kCommandTypeStartFirst + i);
+            pos += 8;
+            break;
+          }
+        }
+      }
+    }
+
+    if (!(commands_add.empty() && data_add.empty()) || pos == size) {
+      if (last0 < last1) {
+        commands.push_back(kCommandInsert);
+        EncodeVarInt(last1 - last0, &commands);
+        while (last0 < last1) {
+          data.push_back(icc[last0++]);
+        }
+      }
+      for (size_t i = 0; i < commands_add.size(); i++) {
+        commands.push_back(commands_add[i]);
+      }
+      for (size_t i = 0; i < data_add.size(); i++) {
+        data.push_back(data_add[i]);
+      }
+      last0 = pos;
+    }
+    if (commands_add.empty() && data_add.empty()) {
+      pos++;
+    }
+  }
+
+  EncodeVarInt(commands.size(), result);
+  for (size_t i = 0; i < commands.size(); i++) {
+    result->push_back(commands[i]);
+  }
+  for (size_t i = 0; i < data.size(); i++) {
+    result->push_back(data[i]);
+  }
+
+  return true;
+}
+
+Status WriteICC(const PaddedBytes& icc, BitWriter* JXL_RESTRICT writer,
+                size_t layer, AuxOut* JXL_RESTRICT aux_out) {
+  if (icc.empty()) return JXL_FAILURE("ICC must be non-empty");
+  PaddedBytes enc;
+  JXL_RETURN_IF_ERROR(PredictICC(icc.data(), icc.size(), &enc));
+  std::vector<std::vector<Token>> tokens(1);
+  BitWriter::Allotment allotment(writer, 128);
+  JXL_RETURN_IF_ERROR(U64Coder::Write(enc.size(), writer));
+  allotment.ReclaimAndCharge(writer, layer, aux_out);
+
+  for (size_t i = 0; i < enc.size(); i++) {
+    tokens[0].emplace_back(
+        ICCANSContext(i, i > 0 ? enc[i - 1] : 0, i > 1 ? enc[i - 2] : 0),
+        enc[i]);
+  }
+  HistogramParams params;
+  params.lz77_method = enc.size() < 4096 ? HistogramParams::LZ77Method::kOptimal
+                                         : HistogramParams::LZ77Method::kLZ77;
+  EntropyEncodingData code;
+  std::vector<uint8_t> context_map;
+  params.force_huffman = true;
+  BuildAndEncodeHistograms(params, kNumICCContexts, tokens, &code, &context_map,
+                           writer, layer, aux_out);
+  WriteTokens(tokens[0], code, context_map, writer, layer, aux_out);
+  return true;
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/enc_icc_codec.h b/third_party/jpeg-xl/lib/jxl/enc_icc_codec.h
new file mode 100644
index 0000000000..c22cf5994e
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_icc_codec.h
@@ -0,0 +1,33 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_ICC_CODEC_H_
+#define LIB_JXL_ENC_ICC_CODEC_H_
+
+// Compressed representation of ICC profiles.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/padded_bytes.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/enc_bit_writer.h"
+
+namespace jxl {
+
+struct AuxOut;
+
+// Should still be called if `icc.empty()` - if so, writes only 1 bit.
+Status WriteICC(const PaddedBytes& icc, BitWriter* JXL_RESTRICT writer,
+                size_t layer, AuxOut* JXL_RESTRICT aux_out);
+
+// Exposed only for testing
+Status PredictICC(const uint8_t* icc, size_t size, PaddedBytes* result);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_ICC_CODEC_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_image_bundle.cc b/third_party/jpeg-xl/lib/jxl/enc_image_bundle.cc
new file mode 100644
index 0000000000..a77d3e0743
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_image_bundle.cc
@@ -0,0 +1,154 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_image_bundle.h"
+
+#include <jxl/cms_interface.h>
+
+#include <atomic>
+#include <limits>
+#include <utility>
+
+#include "lib/jxl/alpha.h"
+#include "lib/jxl/base/byte_order.h"
+#include "lib/jxl/base/padded_bytes.h"
+#include "lib/jxl/base/profiler.h"
+#include "lib/jxl/enc_color_management.h"
+#include "lib/jxl/fields.h"
+#include "lib/jxl/image_bundle.h"
+
+namespace jxl {
+
+namespace {
+
+// Copies ib:rect, converts, and copies into out.
+Status CopyToT(const ImageMetadata* metadata, const ImageBundle* ib,
+               const Rect& rect, const ColorEncoding& c_desired,
+               const JxlCmsInterface& cms, ThreadPool* pool, Image3F* out) {
+  PROFILER_FUNC;
+  ColorSpaceTransform c_transform(cms);
+  // Changing IsGray is probably a bug.
+  JXL_CHECK(ib->IsGray() == c_desired.IsGray());
+  bool is_gray = ib->IsGray();
+  if (out->xsize() < rect.xsize() || out->ysize() < rect.ysize()) {
+    *out = Image3F(rect.xsize(), rect.ysize());
+  } else {
+    out->ShrinkTo(rect.xsize(), rect.ysize());
+  }
+  std::atomic<bool> ok{true};
+  JXL_RETURN_IF_ERROR(RunOnPool(
+      pool, 0, rect.ysize(),
+      [&](const size_t num_threads) {
+        return c_transform.Init(ib->c_current(), c_desired,
+                                metadata->IntensityTarget(), rect.xsize(),
+                                num_threads);
+      },
+      [&](const uint32_t y, const size_t thread) {
+        float* mutable_src_buf = c_transform.BufSrc(thread);
+        const float* src_buf = mutable_src_buf;
+        // Interleave input.
+        if (is_gray) {
+          src_buf = rect.ConstPlaneRow(ib->color(), 0, y);
+        } else if (ib->c_current().IsCMYK()) {
+          if (!ib->HasBlack()) {
+            ok.store(false);
+            return;
+          }
+          const float* JXL_RESTRICT row_in0 =
+              rect.ConstPlaneRow(ib->color(), 0, y);
+          const float* JXL_RESTRICT row_in1 =
+              rect.ConstPlaneRow(ib->color(), 1, y);
+          const float* JXL_RESTRICT row_in2 =
+              rect.ConstPlaneRow(ib->color(), 2, y);
+          const float* JXL_RESTRICT row_in3 = rect.ConstRow(ib->black(), y);
+          for (size_t x = 0; x < rect.xsize(); x++) {
+            // CMYK convention in JXL: 0 = max ink, 1 = white
+            mutable_src_buf[4 * x + 0] = row_in0[x];
+            mutable_src_buf[4 * x + 1] = row_in1[x];
+            mutable_src_buf[4 * x + 2] = row_in2[x];
+            mutable_src_buf[4 * x + 3] = row_in3[x];
+          }
+        } else {
+          const float* JXL_RESTRICT row_in0 =
+              rect.ConstPlaneRow(ib->color(), 0, y);
+          const float* JXL_RESTRICT row_in1 =
+              rect.ConstPlaneRow(ib->color(), 1, y);
+          const float* JXL_RESTRICT row_in2 =
+              rect.ConstPlaneRow(ib->color(), 2, y);
+          for (size_t x = 0; x < rect.xsize(); x++) {
+            mutable_src_buf[3 * x + 0] = row_in0[x];
+            mutable_src_buf[3 * x + 1] = row_in1[x];
+            mutable_src_buf[3 * x + 2] = row_in2[x];
+          }
+        }
+        float* JXL_RESTRICT dst_buf = c_transform.BufDst(thread);
+        if (!c_transform.Run(thread, src_buf, dst_buf)) {
+          ok.store(false);
+          return;
+        }
+        float* JXL_RESTRICT row_out0 = out->PlaneRow(0, y);
+        float* JXL_RESTRICT row_out1 = out->PlaneRow(1, y);
+        float* JXL_RESTRICT row_out2 = out->PlaneRow(2, y);
+        // De-interleave output and convert type.
+        if (is_gray) {
+          for (size_t x = 0; x < rect.xsize(); x++) {
+            row_out0[x] = dst_buf[x];
+            row_out1[x] = dst_buf[x];
+            row_out2[x] = dst_buf[x];
+          }
+        } else {
+          for (size_t x = 0; x < rect.xsize(); x++) {
+            row_out0[x] = dst_buf[3 * x + 0];
+            row_out1[x] = dst_buf[3 * x + 1];
+            row_out2[x] = dst_buf[3 * x + 2];
+          }
+        }
+      },
+      "Colorspace transform"));
+  return ok.load();
+}
+
+}  // namespace
+
+Status ImageBundle::TransformTo(const ColorEncoding& c_desired,
+                                const JxlCmsInterface& cms, ThreadPool* pool) {
+  PROFILER_FUNC;
+  JXL_RETURN_IF_ERROR(CopyTo(Rect(color_), c_desired, cms, &color_, pool));
+  c_current_ = c_desired;
+  return true;
+}
+Status ImageBundle::CopyTo(const Rect& rect, const ColorEncoding& c_desired,
+                           const JxlCmsInterface& cms, Image3F* out,
+                           ThreadPool* pool) const {
+  return CopyToT(metadata_, this, rect, c_desired, cms, pool, out);
+}
+Status TransformIfNeeded(const ImageBundle& in, const ColorEncoding& c_desired,
+                         const JxlCmsInterface& cms, ThreadPool* pool,
+                         ImageBundle* store, const ImageBundle** out) {
+  if (in.c_current().SameColorEncoding(c_desired) && !in.HasBlack()) {
+    *out = &in;
+    return true;
+  }
+  // TODO(janwas): avoid copying via createExternal+copyBackToIO
+  // instead of copy+createExternal+copyBackToIO
+  store->SetFromImage(CopyImage(in.color()), in.c_current());
+
+  // Must at least copy the alpha channel for use by external_image.
+  if (in.HasExtraChannels()) {
+    std::vector<ImageF> extra_channels;
+    for (const ImageF& extra_channel : in.extra_channels()) {
+      extra_channels.emplace_back(CopyImage(extra_channel));
+    }
+    store->SetExtraChannels(std::move(extra_channels));
+  }
+
+  if (!store->TransformTo(c_desired, cms, pool)) {
+    return false;
+  }
+  *out = store;
+  return true;
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/enc_image_bundle.h b/third_party/jpeg-xl/lib/jxl/enc_image_bundle.h
new file mode 100644
index 0000000000..85f8e14e1c
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_image_bundle.h
@@ -0,0 +1,25 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_IMAGE_BUNDLE_H_
+#define LIB_JXL_ENC_IMAGE_BUNDLE_H_
+
+#include "lib/jxl/image_bundle.h"
+
+namespace jxl {
+
+// Does color transformation from in.c_current() to c_desired if the color
+// encodings are different, or nothing if they are already the same.
+// If color transformation is done, stores the transformed values into store and
+// sets the out pointer to store, else leaves store untouched and sets the out
+// pointer to &in.
+// Returns false if color transform fails.
+Status TransformIfNeeded(const ImageBundle& in, const ColorEncoding& c_desired,
+                         const JxlCmsInterface& cms, ThreadPool* pool,
+                         ImageBundle* store, const ImageBundle** out);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_IMAGE_BUNDLE_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_jxl_skcms.h b/third_party/jpeg-xl/lib/jxl/enc_jxl_skcms.h
new file mode 100644
index 0000000000..3c364e883d
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_jxl_skcms.h
@@ -0,0 +1,54 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_JXL_SKCMS_H_
+#define LIB_JXL_ENC_JXL_SKCMS_H_
+
+// skcms wrapper to rename the skcms symbols to avoid conflicting names with
+// other projects using skcms as well. When using JPEGXL_BUNDLE_SKCMS the
+// bundled functions will be renamed from skcms_ to jxl_skcms_
+
+#ifdef SKCMS_API
+#error "Must include enc_jxl_skcms.h and not skcms.h directly"
+#endif  // SKCMS_API
+
+#if JPEGXL_BUNDLE_SKCMS
+
+#define skcms_252_random_bytes jxl_skcms_252_random_bytes
+#define skcms_AdaptToXYZD50 jxl_skcms_AdaptToXYZD50
+#define skcms_ApproximateCurve jxl_skcms_ApproximateCurve
+#define skcms_ApproximatelyEqualProfiles jxl_skcms_ApproximatelyEqualProfiles
+#define skcms_AreApproximateInverses jxl_skcms_AreApproximateInverses
+#define skcms_GetCHAD jxl_skcms_GetCHAD
+#define skcms_GetTagByIndex jxl_skcms_GetTagByIndex
+#define skcms_GetTagBySignature jxl_skcms_GetTagBySignature
+#define skcms_GetWTPT jxl_skcms_GetWTPT
+#define skcms_Identity_TransferFunction jxl_skcms_Identity_TransferFunction
+#define skcms_MakeUsableAsDestination jxl_skcms_MakeUsableAsDestination
+#define skcms_MakeUsableAsDestinationWithSingleCurve \
+  jxl_skcms_MakeUsableAsDestinationWithSingleCurve
+#define skcms_Matrix3x3_concat jxl_skcms_Matrix3x3_concat
+#define skcms_Matrix3x3_invert jxl_skcms_Matrix3x3_invert
+#define skcms_MaxRoundtripError jxl_skcms_MaxRoundtripError
+#define skcms_Parse jxl_skcms_Parse
+#define skcms_PrimariesToXYZD50 jxl_skcms_PrimariesToXYZD50
+#define skcms_sRGB_Inverse_TransferFunction \
+  jxl_skcms_sRGB_Inverse_TransferFunction
+#define skcms_sRGB_profile jxl_skcms_sRGB_profile
+#define skcms_sRGB_TransferFunction jxl_skcms_sRGB_TransferFunction
+#define skcms_TransferFunction_eval jxl_skcms_TransferFunction_eval
+#define skcms_TransferFunction_invert jxl_skcms_TransferFunction_invert
+#define skcms_TransferFunction_makeHLGish jxl_skcms_TransferFunction_makeHLGish
+#define skcms_TransferFunction_makePQish jxl_skcms_TransferFunction_makePQish
+#define skcms_Transform jxl_skcms_Transform
+#define skcms_TransformWithPalette jxl_skcms_TransformWithPalette
+#define skcms_TRCs_AreApproximateInverse jxl_skcms_TRCs_AreApproximateInverse
+#define skcms_XYZD50_profile jxl_skcms_XYZD50_profile
+
+#endif  // JPEGXL_BUNDLE_SKCMS
+
+#include "skcms.h"
+
+#endif  // LIB_JXL_ENC_JXL_SKCMS_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_linalg.cc b/third_party/jpeg-xl/lib/jxl/enc_linalg.cc
new file mode 100644
index 0000000000..fe2090a909
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_linalg.cc
@@ -0,0 +1,52 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_linalg.h"
+
+#include <cmath>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/status.h"
+
+namespace jxl {
+
+void ConvertToDiagonal(const ImageD& A, ImageD* const JXL_RESTRICT diag,
+                       ImageD* const JXL_RESTRICT U) {
+#if JXL_ENABLE_ASSERT
+  JXL_ASSERT(A.xsize() == 2);
+  JXL_ASSERT(A.ysize() == 2);
+  JXL_ASSERT(std::abs(A.Row(0)[1] - A.Row(1)[0]) < 1e-15);
+#endif
+
+  if (std::abs(A.ConstRow(0)[1]) < 1e-15) {
+    // Already diagonal.
+    diag->Row(0)[0] = A.ConstRow(0)[0];
+    diag->Row(0)[1] = A.ConstRow(1)[1];
+    U->Row(0)[0] = U->Row(1)[1] = 1.0;
+    U->Row(0)[1] = U->Row(1)[0] = 0.0;
+    return;
+  }
+  double b = -(A.Row(0)[0] + A.Row(1)[1]);
+  double c = A.Row(0)[0] * A.Row(1)[1] - A.Row(0)[1] * A.Row(0)[1];
+  double d = b * b - 4.0 * c;
+  double sqd = std::sqrt(d);
+  double l1 = (-b - sqd) * 0.5;
+  double l2 = (-b + sqd) * 0.5;
+
+  double v1[2] = {A.Row(0)[0] - l1, A.Row(1)[0]};
+  double v1n = 1.0 / std::hypot(v1[0], v1[1]);
+  v1[0] = v1[0] * v1n;
+  v1[1] = v1[1] * v1n;
+
+  diag->Row(0)[0] = l1;
+  diag->Row(0)[1] = l2;
+
+  U->Row(0)[0] = v1[1];
+  U->Row(0)[1] = -v1[0];
+  U->Row(1)[0] = v1[0];
+  U->Row(1)[1] = v1[1];
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/enc_linalg.h b/third_party/jpeg-xl/lib/jxl/enc_linalg.h
new file mode 100644
index 0000000000..791770d5d4
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_linalg.h
@@ -0,0 +1,24 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_LINALG_H_
+#define LIB_JXL_LINALG_H_
+
+// Linear algebra.
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/image.h"
+
+namespace jxl {
+
+using ImageD = Plane<double>;
+
+// A is symmetric, U is orthogonal, and A = U * Diagonal(diag) * Transpose(U).
+void ConvertToDiagonal(const ImageD& A, ImageD* JXL_RESTRICT diag,
+                       ImageD* JXL_RESTRICT U);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_LINALG_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_linalg_test.cc b/third_party/jpeg-xl/lib/jxl/enc_linalg_test.cc
new file mode 100644
index 0000000000..967b9a3afb
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_linalg_test.cc
@@ -0,0 +1,118 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_linalg.h"
+
+#include "lib/jxl/image_test_utils.h"
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+namespace {
+
+ImageD Identity(const size_t N) {
+  ImageD out(N, N);
+  for (size_t i = 0; i < N; ++i) {
+    double* JXL_RESTRICT row = out.Row(i);
+    std::fill(row, row + N, 0);
+    row[i] = 1.0;
+  }
+  return out;
+}
+
+ImageD Diagonal(const ImageD& d) {
+  JXL_ASSERT(d.ysize() == 1);
+  ImageD out(d.xsize(), d.xsize());
+  const double* JXL_RESTRICT row_diag = d.Row(0);
+  for (size_t k = 0; k < d.xsize(); ++k) {
+    double* JXL_RESTRICT row_out = out.Row(k);
+    std::fill(row_out, row_out + d.xsize(), 0.0);
+    row_out[k] = row_diag[k];
+  }
+  return out;
+}
+
+ImageD MatMul(const ImageD& A, const ImageD& B) {
+  JXL_ASSERT(A.ysize() == B.xsize());
+  ImageD out(A.xsize(), B.ysize());
+  for (size_t y = 0; y < B.ysize(); ++y) {
+    const double* const JXL_RESTRICT row_b = B.Row(y);
+    double* const JXL_RESTRICT row_out = out.Row(y);
+    for (size_t x = 0; x < A.xsize(); ++x) {
+      row_out[x] = 0.0;
+      for (size_t k = 0; k < B.xsize(); ++k) {
+        row_out[x] += A.Row(k)[x] * row_b[k];
+      }
+    }
+  }
+  return out;
+}
+
+ImageD Transpose(const ImageD& A) {
+  ImageD out(A.ysize(), A.xsize());
+  for (size_t x = 0; x < A.xsize(); ++x) {
+    double* const JXL_RESTRICT row_out = out.Row(x);
+    for (size_t y = 0; y < A.ysize(); ++y) {
+      row_out[y] = A.Row(y)[x];
+    }
+  }
+  return out;
+}
+
+ImageD RandomSymmetricMatrix(const size_t N, Rng& rng, const double vmin,
+                             const double vmax) {
+  ImageD A(N, N);
+  GenerateImage(rng, &A, vmin, vmax);
+  for (size_t i = 0; i < N; ++i) {
+    for (size_t j = 0; j < i; ++j) {
+      A.Row(j)[i] = A.Row(i)[j];
+    }
+  }
+  return A;
+}
+
+void VerifyMatrixEqual(const ImageD& A, const ImageD& B, const double eps) {
+  ASSERT_EQ(A.xsize(), B.xsize());
+  ASSERT_EQ(A.ysize(), B.ysize());
+  for (size_t y = 0; y < A.ysize(); ++y) {
+    for (size_t x = 0; x < A.xsize(); ++x) {
+      ASSERT_NEAR(A.Row(y)[x], B.Row(y)[x], eps);
+    }
+  }
+}
+
+void VerifyOrthogonal(const ImageD& A, const double eps) {
+  VerifyMatrixEqual(Identity(A.xsize()), MatMul(Transpose(A), A), eps);
+}
+
+TEST(LinAlgTest, ConvertToDiagonal) {
+  {
+    ImageD I = Identity(2);
+    ImageD U(2, 2), d(2, 1);
+    ConvertToDiagonal(I, &d, &U);
+    VerifyMatrixEqual(I, U, 1e-15);
+    for (size_t k = 0; k < 2; ++k) {
+      ASSERT_NEAR(d.Row(0)[k], 1.0, 1e-15);
+    }
+  }
+  {
+    ImageD A = Identity(2);
+    A.Row(0)[1] = A.Row(1)[0] = 2.0;
+    ImageD U(2, 2), d(2, 1);
+    ConvertToDiagonal(A, &d, &U);
+    VerifyOrthogonal(U, 1e-12);
+    VerifyMatrixEqual(A, MatMul(U, MatMul(Diagonal(d), Transpose(U))), 1e-12);
+  }
+  Rng rng(0);
+  for (size_t i = 0; i < 100; ++i) {
+    ImageD A = RandomSymmetricMatrix(2, rng, -1.0, 1.0);
+    ImageD U(2, 2), d(2, 1);
+    ConvertToDiagonal(A, &d, &U);
+    VerifyOrthogonal(U, 1e-12);
+    VerifyMatrixEqual(A, MatMul(U, MatMul(Diagonal(d), Transpose(U))), 1e-12);
+  }
+}
+
+}  // namespace
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/enc_modular.cc b/third_party/jpeg-xl/lib/jxl/enc_modular.cc
new file mode 100644
index 0000000000..0453b34654
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_modular.cc
@@ -0,0 +1,1762 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_modular.h"
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <array>
+#include <atomic>
+#include <limits>
+#include <queue>
+#include <utility>
+#include <vector>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/padded_bytes.h"
+#include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/compressed_dc.h"
+#include "lib/jxl/dec_ans.h"
+#include "lib/jxl/enc_aux_out.h"
+#include "lib/jxl/enc_bit_writer.h"
+#include "lib/jxl/enc_cluster.h"
+#include "lib/jxl/enc_fields.h"
+#include "lib/jxl/enc_gaborish.h"
+#include "lib/jxl/enc_params.h"
+#include "lib/jxl/enc_patch_dictionary.h"
+#include "lib/jxl/enc_quant_weights.h"
+#include "lib/jxl/frame_header.h"
+#include "lib/jxl/modular/encoding/context_predict.h"
+#include "lib/jxl/modular/encoding/enc_debug_tree.h"
+#include "lib/jxl/modular/encoding/enc_encoding.h"
+#include "lib/jxl/modular/encoding/encoding.h"
+#include "lib/jxl/modular/encoding/ma_common.h"
+#include "lib/jxl/modular/modular_image.h"
+#include "lib/jxl/modular/options.h"
+#include "lib/jxl/modular/transform/enc_transform.h"
+#include "lib/jxl/toc.h"
+
+namespace jxl {
+
+namespace {
+constexpr bool kPrintTree = false;
+
+// Squeeze default quantization factors
+// these quantization factors are for -Q 50  (other qualities simply scale the
+// factors; things are rounded down and obviously cannot get below 1)
+static const float squeeze_quality_factor =
+    0.35;  // for easy tweaking of the quality range (decrease this number for
+           // higher quality)
+static const float squeeze_luma_factor =
+    1.1;  // for easy tweaking of the balance between luma (or anything
+          // non-chroma) and chroma (decrease this number for higher quality
+          // luma)
+static const float squeeze_quality_factor_xyb = 2.4f;
+static const float squeeze_xyb_qtable[3][16] = {
+    {163.84, 81.92, 40.96, 20.48, 10.24, 5.12, 2.56, 1.28, 0.64, 0.32, 0.16,
+     0.08, 0.04, 0.02, 0.01, 0.005},  // Y
+    {1024, 512, 256, 128, 64, 32, 16, 8, 4, 2, 1, 0.5, 0.5, 0.5, 0.5,
+     0.5},  // X
+    {2048, 1024, 512, 256, 128, 64, 32, 16, 8, 4, 2, 1, 0.5, 0.5, 0.5,
+     0.5},  // B-Y
+};
+
+static const float squeeze_luma_qtable[16] = {
+    163.84, 81.92, 40.96, 20.48, 10.24, 5.12, 2.56, 1.28,
+    0.64,   0.32,  0.16,  0.08,  0.04,  0.02, 0.01, 0.005};
+// for 8-bit input, the range of YCoCg chroma is -255..255 so basically this
+// does 4:2:0 subsampling (two most fine grained layers get quantized away)
+static const float squeeze_chroma_qtable[16] = {
+    1024, 512, 256, 128, 64, 32, 16, 8, 4, 2, 1, 0.5, 0.5, 0.5, 0.5, 0.5};
+
+// `cutoffs` must be sorted.
+Tree MakeFixedTree(int property, const std::vector<int32_t>& cutoffs,
+                   Predictor pred, size_t num_pixels) {
+  size_t log_px = CeilLog2Nonzero(num_pixels);
+  size_t min_gap = 0;
+  // Reduce fixed tree height when encoding small images.
+  if (log_px < 14) {
+    min_gap = 8 * (14 - log_px);
+  }
+  Tree tree;
+  struct NodeInfo {
+    size_t begin, end, pos;
+  };
+  std::queue<NodeInfo> q;
+  // Leaf IDs will be set by roundtrip decoding the tree.
+  tree.push_back(PropertyDecisionNode::Leaf(pred));
+  q.push(NodeInfo{0, cutoffs.size(), 0});
+  while (!q.empty()) {
+    NodeInfo info = q.front();
+    q.pop();
+    if (info.begin + min_gap >= info.end) continue;
+    uint32_t split = (info.begin + info.end) / 2;
+    tree[info.pos] =
+        PropertyDecisionNode::Split(property, cutoffs[split], tree.size());
+    q.push(NodeInfo{split + 1, info.end, tree.size()});
+    tree.push_back(PropertyDecisionNode::Leaf(pred));
+    q.push(NodeInfo{info.begin, split, tree.size()});
+    tree.push_back(PropertyDecisionNode::Leaf(pred));
+  }
+  return tree;
+}
+
+Tree PredefinedTree(ModularOptions::TreeKind tree_kind, size_t total_pixels) {
+  if (tree_kind == ModularOptions::TreeKind::kJpegTranscodeACMeta ||
+      tree_kind == ModularOptions::TreeKind::kTrivialTreeNoPredictor) {
+    // All the data is 0, so no need for a fancy tree.
+    return {PropertyDecisionNode::Leaf(Predictor::Zero)};
+  }
+  if (tree_kind == ModularOptions::TreeKind::kFalconACMeta) {
+    // All the data is 0 except the quant field. TODO(veluca): make that 0 too.
+    return {PropertyDecisionNode::Leaf(Predictor::Left)};
+  }
+  if (tree_kind == ModularOptions::TreeKind::kACMeta) {
+    // Small image.
+    if (total_pixels < 1024) {
+      return {PropertyDecisionNode::Leaf(Predictor::Left)};
+    }
+    Tree tree;
+    // 0: c > 1
+    tree.push_back(PropertyDecisionNode::Split(0, 1, 1));
+    // 1: c > 2
+    tree.push_back(PropertyDecisionNode::Split(0, 2, 3));
+    // 2: c > 0
+    tree.push_back(PropertyDecisionNode::Split(0, 0, 5));
+    // 3: EPF control field (all 0 or 4), top > 0
+    tree.push_back(PropertyDecisionNode::Split(6, 0, 21));
+    // 4: ACS+QF, y > 0
+    tree.push_back(PropertyDecisionNode::Split(2, 0, 7));
+    // 5: CfL x
+    tree.push_back(PropertyDecisionNode::Leaf(Predictor::Gradient));
+    // 6: CfL b
+    tree.push_back(PropertyDecisionNode::Leaf(Predictor::Gradient));
+    // 7: QF: split according to the left quant value.
+    tree.push_back(PropertyDecisionNode::Split(7, 5, 9));
+    // 8: ACS: split in 4 segments (8x8 from 0 to 3, large square 4-5, large
+    // rectangular 6-11, 8x8 12+), according to previous ACS value.
+    tree.push_back(PropertyDecisionNode::Split(7, 5, 15));
+    // QF
+    tree.push_back(PropertyDecisionNode::Split(7, 11, 11));
+    tree.push_back(PropertyDecisionNode::Split(7, 3, 13));
+    tree.push_back(PropertyDecisionNode::Leaf(Predictor::Left));
+    tree.push_back(PropertyDecisionNode::Leaf(Predictor::Left));
+    tree.push_back(PropertyDecisionNode::Leaf(Predictor::Left));
+    tree.push_back(PropertyDecisionNode::Leaf(Predictor::Left));
+    // ACS
+    tree.push_back(PropertyDecisionNode::Split(7, 11, 17));
+    tree.push_back(PropertyDecisionNode::Split(7, 3, 19));
+    tree.push_back(PropertyDecisionNode::Leaf(Predictor::Zero));
+    tree.push_back(PropertyDecisionNode::Leaf(Predictor::Zero));
+    tree.push_back(PropertyDecisionNode::Leaf(Predictor::Zero));
+    tree.push_back(PropertyDecisionNode::Leaf(Predictor::Zero));
+    // EPF, left > 0
+    tree.push_back(PropertyDecisionNode::Split(7, 0, 23));
+    tree.push_back(PropertyDecisionNode::Split(7, 0, 25));
+    tree.push_back(PropertyDecisionNode::Leaf(Predictor::Zero));
+    tree.push_back(PropertyDecisionNode::Leaf(Predictor::Zero));
+    tree.push_back(PropertyDecisionNode::Leaf(Predictor::Zero));
+    tree.push_back(PropertyDecisionNode::Leaf(Predictor::Zero));
+    return tree;
+  }
+  if (tree_kind == ModularOptions::TreeKind::kWPFixedDC) {
+    std::vector<int32_t> cutoffs = {
+        -500, -392, -255, -191, -127, -95, -63, -47, -31, -23, -15,
+        -11,  -7,   -4,   -3,   -1,   0,   1,   3,   5,   7,   11,
+        15,   23,   31,   47,   63,   95,  127, 191, 255, 392, 500};
+    return MakeFixedTree(kWPProp, cutoffs, Predictor::Weighted, total_pixels);
+  }
+  if (tree_kind == ModularOptions::TreeKind::kGradientFixedDC) {
+    std::vector<int32_t> cutoffs = {
+        -500, -392, -255, -191, -127, -95, -63, -47, -31, -23, -15,
+        -11,  -7,   -4,   -3,   -1,   0,   1,   3,   5,   7,   11,
+        15,   23,   31,   47,   63,   95,  127, 191, 255, 392, 500};
+    return MakeFixedTree(kGradientProp, cutoffs, Predictor::Gradient,
+                         total_pixels);
+  }
+  JXL_ABORT("Unreachable");
+  return {};
+}
+
+// Merges the trees in `trees` using nodes that decide on stream_id, as defined
+// by `tree_splits`.
+void MergeTrees(const std::vector<Tree>& trees,
+                const std::vector<size_t>& tree_splits, size_t begin,
+                size_t end, Tree* tree) {
+  JXL_ASSERT(trees.size() + 1 == tree_splits.size());
+  JXL_ASSERT(end > begin);
+  JXL_ASSERT(end <= trees.size());
+  if (end == begin + 1) {
+    // Insert the tree, adding the opportune offset to all child nodes.
+    // This will make the leaf IDs wrong, but subsequent roundtripping will fix
+    // them.
+    size_t sz = tree->size();
+    tree->insert(tree->end(), trees[begin].begin(), trees[begin].end());
+    for (size_t i = sz; i < tree->size(); i++) {
+      (*tree)[i].lchild += sz;
+      (*tree)[i].rchild += sz;
+    }
+    return;
+  }
+  size_t mid = (begin + end) / 2;
+  size_t splitval = tree_splits[mid] - 1;
+  size_t cur = tree->size();
+  tree->emplace_back(1 /*stream_id*/, splitval, 0, 0, Predictor::Zero, 0, 1);
+  (*tree)[cur].lchild = tree->size();
+  MergeTrees(trees, tree_splits, mid, end, tree);
+  (*tree)[cur].rchild = tree->size();
+  MergeTrees(trees, tree_splits, begin, mid, tree);
+}
+
+void QuantizeChannel(Channel& ch, const int q) {
+  if (q == 1) return;
+  for (size_t y = 0; y < ch.plane.ysize(); y++) {
+    pixel_type* row = ch.plane.Row(y);
+    for (size_t x = 0; x < ch.plane.xsize(); x++) {
+      if (row[x] < 0) {
+        row[x] = -((-row[x] + q / 2) / q) * q;
+      } else {
+        row[x] = ((row[x] + q / 2) / q) * q;
+      }
+    }
+  }
+}
+
+// convert binary32 float that corresponds to custom [bits]-bit float (with
+// [exp_bits] exponent bits) to a [bits]-bit integer representation that should
+// fit in pixel_type
+Status float_to_int(const float* const row_in, pixel_type* const row_out,
+                    size_t xsize, unsigned int bits, unsigned int exp_bits,
+                    bool fp, double dfactor) {
+  JXL_ASSERT(sizeof(pixel_type) * 8 >= bits);
+  if (!fp) {
+    if (bits > 22) {
+      for (size_t x = 0; x < xsize; ++x) {
+        row_out[x] = row_in[x] * dfactor + (row_in[x] < 0 ? -0.5 : 0.5);
+      }
+    } else {
+      float factor = dfactor;
+      for (size_t x = 0; x < xsize; ++x) {
+        row_out[x] = row_in[x] * factor + (row_in[x] < 0 ? -0.5f : 0.5f);
+      }
+    }
+    return true;
+  }
+  if (bits == 32 && fp) {
+    JXL_ASSERT(exp_bits == 8);
+    memcpy((void*)row_out, (const void*)row_in, 4 * xsize);
+    return true;
+  }
+
+  int exp_bias = (1 << (exp_bits - 1)) - 1;
+  int max_exp = (1 << exp_bits) - 1;
+  uint32_t sign = (1u << (bits - 1));
+  int mant_bits = bits - exp_bits - 1;
+  int mant_shift = 23 - mant_bits;
+  for (size_t x = 0; x < xsize; ++x) {
+    uint32_t f;
+    memcpy(&f, &row_in[x], 4);
+    int signbit = (f >> 31);
+    f &= 0x7fffffff;
+    if (f == 0) {
+      row_out[x] = (signbit ? sign : 0);
+      continue;
+    }
+    int exp = (f >> 23) - 127;
+    if (exp == 128) return JXL_FAILURE("Inf/NaN not allowed");
+    int mantissa = (f & 0x007fffff);
+    // broke up the binary32 into its parts, now reassemble into
+    // arbitrary float
+    exp += exp_bias;
+    if (exp < 0) {  // will become a subnormal number
+      // add implicit leading 1 to mantissa
+      mantissa |= 0x00800000;
+      if (exp < -mant_bits) {
+        return JXL_FAILURE(
+            "Invalid float number: %g cannot be represented with %i "
+            "exp_bits and %i mant_bits (exp %i)",
+            row_in[x], exp_bits, mant_bits, exp);
+      }
+      mantissa >>= 1 - exp;
+      exp = 0;
+    }
+    // exp should be representable in exp_bits, otherwise input was
+    // invalid
+    if (exp > max_exp) return JXL_FAILURE("Invalid float exponent");
+    if (mantissa & ((1 << mant_shift) - 1)) {
+      return JXL_FAILURE("%g is losing precision (mant: %x)", row_in[x],
+                         mantissa);
+    }
+    mantissa >>= mant_shift;
+    f = (signbit ? sign : 0);
+    f |= (exp << mant_bits);
+    f |= mantissa;
+    row_out[x] = (pixel_type)f;
+  }
+  return true;
+}
+}  // namespace
+
+ModularFrameEncoder::ModularFrameEncoder(const FrameHeader& frame_header,
+                                         const CompressParams& cparams_orig)
+    : frame_dim_(frame_header.ToFrameDimensions()), cparams_(cparams_orig) {
+  size_t num_streams =
+      ModularStreamId::Num(frame_dim_, frame_header.passes.num_passes);
+  if (cparams_.ModularPartIsLossless()) {
+    switch (cparams_.decoding_speed_tier) {
+      case 0:
+        break;
+      case 1:
+        cparams_.options.wp_tree_mode = ModularOptions::TreeMode::kWPOnly;
+        break;
+      case 2: {
+        cparams_.options.wp_tree_mode = ModularOptions::TreeMode::kGradientOnly;
+        cparams_.options.predictor = Predictor::Gradient;
+        break;
+      }
+      case 3: {  // LZ77, no Gradient.
+        cparams_.options.nb_repeats = 0;
+        cparams_.options.predictor = Predictor::Gradient;
+        break;
+      }
+      default: {  // LZ77, no predictor.
+        cparams_.options.nb_repeats = 0;
+        cparams_.options.predictor = Predictor::Zero;
+        break;
+      }
+    }
+  }
+  if (cparams_.decoding_speed_tier >= 1 && cparams_.responsive &&
+      cparams_.ModularPartIsLossless()) {
+    cparams_.options.tree_kind =
+        ModularOptions::TreeKind::kTrivialTreeNoPredictor;
+    cparams_.options.nb_repeats = 0;
+  }
+  stream_images_.resize(num_streams);
+
+  // use a sensible default if nothing explicit is specified:
+  // Squeeze for lossy, no squeeze for lossless
+  if (cparams_.responsive < 0) {
+    if (cparams_.ModularPartIsLossless()) {
+      cparams_.responsive = 0;
+    } else {
+      cparams_.responsive = 1;
+    }
+  }
+
+  if (cparams_.speed_tier > SpeedTier::kWombat) {
+    cparams_.options.splitting_heuristics_node_threshold = 192;
+  } else {
+    cparams_.options.splitting_heuristics_node_threshold = 96;
+  }
+  {
+    // Set properties.
+    std::vector<uint32_t> prop_order;
+    if (cparams_.responsive) {
+      // Properties in order of their likelihood of being useful for Squeeze
+      // residuals.
+      prop_order = {0, 1, 4, 5, 6, 7, 8, 15, 9, 10, 11, 12, 13, 14, 2, 3};
+    } else {
+      // Same, but for the non-Squeeze case.
+      prop_order = {0, 1, 15, 9, 10, 11, 12, 13, 14, 2, 3, 4, 5, 6, 7, 8};
+    }
+    switch (cparams_.speed_tier) {
+      case SpeedTier::kSquirrel:
+        cparams_.options.splitting_heuristics_properties.assign(
+            prop_order.begin(), prop_order.begin() + 8);
+        cparams_.options.max_property_values = 32;
+        break;
+      case SpeedTier::kKitten:
+        cparams_.options.splitting_heuristics_properties.assign(
+            prop_order.begin(), prop_order.begin() + 10);
+        cparams_.options.max_property_values = 64;
+        break;
+      case SpeedTier::kTortoise:
+        cparams_.options.splitting_heuristics_properties = prop_order;
+        cparams_.options.max_property_values = 256;
+        break;
+      default:
+        cparams_.options.splitting_heuristics_properties.assign(
+            prop_order.begin(), prop_order.begin() + 6);
+        cparams_.options.max_property_values = 16;
+        break;
+    }
+    if (cparams_.speed_tier > SpeedTier::kTortoise) {
+      // Gradient in previous channels.
+      for (int i = 0; i < cparams_.options.max_properties; i++) {
+        cparams_.options.splitting_heuristics_properties.push_back(
+            kNumNonrefProperties + i * 4 + 3);
+      }
+    } else {
+      // All the extra properties in Tortoise mode.
+      for (int i = 0; i < cparams_.options.max_properties * 4; i++) {
+        cparams_.options.splitting_heuristics_properties.push_back(
+            kNumNonrefProperties + i);
+      }
+    }
+  }
+
+  if (cparams_.options.predictor == static_cast<Predictor>(-1)) {
+    // no explicit predictor(s) given, set a good default
+    if ((cparams_.speed_tier <= SpeedTier::kTortoise ||
+         cparams_.modular_mode == false) &&
+        cparams_.IsLossless() && cparams_.responsive == false) {
+      // TODO(veluca): allow all predictors that don't break residual
+      // multipliers in lossy mode.
+      cparams_.options.predictor = Predictor::Variable;
+    } else if (cparams_.responsive || cparams_.lossy_palette) {
+      // zero predictor for Squeeze residues and lossy palette
+      cparams_.options.predictor = Predictor::Zero;
+    } else if (!cparams_.IsLossless()) {
+      // If not responsive and lossy. TODO(veluca): use near_lossless instead?
+      cparams_.options.predictor = Predictor::Gradient;
+    } else if (cparams_.speed_tier < SpeedTier::kFalcon) {
+      // try median and weighted predictor for anything else
+      cparams_.options.predictor = Predictor::Best;
+    } else if (cparams_.speed_tier == SpeedTier::kFalcon) {
+      // just weighted predictor in falcon mode
+      cparams_.options.predictor = Predictor::Weighted;
+    } else if (cparams_.speed_tier > SpeedTier::kFalcon) {
+      // just gradient predictor in thunder mode
+      cparams_.options.predictor = Predictor::Gradient;
+    }
+  } else {
+    delta_pred_ = cparams_.options.predictor;
+    if (cparams_.lossy_palette) cparams_.options.predictor = Predictor::Zero;
+  }
+  if (!cparams_.ModularPartIsLossless()) {
+    if (cparams_.options.predictor == Predictor::Weighted ||
+        cparams_.options.predictor == Predictor::Variable ||
+        cparams_.options.predictor == Predictor::Best)
+      cparams_.options.predictor = Predictor::Zero;
+  }
+  tree_splits_.push_back(0);
+  if (cparams_.modular_mode == false) {
+    cparams_.options.fast_decode_multiplier = 1.0f;
+    tree_splits_.push_back(ModularStreamId::VarDCTDC(0).ID(frame_dim_));
+    tree_splits_.push_back(ModularStreamId::ModularDC(0).ID(frame_dim_));
+    tree_splits_.push_back(ModularStreamId::ACMetadata(0).ID(frame_dim_));
+    tree_splits_.push_back(ModularStreamId::QuantTable(0).ID(frame_dim_));
+    tree_splits_.push_back(ModularStreamId::ModularAC(0, 0).ID(frame_dim_));
+    ac_metadata_size.resize(frame_dim_.num_dc_groups);
+    extra_dc_precision.resize(frame_dim_.num_dc_groups);
+  }
+  tree_splits_.push_back(num_streams);
+  cparams_.options.max_chan_size = frame_dim_.group_dim;
+  cparams_.options.group_dim = frame_dim_.group_dim;
+
+  // TODO(veluca): figure out how to use different predictor sets per channel.
+  stream_options_.resize(num_streams, cparams_.options);
+}
+
+bool do_transform(Image& image, const Transform& tr,
+                  const weighted::Header& wp_header,
+                  jxl::ThreadPool* pool = nullptr, bool force_jxlart = false) {
+  Transform t = tr;
+  bool did_it = true;
+  if (force_jxlart) {
+    if (!t.MetaApply(image)) return false;
+  } else {
+    did_it = TransformForward(t, image, wp_header, pool);
+  }
+  if (did_it) image.transform.push_back(t);
+  return did_it;
+}
+
+Status ModularFrameEncoder::ComputeEncodingData(
+    const FrameHeader& frame_header, const ImageMetadata& metadata,
+    Image3F* JXL_RESTRICT color, const std::vector<ImageF>& extra_channels,
+    PassesEncoderState* JXL_RESTRICT enc_state, const JxlCmsInterface& cms,
+    ThreadPool* pool, AuxOut* aux_out, bool do_color) {
+  JXL_DEBUG_V(6, "Computing modular encoding data for frame %s",
+              frame_header.DebugString().c_str());
+
+  if (do_color && frame_header.loop_filter.gab) {
+    float w = 0.9908511000000001f;
+    float weights[3] = {w, w, w};
+    GaborishInverse(color, weights, pool);
+  }
+
+  if (do_color && metadata.bit_depth.bits_per_sample <= 16 &&
+      cparams_.speed_tier < SpeedTier::kCheetah &&
+      cparams_.decoding_speed_tier < 2) {
+    FindBestPatchDictionary(*color, enc_state, cms, nullptr, aux_out,
+                            cparams_.color_transform == ColorTransform::kXYB);
+    PatchDictionaryEncoder::SubtractFrom(
+        enc_state->shared.image_features.patches, color);
+  }
+
+  // Convert ImageBundle to modular Image object
+  const size_t xsize = frame_dim_.xsize;
+  const size_t ysize = frame_dim_.ysize;
+
+  int nb_chans = 3;
+  if (metadata.color_encoding.IsGray() &&
+      cparams_.color_transform == ColorTransform::kNone) {
+    nb_chans = 1;
+  }
+  if (!do_color) nb_chans = 0;
+
+  nb_chans += extra_channels.size();
+
+  bool fp = metadata.bit_depth.floating_point_sample &&
+            cparams_.color_transform != ColorTransform::kXYB;
+
+  // bits_per_sample is just metadata for XYB images.
+  if (metadata.bit_depth.bits_per_sample >= 32 && do_color &&
+      cparams_.color_transform != ColorTransform::kXYB) {
+    if (metadata.bit_depth.bits_per_sample == 32 && fp == false) {
+      return JXL_FAILURE("uint32_t not supported in enc_modular");
+    } else if (metadata.bit_depth.bits_per_sample > 32) {
+      return JXL_FAILURE("bits_per_sample > 32 not supported");
+    }
+  }
+
+  // in the non-float case, there is an implicit 0 sign bit
+  int max_bitdepth =
+      do_color ? metadata.bit_depth.bits_per_sample + (fp ? 0 : 1) : 0;
+  Image& gi = stream_images_[0];
+  gi = Image(xsize, ysize, metadata.bit_depth.bits_per_sample, nb_chans);
+  int c = 0;
+  if (cparams_.color_transform == ColorTransform::kXYB &&
+      cparams_.modular_mode == true) {
+    float enc_factors[3] = {32768.0f, 2048.0f, 2048.0f};
+    if (cparams_.butteraugli_distance > 0 && !cparams_.responsive) {
+      // quantize XYB here and then treat it as a lossless image
+      enc_factors[0] *= 1.f / (1.f + 23.f * cparams_.butteraugli_distance);
+      enc_factors[1] *= 1.f / (1.f + 14.f * cparams_.butteraugli_distance);
+      enc_factors[2] *= 1.f / (1.f + 14.f * cparams_.butteraugli_distance);
+      cparams_.butteraugli_distance = 0;
+    }
+    if (cparams_.manual_xyb_factors.size() == 3) {
+      DequantMatricesSetCustomDC(&enc_state->shared.matrices,
+                                 cparams_.manual_xyb_factors.data());
+      // TODO(jon): update max_bitdepth in this case
+    } else {
+      DequantMatricesSetCustomDC(&enc_state->shared.matrices, enc_factors);
+      max_bitdepth = 12;
+    }
+  }
+  pixel_type maxval = gi.bitdepth < 32 ? (1u << gi.bitdepth) - 1 : 0;
+  if (do_color) {
+    for (; c < 3; c++) {
+      if (metadata.color_encoding.IsGray() &&
+          cparams_.color_transform == ColorTransform::kNone &&
+          c != (cparams_.color_transform == ColorTransform::kXYB ? 1 : 0))
+        continue;
+      int c_out = c;
+      // XYB is encoded as YX(B-Y)
+      if (cparams_.color_transform == ColorTransform::kXYB && c < 2)
+        c_out = 1 - c_out;
+      double factor = maxval;
+      if (cparams_.color_transform == ColorTransform::kXYB)
+        factor = enc_state->shared.matrices.InvDCQuant(c);
+      if (c == 2 && cparams_.color_transform == ColorTransform::kXYB) {
+        JXL_ASSERT(!fp);
+        for (size_t y = 0; y < ysize; ++y) {
+          const float* const JXL_RESTRICT row_in = color->PlaneRow(c, y);
+          pixel_type* const JXL_RESTRICT row_out = gi.channel[c_out].Row(y);
+          pixel_type* const JXL_RESTRICT row_Y = gi.channel[0].Row(y);
+          for (size_t x = 0; x < xsize; ++x) {
+            row_out[x] = row_in[x] * factor + 0.5f;
+            row_out[x] -= row_Y[x];
+            // zero the lsb of B
+            row_out[x] = row_out[x] / 2 * 2;
+          }
+        }
+      } else {
+        int bits = metadata.bit_depth.bits_per_sample;
+        int exp_bits = metadata.bit_depth.exponent_bits_per_sample;
+        gi.channel[c_out].hshift =
+            enc_state->shared.frame_header.chroma_subsampling.HShift(c);
+        gi.channel[c_out].vshift =
+            enc_state->shared.frame_header.chroma_subsampling.VShift(c);
+        size_t xsize_shifted = DivCeil(xsize, 1 << gi.channel[c_out].hshift);
+        size_t ysize_shifted = DivCeil(ysize, 1 << gi.channel[c_out].vshift);
+        gi.channel[c_out].shrink(xsize_shifted, ysize_shifted);
+        std::atomic<bool> has_error{false};
+        JXL_RETURN_IF_ERROR(RunOnPool(
+            pool, 0, ysize_shifted, ThreadPool::NoInit,
+            [&](const int task, const int thread) {
+              const size_t y = task;
+              const float* const JXL_RESTRICT row_in = color->PlaneRow(c, y);
+              pixel_type* const JXL_RESTRICT row_out = gi.channel[c_out].Row(y);
+              if (!float_to_int(row_in, row_out, xsize_shifted, bits, exp_bits,
+                                fp, factor)) {
+                has_error = true;
+              };
+            },
+            "float2int"));
+        if (has_error) {
+          return JXL_FAILURE("Error in float to integer conversion");
+        }
+      }
+    }
+    if (metadata.color_encoding.IsGray() &&
+        cparams_.color_transform == ColorTransform::kNone)
+      c = 1;
+  }
+
+  for (size_t ec = 0; ec < extra_channels.size(); ec++, c++) {
+    const ExtraChannelInfo& eci = metadata.extra_channel_info[ec];
+    size_t ecups = frame_header.extra_channel_upsampling[ec];
+    gi.channel[c].shrink(DivCeil(frame_dim_.xsize_upsampled, ecups),
+                         DivCeil(frame_dim_.ysize_upsampled, ecups));
+    gi.channel[c].hshift = gi.channel[c].vshift =
+        CeilLog2Nonzero(ecups) - CeilLog2Nonzero(frame_header.upsampling);
+
+    int bits = eci.bit_depth.bits_per_sample;
+    int exp_bits = eci.bit_depth.exponent_bits_per_sample;
+    bool fp = eci.bit_depth.floating_point_sample;
+    double factor = (fp ? 1 : ((1u << eci.bit_depth.bits_per_sample) - 1));
+    if (bits + (fp ? 0 : 1) > max_bitdepth) max_bitdepth = bits + (fp ? 0 : 1);
+    std::atomic<bool> has_error{false};
+    JXL_RETURN_IF_ERROR(RunOnPool(
+        pool, 0, gi.channel[c].plane.ysize(), ThreadPool::NoInit,
+        [&](const int task, const int thread) {
+          const size_t y = task;
+          const float* const JXL_RESTRICT row_in = extra_channels[ec].Row(y);
+          pixel_type* const JXL_RESTRICT row_out = gi.channel[c].Row(y);
+          if (!float_to_int(row_in, row_out, gi.channel[c].plane.xsize(), bits,
+                            exp_bits, fp, factor)) {
+            has_error = true;
+          };
+        },
+        "float2int"));
+    if (has_error) return JXL_FAILURE("Error in float to integer conversion");
+  }
+  JXL_ASSERT(c == nb_chans);
+
+  int level_max_bitdepth = (cparams_.level == 5 ? 16 : 32);
+  if (max_bitdepth > level_max_bitdepth)
+    return JXL_FAILURE(
+        "Bitdepth too high for level %i (need %i bits, have only %i in this "
+        "level)",
+        cparams_.level, max_bitdepth, level_max_bitdepth);
+
+  // Set options and apply transformations
+  if (!cparams_.ModularPartIsLossless()) {
+    if (cparams_.palette_colors != 0) {
+      JXL_DEBUG_V(3, "Lossy encode, not doing palette transforms");
+    }
+    if (cparams_.color_transform == ColorTransform::kXYB) {
+      cparams_.channel_colors_pre_transform_percent = 0;
+    }
+    cparams_.channel_colors_percent = 0;
+    cparams_.palette_colors = 0;
+    cparams_.lossy_palette = false;
+  }
+
+  // if few colors, do all-channel palette before trying channel palette
+  // Logic is as follows:
+  // - if you can make a palette with few colors (arbitrary threshold: 200),
+  //   then you can also make channel palettes, but they will just be extra
+  //   signaling cost for almost no benefit
+  // - if the palette needs more colors, then channel palette might help to
+  //   reduce palette signaling cost
+  if (cparams_.palette_colors != 0 &&
+      cparams_.speed_tier < SpeedTier::kFalcon) {
+    // all-channel palette (e.g. RGBA)
+    if (gi.channel.size() > 1) {
+      Transform maybe_palette(TransformId::kPalette);
+      maybe_palette.begin_c = gi.nb_meta_channels;
+      maybe_palette.num_c = gi.channel.size() - gi.nb_meta_channels;
+      maybe_palette.nb_colors =
+          std::min(std::min(200, (int)(xsize * ysize / 8)),
+                   std::abs(cparams_.palette_colors) / 16);
+      maybe_palette.ordered_palette = cparams_.palette_colors >= 0;
+      maybe_palette.lossy_palette = false;
+      do_transform(gi, maybe_palette, weighted::Header(), pool);
+    }
+  }
+
+  // Global channel palette
+  if (cparams_.channel_colors_pre_transform_percent > 0 &&
+      !cparams_.lossy_palette &&
+      (cparams_.speed_tier <= SpeedTier::kThunder ||
+       (do_color && metadata.bit_depth.bits_per_sample > 8))) {
+    // single channel palette (like FLIF's ChannelCompact)
+    size_t nb_channels = gi.channel.size() - gi.nb_meta_channels;
+    int orig_bitdepth = max_bitdepth;
+    max_bitdepth = 0;
+    for (size_t i = 0; i < nb_channels; i++) {
+      int32_t min, max;
+      compute_minmax(gi.channel[gi.nb_meta_channels + i], &min, &max);
+      int64_t colors = (int64_t)max - min + 1;
+      JXL_DEBUG_V(10, "Channel %" PRIuS ": range=%i..%i", i, min, max);
+      Transform maybe_palette_1(TransformId::kPalette);
+      maybe_palette_1.begin_c = i + gi.nb_meta_channels;
+      maybe_palette_1.num_c = 1;
+      // simple heuristic: if less than X percent of the values in the range
+      // actually occur, it is probably worth it to do a compaction
+      // (but only if the channel palette is less than 6% the size of the
+      // image itself)
+      maybe_palette_1.nb_colors = std::min(
+          (int)(xsize * ysize / 16),
+          (int)(cparams_.channel_colors_pre_transform_percent / 100. * colors));
+      if (do_transform(gi, maybe_palette_1, weighted::Header(), pool)) {
+        // effective bit depth is lower, adjust quantization accordingly
+        compute_minmax(gi.channel[gi.nb_meta_channels + i], &min, &max);
+        if (max < maxval) maxval = max;
+        int ch_bitdepth =
+            (max > 0 ? CeilLog2Nonzero(static_cast<uint32_t>(max)) : 0);
+        if (ch_bitdepth > max_bitdepth) max_bitdepth = ch_bitdepth;
+      } else
+        max_bitdepth = orig_bitdepth;
+    }
+  }
+
+  // Global palette
+  if ((cparams_.palette_colors != 0 || cparams_.lossy_palette) &&
+      cparams_.speed_tier < SpeedTier::kFalcon) {
+    // all-channel palette (e.g. RGBA)
+    if (gi.channel.size() - gi.nb_meta_channels > 1) {
+      Transform maybe_palette(TransformId::kPalette);
+      maybe_palette.begin_c = gi.nb_meta_channels;
+      maybe_palette.num_c = gi.channel.size() - gi.nb_meta_channels;
+      maybe_palette.nb_colors =
+          std::min((int)(xsize * ysize / 8), std::abs(cparams_.palette_colors));
+      maybe_palette.ordered_palette = cparams_.palette_colors >= 0;
+      maybe_palette.lossy_palette =
+          (cparams_.lossy_palette && maybe_palette.num_c == 3);
+      if (maybe_palette.lossy_palette) {
+        maybe_palette.predictor = delta_pred_;
+      }
+      // TODO(veluca): use a custom weighted header if using the weighted
+      // predictor.
+      do_transform(gi, maybe_palette, weighted::Header(), pool,
+                   cparams_.options.zero_tokens);
+    }
+    // all-minus-one-channel palette (RGB with separate alpha, or CMY with
+    // separate K)
+    if (gi.channel.size() - gi.nb_meta_channels > 3) {
+      Transform maybe_palette_3(TransformId::kPalette);
+      maybe_palette_3.begin_c = gi.nb_meta_channels;
+      maybe_palette_3.num_c = gi.channel.size() - gi.nb_meta_channels - 1;
+      maybe_palette_3.nb_colors =
+          std::min((int)(xsize * ysize / 8), std::abs(cparams_.palette_colors));
+      maybe_palette_3.ordered_palette = cparams_.palette_colors >= 0;
+      maybe_palette_3.lossy_palette = cparams_.lossy_palette;
+      if (maybe_palette_3.lossy_palette) {
+        maybe_palette_3.predictor = delta_pred_;
+      }
+      do_transform(gi, maybe_palette_3, weighted::Header(), pool,
+                   cparams_.options.zero_tokens);
+    }
+  }
+
+  // don't do an RCT if we're short on bits
+  if (cparams_.color_transform == ColorTransform::kNone && do_color &&
+      gi.channel.size() - gi.nb_meta_channels >= 3 &&
+      max_bitdepth + 1 < level_max_bitdepth) {
+    if (cparams_.colorspace < 0 && (!cparams_.ModularPartIsLossless() ||
+                                    cparams_.speed_tier > SpeedTier::kHare)) {
+      Transform ycocg{TransformId::kRCT};
+      ycocg.rct_type = 6;
+      ycocg.begin_c = gi.nb_meta_channels;
+      do_transform(gi, ycocg, weighted::Header(), pool);
+      max_bitdepth++;
+    } else if (cparams_.colorspace > 0) {
+      Transform sg(TransformId::kRCT);
+      sg.begin_c = gi.nb_meta_channels;
+      sg.rct_type = cparams_.colorspace;
+      do_transform(gi, sg, weighted::Header(), pool);
+      max_bitdepth++;
+    }
+  }
+
+  // don't do squeeze if we don't have some spare bits
+  if (cparams_.responsive && !gi.channel.empty() &&
+      max_bitdepth + 2 < level_max_bitdepth) {
+    Transform t(TransformId::kSqueeze);
+    t.squeezes = cparams_.squeezes;
+    do_transform(gi, t, weighted::Header(), pool);
+    max_bitdepth += 2;
+  }
+
+  if (max_bitdepth + 1 > level_max_bitdepth) {
+    // force no group RCTs if we don't have a spare bit
+    cparams_.colorspace = 0;
+  }
+  JXL_ASSERT(max_bitdepth <= level_max_bitdepth);
+
+  std::vector<uint32_t> quants;
+
+  if (!cparams_.ModularPartIsLossless()) {
+    quants.resize(gi.channel.size(), 1);
+    float quantizer = 0.25f;
+    if (!cparams_.responsive) {
+      JXL_DEBUG_V(1,
+                  "Warning: lossy compression without Squeeze "
+                  "transform is just color quantization.");
+      quantizer *= 0.1f;
+    }
+    float bitdepth_correction = 1.f;
+    if (cparams_.color_transform != ColorTransform::kXYB) {
+      bitdepth_correction = maxval / 255.f;
+    }
+    std::vector<float> quantizers;
+    float dist = cparams_.butteraugli_distance;
+    for (size_t i = 0; i < 3; i++) {
+      quantizers.push_back(quantizer * dist * bitdepth_correction);
+    }
+    for (size_t i = 0; i < extra_channels.size(); i++) {
+      int ec_bitdepth =
+          metadata.extra_channel_info[i].bit_depth.bits_per_sample;
+      pixel_type ec_maxval = ec_bitdepth < 32 ? (1u << ec_bitdepth) - 1 : 0;
+      bitdepth_correction = ec_maxval / 255.f;
+      if (i < cparams_.ec_distance.size()) dist = cparams_.ec_distance[i];
+      if (dist < 0) dist = cparams_.butteraugli_distance;
+      quantizers.push_back(quantizer * dist * bitdepth_correction);
+    }
+    if (cparams_.options.nb_repeats == 0) {
+      return JXL_FAILURE("nb_repeats = 0 not supported with modular lossy!");
+    }
+    for (uint32_t i = gi.nb_meta_channels; i < gi.channel.size(); i++) {
+      Channel& ch = gi.channel[i];
+      int shift = ch.hshift + ch.vshift;  // number of pixel halvings
+      if (shift > 16) shift = 16;
+      if (shift > 0) shift--;
+      int q;
+      // assuming default Squeeze here
+      int component =
+          (do_color ? 0 : 3) + ((i - gi.nb_meta_channels) % nb_chans);
+      // last 4 channels are final chroma residuals
+      if (nb_chans > 2 && i >= gi.channel.size() - 4 && cparams_.responsive) {
+        component = 1;
+      }
+      if (cparams_.color_transform == ColorTransform::kXYB && component < 3) {
+        q = quantizers[component] * squeeze_quality_factor_xyb *
+            squeeze_xyb_qtable[component][shift];
+      } else {
+        if (cparams_.colorspace != 0 && component > 0 && component < 3) {
+          q = quantizers[component] * squeeze_quality_factor *
+              squeeze_chroma_qtable[shift];
+        } else {
+          q = quantizers[component] * squeeze_quality_factor *
+              squeeze_luma_factor * squeeze_luma_qtable[shift];
+        }
+      }
+      if (q < 1) q = 1;
+      QuantizeChannel(gi.channel[i], q);
+      quants[i] = q;
+    }
+  }
+
+  // Fill other groups.
+  struct GroupParams {
+    Rect rect;
+    int minShift;
+    int maxShift;
+    ModularStreamId id;
+  };
+  std::vector<GroupParams> stream_params;
+
+  stream_options_[0] = cparams_.options;
+
+  // DC
+  for (size_t group_id = 0; group_id < frame_dim_.num_dc_groups; group_id++) {
+    const size_t gx = group_id % frame_dim_.xsize_dc_groups;
+    const size_t gy = group_id / frame_dim_.xsize_dc_groups;
+    const Rect rect(gx * frame_dim_.dc_group_dim, gy * frame_dim_.dc_group_dim,
+                    frame_dim_.dc_group_dim, frame_dim_.dc_group_dim);
+    // minShift==3 because (frame_dim.dc_group_dim >> 3) == frame_dim.group_dim
+    // maxShift==1000 is infinity
+    stream_params.push_back(
+        GroupParams{rect, 3, 1000, ModularStreamId::ModularDC(group_id)});
+  }
+  // AC global -> nothing.
+  // AC
+  for (size_t group_id = 0; group_id < frame_dim_.num_groups; group_id++) {
+    const size_t gx = group_id % frame_dim_.xsize_groups;
+    const size_t gy = group_id / frame_dim_.xsize_groups;
+    const Rect mrect(gx * frame_dim_.group_dim, gy * frame_dim_.group_dim,
+                     frame_dim_.group_dim, frame_dim_.group_dim);
+    for (size_t i = 0; i < enc_state->progressive_splitter.GetNumPasses();
+         i++) {
+      int maxShift, minShift;
+      frame_header.passes.GetDownsamplingBracket(i, minShift, maxShift);
+      stream_params.push_back(GroupParams{
+          mrect, minShift, maxShift, ModularStreamId::ModularAC(group_id, i)});
+    }
+  }
+  // if there's only one group, everything ends up in GlobalModular
+  // in that case, also try RCTs/WP params for the one group
+  if (stream_params.size() == 2) {
+    stream_params.push_back(GroupParams{Rect(0, 0, xsize, ysize), 0, 1000,
+                                        ModularStreamId::Global()});
+  }
+  gi_channel_.resize(stream_images_.size());
+
+  JXL_RETURN_IF_ERROR(RunOnPool(
+      pool, 0, stream_params.size(), ThreadPool::NoInit,
+      [&](const uint32_t i, size_t /* thread */) {
+        stream_options_[stream_params[i].id.ID(frame_dim_)] = cparams_.options;
+        JXL_CHECK(PrepareStreamParams(
+            stream_params[i].rect, cparams_, stream_params[i].minShift,
+            stream_params[i].maxShift, stream_params[i].id, do_color));
+      },
+      "ChooseParams"));
+  {
+    // Clear out channels that have been copied to groups.
+    Image& full_image = stream_images_[0];
+    size_t c = full_image.nb_meta_channels;
+    for (; c < full_image.channel.size(); c++) {
+      Channel& fc = full_image.channel[c];
+      if (fc.w > frame_dim_.group_dim || fc.h > frame_dim_.group_dim) break;
+    }
+    for (; c < full_image.channel.size(); c++) {
+      full_image.channel[c].plane = ImageI();
+    }
+  }
+
+  if (!quants.empty()) {
+    for (uint32_t stream_id = 0; stream_id < stream_images_.size();
+         stream_id++) {
+      // skip non-modular stream_ids
+      if (stream_id > 0 && gi_channel_[stream_id].empty()) continue;
+      const Image& image = stream_images_[stream_id];
+      const ModularOptions& options = stream_options_[stream_id];
+      for (uint32_t i = image.nb_meta_channels; i < image.channel.size(); i++) {
+        if (i >= image.nb_meta_channels &&
+            (image.channel[i].w > options.max_chan_size ||
+             image.channel[i].h > options.max_chan_size)) {
+          continue;
+        }
+        if (stream_id > 0 && gi_channel_[stream_id].empty()) continue;
+        size_t ch_id = stream_id == 0
+                           ? i
+                           : gi_channel_[stream_id][i - image.nb_meta_channels];
+        uint32_t q = quants[ch_id];
+        // Inform the tree splitting heuristics that each channel in each group
+        // used this quantization factor. This will produce a tree with the
+        // given multipliers.
+        if (multiplier_info_.empty() ||
+            multiplier_info_.back().range[1][0] != stream_id ||
+            multiplier_info_.back().multiplier != q) {
+          StaticPropRange range;
+          range[0] = {{i, i + 1}};
+          range[1] = {{stream_id, stream_id + 1}};
+          multiplier_info_.push_back({range, (uint32_t)q});
+        } else {
+          // Previous channel in the same group had the same quantization
+          // factor. Don't provide two different ranges, as that creates
+          // unnecessary nodes.
+          multiplier_info_.back().range[0][1] = i + 1;
+        }
+      }
+    }
+    // Merge group+channel settings that have the same channels and quantization
+    // factors, to avoid unnecessary nodes.
+    std::sort(multiplier_info_.begin(), multiplier_info_.end(),
+              [](ModularMultiplierInfo a, ModularMultiplierInfo b) {
+                return std::make_tuple(a.range, a.multiplier) <
+                       std::make_tuple(b.range, b.multiplier);
+              });
+    size_t new_num = 1;
+    for (size_t i = 1; i < multiplier_info_.size(); i++) {
+      ModularMultiplierInfo& prev = multiplier_info_[new_num - 1];
+      ModularMultiplierInfo& cur = multiplier_info_[i];
+      if (prev.range[0] == cur.range[0] && prev.multiplier == cur.multiplier &&
+          prev.range[1][1] == cur.range[1][0]) {
+        prev.range[1][1] = cur.range[1][1];
+      } else {
+        multiplier_info_[new_num++] = multiplier_info_[i];
+      }
+    }
+    multiplier_info_.resize(new_num);
+  }
+
+  JXL_RETURN_IF_ERROR(ValidateChannelDimensions(gi, stream_options_[0]));
+
+  return PrepareEncoding(frame_header, pool, enc_state->heuristics.get(),
+                         aux_out);
+}
+
+Status ModularFrameEncoder::PrepareEncoding(const FrameHeader& frame_header,
+                                            ThreadPool* pool,
+                                            EncoderHeuristics* heuristics,
+                                            AuxOut* aux_out) {
+  if (!tree_.empty()) return true;
+
+  // Compute tree.
+  size_t num_streams = stream_images_.size();
+  stream_headers_.resize(num_streams);
+  tokens_.resize(num_streams);
+
+  if (heuristics->CustomFixedTreeLossless(frame_dim_, &tree_)) {
+    // Using a fixed tree.
+  } else if (cparams_.speed_tier < SpeedTier::kFalcon ||
+             !cparams_.modular_mode) {
+    // Avoid creating a tree with leaves that don't correspond to any pixels.
+    std::vector<size_t> useful_splits;
+    useful_splits.reserve(tree_splits_.size());
+    for (size_t chunk = 0; chunk < tree_splits_.size() - 1; chunk++) {
+      bool has_pixels = false;
+      size_t start = tree_splits_[chunk];
+      size_t stop = tree_splits_[chunk + 1];
+      for (size_t i = start; i < stop; i++) {
+        if (!stream_images_[i].empty()) has_pixels = true;
+      }
+      if (has_pixels) {
+        useful_splits.push_back(tree_splits_[chunk]);
+      }
+    }
+    // Don't do anything if modular mode does not have any pixels in this image
+    if (useful_splits.empty()) return true;
+    useful_splits.push_back(tree_splits_.back());
+
+    std::atomic_flag invalid_force_wp = ATOMIC_FLAG_INIT;
+
+    std::vector<Tree> trees(useful_splits.size() - 1);
+    JXL_RETURN_IF_ERROR(RunOnPool(
+        pool, 0, useful_splits.size() - 1, ThreadPool::NoInit,
+        [&](const uint32_t chunk, size_t /* thread */) {
+          // TODO(veluca): parallelize more.
+          size_t total_pixels = 0;
+          uint32_t start = useful_splits[chunk];
+          uint32_t stop = useful_splits[chunk + 1];
+          while (start < stop && stream_images_[start].empty()) ++start;
+          while (start < stop && stream_images_[stop - 1].empty()) --stop;
+          uint32_t max_c = 0;
+          if (stream_options_[start].tree_kind !=
+              ModularOptions::TreeKind::kLearn) {
+            for (size_t i = start; i < stop; i++) {
+              for (const Channel& ch : stream_images_[i].channel) {
+                total_pixels += ch.w * ch.h;
+              }
+            }
+            trees[chunk] =
+                PredefinedTree(stream_options_[start].tree_kind, total_pixels);
+            return;
+          }
+          TreeSamples tree_samples;
+          if (!tree_samples.SetPredictor(stream_options_[start].predictor,
+                                         stream_options_[start].wp_tree_mode)) {
+            invalid_force_wp.test_and_set(std::memory_order_acq_rel);
+            return;
+          }
+          if (!tree_samples.SetProperties(
+                  stream_options_[start].splitting_heuristics_properties,
+                  stream_options_[start].wp_tree_mode)) {
+            invalid_force_wp.test_and_set(std::memory_order_acq_rel);
+            return;
+          }
+          std::vector<pixel_type> pixel_samples;
+          std::vector<pixel_type> diff_samples;
+          std::vector<uint32_t> group_pixel_count;
+          std::vector<uint32_t> channel_pixel_count;
+          for (size_t i = start; i < stop; i++) {
+            max_c = std::max<uint32_t>(stream_images_[i].channel.size(), max_c);
+            CollectPixelSamples(stream_images_[i], stream_options_[i], i,
+                                group_pixel_count, channel_pixel_count,
+                                pixel_samples, diff_samples);
+          }
+          StaticPropRange range;
+          range[0] = {{0, max_c}};
+          range[1] = {{start, stop}};
+          auto local_multiplier_info = multiplier_info_;
+
+          tree_samples.PreQuantizeProperties(
+              range, local_multiplier_info, group_pixel_count,
+              channel_pixel_count, pixel_samples, diff_samples,
+              stream_options_[start].max_property_values);
+          for (size_t i = start; i < stop; i++) {
+            JXL_CHECK(ModularGenericCompress(
+                stream_images_[i], stream_options_[i], /*writer=*/nullptr,
+                /*aux_out=*/nullptr, 0, i, &tree_samples, &total_pixels));
+          }
+
+          // TODO(veluca): parallelize more.
+          trees[chunk] =
+              LearnTree(std::move(tree_samples), total_pixels,
+                        stream_options_[start], local_multiplier_info, range);
+        },
+        "LearnTrees"));
+    if (invalid_force_wp.test_and_set(std::memory_order_acq_rel)) {
+      return JXL_FAILURE("PrepareEncoding: force_no_wp with {Weighted}");
+    }
+    tree_.clear();
+    MergeTrees(trees, useful_splits, 0, useful_splits.size() - 1, &tree_);
+  } else {
+    // Fixed tree.
+    size_t total_pixels = 0;
+    for (const Image& img : stream_images_) {
+      for (const Channel& ch : img.channel) {
+        total_pixels += ch.w * ch.h;
+      }
+    }
+    if (cparams_.speed_tier <= SpeedTier::kFalcon) {
+      tree_ =
+          PredefinedTree(ModularOptions::TreeKind::kWPFixedDC, total_pixels);
+    } else if (cparams_.speed_tier <= SpeedTier::kThunder) {
+      tree_ = PredefinedTree(ModularOptions::TreeKind::kGradientFixedDC,
+                             total_pixels);
+    } else {
+      tree_ = {PropertyDecisionNode::Leaf(Predictor::Gradient)};
+    }
+  }
+  tree_tokens_.resize(1);
+  tree_tokens_[0].clear();
+  Tree decoded_tree;
+  TokenizeTree(tree_, &tree_tokens_[0], &decoded_tree);
+  JXL_ASSERT(tree_.size() == decoded_tree.size());
+  tree_ = std::move(decoded_tree);
+
+  if (kPrintTree && WantDebugOutput(aux_out)) {
+    if (frame_header.dc_level > 0) {
+      PrintTree(tree_, aux_out->debug_prefix + "/dc_frame_level" +
+                           std::to_string(frame_header.dc_level) + "_tree");
+    } else {
+      PrintTree(tree_, aux_out->debug_prefix + "/global_tree");
+    }
+  }
+
+  image_widths_.resize(num_streams);
+  JXL_RETURN_IF_ERROR(RunOnPool(
+      pool, 0, num_streams, ThreadPool::NoInit,
+      [&](const uint32_t stream_id, size_t /* thread */) {
+        AuxOut my_aux_out;
+        if (aux_out) {
+          my_aux_out.dump_image = aux_out->dump_image;
+          my_aux_out.debug_prefix = aux_out->debug_prefix;
+        }
+        tokens_[stream_id].clear();
+        JXL_CHECK(ModularGenericCompress(
+            stream_images_[stream_id], stream_options_[stream_id],
+            /*writer=*/nullptr, &my_aux_out, 0, stream_id,
+            /*tree_samples=*/nullptr,
+            /*total_pixels=*/nullptr,
+            /*tree=*/&tree_, /*header=*/&stream_headers_[stream_id],
+            /*tokens=*/&tokens_[stream_id],
+            /*widths=*/&image_widths_[stream_id]));
+      },
+      "ComputeTokens"));
+  return true;
+}
+
+Status ModularFrameEncoder::EncodeGlobalInfo(BitWriter* writer,
+                                             AuxOut* aux_out) {
+  BitWriter::Allotment allotment(writer, 1);
+  // If we are using brotli, or not using modular mode.
+  if (tree_tokens_.empty() || tree_tokens_[0].empty()) {
+    writer->Write(1, 0);
+    allotment.ReclaimAndCharge(writer, kLayerModularTree, aux_out);
+    return true;
+  }
+  writer->Write(1, 1);
+  allotment.ReclaimAndCharge(writer, kLayerModularTree, aux_out);
+
+  // Write tree
+  HistogramParams params;
+  if (cparams_.speed_tier > SpeedTier::kKitten) {
+    params.clustering = HistogramParams::ClusteringType::kFast;
+    params.ans_histogram_strategy =
+        cparams_.speed_tier > SpeedTier::kThunder
+            ? HistogramParams::ANSHistogramStrategy::kFast
+            : HistogramParams::ANSHistogramStrategy::kApproximate;
+    params.lz77_method =
+        cparams_.decoding_speed_tier >= 3 && cparams_.modular_mode
+            ? (cparams_.speed_tier >= SpeedTier::kFalcon
+                   ? HistogramParams::LZ77Method::kRLE
+                   : HistogramParams::LZ77Method::kLZ77)
+            : HistogramParams::LZ77Method::kNone;
+    // Near-lossless DC, as well as modular mode, require choosing hybrid uint
+    // more carefully.
+    if ((!extra_dc_precision.empty() && extra_dc_precision[0] != 0) ||
+        (cparams_.modular_mode && cparams_.speed_tier < SpeedTier::kCheetah)) {
+      params.uint_method = HistogramParams::HybridUintMethod::kFast;
+    } else {
+      params.uint_method = HistogramParams::HybridUintMethod::kNone;
+    }
+  } else if (cparams_.speed_tier <= SpeedTier::kTortoise) {
+    params.lz77_method = HistogramParams::LZ77Method::kOptimal;
+  } else {
+    params.lz77_method = HistogramParams::LZ77Method::kLZ77;
+  }
+  if (cparams_.decoding_speed_tier >= 1) {
+    params.max_histograms = 12;
+  }
+  if (cparams_.decoding_speed_tier >= 1 && cparams_.responsive) {
+    params.lz77_method = cparams_.speed_tier >= SpeedTier::kCheetah
+                             ? HistogramParams::LZ77Method::kRLE
+                         : cparams_.speed_tier >= SpeedTier::kKitten
+                             ? HistogramParams::LZ77Method::kLZ77
+                             : HistogramParams::LZ77Method::kOptimal;
+  }
+  if (cparams_.decoding_speed_tier >= 2 && cparams_.responsive) {
+    params.uint_method = HistogramParams::HybridUintMethod::k000;
+    params.force_huffman = true;
+  }
+  BuildAndEncodeHistograms(params, kNumTreeContexts, tree_tokens_, &code_,
+                           &context_map_, writer, kLayerModularTree, aux_out);
+  WriteTokens(tree_tokens_[0], code_, context_map_, writer, kLayerModularTree,
+              aux_out);
+  params.image_widths = image_widths_;
+  // Write histograms.
+  BuildAndEncodeHistograms(params, (tree_.size() + 1) / 2, tokens_, &code_,
+                           &context_map_, writer, kLayerModularGlobal, aux_out);
+  return true;
+}
+
+Status ModularFrameEncoder::EncodeStream(BitWriter* writer, AuxOut* aux_out,
+                                         size_t layer,
+                                         const ModularStreamId& stream) {
+  size_t stream_id = stream.ID(frame_dim_);
+  if (stream_images_[stream_id].channel.empty()) {
+    return true;  // Image with no channels, header never gets decoded.
+  }
+  JXL_RETURN_IF_ERROR(
+      Bundle::Write(stream_headers_[stream_id], writer, layer, aux_out));
+  WriteTokens(tokens_[stream_id], code_, context_map_, writer, layer, aux_out);
+  return true;
+}
+
+namespace {
+float EstimateWPCost(const Image& img, size_t i) {
+  size_t extra_bits = 0;
+  float histo_cost = 0;
+  HybridUintConfig config;
+  int32_t cutoffs[] = {-500, -392, -255, -191, -127, -95, -63, -47, -31,
+                       -23,  -15,  -11,  -7,   -4,   -3,  -1,  0,   1,
+                       3,    5,    7,    11,   15,   23,  31,  47,  63,
+                       95,   127,  191,  255,  392,  500};
+  constexpr size_t nc = sizeof(cutoffs) / sizeof(*cutoffs) + 1;
+  Histogram histo[nc] = {};
+  weighted::Header wp_header;
+  PredictorMode(i, &wp_header);
+  for (const Channel& ch : img.channel) {
+    const intptr_t onerow = ch.plane.PixelsPerRow();
+    weighted::State wp_state(wp_header, ch.w, ch.h);
+    Properties properties(1);
+    for (size_t y = 0; y < ch.h; y++) {
+      const pixel_type* JXL_RESTRICT r = ch.Row(y);
+      for (size_t x = 0; x < ch.w; x++) {
+        size_t offset = 0;
+        pixel_type_w left = (x ? r[x - 1] : y ? *(r + x - onerow) : 0);
+        pixel_type_w top = (y ? *(r + x - onerow) : left);
+        pixel_type_w topleft = (x && y ? *(r + x - 1 - onerow) : left);
+        pixel_type_w topright =
+            (x + 1 < ch.w && y ? *(r + x + 1 - onerow) : top);
+        pixel_type_w toptop = (y > 1 ? *(r + x - onerow - onerow) : top);
+        pixel_type guess = wp_state.Predict</*compute_properties=*/true>(
+            x, y, ch.w, top, left, topright, topleft, toptop, &properties,
+            offset);
+        size_t ctx = 0;
+        for (int c : cutoffs) {
+          ctx += c >= properties[0];
+        }
+        pixel_type res = r[x] - guess;
+        uint32_t token, nbits, bits;
+        config.Encode(PackSigned(res), &token, &nbits, &bits);
+        histo[ctx].Add(token);
+        extra_bits += nbits;
+        wp_state.UpdateErrors(r[x], x, y, ch.w);
+      }
+    }
+    for (size_t h = 0; h < nc; h++) {
+      histo_cost += histo[h].ShannonEntropy();
+      histo[h].Clear();
+    }
+  }
+  return histo_cost + extra_bits;
+}
+
+float EstimateCost(const Image& img) {
+  // TODO(veluca): consider SIMDfication of this code.
+  size_t extra_bits = 0;
+  float histo_cost = 0;
+  HybridUintConfig config;
+  uint32_t cutoffs[] = {0,  1,  3,  5,   7,   11,  15,  23, 31,
+                        47, 63, 95, 127, 191, 255, 392, 500};
+  constexpr size_t nc = sizeof(cutoffs) / sizeof(*cutoffs) + 1;
+  Histogram histo[nc] = {};
+  for (const Channel& ch : img.channel) {
+    const intptr_t onerow = ch.plane.PixelsPerRow();
+    for (size_t y = 0; y < ch.h; y++) {
+      const pixel_type* JXL_RESTRICT r = ch.Row(y);
+      for (size_t x = 0; x < ch.w; x++) {
+        pixel_type_w left = (x ? r[x - 1] : y ? *(r + x - onerow) : 0);
+        pixel_type_w top = (y ? *(r + x - onerow) : left);
+        pixel_type_w topleft = (x && y ? *(r + x - 1 - onerow) : left);
+        size_t maxdiff = std::max(std::max(left, top), topleft) -
+                         std::min(std::min(left, top), topleft);
+        size_t ctx = 0;
+        for (uint32_t c : cutoffs) {
+          ctx += c > maxdiff;
+        }
+        pixel_type res = r[x] - ClampedGradient(top, left, topleft);
+        uint32_t token, nbits, bits;
+        config.Encode(PackSigned(res), &token, &nbits, &bits);
+        histo[ctx].Add(token);
+        extra_bits += nbits;
+      }
+    }
+    for (size_t h = 0; h < nc; h++) {
+      histo_cost += histo[h].ShannonEntropy();
+      histo[h].Clear();
+    }
+  }
+  return histo_cost + extra_bits;
+}
+
+}  // namespace
+
+Status ModularFrameEncoder::PrepareStreamParams(const Rect& rect,
+                                                const CompressParams& cparams_,
+                                                int minShift, int maxShift,
+                                                const ModularStreamId& stream,
+                                                bool do_color) {
+  size_t stream_id = stream.ID(frame_dim_);
+  Image& full_image = stream_images_[0];
+  const size_t xsize = rect.xsize();
+  const size_t ysize = rect.ysize();
+  Image& gi = stream_images_[stream_id];
+  if (stream_id > 0) {
+    gi = Image(xsize, ysize, full_image.bitdepth, 0);
+    // start at the first bigger-than-frame_dim.group_dim non-metachannel
+    size_t c = full_image.nb_meta_channels;
+    for (; c < full_image.channel.size(); c++) {
+      Channel& fc = full_image.channel[c];
+      if (fc.w > frame_dim_.group_dim || fc.h > frame_dim_.group_dim) break;
+    }
+    for (; c < full_image.channel.size(); c++) {
+      Channel& fc = full_image.channel[c];
+      int shift = std::min(fc.hshift, fc.vshift);
+      if (shift > maxShift) continue;
+      if (shift < minShift) continue;
+      Rect r(rect.x0() >> fc.hshift, rect.y0() >> fc.vshift,
+             rect.xsize() >> fc.hshift, rect.ysize() >> fc.vshift, fc.w, fc.h);
+      if (r.xsize() == 0 || r.ysize() == 0) continue;
+      gi_channel_[stream_id].push_back(c);
+      Channel gc(r.xsize(), r.ysize());
+      gc.hshift = fc.hshift;
+      gc.vshift = fc.vshift;
+      for (size_t y = 0; y < r.ysize(); ++y) {
+        memcpy(gc.Row(y), r.ConstRow(fc.plane, y),
+               r.xsize() * sizeof(pixel_type));
+      }
+      gi.channel.emplace_back(std::move(gc));
+    }
+
+    if (gi.channel.empty()) return true;
+    // Do some per-group transforms
+
+    // Local palette
+    // TODO(veluca): make this work with quantize-after-prediction in lossy
+    // mode.
+    if (cparams_.butteraugli_distance == 0.f && cparams_.palette_colors != 0 &&
+        cparams_.speed_tier < SpeedTier::kCheetah) {
+      // all-channel palette (e.g. RGBA)
+      if (gi.channel.size() - gi.nb_meta_channels > 1) {
+        Transform maybe_palette(TransformId::kPalette);
+        maybe_palette.begin_c = gi.nb_meta_channels;
+        maybe_palette.num_c = gi.channel.size() - gi.nb_meta_channels;
+        maybe_palette.nb_colors = std::abs(cparams_.palette_colors);
+        maybe_palette.ordered_palette = cparams_.palette_colors >= 0;
+        do_transform(gi, maybe_palette, weighted::Header());
+      }
+      // all-minus-one-channel palette (RGB with separate alpha, or CMY with
+      // separate K)
+      if (gi.channel.size() - gi.nb_meta_channels > 3) {
+        Transform maybe_palette_3(TransformId::kPalette);
+        maybe_palette_3.begin_c = gi.nb_meta_channels;
+        maybe_palette_3.num_c = gi.channel.size() - gi.nb_meta_channels - 1;
+        maybe_palette_3.nb_colors = std::abs(cparams_.palette_colors);
+        maybe_palette_3.ordered_palette = cparams_.palette_colors >= 0;
+        maybe_palette_3.lossy_palette = cparams_.lossy_palette;
+        if (maybe_palette_3.lossy_palette) {
+          maybe_palette_3.predictor = Predictor::Weighted;
+        }
+        do_transform(gi, maybe_palette_3, weighted::Header());
+      }
+    }
+
+    // Local channel palette
+    if (cparams_.channel_colors_percent > 0 &&
+        cparams_.butteraugli_distance == 0.f && !cparams_.lossy_palette &&
+        cparams_.speed_tier < SpeedTier::kCheetah &&
+        !(cparams_.responsive && cparams_.decoding_speed_tier >= 1)) {
+      // single channel palette (like FLIF's ChannelCompact)
+      size_t nb_channels = gi.channel.size() - gi.nb_meta_channels;
+      for (size_t i = 0; i < nb_channels; i++) {
+        int32_t min, max;
+        compute_minmax(gi.channel[gi.nb_meta_channels + i], &min, &max);
+        int64_t colors = (int64_t)max - min + 1;
+        JXL_DEBUG_V(10, "Channel %" PRIuS ": range=%i..%i", i, min, max);
+        Transform maybe_palette_1(TransformId::kPalette);
+        maybe_palette_1.begin_c = i + gi.nb_meta_channels;
+        maybe_palette_1.num_c = 1;
+        // simple heuristic: if less than X percent of the values in the range
+        // actually occur, it is probably worth it to do a compaction
+        // (but only if the channel palette is less than 80% the size of the
+        // image itself)
+        maybe_palette_1.nb_colors =
+            std::min((int)(xsize * ysize * 0.8),
+                     (int)(cparams_.channel_colors_percent / 100. * colors));
+        do_transform(gi, maybe_palette_1, weighted::Header());
+      }
+    }
+  }
+
+  // lossless and no specific color transform specified: try Nothing, YCoCg,
+  // and 17 RCTs
+  if (cparams_.color_transform == ColorTransform::kNone &&
+      cparams_.IsLossless() && cparams_.colorspace < 0 &&
+      gi.channel.size() - gi.nb_meta_channels >= 3 &&
+      cparams_.responsive == false && do_color &&
+      cparams_.speed_tier <= SpeedTier::kHare) {
+    Transform sg(TransformId::kRCT);
+    sg.begin_c = gi.nb_meta_channels;
+    size_t nb_rcts_to_try = 0;
+    switch (cparams_.speed_tier) {
+      case SpeedTier::kLightning:
+      case SpeedTier::kThunder:
+      case SpeedTier::kFalcon:
+      case SpeedTier::kCheetah:
+        nb_rcts_to_try = 0;  // Just do global YCoCg
+        break;
+      case SpeedTier::kHare:
+        nb_rcts_to_try = 4;
+        break;
+      case SpeedTier::kWombat:
+        nb_rcts_to_try = 5;
+        break;
+      case SpeedTier::kSquirrel:
+        nb_rcts_to_try = 7;
+        break;
+      case SpeedTier::kKitten:
+        nb_rcts_to_try = 9;
+        break;
+      case SpeedTier::kGlacier:
+      case SpeedTier::kTortoise:
+        nb_rcts_to_try = 19;
+        break;
+    }
+    float best_cost = std::numeric_limits<float>::max();
+    size_t best_rct = 0;
+    // These should be 19 actually different transforms; the remaining ones
+    // are equivalent to one of these (note that the first two are do-nothing
+    // and YCoCg) modulo channel reordering (which only matters in the case of
+    // MA-with-prev-channels-properties) and/or sign (e.g. RmG vs GmR)
+    for (int i : {0 * 7 + 0, 0 * 7 + 6, 0 * 7 + 5, 1 * 7 + 3, 3 * 7 + 5,
+                  5 * 7 + 5, 1 * 7 + 5, 2 * 7 + 5, 1 * 7 + 1, 0 * 7 + 4,
+                  1 * 7 + 2, 2 * 7 + 1, 2 * 7 + 2, 2 * 7 + 3, 4 * 7 + 4,
+                  4 * 7 + 5, 0 * 7 + 2, 0 * 7 + 1, 0 * 7 + 3}) {
+      if (nb_rcts_to_try == 0) break;
+      sg.rct_type = i;
+      nb_rcts_to_try--;
+      if (do_transform(gi, sg, weighted::Header())) {
+        float cost = EstimateCost(gi);
+        if (cost < best_cost) {
+          best_rct = i;
+          best_cost = cost;
+        }
+        Transform t = gi.transform.back();
+        JXL_RETURN_IF_ERROR(t.Inverse(gi, weighted::Header(), nullptr));
+        gi.transform.pop_back();
+      }
+    }
+    // Apply the best RCT to the image for future encoding.
+    sg.rct_type = best_rct;
+    do_transform(gi, sg, weighted::Header());
+  } else {
+    // No need to try anything, just use the default options.
+  }
+  size_t nb_wp_modes = 1;
+  if (cparams_.speed_tier <= SpeedTier::kTortoise) {
+    nb_wp_modes = 5;
+  } else if (cparams_.speed_tier <= SpeedTier::kKitten) {
+    nb_wp_modes = 2;
+  }
+  if (nb_wp_modes > 1 &&
+      (stream_options_[stream_id].predictor == Predictor::Weighted ||
+       stream_options_[stream_id].predictor == Predictor::Best ||
+       stream_options_[stream_id].predictor == Predictor::Variable)) {
+    float best_cost = std::numeric_limits<float>::max();
+    stream_options_[stream_id].wp_mode = 0;
+    for (size_t i = 0; i < nb_wp_modes; i++) {
+      float cost = EstimateWPCost(gi, i);
+      if (cost < best_cost) {
+        best_cost = cost;
+        stream_options_[stream_id].wp_mode = i;
+      }
+    }
+  }
+  return true;
+}
+
+constexpr float q_deadzone = 0.62f;
+int QuantizeWP(const int32_t* qrow, size_t onerow, size_t c, size_t x, size_t y,
+               size_t w, weighted::State* wp_state, float value,
+               float inv_factor) {
+  float svalue = value * inv_factor;
+  PredictionResult pred =
+      PredictNoTreeWP(w, qrow + x, onerow, x, y, Predictor::Weighted, wp_state);
+  svalue -= pred.guess;
+  if (svalue > -q_deadzone && svalue < q_deadzone) svalue = 0;
+  int residual = roundf(svalue);
+  if (residual > 2 || residual < -2) residual = roundf(svalue * 0.5) * 2;
+  return residual + pred.guess;
+}
+
+int QuantizeGradient(const int32_t* qrow, size_t onerow, size_t c, size_t x,
+                     size_t y, size_t w, float value, float inv_factor) {
+  float svalue = value * inv_factor;
+  PredictionResult pred =
+      PredictNoTreeNoWP(w, qrow + x, onerow, x, y, Predictor::Gradient);
+  svalue -= pred.guess;
+  if (svalue > -q_deadzone && svalue < q_deadzone) svalue = 0;
+  int residual = roundf(svalue);
+  if (residual > 2 || residual < -2) residual = roundf(svalue * 0.5) * 2;
+  return residual + pred.guess;
+}
+
+void ModularFrameEncoder::AddVarDCTDC(const Image3F& dc, size_t group_index,
+                                      bool nl_dc, PassesEncoderState* enc_state,
+                                      bool jpeg_transcode) {
+  const Rect r = enc_state->shared.DCGroupRect(group_index);
+  extra_dc_precision[group_index] = nl_dc ? 1 : 0;
+  float mul = 1 << extra_dc_precision[group_index];
+
+  size_t stream_id = ModularStreamId::VarDCTDC(group_index).ID(frame_dim_);
+  stream_options_[stream_id].max_chan_size = 0xFFFFFF;
+  stream_options_[stream_id].predictor = Predictor::Weighted;
+  stream_options_[stream_id].wp_tree_mode = ModularOptions::TreeMode::kWPOnly;
+  if (cparams_.speed_tier >= SpeedTier::kSquirrel) {
+    stream_options_[stream_id].tree_kind = ModularOptions::TreeKind::kWPFixedDC;
+  }
+  if (cparams_.speed_tier < SpeedTier::kSquirrel && !nl_dc) {
+    stream_options_[stream_id].predictor =
+        (cparams_.speed_tier < SpeedTier::kKitten ? Predictor::Variable
+                                                  : Predictor::Best);
+    stream_options_[stream_id].wp_tree_mode =
+        ModularOptions::TreeMode::kDefault;
+    stream_options_[stream_id].tree_kind = ModularOptions::TreeKind::kLearn;
+  }
+  if (cparams_.decoding_speed_tier >= 1) {
+    stream_options_[stream_id].tree_kind =
+        ModularOptions::TreeKind::kGradientFixedDC;
+  }
+
+  stream_images_[stream_id] = Image(r.xsize(), r.ysize(), 8, 3);
+  if (nl_dc && stream_options_[stream_id].tree_kind ==
+                   ModularOptions::TreeKind::kGradientFixedDC) {
+    JXL_ASSERT(enc_state->shared.frame_header.chroma_subsampling.Is444());
+    for (size_t c : {1, 0, 2}) {
+      float inv_factor = enc_state->shared.quantizer.GetInvDcStep(c) * mul;
+      float y_factor = enc_state->shared.quantizer.GetDcStep(1) / mul;
+      float cfl_factor = enc_state->shared.cmap.DCFactors()[c];
+      for (size_t y = 0; y < r.ysize(); y++) {
+        int32_t* quant_row =
+            stream_images_[stream_id].channel[c < 2 ? c ^ 1 : c].plane.Row(y);
+        size_t stride = stream_images_[stream_id]
+                            .channel[c < 2 ? c ^ 1 : c]
+                            .plane.PixelsPerRow();
+        const float* row = r.ConstPlaneRow(dc, c, y);
+        if (c == 1) {
+          for (size_t x = 0; x < r.xsize(); x++) {
+            quant_row[x] = QuantizeGradient(quant_row, stride, c, x, y,
+                                            r.xsize(), row[x], inv_factor);
+          }
+        } else {
+          int32_t* quant_row_y =
+              stream_images_[stream_id].channel[0].plane.Row(y);
+          for (size_t x = 0; x < r.xsize(); x++) {
+            quant_row[x] = QuantizeGradient(
+                quant_row, stride, c, x, y, r.xsize(),
+                row[x] - quant_row_y[x] * (y_factor * cfl_factor), inv_factor);
+          }
+        }
+      }
+    }
+  } else if (nl_dc) {
+    JXL_ASSERT(enc_state->shared.frame_header.chroma_subsampling.Is444());
+    for (size_t c : {1, 0, 2}) {
+      float inv_factor = enc_state->shared.quantizer.GetInvDcStep(c) * mul;
+      float y_factor = enc_state->shared.quantizer.GetDcStep(1) / mul;
+      float cfl_factor = enc_state->shared.cmap.DCFactors()[c];
+      weighted::Header header;
+      weighted::State wp_state(header, r.xsize(), r.ysize());
+      for (size_t y = 0; y < r.ysize(); y++) {
+        int32_t* quant_row =
+            stream_images_[stream_id].channel[c < 2 ? c ^ 1 : c].plane.Row(y);
+        size_t stride = stream_images_[stream_id]
+                            .channel[c < 2 ? c ^ 1 : c]
+                            .plane.PixelsPerRow();
+        const float* row = r.ConstPlaneRow(dc, c, y);
+        if (c == 1) {
+          for (size_t x = 0; x < r.xsize(); x++) {
+            quant_row[x] = QuantizeWP(quant_row, stride, c, x, y, r.xsize(),
+                                      &wp_state, row[x], inv_factor);
+            wp_state.UpdateErrors(quant_row[x], x, y, r.xsize());
+          }
+        } else {
+          int32_t* quant_row_y =
+              stream_images_[stream_id].channel[0].plane.Row(y);
+          for (size_t x = 0; x < r.xsize(); x++) {
+            quant_row[x] = QuantizeWP(
+                quant_row, stride, c, x, y, r.xsize(), &wp_state,
+                row[x] - quant_row_y[x] * (y_factor * cfl_factor), inv_factor);
+            wp_state.UpdateErrors(quant_row[x], x, y, r.xsize());
+          }
+        }
+      }
+    }
+  } else if (enc_state->shared.frame_header.chroma_subsampling.Is444()) {
+    for (size_t c : {1, 0, 2}) {
+      float inv_factor = enc_state->shared.quantizer.GetInvDcStep(c) * mul;
+      float y_factor = enc_state->shared.quantizer.GetDcStep(1) / mul;
+      float cfl_factor = enc_state->shared.cmap.DCFactors()[c];
+      for (size_t y = 0; y < r.ysize(); y++) {
+        int32_t* quant_row =
+            stream_images_[stream_id].channel[c < 2 ? c ^ 1 : c].plane.Row(y);
+        const float* row = r.ConstPlaneRow(dc, c, y);
+        if (c == 1) {
+          for (size_t x = 0; x < r.xsize(); x++) {
+            quant_row[x] = roundf(row[x] * inv_factor);
+          }
+        } else {
+          int32_t* quant_row_y =
+              stream_images_[stream_id].channel[0].plane.Row(y);
+          for (size_t x = 0; x < r.xsize(); x++) {
+            quant_row[x] =
+                roundf((row[x] - quant_row_y[x] * (y_factor * cfl_factor)) *
+                       inv_factor);
+          }
+        }
+      }
+    }
+  } else {
+    for (size_t c : {1, 0, 2}) {
+      Rect rect(
+          r.x0() >> enc_state->shared.frame_header.chroma_subsampling.HShift(c),
+          r.y0() >> enc_state->shared.frame_header.chroma_subsampling.VShift(c),
+          r.xsize() >>
+              enc_state->shared.frame_header.chroma_subsampling.HShift(c),
+          r.ysize() >>
+              enc_state->shared.frame_header.chroma_subsampling.VShift(c));
+      float inv_factor = enc_state->shared.quantizer.GetInvDcStep(c) * mul;
+      size_t ys = rect.ysize();
+      size_t xs = rect.xsize();
+      Channel& ch = stream_images_[stream_id].channel[c < 2 ? c ^ 1 : c];
+      ch.w = xs;
+      ch.h = ys;
+      ch.shrink();
+      for (size_t y = 0; y < ys; y++) {
+        int32_t* quant_row = ch.plane.Row(y);
+        const float* row = rect.ConstPlaneRow(dc, c, y);
+        for (size_t x = 0; x < xs; x++) {
+          quant_row[x] = roundf(row[x] * inv_factor);
+        }
+      }
+    }
+  }
+
+  DequantDC(r, &enc_state->shared.dc_storage, &enc_state->shared.quant_dc,
+            stream_images_[stream_id], enc_state->shared.quantizer.MulDC(),
+            1.0 / mul, enc_state->shared.cmap.DCFactors(),
+            enc_state->shared.frame_header.chroma_subsampling,
+            enc_state->shared.block_ctx_map);
+}
+
+void ModularFrameEncoder::AddACMetadata(size_t group_index, bool jpeg_transcode,
+                                        PassesEncoderState* enc_state) {
+  const Rect r = enc_state->shared.DCGroupRect(group_index);
+  size_t stream_id = ModularStreamId::ACMetadata(group_index).ID(frame_dim_);
+  stream_options_[stream_id].max_chan_size = 0xFFFFFF;
+  stream_options_[stream_id].wp_tree_mode = ModularOptions::TreeMode::kNoWP;
+  if (jpeg_transcode) {
+    stream_options_[stream_id].tree_kind =
+        ModularOptions::TreeKind::kJpegTranscodeACMeta;
+  } else if (cparams_.speed_tier >= SpeedTier::kFalcon) {
+    stream_options_[stream_id].tree_kind =
+        ModularOptions::TreeKind::kFalconACMeta;
+  } else if (cparams_.speed_tier > SpeedTier::kKitten) {
+    stream_options_[stream_id].tree_kind = ModularOptions::TreeKind::kACMeta;
+  }
+  // If we are using a non-constant CfL field, and are in a slow enough mode,
+  // re-enable tree computation for it.
+  if (cparams_.speed_tier < SpeedTier::kSquirrel &&
+      cparams_.force_cfl_jpeg_recompression) {
+    stream_options_[stream_id].tree_kind = ModularOptions::TreeKind::kLearn;
+  }
+  // YToX, YToB, ACS + QF, EPF
+  Image& image = stream_images_[stream_id];
+  image = Image(r.xsize(), r.ysize(), 8, 4);
+  static_assert(kColorTileDimInBlocks == 8, "Color tile size changed");
+  Rect cr(r.x0() >> 3, r.y0() >> 3, (r.xsize() + 7) >> 3, (r.ysize() + 7) >> 3);
+  image.channel[0] = Channel(cr.xsize(), cr.ysize(), 3, 3);
+  image.channel[1] = Channel(cr.xsize(), cr.ysize(), 3, 3);
+  image.channel[2] = Channel(r.xsize() * r.ysize(), 2, 0, 0);
+  ConvertPlaneAndClamp(cr, enc_state->shared.cmap.ytox_map,
+                       Rect(image.channel[0].plane), &image.channel[0].plane);
+  ConvertPlaneAndClamp(cr, enc_state->shared.cmap.ytob_map,
+                       Rect(image.channel[1].plane), &image.channel[1].plane);
+  size_t num = 0;
+  for (size_t y = 0; y < r.ysize(); y++) {
+    AcStrategyRow row_acs = enc_state->shared.ac_strategy.ConstRow(r, y);
+    const int32_t* row_qf = r.ConstRow(enc_state->shared.raw_quant_field, y);
+    const uint8_t* row_epf = r.ConstRow(enc_state->shared.epf_sharpness, y);
+    int32_t* out_acs = image.channel[2].plane.Row(0);
+    int32_t* out_qf = image.channel[2].plane.Row(1);
+    int32_t* row_out_epf = image.channel[3].plane.Row(y);
+    for (size_t x = 0; x < r.xsize(); x++) {
+      row_out_epf[x] = row_epf[x];
+      if (!row_acs[x].IsFirstBlock()) continue;
+      out_acs[num] = row_acs[x].RawStrategy();
+      out_qf[num] = row_qf[x] - 1;
+      num++;
+    }
+  }
+  image.channel[2].w = num;
+  ac_metadata_size[group_index] = num;
+}
+
+void ModularFrameEncoder::EncodeQuantTable(
+    size_t size_x, size_t size_y, BitWriter* writer,
+    const QuantEncoding& encoding, size_t idx,
+    ModularFrameEncoder* modular_frame_encoder) {
+  JXL_ASSERT(encoding.qraw.qtable != nullptr);
+  JXL_ASSERT(size_x * size_y * 3 == encoding.qraw.qtable->size());
+  JXL_CHECK(F16Coder::Write(encoding.qraw.qtable_den, writer));
+  if (modular_frame_encoder) {
+    JXL_CHECK(modular_frame_encoder->EncodeStream(
+        writer, nullptr, 0, ModularStreamId::QuantTable(idx)));
+    return;
+  }
+  Image image(size_x, size_y, 8, 3);
+  for (size_t c = 0; c < 3; c++) {
+    for (size_t y = 0; y < size_y; y++) {
+      int32_t* JXL_RESTRICT row = image.channel[c].Row(y);
+      for (size_t x = 0; x < size_x; x++) {
+        row[x] = (*encoding.qraw.qtable)[c * size_x * size_y + y * size_x + x];
+      }
+    }
+  }
+  ModularOptions cfopts;
+  JXL_CHECK(ModularGenericCompress(image, cfopts, writer));
+}
+
+void ModularFrameEncoder::AddQuantTable(size_t size_x, size_t size_y,
+                                        const QuantEncoding& encoding,
+                                        size_t idx) {
+  size_t stream_id = ModularStreamId::QuantTable(idx).ID(frame_dim_);
+  JXL_ASSERT(encoding.qraw.qtable != nullptr);
+  JXL_ASSERT(size_x * size_y * 3 == encoding.qraw.qtable->size());
+  Image& image = stream_images_[stream_id];
+  image = Image(size_x, size_y, 8, 3);
+  for (size_t c = 0; c < 3; c++) {
+    for (size_t y = 0; y < size_y; y++) {
+      int32_t* JXL_RESTRICT row = image.channel[c].Row(y);
+      for (size_t x = 0; x < size_x; x++) {
+        row[x] = (*encoding.qraw.qtable)[c * size_x * size_y + y * size_x + x];
+      }
+    }
+  }
+}
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/enc_modular.h b/third_party/jpeg-xl/lib/jxl/enc_modular.h
new file mode 100644
index 0000000000..2af66e951f
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_modular.h
@@ -0,0 +1,92 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_MODULAR_H_
+#define LIB_JXL_ENC_MODULAR_H_
+
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/dec_modular.h"
+#include "lib/jxl/enc_bit_writer.h"
+#include "lib/jxl/enc_cache.h"
+#include "lib/jxl/enc_params.h"
+#include "lib/jxl/frame_header.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/image_bundle.h"
+#include "lib/jxl/modular/encoding/encoding.h"
+#include "lib/jxl/modular/modular_image.h"
+
+namespace jxl {
+
+struct AuxOut;
+
+class ModularFrameEncoder {
+ public:
+  ModularFrameEncoder(const FrameHeader& frame_header,
+                      const CompressParams& cparams_orig);
+  Status ComputeEncodingData(const FrameHeader& frame_header,
+                             const ImageMetadata& metadata,
+                             Image3F* JXL_RESTRICT color,
+                             const std::vector<ImageF>& extra_channels,
+                             PassesEncoderState* JXL_RESTRICT enc_state,
+                             const JxlCmsInterface& cms, ThreadPool* pool,
+                             AuxOut* aux_out, bool do_color);
+  // Encodes global info (tree + histograms) in the `writer`.
+  Status EncodeGlobalInfo(BitWriter* writer, AuxOut* aux_out);
+  // Encodes a specific modular image (identified by `stream`) in the `writer`,
+  // assigning bits to the provided `layer`.
+  Status EncodeStream(BitWriter* writer, AuxOut* aux_out, size_t layer,
+                      const ModularStreamId& stream);
+  // Creates a modular image for a given DC group of VarDCT mode. `dc` is the
+  // input DC image, not quantized; the group is specified by `group_index`, and
+  // `nl_dc` decides whether to apply a near-lossless processing to the DC or
+  // not.
+  void AddVarDCTDC(const Image3F& dc, size_t group_index, bool nl_dc,
+                   PassesEncoderState* enc_state, bool jpeg_transcode);
+  // Creates a modular image for the AC metadata of the given group
+  // (`group_index`).
+  void AddACMetadata(size_t group_index, bool jpeg_transcode,
+                     PassesEncoderState* enc_state);
+  // Encodes a RAW quantization table in `writer`. If `modular_frame_encoder` is
+  // null, the quantization table in `encoding` is used, with dimensions `size_x
+  // x size_y`. Otherwise, the table with ID `idx` is encoded from the given
+  // `modular_frame_encoder`.
+  static void EncodeQuantTable(size_t size_x, size_t size_y, BitWriter* writer,
+                               const QuantEncoding& encoding, size_t idx,
+                               ModularFrameEncoder* modular_frame_encoder);
+  // Stores a quantization table for future usage with `EncodeQuantTable`.
+  void AddQuantTable(size_t size_x, size_t size_y,
+                     const QuantEncoding& encoding, size_t idx);
+
+  std::vector<size_t> ac_metadata_size;
+  std::vector<uint8_t> extra_dc_precision;
+
+ private:
+  Status PrepareEncoding(const FrameHeader& frame_header, ThreadPool* pool,
+                         EncoderHeuristics* heuristics,
+                         AuxOut* aux_out = nullptr);
+  Status PrepareStreamParams(const Rect& rect, const CompressParams& cparams,
+                             int minShift, int maxShift,
+                             const ModularStreamId& stream, bool do_color);
+  std::vector<Image> stream_images_;
+  std::vector<ModularOptions> stream_options_;
+
+  Tree tree_;
+  std::vector<std::vector<Token>> tree_tokens_;
+  std::vector<GroupHeader> stream_headers_;
+  std::vector<std::vector<Token>> tokens_;
+  EntropyEncodingData code_;
+  std::vector<uint8_t> context_map_;
+  FrameDimensions frame_dim_;
+  CompressParams cparams_;
+  std::vector<size_t> tree_splits_;
+  std::vector<ModularMultiplierInfo> multiplier_info_;
+  std::vector<std::vector<uint32_t>> gi_channel_;
+  std::vector<size_t> image_widths_;
+  Predictor delta_pred_ = Predictor::Average4;
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_MODULAR_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_noise.cc b/third_party/jpeg-xl/lib/jxl/enc_noise.cc
new file mode 100644
index 0000000000..54bb4482e8
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_noise.cc
@@ -0,0 +1,374 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_noise.h"
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <algorithm>
+#include <numeric>
+#include <utility>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/chroma_from_luma.h"
+#include "lib/jxl/convolve.h"
+#include "lib/jxl/enc_aux_out.h"
+#include "lib/jxl/enc_optimize.h"
+#include "lib/jxl/image_ops.h"
+#include "lib/jxl/opsin_params.h"
+
+namespace jxl {
+namespace {
+
+using OptimizeArray = optimize::Array<double, NoiseParams::kNumNoisePoints>;
+
+float GetScoreSumsOfAbsoluteDifferences(const Image3F& opsin, const int x,
+                                        const int y, const int block_size) {
+  const int small_bl_size_x = 3;
+  const int small_bl_size_y = 4;
+  const int kNumSAD =
+      (block_size - small_bl_size_x) * (block_size - small_bl_size_y);
+  // block_size x block_size reference pixels
+  int counter = 0;
+  const int offset = 2;
+
+  std::vector<float> sad(kNumSAD, 0);
+  for (int y_bl = 0; y_bl + small_bl_size_y < block_size; ++y_bl) {
+    for (int x_bl = 0; x_bl + small_bl_size_x < block_size; ++x_bl) {
+      float sad_sum = 0;
+      // size of the center patch, we compare all the patches inside window with
+      // the center one
+      for (int cy = 0; cy < small_bl_size_y; ++cy) {
+        for (int cx = 0; cx < small_bl_size_x; ++cx) {
+          float wnd = 0.5f * (opsin.PlaneRow(1, y + y_bl + cy)[x + x_bl + cx] +
+                              opsin.PlaneRow(0, y + y_bl + cy)[x + x_bl + cx]);
+          float center =
+              0.5f * (opsin.PlaneRow(1, y + offset + cy)[x + offset + cx] +
+                      opsin.PlaneRow(0, y + offset + cy)[x + offset + cx]);
+          sad_sum += std::abs(center - wnd);
+        }
+      }
+      sad[counter++] = sad_sum;
+    }
+  }
+  const int kSamples = (kNumSAD) / 2;
+  // As with ROAD (rank order absolute distance), we keep the smallest half of
+  // the values in SAD (we use here the more robust patch SAD instead of
+  // absolute single-pixel differences).
+  std::sort(sad.begin(), sad.end());
+  const float total_sad_sum =
+      std::accumulate(sad.begin(), sad.begin() + kSamples, 0.0f);
+  return total_sad_sum / kSamples;
+}
+
+class NoiseHistogram {
+ public:
+  static constexpr int kBins = 256;
+
+  NoiseHistogram() { std::fill(bins, bins + kBins, 0); }
+
+  void Increment(const float x) { bins[Index(x)] += 1; }
+  int Get(const float x) const { return bins[Index(x)]; }
+  int Bin(const size_t bin) const { return bins[bin]; }
+
+  int Mode() const {
+    size_t max_idx = 0;
+    for (size_t i = 0; i < kBins; i++) {
+      if (bins[i] > bins[max_idx]) max_idx = i;
+    }
+    return max_idx;
+  }
+
+  double Quantile(double q01) const {
+    const int64_t total = std::accumulate(bins, bins + kBins, int64_t{1});
+    const int64_t target = static_cast<int64_t>(q01 * total);
+    // Until sum >= target:
+    int64_t sum = 0;
+    size_t i = 0;
+    for (; i < kBins; ++i) {
+      sum += bins[i];
+      // Exact match: assume middle of bin i
+      if (sum == target) {
+        return i + 0.5;
+      }
+      if (sum > target) break;
+    }
+
+    // Next non-empty bin (in case histogram is sparsely filled)
+    size_t next = i + 1;
+    while (next < kBins && bins[next] == 0) {
+      ++next;
+    }
+
+    // Linear interpolation according to how far into next we went
+    const double excess = target - sum;
+    const double weight_next = bins[Index(next)] / excess;
+    return ClampX(next * weight_next + i * (1.0 - weight_next));
+  }
+
+  // Inter-quartile range
+  double IQR() const { return Quantile(0.75) - Quantile(0.25); }
+
+ private:
+  template <typename T>
+  T ClampX(const T x) const {
+    return std::min(std::max(T(0), x), T(kBins - 1));
+  }
+  size_t Index(const float x) const { return ClampX(static_cast<int>(x)); }
+
+  uint32_t bins[kBins];
+};
+
+std::vector<float> GetSADScoresForPatches(const Image3F& opsin,
+                                          const size_t block_s,
+                                          const size_t num_bin,
+                                          NoiseHistogram* sad_histogram) {
+  std::vector<float> sad_scores(
+      (opsin.ysize() / block_s) * (opsin.xsize() / block_s), 0.0f);
+
+  int block_index = 0;
+
+  for (size_t y = 0; y + block_s <= opsin.ysize(); y += block_s) {
+    for (size_t x = 0; x + block_s <= opsin.xsize(); x += block_s) {
+      float sad_sc = GetScoreSumsOfAbsoluteDifferences(opsin, x, y, block_s);
+      sad_scores[block_index++] = sad_sc;
+      sad_histogram->Increment(sad_sc * num_bin);
+    }
+  }
+  return sad_scores;
+}
+
+float GetSADThreshold(const NoiseHistogram& histogram, const int num_bin) {
+  // Here we assume that the most patches with similar SAD value is a "flat"
+  // patches. However, some images might contain regular texture part and
+  // generate second strong peak at the histogram
+  // TODO(user) handle bimodal and heavy-tailed case
+  const int mode = histogram.Mode();
+  return static_cast<float>(mode) / NoiseHistogram::kBins;
+}
+
+// loss = sum asym * (F(x) - nl)^2 + kReg * num_points * sum (w[i] - w[i+1])^2
+// where asym = 1 if F(x) < nl, kAsym if F(x) > nl.
+struct LossFunction {
+  explicit LossFunction(std::vector<NoiseLevel> nl0) : nl(std::move(nl0)) {}
+
+  double Compute(const OptimizeArray& w, OptimizeArray* df,
+                 bool skip_regularization = false) const {
+    constexpr double kReg = 0.005;
+    constexpr double kAsym = 1.1;
+    double loss_function = 0;
+    for (size_t i = 0; i < w.size(); i++) {
+      (*df)[i] = 0;
+    }
+    for (auto ind : nl) {
+      std::pair<int, float> pos = IndexAndFrac(ind.intensity);
+      JXL_DASSERT(pos.first >= 0 && static_cast<size_t>(pos.first) <
+                                        NoiseParams::kNumNoisePoints - 1);
+      double low = w[pos.first];
+      double hi = w[pos.first + 1];
+      double val = low * (1.0f - pos.second) + hi * pos.second;
+      double dist = val - ind.noise_level;
+      if (dist > 0) {
+        loss_function += kAsym * dist * dist;
+        (*df)[pos.first] -= kAsym * (1.0f - pos.second) * dist;
+        (*df)[pos.first + 1] -= kAsym * pos.second * dist;
+      } else {
+        loss_function += dist * dist;
+        (*df)[pos.first] -= (1.0f - pos.second) * dist;
+        (*df)[pos.first + 1] -= pos.second * dist;
+      }
+    }
+    if (skip_regularization) return loss_function;
+    for (size_t i = 0; i + 1 < w.size(); i++) {
+      double diff = w[i] - w[i + 1];
+      loss_function += kReg * nl.size() * diff * diff;
+      (*df)[i] -= kReg * diff * nl.size();
+      (*df)[i + 1] += kReg * diff * nl.size();
+    }
+    return loss_function;
+  }
+
+  std::vector<NoiseLevel> nl;
+};
+
+void OptimizeNoiseParameters(const std::vector<NoiseLevel>& noise_level,
+                             NoiseParams* noise_params) {
+  constexpr double kMaxError = 1e-3;
+  static const double kPrecision = 1e-8;
+  static const int kMaxIter = 40;
+
+  float avg = 0;
+  for (const NoiseLevel& nl : noise_level) {
+    avg += nl.noise_level;
+  }
+  avg /= noise_level.size();
+
+  LossFunction loss_function(noise_level);
+  OptimizeArray parameter_vector;
+  for (size_t i = 0; i < parameter_vector.size(); i++) {
+    parameter_vector[i] = avg;
+  }
+
+  parameter_vector = optimize::OptimizeWithScaledConjugateGradientMethod(
+      loss_function, parameter_vector, kPrecision, kMaxIter);
+
+  OptimizeArray df = parameter_vector;
+  float loss = loss_function.Compute(parameter_vector, &df,
+                                     /*skip_regularization=*/true) /
+               noise_level.size();
+
+  // Approximation went too badly: escape with no noise at all.
+  if (loss > kMaxError) {
+    noise_params->Clear();
+    return;
+  }
+
+  for (size_t i = 0; i < parameter_vector.size(); i++) {
+    noise_params->lut[i] = std::max(parameter_vector[i], 0.0);
+  }
+}
+
+std::vector<NoiseLevel> GetNoiseLevel(
+    const Image3F& opsin, const std::vector<float>& texture_strength,
+    const float threshold, const size_t block_s) {
+  std::vector<NoiseLevel> noise_level_per_intensity;
+
+  const int filt_size = 1;
+  static const float kLaplFilter[filt_size * 2 + 1][filt_size * 2 + 1] = {
+      {-0.25f, -1.0f, -0.25f},
+      {-1.0f, 5.0f, -1.0f},
+      {-0.25f, -1.0f, -0.25f},
+  };
+
+  // The noise model is built based on channel 0.5 * (X+Y) as we notice that it
+  // is similar to the model 0.5 * (Y-X)
+  size_t patch_index = 0;
+
+  for (size_t y = 0; y + block_s <= opsin.ysize(); y += block_s) {
+    for (size_t x = 0; x + block_s <= opsin.xsize(); x += block_s) {
+      if (texture_strength[patch_index] <= threshold) {
+        // Calculate mean value
+        float mean_int = 0;
+        for (size_t y_bl = 0; y_bl < block_s; ++y_bl) {
+          for (size_t x_bl = 0; x_bl < block_s; ++x_bl) {
+            mean_int += 0.5f * (opsin.PlaneRow(1, y + y_bl)[x + x_bl] +
+                                opsin.PlaneRow(0, y + y_bl)[x + x_bl]);
+          }
+        }
+        mean_int /= block_s * block_s;
+
+        // Calculate Noise level
+        float noise_level = 0;
+        size_t count = 0;
+        for (size_t y_bl = 0; y_bl < block_s; ++y_bl) {
+          for (size_t x_bl = 0; x_bl < block_s; ++x_bl) {
+            float filtered_value = 0;
+            for (int y_f = -1 * filt_size; y_f <= filt_size; ++y_f) {
+              if ((static_cast<ssize_t>(y_bl) + y_f) >= 0 &&
+                  (y_bl + y_f) < block_s) {
+                for (int x_f = -1 * filt_size; x_f <= filt_size; ++x_f) {
+                  if ((static_cast<ssize_t>(x_bl) + x_f) >= 0 &&
+                      (x_bl + x_f) < block_s) {
+                    filtered_value +=
+                        0.5f *
+                        (opsin.PlaneRow(1, y + y_bl + y_f)[x + x_bl + x_f] +
+                         opsin.PlaneRow(0, y + y_bl + y_f)[x + x_bl + x_f]) *
+                        kLaplFilter[y_f + filt_size][x_f + filt_size];
+                  } else {
+                    filtered_value +=
+                        0.5f *
+                        (opsin.PlaneRow(1, y + y_bl + y_f)[x + x_bl - x_f] +
+                         opsin.PlaneRow(0, y + y_bl + y_f)[x + x_bl - x_f]) *
+                        kLaplFilter[y_f + filt_size][x_f + filt_size];
+                  }
+                }
+              } else {
+                for (int x_f = -1 * filt_size; x_f <= filt_size; ++x_f) {
+                  if ((static_cast<ssize_t>(x_bl) + x_f) >= 0 &&
+                      (x_bl + x_f) < block_s) {
+                    filtered_value +=
+                        0.5f *
+                        (opsin.PlaneRow(1, y + y_bl - y_f)[x + x_bl + x_f] +
+                         opsin.PlaneRow(0, y + y_bl - y_f)[x + x_bl + x_f]) *
+                        kLaplFilter[y_f + filt_size][x_f + filt_size];
+                  } else {
+                    filtered_value +=
+                        0.5f *
+                        (opsin.PlaneRow(1, y + y_bl - y_f)[x + x_bl - x_f] +
+                         opsin.PlaneRow(0, y + y_bl - y_f)[x + x_bl - x_f]) *
+                        kLaplFilter[y_f + filt_size][x_f + filt_size];
+                  }
+                }
+              }
+            }
+            noise_level += std::abs(filtered_value);
+            ++count;
+          }
+        }
+        noise_level /= count;
+        NoiseLevel nl;
+        nl.intensity = mean_int;
+        nl.noise_level = noise_level;
+        noise_level_per_intensity.push_back(nl);
+      }
+      ++patch_index;
+    }
+  }
+  return noise_level_per_intensity;
+}
+
+void EncodeFloatParam(float val, float precision, BitWriter* writer) {
+  JXL_ASSERT(val >= 0);
+  const int absval_quant = static_cast<int>(val * precision + 0.5f);
+  JXL_ASSERT(absval_quant < (1 << 10));
+  writer->Write(10, absval_quant);
+}
+
+}  // namespace
+
+Status GetNoiseParameter(const Image3F& opsin, NoiseParams* noise_params,
+                         float quality_coef) {
+  // The size of a patch in decoder might be different from encoder's patch
+  // size.
+  // For encoder: the patch size should be big enough to estimate
+  //              noise level, but, at the same time, it should be not too big
+  //              to be able to estimate intensity value of the patch
+  const size_t block_s = 8;
+  const size_t kNumBin = 256;
+  NoiseHistogram sad_histogram;
+  std::vector<float> sad_scores =
+      GetSADScoresForPatches(opsin, block_s, kNumBin, &sad_histogram);
+  float sad_threshold = GetSADThreshold(sad_histogram, kNumBin);
+  // If threshold is too large, the image has a strong pattern. This pattern
+  // fools our model and it will add too much noise. Therefore, we do not add
+  // noise for such images
+  if (sad_threshold > 0.15f || sad_threshold <= 0.0f) {
+    noise_params->Clear();
+    return false;
+  }
+  std::vector<NoiseLevel> nl =
+      GetNoiseLevel(opsin, sad_scores, sad_threshold, block_s);
+
+  OptimizeNoiseParameters(nl, noise_params);
+  for (float& i : noise_params->lut) {
+    i *= quality_coef * 1.4;
+  }
+  return noise_params->HasAny();
+}
+
+void EncodeNoise(const NoiseParams& noise_params, BitWriter* writer,
+                 size_t layer, AuxOut* aux_out) {
+  JXL_ASSERT(noise_params.HasAny());
+
+  BitWriter::Allotment allotment(writer, NoiseParams::kNumNoisePoints * 16);
+  for (float i : noise_params.lut) {
+    EncodeFloatParam(i, kNoisePrecision, writer);
+  }
+  allotment.ReclaimAndCharge(writer, layer, aux_out);
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/enc_noise.h b/third_party/jpeg-xl/lib/jxl/enc_noise.h
new file mode 100644
index 0000000000..851fdd12db
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_noise.h
@@ -0,0 +1,34 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_NOISE_H_
+#define LIB_JXL_ENC_NOISE_H_
+
+// Noise parameter estimation.
+
+#include <stddef.h>
+
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/enc_bit_writer.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/noise.h"
+
+namespace jxl {
+
+struct AuxOut;
+
+// Get parameters of the noise for NoiseParams model
+// Returns whether a valid noise model (with HasAny()) is set.
+Status GetNoiseParameter(const Image3F& opsin, NoiseParams* noise_params,
+                         float quality_coef);
+
+// Does not write anything if `noise_params` are empty. Otherwise, caller must
+// set FrameHeader.flags.kNoise.
+void EncodeNoise(const NoiseParams& noise_params, BitWriter* writer,
+                 size_t layer, AuxOut* aux_out);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_NOISE_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_optimize.cc b/third_party/jpeg-xl/lib/jxl/enc_optimize.cc
new file mode 100644
index 0000000000..6865ff67df
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_optimize.cc
@@ -0,0 +1,163 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_optimize.h"
+
+#include <algorithm>
+
+#include "lib/jxl/base/status.h"
+
+namespace jxl {
+
+namespace optimize {
+
+namespace {
+
+// simplex vector must be sorted by first element of its elements
+std::vector<double> Midpoint(const std::vector<std::vector<double>>& simplex) {
+  JXL_CHECK(!simplex.empty());
+  JXL_CHECK(simplex.size() == simplex[0].size());
+  int dim = simplex.size() - 1;
+  std::vector<double> result(dim + 1, 0);
+  for (int i = 0; i < dim; i++) {
+    for (int k = 0; k < dim; k++) {
+      result[i + 1] += simplex[k][i + 1];
+    }
+    result[i + 1] /= dim;
+  }
+  return result;
+}
+
+// first element ignored
+std::vector<double> Subtract(const std::vector<double>& a,
+                             const std::vector<double>& b) {
+  JXL_CHECK(a.size() == b.size());
+  std::vector<double> result(a.size());
+  result[0] = 0;
+  for (size_t i = 1; i < result.size(); i++) {
+    result[i] = a[i] - b[i];
+  }
+  return result;
+}
+
+// first element ignored
+std::vector<double> Add(const std::vector<double>& a,
+                        const std::vector<double>& b) {
+  JXL_CHECK(a.size() == b.size());
+  std::vector<double> result(a.size());
+  result[0] = 0;
+  for (size_t i = 1; i < result.size(); i++) {
+    result[i] = a[i] + b[i];
+  }
+  return result;
+}
+
+// first element ignored
+std::vector<double> Average(const std::vector<double>& a,
+                            const std::vector<double>& b) {
+  JXL_CHECK(a.size() == b.size());
+  std::vector<double> result(a.size());
+  result[0] = 0;
+  for (size_t i = 1; i < result.size(); i++) {
+    result[i] = 0.5 * (a[i] + b[i]);
+  }
+  return result;
+}
+
+// vec: [0] will contain the objective function, [1:] will
+//   contain the vector position for the objective function.
+// fun: the function evaluates the value.
+void Eval(std::vector<double>* vec,
+          const std::function<double(const std::vector<double>&)>& fun) {
+  std::vector<double> args(vec->begin() + 1, vec->end());
+  (*vec)[0] = fun(args);
+}
+
+void Sort(std::vector<std::vector<double>>* simplex) {
+  std::sort(simplex->begin(), simplex->end());
+}
+
+// Main iteration step of Nelder-Mead like optimization.
+void Reflect(std::vector<std::vector<double>>* simplex,
+             const std::function<double(const std::vector<double>&)>& fun) {
+  Sort(simplex);
+  const std::vector<double>& last = simplex->back();
+  std::vector<double> mid = Midpoint(*simplex);
+  std::vector<double> diff = Subtract(mid, last);
+  std::vector<double> mirrored = Add(mid, diff);
+  Eval(&mirrored, fun);
+  if (mirrored[0] > (*simplex)[simplex->size() - 2][0]) {
+    // Still the worst, shrink towards the best.
+    std::vector<double> shrinking = Average(simplex->back(), (*simplex)[0]);
+    Eval(&shrinking, fun);
+    simplex->back() = shrinking;
+  } else if (mirrored[0] < (*simplex)[0][0]) {
+    // new best
+    std::vector<double> even_further = Add(mirrored, diff);
+    Eval(&even_further, fun);
+    if (even_further[0] < mirrored[0]) {
+      mirrored = even_further;
+    }
+    simplex->back() = mirrored;
+  } else {
+    // not a best, not a worst point
+    simplex->back() = mirrored;
+  }
+}
+
+// Initialize the simplex at origin.
+std::vector<std::vector<double>> InitialSimplex(
+    int dim, double amount, const std::vector<double>& init,
+    const std::function<double(const std::vector<double>&)>& fun) {
+  std::vector<double> best(1 + dim, 0);
+  std::copy(init.begin(), init.end(), best.begin() + 1);
+  Eval(&best, fun);
+  std::vector<std::vector<double>> result{best};
+  for (int i = 0; i < dim; i++) {
+    best = result[0];
+    best[i + 1] += amount;
+    Eval(&best, fun);
+    result.push_back(best);
+    Sort(&result);
+  }
+  return result;
+}
+
+// For comparing the same with the python tool
+/*void RunSimplexExternal(
+    int dim, double amount, int max_iterations,
+    const std::function<double((const vector<double>&))>& fun) {
+  vector<double> vars;
+  for (int i = 0; i < dim; i++) {
+    vars.push_back(atof(getenv(StrCat("VAR", i).c_str())));
+  }
+  double result = fun(vars);
+  std::cout << "Result=" << result;
+}*/
+
+}  // namespace
+
+std::vector<double> RunSimplex(
+    int dim, double amount, int max_iterations, const std::vector<double>& init,
+    const std::function<double(const std::vector<double>&)>& fun) {
+  std::vector<std::vector<double>> simplex =
+      InitialSimplex(dim, amount, init, fun);
+  for (int i = 0; i < max_iterations; i++) {
+    Sort(&simplex);
+    Reflect(&simplex, fun);
+  }
+  return simplex[0];
+}
+
+std::vector<double> RunSimplex(
+    int dim, double amount, int max_iterations,
+    const std::function<double(const std::vector<double>&)>& fun) {
+  std::vector<double> init(dim, 0.0);
+  return RunSimplex(dim, amount, max_iterations, init, fun);
+}
+
+}  // namespace optimize
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/enc_optimize.h b/third_party/jpeg-xl/lib/jxl/enc_optimize.h
new file mode 100644
index 0000000000..0a60198214
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_optimize.h
@@ -0,0 +1,218 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Utility functions for optimizing multi-dimensional nonlinear functions.
+
+#ifndef LIB_JXL_OPTIMIZE_H_
+#define LIB_JXL_OPTIMIZE_H_
+
+#include <stdio.h>
+
+#include <cmath>
+#include <cstdio>
+#include <functional>
+#include <vector>
+
+#include "lib/jxl/base/status.h"
+
+namespace jxl {
+namespace optimize {
+
+// An array type of numeric values that supports math operations with operator-,
+// operator+, etc.
+template <typename T, size_t N>
+class Array {
+ public:
+  Array() = default;
+  explicit Array(T v) {
+    for (size_t i = 0; i < N; i++) v_[i] = v;
+  }
+
+  size_t size() const { return N; }
+
+  T& operator[](size_t index) {
+    JXL_DASSERT(index < N);
+    return v_[index];
+  }
+  T operator[](size_t index) const {
+    JXL_DASSERT(index < N);
+    return v_[index];
+  }
+
+ private:
+  // The values used by this Array.
+  T v_[N];
+};
+
+template <typename T, size_t N>
+Array<T, N> operator+(const Array<T, N>& x, const Array<T, N>& y) {
+  Array<T, N> z;
+  for (size_t i = 0; i < N; ++i) {
+    z[i] = x[i] + y[i];
+  }
+  return z;
+}
+
+template <typename T, size_t N>
+Array<T, N> operator-(const Array<T, N>& x, const Array<T, N>& y) {
+  Array<T, N> z;
+  for (size_t i = 0; i < N; ++i) {
+    z[i] = x[i] - y[i];
+  }
+  return z;
+}
+
+template <typename T, size_t N>
+Array<T, N> operator*(T v, const Array<T, N>& x) {
+  Array<T, N> y;
+  for (size_t i = 0; i < N; ++i) {
+    y[i] = v * x[i];
+  }
+  return y;
+}
+
+template <typename T, size_t N>
+T operator*(const Array<T, N>& x, const Array<T, N>& y) {
+  T r = 0.0;
+  for (size_t i = 0; i < N; ++i) {
+    r += x[i] * y[i];
+  }
+  return r;
+}
+
+// Runs Nelder-Mead like optimization. Runs for max_iterations times,
+// fun gets called with a vector of size dim as argument, and returns the score
+// based on those parameters (lower is better). Returns a vector of dim+1
+// dimensions, where the first value is the optimal value of the function and
+// the rest is the argmin value. Use init to pass an initial guess or where
+// the optimal value is.
+//
+// Usage example:
+//
+// RunSimplex(2, 0.1, 100, [](const vector<float>& v) {
+//   return (v[0] - 5) * (v[0] - 5) + (v[1] - 7) * (v[1] - 7);
+// });
+//
+// Returns (0.0, 5, 7)
+std::vector<double> RunSimplex(
+    int dim, double amount, int max_iterations,
+    const std::function<double(const std::vector<double>&)>& fun);
+std::vector<double> RunSimplex(
+    int dim, double amount, int max_iterations, const std::vector<double>& init,
+    const std::function<double(const std::vector<double>&)>& fun);
+
+// Implementation of the Scaled Conjugate Gradient method described in the
+// following paper:
+//   Moller, M. "A Scaled Conjugate Gradient Algorithm for Fast Supervised
+//   Learning", Neural Networks, Vol. 6. pp. 525-533, 1993
+//   http://sci2s.ugr.es/keel/pdf/algorithm/articulo/moller1990.pdf
+//
+// The Function template parameter is a class that has the following method:
+//
+//   // Returns the value of the function at point w and sets *df to be the
+//   // negative gradient vector of the function at point w.
+//   double Compute(const optimize::Array<T, N>& w,
+//                  optimize::Array<T, N>* df) const;
+//
+// Returns a vector w, such that |df(w)| < grad_norm_threshold.
+template <typename T, size_t N, typename Function>
+Array<T, N> OptimizeWithScaledConjugateGradientMethod(
+    const Function& f, const Array<T, N>& w0, const T grad_norm_threshold,
+    size_t max_iters) {
+  const size_t n = w0.size();
+  const T rsq_threshold = grad_norm_threshold * grad_norm_threshold;
+  const T sigma0 = static_cast<T>(0.0001);
+  const T l_min = static_cast<T>(1.0e-15);
+  const T l_max = static_cast<T>(1.0e15);
+
+  Array<T, N> w = w0;
+  Array<T, N> wp;
+  Array<T, N> r;
+  Array<T, N> rt;
+  Array<T, N> e;
+  Array<T, N> p;
+  T psq;
+  T fp;
+  T D;
+  T d;
+  T m;
+  T a;
+  T b;
+  T s;
+  T t;
+
+  T fw = f.Compute(w, &r);
+  T rsq = r * r;
+  e = r;
+  p = r;
+  T l = static_cast<T>(1.0);
+  bool success = true;
+  size_t n_success = 0;
+  size_t k = 0;
+
+  while (k++ < max_iters) {
+    if (success) {
+      m = -(p * r);
+      if (m >= 0) {
+        p = r;
+        m = -(p * r);
+      }
+      psq = p * p;
+      s = sigma0 / std::sqrt(psq);
+      f.Compute(w + (s * p), &rt);
+      t = (p * (r - rt)) / s;
+    }
+
+    d = t + l * psq;
+    if (d <= 0) {
+      d = l * psq;
+      l = l - t / psq;
+    }
+
+    a = -m / d;
+    wp = w + a * p;
+    fp = f.Compute(wp, &rt);
+
+    D = 2.0 * (fp - fw) / (a * m);
+    if (D >= 0.0) {
+      success = true;
+      n_success++;
+      w = wp;
+    } else {
+      success = false;
+    }
+
+    if (success) {
+      e = r;
+      r = rt;
+      rsq = r * r;
+      fw = fp;
+      if (rsq <= rsq_threshold) {
+        break;
+      }
+    }
+
+    if (D < 0.25) {
+      l = std::min(4.0 * l, l_max);
+    } else if (D > 0.75) {
+      l = std::max(0.25 * l, l_min);
+    }
+
+    if ((n_success % n) == 0) {
+      p = r;
+      l = 1.0;
+    } else if (success) {
+      b = ((e - r) * r) / m;
+      p = b * p + r;
+    }
+  }
+
+  return w;
+}
+
+}  // namespace optimize
+}  // namespace jxl
+
+#endif  // LIB_JXL_OPTIMIZE_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_optimize_test.cc b/third_party/jpeg-xl/lib/jxl/enc_optimize_test.cc
new file mode 100644
index 0000000000..1c6699f99e
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_optimize_test.cc
@@ -0,0 +1,109 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_optimize.h"
+
+#include <stdio.h>
+
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+namespace optimize {
+namespace {
+
+// The maximum number of iterations for the test.
+static const size_t kMaxTestIter = 100000;
+
+// F(w) = (w - w_min)^2.
+struct SimpleQuadraticFunction {
+  typedef Array<double, 2> ArrayType;
+  explicit SimpleQuadraticFunction(const ArrayType& w0) : w_min(w0) {}
+
+  double Compute(const ArrayType& w, ArrayType* df) const {
+    ArrayType dw = w - w_min;
+    *df = -2.0 * dw;
+    return dw * dw;
+  }
+
+  ArrayType w_min;
+};
+
+// F(alpha, beta, gamma| x,y) = \sum_i(y_i - (alpha x_i ^ gamma + beta))^2.
+struct PowerFunction {
+  explicit PowerFunction(const std::vector<double>& x0,
+                         const std::vector<double>& y0)
+      : x(x0), y(y0) {}
+
+  typedef Array<double, 3> ArrayType;
+  double Compute(const ArrayType& w, ArrayType* df) const {
+    double loss_function = 0;
+    (*df)[0] = 0;
+    (*df)[1] = 0;
+    (*df)[2] = 0;
+    for (size_t ind = 0; ind < y.size(); ++ind) {
+      if (x[ind] != 0) {
+        double l_f = y[ind] - (w[0] * pow(x[ind], w[1]) + w[2]);
+        (*df)[0] += 2.0 * l_f * pow(x[ind], w[1]);
+        (*df)[1] += 2.0 * l_f * w[0] * pow(x[ind], w[1]) * log(x[ind]);
+        (*df)[2] += 2.0 * l_f * 1;
+        loss_function += l_f * l_f;
+      }
+    }
+    return loss_function;
+  }
+
+  std::vector<double> x;
+  std::vector<double> y;
+};
+
+TEST(OptimizeTest, SimpleQuadraticFunction) {
+  SimpleQuadraticFunction::ArrayType w_min;
+  w_min[0] = 1.0;
+  w_min[1] = 2.0;
+  SimpleQuadraticFunction f(w_min);
+  SimpleQuadraticFunction::ArrayType w(0.);
+  static const double kPrecision = 1e-8;
+  w = optimize::OptimizeWithScaledConjugateGradientMethod(f, w, kPrecision,
+                                                          kMaxTestIter);
+  EXPECT_NEAR(w[0], 1.0, kPrecision);
+  EXPECT_NEAR(w[1], 2.0, kPrecision);
+}
+
+TEST(OptimizeTest, PowerFunction) {
+  std::vector<double> x(10);
+  std::vector<double> y(10);
+  for (int ind = 0; ind < 10; ++ind) {
+    x[ind] = 1. * ind;
+    y[ind] = 2. * pow(x[ind], 3) + 5.;
+  }
+  PowerFunction f(x, y);
+  PowerFunction::ArrayType w(0.);
+
+  static const double kPrecision = 0.01;
+  w = optimize::OptimizeWithScaledConjugateGradientMethod(f, w, kPrecision,
+                                                          kMaxTestIter);
+  EXPECT_NEAR(w[0], 2.0, kPrecision);
+  EXPECT_NEAR(w[1], 3.0, kPrecision);
+  EXPECT_NEAR(w[2], 5.0, kPrecision);
+}
+
+TEST(OptimizeTest, SimplexOptTest) {
+  auto f = [](const std::vector<double>& x) -> double {
+    double t1 = x[0] - 1.0;
+    double t2 = x[1] + 1.5;
+    return 2.0 + t1 * t1 + t2 * t2;
+  };
+  auto opt = RunSimplex(2, 0.01, 100, f);
+  EXPECT_EQ(opt.size(), 3u);
+
+  static const double kPrecision = 0.01;
+  EXPECT_NEAR(opt[0], 2.0, kPrecision);
+  EXPECT_NEAR(opt[1], 1.0, kPrecision);
+  EXPECT_NEAR(opt[2], -1.5, kPrecision);
+}
+
+}  // namespace
+}  // namespace optimize
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/enc_params.h b/third_party/jpeg-xl/lib/jxl/enc_params.h
new file mode 100644
index 0000000000..737a951362
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_params.h
@@ -0,0 +1,225 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_PARAMS_H_
+#define LIB_JXL_ENC_PARAMS_H_
+
+// Parameters and flags that govern JXL compression.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <string>
+
+#include "lib/jxl/base/override.h"
+#include "lib/jxl/butteraugli/butteraugli.h"
+#include "lib/jxl/frame_header.h"
+#include "lib/jxl/modular/options.h"
+#include "lib/jxl/modular/transform/transform.h"
+
+namespace jxl {
+
+enum class SpeedTier {
+  // Try multiple combinations of Tortoise flags for modular mode. Otherwise
+  // like kTortoise.
+  kGlacier = 0,
+  // Turns on FindBestQuantizationHQ loop. Equivalent to "guetzli" mode.
+  kTortoise = 1,
+  // Turns on FindBestQuantization butteraugli loop.
+  kKitten = 2,
+  // Turns on dots, patches, and spline detection by default, as well as full
+  // context clustering. Default.
+  kSquirrel = 3,
+  // Turns on error diffusion and full AC strategy heuristics. Equivalent to
+  // "fast" mode.
+  kWombat = 4,
+  // Turns on gaborish by default, non-default cmap, initial quant field.
+  kHare = 5,
+  // Turns on simple heuristics for AC strategy, quant field, and clustering;
+  // also enables coefficient reordering.
+  kCheetah = 6,
+  // Turns off most encoder features. Does context clustering.
+  // Modular: uses fixed tree with Weighted predictor.
+  kFalcon = 7,
+  // Currently fastest possible setting for VarDCT.
+  // Modular: uses fixed tree with Gradient predictor.
+  kThunder = 8,
+  // VarDCT: same as kThunder.
+  // Modular: no tree, Gradient predictor, fast histograms
+  kLightning = 9
+};
+
+// NOLINTNEXTLINE(clang-analyzer-optin.performance.Padding)
+struct CompressParams {
+  float butteraugli_distance = 1.0f;
+
+  // explicit distances for extra channels (defaults to butteraugli_distance
+  // when not set; value of -1 can be used to represent 'default')
+  std::vector<float> ec_distance;
+  size_t target_size = 0;
+  float target_bitrate = 0.0f;
+
+  // 0.0 means search for the adaptive quantization map that matches the
+  // butteraugli distance, positive values mean quantize everywhere with that
+  // value.
+  float uniform_quant = 0.0f;
+  float quant_border_bias = 0.0f;
+
+  // Try to achieve a maximum pixel-by-pixel error on each channel.
+  bool max_error_mode = false;
+  float max_error[3] = {0.0, 0.0, 0.0};
+
+  SpeedTier speed_tier = SpeedTier::kSquirrel;
+  int brotli_effort = -1;
+
+  // 0 = default.
+  // 1 = slightly worse quality.
+  // 4 = fastest speed, lowest quality
+  size_t decoding_speed_tier = 0;
+
+  int max_butteraugli_iters = 4;
+
+  int max_butteraugli_iters_guetzli_mode = 100;
+
+  ColorTransform color_transform = ColorTransform::kXYB;
+  YCbCrChromaSubsampling chroma_subsampling;
+
+  // If true, the "modular mode options" members below are used.
+  bool modular_mode = false;
+
+  // Change group size in modular mode (0=128, 1=256, 2=512, 3=1024).
+  size_t modular_group_size_shift = 1;
+
+  Override preview = Override::kDefault;
+  Override noise = Override::kDefault;
+  Override dots = Override::kDefault;
+  Override patches = Override::kDefault;
+  Override gaborish = Override::kDefault;
+  int epf = -1;
+
+  // Progressive mode.
+  bool progressive_mode = false;
+
+  // Quantized-progressive mode.
+  bool qprogressive_mode = false;
+
+  // Put center groups first in the bitstream.
+  bool centerfirst = false;
+
+  // Pixel coordinates of the center. First group will contain that center.
+  size_t center_x = static_cast<size_t>(-1);
+  size_t center_y = static_cast<size_t>(-1);
+
+  int progressive_dc = -1;
+
+  // If on: preserve color of invisible pixels (if off: don't care)
+  // Default: on for lossless, off for lossy
+  Override keep_invisible = Override::kDefault;
+
+  // Currently unused as of 2020-01.
+  bool clear_metadata = false;
+
+  // Prints extra information during/after encoding.
+  bool verbose = false;
+  bool log_search_state = false;
+
+  ButteraugliParams ba_params;
+
+  // Force usage of CfL when doing JPEG recompression. This can have unexpected
+  // effects on the decoded pixels, while still being JPEG-compliant and
+  // allowing reconstruction of the original JPEG.
+  bool force_cfl_jpeg_recompression = true;
+
+  // Use brotli compression for any boxes derived from a JPEG frame.
+  bool jpeg_compress_boxes = true;
+
+  // Set the noise to what it would approximately be if shooting at the nominal
+  // exposure for a given ISO setting on a 35mm camera.
+  float photon_noise_iso = 0;
+
+  // modular mode options below
+  ModularOptions options;
+  int responsive = -1;
+  // empty for default squeeze
+  std::vector<SqueezeParams> squeezes;
+  int colorspace = -1;
+  // Use Global channel palette if #colors < this percentage of range
+  float channel_colors_pre_transform_percent = 95.f;
+  // Use Local channel palette if #colors < this percentage of range
+  float channel_colors_percent = 80.f;
+  int palette_colors = 1 << 10;  // up to 10-bit palette is probably worthwhile
+  bool lossy_palette = false;
+
+  // Returns whether these params are lossless as defined by SetLossless();
+  bool IsLossless() const { return modular_mode && ModularPartIsLossless(); }
+
+  bool ModularPartIsLossless() const {
+    if (modular_mode) {
+      // YCbCr is also considered lossless here since it's intended for
+      // source material that is already YCbCr (we don't do the fwd transform)
+      if (butteraugli_distance != 0 ||
+          color_transform == jxl::ColorTransform::kXYB)
+        return false;
+    }
+    for (float f : ec_distance) {
+      if (f > 0) return false;
+      if (f < 0 && butteraugli_distance != 0) return false;
+    }
+    // if no explicit ec_distance given, and using vardct, then the modular part
+    // is empty or not lossless
+    if (!modular_mode && ec_distance.empty()) return false;
+    // all modular channels are encoded at distance 0
+    return true;
+  }
+
+  // Sets the parameters required to make the codec lossless.
+  void SetLossless() {
+    modular_mode = true;
+    butteraugli_distance = 0.0f;
+    for (float &f : ec_distance) f = 0.0f;
+    color_transform = jxl::ColorTransform::kNone;
+  }
+
+  // Down/upsample the image before encoding / after decoding by this factor.
+  // The resampling value can also be set to <= 0 to automatically choose based
+  // on distance, however EncodeFrame doesn't support this, so it is
+  // required to call PostInit() to set a valid positive resampling
+  // value and altered butteraugli score if this is used.
+  int resampling = -1;
+  int ec_resampling = -1;
+  // Skip the downsampling before encoding if this is true.
+  bool already_downsampled = false;
+  // Butteraugli target distance on the original full size image, this can be
+  // different from butteraugli_distance if resampling was used.
+  float original_butteraugli_distance = -1.0f;
+
+  float quant_ac_rescale = 1.0;
+
+  // Codestream level to conform to.
+  // -1: don't care
+  int level = -1;
+
+  std::vector<float> manual_noise;
+  std::vector<float> manual_xyb_factors;
+};
+
+static constexpr float kMinButteraugliForDynamicAR = 0.5f;
+static constexpr float kMinButteraugliForDots = 3.0f;
+static constexpr float kMinButteraugliToSubtractOriginalPatches = 3.0f;
+
+// Always off
+static constexpr float kMinButteraugliForNoise = 99.0f;
+
+// Minimum butteraugli distance the encoder accepts.
+static constexpr float kMinButteraugliDistance = 0.001f;
+
+// Tile size for encoder-side processing. Must be equal to color tile dim in the
+// current implementation.
+static constexpr size_t kEncTileDim = 64;
+static constexpr size_t kEncTileDimInBlocks = kEncTileDim / kBlockDim;
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_PARAMS_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_patch_dictionary.cc b/third_party/jpeg-xl/lib/jxl/enc_patch_dictionary.cc
new file mode 100644
index 0000000000..157e18c3a8
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_patch_dictionary.cc
@@ -0,0 +1,813 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_patch_dictionary.h"
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <sys/types.h>
+
+#include <algorithm>
+#include <atomic>
+#include <string>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+#include "lib/jxl/ans_params.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/override.h"
+#include "lib/jxl/base/random.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/chroma_from_luma.h"
+#include "lib/jxl/color_management.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/dec_cache.h"
+#include "lib/jxl/dec_frame.h"
+#include "lib/jxl/enc_ans.h"
+#include "lib/jxl/enc_aux_out.h"
+#include "lib/jxl/enc_cache.h"
+#include "lib/jxl/enc_dot_dictionary.h"
+#include "lib/jxl/enc_frame.h"
+#include "lib/jxl/entropy_coder.h"
+#include "lib/jxl/frame_header.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/image_bundle.h"
+#include "lib/jxl/image_ops.h"
+#include "lib/jxl/patch_dictionary_internal.h"
+
+namespace jxl {
+
+static constexpr size_t kPatchFrameReferenceId = 3;
+
+// static
+void PatchDictionaryEncoder::Encode(const PatchDictionary& pdic,
+                                    BitWriter* writer, size_t layer,
+                                    AuxOut* aux_out) {
+  JXL_ASSERT(pdic.HasAny());
+  std::vector<std::vector<Token>> tokens(1);
+  size_t num_ec = pdic.shared_->metadata->m.num_extra_channels;
+
+  auto add_num = [&](int context, size_t num) {
+    tokens[0].emplace_back(context, num);
+  };
+  size_t num_ref_patch = 0;
+  for (size_t i = 0; i < pdic.positions_.size();) {
+    size_t ref_pos_idx = pdic.positions_[i].ref_pos_idx;
+    while (i < pdic.positions_.size() &&
+           pdic.positions_[i].ref_pos_idx == ref_pos_idx) {
+      i++;
+    }
+    num_ref_patch++;
+  }
+  add_num(kNumRefPatchContext, num_ref_patch);
+  size_t blend_pos = 0;
+  for (size_t i = 0; i < pdic.positions_.size();) {
+    size_t i_start = i;
+    size_t ref_pos_idx = pdic.positions_[i].ref_pos_idx;
+    const auto& ref_pos = pdic.ref_positions_[ref_pos_idx];
+    while (i < pdic.positions_.size() &&
+           pdic.positions_[i].ref_pos_idx == ref_pos_idx) {
+      i++;
+    }
+    size_t num = i - i_start;
+    JXL_ASSERT(num > 0);
+    add_num(kReferenceFrameContext, ref_pos.ref);
+    add_num(kPatchReferencePositionContext, ref_pos.x0);
+    add_num(kPatchReferencePositionContext, ref_pos.y0);
+    add_num(kPatchSizeContext, ref_pos.xsize - 1);
+    add_num(kPatchSizeContext, ref_pos.ysize - 1);
+    add_num(kPatchCountContext, num - 1);
+    for (size_t j = i_start; j < i; j++) {
+      const PatchPosition& pos = pdic.positions_[j];
+      if (j == i_start) {
+        add_num(kPatchPositionContext, pos.x);
+        add_num(kPatchPositionContext, pos.y);
+      } else {
+        add_num(kPatchOffsetContext,
+                PackSigned(pos.x - pdic.positions_[j - 1].x));
+        add_num(kPatchOffsetContext,
+                PackSigned(pos.y - pdic.positions_[j - 1].y));
+      }
+      for (size_t j = 0; j < num_ec + 1; ++j, ++blend_pos) {
+        const PatchBlending& info = pdic.blendings_[blend_pos];
+        add_num(kPatchBlendModeContext, static_cast<uint32_t>(info.mode));
+        if (UsesAlpha(info.mode) &&
+            pdic.shared_->metadata->m.extra_channel_info.size() > 1) {
+          add_num(kPatchAlphaChannelContext, info.alpha_channel);
+        }
+        if (UsesClamp(info.mode)) {
+          add_num(kPatchClampContext, info.clamp);
+        }
+      }
+    }
+  }
+
+  EntropyEncodingData codes;
+  std::vector<uint8_t> context_map;
+  BuildAndEncodeHistograms(HistogramParams(), kNumPatchDictionaryContexts,
+                           tokens, &codes, &context_map, writer, layer,
+                           aux_out);
+  WriteTokens(tokens[0], codes, context_map, writer, layer, aux_out);
+}
+
+// static
+void PatchDictionaryEncoder::SubtractFrom(const PatchDictionary& pdic,
+                                          Image3F* opsin) {
+  size_t num_ec = pdic.shared_->metadata->m.num_extra_channels;
+  // TODO(veluca): this can likely be optimized knowing it runs on full images.
+  for (size_t y = 0; y < opsin->ysize(); y++) {
+    float* JXL_RESTRICT rows[3] = {
+        opsin->PlaneRow(0, y),
+        opsin->PlaneRow(1, y),
+        opsin->PlaneRow(2, y),
+    };
+    for (size_t pos_idx : pdic.GetPatchesForRow(y)) {
+      const size_t blending_idx = pos_idx * (num_ec + 1);
+      const PatchPosition& pos = pdic.positions_[pos_idx];
+      const PatchReferencePosition& ref_pos =
+          pdic.ref_positions_[pos.ref_pos_idx];
+      const PatchBlendMode mode = pdic.blendings_[blending_idx].mode;
+      size_t by = pos.y;
+      size_t bx = pos.x;
+      size_t xsize = ref_pos.xsize;
+      JXL_DASSERT(y >= by);
+      JXL_DASSERT(y < by + ref_pos.ysize);
+      size_t iy = y - by;
+      size_t ref = ref_pos.ref;
+      const float* JXL_RESTRICT ref_rows[3] = {
+          pdic.shared_->reference_frames[ref].frame.color().ConstPlaneRow(
+              0, ref_pos.y0 + iy) +
+              ref_pos.x0,
+          pdic.shared_->reference_frames[ref].frame.color().ConstPlaneRow(
+              1, ref_pos.y0 + iy) +
+              ref_pos.x0,
+          pdic.shared_->reference_frames[ref].frame.color().ConstPlaneRow(
+              2, ref_pos.y0 + iy) +
+              ref_pos.x0,
+      };
+      for (size_t ix = 0; ix < xsize; ix++) {
+        for (size_t c = 0; c < 3; c++) {
+          if (mode == PatchBlendMode::kAdd) {
+            rows[c][bx + ix] -= ref_rows[c][ix];
+          } else if (mode == PatchBlendMode::kReplace) {
+            rows[c][bx + ix] = 0;
+          } else if (mode == PatchBlendMode::kNone) {
+            // Nothing to do.
+          } else {
+            JXL_ABORT("Blending mode %u not yet implemented", (uint32_t)mode);
+          }
+        }
+      }
+    }
+  }
+}
+
+namespace {
+
+struct PatchColorspaceInfo {
+  float kChannelDequant[3];
+  float kChannelWeights[3];
+
+  explicit PatchColorspaceInfo(bool is_xyb) {
+    if (is_xyb) {
+      kChannelDequant[0] = 0.01615;
+      kChannelDequant[1] = 0.08875;
+      kChannelDequant[2] = 0.1922;
+      kChannelWeights[0] = 30.0;
+      kChannelWeights[1] = 3.0;
+      kChannelWeights[2] = 1.0;
+    } else {
+      kChannelDequant[0] = 20.0f / 255;
+      kChannelDequant[1] = 22.0f / 255;
+      kChannelDequant[2] = 20.0f / 255;
+      kChannelWeights[0] = 0.017 * 255;
+      kChannelWeights[1] = 0.02 * 255;
+      kChannelWeights[2] = 0.017 * 255;
+    }
+  }
+
+  float ScaleForQuantization(float val, size_t c) {
+    return val / kChannelDequant[c];
+  }
+
+  int Quantize(float val, size_t c) {
+    return truncf(ScaleForQuantization(val, c));
+  }
+
+  bool is_similar_v(const float v1[3], const float v2[3], float threshold) {
+    float distance = 0;
+    for (size_t c = 0; c < 3; c++) {
+      distance += std::fabs(v1[c] - v2[c]) * kChannelWeights[c];
+    }
+    return distance <= threshold;
+  }
+};
+
+std::vector<PatchInfo> FindTextLikePatches(
+    const Image3F& opsin, const PassesEncoderState* JXL_RESTRICT state,
+    ThreadPool* pool, AuxOut* aux_out, bool is_xyb) {
+  if (state->cparams.patches == Override::kOff) return {};
+
+  PatchColorspaceInfo pci(is_xyb);
+  float kSimilarThreshold = 0.8f;
+
+  auto is_similar_impl = [&pci](std::pair<uint32_t, uint32_t> p1,
+                                std::pair<uint32_t, uint32_t> p2,
+                                const float* JXL_RESTRICT rows[3],
+                                size_t stride, float threshold) {
+    float v1[3], v2[3];
+    for (size_t c = 0; c < 3; c++) {
+      v1[c] = rows[c][p1.second * stride + p1.first];
+      v2[c] = rows[c][p2.second * stride + p2.first];
+    }
+    return pci.is_similar_v(v1, v2, threshold);
+  };
+
+  std::atomic<bool> has_screenshot_areas{false};
+  const size_t opsin_stride = opsin.PixelsPerRow();
+  const float* JXL_RESTRICT opsin_rows[3] = {opsin.ConstPlaneRow(0, 0),
+                                             opsin.ConstPlaneRow(1, 0),
+                                             opsin.ConstPlaneRow(2, 0)};
+
+  auto is_same = [&opsin_rows, opsin_stride](std::pair<uint32_t, uint32_t> p1,
+                                             std::pair<uint32_t, uint32_t> p2) {
+    for (size_t c = 0; c < 3; c++) {
+      float v1 = opsin_rows[c][p1.second * opsin_stride + p1.first];
+      float v2 = opsin_rows[c][p2.second * opsin_stride + p2.first];
+      if (std::fabs(v1 - v2) > 1e-4) {
+        return false;
+      }
+    }
+    return true;
+  };
+
+  auto is_similar = [&](std::pair<uint32_t, uint32_t> p1,
+                        std::pair<uint32_t, uint32_t> p2) {
+    return is_similar_impl(p1, p2, opsin_rows, opsin_stride, kSimilarThreshold);
+  };
+
+  constexpr int64_t kPatchSide = 4;
+  constexpr int64_t kExtraSide = 4;
+
+  // Look for kPatchSide size squares, naturally aligned, that all have the same
+  // pixel values.
+  ImageB is_screenshot_like(DivCeil(opsin.xsize(), kPatchSide),
+                            DivCeil(opsin.ysize(), kPatchSide));
+  ZeroFillImage(&is_screenshot_like);
+  uint8_t* JXL_RESTRICT screenshot_row = is_screenshot_like.Row(0);
+  const size_t screenshot_stride = is_screenshot_like.PixelsPerRow();
+  const auto process_row = [&](const uint32_t y, size_t /* thread */) {
+    for (uint64_t x = 0; x < opsin.xsize() / kPatchSide; x++) {
+      bool all_same = true;
+      for (size_t iy = 0; iy < static_cast<size_t>(kPatchSide); iy++) {
+        for (size_t ix = 0; ix < static_cast<size_t>(kPatchSide); ix++) {
+          size_t cx = x * kPatchSide + ix;
+          size_t cy = y * kPatchSide + iy;
+          if (!is_same({cx, cy}, {x * kPatchSide, y * kPatchSide})) {
+            all_same = false;
+            break;
+          }
+        }
+      }
+      if (!all_same) continue;
+      size_t num = 0;
+      size_t num_same = 0;
+      for (int64_t iy = -kExtraSide; iy < kExtraSide + kPatchSide; iy++) {
+        for (int64_t ix = -kExtraSide; ix < kExtraSide + kPatchSide; ix++) {
+          int64_t cx = x * kPatchSide + ix;
+          int64_t cy = y * kPatchSide + iy;
+          if (cx < 0 || static_cast<uint64_t>(cx) >= opsin.xsize() ||  //
+              cy < 0 || static_cast<uint64_t>(cy) >= opsin.ysize()) {
+            continue;
+          }
+          num++;
+          if (is_same({cx, cy}, {x * kPatchSide, y * kPatchSide})) num_same++;
+        }
+      }
+      // Too few equal pixels nearby.
+      if (num_same * 8 < num * 7) continue;
+      screenshot_row[y * screenshot_stride + x] = 1;
+      has_screenshot_areas = true;
+    }
+  };
+  JXL_CHECK(RunOnPool(pool, 0, opsin.ysize() / kPatchSide, ThreadPool::NoInit,
+                      process_row, "IsScreenshotLike"));
+
+  // TODO(veluca): also parallelize the rest of this function.
+  if (WantDebugOutput(aux_out)) {
+    aux_out->DumpPlaneNormalized("screenshot_like", is_screenshot_like);
+  }
+
+  constexpr int kSearchRadius = 1;
+
+  if (!ApplyOverride(state->cparams.patches, has_screenshot_areas)) {
+    return {};
+  }
+
+  // Search for "similar enough" pixels near the screenshot-like areas.
+  ImageB is_background(opsin.xsize(), opsin.ysize());
+  ZeroFillImage(&is_background);
+  Image3F background(opsin.xsize(), opsin.ysize());
+  ZeroFillImage(&background);
+  constexpr size_t kDistanceLimit = 50;
+  float* JXL_RESTRICT background_rows[3] = {
+      background.PlaneRow(0, 0),
+      background.PlaneRow(1, 0),
+      background.PlaneRow(2, 0),
+  };
+  const size_t background_stride = background.PixelsPerRow();
+  uint8_t* JXL_RESTRICT is_background_row = is_background.Row(0);
+  const size_t is_background_stride = is_background.PixelsPerRow();
+  std::vector<
+      std::pair<std::pair<uint32_t, uint32_t>, std::pair<uint32_t, uint32_t>>>
+      queue;
+  size_t queue_front = 0;
+  for (size_t y = 0; y < opsin.ysize(); y++) {
+    for (size_t x = 0; x < opsin.xsize(); x++) {
+      if (!screenshot_row[screenshot_stride * (y / kPatchSide) +
+                          (x / kPatchSide)])
+        continue;
+      queue.push_back({{x, y}, {x, y}});
+    }
+  }
+  while (queue.size() != queue_front) {
+    std::pair<uint32_t, uint32_t> cur = queue[queue_front].first;
+    std::pair<uint32_t, uint32_t> src = queue[queue_front].second;
+    queue_front++;
+    if (is_background_row[cur.second * is_background_stride + cur.first])
+      continue;
+    is_background_row[cur.second * is_background_stride + cur.first] = 1;
+    for (size_t c = 0; c < 3; c++) {
+      background_rows[c][cur.second * background_stride + cur.first] =
+          opsin_rows[c][src.second * opsin_stride + src.first];
+    }
+    for (int dx = -kSearchRadius; dx <= kSearchRadius; dx++) {
+      for (int dy = -kSearchRadius; dy <= kSearchRadius; dy++) {
+        if (dx == 0 && dy == 0) continue;
+        int next_first = cur.first + dx;
+        int next_second = cur.second + dy;
+        if (next_first < 0 || next_second < 0 ||
+            static_cast<uint32_t>(next_first) >= opsin.xsize() ||
+            static_cast<uint32_t>(next_second) >= opsin.ysize()) {
+          continue;
+        }
+        if (static_cast<uint32_t>(
+                std::abs(next_first - static_cast<int>(src.first)) +
+                std::abs(next_second - static_cast<int>(src.second))) >
+            kDistanceLimit) {
+          continue;
+        }
+        std::pair<uint32_t, uint32_t> next{next_first, next_second};
+        if (is_similar(src, next)) {
+          if (!screenshot_row[next.second / kPatchSide * screenshot_stride +
+                              next.first / kPatchSide] ||
+              is_same(src, next)) {
+            if (!is_background_row[next.second * is_background_stride +
+                                   next.first])
+              queue.emplace_back(next, src);
+          }
+        }
+      }
+    }
+  }
+  queue.clear();
+
+  ImageF ccs;
+  Rng rng(0);
+  bool paint_ccs = false;
+  if (WantDebugOutput(aux_out)) {
+    aux_out->DumpPlaneNormalized("is_background", is_background);
+    if (is_xyb) {
+      aux_out->DumpXybImage("background", background);
+    } else {
+      aux_out->DumpImage("background", background);
+    }
+    ccs = ImageF(opsin.xsize(), opsin.ysize());
+    ZeroFillImage(&ccs);
+    paint_ccs = true;
+  }
+
+  constexpr float kVerySimilarThreshold = 0.03f;
+  constexpr float kHasSimilarThreshold = 0.03f;
+
+  const float* JXL_RESTRICT const_background_rows[3] = {
+      background_rows[0], background_rows[1], background_rows[2]};
+  auto is_similar_b = [&](std::pair<int, int> p1, std::pair<int, int> p2) {
+    return is_similar_impl(p1, p2, const_background_rows, background_stride,
+                           kVerySimilarThreshold);
+  };
+
+  constexpr int kMinPeak = 2;
+  constexpr int kHasSimilarRadius = 2;
+
+  std::vector<PatchInfo> info;
+
+  // Find small CC outside the "similar enough" areas, compute bounding boxes,
+  // and run heuristics to exclude some patches.
+  ImageB visited(opsin.xsize(), opsin.ysize());
+  ZeroFillImage(&visited);
+  uint8_t* JXL_RESTRICT visited_row = visited.Row(0);
+  const size_t visited_stride = visited.PixelsPerRow();
+  std::vector<std::pair<uint32_t, uint32_t>> cc;
+  std::vector<std::pair<uint32_t, uint32_t>> stack;
+  for (size_t y = 0; y < opsin.ysize(); y++) {
+    for (size_t x = 0; x < opsin.xsize(); x++) {
+      if (is_background_row[y * is_background_stride + x]) continue;
+      cc.clear();
+      stack.clear();
+      stack.emplace_back(x, y);
+      size_t min_x = x;
+      size_t max_x = x;
+      size_t min_y = y;
+      size_t max_y = y;
+      std::pair<uint32_t, uint32_t> reference;
+      bool found_border = false;
+      bool all_similar = true;
+      while (!stack.empty()) {
+        std::pair<uint32_t, uint32_t> cur = stack.back();
+        stack.pop_back();
+        if (visited_row[cur.second * visited_stride + cur.first]) continue;
+        visited_row[cur.second * visited_stride + cur.first] = 1;
+        if (cur.first < min_x) min_x = cur.first;
+        if (cur.first > max_x) max_x = cur.first;
+        if (cur.second < min_y) min_y = cur.second;
+        if (cur.second > max_y) max_y = cur.second;
+        if (paint_ccs) {
+          cc.push_back(cur);
+        }
+        for (int dx = -kSearchRadius; dx <= kSearchRadius; dx++) {
+          for (int dy = -kSearchRadius; dy <= kSearchRadius; dy++) {
+            if (dx == 0 && dy == 0) continue;
+            int next_first = static_cast<int32_t>(cur.first) + dx;
+            int next_second = static_cast<int32_t>(cur.second) + dy;
+            if (next_first < 0 || next_second < 0 ||
+                static_cast<uint32_t>(next_first) >= opsin.xsize() ||
+                static_cast<uint32_t>(next_second) >= opsin.ysize()) {
+              continue;
+            }
+            std::pair<uint32_t, uint32_t> next{next_first, next_second};
+            if (!is_background_row[next.second * is_background_stride +
+                                   next.first]) {
+              stack.push_back(next);
+            } else {
+              if (!found_border) {
+                reference = next;
+                found_border = true;
+              } else {
+                if (!is_similar_b(next, reference)) all_similar = false;
+              }
+            }
+          }
+        }
+      }
+      if (!found_border || !all_similar || max_x - min_x >= kMaxPatchSize ||
+          max_y - min_y >= kMaxPatchSize) {
+        continue;
+      }
+      size_t bpos = background_stride * reference.second + reference.first;
+      float ref[3] = {background_rows[0][bpos], background_rows[1][bpos],
+                      background_rows[2][bpos]};
+      bool has_similar = false;
+      for (size_t iy = std::max<int>(
+               static_cast<int32_t>(min_y) - kHasSimilarRadius, 0);
+           iy < std::min(max_y + kHasSimilarRadius + 1, opsin.ysize()); iy++) {
+        for (size_t ix = std::max<int>(
+                 static_cast<int32_t>(min_x) - kHasSimilarRadius, 0);
+             ix < std::min(max_x + kHasSimilarRadius + 1, opsin.xsize());
+             ix++) {
+          size_t opos = opsin_stride * iy + ix;
+          float px[3] = {opsin_rows[0][opos], opsin_rows[1][opos],
+                         opsin_rows[2][opos]};
+          if (pci.is_similar_v(ref, px, kHasSimilarThreshold)) {
+            has_similar = true;
+          }
+        }
+      }
+      if (!has_similar) continue;
+      info.emplace_back();
+      info.back().second.emplace_back(min_x, min_y);
+      QuantizedPatch& patch = info.back().first;
+      patch.xsize = max_x - min_x + 1;
+      patch.ysize = max_y - min_y + 1;
+      int max_value = 0;
+      for (size_t c : {1, 0, 2}) {
+        for (size_t iy = min_y; iy <= max_y; iy++) {
+          for (size_t ix = min_x; ix <= max_x; ix++) {
+            size_t offset = (iy - min_y) * patch.xsize + ix - min_x;
+            patch.fpixels[c][offset] =
+                opsin_rows[c][iy * opsin_stride + ix] - ref[c];
+            int val = pci.Quantize(patch.fpixels[c][offset], c);
+            patch.pixels[c][offset] = val;
+            if (std::abs(val) > max_value) max_value = std::abs(val);
+          }
+        }
+      }
+      if (max_value < kMinPeak) {
+        info.pop_back();
+        continue;
+      }
+      if (paint_ccs) {
+        float cc_color = rng.UniformF(0.5, 1.0);
+        for (std::pair<uint32_t, uint32_t> p : cc) {
+          ccs.Row(p.second)[p.first] = cc_color;
+        }
+      }
+    }
+  }
+
+  if (paint_ccs) {
+    JXL_ASSERT(WantDebugOutput(aux_out));
+    aux_out->DumpPlaneNormalized("ccs", ccs);
+  }
+  if (info.empty()) {
+    return {};
+  }
+
+  // Remove duplicates.
+  constexpr size_t kMinPatchOccurrences = 2;
+  std::sort(info.begin(), info.end());
+  size_t unique = 0;
+  for (size_t i = 1; i < info.size(); i++) {
+    if (info[i].first == info[unique].first) {
+      info[unique].second.insert(info[unique].second.end(),
+                                 info[i].second.begin(), info[i].second.end());
+    } else {
+      if (info[unique].second.size() >= kMinPatchOccurrences) {
+        unique++;
+      }
+      info[unique] = info[i];
+    }
+  }
+  if (info[unique].second.size() >= kMinPatchOccurrences) {
+    unique++;
+  }
+  info.resize(unique);
+
+  size_t max_patch_size = 0;
+
+  for (size_t i = 0; i < info.size(); i++) {
+    size_t pixels = info[i].first.xsize * info[i].first.ysize;
+    if (pixels > max_patch_size) max_patch_size = pixels;
+  }
+
+  // don't use patches if all patches are smaller than this
+  constexpr size_t kMinMaxPatchSize = 20;
+  if (max_patch_size < kMinMaxPatchSize) return {};
+
+  return info;
+}
+
+}  // namespace
+
+void FindBestPatchDictionary(const Image3F& opsin,
+                             PassesEncoderState* JXL_RESTRICT state,
+                             const JxlCmsInterface& cms, ThreadPool* pool,
+                             AuxOut* aux_out, bool is_xyb) {
+  std::vector<PatchInfo> info =
+      FindTextLikePatches(opsin, state, pool, aux_out, is_xyb);
+
+  // TODO(veluca): this doesn't work if both dots and patches are enabled.
+  // For now, since dots and patches are not likely to occur in the same kind of
+  // images, disable dots if some patches were found.
+  if (info.empty() &&
+      ApplyOverride(
+          state->cparams.dots,
+          state->cparams.speed_tier <= SpeedTier::kSquirrel &&
+              state->cparams.butteraugli_distance >= kMinButteraugliForDots)) {
+    info = FindDotDictionary(state->cparams, opsin, state->shared.cmap, pool);
+  }
+
+  if (info.empty()) return;
+
+  std::sort(
+      info.begin(), info.end(), [&](const PatchInfo& a, const PatchInfo& b) {
+        return a.first.xsize * a.first.ysize > b.first.xsize * b.first.ysize;
+      });
+
+  size_t max_x_size = 0;
+  size_t max_y_size = 0;
+  size_t total_pixels = 0;
+
+  for (size_t i = 0; i < info.size(); i++) {
+    size_t pixels = info[i].first.xsize * info[i].first.ysize;
+    if (max_x_size < info[i].first.xsize) max_x_size = info[i].first.xsize;
+    if (max_y_size < info[i].first.ysize) max_y_size = info[i].first.ysize;
+    total_pixels += pixels;
+  }
+
+  // Bin-packing & conversion of patches.
+  constexpr float kBinPackingSlackness = 1.05f;
+  size_t ref_xsize = std::max<float>(max_x_size, std::sqrt(total_pixels));
+  size_t ref_ysize = std::max<float>(max_y_size, std::sqrt(total_pixels));
+  std::vector<std::pair<size_t, size_t>> ref_positions(info.size());
+  // TODO(veluca): allow partial overlaps of patches that have the same pixels.
+  size_t max_y = 0;
+  do {
+    max_y = 0;
+    // Increase packed image size.
+    ref_xsize = ref_xsize * kBinPackingSlackness + 1;
+    ref_ysize = ref_ysize * kBinPackingSlackness + 1;
+
+    ImageB occupied(ref_xsize, ref_ysize);
+    ZeroFillImage(&occupied);
+    uint8_t* JXL_RESTRICT occupied_rows = occupied.Row(0);
+    size_t occupied_stride = occupied.PixelsPerRow();
+
+    bool success = true;
+    // For every patch...
+    for (size_t patch = 0; patch < info.size(); patch++) {
+      size_t x0 = 0;
+      size_t y0 = 0;
+      size_t xsize = info[patch].first.xsize;
+      size_t ysize = info[patch].first.ysize;
+      bool found = false;
+      // For every possible start position ...
+      for (; y0 + ysize <= ref_ysize; y0++) {
+        x0 = 0;
+        for (; x0 + xsize <= ref_xsize; x0++) {
+          bool has_occupied_pixel = false;
+          size_t x = x0;
+          // Check if it is possible to place the patch in this position in the
+          // reference frame.
+          for (size_t y = y0; y < y0 + ysize; y++) {
+            x = x0;
+            for (; x < x0 + xsize; x++) {
+              if (occupied_rows[y * occupied_stride + x]) {
+                has_occupied_pixel = true;
+                break;
+              }
+            }
+          }  // end of positioning check
+          if (!has_occupied_pixel) {
+            found = true;
+            break;
+          }
+          x0 = x;  // Jump to next pixel after the occupied one.
+        }
+        if (found) break;
+      }  // end of start position checking
+
+      // We didn't find a possible position: repeat from the beginning with a
+      // larger reference frame size.
+      if (!found) {
+        success = false;
+        break;
+      }
+
+      // We found a position: mark the corresponding positions in the reference
+      // image as used.
+      ref_positions[patch] = {x0, y0};
+      for (size_t y = y0; y < y0 + ysize; y++) {
+        for (size_t x = x0; x < x0 + xsize; x++) {
+          occupied_rows[y * occupied_stride + x] = true;
+        }
+      }
+      max_y = std::max(max_y, y0 + ysize);
+    }
+
+    if (success) break;
+  } while (true);
+
+  JXL_ASSERT(ref_ysize >= max_y);
+
+  ref_ysize = max_y;
+
+  Image3F reference_frame(ref_xsize, ref_ysize);
+  // TODO(veluca): figure out a better way to fill the image.
+  ZeroFillImage(&reference_frame);
+  std::vector<PatchPosition> positions;
+  std::vector<PatchReferencePosition> pref_positions;
+  std::vector<PatchBlending> blendings;
+  float* JXL_RESTRICT ref_rows[3] = {
+      reference_frame.PlaneRow(0, 0),
+      reference_frame.PlaneRow(1, 0),
+      reference_frame.PlaneRow(2, 0),
+  };
+  size_t ref_stride = reference_frame.PixelsPerRow();
+  size_t num_ec = state->shared.metadata->m.num_extra_channels;
+
+  for (size_t i = 0; i < info.size(); i++) {
+    PatchReferencePosition ref_pos;
+    ref_pos.xsize = info[i].first.xsize;
+    ref_pos.ysize = info[i].first.ysize;
+    ref_pos.x0 = ref_positions[i].first;
+    ref_pos.y0 = ref_positions[i].second;
+    ref_pos.ref = kPatchFrameReferenceId;
+    for (size_t y = 0; y < ref_pos.ysize; y++) {
+      for (size_t x = 0; x < ref_pos.xsize; x++) {
+        for (size_t c = 0; c < 3; c++) {
+          ref_rows[c][(y + ref_pos.y0) * ref_stride + x + ref_pos.x0] =
+              info[i].first.fpixels[c][y * ref_pos.xsize + x];
+        }
+      }
+    }
+    for (const auto& pos : info[i].second) {
+      positions.emplace_back(
+          PatchPosition{pos.first, pos.second, pref_positions.size()});
+      // Add blending for color channels, ignore other channels.
+      blendings.push_back({PatchBlendMode::kAdd, 0, false});
+      for (size_t j = 0; j < num_ec; ++j) {
+        blendings.push_back({PatchBlendMode::kNone, 0, false});
+      }
+    }
+    pref_positions.emplace_back(std::move(ref_pos));
+  }
+
+  CompressParams cparams = state->cparams;
+  // Recursive application of patches could create very weird issues.
+  cparams.patches = Override::kOff;
+
+  RoundtripPatchFrame(&reference_frame, state, kPatchFrameReferenceId, cparams,
+                      cms, pool, aux_out, /*subtract=*/true);
+
+  // TODO(veluca): this assumes that applying patches is commutative, which is
+  // not true for all blending modes. This code only produces kAdd patches, so
+  // this works out.
+  PatchDictionaryEncoder::SetPositions(
+      &state->shared.image_features.patches, std::move(positions),
+      std::move(pref_positions), std::move(blendings));
+}
+
+void RoundtripPatchFrame(Image3F* reference_frame,
+                         PassesEncoderState* JXL_RESTRICT state, int idx,
+                         CompressParams& cparams, const JxlCmsInterface& cms,
+                         ThreadPool* pool, AuxOut* aux_out, bool subtract) {
+  FrameInfo patch_frame_info;
+  cparams.resampling = 1;
+  cparams.ec_resampling = 1;
+  cparams.dots = Override::kOff;
+  cparams.noise = Override::kOff;
+  cparams.modular_mode = true;
+  cparams.responsive = 0;
+  cparams.progressive_dc = 0;
+  cparams.progressive_mode = false;
+  cparams.qprogressive_mode = false;
+  // Use gradient predictor and not Predictor::Best.
+  cparams.options.predictor = Predictor::Gradient;
+  patch_frame_info.save_as_reference = idx;  // always saved.
+  patch_frame_info.frame_type = FrameType::kReferenceOnly;
+  patch_frame_info.save_before_color_transform = true;
+  ImageBundle ib(&state->shared.metadata->m);
+  // TODO(veluca): metadata.color_encoding is a lie: ib is in XYB, but there is
+  // no simple way to express that yet.
+  patch_frame_info.ib_needs_color_transform = false;
+  ib.SetFromImage(std::move(*reference_frame),
+                  state->shared.metadata->m.color_encoding);
+  if (!ib.metadata()->extra_channel_info.empty()) {
+    // Add dummy extra channels to the patch image: patch encoding does not yet
+    // support extra channels, but the codec expects that the amount of extra
+    // channels in frames matches that in the metadata of the codestream.
+    std::vector<ImageF> extra_channels;
+    extra_channels.reserve(ib.metadata()->extra_channel_info.size());
+    for (size_t i = 0; i < ib.metadata()->extra_channel_info.size(); i++) {
+      extra_channels.emplace_back(ib.xsize(), ib.ysize());
+      // Must initialize the image with data to not affect blending with
+      // uninitialized memory.
+      // TODO(lode): patches must copy and use the real extra channels instead.
+      ZeroFillImage(&extra_channels.back());
+    }
+    ib.SetExtraChannels(std::move(extra_channels));
+  }
+  PassesEncoderState roundtrip_state;
+  auto special_frame = std::unique_ptr<BitWriter>(new BitWriter());
+  AuxOut patch_aux_out;
+  JXL_CHECK(EncodeFrame(cparams, patch_frame_info, state->shared.metadata, ib,
+                        &roundtrip_state, cms, pool, special_frame.get(),
+                        aux_out ? &patch_aux_out : nullptr));
+  if (aux_out) {
+    for (const auto& l : patch_aux_out.layers) {
+      aux_out->layers[kLayerDictionary].Assimilate(l);
+    }
+  }
+  const Span<const uint8_t> encoded = special_frame->GetSpan();
+  state->special_frames.emplace_back(std::move(special_frame));
+  if (subtract) {
+    ImageBundle decoded(&state->shared.metadata->m);
+    PassesDecoderState dec_state;
+    JXL_CHECK(dec_state.output_encoding_info.SetFromMetadata(
+        *state->shared.metadata));
+    const uint8_t* frame_start = encoded.data();
+    size_t encoded_size = encoded.size();
+    JXL_CHECK(DecodeFrame(&dec_state, pool, frame_start, encoded_size, &decoded,
+                          *state->shared.metadata));
+    frame_start += decoded.decoded_bytes();
+    encoded_size -= decoded.decoded_bytes();
+    size_t ref_xsize =
+        dec_state.shared_storage.reference_frames[idx].frame.color()->xsize();
+    // if the frame itself uses patches, we need to decode another frame
+    if (!ref_xsize) {
+      JXL_CHECK(DecodeFrame(&dec_state, pool, frame_start, encoded_size,
+                            &decoded, *state->shared.metadata));
+    }
+    JXL_CHECK(encoded_size == 0);
+    state->shared.reference_frames[idx] =
+        std::move(dec_state.shared_storage.reference_frames[idx]);
+  } else {
+    state->shared.reference_frames[idx].frame = std::move(ib);
+  }
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/enc_patch_dictionary.h b/third_party/jpeg-xl/lib/jxl/enc_patch_dictionary.h
new file mode 100644
index 0000000000..f30881b232
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_patch_dictionary.h
@@ -0,0 +1,109 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_PATCH_DICTIONARY_H_
+#define LIB_JXL_ENC_PATCH_DICTIONARY_H_
+
+// Chooses reference patches, and avoids encoding them once per occurrence.
+
+#include <stddef.h>
+#include <string.h>
+#include <sys/types.h>
+
+#include <tuple>
+#include <vector>
+
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/chroma_from_luma.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/dec_patch_dictionary.h"
+#include "lib/jxl/enc_bit_writer.h"
+#include "lib/jxl/enc_cache.h"
+#include "lib/jxl/enc_params.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/opsin_params.h"
+
+namespace jxl {
+
+struct AuxOut;
+
+constexpr size_t kMaxPatchSize = 32;
+
+struct QuantizedPatch {
+  size_t xsize;
+  size_t ysize;
+  QuantizedPatch() {
+    for (size_t i = 0; i < 3; i++) {
+      pixels[i].resize(kMaxPatchSize * kMaxPatchSize);
+      fpixels[i].resize(kMaxPatchSize * kMaxPatchSize);
+    }
+  }
+  std::vector<int8_t> pixels[3] = {};
+  // Not compared. Used only to retrieve original pixels to construct the
+  // reference image.
+  std::vector<float> fpixels[3] = {};
+  bool operator==(const QuantizedPatch& other) const {
+    if (xsize != other.xsize) return false;
+    if (ysize != other.ysize) return false;
+    for (size_t c = 0; c < 3; c++) {
+      if (memcmp(pixels[c].data(), other.pixels[c].data(),
+                 sizeof(int8_t) * xsize * ysize) != 0)
+        return false;
+    }
+    return true;
+  }
+
+  bool operator<(const QuantizedPatch& other) const {
+    if (xsize != other.xsize) return xsize < other.xsize;
+    if (ysize != other.ysize) return ysize < other.ysize;
+    for (size_t c = 0; c < 3; c++) {
+      int cmp = memcmp(pixels[c].data(), other.pixels[c].data(),
+                       sizeof(int8_t) * xsize * ysize);
+      if (cmp > 0) return false;
+      if (cmp < 0) return true;
+    }
+    return false;
+  }
+};
+
+// Pair (patch, vector of occurrences).
+using PatchInfo =
+    std::pair<QuantizedPatch, std::vector<std::pair<uint32_t, uint32_t>>>;
+
+// Friend class of PatchDictionary.
+class PatchDictionaryEncoder {
+ public:
+  // Only call if HasAny().
+  static void Encode(const PatchDictionary& pdic, BitWriter* writer,
+                     size_t layer, AuxOut* aux_out);
+
+  static void SetPositions(PatchDictionary* pdic,
+                           std::vector<PatchPosition> positions,
+                           std::vector<PatchReferencePosition> ref_positions,
+                           std::vector<PatchBlending> blendings) {
+    pdic->positions_ = std::move(positions);
+    pdic->ref_positions_ = std::move(ref_positions);
+    pdic->blendings_ = std::move(blendings);
+    pdic->ComputePatchTree();
+  }
+
+  static void SubtractFrom(const PatchDictionary& pdic, Image3F* opsin);
+};
+
+void FindBestPatchDictionary(const Image3F& opsin,
+                             PassesEncoderState* JXL_RESTRICT state,
+                             const JxlCmsInterface& cms, ThreadPool* pool,
+                             AuxOut* aux_out, bool is_xyb = true);
+
+void RoundtripPatchFrame(Image3F* reference_frame,
+                         PassesEncoderState* JXL_RESTRICT state, int idx,
+                         CompressParams& cparams, const JxlCmsInterface& cms,
+                         ThreadPool* pool, AuxOut* aux_out, bool subtract);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_PATCH_DICTIONARY_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_photon_noise.cc b/third_party/jpeg-xl/lib/jxl/enc_photon_noise.cc
new file mode 100644
index 0000000000..3786ef5cf5
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_photon_noise.cc
@@ -0,0 +1,89 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_photon_noise.h"
+
+namespace jxl {
+
+namespace {
+
+// Assumes a daylight-like spectrum.
+// https://www.strollswithmydog.com/effective-quantum-efficiency-of-sensor/#:~:text=11%2C260%20photons/um%5E2/lx-s
+constexpr float kPhotonsPerLxSPerUm2 = 11260;
+
+// Order of magnitude for cameras in the 2010-2020 decade, taking the CFA into
+// account.
+constexpr float kEffectiveQuantumEfficiency = 0.20;
+
+// TODO(sboukortt): reevaluate whether these are good defaults, notably whether
+// it would be worth making read noise higher at lower ISO settings.
+constexpr float kPhotoResponseNonUniformity = 0.005;
+constexpr float kInputReferredReadNoise = 3;
+
+// Assumes a 35mm sensor.
+constexpr float kSensorAreaUm2 = 36000.f * 24000;
+
+template <typename T>
+inline constexpr T Square(const T x) {
+  return x * x;
+}
+template <typename T>
+inline constexpr T Cube(const T x) {
+  return x * x * x;
+}
+
+}  // namespace
+
+NoiseParams SimulatePhotonNoise(const size_t xsize, const size_t ysize,
+                                const float iso) {
+  const float kOpsinAbsorbanceBiasCbrt = std::cbrt(kOpsinAbsorbanceBias[1]);
+
+  // Focal plane exposure for 18% of kDefaultIntensityTarget, in lx·s.
+  // (ISO = 10 lx·s ÷ H)
+  const float h_18 = 10 / iso;
+
+  const float pixel_area_um2 = kSensorAreaUm2 / (xsize * ysize);
+
+  const float electrons_per_pixel_18 = kEffectiveQuantumEfficiency *
+                                       kPhotonsPerLxSPerUm2 * h_18 *
+                                       pixel_area_um2;
+
+  NoiseParams params;
+
+  for (size_t i = 0; i < NoiseParams::kNumNoisePoints; ++i) {
+    const float scaled_index = i / (NoiseParams::kNumNoisePoints - 2.f);
+    // scaled_index is used for XYB = (0, 2·scaled_index, 2·scaled_index)
+    const float y = 2 * scaled_index;
+    // 1 = default intensity target
+    const float linear = std::max(
+        0.f, Cube(y - kOpsinAbsorbanceBiasCbrt) + kOpsinAbsorbanceBias[1]);
+    const float electrons_per_pixel = electrons_per_pixel_18 * (linear / 0.18f);
+    // Quadrature sum of read noise, photon shot noise (sqrt(S) so simply not
+    // squared here) and photo response non-uniformity.
+    // https://doi.org/10.1117/3.725073
+    // Units are electrons rms.
+    const float noise =
+        std::sqrt(Square(kInputReferredReadNoise) + electrons_per_pixel +
+                  Square(kPhotoResponseNonUniformity * electrons_per_pixel));
+    const float linear_noise = noise * (0.18f / electrons_per_pixel_18);
+    const float opsin_derivative =
+        (1.f / 3) / Square(std::cbrt(linear - kOpsinAbsorbanceBias[1]));
+    const float opsin_noise = linear_noise * opsin_derivative;
+
+    // TODO(sboukortt): verify more thoroughly whether the denominator is
+    // correct.
+    params.lut[i] =
+        Clamp1(opsin_noise /
+                   (0.22f             // norm_const
+                    * std::sqrt(2.f)  // red_noise + green_noise
+                    * 1.13f  // standard deviation of a plane of generated noise
+                    ),
+               0.f, 1.f);
+  }
+
+  return params;
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/enc_photon_noise.h b/third_party/jpeg-xl/lib/jxl/enc_photon_noise.h
new file mode 100644
index 0000000000..f43e14d560
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_photon_noise.h
@@ -0,0 +1,22 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_PHOTON_NOISE_H_
+#define LIB_JXL_ENC_PHOTON_NOISE_H_
+
+#include "lib/jxl/dec_xyb.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/noise.h"
+
+namespace jxl {
+
+// Constructs a NoiseParams representing the noise that would be seen at the
+// selected nominal exposure on a last-decade (as of 2021) color camera with a
+// 36×24mm sensor (“35mm format”).
+NoiseParams SimulatePhotonNoise(size_t xsize, size_t ysize, float iso);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_PHOTON_NOISE_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_photon_noise_test.cc b/third_party/jpeg-xl/lib/jxl/enc_photon_noise_test.cc
new file mode 100644
index 0000000000..be11b465ad
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_photon_noise_test.cc
@@ -0,0 +1,51 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_photon_noise.h"
+
+#include "lib/jxl/test_utils.h"
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+namespace {
+
+using ::testing::FloatNear;
+using ::testing::Pointwise;
+
+MATCHER(AreApproximatelyEqual, "") {
+  constexpr float kTolerance = 1e-6;
+  const float actual = std::get<0>(arg);
+  const float expected = std::get<1>(arg);
+  return testing::ExplainMatchResult(FloatNear(expected, kTolerance), actual,
+                                     result_listener);
+}
+
+TEST(EncPhotonNoiseTest, LUTs) {
+  EXPECT_THAT(
+      SimulatePhotonNoise(/*xsize=*/6000, /*ysize=*/4000, /*iso=*/100).lut,
+      Pointwise(AreApproximatelyEqual(),
+                {0.00259652, 0.0139648, 0.00681551, 0.00632582, 0.00694917,
+                 0.00803922, 0.00934574, 0.0107607}));
+  EXPECT_THAT(
+      SimulatePhotonNoise(/*xsize=*/6000, /*ysize=*/4000, /*iso=*/800).lut,
+      Pointwise(AreApproximatelyEqual(),
+                {0.02077220, 0.0420923, 0.01820690, 0.01439020, 0.01293670,
+                 0.01254030, 0.01277390, 0.0134161}));
+  EXPECT_THAT(
+      SimulatePhotonNoise(/*xsize=*/6000, /*ysize=*/4000, /*iso=*/6400).lut,
+      Pointwise(AreApproximatelyEqual(),
+                {0.1661770, 0.1691120, 0.05309080, 0.03963960, 0.03357410,
+                 0.03001650, 0.02776740, 0.0263478}));
+
+  // Lower when measured on a per-pixel basis as there are fewer of them.
+  EXPECT_THAT(
+      SimulatePhotonNoise(/*xsize=*/4000, /*ysize=*/3000, /*iso=*/6400).lut,
+      Pointwise(AreApproximatelyEqual(),
+                {0.0830886, 0.1008720, 0.0367748, 0.0280305, 0.0240236,
+                 0.0218040, 0.0205771, 0.0200058}));
+}
+
+}  // namespace
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/enc_progressive_split.cc b/third_party/jpeg-xl/lib/jxl/enc_progressive_split.cc
new file mode 100644
index 0000000000..b65319f3fd
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_progressive_split.cc
@@ -0,0 +1,82 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_progressive_split.h"
+
+#include <string.h>
+
+#include <algorithm>
+#include <memory>
+
+#include "lib/jxl/ac_strategy.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/image.h"
+
+namespace jxl {
+
+template <typename T>
+void ProgressiveSplitter::SplitACCoefficients(
+    const T* JXL_RESTRICT block, const AcStrategy& acs, size_t bx, size_t by,
+    T* JXL_RESTRICT output[kMaxNumPasses]) {
+  size_t size = acs.covered_blocks_x() * acs.covered_blocks_y() * kDCTBlockSize;
+  auto shift_right_round0 = [&](T v, int shift) {
+    T one_if_negative = static_cast<uint32_t>(v) >> 31;
+    T add = (one_if_negative << shift) - one_if_negative;
+    return (v + add) >> shift;
+  };
+  // Early quit for the simple case of only one pass.
+  if (mode_.num_passes == 1) {
+    memcpy(output[0], block, sizeof(T) * size);
+    return;
+  }
+  size_t ncoeffs_all_done_from_earlier_passes = 1;
+
+  int previous_pass_shift = 0;
+  for (size_t num_pass = 0; num_pass < mode_.num_passes; num_pass++) {  // pass
+    // Zero out output block.
+    memset(output[num_pass], 0, size * sizeof(T));
+    const int pass_shift = mode_.passes[num_pass].shift;
+    size_t frame_ncoeffs = mode_.passes[num_pass].num_coefficients;
+    size_t xsize = acs.covered_blocks_x();
+    size_t ysize = acs.covered_blocks_y();
+    CoefficientLayout(&ysize, &xsize);
+    for (size_t y = 0; y < ysize * frame_ncoeffs; y++) {    // superblk-y
+      for (size_t x = 0; x < xsize * frame_ncoeffs; x++) {  // superblk-x
+        size_t pos = y * xsize * kBlockDim + x;
+        if (x < xsize * ncoeffs_all_done_from_earlier_passes &&
+            y < ysize * ncoeffs_all_done_from_earlier_passes) {
+          // This coefficient was already included in an earlier pass,
+          // which included a genuinely smaller set of coefficients.
+          continue;
+        }
+        T v = block[pos];
+        // Previous pass discarded some bits: do not encode them again.
+        if (previous_pass_shift != 0) {
+          T previous_v = shift_right_round0(v, previous_pass_shift) *
+                         (1 << previous_pass_shift);
+          v -= previous_v;
+        }
+        output[num_pass][pos] = shift_right_round0(v, pass_shift);
+      }  // superblk-x
+    }    // superblk-y
+    // We just finished a pass.
+    // Hence, we are now guaranteed to have included all coeffs up to
+    // frame_ncoeffs in every block, unless the current pass is shifted.
+    if (mode_.passes[num_pass].shift == 0) {
+      ncoeffs_all_done_from_earlier_passes = frame_ncoeffs;
+    }
+    previous_pass_shift = mode_.passes[num_pass].shift;
+  }  // num_pass
+}
+
+template void ProgressiveSplitter::SplitACCoefficients<int32_t>(
+    const int32_t* JXL_RESTRICT, const AcStrategy&, size_t, size_t,
+    int32_t* JXL_RESTRICT[kMaxNumPasses]);
+
+template void ProgressiveSplitter::SplitACCoefficients<int16_t>(
+    const int16_t* JXL_RESTRICT, const AcStrategy&, size_t, size_t,
+    int16_t* JXL_RESTRICT[kMaxNumPasses]);
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/enc_progressive_split.h b/third_party/jpeg-xl/lib/jxl/enc_progressive_split.h
new file mode 100644
index 0000000000..ef25944bb7
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_progressive_split.h
@@ -0,0 +1,131 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_PROGRESSIVE_SPLIT_H_
+#define LIB_JXL_PROGRESSIVE_SPLIT_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <limits>
+#include <memory>
+#include <vector>
+
+#include "lib/jxl/ac_strategy.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/chroma_from_luma.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/dct_util.h"
+#include "lib/jxl/frame_header.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/image_ops.h"
+#include "lib/jxl/splines.h"
+
+// Functions to split DCT coefficients in multiple passes. All the passes of a
+// single frame are added together.
+
+namespace jxl {
+
+constexpr size_t kNoDownsamplingFactor = std::numeric_limits<size_t>::max();
+
+struct PassDefinition {
+  // Side of the square of the coefficients that should be kept in each 8x8
+  // block. Must be greater than 1, and at most 8. Should be in non-decreasing
+  // order.
+  size_t num_coefficients;
+
+  // How much to shift the encoded values by, with rounding.
+  size_t shift;
+
+  // If specified, this indicates that if the requested downsampling factor is
+  // sufficiently high, then it is fine to stop decoding after this pass.
+  // By default, passes are not marked as being suitable for any downsampling.
+  size_t suitable_for_downsampling_of_at_least;
+};
+
+struct ProgressiveMode {
+  size_t num_passes = 1;
+  PassDefinition passes[kMaxNumPasses] = {
+      PassDefinition{/*num_coefficients=*/8, /*shift=*/0,
+                     /*suitable_for_downsampling_of_at_least=*/1}};
+
+  ProgressiveMode() = default;
+
+  template <size_t nump>
+  explicit ProgressiveMode(const PassDefinition (&p)[nump]) {
+    JXL_ASSERT(nump <= kMaxNumPasses);
+    num_passes = nump;
+    PassDefinition previous_pass{
+        /*num_coefficients=*/1, /*shift=*/0,
+        /*suitable_for_downsampling_of_at_least=*/kNoDownsamplingFactor};
+    size_t last_downsampling_factor = kNoDownsamplingFactor;
+    for (size_t i = 0; i < nump; i++) {
+      JXL_ASSERT(p[i].num_coefficients > previous_pass.num_coefficients ||
+                 (p[i].num_coefficients == previous_pass.num_coefficients &&
+                  p[i].shift < previous_pass.shift));
+      JXL_ASSERT(p[i].suitable_for_downsampling_of_at_least ==
+                     kNoDownsamplingFactor ||
+                 p[i].suitable_for_downsampling_of_at_least <=
+                     last_downsampling_factor);
+      // Only used inside assert.
+      (void)last_downsampling_factor;
+      if (p[i].suitable_for_downsampling_of_at_least != kNoDownsamplingFactor) {
+        last_downsampling_factor = p[i].suitable_for_downsampling_of_at_least;
+      }
+      previous_pass = passes[i] = p[i];
+    }
+  }
+};
+
+class ProgressiveSplitter {
+ public:
+  void SetProgressiveMode(ProgressiveMode mode) { mode_ = mode; }
+
+  size_t GetNumPasses() const { return mode_.num_passes; }
+
+  void InitPasses(Passes* JXL_RESTRICT passes) const {
+    passes->num_passes = static_cast<uint32_t>(GetNumPasses());
+    passes->num_downsample = 0;
+    JXL_ASSERT(passes->num_passes != 0);
+    passes->shift[passes->num_passes - 1] = 0;
+    if (passes->num_passes == 1) return;  // Done, arrays are empty
+
+    for (uint32_t i = 0; i < mode_.num_passes - 1; ++i) {
+      const size_t min_downsampling_factor =
+          mode_.passes[i].suitable_for_downsampling_of_at_least;
+      passes->shift[i] = mode_.passes[i].shift;
+      if (1 < min_downsampling_factor &&
+          min_downsampling_factor != kNoDownsamplingFactor) {
+        passes->downsample[passes->num_downsample] = min_downsampling_factor;
+        passes->last_pass[passes->num_downsample] = i;
+        if (mode_.passes[i + 1].suitable_for_downsampling_of_at_least <
+            min_downsampling_factor) {
+          passes->num_downsample += 1;
+        }
+      }
+    }
+  }
+
+  template <typename T>
+  void SplitACCoefficients(const T* JXL_RESTRICT block, const AcStrategy& acs,
+                           size_t bx, size_t by,
+                           T* JXL_RESTRICT output[kMaxNumPasses]);
+
+ private:
+  ProgressiveMode mode_;
+};
+
+extern template void ProgressiveSplitter::SplitACCoefficients<int32_t>(
+    const int32_t* JXL_RESTRICT, const AcStrategy&, size_t, size_t,
+    int32_t* JXL_RESTRICT[kMaxNumPasses]);
+
+extern template void ProgressiveSplitter::SplitACCoefficients<int16_t>(
+    const int16_t* JXL_RESTRICT, const AcStrategy&, size_t, size_t,
+    int16_t* JXL_RESTRICT[kMaxNumPasses]);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_PROGRESSIVE_SPLIT_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_quant_weights.cc b/third_party/jpeg-xl/lib/jxl/enc_quant_weights.cc
new file mode 100644
index 0000000000..848310e75d
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_quant_weights.cc
@@ -0,0 +1,214 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_quant_weights.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <algorithm>
+#include <cmath>
+#include <limits>
+#include <utility>
+
+#include "lib/jxl/base/bits.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/dct_scales.h"
+#include "lib/jxl/enc_aux_out.h"
+#include "lib/jxl/enc_bit_writer.h"
+#include "lib/jxl/enc_modular.h"
+#include "lib/jxl/fields.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/modular/encoding/encoding.h"
+#include "lib/jxl/modular/options.h"
+
+namespace jxl {
+
+struct AuxOut;
+
+namespace {
+
+Status EncodeDctParams(const DctQuantWeightParams& params, BitWriter* writer) {
+  JXL_ASSERT(params.num_distance_bands >= 1);
+  writer->Write(DctQuantWeightParams::kLog2MaxDistanceBands,
+                params.num_distance_bands - 1);
+  for (size_t c = 0; c < 3; c++) {
+    for (size_t i = 0; i < params.num_distance_bands; i++) {
+      JXL_RETURN_IF_ERROR(F16Coder::Write(
+          params.distance_bands[c][i] * (i == 0 ? (1 / 64.0f) : 1.0f), writer));
+    }
+  }
+  return true;
+}
+
+Status EncodeQuant(const QuantEncoding& encoding, size_t idx, size_t size_x,
+                   size_t size_y, BitWriter* writer,
+                   ModularFrameEncoder* modular_frame_encoder) {
+  writer->Write(kLog2NumQuantModes, encoding.mode);
+  size_x *= kBlockDim;
+  size_y *= kBlockDim;
+  switch (encoding.mode) {
+    case QuantEncoding::kQuantModeLibrary: {
+      writer->Write(kCeilLog2NumPredefinedTables, encoding.predefined);
+      break;
+    }
+    case QuantEncoding::kQuantModeID: {
+      for (size_t c = 0; c < 3; c++) {
+        for (size_t i = 0; i < 3; i++) {
+          JXL_RETURN_IF_ERROR(
+              F16Coder::Write(encoding.idweights[c][i] * (1.0f / 64), writer));
+        }
+      }
+      break;
+    }
+    case QuantEncoding::kQuantModeDCT2: {
+      for (size_t c = 0; c < 3; c++) {
+        for (size_t i = 0; i < 6; i++) {
+          JXL_RETURN_IF_ERROR(F16Coder::Write(
+              encoding.dct2weights[c][i] * (1.0f / 64), writer));
+        }
+      }
+      break;
+    }
+    case QuantEncoding::kQuantModeDCT4X8: {
+      for (size_t c = 0; c < 3; c++) {
+        JXL_RETURN_IF_ERROR(
+            F16Coder::Write(encoding.dct4x8multipliers[c], writer));
+      }
+      JXL_RETURN_IF_ERROR(EncodeDctParams(encoding.dct_params, writer));
+      break;
+    }
+    case QuantEncoding::kQuantModeDCT4: {
+      for (size_t c = 0; c < 3; c++) {
+        for (size_t i = 0; i < 2; i++) {
+          JXL_RETURN_IF_ERROR(
+              F16Coder::Write(encoding.dct4multipliers[c][i], writer));
+        }
+      }
+      JXL_RETURN_IF_ERROR(EncodeDctParams(encoding.dct_params, writer));
+      break;
+    }
+    case QuantEncoding::kQuantModeDCT: {
+      JXL_RETURN_IF_ERROR(EncodeDctParams(encoding.dct_params, writer));
+      break;
+    }
+    case QuantEncoding::kQuantModeRAW: {
+      ModularFrameEncoder::EncodeQuantTable(size_x, size_y, writer, encoding,
+                                            idx, modular_frame_encoder);
+      break;
+    }
+    case QuantEncoding::kQuantModeAFV: {
+      for (size_t c = 0; c < 3; c++) {
+        for (size_t i = 0; i < 9; i++) {
+          JXL_RETURN_IF_ERROR(F16Coder::Write(
+              encoding.afv_weights[c][i] * (i < 6 ? 1.0f / 64 : 1.0f), writer));
+        }
+      }
+      JXL_RETURN_IF_ERROR(EncodeDctParams(encoding.dct_params, writer));
+      JXL_RETURN_IF_ERROR(EncodeDctParams(encoding.dct_params_afv_4x4, writer));
+      break;
+    }
+  }
+  return true;
+}
+
+}  // namespace
+
+Status DequantMatricesEncode(const DequantMatrices* matrices, BitWriter* writer,
+                             size_t layer, AuxOut* aux_out,
+                             ModularFrameEncoder* modular_frame_encoder) {
+  bool all_default = true;
+  const std::vector<QuantEncoding>& encodings = matrices->encodings();
+
+  for (size_t i = 0; i < encodings.size(); i++) {
+    if (encodings[i].mode != QuantEncoding::kQuantModeLibrary ||
+        encodings[i].predefined != 0) {
+      all_default = false;
+    }
+  }
+  // TODO(janwas): better bound
+  BitWriter::Allotment allotment(writer, 512 * 1024);
+  writer->Write(1, all_default);
+  if (!all_default) {
+    for (size_t i = 0; i < encodings.size(); i++) {
+      JXL_RETURN_IF_ERROR(EncodeQuant(
+          encodings[i], i, DequantMatrices::required_size_x[i],
+          DequantMatrices::required_size_y[i], writer, modular_frame_encoder));
+    }
+  }
+  allotment.ReclaimAndCharge(writer, layer, aux_out);
+  return true;
+}
+
+Status DequantMatricesEncodeDC(const DequantMatrices* matrices,
+                               BitWriter* writer, size_t layer,
+                               AuxOut* aux_out) {
+  bool all_default = true;
+  const float* dc_quant = matrices->DCQuants();
+  for (size_t c = 0; c < 3; c++) {
+    if (dc_quant[c] != kDCQuant[c]) {
+      all_default = false;
+    }
+  }
+  BitWriter::Allotment allotment(writer, 1 + sizeof(float) * kBitsPerByte * 3);
+  writer->Write(1, all_default);
+  if (!all_default) {
+    for (size_t c = 0; c < 3; c++) {
+      JXL_RETURN_IF_ERROR(F16Coder::Write(dc_quant[c] * 128.0f, writer));
+    }
+  }
+  allotment.ReclaimAndCharge(writer, layer, aux_out);
+  return true;
+}
+
+void DequantMatricesSetCustomDC(DequantMatrices* matrices, const float* dc) {
+  matrices->SetDCQuant(dc);
+  // Roundtrip encode/decode DC to ensure same values as decoder.
+  BitWriter writer;
+  JXL_CHECK(DequantMatricesEncodeDC(matrices, &writer, 0, nullptr));
+  writer.ZeroPadToByte();
+  BitReader br(writer.GetSpan());
+  // Called only in the encoder: should fail only for programmer errors.
+  JXL_CHECK(matrices->DecodeDC(&br));
+  JXL_CHECK(br.Close());
+}
+
+void DequantMatricesScaleDC(DequantMatrices* matrices, const float scale) {
+  float dc[3];
+  for (size_t c = 0; c < 3; ++c) {
+    dc[c] = matrices->InvDCQuant(c) * (1.0f / scale);
+  }
+  DequantMatricesSetCustomDC(matrices, dc);
+}
+
+void DequantMatricesRoundtrip(DequantMatrices* matrices) {
+  // Do not pass modular en/decoder, as they only change entropy and not
+  // values.
+  BitWriter writer;
+  JXL_CHECK(DequantMatricesEncode(matrices, &writer, 0, nullptr));
+  writer.ZeroPadToByte();
+  BitReader br(writer.GetSpan());
+  // Called only in the encoder: should fail only for programmer errors.
+  JXL_CHECK(matrices->Decode(&br));
+  JXL_CHECK(br.Close());
+}
+
+void DequantMatricesSetCustom(DequantMatrices* matrices,
+                              const std::vector<QuantEncoding>& encodings,
+                              ModularFrameEncoder* encoder) {
+  JXL_ASSERT(encodings.size() == DequantMatrices::kNum);
+  matrices->SetEncodings(encodings);
+  for (size_t i = 0; i < encodings.size(); i++) {
+    if (encodings[i].mode == QuantEncodingInternal::kQuantModeRAW) {
+      encoder->AddQuantTable(DequantMatrices::required_size_x[i] * kBlockDim,
+                             DequantMatrices::required_size_y[i] * kBlockDim,
+                             encodings[i], i);
+    }
+  }
+  DequantMatricesRoundtrip(matrices);
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/enc_quant_weights.h b/third_party/jpeg-xl/lib/jxl/enc_quant_weights.h
new file mode 100644
index 0000000000..e0a387fed5
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_quant_weights.h
@@ -0,0 +1,37 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_QUANT_WEIGHTS_H_
+#define LIB_JXL_ENC_QUANT_WEIGHTS_H_
+
+#include "lib/jxl/quant_weights.h"
+
+namespace jxl {
+
+struct AuxOut;
+struct BitWriter;
+
+Status DequantMatricesEncode(
+    const DequantMatrices* matrices, BitWriter* writer, size_t layer,
+    AuxOut* aux_out, ModularFrameEncoder* modular_frame_encoder = nullptr);
+Status DequantMatricesEncodeDC(const DequantMatrices* matrices,
+                               BitWriter* writer, size_t layer,
+                               AuxOut* aux_out);
+// For consistency with QuantEncoding, higher values correspond to more
+// precision.
+void DequantMatricesSetCustomDC(DequantMatrices* matrices, const float* dc);
+
+void DequantMatricesScaleDC(DequantMatrices* matrices, float scale);
+
+void DequantMatricesSetCustom(DequantMatrices* matrices,
+                              const std::vector<QuantEncoding>& encodings,
+                              ModularFrameEncoder* encoder);
+
+// Roundtrip encode/decode the matrices to ensure same values as decoder.
+void DequantMatricesRoundtrip(DequantMatrices* matrices);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_QUANT_WEIGHTS_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_splines.cc b/third_party/jpeg-xl/lib/jxl/enc_splines.cc
new file mode 100644
index 0000000000..ddcd78a748
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_splines.cc
@@ -0,0 +1,98 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <algorithm>
+
+#include "lib/jxl/ans_params.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/chroma_from_luma.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/dct_scales.h"
+#include "lib/jxl/enc_ans.h"
+#include "lib/jxl/entropy_coder.h"
+#include "lib/jxl/opsin_params.h"
+#include "lib/jxl/splines.h"
+
+namespace jxl {
+
+struct AuxOut;
+
+class QuantizedSplineEncoder {
+ public:
+  // Only call if HasAny().
+  static void Tokenize(const QuantizedSpline& spline,
+                       std::vector<Token>* const tokens) {
+    tokens->emplace_back(kNumControlPointsContext,
+                         spline.control_points_.size());
+    for (const auto& point : spline.control_points_) {
+      tokens->emplace_back(kControlPointsContext, PackSigned(point.first));
+      tokens->emplace_back(kControlPointsContext, PackSigned(point.second));
+    }
+    const auto encode_dct = [tokens](const int dct[32]) {
+      for (int i = 0; i < 32; ++i) {
+        tokens->emplace_back(kDCTContext, PackSigned(dct[i]));
+      }
+    };
+    for (int c = 0; c < 3; ++c) {
+      encode_dct(spline.color_dct_[c]);
+    }
+    encode_dct(spline.sigma_dct_);
+  }
+};
+
+namespace {
+
+void EncodeAllStartingPoints(const std::vector<Spline::Point>& points,
+                             std::vector<Token>* tokens) {
+  int64_t last_x = 0;
+  int64_t last_y = 0;
+  for (size_t i = 0; i < points.size(); i++) {
+    const int64_t x = lroundf(points[i].x);
+    const int64_t y = lroundf(points[i].y);
+    if (i == 0) {
+      tokens->emplace_back(kStartingPositionContext, x);
+      tokens->emplace_back(kStartingPositionContext, y);
+    } else {
+      tokens->emplace_back(kStartingPositionContext, PackSigned(x - last_x));
+      tokens->emplace_back(kStartingPositionContext, PackSigned(y - last_y));
+    }
+    last_x = x;
+    last_y = y;
+  }
+}
+
+}  // namespace
+
+void EncodeSplines(const Splines& splines, BitWriter* writer,
+                   const size_t layer, const HistogramParams& histogram_params,
+                   AuxOut* aux_out) {
+  JXL_ASSERT(splines.HasAny());
+
+  const std::vector<QuantizedSpline>& quantized_splines =
+      splines.QuantizedSplines();
+  std::vector<std::vector<Token>> tokens(1);
+  tokens[0].emplace_back(kNumSplinesContext, quantized_splines.size() - 1);
+  EncodeAllStartingPoints(splines.StartingPoints(), &tokens[0]);
+
+  tokens[0].emplace_back(kQuantizationAdjustmentContext,
+                         PackSigned(splines.GetQuantizationAdjustment()));
+
+  for (const QuantizedSpline& spline : quantized_splines) {
+    QuantizedSplineEncoder::Tokenize(spline, &tokens[0]);
+  }
+
+  EntropyEncodingData codes;
+  std::vector<uint8_t> context_map;
+  BuildAndEncodeHistograms(histogram_params, kNumSplineContexts, tokens, &codes,
+                           &context_map, writer, layer, aux_out);
+  WriteTokens(tokens[0], codes, context_map, writer, layer, aux_out);
+}
+
+Splines FindSplines(const Image3F& opsin) {
+  // TODO: implement spline detection.
+  return {};
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/enc_splines.h b/third_party/jpeg-xl/lib/jxl/enc_splines.h
new file mode 100644
index 0000000000..be700dba75
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_splines.h
@@ -0,0 +1,38 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_SPLINES_H_
+#define LIB_JXL_ENC_SPLINES_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <utility>
+#include <vector>
+
+#include "lib/jxl/ans_params.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/chroma_from_luma.h"
+#include "lib/jxl/dec_ans.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/enc_ans.h"
+#include "lib/jxl/enc_bit_writer.h"
+#include "lib/jxl/entropy_coder.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/splines.h"
+
+namespace jxl {
+
+struct AuxOut;
+
+// Only call if splines.HasAny().
+void EncodeSplines(const Splines& splines, BitWriter* writer, size_t layer,
+                   const HistogramParams& histogram_params, AuxOut* aux_out);
+
+Splines FindSplines(const Image3F& opsin);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_SPLINES_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_toc.cc b/third_party/jpeg-xl/lib/jxl/enc_toc.cc
new file mode 100644
index 0000000000..dc75fdd9ba
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_toc.cc
@@ -0,0 +1,45 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_toc.h"
+
+#include <stdint.h>
+
+#include "lib/jxl/coeff_order.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/enc_aux_out.h"
+#include "lib/jxl/enc_coeff_order.h"
+#include "lib/jxl/field_encodings.h"
+#include "lib/jxl/fields.h"
+#include "lib/jxl/toc.h"
+
+namespace jxl {
+Status WriteGroupOffsets(const std::vector<BitWriter>& group_codes,
+                         const std::vector<coeff_order_t>* permutation,
+                         BitWriter* JXL_RESTRICT writer, AuxOut* aux_out) {
+  BitWriter::Allotment allotment(writer, MaxBits(group_codes.size()));
+  if (permutation && !group_codes.empty()) {
+    // Don't write a permutation at all for an empty group_codes.
+    writer->Write(1, 1);  // permutation
+    JXL_DASSERT(permutation->size() == group_codes.size());
+    EncodePermutation(permutation->data(), /*skip=*/0, permutation->size(),
+                      writer, /* layer= */ 0, aux_out);
+
+  } else {
+    writer->Write(1, 0);  // no permutation
+  }
+  writer->ZeroPadToByte();  // before TOC entries
+
+  for (size_t i = 0; i < group_codes.size(); i++) {
+    JXL_ASSERT(group_codes[i].BitsWritten() % kBitsPerByte == 0);
+    const size_t group_size = group_codes[i].BitsWritten() / kBitsPerByte;
+    JXL_RETURN_IF_ERROR(U32Coder::Write(kTocDist, group_size, writer));
+  }
+  writer->ZeroPadToByte();  // before first group
+  allotment.ReclaimAndCharge(writer, kLayerTOC, aux_out);
+  return true;
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/enc_toc.h b/third_party/jpeg-xl/lib/jxl/enc_toc.h
new file mode 100644
index 0000000000..242b3efccb
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_toc.h
@@ -0,0 +1,31 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_TOC_H_
+#define LIB_JXL_ENC_TOC_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <vector>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/coeff_order_fwd.h"
+#include "lib/jxl/enc_bit_writer.h"
+
+namespace jxl {
+
+struct AuxOut;
+
+// Writes the group offsets. If the permutation vector is nullptr, the identity
+// permutation will be used.
+Status WriteGroupOffsets(const std::vector<BitWriter>& group_codes,
+                         const std::vector<coeff_order_t>* permutation,
+                         BitWriter* JXL_RESTRICT writer, AuxOut* aux_out);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_TOC_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_transforms-inl.h b/third_party/jpeg-xl/lib/jxl/enc_transforms-inl.h
new file mode 100644
index 0000000000..ef6dc2bbd7
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_transforms-inl.h
@@ -0,0 +1,827 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#if defined(LIB_JXL_ENC_TRANSFORMS_INL_H_) == defined(HWY_TARGET_TOGGLE)
+#ifdef LIB_JXL_ENC_TRANSFORMS_INL_H_
+#undef LIB_JXL_ENC_TRANSFORMS_INL_H_
+#else
+#define LIB_JXL_ENC_TRANSFORMS_INL_H_
+#endif
+
+#include <stddef.h>
+
+#include <hwy/highway.h>
+
+#include "lib/jxl/ac_strategy.h"
+#include "lib/jxl/coeff_order_fwd.h"
+#include "lib/jxl/dct-inl.h"
+#include "lib/jxl/dct_scales.h"
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+namespace {
+
+// Inverse of ReinterpretingDCT.
+template <size_t DCT_ROWS, size_t DCT_COLS, size_t LF_ROWS, size_t LF_COLS,
+          size_t ROWS, size_t COLS>
+HWY_INLINE void ReinterpretingIDCT(const float* input,
+                                   const size_t input_stride, float* output,
+                                   const size_t output_stride) {
+  HWY_ALIGN float block[ROWS * COLS] = {};
+  if (ROWS < COLS) {
+    for (size_t y = 0; y < LF_ROWS; y++) {
+      for (size_t x = 0; x < LF_COLS; x++) {
+        block[y * COLS + x] = input[y * input_stride + x] *
+                              DCTTotalResampleScale<DCT_ROWS, ROWS>(y) *
+                              DCTTotalResampleScale<DCT_COLS, COLS>(x);
+      }
+    }
+  } else {
+    for (size_t y = 0; y < LF_COLS; y++) {
+      for (size_t x = 0; x < LF_ROWS; x++) {
+        block[y * ROWS + x] = input[y * input_stride + x] *
+                              DCTTotalResampleScale<DCT_COLS, COLS>(y) *
+                              DCTTotalResampleScale<DCT_ROWS, ROWS>(x);
+      }
+    }
+  }
+
+  // ROWS, COLS <= 8, so we can put scratch space on the stack.
+  HWY_ALIGN float scratch_space[ROWS * COLS];
+  ComputeScaledIDCT<ROWS, COLS>()(block, DCTTo(output, output_stride),
+                                  scratch_space);
+}
+
+template <size_t S>
+void DCT2TopBlock(const float* block, size_t stride, float* out) {
+  static_assert(kBlockDim % S == 0, "S should be a divisor of kBlockDim");
+  static_assert(S % 2 == 0, "S should be even");
+  float temp[kDCTBlockSize];
+  constexpr size_t num_2x2 = S / 2;
+  for (size_t y = 0; y < num_2x2; y++) {
+    for (size_t x = 0; x < num_2x2; x++) {
+      float c00 = block[y * 2 * stride + x * 2];
+      float c01 = block[y * 2 * stride + x * 2 + 1];
+      float c10 = block[(y * 2 + 1) * stride + x * 2];
+      float c11 = block[(y * 2 + 1) * stride + x * 2 + 1];
+      float r00 = c00 + c01 + c10 + c11;
+      float r01 = c00 + c01 - c10 - c11;
+      float r10 = c00 - c01 + c10 - c11;
+      float r11 = c00 - c01 - c10 + c11;
+      r00 *= 0.25f;
+      r01 *= 0.25f;
+      r10 *= 0.25f;
+      r11 *= 0.25f;
+      temp[y * kBlockDim + x] = r00;
+      temp[y * kBlockDim + num_2x2 + x] = r01;
+      temp[(y + num_2x2) * kBlockDim + x] = r10;
+      temp[(y + num_2x2) * kBlockDim + num_2x2 + x] = r11;
+    }
+  }
+  for (size_t y = 0; y < S; y++) {
+    for (size_t x = 0; x < S; x++) {
+      out[y * kBlockDim + x] = temp[y * kBlockDim + x];
+    }
+  }
+}
+
+void AFVDCT4x4(const float* JXL_RESTRICT pixels, float* JXL_RESTRICT coeffs) {
+  HWY_ALIGN static constexpr float k4x4AFVBasisTranspose[16][16] = {
+      {
+          0.2500000000000000,
+          0.8769029297991420f,
+          0.0000000000000000,
+          0.0000000000000000,
+          0.0000000000000000,
+          -0.4105377591765233f,
+          0.0000000000000000,
+          0.0000000000000000,
+          0.0000000000000000,
+          0.0000000000000000,
+          0.0000000000000000,
+          0.0000000000000000,
+          0.0000000000000000,
+          0.0000000000000000,
+          0.0000000000000000,
+          0.0000000000000000,
+      },
+      {
+          0.2500000000000000,
+          0.2206518106944235f,
+          0.0000000000000000,
+          0.0000000000000000,
+          -0.7071067811865474f,
+          0.6235485373547691f,
+          0.0000000000000000,
+          0.0000000000000000,
+          0.0000000000000000,
+          0.0000000000000000,
+          0.0000000000000000,
+          0.0000000000000000,
+          0.0000000000000000,
+          0.0000000000000000,
+          0.0000000000000000,
+          0.0000000000000000,
+      },
+      {
+          0.2500000000000000,
+          -0.1014005039375376f,
+          0.4067007583026075f,
+          -0.2125574805828875f,
+          0.0000000000000000,
+          -0.0643507165794627f,
+          -0.4517556589999482f,
+          -0.3046847507248690f,
+          0.3017929516615495f,
+          0.4082482904638627f,
+          0.1747866975480809f,
+          -0.2110560104933578f,
+          -0.1426608480880726f,
+          -0.1381354035075859f,
+          -0.1743760259965107f,
+          0.1135498731499434f,
+      },
+      {
+          0.2500000000000000,
+          -0.1014005039375375f,
+          0.4444481661973445f,
+          0.3085497062849767f,
+          0.0000000000000000f,
+          -0.0643507165794627f,
+          0.1585450355184006f,
+          0.5112616136591823f,
+          0.2579236279634118f,
+          0.0000000000000000,
+          0.0812611176717539f,
+          0.1856718091610980f,
+          -0.3416446842253372f,
+          0.3302282550303788f,
+          0.0702790691196284f,
+          -0.0741750459581035f,
+      },
+      {
+          0.2500000000000000,
+          0.2206518106944236f,
+          0.0000000000000000,
+          0.0000000000000000,
+          0.7071067811865476f,
+          0.6235485373547694f,
+          0.0000000000000000,
+          0.0000000000000000,
+          0.0000000000000000,
+          0.0000000000000000,
+          0.0000000000000000,
+          0.0000000000000000,
+          0.0000000000000000,
+          0.0000000000000000,
+          0.0000000000000000,
+          0.0000000000000000,
+      },
+      {
+          0.2500000000000000,
+          -0.1014005039375378f,
+          0.0000000000000000,
+          0.4706702258572536f,
+          0.0000000000000000,
+          -0.0643507165794628f,
+          -0.0403851516082220f,
+          0.0000000000000000,
+          0.1627234014286620f,
+          0.0000000000000000,
+          0.0000000000000000,
+          0.0000000000000000,
+          0.7367497537172237f,
+          0.0875511500058708f,
+          -0.2921026642334881f,
+          0.1940289303259434f,
+      },
+      {
+          0.2500000000000000,
+          -0.1014005039375377f,
+          0.1957439937204294f,
+          -0.1621205195722993f,
+          0.0000000000000000,
+          -0.0643507165794628f,
+          0.0074182263792424f,
+          -0.2904801297289980f,
+          0.0952002265347504f,
+          0.0000000000000000,
+          -0.3675398009862027f,
+          0.4921585901373873f,
+          0.2462710772207515f,
+          -0.0794670660590957f,
+          0.3623817333531167f,
+          -0.4351904965232280f,
+      },
+      {
+          0.2500000000000000,
+          -0.1014005039375376f,
+          0.2929100136981264f,
+          0.0000000000000000,
+          0.0000000000000000,
+          -0.0643507165794627f,
+          0.3935103426921017f,
+          -0.0657870154914280f,
+          0.0000000000000000,
+          -0.4082482904638628f,
+          -0.3078822139579090f,
+          -0.3852501370925192f,
+          -0.0857401903551931f,
+          -0.4613374887461511f,
+          0.0000000000000000,
+          0.2191868483885747f,
+      },
+      {
+          0.2500000000000000,
+          -0.1014005039375376f,
+          -0.4067007583026072f,
+          -0.2125574805828705f,
+          0.0000000000000000,
+          -0.0643507165794627f,
+          -0.4517556589999464f,
+          0.3046847507248840f,
+          0.3017929516615503f,
+          -0.4082482904638635f,
+          -0.1747866975480813f,
+          0.2110560104933581f,
+          -0.1426608480880734f,
+          -0.1381354035075829f,
+          -0.1743760259965108f,
+          0.1135498731499426f,
+      },
+      {
+          0.2500000000000000,
+          -0.1014005039375377f,
+          -0.1957439937204287f,
+          -0.1621205195722833f,
+          0.0000000000000000,
+          -0.0643507165794628f,
+          0.0074182263792444f,
+          0.2904801297290076f,
+          0.0952002265347505f,
+          0.0000000000000000,
+          0.3675398009862011f,
+          -0.4921585901373891f,
+          0.2462710772207514f,
+          -0.0794670660591026f,
+          0.3623817333531165f,
+          -0.4351904965232251f,
+      },
+      {
+          0.2500000000000000,
+          -0.1014005039375375f,
+          0.0000000000000000,
+          -0.4706702258572528f,
+          0.0000000000000000,
+          -0.0643507165794627f,
+          0.1107416575309343f,
+          0.0000000000000000,
+          -0.1627234014286617f,
+          0.0000000000000000,
+          0.0000000000000000,
+          0.0000000000000000,
+          0.1488339922711357f,
+          0.4972464710953509f,
+          0.2921026642334879f,
+          0.5550443808910661f,
+      },
+      {
+          0.2500000000000000,
+          -0.1014005039375377f,
+          0.1137907446044809f,
+          -0.1464291867126764f,
+          0.0000000000000000,
+          -0.0643507165794628f,
+          0.0829816309488205f,
+          -0.2388977352334460f,
+          -0.3531238544981630f,
+          -0.4082482904638630f,
+          0.4826689115059883f,
+          0.1741941265991622f,
+          -0.0476868035022925f,
+          0.1253805944856366f,
+          -0.4326608024727445f,
+          -0.2546827712406646f,
+      },
+      {
+          0.2500000000000000,
+          -0.1014005039375377f,
+          -0.4444481661973438f,
+          0.3085497062849487f,
+          0.0000000000000000,
+          -0.0643507165794628f,
+          0.1585450355183970f,
+          -0.5112616136592012f,
+          0.2579236279634129f,
+          0.0000000000000000,
+          -0.0812611176717504f,
+          -0.1856718091610990f,
+          -0.3416446842253373f,
+          0.3302282550303805f,
+          0.0702790691196282f,
+          -0.0741750459581023f,
+      },
+      {
+          0.2500000000000000,
+          -0.1014005039375376f,
+          -0.2929100136981264f,
+          0.0000000000000000,
+          0.0000000000000000,
+          -0.0643507165794627f,
+          0.3935103426921022f,
+          0.0657870154914254f,
+          0.0000000000000000,
+          0.4082482904638634f,
+          0.3078822139579031f,
+          0.3852501370925211f,
+          -0.0857401903551927f,
+          -0.4613374887461554f,
+          0.0000000000000000,
+          0.2191868483885728f,
+      },
+      {
+          0.2500000000000000,
+          -0.1014005039375376f,
+          -0.1137907446044814f,
+          -0.1464291867126654f,
+          0.0000000000000000,
+          -0.0643507165794627f,
+          0.0829816309488214f,
+          0.2388977352334547f,
+          -0.3531238544981624f,
+          0.4082482904638630f,
+          -0.4826689115059858f,
+          -0.1741941265991621f,
+          -0.0476868035022928f,
+          0.1253805944856431f,
+          -0.4326608024727457f,
+          -0.2546827712406641f,
+      },
+      {
+          0.2500000000000000,
+          -0.1014005039375374f,
+          0.0000000000000000,
+          0.4251149611657548f,
+          0.0000000000000000,
+          -0.0643507165794626f,
+          -0.4517556589999480f,
+          0.0000000000000000,
+          -0.6035859033230976f,
+          0.0000000000000000,
+          0.0000000000000000,
+          0.0000000000000000,
+          -0.1426608480880724f,
+          -0.1381354035075845f,
+          0.3487520519930227f,
+          0.1135498731499429f,
+      },
+  };
+
+  const HWY_CAPPED(float, 16) d;
+  for (size_t i = 0; i < 16; i += Lanes(d)) {
+    auto scalar = Zero(d);
+    for (size_t j = 0; j < 16; j++) {
+      auto px = Set(d, pixels[j]);
+      auto basis = Load(d, k4x4AFVBasisTranspose[j] + i);
+      scalar = MulAdd(px, basis, scalar);
+    }
+    Store(scalar, d, coeffs + i);
+  }
+}
+
+// Coefficient layout:
+//  - (even, even) positions hold AFV coefficients
+//  - (odd, even) positions hold DCT4x4 coefficients
+//  - (any, odd) positions hold DCT4x8 coefficients
+template <size_t afv_kind>
+void AFVTransformFromPixels(const float* JXL_RESTRICT pixels,
+                            size_t pixels_stride,
+                            float* JXL_RESTRICT coefficients) {
+  HWY_ALIGN float scratch_space[4 * 8 * 2];
+  size_t afv_x = afv_kind & 1;
+  size_t afv_y = afv_kind / 2;
+  HWY_ALIGN float block[4 * 8];
+  for (size_t iy = 0; iy < 4; iy++) {
+    for (size_t ix = 0; ix < 4; ix++) {
+      block[(afv_y == 1 ? 3 - iy : iy) * 4 + (afv_x == 1 ? 3 - ix : ix)] =
+          pixels[(iy + 4 * afv_y) * pixels_stride + ix + 4 * afv_x];
+    }
+  }
+  // AFV coefficients in (even, even) positions.
+  HWY_ALIGN float coeff[4 * 4];
+  AFVDCT4x4(block, coeff);
+  for (size_t iy = 0; iy < 4; iy++) {
+    for (size_t ix = 0; ix < 4; ix++) {
+      coefficients[iy * 2 * 8 + ix * 2] = coeff[iy * 4 + ix];
+    }
+  }
+  // 4x4 DCT of the block with same y and different x.
+  ComputeScaledDCT<4, 4>()(
+      DCTFrom(pixels + afv_y * 4 * pixels_stride + (afv_x == 1 ? 0 : 4),
+              pixels_stride),
+      block, scratch_space);
+  // ... in (odd, even) positions.
+  for (size_t iy = 0; iy < 4; iy++) {
+    for (size_t ix = 0; ix < 8; ix++) {
+      coefficients[iy * 2 * 8 + ix * 2 + 1] = block[iy * 4 + ix];
+    }
+  }
+  // 4x8 DCT of the other half of the block.
+  ComputeScaledDCT<4, 8>()(
+      DCTFrom(pixels + (afv_y == 1 ? 0 : 4) * pixels_stride, pixels_stride),
+      block, scratch_space);
+  for (size_t iy = 0; iy < 4; iy++) {
+    for (size_t ix = 0; ix < 8; ix++) {
+      coefficients[(1 + iy * 2) * 8 + ix] = block[iy * 8 + ix];
+    }
+  }
+  float block00 = coefficients[0] * 0.25f;
+  float block01 = coefficients[1];
+  float block10 = coefficients[8];
+  coefficients[0] = (block00 + block01 + 2 * block10) * 0.25f;
+  coefficients[1] = (block00 - block01) * 0.5f;
+  coefficients[8] = (block00 + block01 - 2 * block10) * 0.25f;
+}
+
+HWY_MAYBE_UNUSED void TransformFromPixels(const AcStrategy::Type strategy,
+                                          const float* JXL_RESTRICT pixels,
+                                          size_t pixels_stride,
+                                          float* JXL_RESTRICT coefficients,
+                                          float* JXL_RESTRICT scratch_space) {
+  using Type = AcStrategy::Type;
+  switch (strategy) {
+    case Type::IDENTITY: {
+      PROFILER_ZONE("DCT Identity");
+      for (size_t y = 0; y < 2; y++) {
+        for (size_t x = 0; x < 2; x++) {
+          float block_dc = 0;
+          for (size_t iy = 0; iy < 4; iy++) {
+            for (size_t ix = 0; ix < 4; ix++) {
+              block_dc += pixels[(y * 4 + iy) * pixels_stride + x * 4 + ix];
+            }
+          }
+          block_dc *= 1.0f / 16;
+          for (size_t iy = 0; iy < 4; iy++) {
+            for (size_t ix = 0; ix < 4; ix++) {
+              if (ix == 1 && iy == 1) continue;
+              coefficients[(y + iy * 2) * 8 + x + ix * 2] =
+                  pixels[(y * 4 + iy) * pixels_stride + x * 4 + ix] -
+                  pixels[(y * 4 + 1) * pixels_stride + x * 4 + 1];
+            }
+          }
+          coefficients[(y + 2) * 8 + x + 2] = coefficients[y * 8 + x];
+          coefficients[y * 8 + x] = block_dc;
+        }
+      }
+      float block00 = coefficients[0];
+      float block01 = coefficients[1];
+      float block10 = coefficients[8];
+      float block11 = coefficients[9];
+      coefficients[0] = (block00 + block01 + block10 + block11) * 0.25f;
+      coefficients[1] = (block00 + block01 - block10 - block11) * 0.25f;
+      coefficients[8] = (block00 - block01 + block10 - block11) * 0.25f;
+      coefficients[9] = (block00 - block01 - block10 + block11) * 0.25f;
+      break;
+    }
+    case Type::DCT8X4: {
+      PROFILER_ZONE("DCT 8x4");
+      for (size_t x = 0; x < 2; x++) {
+        HWY_ALIGN float block[4 * 8];
+        ComputeScaledDCT<8, 4>()(DCTFrom(pixels + x * 4, pixels_stride), block,
+                                 scratch_space);
+        for (size_t iy = 0; iy < 4; iy++) {
+          for (size_t ix = 0; ix < 8; ix++) {
+            // Store transposed.
+            coefficients[(x + iy * 2) * 8 + ix] = block[iy * 8 + ix];
+          }
+        }
+      }
+      float block0 = coefficients[0];
+      float block1 = coefficients[8];
+      coefficients[0] = (block0 + block1) * 0.5f;
+      coefficients[8] = (block0 - block1) * 0.5f;
+      break;
+    }
+    case Type::DCT4X8: {
+      PROFILER_ZONE("DCT 4x8");
+      for (size_t y = 0; y < 2; y++) {
+        HWY_ALIGN float block[4 * 8];
+        ComputeScaledDCT<4, 8>()(
+            DCTFrom(pixels + y * 4 * pixels_stride, pixels_stride), block,
+            scratch_space);
+        for (size_t iy = 0; iy < 4; iy++) {
+          for (size_t ix = 0; ix < 8; ix++) {
+            coefficients[(y + iy * 2) * 8 + ix] = block[iy * 8 + ix];
+          }
+        }
+      }
+      float block0 = coefficients[0];
+      float block1 = coefficients[8];
+      coefficients[0] = (block0 + block1) * 0.5f;
+      coefficients[8] = (block0 - block1) * 0.5f;
+      break;
+    }
+    case Type::DCT4X4: {
+      PROFILER_ZONE("DCT 4");
+      for (size_t y = 0; y < 2; y++) {
+        for (size_t x = 0; x < 2; x++) {
+          HWY_ALIGN float block[4 * 4];
+          ComputeScaledDCT<4, 4>()(
+              DCTFrom(pixels + y * 4 * pixels_stride + x * 4, pixels_stride),
+              block, scratch_space);
+          for (size_t iy = 0; iy < 4; iy++) {
+            for (size_t ix = 0; ix < 4; ix++) {
+              coefficients[(y + iy * 2) * 8 + x + ix * 2] = block[iy * 4 + ix];
+            }
+          }
+        }
+      }
+      float block00 = coefficients[0];
+      float block01 = coefficients[1];
+      float block10 = coefficients[8];
+      float block11 = coefficients[9];
+      coefficients[0] = (block00 + block01 + block10 + block11) * 0.25f;
+      coefficients[1] = (block00 + block01 - block10 - block11) * 0.25f;
+      coefficients[8] = (block00 - block01 + block10 - block11) * 0.25f;
+      coefficients[9] = (block00 - block01 - block10 + block11) * 0.25f;
+      break;
+    }
+    case Type::DCT2X2: {
+      PROFILER_ZONE("DCT 2");
+      DCT2TopBlock<8>(pixels, pixels_stride, coefficients);
+      DCT2TopBlock<4>(coefficients, kBlockDim, coefficients);
+      DCT2TopBlock<2>(coefficients, kBlockDim, coefficients);
+      break;
+    }
+    case Type::DCT16X16: {
+      PROFILER_ZONE("DCT 16");
+      ComputeScaledDCT<16, 16>()(DCTFrom(pixels, pixels_stride), coefficients,
+                                 scratch_space);
+      break;
+    }
+    case Type::DCT16X8: {
+      PROFILER_ZONE("DCT 16x8");
+      ComputeScaledDCT<16, 8>()(DCTFrom(pixels, pixels_stride), coefficients,
+                                scratch_space);
+      break;
+    }
+    case Type::DCT8X16: {
+      PROFILER_ZONE("DCT 8x16");
+      ComputeScaledDCT<8, 16>()(DCTFrom(pixels, pixels_stride), coefficients,
+                                scratch_space);
+      break;
+    }
+    case Type::DCT32X8: {
+      PROFILER_ZONE("DCT 32x8");
+      ComputeScaledDCT<32, 8>()(DCTFrom(pixels, pixels_stride), coefficients,
+                                scratch_space);
+      break;
+    }
+    case Type::DCT8X32: {
+      PROFILER_ZONE("DCT 8x32");
+      ComputeScaledDCT<8, 32>()(DCTFrom(pixels, pixels_stride), coefficients,
+                                scratch_space);
+      break;
+    }
+    case Type::DCT32X16: {
+      PROFILER_ZONE("DCT 32x16");
+      ComputeScaledDCT<32, 16>()(DCTFrom(pixels, pixels_stride), coefficients,
+                                 scratch_space);
+      break;
+    }
+    case Type::DCT16X32: {
+      PROFILER_ZONE("DCT 16x32");
+      ComputeScaledDCT<16, 32>()(DCTFrom(pixels, pixels_stride), coefficients,
+                                 scratch_space);
+      break;
+    }
+    case Type::DCT32X32: {
+      PROFILER_ZONE("DCT 32");
+      ComputeScaledDCT<32, 32>()(DCTFrom(pixels, pixels_stride), coefficients,
+                                 scratch_space);
+      break;
+    }
+    case Type::DCT: {
+      PROFILER_ZONE("DCT 8");
+      ComputeScaledDCT<8, 8>()(DCTFrom(pixels, pixels_stride), coefficients,
+                               scratch_space);
+      break;
+    }
+    case Type::AFV0: {
+      PROFILER_ZONE("AFV0");
+      AFVTransformFromPixels<0>(pixels, pixels_stride, coefficients);
+      break;
+    }
+    case Type::AFV1: {
+      PROFILER_ZONE("AFV1");
+      AFVTransformFromPixels<1>(pixels, pixels_stride, coefficients);
+      break;
+    }
+    case Type::AFV2: {
+      PROFILER_ZONE("AFV2");
+      AFVTransformFromPixels<2>(pixels, pixels_stride, coefficients);
+      break;
+    }
+    case Type::AFV3: {
+      PROFILER_ZONE("AFV3");
+      AFVTransformFromPixels<3>(pixels, pixels_stride, coefficients);
+      break;
+    }
+    case Type::DCT64X64: {
+      PROFILER_ZONE("DCT 64x64");
+      ComputeScaledDCT<64, 64>()(DCTFrom(pixels, pixels_stride), coefficients,
+                                 scratch_space);
+      break;
+    }
+    case Type::DCT64X32: {
+      PROFILER_ZONE("DCT 64x32");
+      ComputeScaledDCT<64, 32>()(DCTFrom(pixels, pixels_stride), coefficients,
+                                 scratch_space);
+      break;
+    }
+    case Type::DCT32X64: {
+      PROFILER_ZONE("DCT 32x64");
+      ComputeScaledDCT<32, 64>()(DCTFrom(pixels, pixels_stride), coefficients,
+                                 scratch_space);
+      break;
+    }
+    case Type::DCT128X128: {
+      PROFILER_ZONE("DCT 128x128");
+      ComputeScaledDCT<128, 128>()(DCTFrom(pixels, pixels_stride), coefficients,
+                                   scratch_space);
+      break;
+    }
+    case Type::DCT128X64: {
+      PROFILER_ZONE("DCT 128x64");
+      ComputeScaledDCT<128, 64>()(DCTFrom(pixels, pixels_stride), coefficients,
+                                  scratch_space);
+      break;
+    }
+    case Type::DCT64X128: {
+      PROFILER_ZONE("DCT 64x128");
+      ComputeScaledDCT<64, 128>()(DCTFrom(pixels, pixels_stride), coefficients,
+                                  scratch_space);
+      break;
+    }
+    case Type::DCT256X256: {
+      PROFILER_ZONE("DCT 256x256");
+      ComputeScaledDCT<256, 256>()(DCTFrom(pixels, pixels_stride), coefficients,
+                                   scratch_space);
+      break;
+    }
+    case Type::DCT256X128: {
+      PROFILER_ZONE("DCT 256x128");
+      ComputeScaledDCT<256, 128>()(DCTFrom(pixels, pixels_stride), coefficients,
+                                   scratch_space);
+      break;
+    }
+    case Type::DCT128X256: {
+      PROFILER_ZONE("DCT 128x256");
+      ComputeScaledDCT<128, 256>()(DCTFrom(pixels, pixels_stride), coefficients,
+                                   scratch_space);
+      break;
+    }
+    case Type::kNumValidStrategies:
+      JXL_ABORT("Invalid strategy");
+  }
+}
+
+HWY_MAYBE_UNUSED void DCFromLowestFrequencies(const AcStrategy::Type strategy,
+                                              const float* block, float* dc,
+                                              size_t dc_stride) {
+  using Type = AcStrategy::Type;
+  switch (strategy) {
+    case Type::DCT16X8: {
+      ReinterpretingIDCT</*DCT_ROWS=*/2 * kBlockDim, /*DCT_COLS=*/kBlockDim,
+                         /*LF_ROWS=*/2, /*LF_COLS=*/1, /*ROWS=*/2, /*COLS=*/1>(
+          block, 2 * kBlockDim, dc, dc_stride);
+      break;
+    }
+    case Type::DCT8X16: {
+      ReinterpretingIDCT</*DCT_ROWS=*/kBlockDim, /*DCT_COLS=*/2 * kBlockDim,
+                         /*LF_ROWS=*/1, /*LF_COLS=*/2, /*ROWS=*/1, /*COLS=*/2>(
+          block, 2 * kBlockDim, dc, dc_stride);
+      break;
+    }
+    case Type::DCT16X16: {
+      ReinterpretingIDCT</*DCT_ROWS=*/2 * kBlockDim, /*DCT_COLS=*/2 * kBlockDim,
+                         /*LF_ROWS=*/2, /*LF_COLS=*/2, /*ROWS=*/2, /*COLS=*/2>(
+          block, 2 * kBlockDim, dc, dc_stride);
+      break;
+    }
+    case Type::DCT32X8: {
+      ReinterpretingIDCT</*DCT_ROWS=*/4 * kBlockDim, /*DCT_COLS=*/kBlockDim,
+                         /*LF_ROWS=*/4, /*LF_COLS=*/1, /*ROWS=*/4, /*COLS=*/1>(
+          block, 4 * kBlockDim, dc, dc_stride);
+      break;
+    }
+    case Type::DCT8X32: {
+      ReinterpretingIDCT</*DCT_ROWS=*/kBlockDim, /*DCT_COLS=*/4 * kBlockDim,
+                         /*LF_ROWS=*/1, /*LF_COLS=*/4, /*ROWS=*/1, /*COLS=*/4>(
+          block, 4 * kBlockDim, dc, dc_stride);
+      break;
+    }
+    case Type::DCT32X16: {
+      ReinterpretingIDCT</*DCT_ROWS=*/4 * kBlockDim, /*DCT_COLS=*/2 * kBlockDim,
+                         /*LF_ROWS=*/4, /*LF_COLS=*/2, /*ROWS=*/4, /*COLS=*/2>(
+          block, 4 * kBlockDim, dc, dc_stride);
+      break;
+    }
+    case Type::DCT16X32: {
+      ReinterpretingIDCT</*DCT_ROWS=*/2 * kBlockDim, /*DCT_COLS=*/4 * kBlockDim,
+                         /*LF_ROWS=*/2, /*LF_COLS=*/4, /*ROWS=*/2, /*COLS=*/4>(
+          block, 4 * kBlockDim, dc, dc_stride);
+      break;
+    }
+    case Type::DCT32X32: {
+      ReinterpretingIDCT</*DCT_ROWS=*/4 * kBlockDim, /*DCT_COLS=*/4 * kBlockDim,
+                         /*LF_ROWS=*/4, /*LF_COLS=*/4, /*ROWS=*/4, /*COLS=*/4>(
+          block, 4 * kBlockDim, dc, dc_stride);
+      break;
+    }
+    case Type::DCT64X32: {
+      ReinterpretingIDCT</*DCT_ROWS=*/8 * kBlockDim, /*DCT_COLS=*/4 * kBlockDim,
+                         /*LF_ROWS=*/8, /*LF_COLS=*/4, /*ROWS=*/8, /*COLS=*/4>(
+          block, 8 * kBlockDim, dc, dc_stride);
+      break;
+    }
+    case Type::DCT32X64: {
+      ReinterpretingIDCT</*DCT_ROWS=*/4 * kBlockDim, /*DCT_COLS=*/8 * kBlockDim,
+                         /*LF_ROWS=*/4, /*LF_COLS=*/8, /*ROWS=*/4, /*COLS=*/8>(
+          block, 8 * kBlockDim, dc, dc_stride);
+      break;
+    }
+    case Type::DCT64X64: {
+      ReinterpretingIDCT</*DCT_ROWS=*/8 * kBlockDim, /*DCT_COLS=*/8 * kBlockDim,
+                         /*LF_ROWS=*/8, /*LF_COLS=*/8, /*ROWS=*/8, /*COLS=*/8>(
+          block, 8 * kBlockDim, dc, dc_stride);
+      break;
+    }
+    case Type::DCT128X64: {
+      ReinterpretingIDCT<
+          /*DCT_ROWS=*/16 * kBlockDim, /*DCT_COLS=*/8 * kBlockDim,
+          /*LF_ROWS=*/16, /*LF_COLS=*/8, /*ROWS=*/16, /*COLS=*/8>(
+          block, 16 * kBlockDim, dc, dc_stride);
+      break;
+    }
+    case Type::DCT64X128: {
+      ReinterpretingIDCT<
+          /*DCT_ROWS=*/8 * kBlockDim, /*DCT_COLS=*/16 * kBlockDim,
+          /*LF_ROWS=*/8, /*LF_COLS=*/16, /*ROWS=*/8, /*COLS=*/16>(
+          block, 16 * kBlockDim, dc, dc_stride);
+      break;
+    }
+    case Type::DCT128X128: {
+      ReinterpretingIDCT<
+          /*DCT_ROWS=*/16 * kBlockDim, /*DCT_COLS=*/16 * kBlockDim,
+          /*LF_ROWS=*/16, /*LF_COLS=*/16, /*ROWS=*/16, /*COLS=*/16>(
+          block, 16 * kBlockDim, dc, dc_stride);
+      break;
+    }
+    case Type::DCT256X128: {
+      ReinterpretingIDCT<
+          /*DCT_ROWS=*/32 * kBlockDim, /*DCT_COLS=*/16 * kBlockDim,
+          /*LF_ROWS=*/32, /*LF_COLS=*/16, /*ROWS=*/32, /*COLS=*/16>(
+          block, 32 * kBlockDim, dc, dc_stride);
+      break;
+    }
+    case Type::DCT128X256: {
+      ReinterpretingIDCT<
+          /*DCT_ROWS=*/16 * kBlockDim, /*DCT_COLS=*/32 * kBlockDim,
+          /*LF_ROWS=*/16, /*LF_COLS=*/32, /*ROWS=*/16, /*COLS=*/32>(
+          block, 32 * kBlockDim, dc, dc_stride);
+      break;
+    }
+    case Type::DCT256X256: {
+      ReinterpretingIDCT<
+          /*DCT_ROWS=*/32 * kBlockDim, /*DCT_COLS=*/32 * kBlockDim,
+          /*LF_ROWS=*/32, /*LF_COLS=*/32, /*ROWS=*/32, /*COLS=*/32>(
+          block, 32 * kBlockDim, dc, dc_stride);
+      break;
+    }
+    case Type::DCT:
+    case Type::DCT2X2:
+    case Type::DCT4X4:
+    case Type::DCT4X8:
+    case Type::DCT8X4:
+    case Type::AFV0:
+    case Type::AFV1:
+    case Type::AFV2:
+    case Type::AFV3:
+    case Type::IDENTITY:
+      dc[0] = block[0];
+      break;
+    case Type::kNumValidStrategies:
+      JXL_ABORT("Invalid strategy");
+  }
+}
+
+}  // namespace
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#endif  // LIB_JXL_ENC_TRANSFORMS_INL_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_transforms.cc b/third_party/jpeg-xl/lib/jxl/enc_transforms.cc
new file mode 100644
index 0000000000..8978ba1dcb
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_transforms.cc
@@ -0,0 +1,41 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_transforms.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/enc_transforms.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/dct_scales.h"
+#include "lib/jxl/enc_transforms-inl.h"
+
+namespace jxl {
+
+#if HWY_ONCE
+HWY_EXPORT(TransformFromPixels);
+void TransformFromPixels(const AcStrategy::Type strategy,
+                         const float* JXL_RESTRICT pixels, size_t pixels_stride,
+                         float* JXL_RESTRICT coefficients,
+                         float* scratch_space) {
+  return HWY_DYNAMIC_DISPATCH(TransformFromPixels)(
+      strategy, pixels, pixels_stride, coefficients, scratch_space);
+}
+
+HWY_EXPORT(DCFromLowestFrequencies);
+void DCFromLowestFrequencies(AcStrategy::Type strategy, const float* block,
+                             float* dc, size_t dc_stride) {
+  return HWY_DYNAMIC_DISPATCH(DCFromLowestFrequencies)(strategy, block, dc,
+                                                       dc_stride);
+}
+
+HWY_EXPORT(AFVDCT4x4);
+void AFVDCT4x4(const float* JXL_RESTRICT pixels, float* JXL_RESTRICT coeffs) {
+  return HWY_DYNAMIC_DISPATCH(AFVDCT4x4)(pixels, coeffs);
+}
+#endif  // HWY_ONCE
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/enc_transforms.h b/third_party/jpeg-xl/lib/jxl/enc_transforms.h
new file mode 100644
index 0000000000..039ccc3893
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_transforms.h
@@ -0,0 +1,32 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_TRANSFORMS_H_
+#define LIB_JXL_ENC_TRANSFORMS_H_
+
+// Facade for (non-inlined) integral transforms.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "lib/jxl/ac_strategy.h"
+#include "lib/jxl/base/compiler_specific.h"
+
+namespace jxl {
+
+void TransformFromPixels(const AcStrategy::Type strategy,
+                         const float* JXL_RESTRICT pixels, size_t pixels_stride,
+                         float* JXL_RESTRICT coefficients,
+                         float* JXL_RESTRICT scratch_space);
+
+// Equivalent of the above for DC image.
+void DCFromLowestFrequencies(AcStrategy::Type strategy, const float* block,
+                             float* dc, size_t dc_stride);
+
+void AFVDCT4x4(const float* JXL_RESTRICT pixels, float* JXL_RESTRICT coeffs);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_TRANSFORMS_H_
diff --git a/third_party/jpeg-xl/lib/jxl/enc_xyb.cc b/third_party/jpeg-xl/lib/jxl/enc_xyb.cc
new file mode 100644
index 0000000000..2ee0abf821
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_xyb.cc
@@ -0,0 +1,520 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_xyb.h"
+
+#include <algorithm>
+#include <atomic>
+#include <cstdlib>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/enc_xyb.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/profiler.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/color_encoding_internal.h"
+#include "lib/jxl/color_management.h"
+#include "lib/jxl/enc_bit_writer.h"
+#include "lib/jxl/enc_color_management.h"
+#include "lib/jxl/enc_image_bundle.h"
+#include "lib/jxl/fast_math-inl.h"
+#include "lib/jxl/fields.h"
+#include "lib/jxl/image_bundle.h"
+#include "lib/jxl/image_ops.h"
+#include "lib/jxl/opsin_params.h"
+#include "lib/jxl/transfer_functions-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Add;
+using hwy::HWY_NAMESPACE::Mul;
+using hwy::HWY_NAMESPACE::MulAdd;
+using hwy::HWY_NAMESPACE::Sub;
+using hwy::HWY_NAMESPACE::ZeroIfNegative;
+
+// 4x3 matrix * 3x1 SIMD vectors
+template <class V>
+JXL_INLINE void OpsinAbsorbance(const V r, const V g, const V b,
+                                const float* JXL_RESTRICT premul_absorb,
+                                V* JXL_RESTRICT mixed0, V* JXL_RESTRICT mixed1,
+                                V* JXL_RESTRICT mixed2) {
+  const float* bias = &kOpsinAbsorbanceBias[0];
+  const HWY_FULL(float) d;
+  const size_t N = Lanes(d);
+  const auto m0 = Load(d, premul_absorb + 0 * N);
+  const auto m1 = Load(d, premul_absorb + 1 * N);
+  const auto m2 = Load(d, premul_absorb + 2 * N);
+  const auto m3 = Load(d, premul_absorb + 3 * N);
+  const auto m4 = Load(d, premul_absorb + 4 * N);
+  const auto m5 = Load(d, premul_absorb + 5 * N);
+  const auto m6 = Load(d, premul_absorb + 6 * N);
+  const auto m7 = Load(d, premul_absorb + 7 * N);
+  const auto m8 = Load(d, premul_absorb + 8 * N);
+  *mixed0 = MulAdd(m0, r, MulAdd(m1, g, MulAdd(m2, b, Set(d, bias[0]))));
+  *mixed1 = MulAdd(m3, r, MulAdd(m4, g, MulAdd(m5, b, Set(d, bias[1]))));
+  *mixed2 = MulAdd(m6, r, MulAdd(m7, g, MulAdd(m8, b, Set(d, bias[2]))));
+}
+
+template <class V>
+void StoreXYB(const V r, V g, const V b, float* JXL_RESTRICT valx,
+              float* JXL_RESTRICT valy, float* JXL_RESTRICT valz) {
+  const HWY_FULL(float) d;
+  const V half = Set(d, 0.5f);
+  Store(Mul(half, Sub(r, g)), d, valx);
+  Store(Mul(half, Add(r, g)), d, valy);
+  Store(b, d, valz);
+}
+
+// Converts one RGB vector to XYB.
+template <class V>
+void LinearRGBToXYB(const V r, const V g, const V b,
+                    const float* JXL_RESTRICT premul_absorb,
+                    float* JXL_RESTRICT valx, float* JXL_RESTRICT valy,
+                    float* JXL_RESTRICT valz) {
+  V mixed0, mixed1, mixed2;
+  OpsinAbsorbance(r, g, b, premul_absorb, &mixed0, &mixed1, &mixed2);
+
+  // mixed* should be non-negative even for wide-gamut, so clamp to zero.
+  mixed0 = ZeroIfNegative(mixed0);
+  mixed1 = ZeroIfNegative(mixed1);
+  mixed2 = ZeroIfNegative(mixed2);
+
+  const HWY_FULL(float) d;
+  const size_t N = Lanes(d);
+  mixed0 = CubeRootAndAdd(mixed0, Load(d, premul_absorb + 9 * N));
+  mixed1 = CubeRootAndAdd(mixed1, Load(d, premul_absorb + 10 * N));
+  mixed2 = CubeRootAndAdd(mixed2, Load(d, premul_absorb + 11 * N));
+  StoreXYB(mixed0, mixed1, mixed2, valx, valy, valz);
+
+  // For wide-gamut inputs, r/g/b and valx (but not y/z) are often negative.
+}
+
+void LinearRGBRowToXYB(float* JXL_RESTRICT row0, float* JXL_RESTRICT row1,
+                       float* JXL_RESTRICT row2,
+                       const float* JXL_RESTRICT premul_absorb, size_t xsize) {
+  const HWY_FULL(float) d;
+  for (size_t x = 0; x < xsize; x += Lanes(d)) {
+    const auto r = Load(d, row0 + x);
+    const auto g = Load(d, row1 + x);
+    const auto b = Load(d, row2 + x);
+    LinearRGBToXYB(r, g, b, premul_absorb, row0 + x, row1 + x, row2 + x);
+  }
+}
+
+// Input/output uses the codec.h scaling: nominally 0-1 if in-gamut.
+template <class V>
+V LinearFromSRGB(V encoded) {
+  return TF_SRGB().DisplayFromEncoded(encoded);
+}
+
+Status LinearSRGBToXYB(const Image3F& linear,
+                       const float* JXL_RESTRICT premul_absorb,
+                       ThreadPool* pool, Image3F* JXL_RESTRICT xyb) {
+  const size_t xsize = linear.xsize();
+
+  const HWY_FULL(float) d;
+  return RunOnPool(
+      pool, 0, static_cast<uint32_t>(linear.ysize()), ThreadPool::NoInit,
+      [&](const uint32_t task, size_t /*thread*/) {
+        const size_t y = static_cast<size_t>(task);
+        const float* JXL_RESTRICT row_in0 = linear.ConstPlaneRow(0, y);
+        const float* JXL_RESTRICT row_in1 = linear.ConstPlaneRow(1, y);
+        const float* JXL_RESTRICT row_in2 = linear.ConstPlaneRow(2, y);
+        float* JXL_RESTRICT row_xyb0 = xyb->PlaneRow(0, y);
+        float* JXL_RESTRICT row_xyb1 = xyb->PlaneRow(1, y);
+        float* JXL_RESTRICT row_xyb2 = xyb->PlaneRow(2, y);
+
+        for (size_t x = 0; x < xsize; x += Lanes(d)) {
+          const auto in_r = Load(d, row_in0 + x);
+          const auto in_g = Load(d, row_in1 + x);
+          const auto in_b = Load(d, row_in2 + x);
+          LinearRGBToXYB(in_r, in_g, in_b, premul_absorb, row_xyb0 + x,
+                         row_xyb1 + x, row_xyb2 + x);
+        }
+      },
+      "LinearToXYB");
+}
+
+Status SRGBToXYB(const Image3F& srgb, const float* JXL_RESTRICT premul_absorb,
+                 ThreadPool* pool, Image3F* JXL_RESTRICT xyb) {
+  const size_t xsize = srgb.xsize();
+
+  const HWY_FULL(float) d;
+  return RunOnPool(
+      pool, 0, static_cast<uint32_t>(srgb.ysize()), ThreadPool::NoInit,
+      [&](const uint32_t task, size_t /*thread*/) {
+        const size_t y = static_cast<size_t>(task);
+        const float* JXL_RESTRICT row_srgb0 = srgb.ConstPlaneRow(0, y);
+        const float* JXL_RESTRICT row_srgb1 = srgb.ConstPlaneRow(1, y);
+        const float* JXL_RESTRICT row_srgb2 = srgb.ConstPlaneRow(2, y);
+        float* JXL_RESTRICT row_xyb0 = xyb->PlaneRow(0, y);
+        float* JXL_RESTRICT row_xyb1 = xyb->PlaneRow(1, y);
+        float* JXL_RESTRICT row_xyb2 = xyb->PlaneRow(2, y);
+
+        for (size_t x = 0; x < xsize; x += Lanes(d)) {
+          const auto in_r = LinearFromSRGB(Load(d, row_srgb0 + x));
+          const auto in_g = LinearFromSRGB(Load(d, row_srgb1 + x));
+          const auto in_b = LinearFromSRGB(Load(d, row_srgb2 + x));
+          LinearRGBToXYB(in_r, in_g, in_b, premul_absorb, row_xyb0 + x,
+                         row_xyb1 + x, row_xyb2 + x);
+        }
+      },
+      "SRGBToXYB");
+}
+
+Status SRGBToXYBAndLinear(const Image3F& srgb,
+                          const float* JXL_RESTRICT premul_absorb,
+                          ThreadPool* pool, Image3F* JXL_RESTRICT xyb,
+                          Image3F* JXL_RESTRICT linear) {
+  const size_t xsize = srgb.xsize();
+
+  const HWY_FULL(float) d;
+  return RunOnPool(
+      pool, 0, static_cast<uint32_t>(srgb.ysize()), ThreadPool::NoInit,
+      [&](const uint32_t task, size_t /*thread*/) {
+        const size_t y = static_cast<size_t>(task);
+        const float* JXL_RESTRICT row_srgb0 = srgb.ConstPlaneRow(0, y);
+        const float* JXL_RESTRICT row_srgb1 = srgb.ConstPlaneRow(1, y);
+        const float* JXL_RESTRICT row_srgb2 = srgb.ConstPlaneRow(2, y);
+
+        float* JXL_RESTRICT row_linear0 = linear->PlaneRow(0, y);
+        float* JXL_RESTRICT row_linear1 = linear->PlaneRow(1, y);
+        float* JXL_RESTRICT row_linear2 = linear->PlaneRow(2, y);
+
+        float* JXL_RESTRICT row_xyb0 = xyb->PlaneRow(0, y);
+        float* JXL_RESTRICT row_xyb1 = xyb->PlaneRow(1, y);
+        float* JXL_RESTRICT row_xyb2 = xyb->PlaneRow(2, y);
+
+        for (size_t x = 0; x < xsize; x += Lanes(d)) {
+          const auto in_r = LinearFromSRGB(Load(d, row_srgb0 + x));
+          const auto in_g = LinearFromSRGB(Load(d, row_srgb1 + x));
+          const auto in_b = LinearFromSRGB(Load(d, row_srgb2 + x));
+
+          Store(in_r, d, row_linear0 + x);
+          Store(in_g, d, row_linear1 + x);
+          Store(in_b, d, row_linear2 + x);
+
+          LinearRGBToXYB(in_r, in_g, in_b, premul_absorb, row_xyb0 + x,
+                         row_xyb1 + x, row_xyb2 + x);
+        }
+      },
+      "SRGBToXYBAndLinear");
+}
+
+void ComputePremulAbsorb(float intensity_target, float* premul_absorb) {
+  const HWY_FULL(float) d;
+  const size_t N = Lanes(d);
+  const float mul = intensity_target / 255.0f;
+  for (size_t i = 0; i < 9; ++i) {
+    const auto absorb = Set(d, kOpsinAbsorbanceMatrix[i] * mul);
+    Store(absorb, d, premul_absorb + i * N);
+  }
+  for (size_t i = 0; i < 3; ++i) {
+    const auto neg_bias_cbrt = Set(d, -cbrtf(kOpsinAbsorbanceBias[i]));
+    Store(neg_bias_cbrt, d, premul_absorb + (9 + i) * N);
+  }
+}
+
+Image3F TransformToLinearRGB(const Image3F& in,
+                             const ColorEncoding& color_encoding,
+                             float intensity_target, const JxlCmsInterface& cms,
+                             ThreadPool* pool) {
+  ColorSpaceTransform c_transform(cms);
+  bool is_gray = color_encoding.IsGray();
+  const ColorEncoding& c_desired = ColorEncoding::LinearSRGB(is_gray);
+  Image3F out(in.xsize(), in.ysize());
+  std::atomic<bool> ok{true};
+  JXL_CHECK(RunOnPool(
+      pool, 0, in.ysize(),
+      [&](const size_t num_threads) {
+        return c_transform.Init(color_encoding, c_desired, intensity_target,
+                                in.xsize(), num_threads);
+      },
+      [&](const uint32_t y, const size_t thread) {
+        float* mutable_src_buf = c_transform.BufSrc(thread);
+        const float* src_buf = mutable_src_buf;
+        // Interleave input.
+        if (is_gray) {
+          src_buf = in.ConstPlaneRow(0, y);
+        } else {
+          const float* JXL_RESTRICT row_in0 = in.ConstPlaneRow(0, y);
+          const float* JXL_RESTRICT row_in1 = in.ConstPlaneRow(1, y);
+          const float* JXL_RESTRICT row_in2 = in.ConstPlaneRow(2, y);
+          for (size_t x = 0; x < in.xsize(); x++) {
+            mutable_src_buf[3 * x + 0] = row_in0[x];
+            mutable_src_buf[3 * x + 1] = row_in1[x];
+            mutable_src_buf[3 * x + 2] = row_in2[x];
+          }
+        }
+        float* JXL_RESTRICT dst_buf = c_transform.BufDst(thread);
+        if (!c_transform.Run(thread, src_buf, dst_buf)) {
+          ok.store(false);
+          return;
+        }
+        float* JXL_RESTRICT row_out0 = out.PlaneRow(0, y);
+        float* JXL_RESTRICT row_out1 = out.PlaneRow(1, y);
+        float* JXL_RESTRICT row_out2 = out.PlaneRow(2, y);
+        // De-interleave output and convert type.
+        if (is_gray) {
+          for (size_t x = 0; x < in.xsize(); x++) {
+            row_out0[x] = dst_buf[x];
+            row_out1[x] = dst_buf[x];
+            row_out2[x] = dst_buf[x];
+          }
+        } else {
+          for (size_t x = 0; x < in.xsize(); x++) {
+            row_out0[x] = dst_buf[3 * x + 0];
+            row_out1[x] = dst_buf[3 * x + 1];
+            row_out2[x] = dst_buf[3 * x + 2];
+          }
+        }
+      },
+      "Colorspace transform"));
+  JXL_CHECK(ok.load());
+  return out;
+}
+
+void Image3FToXYB(const Image3F& in, const ColorEncoding& color_encoding,
+                  float intensity_target, ThreadPool* pool,
+                  Image3F* JXL_RESTRICT xyb, const JxlCmsInterface& cms) {
+  JXL_ASSERT(SameSize(in, *xyb));
+
+  const HWY_FULL(float) d;
+  // Pre-broadcasted constants
+  HWY_ALIGN float premul_absorb[MaxLanes(d) * 12];
+  ComputePremulAbsorb(intensity_target, premul_absorb);
+
+  bool is_gray = color_encoding.IsGray();
+  const ColorEncoding& c_linear_srgb = ColorEncoding::LinearSRGB(is_gray);
+  if (c_linear_srgb.SameColorEncoding(color_encoding)) {
+    JXL_CHECK(LinearSRGBToXYB(in, premul_absorb, pool, xyb));
+  } else if (color_encoding.IsSRGB()) {
+    JXL_CHECK(SRGBToXYB(in, premul_absorb, pool, xyb));
+  } else {
+    Image3F linear =
+        TransformToLinearRGB(in, color_encoding, intensity_target, cms, pool);
+    JXL_CHECK(LinearSRGBToXYB(linear, premul_absorb, pool, xyb));
+  }
+}
+
+// This is different from Butteraugli's OpsinDynamicsImage() in the sense that
+// it does not contain a sensitivity multiplier based on the blurred image.
+const ImageBundle* ToXYB(const ImageBundle& in, ThreadPool* pool,
+                         Image3F* JXL_RESTRICT xyb, const JxlCmsInterface& cms,
+                         ImageBundle* const JXL_RESTRICT linear) {
+  PROFILER_FUNC;
+
+  const size_t xsize = in.xsize();
+  const size_t ysize = in.ysize();
+  JXL_ASSERT(SameSize(in, *xyb));
+
+  const HWY_FULL(float) d;
+  // Pre-broadcasted constants
+  HWY_ALIGN float premul_absorb[MaxLanes(d) * 12];
+  ComputePremulAbsorb(in.metadata()->IntensityTarget(), premul_absorb);
+
+  const bool want_linear = linear != nullptr;
+
+  const ColorEncoding& c_linear_srgb = ColorEncoding::LinearSRGB(in.IsGray());
+  // Linear sRGB inputs are rare but can be useful for the fastest encoders, for
+  // which undoing the sRGB transfer function would be a large part of the cost.
+  if (c_linear_srgb.SameColorEncoding(in.c_current())) {
+    JXL_CHECK(LinearSRGBToXYB(in.color(), premul_absorb, pool, xyb));
+    // This only happens if kitten or slower, moving ImageBundle might be
+    // possible but the encoder is much slower than this copy.
+    if (want_linear) {
+      *linear = in.Copy();
+      return linear;
+    }
+    return &in;
+  }
+
+  // Common case: already sRGB, can avoid the color transform
+  if (in.IsSRGB()) {
+    // Common case: can avoid allocating/copying
+    if (!want_linear) {
+      JXL_CHECK(SRGBToXYB(in.color(), premul_absorb, pool, xyb));
+      return &in;
+    }
+
+    // Slow encoder also wants linear sRGB.
+    linear->SetFromImage(Image3F(xsize, ysize), c_linear_srgb);
+    JXL_CHECK(SRGBToXYBAndLinear(in.color(), premul_absorb, pool, xyb,
+                                 linear->color()));
+    return linear;
+  }
+
+  // General case: not sRGB, need color transform.
+  ImageBundle linear_storage;  // Local storage only used if !want_linear.
+
+  ImageBundle* linear_storage_ptr;
+  if (want_linear) {
+    // Caller asked for linear, use that storage directly.
+    linear_storage_ptr = linear;
+  } else {
+    // Caller didn't ask for linear, create our own local storage
+    // OK to reuse metadata, it will not be changed.
+    linear_storage = ImageBundle(const_cast<ImageMetadata*>(in.metadata()));
+    linear_storage_ptr = &linear_storage;
+  }
+
+  const ImageBundle* ptr;
+  JXL_CHECK(TransformIfNeeded(in, c_linear_srgb, cms, pool, linear_storage_ptr,
+                              &ptr));
+  // If no transform was necessary, should have taken the above codepath.
+  JXL_ASSERT(ptr == linear_storage_ptr);
+
+  JXL_CHECK(
+      LinearSRGBToXYB(*linear_storage_ptr->color(), premul_absorb, pool, xyb));
+  return want_linear ? linear : &in;
+}
+
+// Transform RGB to YCbCr.
+// Could be performed in-place (i.e. Y, Cb and Cr could alias R, B and B).
+Status RgbToYcbcr(const ImageF& r_plane, const ImageF& g_plane,
+                  const ImageF& b_plane, ImageF* y_plane, ImageF* cb_plane,
+                  ImageF* cr_plane, ThreadPool* pool) {
+  const HWY_FULL(float) df;
+  const size_t S = Lanes(df);  // Step.
+
+  const size_t xsize = r_plane.xsize();
+  const size_t ysize = r_plane.ysize();
+  if ((xsize == 0) || (ysize == 0)) return true;
+
+  // Full-range BT.601 as defined by JFIF Clause 7:
+  // https://www.itu.int/rec/T-REC-T.871-201105-I/en
+  const auto k128 = Set(df, 128.0f / 255);
+  const auto kR = Set(df, 0.299f);  // NTSC luma
+  const auto kG = Set(df, 0.587f);
+  const auto kB = Set(df, 0.114f);
+  const auto kAmpR = Set(df, 0.701f);
+  const auto kAmpB = Set(df, 0.886f);
+  const auto kDiffR = Add(kAmpR, kR);
+  const auto kDiffB = Add(kAmpB, kB);
+  const auto kNormR = Div(Set(df, 1.0f), (Add(kAmpR, Add(kG, kB))));
+  const auto kNormB = Div(Set(df, 1.0f), (Add(kR, Add(kG, kAmpB))));
+
+  constexpr size_t kGroupArea = kGroupDim * kGroupDim;
+  const size_t lines_per_group = DivCeil(kGroupArea, xsize);
+  const size_t num_stripes = DivCeil(ysize, lines_per_group);
+  const auto transform = [&](int idx, int /* thread*/) {
+    const size_t y0 = idx * lines_per_group;
+    const size_t y1 = std::min<size_t>(y0 + lines_per_group, ysize);
+    for (size_t y = y0; y < y1; ++y) {
+      const float* r_row = r_plane.ConstRow(y);
+      const float* g_row = g_plane.ConstRow(y);
+      const float* b_row = b_plane.ConstRow(y);
+      float* y_row = y_plane->Row(y);
+      float* cb_row = cb_plane->Row(y);
+      float* cr_row = cr_plane->Row(y);
+      for (size_t x = 0; x < xsize; x += S) {
+        const auto r = Load(df, r_row + x);
+        const auto g = Load(df, g_row + x);
+        const auto b = Load(df, b_row + x);
+        const auto r_base = Mul(r, kR);
+        const auto r_diff = Mul(r, kDiffR);
+        const auto g_base = Mul(g, kG);
+        const auto b_base = Mul(b, kB);
+        const auto b_diff = Mul(b, kDiffB);
+        const auto y_base = Add(r_base, Add(g_base, b_base));
+        const auto y_vec = Sub(y_base, k128);
+        const auto cb_vec = Mul(Sub(b_diff, y_base), kNormB);
+        const auto cr_vec = Mul(Sub(r_diff, y_base), kNormR);
+        Store(y_vec, df, y_row + x);
+        Store(cb_vec, df, cb_row + x);
+        Store(cr_vec, df, cr_row + x);
+      }
+    }
+  };
+  return RunOnPool(pool, 0, static_cast<int>(num_stripes), ThreadPool::NoInit,
+                   transform, "RgbToYcbCr");
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+HWY_EXPORT(ToXYB);
+const ImageBundle* ToXYB(const ImageBundle& in, ThreadPool* pool,
+                         Image3F* JXL_RESTRICT xyb, const JxlCmsInterface& cms,
+                         ImageBundle* JXL_RESTRICT linear_storage) {
+  return HWY_DYNAMIC_DISPATCH(ToXYB)(in, pool, xyb, cms, linear_storage);
+}
+
+HWY_EXPORT(LinearRGBRowToXYB);
+void LinearRGBRowToXYB(float* JXL_RESTRICT row0, float* JXL_RESTRICT row1,
+                       float* JXL_RESTRICT row2,
+                       const float* JXL_RESTRICT premul_absorb, size_t xsize) {
+  HWY_DYNAMIC_DISPATCH(LinearRGBRowToXYB)
+  (row0, row1, row2, premul_absorb, xsize);
+}
+
+HWY_EXPORT(ComputePremulAbsorb);
+void ComputePremulAbsorb(float intensity_target, float* premul_absorb) {
+  HWY_DYNAMIC_DISPATCH(ComputePremulAbsorb)(intensity_target, premul_absorb);
+}
+
+void ScaleXYBRow(float* JXL_RESTRICT row0, float* JXL_RESTRICT row1,
+                 float* JXL_RESTRICT row2, size_t xsize) {
+  for (size_t x = 0; x < xsize; x++) {
+    row2[x] = (row2[x] - row1[x] + kScaledXYBOffset[2]) * kScaledXYBScale[2];
+    row0[x] = (row0[x] + kScaledXYBOffset[0]) * kScaledXYBScale[0];
+    row1[x] = (row1[x] + kScaledXYBOffset[1]) * kScaledXYBScale[1];
+  }
+}
+
+void ScaleXYB(Image3F* opsin) {
+  for (size_t y = 0; y < opsin->ysize(); y++) {
+    float* row0 = opsin->PlaneRow(0, y);
+    float* row1 = opsin->PlaneRow(1, y);
+    float* row2 = opsin->PlaneRow(2, y);
+    ScaleXYBRow(row0, row1, row2, opsin->xsize());
+  }
+}
+
+HWY_EXPORT(Image3FToXYB);
+void Image3FToXYB(const Image3F& in, const ColorEncoding& color_encoding,
+                  float intensity_target, ThreadPool* pool,
+                  Image3F* JXL_RESTRICT xyb, const JxlCmsInterface& cms) {
+  return HWY_DYNAMIC_DISPATCH(Image3FToXYB)(in, color_encoding,
+                                            intensity_target, pool, xyb, cms);
+}
+
+HWY_EXPORT(RgbToYcbcr);
+Status RgbToYcbcr(const ImageF& r_plane, const ImageF& g_plane,
+                  const ImageF& b_plane, ImageF* y_plane, ImageF* cb_plane,
+                  ImageF* cr_plane, ThreadPool* pool) {
+  return HWY_DYNAMIC_DISPATCH(RgbToYcbcr)(r_plane, g_plane, b_plane, y_plane,
+                                          cb_plane, cr_plane, pool);
+}
+
+// DEPRECATED
+Image3F OpsinDynamicsImage(const Image3B& srgb8, const JxlCmsInterface& cms) {
+  ImageMetadata metadata;
+  metadata.SetUintSamples(8);
+  metadata.color_encoding = ColorEncoding::SRGB();
+  ImageBundle ib(&metadata);
+  ib.SetFromImage(ConvertToFloat(srgb8), metadata.color_encoding);
+  JXL_CHECK(ib.TransformTo(ColorEncoding::LinearSRGB(ib.IsGray()), cms));
+  ThreadPool* null_pool = nullptr;
+  Image3F xyb(srgb8.xsize(), srgb8.ysize());
+
+  ImageBundle linear_storage(&metadata);
+  (void)ToXYB(ib, null_pool, &xyb, cms, &linear_storage);
+  return xyb;
+}
+
+}  // namespace jxl
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jxl/enc_xyb.h b/third_party/jpeg-xl/lib/jxl/enc_xyb.h
new file mode 100644
index 0000000000..fc902848ee
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/enc_xyb.h
@@ -0,0 +1,56 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_XYB_H_
+#define LIB_JXL_ENC_XYB_H_
+
+// Converts to XYB color space.
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/enc_bit_writer.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/image_bundle.h"
+
+namespace jxl {
+
+// Converts any color space to XYB. If `linear` is not null, returns `linear`
+// after filling it with a linear sRGB copy of `in`. Otherwise, returns `&in`.
+//
+// NOTE this return value can avoid an extra color conversion if `in` would
+// later be passed to JxlButteraugliComparator.
+const ImageBundle* ToXYB(const ImageBundle& in, ThreadPool* pool,
+                         Image3F* JXL_RESTRICT xyb, const JxlCmsInterface& cms,
+                         ImageBundle* JXL_RESTRICT linear = nullptr);
+
+void Image3FToXYB(const Image3F& in, const ColorEncoding& color_encoding,
+                  float intensity_target, ThreadPool* pool,
+                  Image3F* JXL_RESTRICT xyb, const JxlCmsInterface& cms);
+
+void LinearRGBRowToXYB(float* JXL_RESTRICT row0, float* JXL_RESTRICT row1,
+                       float* JXL_RESTRICT row2,
+                       const float* JXL_RESTRICT premul_absorb, size_t xsize);
+
+void ComputePremulAbsorb(float intensity_target, float* premul_absorb);
+
+// Transforms each color component of the given XYB image into the [0.0, 1.0]
+// interval with an affine transform.
+void ScaleXYB(Image3F* opsin);
+void ScaleXYBRow(float* row0, float* row1, float* row2, size_t xsize);
+
+// Bt.601 to match JPEG/JFIF. Outputs _signed_ YCbCr values suitable for DCT,
+// see F.1.1.3 of T.81 (because our data type is float, there is no need to add
+// a bias to make the values unsigned).
+Status RgbToYcbcr(const ImageF& r_plane, const ImageF& g_plane,
+                  const ImageF& b_plane, ImageF* y_plane, ImageF* cb_plane,
+                  ImageF* cr_plane, ThreadPool* pool);
+
+// DEPRECATED, used by opsin_image_wrapper.
+Image3F OpsinDynamicsImage(const Image3B& srgb8, const JxlCmsInterface& cms);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_XYB_H_
diff --git a/third_party/jpeg-xl/lib/jxl/encode.cc b/third_party/jpeg-xl/lib/jxl/encode.cc
new file mode 100644
index 0000000000..fbd5133ae5
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/encode.cc
@@ -0,0 +1,2128 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <brotli/encode.h>
+#include <jxl/codestream_header.h>
+#include <jxl/encode.h>
+#include <jxl/types.h>
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "lib/jxl/base/byte_order.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/span.h"
+#include "lib/jxl/codec_in_out.h"
+#include "lib/jxl/enc_aux_out.h"
+#include "lib/jxl/enc_color_management.h"
+#include "lib/jxl/enc_external_image.h"
+#include "lib/jxl/enc_fast_lossless.h"
+#include "lib/jxl/enc_fields.h"
+#include "lib/jxl/enc_file.h"
+#include "lib/jxl/enc_icc_codec.h"
+#include "lib/jxl/enc_params.h"
+#include "lib/jxl/encode_internal.h"
+#include "lib/jxl/exif.h"
+#include "lib/jxl/jpeg/enc_jpeg_data.h"
+#include "lib/jxl/luminance.h"
+#include "lib/jxl/sanitizers.h"
+
+// Debug-printing failure macro similar to JXL_FAILURE, but for the status code
+// JXL_ENC_ERROR
+#ifdef JXL_CRASH_ON_ERROR
+#define JXL_API_ERROR(enc, error_code, format, ...)                          \
+  (enc->error = error_code,                                                  \
+   ::jxl::Debug(("%s:%d: " format "\n"), __FILE__, __LINE__, ##__VA_ARGS__), \
+   ::jxl::Abort(), JXL_ENC_ERROR)
+#define JXL_API_ERROR_NOSET(format, ...)                                     \
+  (::jxl::Debug(("%s:%d: " format "\n"), __FILE__, __LINE__, ##__VA_ARGS__), \
+   ::jxl::Abort(), JXL_ENC_ERROR)
+#else  // JXL_CRASH_ON_ERROR
+#define JXL_API_ERROR(enc, error_code, format, ...)                            \
+  (enc->error = error_code,                                                    \
+   ((JXL_DEBUG_ON_ERROR) &&                                                    \
+    ::jxl::Debug(("%s:%d: " format "\n"), __FILE__, __LINE__, ##__VA_ARGS__)), \
+   JXL_ENC_ERROR)
+#define JXL_API_ERROR_NOSET(format, ...)                                     \
+  (::jxl::Debug(("%s:%d: " format "\n"), __FILE__, __LINE__, ##__VA_ARGS__), \
+   JXL_ENC_ERROR)
+#endif  // JXL_CRASH_ON_ERROR
+
+namespace jxl {}  // namespace jxl
+
+uint32_t JxlEncoderVersion(void) {
+  return JPEGXL_MAJOR_VERSION * 1000000 + JPEGXL_MINOR_VERSION * 1000 +
+         JPEGXL_PATCH_VERSION;
+}
+
+namespace {
+template <typename T>
+void AppendJxlpBoxCounter(uint32_t counter, bool last, T* output) {
+  if (last) counter |= 0x80000000;
+  for (size_t i = 0; i < 4; i++) {
+    output->push_back(counter >> (8 * (3 - i)) & 0xff);
+  }
+}
+
+void QueueFrame(
+    const JxlEncoderFrameSettings* frame_settings,
+    jxl::MemoryManagerUniquePtr<jxl::JxlEncoderQueuedFrame>& frame) {
+  if (frame_settings->values.lossless) {
+    frame->option_values.cparams.SetLossless();
+  }
+
+  jxl::JxlEncoderQueuedInput queued_input(frame_settings->enc->memory_manager);
+  queued_input.frame = std::move(frame);
+  frame_settings->enc->input_queue.emplace_back(std::move(queued_input));
+  frame_settings->enc->num_queued_frames++;
+}
+
+void QueueFastLosslessFrame(const JxlEncoderFrameSettings* frame_settings,
+                            JxlFastLosslessFrameState* fast_lossless_frame) {
+  jxl::JxlEncoderQueuedInput queued_input(frame_settings->enc->memory_manager);
+  queued_input.fast_lossless_frame.reset(fast_lossless_frame);
+  frame_settings->enc->input_queue.emplace_back(std::move(queued_input));
+  frame_settings->enc->num_queued_frames++;
+}
+
+void QueueBox(JxlEncoder* enc,
+              jxl::MemoryManagerUniquePtr<jxl::JxlEncoderQueuedBox>& box) {
+  jxl::JxlEncoderQueuedInput queued_input(enc->memory_manager);
+  queued_input.box = std::move(box);
+  enc->input_queue.emplace_back(std::move(queued_input));
+  enc->num_queued_boxes++;
+}
+
+// TODO(lode): share this code and the Brotli compression code in enc_jpeg_data
+JxlEncoderStatus BrotliCompress(int quality, const uint8_t* in, size_t in_size,
+                                jxl::PaddedBytes* out) {
+  std::unique_ptr<BrotliEncoderState, decltype(BrotliEncoderDestroyInstance)*>
+      enc(BrotliEncoderCreateInstance(nullptr, nullptr, nullptr),
+          BrotliEncoderDestroyInstance);
+  if (!enc) return JXL_API_ERROR_NOSET("BrotliEncoderCreateInstance failed");
+
+  BrotliEncoderSetParameter(enc.get(), BROTLI_PARAM_QUALITY, quality);
+  BrotliEncoderSetParameter(enc.get(), BROTLI_PARAM_SIZE_HINT, in_size);
+
+  constexpr size_t kBufferSize = 128 * 1024;
+  jxl::PaddedBytes temp_buffer(kBufferSize);
+
+  size_t avail_in = in_size;
+  const uint8_t* next_in = in;
+
+  size_t total_out = 0;
+
+  for (;;) {
+    size_t avail_out = kBufferSize;
+    uint8_t* next_out = temp_buffer.data();
+    jxl::msan::MemoryIsInitialized(next_in, avail_in);
+    if (!BrotliEncoderCompressStream(enc.get(), BROTLI_OPERATION_FINISH,
+                                     &avail_in, &next_in, &avail_out, &next_out,
+                                     &total_out)) {
+      return JXL_API_ERROR_NOSET("Brotli compression failed");
+    }
+    size_t out_size = next_out - temp_buffer.data();
+    jxl::msan::UnpoisonMemory(next_out - out_size, out_size);
+    out->resize(out->size() + out_size);
+    memcpy(out->data() + out->size() - out_size, temp_buffer.data(), out_size);
+    if (BrotliEncoderIsFinished(enc.get())) break;
+  }
+
+  return JXL_ENC_SUCCESS;
+}
+
+// The JXL codestream can have level 5 or level 10. Levels have certain
+// restrictions such as max allowed image dimensions. This function checks the
+// level required to support the current encoder settings. The debug_string is
+// intended to be used for developer API error messages, and may be set to
+// nullptr.
+int VerifyLevelSettings(const JxlEncoder* enc, std::string* debug_string) {
+  const auto& m = enc->metadata.m;
+
+  uint64_t xsize = enc->metadata.size.xsize();
+  uint64_t ysize = enc->metadata.size.ysize();
+  // The uncompressed ICC size, if it is used.
+  size_t icc_size = 0;
+  if (m.color_encoding.WantICC()) {
+    icc_size = m.color_encoding.ICC().size();
+  }
+
+  // Level 10 checks
+
+  if (xsize > (1ull << 30ull) || ysize > (1ull << 30ull) ||
+      xsize * ysize > (1ull << 40ull)) {
+    if (debug_string) *debug_string = "Too large image dimensions";
+    return -1;
+  }
+  if (icc_size > (1ull << 28)) {
+    if (debug_string) *debug_string = "Too large ICC profile size";
+    return -1;
+  }
+  if (m.num_extra_channels > 256) {
+    if (debug_string) *debug_string = "Too many extra channels";
+    return -1;
+  }
+
+  // Level 5 checks
+
+  if (!m.modular_16_bit_buffer_sufficient) {
+    if (debug_string) *debug_string = "Too high modular bit depth";
+    return 10;
+  }
+  if (xsize > (1ull << 18ull) || ysize > (1ull << 18ull) ||
+      xsize * ysize > (1ull << 28ull)) {
+    if (debug_string) *debug_string = "Too large image dimensions";
+    return 10;
+  }
+  if (icc_size > (1ull << 22)) {
+    if (debug_string) *debug_string = "Too large ICC profile";
+    return 10;
+  }
+  if (m.num_extra_channels > 4) {
+    if (debug_string) *debug_string = "Too many extra channels";
+    return 10;
+  }
+  for (size_t i = 0; i < m.extra_channel_info.size(); ++i) {
+    if (m.extra_channel_info[i].type == jxl::ExtraChannel::kBlack) {
+      if (debug_string) *debug_string = "CMYK channel not allowed";
+      return 10;
+    }
+  }
+
+  // TODO(lode): also need to check if consecutive composite-still frames total
+  // pixel amount doesn't exceed 2**28 in the case of level 5. This should be
+  // done when adding frame and requires ability to add composite still frames
+  // to be added first.
+
+  // TODO(lode): also need to check animation duration of a frame. This should
+  // be done when adding frame, but first requires implementing setting the
+  // JxlFrameHeader for a frame.
+
+  // TODO(lode): also need to check properties such as num_splines, num_patches,
+  // modular_16bit_buffers and multiple properties of modular trees. However
+  // these are not user-set properties so cannot be checked here, but decisions
+  // the C++ encoder should be able to make based on the level.
+
+  // All level 5 checks passes, so can return the more compatible level 5
+  return 5;
+}
+
+size_t BitsPerChannel(JxlDataType data_type) {
+  switch (data_type) {
+    case JXL_TYPE_UINT8:
+      return 8;
+    case JXL_TYPE_UINT16:
+      return 16;
+    case JXL_TYPE_FLOAT:
+      return 32;
+    case JXL_TYPE_FLOAT16:
+      return 16;
+    default:
+      return 0;  // signals unhandled JxlDataType
+  }
+}
+
+template <typename T>
+uint32_t GetBitDepth(JxlBitDepth bit_depth, const T& metadata,
+                     JxlPixelFormat format) {
+  if (bit_depth.type == JXL_BIT_DEPTH_FROM_PIXEL_FORMAT) {
+    return BitsPerChannel(format.data_type);
+  } else if (bit_depth.type == JXL_BIT_DEPTH_FROM_CODESTREAM) {
+    return metadata.bit_depth.bits_per_sample;
+  } else if (bit_depth.type == JXL_BIT_DEPTH_CUSTOM) {
+    return bit_depth.bits_per_sample;
+  } else {
+    return 0;
+  }
+}
+
+JxlEncoderStatus CheckValidBitdepth(uint32_t bits_per_sample,
+                                    uint32_t exponent_bits_per_sample) {
+  if (!exponent_bits_per_sample) {
+    // The spec allows up to 31 for bits_per_sample here, but
+    // the code does not (yet) support it.
+    if (!(bits_per_sample > 0 && bits_per_sample <= 24)) {
+      return JXL_API_ERROR_NOSET("Invalid value for bits_per_sample");
+    }
+  } else if ((exponent_bits_per_sample > 8) ||
+             (bits_per_sample > 24 + exponent_bits_per_sample) ||
+             (bits_per_sample < 3 + exponent_bits_per_sample)) {
+    return JXL_API_ERROR_NOSET("Invalid float description");
+  }
+  return JXL_ENC_SUCCESS;
+}
+
+JxlEncoderStatus VerifyInputBitDepth(JxlBitDepth bit_depth,
+                                     JxlPixelFormat format) {
+  if ((format.data_type == JXL_TYPE_FLOAT ||
+       format.data_type == JXL_TYPE_FLOAT16) &&
+      bit_depth.type != JXL_BIT_DEPTH_FROM_PIXEL_FORMAT) {
+    return JXL_API_ERROR_NOSET(
+        "Only JXL_BIT_DEPTH_FROM_PIXEL_FORMAT is "
+        "implemented for float types.");
+  }
+  return JXL_ENC_SUCCESS;
+}
+
+bool EncodeFrameIndexBox(const jxl::JxlEncoderFrameIndexBox& frame_index_box,
+                         jxl::BitWriter& writer) {
+  bool ok = true;
+  int NF = 0;
+  for (size_t i = 0; i < frame_index_box.entries.size(); ++i) {
+    if (i == 0 || frame_index_box.entries[i].to_be_indexed) {
+      ++NF;
+    }
+  }
+  // Frame index box contents varint + 8 bytes
+  // continue with NF * 3 * varint
+  // varint max length is 10 for 64 bit numbers, and these numbers
+  // are limited to 63 bits.
+  static const int kVarintMaxLength = 10;
+  static const int kFrameIndexBoxHeaderLength = kVarintMaxLength + 8;
+  static const int kFrameIndexBoxElementLength = 3 * kVarintMaxLength;
+  const int buffer_size =
+      kFrameIndexBoxHeaderLength + NF * kFrameIndexBoxElementLength;
+  std::vector<uint8_t> buffer_vec(buffer_size);
+  uint8_t* buffer = buffer_vec.data();
+  size_t output_pos = 0;
+  ok &= jxl::EncodeVarInt(NF, buffer_vec.size(), &output_pos, buffer);
+  StoreBE32(frame_index_box.TNUM, &buffer[output_pos]);
+  output_pos += 4;
+  StoreBE32(frame_index_box.TDEN, &buffer[output_pos]);
+  output_pos += 4;
+  // When we record a frame in the index, the record needs to know
+  // how many frames until the next indexed frame. That is why
+  // we store the 'prev' record. That 'prev' record needs to store
+  // the offset byte position to previously recorded indexed frame,
+  // that's why we also trace previous to the previous frame.
+  int prev_prev_ix = -1;  // For position offset (OFFi) delta coding.
+  int prev_ix = 0;
+  int T_prev = 0;
+  int T = 0;
+  for (size_t i = 1; i < frame_index_box.entries.size(); ++i) {
+    if (frame_index_box.entries[i].to_be_indexed) {
+      // Now we can record the previous entry, since we need to store
+      // there how many frames until the next one.
+      int64_t OFFi = frame_index_box.entries[prev_ix].OFFi;
+      if (prev_prev_ix != -1) {
+        // Offi needs to be offset of start byte of this frame compared to start
+        // byte of previous frame from this index in the JPEG XL codestream. For
+        // the first frame, this is the offset from the first byte of the JPEG
+        // XL codestream.
+        OFFi -= frame_index_box.entries[prev_prev_ix].OFFi;
+      }
+      int32_t Ti = T_prev;
+      int32_t Fi = i - prev_ix;
+      ok &= jxl::EncodeVarInt(OFFi, buffer_vec.size(), &output_pos, buffer);
+      ok &= jxl::EncodeVarInt(Ti, buffer_vec.size(), &output_pos, buffer);
+      ok &= jxl::EncodeVarInt(Fi, buffer_vec.size(), &output_pos, buffer);
+      prev_prev_ix = prev_ix;
+      prev_ix = i;
+      T_prev = T;
+      T += frame_index_box.entries[i].duration;
+    }
+  }
+  {
+    // Last frame.
+    size_t i = frame_index_box.entries.size();
+    int64_t OFFi = frame_index_box.entries[prev_ix].OFFi;
+    if (prev_prev_ix != -1) {
+      OFFi -= frame_index_box.entries[prev_prev_ix].OFFi;
+    }
+    int32_t Ti = T_prev;
+    int32_t Fi = i - prev_ix;
+    ok &= jxl::EncodeVarInt(OFFi, buffer_vec.size(), &output_pos, buffer);
+    ok &= jxl::EncodeVarInt(Ti, buffer_vec.size(), &output_pos, buffer);
+    ok &= jxl::EncodeVarInt(Fi, buffer_vec.size(), &output_pos, buffer);
+  }
+  // Enough buffer has been allocated, this function should never fail in
+  // writing.
+  JXL_ASSERT(ok);
+  return ok;
+}
+
+}  // namespace
+
+JxlEncoderStatus JxlEncoderStruct::RefillOutputByteQueue() {
+  jxl::PaddedBytes bytes;
+
+  jxl::JxlEncoderQueuedInput& input = input_queue[0];
+
+  // TODO(lode): split this into 3 functions: for adding the signature and other
+  // initial headers (jbrd, ...), one for adding frame, and one for adding user
+  // box.
+
+  if (!wrote_bytes) {
+    // First time encoding any data, verify the level 5 vs level 10 settings
+    std::string level_message;
+    int required_level = VerifyLevelSettings(this, &level_message);
+    // Only level 5 and 10 are defined, and the function can return -1 to
+    // indicate full incompatibility.
+    JXL_ASSERT(required_level == -1 || required_level == 5 ||
+               required_level == 10);
+    // codestream_level == -1 means auto-set to the required level
+    if (codestream_level == -1) codestream_level = required_level;
+    if (codestream_level == 5 && required_level != 5) {
+      // If the required level is 10, return error rather than automatically
+      // setting the level to 10, to avoid inadvertently creating a level 10
+      // JXL file while intending to target a level 5 decoder.
+      return JXL_API_ERROR(
+          this, JXL_ENC_ERR_API_USAGE, "%s",
+          ("Codestream level verification for level 5 failed: " + level_message)
+              .c_str());
+    }
+    if (required_level == -1) {
+      return JXL_API_ERROR(
+          this, JXL_ENC_ERR_API_USAGE, "%s",
+          ("Codestream level verification for level 10 failed: " +
+           level_message)
+              .c_str());
+    }
+
+    jxl::BitWriter writer;
+    if (!WriteCodestreamHeaders(&metadata, &writer, nullptr)) {
+      return JXL_API_ERROR(this, JXL_ENC_ERR_GENERIC,
+                           "Failed to write codestream header");
+    }
+    // Only send ICC (at least several hundred bytes) if fields aren't enough.
+    if (metadata.m.color_encoding.WantICC()) {
+      if (!jxl::WriteICC(metadata.m.color_encoding.ICC(), &writer,
+                         jxl::kLayerHeader, nullptr)) {
+        return JXL_API_ERROR(this, JXL_ENC_ERR_GENERIC,
+                             "Failed to write ICC profile");
+      }
+    }
+    // TODO(lode): preview should be added here if a preview image is added
+
+    writer.ZeroPadToByte();
+
+    // Not actually the end of frame, but the end of metadata/ICC, but helps
+    // the next frame to start here for indexing purposes.
+    codestream_bytes_written_end_of_frame +=
+        jxl::DivCeil(writer.BitsWritten(), 8);
+
+    bytes = std::move(writer).TakeBytes();
+
+    if (MustUseContainer()) {
+      // Add "JXL " and ftyp box.
+      output_byte_queue.insert(
+          output_byte_queue.end(), jxl::kContainerHeader,
+          jxl::kContainerHeader + sizeof(jxl::kContainerHeader));
+      if (codestream_level != 5) {
+        // Add jxll box directly after the ftyp box to indicate the codestream
+        // level.
+        output_byte_queue.insert(
+            output_byte_queue.end(), jxl::kLevelBoxHeader,
+            jxl::kLevelBoxHeader + sizeof(jxl::kLevelBoxHeader));
+        output_byte_queue.push_back(codestream_level);
+      }
+
+      // Whether to write the basic info and color profile header of the
+      // codestream into an early separate jxlp box, so that it comes before
+      // metadata or jpeg reconstruction boxes. In theory this could simply
+      // always be done, but there's no reason to add an extra box with box
+      // header overhead if the codestream will already come immediately after
+      // the signature and level boxes.
+      bool partial_header =
+          store_jpeg_metadata ||
+          (use_boxes && (!input.frame && !input.fast_lossless_frame));
+
+      if (partial_header) {
+        jxl::AppendBoxHeader(jxl::MakeBoxType("jxlp"), bytes.size() + 4,
+                             /*unbounded=*/false, &output_byte_queue);
+        AppendJxlpBoxCounter(jxlp_counter++, /*last=*/false,
+                             &output_byte_queue);
+        output_byte_queue.insert(output_byte_queue.end(), bytes.data(),
+                                 bytes.data() + bytes.size());
+        bytes.clear();
+      }
+
+      if (store_jpeg_metadata && !jpeg_metadata.empty()) {
+        jxl::AppendBoxHeader(jxl::MakeBoxType("jbrd"), jpeg_metadata.size(),
+                             false, &output_byte_queue);
+        output_byte_queue.insert(output_byte_queue.end(), jpeg_metadata.begin(),
+                                 jpeg_metadata.end());
+      }
+    }
+    wrote_bytes = true;
+  }
+
+  // Choose frame or box processing: exactly one of the two unique pointers (box
+  // or frame) in the input queue item is non-null.
+  if (input.frame || input.fast_lossless_frame) {
+    jxl::MemoryManagerUniquePtr<jxl::JxlEncoderQueuedFrame> input_frame =
+        std::move(input.frame);
+    if (input.fast_lossless_frame) {
+      output_fast_frame_queue.push_back(std::move(input.fast_lossless_frame));
+    }
+    input_queue.erase(input_queue.begin());
+    num_queued_frames--;
+    if (input_frame) {
+      for (unsigned idx = 0; idx < input_frame->ec_initialized.size(); idx++) {
+        if (!input_frame->ec_initialized[idx]) {
+          return JXL_API_ERROR(this, JXL_ENC_ERR_API_USAGE,
+                               "Extra channel %u is not initialized", idx);
+        }
+      }
+
+      // TODO(zond): If the input queue is empty and the frames_closed is true,
+      // then mark this frame as the last.
+
+      // TODO(zond): Handle progressive mode like EncodeFile does it.
+      // TODO(zond): Handle animation like EncodeFile does it, by checking if
+      //             JxlEncoderCloseFrames has been called and if the frame
+      //             queue is empty (to see if it's the last animation frame).
+
+      if (metadata.m.xyb_encoded) {
+        input_frame->option_values.cparams.color_transform =
+            jxl::ColorTransform::kXYB;
+      } else {
+        // TODO(zond): Figure out when to use kYCbCr instead.
+        input_frame->option_values.cparams.color_transform =
+            jxl::ColorTransform::kNone;
+      }
+    }
+
+    uint32_t duration;
+    uint32_t timecode;
+    if (input_frame && metadata.m.have_animation) {
+      duration = input_frame->option_values.header.duration;
+      timecode = input_frame->option_values.header.timecode;
+    } else {
+      // If have_animation is false, the encoder should ignore the duration and
+      // timecode values. However, assigning them to ib will cause the encoder
+      // to write an invalid frame header that can't be decoded so ensure
+      // they're the default value of 0 here.
+      duration = 0;
+      timecode = 0;
+    }
+
+    bool last_frame = frames_closed && !num_queued_frames;
+
+    size_t codestream_byte_size = 0;
+
+    jxl::BitWriter writer;
+
+    if (input_frame) {
+      jxl::PassesEncoderState enc_state;
+
+      frame_index_box.AddFrame(codestream_bytes_written_end_of_frame, duration,
+                               input_frame->option_values.frame_index_box);
+
+      // EncodeFrame creates jxl::FrameHeader object internally based on the
+      // FrameInfo, imagebundle, cparams and metadata. Copy the information to
+      // these.
+      jxl::ImageBundle& ib = input_frame->frame;
+      ib.duration = duration;
+      ib.timecode = timecode;
+      ib.name = input_frame->option_values.frame_name;
+      ib.blendmode = static_cast<jxl::BlendMode>(
+          input_frame->option_values.header.layer_info.blend_info.blendmode);
+      ib.blend =
+          input_frame->option_values.header.layer_info.blend_info.blendmode !=
+          JXL_BLEND_REPLACE;
+
+      size_t save_as_reference =
+          input_frame->option_values.header.layer_info.save_as_reference;
+      if (save_as_reference >= 3) {
+        return JXL_API_ERROR(
+            this, JXL_ENC_ERR_API_USAGE,
+            "Cannot use save_as_reference values >=3 (found: %d)",
+            (int)save_as_reference);
+      }
+      ib.use_for_next_frame = !!save_as_reference;
+
+      jxl::FrameInfo frame_info;
+      frame_info.is_last = last_frame;
+      frame_info.save_as_reference = save_as_reference;
+      frame_info.source =
+          input_frame->option_values.header.layer_info.blend_info.source;
+      frame_info.clamp =
+          input_frame->option_values.header.layer_info.blend_info.clamp;
+      frame_info.alpha_channel =
+          input_frame->option_values.header.layer_info.blend_info.alpha;
+      frame_info.extra_channel_blending_info.resize(
+          metadata.m.num_extra_channels);
+      // If extra channel blend info has not been set, use the blend mode from
+      // the layer_info.
+      JxlBlendInfo default_blend_info =
+          input_frame->option_values.header.layer_info.blend_info;
+      for (size_t i = 0; i < metadata.m.num_extra_channels; ++i) {
+        auto& to = frame_info.extra_channel_blending_info[i];
+        const auto& from =
+            i < input_frame->option_values.extra_channel_blend_info.size()
+                ? input_frame->option_values.extra_channel_blend_info[i]
+                : default_blend_info;
+        to.mode = static_cast<jxl::BlendMode>(from.blendmode);
+        to.source = from.source;
+        to.alpha_channel = from.alpha;
+        to.clamp = (from.clamp != 0);
+      }
+
+      if (input_frame->option_values.header.layer_info.have_crop) {
+        ib.origin.x0 = input_frame->option_values.header.layer_info.crop_x0;
+        ib.origin.y0 = input_frame->option_values.header.layer_info.crop_y0;
+      }
+      JXL_ASSERT(writer.BitsWritten() == 0);
+      if (!jxl::EncodeFrame(input_frame->option_values.cparams, frame_info,
+                            &metadata, input_frame->frame, &enc_state, cms,
+                            thread_pool.get(), &writer,
+                            /*aux_out=*/nullptr)) {
+        return JXL_API_ERROR(this, JXL_ENC_ERR_GENERIC,
+                             "Failed to encode frame");
+      }
+      codestream_bytes_written_beginning_of_frame =
+          codestream_bytes_written_end_of_frame;
+      codestream_bytes_written_end_of_frame +=
+          jxl::DivCeil(writer.BitsWritten(), 8);
+
+      // Possibly bytes already contains the codestream header: in case this is
+      // the first frame, and the codestream header was not encoded as jxlp
+      // above.
+      bytes.append(std::move(writer).TakeBytes());
+      codestream_byte_size = bytes.size();
+    } else {
+      JXL_CHECK(!output_fast_frame_queue.empty());
+      JxlFastLosslessPrepareHeader(output_fast_frame_queue.front().get(),
+                                   /*add_image_header=*/0, last_frame);
+      codestream_byte_size =
+          JxlFastLosslessOutputSize(output_fast_frame_queue.front().get()) +
+          bytes.size();
+    }
+
+    if (MustUseContainer()) {
+      if (last_frame && jxlp_counter == 0) {
+        // If this is the last frame and no jxlp boxes were used yet, it's
+        // slighly more efficient to write a jxlc box since it has 4 bytes
+        // less overhead.
+        jxl::AppendBoxHeader(jxl::MakeBoxType("jxlc"), codestream_byte_size,
+                             /*unbounded=*/false, &output_byte_queue);
+      } else {
+        jxl::AppendBoxHeader(jxl::MakeBoxType("jxlp"), codestream_byte_size + 4,
+                             /*unbounded=*/false, &output_byte_queue);
+        AppendJxlpBoxCounter(jxlp_counter++, last_frame, &output_byte_queue);
+      }
+    }
+
+    output_byte_queue.insert(output_byte_queue.end(), bytes.data(),
+                             bytes.data() + bytes.size());
+
+    if (input_frame) {
+      last_used_cparams = input_frame->option_values.cparams;
+    }
+    if (last_frame && frame_index_box.StoreFrameIndexBox()) {
+      bytes.clear();
+      EncodeFrameIndexBox(frame_index_box, writer);
+      jxl::AppendBoxHeader(jxl::MakeBoxType("jxli"), bytes.size(),
+                           /*unbounded=*/false, &output_byte_queue);
+    }
+  } else {
+    // Not a frame, so is a box instead
+    jxl::MemoryManagerUniquePtr<jxl::JxlEncoderQueuedBox> box =
+        std::move(input.box);
+    input_queue.erase(input_queue.begin());
+    num_queued_boxes--;
+
+    if (box->compress_box) {
+      jxl::PaddedBytes compressed(4);
+      // Prepend the original box type in the brob box contents
+      for (size_t i = 0; i < 4; i++) {
+        compressed[i] = static_cast<uint8_t>(box->type[i]);
+      }
+      if (JXL_ENC_SUCCESS !=
+          BrotliCompress((brotli_effort >= 0 ? brotli_effort : 4),
+                         box->contents.data(), box->contents.size(),
+                         &compressed)) {
+        return JXL_API_ERROR(this, JXL_ENC_ERR_GENERIC,
+                             "Brotli compression for brob box failed");
+      }
+      jxl::AppendBoxHeader(jxl::MakeBoxType("brob"), compressed.size(), false,
+                           &output_byte_queue);
+      output_byte_queue.insert(output_byte_queue.end(), compressed.data(),
+                               compressed.data() + compressed.size());
+    } else {
+      jxl::AppendBoxHeader(box->type, box->contents.size(), false,
+                           &output_byte_queue);
+      output_byte_queue.insert(output_byte_queue.end(), box->contents.data(),
+                               box->contents.data() + box->contents.size());
+    }
+  }
+
+  return JXL_ENC_SUCCESS;
+}
+
+JxlEncoderStatus JxlEncoderSetColorEncoding(JxlEncoder* enc,
+                                            const JxlColorEncoding* color) {
+  if (!enc->basic_info_set) {
+    return JXL_API_ERROR(enc, JXL_ENC_ERR_API_USAGE, "Basic info not yet set");
+  }
+  if (enc->color_encoding_set) {
+    return JXL_API_ERROR(enc, JXL_ENC_ERR_API_USAGE,
+                         "Color encoding is already set");
+  }
+  if (!jxl::ConvertExternalToInternalColorEncoding(
+          *color, &enc->metadata.m.color_encoding)) {
+    return JXL_API_ERROR(enc, JXL_ENC_ERR_GENERIC, "Error in color conversion");
+  }
+  if (enc->metadata.m.color_encoding.GetColorSpace() ==
+      jxl::ColorSpace::kGray) {
+    if (enc->basic_info.num_color_channels != 1)
+      return JXL_API_ERROR(
+          enc, JXL_ENC_ERR_API_USAGE,
+          "Cannot use grayscale color encoding with num_color_channels != 1");
+  } else {
+    if (enc->basic_info.num_color_channels != 3)
+      return JXL_API_ERROR(
+          enc, JXL_ENC_ERR_API_USAGE,
+          "Cannot use RGB color encoding with num_color_channels != 3");
+  }
+  enc->color_encoding_set = true;
+  if (!enc->intensity_target_set) {
+    jxl::SetIntensityTarget(&enc->metadata.m);
+  }
+  return JXL_ENC_SUCCESS;
+}
+
+JxlEncoderStatus JxlEncoderSetICCProfile(JxlEncoder* enc,
+                                         const uint8_t* icc_profile,
+                                         size_t size) {
+  if (!enc->basic_info_set) {
+    return JXL_API_ERROR(enc, JXL_ENC_ERR_API_USAGE, "Basic info not yet set");
+  }
+  if (enc->color_encoding_set) {
+    return JXL_API_ERROR(enc, JXL_ENC_ERR_API_USAGE,
+                         "ICC profile is already set");
+  }
+  jxl::PaddedBytes icc;
+  icc.assign(icc_profile, icc_profile + size);
+  if (!enc->metadata.m.color_encoding.SetICC(std::move(icc))) {
+    return JXL_API_ERROR(enc, JXL_ENC_ERR_BAD_INPUT,
+                         "ICC profile could not be set");
+  }
+  if (enc->metadata.m.color_encoding.GetColorSpace() ==
+      jxl::ColorSpace::kGray) {
+    if (enc->basic_info.num_color_channels != 1)
+      return JXL_API_ERROR(
+          enc, JXL_ENC_ERR_BAD_INPUT,
+          "Cannot use grayscale ICC profile with num_color_channels != 1");
+  } else {
+    if (enc->basic_info.num_color_channels != 3)
+      return JXL_API_ERROR(
+          enc, JXL_ENC_ERR_BAD_INPUT,
+          "Cannot use RGB ICC profile with num_color_channels != 3");
+    // TODO(jon): also check that a kBlack extra channel is provided in the CMYK
+    // case
+  }
+  enc->color_encoding_set = true;
+  if (!enc->intensity_target_set) {
+    jxl::SetIntensityTarget(&enc->metadata.m);
+  }
+
+  if (!enc->basic_info.uses_original_profile) {
+    enc->metadata.m.color_encoding.DecideIfWantICC();
+  }
+
+  return JXL_ENC_SUCCESS;
+}
+
+void JxlEncoderInitBasicInfo(JxlBasicInfo* info) {
+  info->have_container = JXL_FALSE;
+  info->xsize = 0;
+  info->ysize = 0;
+  info->bits_per_sample = 8;
+  info->exponent_bits_per_sample = 0;
+  info->intensity_target = 0.f;
+  info->min_nits = 0.f;
+  info->relative_to_max_display = JXL_FALSE;
+  info->linear_below = 0.f;
+  info->uses_original_profile = JXL_FALSE;
+  info->have_preview = JXL_FALSE;
+  info->have_animation = JXL_FALSE;
+  info->orientation = JXL_ORIENT_IDENTITY;
+  info->num_color_channels = 3;
+  info->num_extra_channels = 0;
+  info->alpha_bits = 0;
+  info->alpha_exponent_bits = 0;
+  info->alpha_premultiplied = JXL_FALSE;
+  info->preview.xsize = 0;
+  info->preview.ysize = 0;
+  info->intrinsic_xsize = 0;
+  info->intrinsic_ysize = 0;
+  info->animation.tps_numerator = 10;
+  info->animation.tps_denominator = 1;
+  info->animation.num_loops = 0;
+  info->animation.have_timecodes = JXL_FALSE;
+}
+
+void JxlEncoderInitFrameHeader(JxlFrameHeader* frame_header) {
+  // For each field, the default value of the specification is used. Depending
+  // on whether an animation frame, or a composite still blending frame,
+  // is used, different fields have to be set up by the user after initing
+  // the frame header.
+  frame_header->duration = 0;
+  frame_header->timecode = 0;
+  frame_header->name_length = 0;
+  // In the specification, the default value of is_last is !frame_type, and the
+  // default frame_type is kRegularFrame which has value 0, so is_last is true
+  // by default. However, the encoder does not use this value (the field exists
+  // for the decoder to set) since last frame is determined by usage of
+  // JxlEncoderCloseFrames instead.
+  frame_header->is_last = JXL_TRUE;
+  frame_header->layer_info.have_crop = JXL_FALSE;
+  frame_header->layer_info.crop_x0 = 0;
+  frame_header->layer_info.crop_y0 = 0;
+  // These must be set if have_crop is enabled, but the default value has
+  // have_crop false, and these dimensions 0. The user must set these to the
+  // desired size after enabling have_crop (which is not yet implemented).
+  frame_header->layer_info.xsize = 0;
+  frame_header->layer_info.ysize = 0;
+  JxlEncoderInitBlendInfo(&frame_header->layer_info.blend_info);
+  frame_header->layer_info.save_as_reference = 0;
+}
+
+void JxlEncoderInitBlendInfo(JxlBlendInfo* blend_info) {
+  // Default blend mode in the specification is 0. Note that combining
+  // blend mode of replace with a duration is not useful, but the user has to
+  // manually set duration in case of animation, or manually change the blend
+  // mode in case of composite stills, so initing to a combination that is not
+  // useful on its own is not an issue.
+  blend_info->blendmode = JXL_BLEND_REPLACE;
+  blend_info->source = 0;
+  blend_info->alpha = 0;
+  blend_info->clamp = 0;
+}
+
+JxlEncoderStatus JxlEncoderSetBasicInfo(JxlEncoder* enc,
+                                        const JxlBasicInfo* info) {
+  if (!enc->metadata.size.Set(info->xsize, info->ysize)) {
+    return JXL_API_ERROR(enc, JXL_ENC_ERR_API_USAGE, "Invalid dimensions");
+  }
+  if (JXL_ENC_SUCCESS != CheckValidBitdepth(info->bits_per_sample,
+                                            info->exponent_bits_per_sample)) {
+    return JXL_API_ERROR(enc, JXL_ENC_ERR_API_USAGE, "Invalid bit depth");
+  }
+  enc->metadata.m.bit_depth.bits_per_sample = info->bits_per_sample;
+  enc->metadata.m.bit_depth.exponent_bits_per_sample =
+      info->exponent_bits_per_sample;
+  enc->metadata.m.bit_depth.floating_point_sample =
+      (info->exponent_bits_per_sample != 0u);
+  enc->metadata.m.modular_16_bit_buffer_sufficient =
+      (!info->uses_original_profile || info->bits_per_sample <= 12) &&
+      info->alpha_bits <= 12;
+  if ((info->intrinsic_xsize > 0 || info->intrinsic_ysize > 0) &&
+      (info->intrinsic_xsize != info->xsize ||
+       info->intrinsic_ysize != info->ysize)) {
+    if (info->intrinsic_xsize > (1ull << 30ull) ||
+        info->intrinsic_ysize > (1ull << 30ull) ||
+        !enc->metadata.m.intrinsic_size.Set(info->intrinsic_xsize,
+                                            info->intrinsic_ysize)) {
+      return JXL_API_ERROR(enc, JXL_ENC_ERR_API_USAGE,
+                           "Invalid intrinsic dimensions");
+    }
+    enc->metadata.m.have_intrinsic_size = true;
+  }
+
+  // The number of extra channels includes the alpha channel, so for example and
+  // RGBA with no other extra channels, has exactly num_extra_channels == 1
+  enc->metadata.m.num_extra_channels = info->num_extra_channels;
+  enc->metadata.m.extra_channel_info.resize(enc->metadata.m.num_extra_channels);
+  if (info->num_extra_channels == 0 && info->alpha_bits) {
+    return JXL_API_ERROR(
+        enc, JXL_ENC_ERR_API_USAGE,
+        "when alpha_bits is non-zero, the number of channels must be at least "
+        "1");
+  }
+  // If the user provides non-zero alpha_bits, we make the channel info at index
+  // zero the appropriate alpha channel.
+  if (info->alpha_bits) {
+    JxlExtraChannelInfo channel_info;
+    JxlEncoderInitExtraChannelInfo(JXL_CHANNEL_ALPHA, &channel_info);
+    channel_info.bits_per_sample = info->alpha_bits;
+    channel_info.exponent_bits_per_sample = info->alpha_exponent_bits;
+    if (JxlEncoderSetExtraChannelInfo(enc, 0, &channel_info)) {
+      return JXL_API_ERROR(enc, JXL_ENC_ERR_API_USAGE,
+                           "Problem setting extra channel info for alpha");
+    }
+  }
+
+  enc->metadata.m.xyb_encoded = !info->uses_original_profile;
+  if (info->orientation > 0 && info->orientation <= 8) {
+    enc->metadata.m.orientation = info->orientation;
+  } else {
+    return JXL_API_ERROR(enc, JXL_ENC_ERR_API_USAGE,
+                         "Invalid value for orientation field");
+  }
+  if (info->num_color_channels != 1 && info->num_color_channels != 3) {
+    return JXL_API_ERROR(enc, JXL_ENC_ERR_API_USAGE,
+                         "Invalid number of color channels");
+  }
+  if (info->intensity_target != 0) {
+    enc->metadata.m.SetIntensityTarget(info->intensity_target);
+    enc->intensity_target_set = true;
+  } else if (enc->color_encoding_set) {
+    // If this is false, JxlEncoderSetColorEncoding will be called later and we
+    // will get one more chance to call jxl::SetIntensityTarget, after the color
+    // encoding is indeed set.
+    jxl::SetIntensityTarget(&enc->metadata.m);
+    enc->intensity_target_set = true;
+  }
+  enc->metadata.m.tone_mapping.min_nits = info->min_nits;
+  enc->metadata.m.tone_mapping.relative_to_max_display =
+      info->relative_to_max_display;
+  enc->metadata.m.tone_mapping.linear_below = info->linear_below;
+  enc->basic_info = *info;
+  enc->basic_info_set = true;
+
+  enc->metadata.m.have_animation = info->have_animation;
+  if (info->have_animation) {
+    if (info->animation.tps_denominator < 1) {
+      return JXL_API_ERROR(
+          enc, JXL_ENC_ERR_API_USAGE,
+          "If animation is used, tps_denominator must be >= 1");
+    }
+    if (info->animation.tps_numerator < 1) {
+      return JXL_API_ERROR(enc, JXL_ENC_ERR_API_USAGE,
+                           "If animation is used, tps_numerator must be >= 1");
+    }
+    enc->metadata.m.animation.tps_numerator = info->animation.tps_numerator;
+    enc->metadata.m.animation.tps_denominator = info->animation.tps_denominator;
+    enc->metadata.m.animation.num_loops = info->animation.num_loops;
+    enc->metadata.m.animation.have_timecodes = info->animation.have_timecodes;
+  }
+  std::string level_message;
+  int required_level = VerifyLevelSettings(enc, &level_message);
+  if (required_level == -1 ||
+      (static_cast<int>(enc->codestream_level) < required_level &&
+       enc->codestream_level != -1)) {
+    return JXL_API_ERROR(
+        enc, JXL_ENC_ERR_API_USAGE, "%s",
+        ("Codestream level verification for level " +
+         std::to_string(enc->codestream_level) + " failed: " + level_message)
+            .c_str());
+  }
+  return JXL_ENC_SUCCESS;
+}
+
+void JxlEncoderInitExtraChannelInfo(JxlExtraChannelType type,
+                                    JxlExtraChannelInfo* info) {
+  info->type = type;
+  info->bits_per_sample = 8;
+  info->exponent_bits_per_sample = 0;
+  info->dim_shift = 0;
+  info->name_length = 0;
+  info->alpha_premultiplied = JXL_FALSE;
+  info->spot_color[0] = 0;
+  info->spot_color[1] = 0;
+  info->spot_color[2] = 0;
+  info->spot_color[3] = 0;
+  info->cfa_channel = 0;
+}
+
+JXL_EXPORT JxlEncoderStatus JxlEncoderSetExtraChannelInfo(
+    JxlEncoder* enc, size_t index, const JxlExtraChannelInfo* info) {
+  if (index >= enc->metadata.m.num_extra_channels) {
+    return JXL_API_ERROR(enc, JXL_ENC_ERR_API_USAGE,
+                         "Invalid value for the index of extra channel");
+  }
+  if (JXL_ENC_SUCCESS != CheckValidBitdepth(info->bits_per_sample,
+                                            info->exponent_bits_per_sample)) {
+    return JXL_API_ERROR(enc, JXL_ENC_ERR_API_USAGE, "Invalid bit depth");
+  }
+
+  jxl::ExtraChannelInfo& channel = enc->metadata.m.extra_channel_info[index];
+  channel.type = static_cast<jxl::ExtraChannel>(info->type);
+  channel.bit_depth.bits_per_sample = info->bits_per_sample;
+  enc->metadata.m.modular_16_bit_buffer_sufficient &=
+      info->bits_per_sample <= 12;
+  channel.bit_depth.exponent_bits_per_sample = info->exponent_bits_per_sample;
+  channel.bit_depth.floating_point_sample = info->exponent_bits_per_sample != 0;
+  channel.dim_shift = info->dim_shift;
+  channel.name = "";
+  channel.alpha_associated = (info->alpha_premultiplied != 0);
+  channel.cfa_channel = info->cfa_channel;
+  channel.spot_color[0] = info->spot_color[0];
+  channel.spot_color[1] = info->spot_color[1];
+  channel.spot_color[2] = info->spot_color[2];
+  channel.spot_color[3] = info->spot_color[3];
+  std::string level_message;
+  int required_level = VerifyLevelSettings(enc, &level_message);
+  if (required_level == -1 ||
+      (static_cast<int>(enc->codestream_level) < required_level &&
+       enc->codestream_level != -1)) {
+    return JXL_API_ERROR(
+        enc, JXL_ENC_ERR_API_USAGE, "%s",
+        ("Codestream level verification for level " +
+         std::to_string(enc->codestream_level) + " failed: " + level_message)
+            .c_str());
+  }
+  return JXL_ENC_SUCCESS;
+}
+
+JXL_EXPORT JxlEncoderStatus JxlEncoderSetExtraChannelName(JxlEncoder* enc,
+                                                          size_t index,
+                                                          const char* name,
+                                                          size_t size) {
+  if (index >= enc->metadata.m.num_extra_channels) {
+    return JXL_API_ERROR(enc, JXL_ENC_ERR_API_USAGE,
+                         "Invalid value for the index of extra channel");
+  }
+  enc->metadata.m.extra_channel_info[index].name =
+      std::string(name, name + size);
+  return JXL_ENC_SUCCESS;
+}
+
+JxlEncoderFrameSettings* JxlEncoderFrameSettingsCreate(
+    JxlEncoder* enc, const JxlEncoderFrameSettings* source) {
+  auto opts = jxl::MemoryManagerMakeUnique<JxlEncoderFrameSettings>(
+      &enc->memory_manager);
+  if (!opts) return nullptr;
+  opts->enc = enc;
+  if (source != nullptr) {
+    opts->values = source->values;
+  } else {
+    opts->values.lossless = false;
+  }
+  opts->values.cparams.level = enc->codestream_level;
+  opts->values.cparams.ec_distance.resize(enc->metadata.m.num_extra_channels,
+                                          -1);
+
+  JxlEncoderFrameSettings* ret = opts.get();
+  enc->encoder_options.emplace_back(std::move(opts));
+  return ret;
+}
+
+JxlEncoderFrameSettings* JxlEncoderOptionsCreate(
+    JxlEncoder* enc, const JxlEncoderFrameSettings* source) {
+  // Deprecated function name, call the non-deprecated function
+  return JxlEncoderFrameSettingsCreate(enc, source);
+}
+
+JxlEncoderStatus JxlEncoderSetFrameLossless(
+    JxlEncoderFrameSettings* frame_settings, const JXL_BOOL lossless) {
+  if (lossless && frame_settings->enc->basic_info_set &&
+      frame_settings->enc->metadata.m.xyb_encoded) {
+    return JXL_API_ERROR(
+        frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+        "Set uses_original_profile=true for lossless encoding");
+  }
+  frame_settings->values.lossless = lossless;
+  return JXL_ENC_SUCCESS;
+}
+
+JxlEncoderStatus JxlEncoderOptionsSetLossless(
+    JxlEncoderFrameSettings* frame_settings, JXL_BOOL lossless) {
+  // Deprecated function name, call the non-deprecated function
+  return JxlEncoderSetFrameLossless(frame_settings, lossless);
+}
+
+JxlEncoderStatus JxlEncoderOptionsSetEffort(
+    JxlEncoderFrameSettings* frame_settings, const int effort) {
+  return JxlEncoderFrameSettingsSetOption(frame_settings,
+                                          JXL_ENC_FRAME_SETTING_EFFORT, effort);
+}
+
+JxlEncoderStatus JxlEncoderSetFrameDistance(
+    JxlEncoderFrameSettings* frame_settings, float distance) {
+  if (distance < 0.f || distance > 25.f) {
+    return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+                         "Distance has to be in [0.0..25.0] (corresponding to "
+                         "quality in [0.0..100.0])");
+  }
+  if (distance > 0.f && distance < 0.01f) {
+    distance = 0.01f;
+  }
+  frame_settings->values.cparams.butteraugli_distance = distance;
+  return JXL_ENC_SUCCESS;
+}
+
+JxlEncoderStatus JxlEncoderSetExtraChannelDistance(
+    JxlEncoderFrameSettings* frame_settings, size_t index, float distance) {
+  if (index >= frame_settings->enc->metadata.m.num_extra_channels) {
+    return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+                         "Invalid value for the index of extra channel");
+  }
+  if (distance != -1.f && (distance < 0.f || distance > 25.f)) {
+    return JXL_API_ERROR(
+        frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+        "Distance has to be -1 or in [0.0..25.0] (corresponding to "
+        "quality in [0.0..100.0])");
+  }
+  if (distance > 0.f && distance < 0.01f) {
+    distance = 0.01f;
+  }
+
+  if (index >= frame_settings->values.cparams.ec_distance.size()) {
+    // This can only happen if JxlEncoderFrameSettingsCreate() was called before
+    // JxlEncoderSetBasicInfo().
+    frame_settings->values.cparams.ec_distance.resize(
+        frame_settings->enc->metadata.m.num_extra_channels, -1);
+  }
+
+  frame_settings->values.cparams.ec_distance[index] = distance;
+  return JXL_ENC_SUCCESS;
+}
+
+JxlEncoderStatus JxlEncoderOptionsSetDistance(
+    JxlEncoderFrameSettings* frame_settings, float distance) {
+  // Deprecated function name, call the non-deprecated function
+  return JxlEncoderSetFrameDistance(frame_settings, distance);
+}
+
+JxlEncoderStatus JxlEncoderOptionsSetDecodingSpeed(
+    JxlEncoderFrameSettings* frame_settings, int tier) {
+  return JxlEncoderFrameSettingsSetOption(
+      frame_settings, JXL_ENC_FRAME_SETTING_DECODING_SPEED, tier);
+}
+
+JxlEncoderStatus JxlEncoderFrameSettingsSetOption(
+    JxlEncoderFrameSettings* frame_settings, JxlEncoderFrameSettingId option,
+    int64_t value) {
+  // check if value is -1, 0 or 1 for Override-type options
+  switch (option) {
+    case JXL_ENC_FRAME_SETTING_NOISE:
+    case JXL_ENC_FRAME_SETTING_DOTS:
+    case JXL_ENC_FRAME_SETTING_PATCHES:
+    case JXL_ENC_FRAME_SETTING_GABORISH:
+    case JXL_ENC_FRAME_SETTING_MODULAR:
+    case JXL_ENC_FRAME_SETTING_KEEP_INVISIBLE:
+    case JXL_ENC_FRAME_SETTING_GROUP_ORDER:
+    case JXL_ENC_FRAME_SETTING_RESPONSIVE:
+    case JXL_ENC_FRAME_SETTING_PROGRESSIVE_AC:
+    case JXL_ENC_FRAME_SETTING_QPROGRESSIVE_AC:
+    case JXL_ENC_FRAME_SETTING_LOSSY_PALETTE:
+    case JXL_ENC_FRAME_SETTING_JPEG_RECON_CFL:
+    case JXL_ENC_FRAME_SETTING_JPEG_COMPRESS_BOXES:
+      if (value < -1 || value > 1) {
+        return JXL_API_ERROR(
+            frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+            "Option value has to be -1 (default), 0 (off) or 1 (on)");
+      }
+      break;
+    default:
+      break;
+  }
+
+  switch (option) {
+    case JXL_ENC_FRAME_SETTING_EFFORT:
+      if (frame_settings->enc->allow_expert_options) {
+        if (value < 1 || value > 10) {
+          return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_NOT_SUPPORTED,
+                               "Encode effort has to be in [1..10]");
+        }
+      } else {
+        if (value < 1 || value > 9) {
+          return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_NOT_SUPPORTED,
+                               "Encode effort has to be in [1..9]");
+        }
+      }
+      frame_settings->values.cparams.speed_tier =
+          static_cast<jxl::SpeedTier>(10 - value);
+      return JXL_ENC_SUCCESS;
+    case JXL_ENC_FRAME_SETTING_BROTLI_EFFORT:
+      if (value < -1 || value > 11) {
+        return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+                             "Brotli effort has to be in [-1..11]");
+      }
+      // set cparams for brotli use in JPEG frames
+      frame_settings->values.cparams.brotli_effort = value;
+      // set enc option for brotli use in brob boxes
+      frame_settings->enc->brotli_effort = value;
+      return JXL_ENC_SUCCESS;
+    case JXL_ENC_FRAME_SETTING_DECODING_SPEED:
+      if (value < 0 || value > 4) {
+        return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_NOT_SUPPORTED,
+                             "Decoding speed has to be in [0..4]");
+      }
+      frame_settings->values.cparams.decoding_speed_tier = value;
+      return JXL_ENC_SUCCESS;
+    case JXL_ENC_FRAME_SETTING_RESAMPLING:
+      if (value != -1 && value != 1 && value != 2 && value != 4 && value != 8) {
+        return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+                             "Resampling factor has to be 1, 2, 4 or 8");
+      }
+      frame_settings->values.cparams.resampling = value;
+      return JXL_ENC_SUCCESS;
+    case JXL_ENC_FRAME_SETTING_EXTRA_CHANNEL_RESAMPLING:
+      // TODO(lode): the jxl codestream allows choosing a different resampling
+      // factor for each extra channel, independently per frame. Move this
+      // option to a JxlEncoderFrameSettings-option that can be set per extra
+      // channel, so needs its own function rather than
+      // JxlEncoderFrameSettingsSetOption due to the extra channel index
+      // argument required.
+      if (value != -1 && value != 1 && value != 2 && value != 4 && value != 8) {
+        return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+                             "Resampling factor has to be 1, 2, 4 or 8");
+      }
+      frame_settings->values.cparams.ec_resampling = value;
+      return JXL_ENC_SUCCESS;
+    case JXL_ENC_FRAME_SETTING_ALREADY_DOWNSAMPLED:
+      if (value < 0 || value > 1) {
+        return JXL_ENC_ERROR;
+      }
+      frame_settings->values.cparams.already_downsampled = (value == 1);
+      return JXL_ENC_SUCCESS;
+    case JXL_ENC_FRAME_SETTING_NOISE:
+      frame_settings->values.cparams.noise = static_cast<jxl::Override>(value);
+      return JXL_ENC_SUCCESS;
+    case JXL_ENC_FRAME_SETTING_DOTS:
+      frame_settings->values.cparams.dots = static_cast<jxl::Override>(value);
+      return JXL_ENC_SUCCESS;
+    case JXL_ENC_FRAME_SETTING_PATCHES:
+      frame_settings->values.cparams.patches =
+          static_cast<jxl::Override>(value);
+      return JXL_ENC_SUCCESS;
+    case JXL_ENC_FRAME_SETTING_EPF:
+      if (value < -1 || value > 3) {
+        return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+                             "EPF value has to be in [-1..3]");
+      }
+      frame_settings->values.cparams.epf = static_cast<int>(value);
+      return JXL_ENC_SUCCESS;
+    case JXL_ENC_FRAME_SETTING_GABORISH:
+      frame_settings->values.cparams.gaborish =
+          static_cast<jxl::Override>(value);
+      return JXL_ENC_SUCCESS;
+    case JXL_ENC_FRAME_SETTING_MODULAR:
+      frame_settings->values.cparams.modular_mode = (value == 1);
+      return JXL_ENC_SUCCESS;
+    case JXL_ENC_FRAME_SETTING_KEEP_INVISIBLE:
+      frame_settings->values.cparams.keep_invisible =
+          static_cast<jxl::Override>(value);
+      return JXL_ENC_SUCCESS;
+    case JXL_ENC_FRAME_SETTING_GROUP_ORDER:
+      frame_settings->values.cparams.centerfirst = (value == 1);
+      return JXL_ENC_SUCCESS;
+    case JXL_ENC_FRAME_SETTING_GROUP_ORDER_CENTER_X:
+      if (value < -1) {
+        return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+                             "Center x coordinate has to be -1 or positive");
+      }
+      frame_settings->values.cparams.center_x = static_cast<size_t>(value);
+      return JXL_ENC_SUCCESS;
+    case JXL_ENC_FRAME_SETTING_GROUP_ORDER_CENTER_Y:
+      if (value < -1) {
+        return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+                             "Center y coordinate has to be -1 or positive");
+      }
+      frame_settings->values.cparams.center_y = static_cast<size_t>(value);
+      return JXL_ENC_SUCCESS;
+    case JXL_ENC_FRAME_SETTING_RESPONSIVE:
+      frame_settings->values.cparams.responsive = value;
+      return JXL_ENC_SUCCESS;
+    case JXL_ENC_FRAME_SETTING_PROGRESSIVE_AC:
+      frame_settings->values.cparams.progressive_mode = value;
+      return JXL_ENC_SUCCESS;
+    case JXL_ENC_FRAME_SETTING_QPROGRESSIVE_AC:
+      frame_settings->values.cparams.qprogressive_mode = value;
+      return JXL_ENC_SUCCESS;
+    case JXL_ENC_FRAME_SETTING_PROGRESSIVE_DC:
+      if (value < -1 || value > 2) {
+        return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+                             "Progressive DC has to be in [-1..2]");
+      }
+      frame_settings->values.cparams.progressive_dc = value;
+      return JXL_ENC_SUCCESS;
+    case JXL_ENC_FRAME_SETTING_PALETTE_COLORS:
+      if (value < -1 || value > 70913) {
+        return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+                             "Option value has to be in [-1..70913]");
+      }
+      if (value == -1) {
+        frame_settings->values.cparams.palette_colors = 1 << 10;
+      } else {
+        frame_settings->values.cparams.palette_colors = value;
+      }
+      return JXL_ENC_SUCCESS;
+    case JXL_ENC_FRAME_SETTING_LOSSY_PALETTE:
+      // TODO(lode): the defaults of some palette settings depend on others.
+      // See the logic in cjxl. Similar for other settings. This should be
+      // handled in the encoder during JxlEncoderProcessOutput (or,
+      // alternatively, in the cjxl binary like now)
+      frame_settings->values.cparams.lossy_palette = (value == 1);
+      return JXL_ENC_SUCCESS;
+    case JXL_ENC_FRAME_SETTING_COLOR_TRANSFORM:
+      if (value < -1 || value > 2) {
+        return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+                             "Option value has to be in [-1..2]");
+      }
+      if (value == -1) {
+        frame_settings->values.cparams.color_transform =
+            jxl::ColorTransform::kXYB;
+      } else {
+        frame_settings->values.cparams.color_transform =
+            static_cast<jxl::ColorTransform>(value);
+      }
+      return JXL_ENC_SUCCESS;
+    case JXL_ENC_FRAME_SETTING_MODULAR_COLOR_SPACE:
+      if (value < -1 || value > 41) {
+        return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+                             "Option value has to be in [-1..41]");
+      }
+      frame_settings->values.cparams.colorspace = value;
+      return JXL_ENC_SUCCESS;
+    case JXL_ENC_FRAME_SETTING_MODULAR_GROUP_SIZE:
+      if (value < -1 || value > 3) {
+        return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+                             "Option value has to be in [-1..3]");
+      }
+      // TODO(lode): the default behavior of this parameter for cjxl is
+      // to choose 1 or 2 depending on the situation. This behavior needs to be
+      // implemented either in the C++ library by allowing to set this to -1, or
+      // kept in cjxl and set it to 1 or 2 using this API.
+      if (value == -1) {
+        frame_settings->values.cparams.modular_group_size_shift = 1;
+      } else {
+        frame_settings->values.cparams.modular_group_size_shift = value;
+      }
+      return JXL_ENC_SUCCESS;
+    case JXL_ENC_FRAME_SETTING_MODULAR_PREDICTOR:
+      if (value < -1 || value > 15) {
+        return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+                             "Option value has to be in [-1..15]");
+      }
+      frame_settings->values.cparams.options.predictor =
+          static_cast<jxl::Predictor>(value);
+      return JXL_ENC_SUCCESS;
+    case JXL_ENC_FRAME_SETTING_MODULAR_NB_PREV_CHANNELS:
+      // The max allowed value can in theory be higher. However, it depends on
+      // the effort setting. 11 is the highest safe value that doesn't cause
+      // tree_samples to be >= 64 in the encoder. The specification may allow
+      // more than this. With more fine tuning higher values could be allowed.
+      // For N-channel images, the largest useful value is N-1.
+      if (value < -1 || value > 11) {
+        return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+                             "Option value has to be in [-1..11]");
+      }
+      if (value == -1) {
+        frame_settings->values.cparams.options.max_properties = 0;
+      } else {
+        frame_settings->values.cparams.options.max_properties = value;
+      }
+      return JXL_ENC_SUCCESS;
+    case JXL_ENC_FRAME_SETTING_JPEG_RECON_CFL:
+      if (value == -1) {
+        frame_settings->values.cparams.force_cfl_jpeg_recompression = true;
+      } else {
+        frame_settings->values.cparams.force_cfl_jpeg_recompression = value;
+      }
+      return JXL_ENC_SUCCESS;
+    case JXL_ENC_FRAME_INDEX_BOX:
+      frame_settings->values.frame_index_box = true;
+      return JXL_ENC_SUCCESS;
+    case JXL_ENC_FRAME_SETTING_PHOTON_NOISE:
+      return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_NOT_SUPPORTED,
+                           "Float option, try setting it with "
+                           "JxlEncoderFrameSettingsSetFloatOption");
+    case JXL_ENC_FRAME_SETTING_JPEG_COMPRESS_BOXES:
+      frame_settings->values.cparams.jpeg_compress_boxes = value;
+      return JXL_ENC_SUCCESS;
+    default:
+      return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_NOT_SUPPORTED,
+                           "Unknown option");
+  }
+}
+
+JxlEncoderStatus JxlEncoderFrameSettingsSetFloatOption(
+    JxlEncoderFrameSettings* frame_settings, JxlEncoderFrameSettingId option,
+    float value) {
+  switch (option) {
+    case JXL_ENC_FRAME_SETTING_PHOTON_NOISE:
+      if (value < 0) return JXL_ENC_ERROR;
+      // TODO(lode): add encoder setting to set the 8 floating point values of
+      // the noise synthesis parameters per frame for more fine grained control.
+      frame_settings->values.cparams.photon_noise_iso = value;
+      return JXL_ENC_SUCCESS;
+    case JXL_ENC_FRAME_SETTING_MODULAR_MA_TREE_LEARNING_PERCENT:
+      if (value < -1.f || value > 100.f) {
+        return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+                             "Option value has to be smaller than 100");
+      }
+      // This value is called "iterations" or "nb_repeats" in cjxl, but is in
+      // fact a fraction in range 0.0-1.0, with the default value 0.5.
+      // Convert from floating point percentage to floating point fraction here.
+      if (value < -.5f) {
+        // TODO(lode): for this and many other settings (also in
+        // JxlEncoderFrameSettingsSetOption), avoid duplicating the default
+        // values here and in enc_params.h and options.h, have one location
+        // where the defaults are specified.
+        frame_settings->values.cparams.options.nb_repeats = 0.5f;
+      } else {
+        frame_settings->values.cparams.options.nb_repeats = value * 0.01f;
+      }
+      return JXL_ENC_SUCCESS;
+    case JXL_ENC_FRAME_SETTING_CHANNEL_COLORS_GLOBAL_PERCENT:
+      if (value < -1.f || value > 100.f) {
+        return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+                             "Option value has to be in [-1..100]");
+      }
+      if (value < -.5f) {
+        frame_settings->values.cparams.channel_colors_pre_transform_percent =
+            95.0f;
+      } else {
+        frame_settings->values.cparams.channel_colors_pre_transform_percent =
+            value;
+      }
+      return JXL_ENC_SUCCESS;
+    case JXL_ENC_FRAME_SETTING_CHANNEL_COLORS_GROUP_PERCENT:
+      if (value < -1.f || value > 100.f) {
+        return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+                             "Option value has to be in [-1..100]");
+      }
+      if (value < -.5f) {
+        frame_settings->values.cparams.channel_colors_percent = 80.0f;
+      } else {
+        frame_settings->values.cparams.channel_colors_percent = value;
+      }
+      return JXL_ENC_SUCCESS;
+    case JXL_ENC_FRAME_SETTING_EFFORT:
+    case JXL_ENC_FRAME_SETTING_DECODING_SPEED:
+    case JXL_ENC_FRAME_SETTING_RESAMPLING:
+    case JXL_ENC_FRAME_SETTING_EXTRA_CHANNEL_RESAMPLING:
+    case JXL_ENC_FRAME_SETTING_ALREADY_DOWNSAMPLED:
+    case JXL_ENC_FRAME_SETTING_NOISE:
+    case JXL_ENC_FRAME_SETTING_DOTS:
+    case JXL_ENC_FRAME_SETTING_PATCHES:
+    case JXL_ENC_FRAME_SETTING_EPF:
+    case JXL_ENC_FRAME_SETTING_GABORISH:
+    case JXL_ENC_FRAME_SETTING_MODULAR:
+    case JXL_ENC_FRAME_SETTING_KEEP_INVISIBLE:
+    case JXL_ENC_FRAME_SETTING_GROUP_ORDER:
+    case JXL_ENC_FRAME_SETTING_GROUP_ORDER_CENTER_X:
+    case JXL_ENC_FRAME_SETTING_GROUP_ORDER_CENTER_Y:
+    case JXL_ENC_FRAME_SETTING_RESPONSIVE:
+    case JXL_ENC_FRAME_SETTING_PROGRESSIVE_AC:
+    case JXL_ENC_FRAME_SETTING_QPROGRESSIVE_AC:
+    case JXL_ENC_FRAME_SETTING_PROGRESSIVE_DC:
+    case JXL_ENC_FRAME_SETTING_PALETTE_COLORS:
+    case JXL_ENC_FRAME_SETTING_LOSSY_PALETTE:
+    case JXL_ENC_FRAME_SETTING_COLOR_TRANSFORM:
+    case JXL_ENC_FRAME_SETTING_MODULAR_COLOR_SPACE:
+    case JXL_ENC_FRAME_SETTING_MODULAR_GROUP_SIZE:
+    case JXL_ENC_FRAME_SETTING_MODULAR_PREDICTOR:
+    case JXL_ENC_FRAME_SETTING_MODULAR_NB_PREV_CHANNELS:
+    case JXL_ENC_FRAME_SETTING_JPEG_RECON_CFL:
+    case JXL_ENC_FRAME_INDEX_BOX:
+    case JXL_ENC_FRAME_SETTING_BROTLI_EFFORT:
+    case JXL_ENC_FRAME_SETTING_FILL_ENUM:
+    case JXL_ENC_FRAME_SETTING_JPEG_COMPRESS_BOXES:
+      return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_NOT_SUPPORTED,
+                           "Int option, try setting it with "
+                           "JxlEncoderFrameSettingsSetOption");
+    default:
+      return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_NOT_SUPPORTED,
+                           "Unknown option");
+  }
+}
+JxlEncoder* JxlEncoderCreate(const JxlMemoryManager* memory_manager) {
+  JxlMemoryManager local_memory_manager;
+  if (!jxl::MemoryManagerInit(&local_memory_manager, memory_manager)) {
+    return nullptr;
+  }
+
+  void* alloc =
+      jxl::MemoryManagerAlloc(&local_memory_manager, sizeof(JxlEncoder));
+  if (!alloc) return nullptr;
+  JxlEncoder* enc = new (alloc) JxlEncoder();
+  enc->memory_manager = local_memory_manager;
+  // TODO(sboukortt): add an API function to set this.
+  enc->cms = jxl::GetJxlCms();
+
+  // Initialize all the field values.
+  JxlEncoderReset(enc);
+
+  return enc;
+}
+
+void JxlEncoderReset(JxlEncoder* enc) {
+  enc->thread_pool.reset();
+  enc->input_queue.clear();
+  enc->num_queued_frames = 0;
+  enc->num_queued_boxes = 0;
+  enc->encoder_options.clear();
+  enc->output_byte_queue.clear();
+  enc->output_fast_frame_queue.clear();
+  enc->codestream_bytes_written_beginning_of_frame = 0;
+  enc->codestream_bytes_written_end_of_frame = 0;
+  enc->wrote_bytes = false;
+  enc->jxlp_counter = 0;
+  enc->metadata = jxl::CodecMetadata();
+  enc->last_used_cparams = jxl::CompressParams();
+  enc->frames_closed = false;
+  enc->boxes_closed = false;
+  enc->basic_info_set = false;
+  enc->color_encoding_set = false;
+  enc->intensity_target_set = false;
+  enc->use_container = false;
+  enc->use_boxes = false;
+  enc->codestream_level = -1;
+  JxlEncoderInitBasicInfo(&enc->basic_info);
+}
+
+void JxlEncoderDestroy(JxlEncoder* enc) {
+  if (enc) {
+    JxlMemoryManager local_memory_manager = enc->memory_manager;
+    // Call destructor directly since custom free function is used.
+    enc->~JxlEncoder();
+    jxl::MemoryManagerFree(&local_memory_manager, enc);
+  }
+}
+
+JxlEncoderError JxlEncoderGetError(JxlEncoder* enc) { return enc->error; }
+
+JxlEncoderStatus JxlEncoderUseContainer(JxlEncoder* enc,
+                                        JXL_BOOL use_container) {
+  if (enc->wrote_bytes) {
+    return JXL_API_ERROR(enc, JXL_ENC_ERR_API_USAGE,
+                         "this setting can only be set at the beginning");
+  }
+  enc->use_container = static_cast<bool>(use_container);
+  return JXL_ENC_SUCCESS;
+}
+
+JxlEncoderStatus JxlEncoderStoreJPEGMetadata(JxlEncoder* enc,
+                                             JXL_BOOL store_jpeg_metadata) {
+  if (enc->wrote_bytes) {
+    return JXL_API_ERROR(enc, JXL_ENC_ERR_API_USAGE,
+                         "this setting can only be set at the beginning");
+  }
+  enc->store_jpeg_metadata = static_cast<bool>(store_jpeg_metadata);
+  return JXL_ENC_SUCCESS;
+}
+
+JxlEncoderStatus JxlEncoderSetCodestreamLevel(JxlEncoder* enc, int level) {
+  if (level != -1 && level != 5 && level != 10) {
+    return JXL_API_ERROR(enc, JXL_ENC_ERR_NOT_SUPPORTED, "invalid level");
+  }
+  if (enc->wrote_bytes) {
+    return JXL_API_ERROR(enc, JXL_ENC_ERR_API_USAGE,
+                         "this setting can only be set at the beginning");
+  }
+  enc->codestream_level = level;
+  return JXL_ENC_SUCCESS;
+}
+
+int JxlEncoderGetRequiredCodestreamLevel(const JxlEncoder* enc) {
+  return VerifyLevelSettings(enc, nullptr);
+}
+
+void JxlEncoderSetCms(JxlEncoder* enc, JxlCmsInterface cms) {
+  jxl::msan::MemoryIsInitialized(&cms, sizeof(cms));
+  enc->cms = cms;
+}
+
+JxlEncoderStatus JxlEncoderSetParallelRunner(JxlEncoder* enc,
+                                             JxlParallelRunner parallel_runner,
+                                             void* parallel_runner_opaque) {
+  if (enc->thread_pool) {
+    return JXL_API_ERROR(enc, JXL_ENC_ERR_API_USAGE,
+                         "parallel runner already set");
+  }
+  enc->thread_pool = jxl::MemoryManagerMakeUnique<jxl::ThreadPool>(
+      &enc->memory_manager, parallel_runner, parallel_runner_opaque);
+  if (!enc->thread_pool) {
+    return JXL_API_ERROR(enc, JXL_ENC_ERR_GENERIC,
+                         "error setting parallel runner");
+  }
+  return JXL_ENC_SUCCESS;
+}
+
+namespace {
+JxlEncoderStatus GetCurrentDimensions(
+    const JxlEncoderFrameSettings* frame_settings, size_t& xsize,
+    size_t& ysize) {
+  xsize = frame_settings->enc->metadata.xsize();
+  ysize = frame_settings->enc->metadata.ysize();
+  if (frame_settings->values.header.layer_info.have_crop) {
+    xsize = frame_settings->values.header.layer_info.xsize;
+    ysize = frame_settings->values.header.layer_info.ysize;
+  }
+  if (frame_settings->values.cparams.already_downsampled) {
+    size_t factor = frame_settings->values.cparams.resampling;
+    xsize = jxl::DivCeil(xsize, factor);
+    ysize = jxl::DivCeil(ysize, factor);
+  }
+  if (xsize == 0 || ysize == 0) {
+    return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+                         "zero-sized frame is not allowed");
+  }
+  return JXL_ENC_SUCCESS;
+}
+}  // namespace
+
+JxlEncoderStatus JxlEncoderAddJPEGFrame(
+    const JxlEncoderFrameSettings* frame_settings, const uint8_t* buffer,
+    size_t size) {
+  if (frame_settings->enc->frames_closed) {
+    return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+                         "Frame input is already closed");
+  }
+
+  jxl::CodecInOut io;
+  if (!jxl::jpeg::DecodeImageJPG(jxl::Span<const uint8_t>(buffer, size), &io)) {
+    return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_BAD_INPUT,
+                         "Error during decode of input JPEG");
+  }
+
+  if (!frame_settings->enc->color_encoding_set) {
+    if (!SetColorEncodingFromJpegData(
+            *io.Main().jpeg_data,
+            &frame_settings->enc->metadata.m.color_encoding)) {
+      return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_BAD_INPUT,
+                           "Error in input JPEG color space");
+    }
+  }
+
+  if (!frame_settings->enc->basic_info_set) {
+    JxlBasicInfo basic_info;
+    JxlEncoderInitBasicInfo(&basic_info);
+    basic_info.xsize = io.Main().jpeg_data->width;
+    basic_info.ysize = io.Main().jpeg_data->height;
+    basic_info.uses_original_profile = true;
+    if (JxlEncoderSetBasicInfo(frame_settings->enc, &basic_info) !=
+        JXL_ENC_SUCCESS) {
+      return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_GENERIC,
+                           "Error setting basic info");
+    }
+  }
+
+  if (frame_settings->enc->metadata.m.xyb_encoded) {
+    return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+                         "Can't XYB encode a lossless JPEG");
+  }
+  if (!io.blobs.exif.empty()) {
+    JxlOrientation orientation = static_cast<JxlOrientation>(
+        frame_settings->enc->metadata.m.orientation);
+    jxl::InterpretExif(io.blobs.exif, &orientation);
+    frame_settings->enc->metadata.m.orientation = orientation;
+
+    size_t exif_size = io.blobs.exif.size();
+    // Exif data in JPEG is limited to 64k
+    if (exif_size > 0xFFFF) {
+      return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_GENERIC,
+                           "Exif larger than possible in JPEG?");
+    }
+    exif_size += 4;  // prefix 4 zero bytes for tiff offset
+    std::vector<uint8_t> exif(exif_size);
+    memcpy(exif.data() + 4, io.blobs.exif.data(), io.blobs.exif.size());
+    JxlEncoderUseBoxes(frame_settings->enc);
+    JxlEncoderAddBox(frame_settings->enc, "Exif", exif.data(), exif_size,
+                     frame_settings->values.cparams.jpeg_compress_boxes);
+  }
+  if (!io.blobs.xmp.empty()) {
+    JxlEncoderUseBoxes(frame_settings->enc);
+    JxlEncoderAddBox(frame_settings->enc, "xml ", io.blobs.xmp.data(),
+                     io.blobs.xmp.size(),
+                     frame_settings->values.cparams.jpeg_compress_boxes);
+  }
+  if (!io.blobs.jumbf.empty()) {
+    JxlEncoderUseBoxes(frame_settings->enc);
+    JxlEncoderAddBox(frame_settings->enc, "jumb", io.blobs.jumbf.data(),
+                     io.blobs.jumbf.size(),
+                     frame_settings->values.cparams.jpeg_compress_boxes);
+  }
+  if (frame_settings->enc->store_jpeg_metadata) {
+    jxl::jpeg::JPEGData data_in = *io.Main().jpeg_data;
+    jxl::PaddedBytes jpeg_data;
+    if (!jxl::jpeg::EncodeJPEGData(data_in, &jpeg_data,
+                                   frame_settings->values.cparams)) {
+      return JXL_API_ERROR(
+          frame_settings->enc, JXL_ENC_ERR_JBRD,
+          "JPEG bitstream reconstruction data cannot be encoded");
+    }
+    frame_settings->enc->jpeg_metadata = std::vector<uint8_t>(
+        jpeg_data.data(), jpeg_data.data() + jpeg_data.size());
+  }
+
+  auto queued_frame = jxl::MemoryManagerMakeUnique<jxl::JxlEncoderQueuedFrame>(
+      &frame_settings->enc->memory_manager,
+      // JxlEncoderQueuedFrame is a struct with no constructors, so we use the
+      // default move constructor there.
+      jxl::JxlEncoderQueuedFrame{
+          frame_settings->values,
+          jxl::ImageBundle(&frame_settings->enc->metadata.m),
+          {}});
+  if (!queued_frame) {
+    // TODO(jon): when can this happen? is this an API usage error?
+    return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_GENERIC,
+                         "No frame queued?");
+  }
+  queued_frame->frame.SetFromImage(std::move(*io.Main().color()),
+                                   io.Main().c_current());
+  size_t xsize, ysize;
+  if (GetCurrentDimensions(frame_settings, xsize, ysize) != JXL_ENC_SUCCESS) {
+    return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_GENERIC,
+                         "bad dimensions");
+  }
+  if (xsize != static_cast<size_t>(io.Main().jpeg_data->width) ||
+      ysize != static_cast<size_t>(io.Main().jpeg_data->height)) {
+    return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_GENERIC,
+                         "JPEG dimensions don't match frame dimensions");
+  }
+  std::vector<jxl::ImageF> extra_channels(
+      frame_settings->enc->metadata.m.num_extra_channels);
+  for (auto& extra_channel : extra_channels) {
+    extra_channel = jxl::ImageF(xsize, ysize);
+    queued_frame->ec_initialized.push_back(0);
+  }
+  queued_frame->frame.SetExtraChannels(std::move(extra_channels));
+  queued_frame->frame.jpeg_data = std::move(io.Main().jpeg_data);
+  queued_frame->frame.color_transform = io.Main().color_transform;
+  queued_frame->frame.chroma_subsampling = io.Main().chroma_subsampling;
+
+  QueueFrame(frame_settings, queued_frame);
+  return JXL_ENC_SUCCESS;
+}
+
+static bool CanDoFastLossless(const JxlEncoderFrameSettings* frame_settings,
+                              const JxlPixelFormat* pixel_format,
+                              bool has_alpha) {
+  if (!frame_settings->values.lossless) {
+    return false;
+  }
+  // TODO(veluca): many of the following options could be made to work, but are
+  // just not implemented in FJXL's frame header handling yet.
+  if (frame_settings->values.frame_index_box) {
+    return false;
+  }
+  if (frame_settings->values.header.layer_info.have_crop) {
+    return false;
+  }
+  if (frame_settings->enc->metadata.m.have_animation) {
+    return false;
+  }
+  if (frame_settings->values.cparams.speed_tier != jxl::SpeedTier::kLightning) {
+    return false;
+  }
+  if (frame_settings->values.image_bit_depth.type ==
+          JxlBitDepthType::JXL_BIT_DEPTH_CUSTOM &&
+      frame_settings->values.image_bit_depth.bits_per_sample !=
+          frame_settings->enc->metadata.m.bit_depth.bits_per_sample) {
+    return false;
+  }
+  // TODO(veluca): implement support for LSB-padded input in fast_lossless.
+  if (frame_settings->values.image_bit_depth.type ==
+          JxlBitDepthType::JXL_BIT_DEPTH_FROM_PIXEL_FORMAT &&
+      frame_settings->values.image_bit_depth.bits_per_sample % 8 != 0) {
+    return false;
+  }
+  if (!frame_settings->values.frame_name.empty()) {
+    return false;
+  }
+  // No extra channels other than alpha.
+  if (!(has_alpha && frame_settings->enc->metadata.m.num_extra_channels == 1) &&
+      frame_settings->enc->metadata.m.num_extra_channels != 0) {
+    return false;
+  }
+  if (frame_settings->enc->metadata.m.bit_depth.bits_per_sample > 16) {
+    return false;
+  }
+  if (pixel_format->data_type != JxlDataType::JXL_TYPE_FLOAT16 &&
+      pixel_format->data_type != JxlDataType::JXL_TYPE_UINT16 &&
+      pixel_format->data_type != JxlDataType::JXL_TYPE_UINT8) {
+    return false;
+  }
+  if ((frame_settings->enc->metadata.m.bit_depth.bits_per_sample > 8) !=
+      (pixel_format->data_type == JxlDataType::JXL_TYPE_UINT16 ||
+       pixel_format->data_type == JxlDataType::JXL_TYPE_FLOAT16)) {
+    return false;
+  }
+  if (!((pixel_format->num_channels == 1 || pixel_format->num_channels == 3) &&
+        !has_alpha) &&
+      !((pixel_format->num_channels == 2 || pixel_format->num_channels == 4) &&
+        has_alpha)) {
+    return false;
+  }
+
+  return true;
+}
+
+JxlEncoderStatus JxlEncoderAddImageFrame(
+    const JxlEncoderFrameSettings* frame_settings,
+    const JxlPixelFormat* pixel_format, const void* buffer, size_t size) {
+  if (!frame_settings->enc->basic_info_set ||
+      (!frame_settings->enc->color_encoding_set &&
+       !frame_settings->enc->metadata.m.xyb_encoded)) {
+    // Basic Info must be set, and color encoding must be set directly,
+    // or set to XYB via JxlBasicInfo.uses_original_profile = JXL_FALSE
+    // Otherwise, this is an API misuse.
+    return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+                         "Basic info or color encoding not set yet");
+  }
+
+  if (frame_settings->enc->frames_closed) {
+    return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+                         "Frame input already closed");
+  }
+  if (pixel_format->num_channels < 3) {
+    if (frame_settings->enc->basic_info.num_color_channels != 1) {
+      return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+                           "Grayscale pixel format input for an RGB image");
+    }
+  } else {
+    if (frame_settings->enc->basic_info.num_color_channels != 3) {
+      return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+                           "RGB pixel format input for a grayscale image");
+    }
+  }
+
+  bool has_alpha = frame_settings->enc->metadata.m.HasAlpha();
+
+  size_t xsize, ysize;
+  if (GetCurrentDimensions(frame_settings, xsize, ysize) != JXL_ENC_SUCCESS) {
+    return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_GENERIC,
+                         "bad dimensions");
+  }
+
+  // All required conditions to do fast-lossless.
+  if (CanDoFastLossless(frame_settings, pixel_format, has_alpha)) {
+    const size_t bytes_per_pixel =
+        pixel_format->data_type == JxlDataType::JXL_TYPE_UINT8
+            ? pixel_format->num_channels
+            : pixel_format->num_channels * 2;
+    const size_t last_row_size = xsize * bytes_per_pixel;
+    const size_t align = pixel_format->align;
+    const size_t row_size =
+        (align > 1 ? jxl::DivCeil(last_row_size, align) * align
+                   : last_row_size);
+    const size_t bytes_to_read = row_size * (ysize - 1) + last_row_size;
+    if (bytes_to_read > size) {
+      return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+                           "provided image buffer too small");
+    }
+    const bool big_endian =
+        pixel_format->endianness == JXL_BIG_ENDIAN ||
+        (pixel_format->endianness == JXL_NATIVE_ENDIAN && !IsLittleEndian());
+
+    auto runner = +[](void* void_pool, void* opaque, void fun(void*, size_t),
+                      size_t count) {
+      auto* pool = reinterpret_cast<jxl::ThreadPool*>(void_pool);
+      JXL_CHECK(jxl::RunOnPool(
+          pool, 0, count, jxl::ThreadPool::NoInit,
+          [&](size_t i, size_t) { fun(opaque, i); }, "Encode fast lossless"));
+    };
+    QueueFastLosslessFrame(
+        frame_settings,
+        JxlFastLosslessPrepareFrame(
+            reinterpret_cast<const unsigned char*>(buffer), xsize, row_size,
+            ysize, pixel_format->num_channels,
+            frame_settings->enc->metadata.m.bit_depth.bits_per_sample,
+            big_endian, /*effort=*/2, frame_settings->enc->thread_pool.get(),
+            runner));
+    return JXL_ENC_SUCCESS;
+  }
+
+  auto queued_frame = jxl::MemoryManagerMakeUnique<jxl::JxlEncoderQueuedFrame>(
+      &frame_settings->enc->memory_manager,
+      // JxlEncoderQueuedFrame is a struct with no constructors, so we use the
+      // default move constructor there.
+      jxl::JxlEncoderQueuedFrame{
+          frame_settings->values,
+          jxl::ImageBundle(&frame_settings->enc->metadata.m),
+          {}});
+
+  if (!queued_frame) {
+    // TODO(jon): when can this happen? is this an API usage error?
+    return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_GENERIC,
+                         "No frame queued?");
+  }
+
+  jxl::ColorEncoding c_current;
+  if (!frame_settings->enc->color_encoding_set) {
+    if ((pixel_format->data_type == JXL_TYPE_FLOAT) ||
+        (pixel_format->data_type == JXL_TYPE_FLOAT16)) {
+      c_current =
+          jxl::ColorEncoding::LinearSRGB(pixel_format->num_channels < 3);
+    } else {
+      c_current = jxl::ColorEncoding::SRGB(pixel_format->num_channels < 3);
+    }
+  } else {
+    c_current = frame_settings->enc->metadata.m.color_encoding;
+  }
+  uint32_t num_channels = pixel_format->num_channels;
+  size_t has_interleaved_alpha =
+      static_cast<size_t>(num_channels == 2 || num_channels == 4);
+  if (has_interleaved_alpha >
+      frame_settings->enc->metadata.m.num_extra_channels) {
+    return JXL_API_ERROR(
+        frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+        "number of extra channels mismatch (need 1 extra channel for alpha)");
+  }
+  std::vector<jxl::ImageF> extra_channels(
+      frame_settings->enc->metadata.m.num_extra_channels);
+  for (auto& extra_channel : extra_channels) {
+    extra_channel = jxl::ImageF(xsize, ysize);
+  }
+  queued_frame->frame.SetExtraChannels(std::move(extra_channels));
+  for (auto& ec_info : frame_settings->enc->metadata.m.extra_channel_info) {
+    if (has_interleaved_alpha && ec_info.type == jxl::ExtraChannel::kAlpha) {
+      queued_frame->ec_initialized.push_back(1);
+      has_interleaved_alpha = 0;  // only first Alpha is initialized
+    } else {
+      queued_frame->ec_initialized.push_back(0);
+    }
+  }
+  queued_frame->frame.origin.x0 =
+      frame_settings->values.header.layer_info.crop_x0;
+  queued_frame->frame.origin.y0 =
+      frame_settings->values.header.layer_info.crop_y0;
+  queued_frame->frame.use_for_next_frame =
+      (frame_settings->values.header.layer_info.save_as_reference != 0u);
+  queued_frame->frame.blendmode =
+      frame_settings->values.header.layer_info.blend_info.blendmode ==
+              JXL_BLEND_REPLACE
+          ? jxl::BlendMode::kReplace
+          : jxl::BlendMode::kBlend;
+  queued_frame->frame.blend =
+      frame_settings->values.header.layer_info.blend_info.source > 0;
+
+  if (JXL_ENC_SUCCESS !=
+      VerifyInputBitDepth(frame_settings->values.image_bit_depth,
+                          *pixel_format)) {
+    return JXL_API_ERROR_NOSET("Invalid input bit depth");
+  }
+  size_t bits_per_sample =
+      GetBitDepth(frame_settings->values.image_bit_depth,
+                  frame_settings->enc->metadata.m, *pixel_format);
+  const uint8_t* uint8_buffer = reinterpret_cast<const uint8_t*>(buffer);
+  if (!jxl::ConvertFromExternal(
+          jxl::Span<const uint8_t>(uint8_buffer, size), xsize, ysize, c_current,
+          bits_per_sample, *pixel_format,
+          frame_settings->enc->thread_pool.get(), &(queued_frame->frame))) {
+    return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+                         "Invalid input buffer");
+  }
+  if (frame_settings->values.lossless &&
+      frame_settings->enc->metadata.m.xyb_encoded) {
+    return JXL_API_ERROR(
+        frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+        "Set uses_original_profile=true for lossless encoding");
+  }
+  queued_frame->option_values.cparams.level =
+      frame_settings->enc->codestream_level;
+
+  QueueFrame(frame_settings, queued_frame);
+  return JXL_ENC_SUCCESS;
+}
+
+JxlEncoderStatus JxlEncoderUseBoxes(JxlEncoder* enc) {
+  if (enc->wrote_bytes) {
+    return JXL_API_ERROR(enc, JXL_ENC_ERR_API_USAGE,
+                         "this setting can only be set at the beginning");
+  }
+  enc->use_boxes = true;
+  return JXL_ENC_SUCCESS;
+}
+
+JxlEncoderStatus JxlEncoderAddBox(JxlEncoder* enc, const JxlBoxType type,
+                                  const uint8_t* contents, size_t size,
+                                  JXL_BOOL compress_box) {
+  if (!enc->use_boxes) {
+    return JXL_API_ERROR(
+        enc, JXL_ENC_ERR_API_USAGE,
+        "must set JxlEncoderUseBoxes at the beginning to add boxes");
+  }
+  if (compress_box) {
+    if (memcmp("jxl", type, 3) == 0) {
+      return JXL_API_ERROR(
+          enc, JXL_ENC_ERR_API_USAGE,
+          "brob box may not contain a type starting with \"jxl\"");
+    }
+    if (memcmp("jbrd", type, 4) == 0) {
+      return JXL_API_ERROR(enc, JXL_ENC_ERR_API_USAGE,
+                           "jbrd box may not be brob compressed");
+    }
+    if (memcmp("brob", type, 4) == 0) {
+      // The compress_box will compress an existing non-brob box into a brob
+      // box. If already giving a valid brotli-compressed brob box, set
+      // compress_box to false since it is already compressed.
+      return JXL_API_ERROR(enc, JXL_ENC_ERR_API_USAGE,
+                           "a brob box cannot contain another brob box");
+    }
+  }
+
+  auto box = jxl::MemoryManagerMakeUnique<jxl::JxlEncoderQueuedBox>(
+      &enc->memory_manager);
+
+  box->type = jxl::MakeBoxType(type);
+  box->contents.assign(contents, contents + size);
+  box->compress_box = !!compress_box;
+  QueueBox(enc, box);
+  return JXL_ENC_SUCCESS;
+}
+
+JXL_EXPORT JxlEncoderStatus JxlEncoderSetExtraChannelBuffer(
+    const JxlEncoderOptions* frame_settings, const JxlPixelFormat* pixel_format,
+    const void* buffer, size_t size, uint32_t index) {
+  if (index >= frame_settings->enc->metadata.m.num_extra_channels) {
+    return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+                         "Invalid value for the index of extra channel");
+  }
+  if (!frame_settings->enc->basic_info_set ||
+      !frame_settings->enc->color_encoding_set) {
+    return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+                         "Basic info has to be set first");
+  }
+  if (frame_settings->enc->input_queue.empty()) {
+    return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+                         "First add image frame, then extra channels");
+  }
+  if (frame_settings->enc->frames_closed) {
+    return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+                         "Frame input already closed");
+  }
+  size_t xsize, ysize;
+  if (GetCurrentDimensions(frame_settings, xsize, ysize) != JXL_ENC_SUCCESS) {
+    return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_GENERIC,
+                         "bad dimensions");
+  }
+  JxlPixelFormat ec_format = *pixel_format;
+  ec_format.num_channels = 1;
+  if (JXL_ENC_SUCCESS !=
+      VerifyInputBitDepth(frame_settings->values.image_bit_depth, ec_format)) {
+    return JXL_API_ERROR_NOSET("Invalid input bit depth");
+  }
+  size_t bits_per_sample = GetBitDepth(
+      frame_settings->values.image_bit_depth,
+      frame_settings->enc->metadata.m.extra_channel_info[index], ec_format);
+  const uint8_t* uint8_buffer = reinterpret_cast<const uint8_t*>(buffer);
+  auto queued_frame = frame_settings->enc->input_queue.back().frame.get();
+  if (!jxl::ConvertFromExternal(jxl::Span<const uint8_t>(uint8_buffer, size),
+                                xsize, ysize, bits_per_sample, ec_format, 0,
+                                frame_settings->enc->thread_pool.get(),
+                                &queued_frame->frame.extra_channels()[index])) {
+    return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+                         "Failed to set buffer for extra channel");
+  }
+  queued_frame->ec_initialized[index] = 1;
+
+  return JXL_ENC_SUCCESS;
+}
+
+void JxlEncoderCloseFrames(JxlEncoder* enc) { enc->frames_closed = true; }
+
+void JxlEncoderCloseBoxes(JxlEncoder* enc) { enc->boxes_closed = true; }
+
+void JxlEncoderCloseInput(JxlEncoder* enc) {
+  JxlEncoderCloseFrames(enc);
+  JxlEncoderCloseBoxes(enc);
+}
+JxlEncoderStatus JxlEncoderProcessOutput(JxlEncoder* enc, uint8_t** next_out,
+                                         size_t* avail_out) {
+  while (*avail_out >= 32 &&
+         (!enc->output_byte_queue.empty() ||
+          !enc->output_fast_frame_queue.empty() || !enc->input_queue.empty())) {
+    if (!enc->output_byte_queue.empty()) {
+      size_t to_copy = std::min(*avail_out, enc->output_byte_queue.size());
+      std::copy_n(enc->output_byte_queue.begin(), to_copy, *next_out);
+      *next_out += to_copy;
+      *avail_out -= to_copy;
+      enc->output_byte_queue.erase(enc->output_byte_queue.begin(),
+                                   enc->output_byte_queue.begin() + to_copy);
+    } else if (!enc->output_fast_frame_queue.empty()) {
+      size_t count = JxlFastLosslessWriteOutput(
+          enc->output_fast_frame_queue.front().get(), *next_out, *avail_out);
+      *next_out += count;
+      *avail_out -= count;
+      if (count == 0) {
+        enc->output_fast_frame_queue.pop_front();
+      }
+
+    } else if (!enc->input_queue.empty()) {
+      if (enc->RefillOutputByteQueue() != JXL_ENC_SUCCESS) {
+        return JXL_ENC_ERROR;
+      }
+    }
+  }
+
+  if (!enc->output_byte_queue.empty() ||
+      !enc->output_fast_frame_queue.empty() || !enc->input_queue.empty()) {
+    return JXL_ENC_NEED_MORE_OUTPUT;
+  }
+  return JXL_ENC_SUCCESS;
+}
+
+JxlEncoderStatus JxlEncoderSetFrameHeader(JxlEncoderOptions* frame_settings,
+                                          const JxlFrameHeader* frame_header) {
+  if (frame_header->layer_info.blend_info.source > 3) {
+    return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+                         "invalid blending source index");
+  }
+  // If there are no extra channels, it's ok for the value to be 0.
+  if (frame_header->layer_info.blend_info.alpha != 0 &&
+      frame_header->layer_info.blend_info.alpha >=
+          frame_settings->enc->metadata.m.extra_channel_info.size()) {
+    return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+                         "alpha blend channel index out of bounds");
+  }
+
+  frame_settings->values.header = *frame_header;
+  // Setting the frame header resets the frame name, it must be set again with
+  // JxlEncoderSetFrameName if desired.
+  frame_settings->values.frame_name = "";
+
+  return JXL_ENC_SUCCESS;
+}
+
+JxlEncoderStatus JxlEncoderSetExtraChannelBlendInfo(
+    JxlEncoderOptions* frame_settings, size_t index,
+    const JxlBlendInfo* blend_info) {
+  if (index >= frame_settings->enc->metadata.m.num_extra_channels) {
+    return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+                         "Invalid value for the index of extra channel");
+  }
+
+  if (frame_settings->values.extra_channel_blend_info.size() !=
+      frame_settings->enc->metadata.m.num_extra_channels) {
+    JxlBlendInfo default_blend_info;
+    JxlEncoderInitBlendInfo(&default_blend_info);
+    frame_settings->values.extra_channel_blend_info.resize(
+        frame_settings->enc->metadata.m.num_extra_channels, default_blend_info);
+  }
+  frame_settings->values.extra_channel_blend_info[index] = *blend_info;
+  return JXL_ENC_SUCCESS;
+}
+
+JxlEncoderStatus JxlEncoderSetFrameName(JxlEncoderFrameSettings* frame_settings,
+                                        const char* frame_name) {
+  std::string str = frame_name ? frame_name : "";
+  if (str.size() > 1071) {
+    return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+                         "frame name can be max 1071 bytes long");
+  }
+  frame_settings->values.frame_name = str;
+  frame_settings->values.header.name_length = str.size();
+  return JXL_ENC_SUCCESS;
+}
+
+JxlEncoderStatus JxlEncoderSetFrameBitDepth(
+    JxlEncoderFrameSettings* frame_settings, const JxlBitDepth* bit_depth) {
+  if (bit_depth->type != JXL_BIT_DEPTH_FROM_PIXEL_FORMAT &&
+      bit_depth->type != JXL_BIT_DEPTH_FROM_CODESTREAM) {
+    return JXL_API_ERROR_NOSET(
+        "Only JXL_BIT_DEPTH_FROM_PIXEL_FORMAT and "
+        "JXL_BIT_DEPTH_FROM_CODESTREAM is implemented "
+        "for input buffers.");
+  }
+  frame_settings->values.image_bit_depth = *bit_depth;
+  return JXL_ENC_SUCCESS;
+}
+
+void JxlColorEncodingSetToSRGB(JxlColorEncoding* color_encoding,
+                               JXL_BOOL is_gray) {
+  ConvertInternalToExternalColorEncoding(jxl::ColorEncoding::SRGB(is_gray),
+                                         color_encoding);
+}
+
+void JxlColorEncodingSetToLinearSRGB(JxlColorEncoding* color_encoding,
+                                     JXL_BOOL is_gray) {
+  ConvertInternalToExternalColorEncoding(
+      jxl::ColorEncoding::LinearSRGB(is_gray), color_encoding);
+}
+
+void JxlEncoderAllowExpertOptions(JxlEncoder* enc) {
+  enc->allow_expert_options = true;
+}
diff --git a/third_party/jpeg-xl/lib/jxl/encode_internal.h b/third_party/jpeg-xl/lib/jxl/encode_internal.h
new file mode 100644
index 0000000000..7713c5cab6
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/encode_internal.h
@@ -0,0 +1,275 @@
+/* Copyright (c) the JPEG XL Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style
+ * license that can be found in the LICENSE file.
+ */
+
+#ifndef LIB_JXL_ENCODE_INTERNAL_H_
+#define LIB_JXL_ENCODE_INTERNAL_H_
+
+#include <jxl/encode.h>
+#include <jxl/memory_manager.h>
+#include <jxl/parallel_runner.h>
+#include <jxl/types.h>
+
+#include <deque>
+#include <vector>
+
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/enc_fast_lossless.h"
+#include "lib/jxl/enc_frame.h"
+#include "lib/jxl/memory_manager_internal.h"
+
+namespace jxl {
+
+/* Frame index box 'jxli' will start with Varint() for
+NF: has type Varint(): number of frames listed in the index.
+TNUM: has type u32: numerator of tick unit.
+TDEN: has type u32: denominator of tick unit. Value 0 means the file is
+ill-formed. per frame i listed: OFFi: has type Varint(): offset of start byte of
+this frame compared to start byte of previous frame from this index in the JPEG
+XL codestream. For the first frame, this is the offset from the first byte of
+the JPEG XL codestream. Ti: has type Varint(): duration in ticks between the
+start of this frame and the start of the next frame in the index. If this is the
+last frame in the index, this is the duration in ticks between the start of this
+frame and the end of the stream. A tick lasts TNUM / TDEN seconds. Fi: has type
+Varint(): amount of frames the next frame in the index occurs after this frame.
+If this is the last frame in the index, this is the amount of frames after this
+frame in the remainder of the stream. Only frames that are presented by the
+decoder are counted for this purpose, this excludes frames that are not intended
+for display but for compositing with other frames, such as frames that aren't
+the last frame with a duration of 0 ticks.
+
+All the frames listed in jxli are keyframes and the first frame is
+present in the list.
+There shall be either zero or one Frame Index boxes in a JPEG XL file.
+The offsets OFFi per frame are given as bytes in the codestream, not as
+bytes in the file format using the box structure. This means if JPEG XL Partial
+Codestream boxes are used, the offset is counted within the concatenated
+codestream, bytes from box headers or non-codestream boxes are not counted.
+*/
+
+typedef struct JxlEncoderFrameIndexBoxEntryStruct {
+  bool to_be_indexed;
+  uint32_t duration;
+  uint64_t OFFi;
+} JxlEncoderFrameIndexBoxEntry;
+
+typedef struct JxlEncoderFrameIndexBoxStruct {
+  // We always need to record the first frame entry, so presence of the
+  // first entry alone is not an indication if it was requested to be
+  // stored.
+  bool index_box_requested_through_api = false;
+
+  int64_t NF() const { return entries.size(); }
+  bool StoreFrameIndexBox() {
+    for (auto e : entries) {
+      if (e.to_be_indexed) {
+        return true;
+      }
+    }
+    return false;
+  }
+  int32_t TNUM = 1;
+  int32_t TDEN = 1000;
+
+  std::vector<JxlEncoderFrameIndexBoxEntry> entries;
+
+  // That way we can ensure that every index box will have the first frame.
+  // If the API user decides to mark it as an indexed frame, we call
+  // the AddFrame again, this time with requested.
+  void AddFrame(uint64_t OFFi, uint32_t duration, bool to_be_indexed) {
+    // We call AddFrame to every frame.
+    // Recording the first frame is required by the standard.
+    // Knowing the last frame is required, since the last indexed frame
+    // needs to know how many frames until the end.
+    // To be able to tell how many frames there are between each index
+    // entry we just record every frame here.
+    if (entries.size() == 1) {
+      if (OFFi == entries[0].OFFi) {
+        // API use for the first frame, let's clear the already recorded first
+        // frame.
+        entries.clear();
+      }
+    }
+    JxlEncoderFrameIndexBoxEntry e;
+    e.to_be_indexed = to_be_indexed;
+    e.OFFi = OFFi;
+    e.duration = duration;
+    entries.push_back(e);
+  }
+} JxlEncoderFrameIndexBox;
+
+// The encoder options (such as quality, compression speed, ...) for a single
+// frame, but not encoder-wide options such as box-related options.
+typedef struct JxlEncoderFrameSettingsValuesStruct {
+  // lossless is a separate setting from cparams because it is a combination
+  // setting that overrides multiple settings inside of cparams.
+  bool lossless;
+  CompressParams cparams;
+  JxlFrameHeader header;
+  std::vector<JxlBlendInfo> extra_channel_blend_info;
+  std::string frame_name;
+  JxlBitDepth image_bit_depth;
+  bool frame_index_box = false;
+} JxlEncoderFrameSettingsValues;
+
+typedef std::array<uint8_t, 4> BoxType;
+
+// Utility function that makes a BoxType from a string literal. The string must
+// have 4 characters, a 5th null termination character is optional.
+constexpr BoxType MakeBoxType(const char* type) {
+  return BoxType(
+      {{static_cast<uint8_t>(type[0]), static_cast<uint8_t>(type[1]),
+        static_cast<uint8_t>(type[2]), static_cast<uint8_t>(type[3])}});
+}
+
+constexpr unsigned char kContainerHeader[] = {
+    0,   0,   0, 0xc, 'J',  'X', 'L', ' ', 0xd, 0xa, 0x87,
+    0xa, 0,   0, 0,   0x14, 'f', 't', 'y', 'p', 'j', 'x',
+    'l', ' ', 0, 0,   0,    0,   'j', 'x', 'l', ' '};
+
+constexpr unsigned char kLevelBoxHeader[] = {0, 0, 0, 0x9, 'j', 'x', 'l', 'l'};
+
+struct JxlEncoderQueuedFrame {
+  JxlEncoderFrameSettingsValues option_values;
+  ImageBundle frame;
+  std::vector<uint8_t> ec_initialized;
+};
+
+struct JxlEncoderQueuedBox {
+  BoxType type;
+  std::vector<uint8_t> contents;
+  bool compress_box;
+};
+
+using FJXLFrameUniquePtr =
+    std::unique_ptr<JxlFastLosslessFrameState,
+                    decltype(&JxlFastLosslessFreeFrameState)>;
+
+// Either a frame, or a box, not both.
+// Can also be a FJXL frame.
+struct JxlEncoderQueuedInput {
+  explicit JxlEncoderQueuedInput(const JxlMemoryManager& memory_manager)
+      : frame(nullptr, jxl::MemoryManagerDeleteHelper(&memory_manager)),
+        box(nullptr, jxl::MemoryManagerDeleteHelper(&memory_manager)) {}
+  MemoryManagerUniquePtr<JxlEncoderQueuedFrame> frame;
+  MemoryManagerUniquePtr<JxlEncoderQueuedBox> box;
+  FJXLFrameUniquePtr fast_lossless_frame = {nullptr,
+                                            JxlFastLosslessFreeFrameState};
+};
+
+// Appends a JXL container box header with given type, size, and unbounded
+// properties to output.
+template <typename T>
+void AppendBoxHeader(const jxl::BoxType& type, size_t size, bool unbounded,
+                     T* output) {
+  uint64_t box_size = 0;
+  bool large_size = false;
+  if (!unbounded) {
+    box_size = size + 8;
+    if (box_size >= 0x100000000ull) {
+      large_size = true;
+    }
+  }
+
+  {
+    const uint64_t store = large_size ? 1 : box_size;
+    for (size_t i = 0; i < 4; i++) {
+      output->push_back(store >> (8 * (3 - i)) & 0xff);
+    }
+  }
+  for (size_t i = 0; i < 4; i++) {
+    output->push_back(type[i]);
+  }
+
+  if (large_size) {
+    for (size_t i = 0; i < 8; i++) {
+      output->push_back(box_size >> (8 * (7 - i)) & 0xff);
+    }
+  }
+}
+
+}  // namespace jxl
+
+// Internal use only struct, can only be initialized correctly by
+// JxlEncoderCreate.
+struct JxlEncoderStruct {
+  JxlEncoderError error = JxlEncoderError::JXL_ENC_ERR_OK;
+  JxlMemoryManager memory_manager;
+  jxl::MemoryManagerUniquePtr<jxl::ThreadPool> thread_pool{
+      nullptr, jxl::MemoryManagerDeleteHelper(&memory_manager)};
+  JxlCmsInterface cms;
+  std::vector<jxl::MemoryManagerUniquePtr<JxlEncoderFrameSettings>>
+      encoder_options;
+
+  size_t num_queued_frames;
+  size_t num_queued_boxes;
+  std::vector<jxl::JxlEncoderQueuedInput> input_queue;
+  std::deque<uint8_t> output_byte_queue;
+  std::deque<jxl::FJXLFrameUniquePtr> output_fast_frame_queue;
+
+  // How many codestream bytes have been written, i.e.,
+  // content of jxlc and jxlp boxes. Frame index box jxli
+  // requires position indices to point to codestream bytes,
+  // so we need to keep track of the total of flushed or queue
+  // codestream bytes. These bytes may be in a single jxlc box
+  // or across multiple jxlp boxes.
+  size_t codestream_bytes_written_beginning_of_frame;
+  size_t codestream_bytes_written_end_of_frame;
+  jxl::JxlEncoderFrameIndexBox frame_index_box;
+
+  // Force using the container even if not needed
+  bool use_container;
+  // User declared they will add metadata boxes
+  bool use_boxes;
+
+  // TODO(lode): move level into jxl::CompressParams since some C++
+  // implementation decisions should be based on it: level 10 allows more
+  // features to be used.
+  int32_t codestream_level;
+  bool store_jpeg_metadata;
+  jxl::CodecMetadata metadata;
+  std::vector<uint8_t> jpeg_metadata;
+
+  // Wrote any output at all, so wrote the data before the first user added
+  // frame or box, such as signature, basic info, ICC profile or jpeg
+  // reconstruction box.
+  bool wrote_bytes;
+  jxl::CompressParams last_used_cparams;
+  JxlBasicInfo basic_info;
+
+  // Encoder wrote a jxlp (partial codestream) box, so any next codestream
+  // parts must also be written in jxlp boxes, a single jxlc box cannot be
+  // used. The counter is used for the 4-byte jxlp box index header.
+  size_t jxlp_counter;
+
+  bool frames_closed;
+  bool boxes_closed;
+  bool basic_info_set;
+  bool color_encoding_set;
+  bool intensity_target_set;
+  bool allow_expert_options = false;
+  int brotli_effort = -1;
+
+  // Takes the first frame in the input_queue, encodes it, and appends
+  // the bytes to the output_byte_queue.
+  JxlEncoderStatus RefillOutputByteQueue();
+
+  bool MustUseContainer() const {
+    return use_container || (codestream_level != 5 && codestream_level != -1) ||
+           store_jpeg_metadata || use_boxes;
+  }
+
+  // Appends the bytes of a JXL box header with the provided type and size to
+  // the end of the output_byte_queue. If unbounded is true, the size won't be
+  // added to the header and the box will be assumed to continue until EOF.
+  void AppendBoxHeader(const jxl::BoxType& type, size_t size, bool unbounded);
+};
+
+struct JxlEncoderFrameSettingsStruct {
+  JxlEncoder* enc;
+  jxl::JxlEncoderFrameSettingsValues values;
+};
+
+#endif  // LIB_JXL_ENCODE_INTERNAL_H_
diff --git a/third_party/jpeg-xl/lib/jxl/encode_test.cc b/third_party/jpeg-xl/lib/jxl/encode_test.cc
new file mode 100644
index 0000000000..3f1d77fd62
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/encode_test.cc
@@ -0,0 +1,1405 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <jxl/decode.h>
+#include <jxl/decode_cxx.h>
+#include <jxl/encode.h>
+#include <jxl/encode_cxx.h>
+
+#include "lib/extras/codec.h"
+#include "lib/extras/dec/jxl.h"
+#include "lib/jxl/enc_butteraugli_pnorm.h"
+#include "lib/jxl/enc_color_management.h"
+#include "lib/jxl/encode_internal.h"
+#include "lib/jxl/jpeg/dec_jpeg_data.h"
+#include "lib/jxl/jpeg/dec_jpeg_data_writer.h"
+#include "lib/jxl/test_image.h"
+#include "lib/jxl/test_utils.h"
+#include "lib/jxl/testing.h"
+
+TEST(EncodeTest, AddFrameAfterCloseInputTest) {
+  JxlEncoderPtr enc = JxlEncoderMake(nullptr);
+  EXPECT_NE(nullptr, enc.get());
+
+  JxlEncoderCloseInput(enc.get());
+
+  size_t xsize = 64;
+  size_t ysize = 64;
+  JxlPixelFormat pixel_format = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+  std::vector<uint8_t> pixels = jxl::test::GetSomeTestImage(xsize, ysize, 4, 0);
+
+  jxl::CodecInOut input_io =
+      jxl::test::SomeTestImageToCodecInOut(pixels, 4, xsize, ysize);
+
+  JxlBasicInfo basic_info;
+  jxl::test::JxlBasicInfoSetFromPixelFormat(&basic_info, &pixel_format);
+  basic_info.xsize = xsize;
+  basic_info.ysize = ysize;
+  basic_info.uses_original_profile = false;
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetCodestreamLevel(enc.get(), 10));
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetBasicInfo(enc.get(), &basic_info));
+  JxlColorEncoding color_encoding;
+  JxlColorEncodingSetToSRGB(&color_encoding,
+                            /*is_gray=*/pixel_format.num_channels < 3);
+  EXPECT_EQ(JXL_ENC_SUCCESS,
+            JxlEncoderSetColorEncoding(enc.get(), &color_encoding));
+  JxlEncoderFrameSettings* frame_settings =
+      JxlEncoderFrameSettingsCreate(enc.get(), NULL);
+  EXPECT_EQ(JXL_ENC_ERROR,
+            JxlEncoderAddImageFrame(frame_settings, &pixel_format,
+                                    pixels.data(), pixels.size()));
+}
+
+TEST(EncodeTest, AddJPEGAfterCloseTest) {
+  JxlEncoderPtr enc = JxlEncoderMake(nullptr);
+  EXPECT_NE(nullptr, enc.get());
+
+  JxlEncoderCloseInput(enc.get());
+
+  const std::string jpeg_path = "jxl/flower/flower.png.im_q85_420.jpg";
+  const jxl::PaddedBytes orig = jxl::test::ReadTestData(jpeg_path);
+
+  JxlEncoderFrameSettings* frame_settings =
+      JxlEncoderFrameSettingsCreate(enc.get(), NULL);
+
+  EXPECT_EQ(JXL_ENC_ERROR,
+            JxlEncoderAddJPEGFrame(frame_settings, orig.data(), orig.size()));
+}
+
+TEST(EncodeTest, AddFrameBeforeColorEncodingTest) {
+  JxlEncoderPtr enc = JxlEncoderMake(nullptr);
+  EXPECT_NE(nullptr, enc.get());
+
+  size_t xsize = 64;
+  size_t ysize = 64;
+  JxlPixelFormat pixel_format = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+  std::vector<uint8_t> pixels = jxl::test::GetSomeTestImage(xsize, ysize, 4, 0);
+
+  jxl::CodecInOut input_io =
+      jxl::test::SomeTestImageToCodecInOut(pixels, 4, xsize, ysize);
+
+  JxlBasicInfo basic_info;
+  jxl::test::JxlBasicInfoSetFromPixelFormat(&basic_info, &pixel_format);
+  basic_info.xsize = xsize;
+  basic_info.ysize = ysize;
+  basic_info.uses_original_profile = true;
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetCodestreamLevel(enc.get(), 10));
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetBasicInfo(enc.get(), &basic_info));
+  JxlEncoderFrameSettings* frame_settings =
+      JxlEncoderFrameSettingsCreate(enc.get(), NULL);
+  EXPECT_EQ(JXL_ENC_ERROR,
+            JxlEncoderAddImageFrame(frame_settings, &pixel_format,
+                                    pixels.data(), pixels.size()));
+}
+
+TEST(EncodeTest, AddFrameBeforeBasicInfoTest) {
+  JxlEncoderPtr enc = JxlEncoderMake(nullptr);
+  EXPECT_NE(nullptr, enc.get());
+
+  size_t xsize = 64;
+  size_t ysize = 64;
+  JxlPixelFormat pixel_format = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+  std::vector<uint8_t> pixels = jxl::test::GetSomeTestImage(xsize, ysize, 4, 0);
+
+  jxl::CodecInOut input_io =
+      jxl::test::SomeTestImageToCodecInOut(pixels, 4, xsize, ysize);
+
+  JxlColorEncoding color_encoding;
+  JxlColorEncodingSetToSRGB(&color_encoding,
+                            /*is_gray=*/pixel_format.num_channels < 3);
+  EXPECT_EQ(JXL_ENC_ERROR,
+            JxlEncoderSetColorEncoding(enc.get(), &color_encoding));
+  JxlEncoderFrameSettings* frame_settings =
+      JxlEncoderFrameSettingsCreate(enc.get(), NULL);
+  EXPECT_EQ(JXL_ENC_ERROR,
+            JxlEncoderAddImageFrame(frame_settings, &pixel_format,
+                                    pixels.data(), pixels.size()));
+}
+
+TEST(EncodeTest, DefaultAllocTest) {
+  JxlEncoder* enc = JxlEncoderCreate(nullptr);
+  EXPECT_NE(nullptr, enc);
+  JxlEncoderDestroy(enc);
+}
+
+TEST(EncodeTest, CustomAllocTest) {
+  struct CalledCounters {
+    int allocs = 0;
+    int frees = 0;
+  } counters;
+
+  JxlMemoryManager mm;
+  mm.opaque = &counters;
+  mm.alloc = [](void* opaque, size_t size) {
+    reinterpret_cast<CalledCounters*>(opaque)->allocs++;
+    return malloc(size);
+  };
+  mm.free = [](void* opaque, void* address) {
+    reinterpret_cast<CalledCounters*>(opaque)->frees++;
+    free(address);
+  };
+
+  {
+    JxlEncoderPtr enc = JxlEncoderMake(&mm);
+    EXPECT_NE(nullptr, enc.get());
+    EXPECT_LE(1, counters.allocs);
+    EXPECT_EQ(0, counters.frees);
+  }
+  EXPECT_LE(1, counters.frees);
+}
+
+TEST(EncodeTest, DefaultParallelRunnerTest) {
+  JxlEncoderPtr enc = JxlEncoderMake(nullptr);
+  EXPECT_NE(nullptr, enc.get());
+  EXPECT_EQ(JXL_ENC_SUCCESS,
+            JxlEncoderSetParallelRunner(enc.get(), nullptr, nullptr));
+}
+
+void VerifyFrameEncoding(size_t xsize, size_t ysize, JxlEncoder* enc,
+                         const JxlEncoderFrameSettings* frame_settings,
+                         size_t max_compressed_size,
+                         bool lossy_use_original_profile) {
+  JxlPixelFormat pixel_format = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+  std::vector<uint8_t> pixels = jxl::test::GetSomeTestImage(xsize, ysize, 4, 0);
+
+  jxl::CodecInOut input_io =
+      jxl::test::SomeTestImageToCodecInOut(pixels, 4, xsize, ysize);
+
+  JxlBasicInfo basic_info;
+  jxl::test::JxlBasicInfoSetFromPixelFormat(&basic_info, &pixel_format);
+  basic_info.xsize = xsize;
+  basic_info.ysize = ysize;
+  if (frame_settings->values.lossless || lossy_use_original_profile) {
+    basic_info.uses_original_profile = true;
+  } else {
+    basic_info.uses_original_profile = false;
+  }
+  // 16-bit alpha means this requires level 10
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetCodestreamLevel(enc, 10));
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetBasicInfo(enc, &basic_info));
+  JxlColorEncoding color_encoding;
+  JxlColorEncodingSetToSRGB(&color_encoding, true);
+  EXPECT_EQ(JXL_ENC_ERROR, JxlEncoderSetColorEncoding(enc, &color_encoding));
+  JxlColorEncodingSetToSRGB(&color_encoding, false);
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetColorEncoding(enc, &color_encoding));
+  pixel_format.num_channels = 1;
+  EXPECT_EQ(JXL_ENC_ERROR,
+            JxlEncoderAddImageFrame(frame_settings, &pixel_format,
+                                    pixels.data(), pixels.size()));
+  pixel_format.num_channels = 4;
+  EXPECT_EQ(JXL_ENC_SUCCESS,
+            JxlEncoderAddImageFrame(frame_settings, &pixel_format,
+                                    pixels.data(), pixels.size()));
+  JxlEncoderCloseInput(enc);
+
+  std::vector<uint8_t> compressed = std::vector<uint8_t>(64);
+  uint8_t* next_out = compressed.data();
+  size_t avail_out = compressed.size() - (next_out - compressed.data());
+  JxlEncoderStatus process_result = JXL_ENC_NEED_MORE_OUTPUT;
+  while (process_result == JXL_ENC_NEED_MORE_OUTPUT) {
+    process_result = JxlEncoderProcessOutput(enc, &next_out, &avail_out);
+    if (process_result == JXL_ENC_NEED_MORE_OUTPUT) {
+      size_t offset = next_out - compressed.data();
+      compressed.resize(compressed.size() * 2);
+      next_out = compressed.data() + offset;
+      avail_out = compressed.size() - offset;
+    }
+  }
+  compressed.resize(next_out - compressed.data());
+  EXPECT_LE(compressed.size(), max_compressed_size);
+  EXPECT_EQ(JXL_ENC_SUCCESS, process_result);
+  jxl::CodecInOut decoded_io;
+  EXPECT_TRUE(jxl::test::DecodeFile(
+      {}, jxl::Span<const uint8_t>(compressed.data(), compressed.size()),
+      &decoded_io));
+
+  EXPECT_LE(
+      ComputeDistance2(input_io.Main(), decoded_io.Main(), jxl::GetJxlCms()),
+#if JXL_HIGH_PRECISION
+      1.84);
+#else
+      8.7);
+#endif
+}
+
+void VerifyFrameEncoding(JxlEncoder* enc,
+                         const JxlEncoderFrameSettings* frame_settings) {
+  VerifyFrameEncoding(63, 129, enc, frame_settings, 2700,
+                      /*lossy_use_original_profile=*/false);
+}
+
+TEST(EncodeTest, FrameEncodingTest) {
+  JxlEncoderPtr enc = JxlEncoderMake(nullptr);
+  EXPECT_NE(nullptr, enc.get());
+  VerifyFrameEncoding(enc.get(),
+                      JxlEncoderFrameSettingsCreate(enc.get(), nullptr));
+}
+
+TEST(EncodeTest, EncoderResetTest) {
+  JxlEncoderPtr enc = JxlEncoderMake(nullptr);
+  EXPECT_NE(nullptr, enc.get());
+  VerifyFrameEncoding(50, 200, enc.get(),
+                      JxlEncoderFrameSettingsCreate(enc.get(), nullptr), 4300,
+                      false);
+  // Encoder should become reusable for a new image from scratch after using
+  // reset.
+  JxlEncoderReset(enc.get());
+  VerifyFrameEncoding(157, 77, enc.get(),
+                      JxlEncoderFrameSettingsCreate(enc.get(), nullptr), 2300,
+                      false);
+}
+
+TEST(EncodeTest, CmsTest) {
+  JxlEncoderPtr enc = JxlEncoderMake(nullptr);
+  EXPECT_NE(nullptr, enc.get());
+  bool cms_called = false;
+  JxlCmsInterface cms = jxl::GetJxlCms();
+  struct InitData {
+    void* original_init_data;
+    jpegxl_cms_init_func original_init;
+    bool* cms_called;
+  };
+  InitData init_data = {/*original_init_data=*/cms.init_data,
+                        /*original_init=*/cms.init,
+                        /*cms_called=*/&cms_called};
+  cms.init_data = &init_data;
+  cms.init = +[](void* raw_init_data, size_t num_threads,
+                 size_t pixels_per_thread, const JxlColorProfile* input_profile,
+                 const JxlColorProfile* output_profile,
+                 float intensity_target) {
+    const InitData* init_data = static_cast<const InitData*>(raw_init_data);
+    *init_data->cms_called = true;
+    return init_data->original_init(init_data->original_init_data, num_threads,
+                                    pixels_per_thread, input_profile,
+                                    output_profile, intensity_target);
+  };
+  JxlEncoderSetCms(enc.get(), cms);
+  JxlEncoderFrameSettings* frame_settings =
+      JxlEncoderFrameSettingsCreate(enc.get(), nullptr);
+  JxlEncoderSetFrameLossless(frame_settings, false);
+  ASSERT_EQ(JXL_ENC_SUCCESS,
+            JxlEncoderFrameSettingsSetOption(frame_settings,
+                                             JXL_ENC_FRAME_SETTING_EFFORT, 8));
+  VerifyFrameEncoding(enc.get(), frame_settings);
+  EXPECT_TRUE(cms_called);
+}
+
+TEST(EncodeTest, frame_settingsTest) {
+  {
+    JxlEncoderPtr enc = JxlEncoderMake(nullptr);
+    EXPECT_NE(nullptr, enc.get());
+    JxlEncoderFrameSettings* frame_settings =
+        JxlEncoderFrameSettingsCreate(enc.get(), NULL);
+    EXPECT_EQ(JXL_ENC_SUCCESS,
+              JxlEncoderFrameSettingsSetOption(
+                  frame_settings, JXL_ENC_FRAME_SETTING_EFFORT, 5));
+    VerifyFrameEncoding(enc.get(), frame_settings);
+    EXPECT_EQ(jxl::SpeedTier::kHare, enc->last_used_cparams.speed_tier);
+  }
+
+  {
+    JxlEncoderPtr enc = JxlEncoderMake(nullptr);
+    EXPECT_NE(nullptr, enc.get());
+    JxlEncoderFrameSettings* frame_settings =
+        JxlEncoderFrameSettingsCreate(enc.get(), NULL);
+    // Lower than currently supported values
+    EXPECT_EQ(JXL_ENC_ERROR,
+              JxlEncoderFrameSettingsSetOption(
+                  frame_settings, JXL_ENC_FRAME_SETTING_EFFORT, 0));
+    // Higher than currently supported values
+    EXPECT_EQ(JXL_ENC_ERROR,
+              JxlEncoderFrameSettingsSetOption(
+                  frame_settings, JXL_ENC_FRAME_SETTING_EFFORT, 11));
+  }
+
+  {
+    JxlEncoderPtr enc = JxlEncoderMake(nullptr);
+    EXPECT_NE(nullptr, enc.get());
+    JxlEncoderFrameSettings* frame_settings =
+        JxlEncoderFrameSettingsCreate(enc.get(), NULL);
+    EXPECT_EQ(JXL_ENC_SUCCESS,
+              JxlEncoderSetFrameLossless(frame_settings, JXL_TRUE));
+    VerifyFrameEncoding(63, 129, enc.get(), frame_settings, 3600, false);
+    EXPECT_EQ(true, enc->last_used_cparams.IsLossless());
+  }
+
+  {
+    JxlEncoderPtr enc = JxlEncoderMake(nullptr);
+    EXPECT_NE(nullptr, enc.get());
+    JxlEncoderFrameSettings* frame_settings =
+        JxlEncoderFrameSettingsCreate(enc.get(), NULL);
+    EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetFrameDistance(frame_settings, 0.5));
+    VerifyFrameEncoding(63, 129, enc.get(), frame_settings, 3030, false);
+    EXPECT_EQ(0.5, enc->last_used_cparams.butteraugli_distance);
+  }
+
+  {
+    JxlEncoderPtr enc = JxlEncoderMake(nullptr);
+    JxlEncoderFrameSettings* frame_settings =
+        JxlEncoderFrameSettingsCreate(enc.get(), NULL);
+    // Disallowed negative distance
+    EXPECT_EQ(JXL_ENC_ERROR, JxlEncoderSetFrameDistance(frame_settings, -1));
+  }
+
+  {
+    JxlEncoderPtr enc = JxlEncoderMake(nullptr);
+    EXPECT_NE(nullptr, enc.get());
+    JxlEncoderFrameSettings* frame_settings =
+        JxlEncoderFrameSettingsCreate(enc.get(), NULL);
+    EXPECT_EQ(JXL_ENC_SUCCESS,
+              JxlEncoderFrameSettingsSetOption(
+                  frame_settings, JXL_ENC_FRAME_SETTING_DECODING_SPEED, 2));
+    VerifyFrameEncoding(enc.get(), frame_settings);
+    EXPECT_EQ(2u, enc->last_used_cparams.decoding_speed_tier);
+  }
+
+  {
+    JxlEncoderPtr enc = JxlEncoderMake(nullptr);
+    EXPECT_NE(nullptr, enc.get());
+    JxlEncoderFrameSettings* frame_settings =
+        JxlEncoderFrameSettingsCreate(enc.get(), NULL);
+    EXPECT_EQ(JXL_ENC_ERROR,
+              JxlEncoderFrameSettingsSetOption(
+                  frame_settings, JXL_ENC_FRAME_SETTING_GROUP_ORDER, 100));
+    EXPECT_EQ(JXL_ENC_SUCCESS,
+              JxlEncoderFrameSettingsSetOption(
+                  frame_settings, JXL_ENC_FRAME_SETTING_GROUP_ORDER, 1));
+    EXPECT_EQ(
+        JXL_ENC_SUCCESS,
+        JxlEncoderFrameSettingsSetOption(
+            frame_settings, JXL_ENC_FRAME_SETTING_GROUP_ORDER_CENTER_X, 5));
+    VerifyFrameEncoding(enc.get(), frame_settings);
+    EXPECT_EQ(true, enc->last_used_cparams.centerfirst);
+    EXPECT_EQ(5, enc->last_used_cparams.center_x);
+  }
+
+  {
+    JxlEncoderPtr enc = JxlEncoderMake(nullptr);
+    EXPECT_NE(nullptr, enc.get());
+    JxlEncoderFrameSettings* frame_settings =
+        JxlEncoderFrameSettingsCreate(enc.get(), NULL);
+    EXPECT_EQ(JXL_ENC_SUCCESS,
+              JxlEncoderFrameSettingsSetOption(
+                  frame_settings, JXL_ENC_FRAME_SETTING_RESPONSIVE, 0));
+    EXPECT_EQ(JXL_ENC_SUCCESS,
+              JxlEncoderFrameSettingsSetOption(
+                  frame_settings, JXL_ENC_FRAME_SETTING_PROGRESSIVE_AC, 1));
+    EXPECT_EQ(JXL_ENC_SUCCESS,
+              JxlEncoderFrameSettingsSetOption(
+                  frame_settings, JXL_ENC_FRAME_SETTING_QPROGRESSIVE_AC, -1));
+    EXPECT_EQ(JXL_ENC_SUCCESS,
+              JxlEncoderFrameSettingsSetOption(
+                  frame_settings, JXL_ENC_FRAME_SETTING_PROGRESSIVE_DC, 2));
+    VerifyFrameEncoding(63, 129, enc.get(), frame_settings, 2830,
+                        /*lossy_use_original_profile=*/false);
+    EXPECT_EQ(false, enc->last_used_cparams.responsive);
+    EXPECT_EQ(true, enc->last_used_cparams.progressive_mode);
+    EXPECT_EQ(2, enc->last_used_cparams.progressive_dc);
+  }
+
+  {
+    JxlEncoderPtr enc = JxlEncoderMake(nullptr);
+    EXPECT_NE(nullptr, enc.get());
+    JxlEncoderFrameSettings* frame_settings =
+        JxlEncoderFrameSettingsCreate(enc.get(), NULL);
+    EXPECT_EQ(
+        JXL_ENC_SUCCESS,
+        JxlEncoderFrameSettingsSetFloatOption(
+            frame_settings, JXL_ENC_FRAME_SETTING_PHOTON_NOISE, 1777.777));
+    VerifyFrameEncoding(enc.get(), frame_settings);
+    EXPECT_NEAR(1777.777f, enc->last_used_cparams.photon_noise_iso, 1E-4);
+  }
+
+  {
+    JxlEncoderPtr enc = JxlEncoderMake(nullptr);
+    EXPECT_NE(nullptr, enc.get());
+    JxlEncoderFrameSettings* frame_settings =
+        JxlEncoderFrameSettingsCreate(enc.get(), NULL);
+    EXPECT_EQ(JXL_ENC_SUCCESS,
+              JxlEncoderFrameSettingsSetFloatOption(
+                  frame_settings,
+                  JXL_ENC_FRAME_SETTING_CHANNEL_COLORS_GLOBAL_PERCENT, 55.0f));
+    EXPECT_EQ(JXL_ENC_SUCCESS,
+              JxlEncoderFrameSettingsSetFloatOption(
+                  frame_settings,
+                  JXL_ENC_FRAME_SETTING_CHANNEL_COLORS_GROUP_PERCENT, 25.0f));
+    EXPECT_EQ(JXL_ENC_SUCCESS,
+              JxlEncoderFrameSettingsSetOption(
+                  frame_settings, JXL_ENC_FRAME_SETTING_PALETTE_COLORS, 70000));
+    EXPECT_EQ(JXL_ENC_SUCCESS,
+              JxlEncoderFrameSettingsSetOption(
+                  frame_settings, JXL_ENC_FRAME_SETTING_LOSSY_PALETTE, 1));
+    VerifyFrameEncoding(enc.get(), frame_settings);
+    EXPECT_NEAR(55.0f,
+                enc->last_used_cparams.channel_colors_pre_transform_percent,
+                1E-6);
+    EXPECT_NEAR(25.0f, enc->last_used_cparams.channel_colors_percent, 1E-6);
+    EXPECT_EQ(70000, enc->last_used_cparams.palette_colors);
+    EXPECT_EQ(true, enc->last_used_cparams.lossy_palette);
+  }
+
+  {
+    JxlEncoderPtr enc = JxlEncoderMake(nullptr);
+    EXPECT_NE(nullptr, enc.get());
+    JxlEncoderFrameSettings* frame_settings =
+        JxlEncoderFrameSettingsCreate(enc.get(), NULL);
+    EXPECT_EQ(
+        JXL_ENC_SUCCESS,
+        JxlEncoderFrameSettingsSetOption(
+            frame_settings, JXL_ENC_FRAME_SETTING_MODULAR_COLOR_SPACE, 30));
+    EXPECT_EQ(JXL_ENC_SUCCESS,
+              JxlEncoderFrameSettingsSetOption(
+                  frame_settings, JXL_ENC_FRAME_SETTING_MODULAR_GROUP_SIZE, 2));
+    EXPECT_EQ(JXL_ENC_SUCCESS,
+              JxlEncoderFrameSettingsSetOption(
+                  frame_settings, JXL_ENC_FRAME_SETTING_MODULAR_PREDICTOR, 14));
+    EXPECT_EQ(
+        JXL_ENC_SUCCESS,
+        JxlEncoderFrameSettingsSetFloatOption(
+            frame_settings,
+            JXL_ENC_FRAME_SETTING_MODULAR_MA_TREE_LEARNING_PERCENT, 77.0f));
+    EXPECT_EQ(
+        JXL_ENC_SUCCESS,
+        JxlEncoderFrameSettingsSetOption(
+            frame_settings, JXL_ENC_FRAME_SETTING_MODULAR_NB_PREV_CHANNELS, 7));
+    VerifyFrameEncoding(enc.get(), frame_settings);
+    EXPECT_EQ(30, enc->last_used_cparams.colorspace);
+    EXPECT_EQ(2, enc->last_used_cparams.modular_group_size_shift);
+    EXPECT_EQ(jxl::Predictor::Best, enc->last_used_cparams.options.predictor);
+    EXPECT_NEAR(0.77f, enc->last_used_cparams.options.nb_repeats, 1E-6);
+    EXPECT_EQ(7, enc->last_used_cparams.options.max_properties);
+  }
+
+  {
+    JxlEncoderPtr enc = JxlEncoderMake(nullptr);
+    EXPECT_NE(nullptr, enc.get());
+    JxlEncoderFrameSettings* frame_settings =
+        JxlEncoderFrameSettingsCreate(enc.get(), NULL);
+    EXPECT_EQ(JXL_ENC_SUCCESS,
+              JxlEncoderFrameSettingsSetOption(
+                  frame_settings, JXL_ENC_FRAME_SETTING_JPEG_RECON_CFL, 0));
+    VerifyFrameEncoding(enc.get(), frame_settings);
+    EXPECT_EQ(false, enc->last_used_cparams.force_cfl_jpeg_recompression);
+  }
+
+  {
+    JxlEncoderPtr enc = JxlEncoderMake(nullptr);
+    EXPECT_NE(nullptr, enc.get());
+    JxlEncoderFrameSettings* frame_settings =
+        JxlEncoderFrameSettingsCreate(enc.get(), NULL);
+    EXPECT_EQ(JXL_ENC_SUCCESS,
+              JxlEncoderFrameSettingsSetOption(
+                  frame_settings, JXL_ENC_FRAME_SETTING_JPEG_RECON_CFL, 1));
+    VerifyFrameEncoding(enc.get(), frame_settings);
+    EXPECT_EQ(true, enc->last_used_cparams.force_cfl_jpeg_recompression);
+  }
+}
+
+TEST(EncodeTest, LossyEncoderUseOriginalProfileTest) {
+  {
+    JxlEncoderPtr enc = JxlEncoderMake(nullptr);
+    ASSERT_NE(nullptr, enc.get());
+    JxlEncoderFrameSettings* frame_settings =
+        JxlEncoderFrameSettingsCreate(enc.get(), NULL);
+    VerifyFrameEncoding(63, 129, enc.get(), frame_settings, 7897, true);
+  }
+  {
+    JxlEncoderPtr enc = JxlEncoderMake(nullptr);
+    ASSERT_NE(nullptr, enc.get());
+    JxlEncoderFrameSettings* frame_settings =
+        JxlEncoderFrameSettingsCreate(enc.get(), NULL);
+    EXPECT_EQ(JXL_ENC_SUCCESS,
+              JxlEncoderFrameSettingsSetOption(
+                  frame_settings, JXL_ENC_FRAME_SETTING_PROGRESSIVE_DC, 2));
+    VerifyFrameEncoding(63, 129, enc.get(), frame_settings, 8310, true);
+  }
+  {
+    JxlEncoderPtr enc = JxlEncoderMake(nullptr);
+    ASSERT_NE(nullptr, enc.get());
+    JxlEncoderFrameSettings* frame_settings =
+        JxlEncoderFrameSettingsCreate(enc.get(), NULL);
+    ASSERT_EQ(JXL_ENC_SUCCESS,
+              JxlEncoderFrameSettingsSetOption(
+                  frame_settings, JXL_ENC_FRAME_SETTING_EFFORT, 8));
+    VerifyFrameEncoding(63, 129, enc.get(), frame_settings, 7173, true);
+  }
+}
+
+namespace {
+// Returns a copy of buf from offset to offset+size, or a new zeroed vector if
+// the result would have been out of bounds taking integer overflow into
+// account.
+std::vector<uint8_t> SliceSpan(const jxl::Span<const uint8_t>& buf,
+                               size_t offset, size_t size) {
+  if (offset + size >= buf.size()) {
+    return std::vector<uint8_t>(size, 0);
+  }
+  if (offset + size < offset) {
+    return std::vector<uint8_t>(size, 0);
+  }
+  return std::vector<uint8_t>(buf.data() + offset, buf.data() + offset + size);
+}
+
+struct Box {
+  // The type of the box.
+  // If "uuid", use extended_type instead
+  char type[4] = {0, 0, 0, 0};
+
+  // The extended_type is only used when type == "uuid".
+  // Extended types are not used in JXL. However, the box format itself
+  // supports this so they are handled correctly.
+  char extended_type[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+
+  // Box data.
+  jxl::Span<const uint8_t> data = jxl::Span<const uint8_t>(nullptr, 0);
+
+  // If the size is not given, the datasize extends to the end of the file.
+  // If this field is false, the size field is not encoded when the box is
+  // serialized.
+  bool data_size_given = true;
+
+  // If successful, returns true and sets `in` to be the rest data (if any).
+  // If `in` contains a box with a size larger than `in.size()`, will not
+  // modify `in`, and will return true but the data `Span<uint8_t>` will
+  // remain set to nullptr.
+  // If unsuccessful, returns error and doesn't modify `in`.
+  jxl::Status Decode(jxl::Span<const uint8_t>* in) {
+    // Total box_size including this header itself.
+    uint64_t box_size = LoadBE32(SliceSpan(*in, 0, 4).data());
+    size_t pos = 4;
+
+    memcpy(type, SliceSpan(*in, pos, 4).data(), 4);
+    pos += 4;
+
+    if (box_size == 1) {
+      // If the size is 1, it indicates extended size read from 64-bit integer.
+      box_size = LoadBE64(SliceSpan(*in, pos, 8).data());
+      pos += 8;
+    }
+
+    if (!memcmp("uuid", type, 4)) {
+      memcpy(extended_type, SliceSpan(*in, pos, 16).data(), 16);
+      pos += 16;
+    }
+
+    // This is the end of the box header, the box data begins here. Handle
+    // the data size now.
+    const size_t header_size = pos;
+
+    if (box_size != 0) {
+      if (box_size < header_size) {
+        return JXL_FAILURE("Invalid box size");
+      }
+      if (box_size > in->size()) {
+        // The box is fine, but the input is too short.
+        return true;
+      }
+      data_size_given = true;
+      data = jxl::Span<const uint8_t>(in->data() + header_size,
+                                      box_size - header_size);
+    } else {
+      data_size_given = false;
+      data = jxl::Span<const uint8_t>(in->data() + header_size,
+                                      in->size() - header_size);
+    }
+
+    *in = jxl::Span<const uint8_t>(in->data() + header_size + data.size(),
+                                   in->size() - header_size - data.size());
+    return true;
+  }
+};
+
+struct Container {
+  std::vector<Box> boxes;
+
+  // If successful, returns true and sets `in` to be the rest data (if any).
+  // If unsuccessful, returns error and doesn't modify `in`.
+  jxl::Status Decode(jxl::Span<const uint8_t>* in) {
+    boxes.clear();
+
+    Box signature_box;
+    JXL_RETURN_IF_ERROR(signature_box.Decode(in));
+    if (memcmp("JXL ", signature_box.type, 4) != 0) {
+      return JXL_FAILURE("Invalid magic signature");
+    }
+    if (signature_box.data.size() != 4)
+      return JXL_FAILURE("Invalid magic signature");
+    if (signature_box.data[0] != 0xd || signature_box.data[1] != 0xa ||
+        signature_box.data[2] != 0x87 || signature_box.data[3] != 0xa) {
+      return JXL_FAILURE("Invalid magic signature");
+    }
+
+    Box ftyp_box;
+    JXL_RETURN_IF_ERROR(ftyp_box.Decode(in));
+    if (memcmp("ftyp", ftyp_box.type, 4) != 0) {
+      return JXL_FAILURE("Invalid ftyp");
+    }
+    if (ftyp_box.data.size() != 12) return JXL_FAILURE("Invalid ftyp");
+    const char* expected = "jxl \0\0\0\0jxl ";
+    if (memcmp(expected, ftyp_box.data.data(), 12) != 0)
+      return JXL_FAILURE("Invalid ftyp");
+
+    while (!in->empty()) {
+      Box box = {};
+      JXL_RETURN_IF_ERROR(box.Decode(in));
+      if (box.data.data() == nullptr) {
+        // The decoding encountered a box, but not enough data yet.
+        return true;
+      }
+      boxes.emplace_back(box);
+    }
+
+    return true;
+  }
+};
+
+}  // namespace
+
+TEST(EncodeTest, SingleFrameBoundedJXLCTest) {
+  JxlEncoderPtr enc = JxlEncoderMake(nullptr);
+  EXPECT_NE(nullptr, enc.get());
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderUseContainer(enc.get(), true));
+  JxlEncoderFrameSettings* frame_settings =
+      JxlEncoderFrameSettingsCreate(enc.get(), NULL);
+
+  size_t xsize = 71;
+  size_t ysize = 23;
+  JxlPixelFormat pixel_format = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+  std::vector<uint8_t> pixels = jxl::test::GetSomeTestImage(xsize, ysize, 4, 0);
+
+  JxlBasicInfo basic_info;
+  jxl::test::JxlBasicInfoSetFromPixelFormat(&basic_info, &pixel_format);
+  basic_info.xsize = xsize;
+  basic_info.ysize = ysize;
+  basic_info.uses_original_profile = false;
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetCodestreamLevel(enc.get(), 10));
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetBasicInfo(enc.get(), &basic_info));
+  JxlColorEncoding color_encoding;
+  JxlColorEncodingSetToSRGB(&color_encoding,
+                            /*is_gray=*/false);
+  EXPECT_EQ(JXL_ENC_SUCCESS,
+            JxlEncoderSetColorEncoding(enc.get(), &color_encoding));
+  EXPECT_EQ(JXL_ENC_SUCCESS,
+            JxlEncoderAddImageFrame(frame_settings, &pixel_format,
+                                    pixels.data(), pixels.size()));
+  JxlEncoderCloseInput(enc.get());
+
+  std::vector<uint8_t> compressed = std::vector<uint8_t>(64);
+  uint8_t* next_out = compressed.data();
+  size_t avail_out = compressed.size() - (next_out - compressed.data());
+  JxlEncoderStatus process_result = JXL_ENC_NEED_MORE_OUTPUT;
+  while (process_result == JXL_ENC_NEED_MORE_OUTPUT) {
+    process_result = JxlEncoderProcessOutput(enc.get(), &next_out, &avail_out);
+    if (process_result == JXL_ENC_NEED_MORE_OUTPUT) {
+      size_t offset = next_out - compressed.data();
+      compressed.resize(compressed.size() * 2);
+      next_out = compressed.data() + offset;
+      avail_out = compressed.size() - offset;
+    }
+  }
+  compressed.resize(next_out - compressed.data());
+  EXPECT_EQ(JXL_ENC_SUCCESS, process_result);
+
+  Container container = {};
+  jxl::Span<const uint8_t> encoded_span =
+      jxl::Span<const uint8_t>(compressed.data(), compressed.size());
+  EXPECT_TRUE(container.Decode(&encoded_span));
+  EXPECT_EQ(0u, encoded_span.size());
+  bool found_jxlc = false;
+  bool found_jxlp = false;
+  // The encoder is allowed to either emit a jxlc or one or more jxlp.
+  for (size_t i = 0; i < container.boxes.size(); ++i) {
+    if (memcmp("jxlc", container.boxes[i].type, 4) == 0) {
+      EXPECT_EQ(false, found_jxlc);  // Max 1 jxlc
+      EXPECT_EQ(false, found_jxlp);  // Can't mix jxlc and jxlp
+      found_jxlc = true;
+    }
+    if (memcmp("jxlp", container.boxes[i].type, 4) == 0) {
+      EXPECT_EQ(false, found_jxlc);  // Can't mix jxlc and jxlp
+      found_jxlp = true;
+    }
+    // The encoder shouldn't create an unbounded box in this case, with the
+    // single frame it knows the full size in time, so can help make decoding
+    // more efficient by giving the full box size of the final box.
+    EXPECT_EQ(true, container.boxes[i].data_size_given);
+  }
+  EXPECT_EQ(true, found_jxlc || found_jxlp);
+}
+
+TEST(EncodeTest, CodestreamLevelTest) {
+  size_t xsize = 64;
+  size_t ysize = 64;
+  JxlPixelFormat pixel_format = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+  std::vector<uint8_t> pixels = jxl::test::GetSomeTestImage(xsize, ysize, 4, 0);
+
+  jxl::CodecInOut input_io =
+      jxl::test::SomeTestImageToCodecInOut(pixels, 4, xsize, ysize);
+
+  JxlBasicInfo basic_info;
+  jxl::test::JxlBasicInfoSetFromPixelFormat(&basic_info, &pixel_format);
+  basic_info.xsize = xsize;
+  basic_info.ysize = ysize;
+  basic_info.uses_original_profile = false;
+
+  JxlEncoderPtr enc = JxlEncoderMake(nullptr);
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetCodestreamLevel(enc.get(), 10));
+  JxlEncoderFrameSettings* frame_settings =
+      JxlEncoderFrameSettingsCreate(enc.get(), NULL);
+
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetBasicInfo(enc.get(), &basic_info));
+  JxlColorEncoding color_encoding;
+  JxlColorEncodingSetToSRGB(&color_encoding,
+                            /*is_gray=*/pixel_format.num_channels < 3);
+  EXPECT_EQ(JXL_ENC_SUCCESS,
+            JxlEncoderSetColorEncoding(enc.get(), &color_encoding));
+  EXPECT_EQ(JXL_ENC_SUCCESS,
+            JxlEncoderAddImageFrame(frame_settings, &pixel_format,
+                                    pixels.data(), pixels.size()));
+  JxlEncoderCloseInput(enc.get());
+
+  std::vector<uint8_t> compressed = std::vector<uint8_t>(64);
+  uint8_t* next_out = compressed.data();
+  size_t avail_out = compressed.size() - (next_out - compressed.data());
+  JxlEncoderStatus process_result = JXL_ENC_NEED_MORE_OUTPUT;
+  while (process_result == JXL_ENC_NEED_MORE_OUTPUT) {
+    process_result = JxlEncoderProcessOutput(enc.get(), &next_out, &avail_out);
+    if (process_result == JXL_ENC_NEED_MORE_OUTPUT) {
+      size_t offset = next_out - compressed.data();
+      compressed.resize(compressed.size() * 2);
+      next_out = compressed.data() + offset;
+      avail_out = compressed.size() - offset;
+    }
+  }
+  compressed.resize(next_out - compressed.data());
+  EXPECT_EQ(JXL_ENC_SUCCESS, process_result);
+
+  Container container = {};
+  jxl::Span<const uint8_t> encoded_span =
+      jxl::Span<const uint8_t>(compressed.data(), compressed.size());
+  EXPECT_TRUE(container.Decode(&encoded_span));
+  EXPECT_EQ(0u, encoded_span.size());
+  EXPECT_EQ(0, memcmp("jxll", container.boxes[0].type, 4));
+}
+
+TEST(EncodeTest, CodestreamLevelVerificationTest) {
+  JxlPixelFormat pixel_format = {4, JXL_TYPE_UINT8, JXL_BIG_ENDIAN, 0};
+
+  JxlBasicInfo basic_info;
+  jxl::test::JxlBasicInfoSetFromPixelFormat(&basic_info, &pixel_format);
+  basic_info.xsize = 64;
+  basic_info.ysize = 64;
+  basic_info.uses_original_profile = false;
+
+  JxlEncoderPtr enc = JxlEncoderMake(nullptr);
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetBasicInfo(enc.get(), &basic_info));
+
+  EXPECT_EQ(5, JxlEncoderGetRequiredCodestreamLevel(enc.get()));
+
+  // Set an image dimension that is too large for level 5, but fits in level 10
+
+  basic_info.xsize = 1ull << 30ull;
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetCodestreamLevel(enc.get(), 5));
+  EXPECT_EQ(JXL_ENC_ERROR, JxlEncoderSetBasicInfo(enc.get(), &basic_info));
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetCodestreamLevel(enc.get(), 10));
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetBasicInfo(enc.get(), &basic_info));
+  EXPECT_EQ(10, JxlEncoderGetRequiredCodestreamLevel(enc.get()));
+
+  // Set an image dimension that is too large even for level 10
+
+  basic_info.xsize = 1ull << 31ull;
+  EXPECT_EQ(JXL_ENC_ERROR, JxlEncoderSetBasicInfo(enc.get(), &basic_info));
+}
+
+TEST(EncodeTest, JXL_TRANSCODE_JPEG_TEST(JPEGReconstructionTest)) {
+  const std::string jpeg_path = "jxl/flower/flower.png.im_q85_420.jpg";
+  const jxl::PaddedBytes orig = jxl::test::ReadTestData(jpeg_path);
+
+  JxlEncoderPtr enc = JxlEncoderMake(nullptr);
+  JxlEncoderFrameSettings* frame_settings =
+      JxlEncoderFrameSettingsCreate(enc.get(), NULL);
+
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderStoreJPEGMetadata(enc.get(), JXL_TRUE));
+  EXPECT_EQ(JXL_ENC_SUCCESS,
+            JxlEncoderAddJPEGFrame(frame_settings, orig.data(), orig.size()));
+  JxlEncoderCloseInput(enc.get());
+
+  std::vector<uint8_t> compressed = std::vector<uint8_t>(64);
+  uint8_t* next_out = compressed.data();
+  size_t avail_out = compressed.size() - (next_out - compressed.data());
+  JxlEncoderStatus process_result = JXL_ENC_NEED_MORE_OUTPUT;
+  while (process_result == JXL_ENC_NEED_MORE_OUTPUT) {
+    process_result = JxlEncoderProcessOutput(enc.get(), &next_out, &avail_out);
+    if (process_result == JXL_ENC_NEED_MORE_OUTPUT) {
+      size_t offset = next_out - compressed.data();
+      compressed.resize(compressed.size() * 2);
+      next_out = compressed.data() + offset;
+      avail_out = compressed.size() - offset;
+    }
+  }
+  compressed.resize(next_out - compressed.data());
+  EXPECT_EQ(JXL_ENC_SUCCESS, process_result);
+
+  jxl::extras::JXLDecompressParams dparams;
+  std::vector<uint8_t> decoded_jpeg_bytes;
+  jxl::extras::PackedPixelFile ppf;
+  EXPECT_TRUE(DecodeImageJXL(compressed.data(), compressed.size(), dparams,
+                             nullptr, &ppf, &decoded_jpeg_bytes));
+
+  EXPECT_EQ(decoded_jpeg_bytes.size(), orig.size());
+  EXPECT_EQ(0, memcmp(decoded_jpeg_bytes.data(), orig.data(), orig.size()));
+}
+
+TEST(EncodeTest, JXL_TRANSCODE_JPEG_TEST(ProgressiveJPEGReconstructionTest)) {
+  const std::string jpeg_path = "jxl/flower/flower.png.im_q85_420.jpg";
+  const jxl::PaddedBytes orig = jxl::test::ReadTestData(jpeg_path);
+
+  JxlEncoderPtr enc = JxlEncoderMake(nullptr);
+  JxlEncoderFrameSettings* frame_settings =
+      JxlEncoderFrameSettingsCreate(enc.get(), NULL);
+
+  frame_settings->values.cparams.progressive_mode = true;
+
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderStoreJPEGMetadata(enc.get(), JXL_TRUE));
+  EXPECT_EQ(JXL_ENC_SUCCESS,
+            JxlEncoderAddJPEGFrame(frame_settings, orig.data(), orig.size()));
+  JxlEncoderCloseInput(enc.get());
+
+  std::vector<uint8_t> compressed = std::vector<uint8_t>(64);
+  uint8_t* next_out = compressed.data();
+  size_t avail_out = compressed.size() - (next_out - compressed.data());
+  JxlEncoderStatus process_result = JXL_ENC_NEED_MORE_OUTPUT;
+  while (process_result == JXL_ENC_NEED_MORE_OUTPUT) {
+    process_result = JxlEncoderProcessOutput(enc.get(), &next_out, &avail_out);
+    if (process_result == JXL_ENC_NEED_MORE_OUTPUT) {
+      size_t offset = next_out - compressed.data();
+      compressed.resize(compressed.size() * 2);
+      next_out = compressed.data() + offset;
+      avail_out = compressed.size() - offset;
+    }
+  }
+  compressed.resize(next_out - compressed.data());
+  EXPECT_EQ(JXL_ENC_SUCCESS, process_result);
+
+  jxl::extras::JXLDecompressParams dparams;
+  std::vector<uint8_t> decoded_jpeg_bytes;
+  jxl::extras::PackedPixelFile ppf;
+  EXPECT_TRUE(DecodeImageJXL(compressed.data(), compressed.size(), dparams,
+                             nullptr, &ppf, &decoded_jpeg_bytes));
+
+  EXPECT_EQ(decoded_jpeg_bytes.size(), orig.size());
+  EXPECT_EQ(0, memcmp(decoded_jpeg_bytes.data(), orig.data(), orig.size()));
+}
+
+static void ProcessEncoder(JxlEncoder* enc, std::vector<uint8_t>& compressed,
+                           uint8_t*& next_out, size_t& avail_out) {
+  JxlEncoderStatus process_result = JXL_ENC_NEED_MORE_OUTPUT;
+  while (process_result == JXL_ENC_NEED_MORE_OUTPUT) {
+    process_result = JxlEncoderProcessOutput(enc, &next_out, &avail_out);
+    if (process_result == JXL_ENC_NEED_MORE_OUTPUT) {
+      size_t offset = next_out - compressed.data();
+      compressed.resize(compressed.size() * 2);
+      next_out = compressed.data() + offset;
+      avail_out = compressed.size() - offset;
+    }
+  }
+  size_t offset = next_out - compressed.data();
+  compressed.resize(next_out - compressed.data());
+  next_out = compressed.data() + offset;
+  avail_out = compressed.size() - offset;
+  EXPECT_EQ(JXL_ENC_SUCCESS, process_result);
+}
+
+TEST(EncodeTest, BasicInfoTest) {
+  JxlEncoderPtr enc = JxlEncoderMake(nullptr);
+  EXPECT_NE(nullptr, enc.get());
+
+  JxlEncoderFrameSettings* frame_settings =
+      JxlEncoderFrameSettingsCreate(enc.get(), NULL);
+  size_t xsize = 1;
+  size_t ysize = 1;
+  JxlPixelFormat pixel_format = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+  std::vector<uint8_t> pixels = jxl::test::GetSomeTestImage(xsize, ysize, 4, 0);
+  JxlBasicInfo basic_info;
+  jxl::test::JxlBasicInfoSetFromPixelFormat(&basic_info, &pixel_format);
+  basic_info.xsize = xsize;
+  basic_info.ysize = ysize;
+  basic_info.uses_original_profile = false;
+  basic_info.have_animation = true;
+  basic_info.intensity_target = 123.4;
+  basic_info.min_nits = 5.0;
+  basic_info.linear_below = 12.7;
+  basic_info.orientation = JXL_ORIENT_ROTATE_90_CW;
+  basic_info.intrinsic_xsize = 88;
+  basic_info.intrinsic_ysize = 99;
+  basic_info.animation.tps_numerator = 55;
+  basic_info.animation.tps_denominator = 77;
+  basic_info.animation.num_loops = 10;
+  basic_info.animation.have_timecodes = JXL_TRUE;
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetCodestreamLevel(enc.get(), 10));
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetBasicInfo(enc.get(), &basic_info));
+  JxlColorEncoding color_encoding;
+  JxlColorEncodingSetToSRGB(&color_encoding, /*is_gray=*/false);
+  EXPECT_EQ(JXL_ENC_SUCCESS,
+            JxlEncoderSetColorEncoding(enc.get(), &color_encoding));
+
+  std::vector<uint8_t> compressed = std::vector<uint8_t>(64);
+  uint8_t* next_out = compressed.data();
+  size_t avail_out = compressed.size() - (next_out - compressed.data());
+  EXPECT_EQ(JXL_ENC_SUCCESS,
+            JxlEncoderAddImageFrame(frame_settings, &pixel_format,
+                                    pixels.data(), pixels.size()));
+  JxlEncoderCloseFrames(enc.get());
+  ProcessEncoder(enc.get(), compressed, next_out, avail_out);
+
+  // Decode to verify the boxes, we don't decode to pixels, only the boxes.
+  JxlDecoderPtr dec = JxlDecoderMake(nullptr);
+  EXPECT_NE(nullptr, dec.get());
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSubscribeEvents(dec.get(), JXL_DEC_BASIC_INFO));
+  // Allow testing the orientation field, without this setting it will be
+  // overridden to identity.
+  JxlDecoderSetKeepOrientation(dec.get(), JXL_TRUE);
+  JxlDecoderSetInput(dec.get(), compressed.data(), compressed.size());
+  JxlDecoderCloseInput(dec.get());
+
+  for (;;) {
+    JxlDecoderStatus status = JxlDecoderProcessInput(dec.get());
+    if (status == JXL_DEC_ERROR) {
+      FAIL();
+    } else if (status == JXL_DEC_SUCCESS) {
+      break;
+    } else if (status == JXL_DEC_BASIC_INFO) {
+      JxlBasicInfo basic_info2;
+      EXPECT_EQ(JXL_DEC_SUCCESS,
+                JxlDecoderGetBasicInfo(dec.get(), &basic_info2));
+      EXPECT_EQ(basic_info.xsize, basic_info2.xsize);
+      EXPECT_EQ(basic_info.ysize, basic_info2.ysize);
+      EXPECT_EQ(basic_info.bits_per_sample, basic_info2.bits_per_sample);
+      EXPECT_EQ(basic_info.exponent_bits_per_sample,
+                basic_info2.exponent_bits_per_sample);
+      EXPECT_NEAR(basic_info.intensity_target, basic_info2.intensity_target,
+                  0.5);
+      EXPECT_NEAR(basic_info.min_nits, basic_info2.min_nits, 0.5);
+      EXPECT_NEAR(basic_info.linear_below, basic_info2.linear_below, 0.5);
+      EXPECT_EQ(basic_info.relative_to_max_display,
+                basic_info2.relative_to_max_display);
+      EXPECT_EQ(basic_info.uses_original_profile,
+                basic_info2.uses_original_profile);
+      EXPECT_EQ(basic_info.orientation, basic_info2.orientation);
+      EXPECT_EQ(basic_info.intrinsic_xsize, basic_info2.intrinsic_xsize);
+      EXPECT_EQ(basic_info.intrinsic_ysize, basic_info2.intrinsic_ysize);
+      EXPECT_EQ(basic_info.num_color_channels, basic_info2.num_color_channels);
+      // TODO(lode): also test num_extra_channels, but currently there may be a
+      // mismatch between 0 and 1 if there is alpha, until encoder support for
+      // extra channels is fully implemented.
+      EXPECT_EQ(basic_info.alpha_bits, basic_info2.alpha_bits);
+      EXPECT_EQ(basic_info.alpha_exponent_bits,
+                basic_info2.alpha_exponent_bits);
+      EXPECT_EQ(basic_info.alpha_premultiplied,
+                basic_info2.alpha_premultiplied);
+
+      EXPECT_EQ(basic_info.have_preview, basic_info2.have_preview);
+      if (basic_info.have_preview) {
+        EXPECT_EQ(basic_info.preview.xsize, basic_info2.preview.xsize);
+        EXPECT_EQ(basic_info.preview.ysize, basic_info2.preview.ysize);
+      }
+
+      EXPECT_EQ(basic_info.have_animation, basic_info2.have_animation);
+      if (basic_info.have_animation) {
+        EXPECT_EQ(basic_info.animation.tps_numerator,
+                  basic_info2.animation.tps_numerator);
+        EXPECT_EQ(basic_info.animation.tps_denominator,
+                  basic_info2.animation.tps_denominator);
+        EXPECT_EQ(basic_info.animation.num_loops,
+                  basic_info2.animation.num_loops);
+        EXPECT_EQ(basic_info.animation.have_timecodes,
+                  basic_info2.animation.have_timecodes);
+      }
+    } else {
+      FAIL();  // unexpected status
+    }
+  }
+}
+
+TEST(EncodeTest, AnimationHeaderTest) {
+  JxlEncoderPtr enc = JxlEncoderMake(nullptr);
+  EXPECT_NE(nullptr, enc.get());
+
+  JxlEncoderFrameSettings* frame_settings =
+      JxlEncoderFrameSettingsCreate(enc.get(), NULL);
+  size_t xsize = 1;
+  size_t ysize = 1;
+  JxlPixelFormat pixel_format = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+  std::vector<uint8_t> pixels = jxl::test::GetSomeTestImage(xsize, ysize, 4, 0);
+  JxlBasicInfo basic_info;
+  jxl::test::JxlBasicInfoSetFromPixelFormat(&basic_info, &pixel_format);
+  basic_info.xsize = xsize;
+  basic_info.ysize = ysize;
+  basic_info.have_animation = true;
+  basic_info.animation.tps_numerator = 1000;
+  basic_info.animation.tps_denominator = 1;
+  basic_info.animation.have_timecodes = JXL_TRUE;
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetCodestreamLevel(enc.get(), 10));
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetBasicInfo(enc.get(), &basic_info));
+  JxlColorEncoding color_encoding;
+  JxlColorEncodingSetToSRGB(&color_encoding, /*is_gray=*/false);
+  EXPECT_EQ(JXL_ENC_SUCCESS,
+            JxlEncoderSetColorEncoding(enc.get(), &color_encoding));
+
+  std::string frame_name = "test frame";
+  JxlFrameHeader header;
+  JxlEncoderInitFrameHeader(&header);
+  header.duration = 50;
+  header.timecode = 800;
+  header.layer_info.blend_info.blendmode = JXL_BLEND_BLEND;
+  header.layer_info.blend_info.source = 2;
+  header.layer_info.blend_info.clamp = 1;
+  JxlBlendInfo extra_channel_blend_info;
+  JxlEncoderInitBlendInfo(&extra_channel_blend_info);
+  extra_channel_blend_info.blendmode = JXL_BLEND_MULADD;
+  JxlEncoderSetFrameHeader(frame_settings, &header);
+  JxlEncoderSetExtraChannelBlendInfo(frame_settings, 0,
+                                     &extra_channel_blend_info);
+  JxlEncoderSetFrameName(frame_settings, frame_name.c_str());
+
+  std::vector<uint8_t> compressed = std::vector<uint8_t>(64);
+  uint8_t* next_out = compressed.data();
+  size_t avail_out = compressed.size() - (next_out - compressed.data());
+  EXPECT_EQ(JXL_ENC_SUCCESS,
+            JxlEncoderAddImageFrame(frame_settings, &pixel_format,
+                                    pixels.data(), pixels.size()));
+  JxlEncoderCloseFrames(enc.get());
+  ProcessEncoder(enc.get(), compressed, next_out, avail_out);
+
+  // Decode to verify the boxes, we don't decode to pixels, only the boxes.
+  JxlDecoderPtr dec = JxlDecoderMake(nullptr);
+  EXPECT_NE(nullptr, dec.get());
+
+  // To test the blend_info fields, coalescing must be set to false in the
+  // decoder.
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetCoalescing(dec.get(), JXL_FALSE));
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSubscribeEvents(dec.get(), JXL_DEC_FRAME));
+  JxlDecoderSetInput(dec.get(), compressed.data(), compressed.size());
+  JxlDecoderCloseInput(dec.get());
+
+  bool seen_frame = false;
+
+  for (;;) {
+    JxlDecoderStatus status = JxlDecoderProcessInput(dec.get());
+    if (status == JXL_DEC_ERROR) {
+      FAIL();
+    } else if (status == JXL_DEC_SUCCESS) {
+      break;
+    } else if (status == JXL_DEC_FRAME) {
+      seen_frame = true;
+      JxlFrameHeader header2;
+      EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetFrameHeader(dec.get(), &header2));
+      EXPECT_EQ(header.duration, header2.duration);
+      EXPECT_EQ(header.timecode, header2.timecode);
+      EXPECT_EQ(header.layer_info.blend_info.blendmode,
+                header2.layer_info.blend_info.blendmode);
+      EXPECT_EQ(header.layer_info.blend_info.clamp,
+                header2.layer_info.blend_info.clamp);
+      EXPECT_EQ(header.layer_info.blend_info.source,
+                header2.layer_info.blend_info.source);
+      EXPECT_EQ(frame_name.size(), header2.name_length);
+      JxlBlendInfo extra_channel_blend_info2;
+      JxlDecoderGetExtraChannelBlendInfo(dec.get(), 0,
+                                         &extra_channel_blend_info2);
+      EXPECT_EQ(extra_channel_blend_info.blendmode,
+                extra_channel_blend_info2.blendmode);
+      if (header2.name_length > 0) {
+        std::string frame_name2(header2.name_length + 1, '\0');
+        EXPECT_EQ(JXL_DEC_SUCCESS,
+                  JxlDecoderGetFrameName(dec.get(), &frame_name2.front(),
+                                         frame_name2.size()));
+        frame_name2.resize(header2.name_length);
+        EXPECT_EQ(frame_name, frame_name2);
+      }
+    } else {
+      FAIL();  // unexpected status
+    }
+  }
+
+  EXPECT_EQ(true, seen_frame);
+}
+TEST(EncodeTest, CroppedFrameTest) {
+  JxlEncoderPtr enc = JxlEncoderMake(nullptr);
+  EXPECT_NE(nullptr, enc.get());
+
+  JxlEncoderFrameSettings* frame_settings =
+      JxlEncoderFrameSettingsCreate(enc.get(), NULL);
+  size_t xsize = 300;
+  size_t ysize = 300;
+  JxlPixelFormat pixel_format = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+  std::vector<uint8_t> pixels = jxl::test::GetSomeTestImage(xsize, ysize, 4, 0);
+  std::vector<uint8_t> pixels2(pixels.size());
+  JxlBasicInfo basic_info;
+  jxl::test::JxlBasicInfoSetFromPixelFormat(&basic_info, &pixel_format);
+  // Encoding a 300x300 frame in an image that is only 100x100
+  basic_info.xsize = 100;
+  basic_info.ysize = 100;
+  basic_info.uses_original_profile = JXL_TRUE;
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetCodestreamLevel(enc.get(), 10));
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetBasicInfo(enc.get(), &basic_info));
+  JxlColorEncoding color_encoding;
+  JxlColorEncodingSetToSRGB(&color_encoding, /*is_gray=*/false);
+  EXPECT_EQ(JXL_ENC_SUCCESS,
+            JxlEncoderSetColorEncoding(enc.get(), &color_encoding));
+
+  JxlFrameHeader header;
+  JxlEncoderInitFrameHeader(&header);
+  header.layer_info.have_crop = JXL_TRUE;
+  header.layer_info.xsize = xsize;
+  header.layer_info.ysize = ysize;
+  header.layer_info.crop_x0 = -50;
+  header.layer_info.crop_y0 = -250;
+  JxlEncoderSetFrameLossless(frame_settings, JXL_TRUE);
+  JxlEncoderSetFrameHeader(frame_settings, &header);
+  JxlEncoderFrameSettingsSetOption(frame_settings, JXL_ENC_FRAME_SETTING_EFFORT,
+                                   1);
+
+  std::vector<uint8_t> compressed = std::vector<uint8_t>(100);
+  uint8_t* next_out = compressed.data();
+  size_t avail_out = compressed.size() - (next_out - compressed.data());
+  EXPECT_EQ(JXL_ENC_SUCCESS,
+            JxlEncoderAddImageFrame(frame_settings, &pixel_format,
+                                    pixels.data(), pixels.size()));
+  JxlEncoderCloseFrames(enc.get());
+  ProcessEncoder(enc.get(), compressed, next_out, avail_out);
+
+  JxlDecoderPtr dec = JxlDecoderMake(nullptr);
+  EXPECT_NE(nullptr, dec.get());
+  // Non-coalesced decoding so we can get the full uncropped frame
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetCoalescing(dec.get(), JXL_FALSE));
+  EXPECT_EQ(
+      JXL_DEC_SUCCESS,
+      JxlDecoderSubscribeEvents(dec.get(), JXL_DEC_FRAME | JXL_DEC_FULL_IMAGE));
+  JxlDecoderSetInput(dec.get(), compressed.data(), compressed.size());
+  JxlDecoderCloseInput(dec.get());
+
+  bool seen_frame = false;
+  bool checked_frame = false;
+  for (;;) {
+    JxlDecoderStatus status = JxlDecoderProcessInput(dec.get());
+    if (status == JXL_DEC_ERROR) {
+      FAIL();
+    } else if (status == JXL_DEC_SUCCESS) {
+      break;
+    } else if (status == JXL_DEC_FRAME) {
+      seen_frame = true;
+      JxlFrameHeader header2;
+      EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetFrameHeader(dec.get(), &header2));
+      EXPECT_EQ(header.layer_info.xsize, header2.layer_info.xsize);
+      EXPECT_EQ(header.layer_info.ysize, header2.layer_info.ysize);
+      EXPECT_EQ(header.layer_info.crop_x0, header2.layer_info.crop_x0);
+      EXPECT_EQ(header.layer_info.crop_y0, header2.layer_info.crop_y0);
+    } else if (status == JXL_DEC_NEED_IMAGE_OUT_BUFFER) {
+      EXPECT_EQ(JXL_DEC_SUCCESS,
+                JxlDecoderSetImageOutBuffer(dec.get(), &pixel_format,
+                                            pixels2.data(), pixels2.size()));
+    } else if (status == JXL_DEC_FULL_IMAGE) {
+      EXPECT_EQ(0, memcmp(pixels.data(), pixels2.data(), pixels.size()));
+      checked_frame = true;
+    } else {
+      FAIL();  // unexpected status
+    }
+  }
+  EXPECT_EQ(true, checked_frame);
+  EXPECT_EQ(true, seen_frame);
+}
+
+TEST(EncodeTest, JXL_BOXES_TEST(BoxTest)) {
+  // Test with uncompressed boxes and with brob boxes
+  for (int compress_box = 0; compress_box <= 1; ++compress_box) {
+    // Tests adding two metadata boxes with the encoder: an exif box before the
+    // image frame, and an xml box after the image frame. Then verifies the
+    // decoder can decode them, they are in the expected place, and have the
+    // correct content after decoding.
+    JxlEncoderPtr enc = JxlEncoderMake(nullptr);
+    EXPECT_NE(nullptr, enc.get());
+
+    EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderUseBoxes(enc.get()));
+
+    JxlEncoderFrameSettings* frame_settings =
+        JxlEncoderFrameSettingsCreate(enc.get(), NULL);
+    size_t xsize = 50;
+    size_t ysize = 17;
+    JxlPixelFormat pixel_format = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+    std::vector<uint8_t> pixels =
+        jxl::test::GetSomeTestImage(xsize, ysize, 4, 0);
+    JxlBasicInfo basic_info;
+    jxl::test::JxlBasicInfoSetFromPixelFormat(&basic_info, &pixel_format);
+    basic_info.xsize = xsize;
+    basic_info.ysize = ysize;
+    basic_info.uses_original_profile = false;
+    EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetCodestreamLevel(enc.get(), 10));
+    EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetBasicInfo(enc.get(), &basic_info));
+    JxlColorEncoding color_encoding;
+    JxlColorEncodingSetToSRGB(&color_encoding,
+                              /*is_gray=*/false);
+    EXPECT_EQ(JXL_ENC_SUCCESS,
+              JxlEncoderSetColorEncoding(enc.get(), &color_encoding));
+
+    std::vector<uint8_t> compressed = std::vector<uint8_t>(64);
+    uint8_t* next_out = compressed.data();
+    size_t avail_out = compressed.size() - (next_out - compressed.data());
+
+    // Add an early metadata box. Also add a valid 4-byte TIFF offset header
+    // before the fake exif data of these box contents.
+    constexpr const char* exif_test_string = "\0\0\0\0exif test data";
+    const uint8_t* exif_data =
+        reinterpret_cast<const uint8_t*>(exif_test_string);
+    // Skip the 4 zeroes for strlen
+    const size_t exif_size = 4 + strlen(exif_test_string + 4);
+    JxlEncoderAddBox(enc.get(), "Exif", exif_data, exif_size, compress_box);
+
+    // Write to output
+    ProcessEncoder(enc.get(), compressed, next_out, avail_out);
+
+    // Add image frame
+    EXPECT_EQ(JXL_ENC_SUCCESS,
+              JxlEncoderAddImageFrame(frame_settings, &pixel_format,
+                                      pixels.data(), pixels.size()));
+    // Indicate this is the last frame
+    JxlEncoderCloseFrames(enc.get());
+
+    // Write to output
+    ProcessEncoder(enc.get(), compressed, next_out, avail_out);
+
+    // Add a late metadata box
+    constexpr const char* xml_test_string = "<some random xml data>";
+    const uint8_t* xml_data = reinterpret_cast<const uint8_t*>(xml_test_string);
+    size_t xml_size = strlen(xml_test_string);
+    JxlEncoderAddBox(enc.get(), "XML ", xml_data, xml_size, compress_box);
+
+    // Indicate this is the last box
+    JxlEncoderCloseBoxes(enc.get());
+
+    // Write to output
+    ProcessEncoder(enc.get(), compressed, next_out, avail_out);
+
+    // Decode to verify the boxes, we don't decode to pixels, only the boxes.
+    JxlDecoderPtr dec = JxlDecoderMake(nullptr);
+    EXPECT_NE(nullptr, dec.get());
+
+    if (compress_box) {
+      EXPECT_EQ(JXL_DEC_SUCCESS,
+                JxlDecoderSetDecompressBoxes(dec.get(), JXL_TRUE));
+    }
+
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSubscribeEvents(
+                                   dec.get(), JXL_DEC_FRAME | JXL_DEC_BOX));
+
+    JxlDecoderSetInput(dec.get(), compressed.data(), compressed.size());
+    JxlDecoderCloseInput(dec.get());
+
+    std::vector<uint8_t> dec_exif_box(exif_size);
+    std::vector<uint8_t> dec_xml_box(xml_size);
+
+    for (bool post_frame = false;;) {
+      JxlDecoderStatus status = JxlDecoderProcessInput(dec.get());
+      if (status == JXL_DEC_ERROR) {
+        FAIL();
+      } else if (status == JXL_DEC_SUCCESS) {
+        EXPECT_EQ(0, JxlDecoderReleaseBoxBuffer(dec.get()));
+        break;
+      } else if (status == JXL_DEC_FRAME) {
+        post_frame = true;
+      } else if (status == JXL_DEC_BOX) {
+        // Since we gave the exif/xml box output buffer of the exact known
+        // correct size, 0 bytes should be released. Same when no buffer was
+        // set.
+        EXPECT_EQ(0, JxlDecoderReleaseBoxBuffer(dec.get()));
+        JxlBoxType type;
+        EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBoxType(dec.get(), type, true));
+        if (!memcmp(type, "Exif", 4)) {
+          // This box should have been encoded before the image frame
+          EXPECT_EQ(false, post_frame);
+          JxlDecoderSetBoxBuffer(dec.get(), dec_exif_box.data(),
+                                 dec_exif_box.size());
+        } else if (!memcmp(type, "XML ", 4)) {
+          // This box should have been encoded after the image frame
+          EXPECT_EQ(true, post_frame);
+          JxlDecoderSetBoxBuffer(dec.get(), dec_xml_box.data(),
+                                 dec_xml_box.size());
+        }
+      } else {
+        FAIL();  // unexpected status
+      }
+    }
+
+    EXPECT_EQ(0, memcmp(exif_data, dec_exif_box.data(), exif_size));
+    EXPECT_EQ(0, memcmp(xml_data, dec_xml_box.data(), xml_size));
+  }
+}
+
+#if JPEGXL_ENABLE_JPEG  // Loading .jpg files requires libjpeg support.
+TEST(EncodeTest, JXL_TRANSCODE_JPEG_TEST(JPEGFrameTest)) {
+  for (int skip_basic_info = 0; skip_basic_info < 2; skip_basic_info++) {
+    for (int skip_color_encoding = 0; skip_color_encoding < 2;
+         skip_color_encoding++) {
+      // cannot set color encoding if basic info is not set
+      if (skip_basic_info && !skip_color_encoding) continue;
+      const std::string jpeg_path = "jxl/flower/flower_cropped.jpg";
+      const jxl::PaddedBytes orig = jxl::test::ReadTestData(jpeg_path);
+      jxl::CodecInOut orig_io;
+      ASSERT_TRUE(SetFromBytes(jxl::Span<const uint8_t>(orig), &orig_io,
+                               /*pool=*/nullptr));
+
+      JxlEncoderPtr enc = JxlEncoderMake(nullptr);
+      JxlEncoderFrameSettings* frame_settings =
+          JxlEncoderFrameSettingsCreate(enc.get(), NULL);
+      JxlEncoderFrameSettingsSetOption(frame_settings,
+                                       JXL_ENC_FRAME_SETTING_EFFORT, 1);
+      if (!skip_basic_info) {
+        JxlBasicInfo basic_info;
+        JxlEncoderInitBasicInfo(&basic_info);
+        basic_info.xsize = orig_io.xsize();
+        basic_info.ysize = orig_io.ysize();
+        basic_info.uses_original_profile = true;
+        EXPECT_EQ(JXL_ENC_SUCCESS,
+                  JxlEncoderSetBasicInfo(enc.get(), &basic_info));
+      }
+      if (!skip_color_encoding) {
+        JxlColorEncoding color_encoding;
+        JxlColorEncodingSetToSRGB(&color_encoding, /*is_gray=*/false);
+        EXPECT_EQ(JXL_ENC_SUCCESS,
+                  JxlEncoderSetColorEncoding(enc.get(), &color_encoding));
+      }
+      EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderAddJPEGFrame(
+                                     frame_settings, orig.data(), orig.size()));
+      JxlEncoderCloseInput(enc.get());
+
+      std::vector<uint8_t> compressed = std::vector<uint8_t>(64);
+      uint8_t* next_out = compressed.data();
+      size_t avail_out = compressed.size() - (next_out - compressed.data());
+      JxlEncoderStatus process_result = JXL_ENC_NEED_MORE_OUTPUT;
+      while (process_result == JXL_ENC_NEED_MORE_OUTPUT) {
+        process_result =
+            JxlEncoderProcessOutput(enc.get(), &next_out, &avail_out);
+        if (process_result == JXL_ENC_NEED_MORE_OUTPUT) {
+          size_t offset = next_out - compressed.data();
+          compressed.resize(compressed.size() * 2);
+          next_out = compressed.data() + offset;
+          avail_out = compressed.size() - offset;
+        }
+      }
+      compressed.resize(next_out - compressed.data());
+      EXPECT_EQ(JXL_ENC_SUCCESS, process_result);
+
+      jxl::CodecInOut decoded_io;
+      EXPECT_TRUE(jxl::test::DecodeFile(
+          {}, jxl::Span<const uint8_t>(compressed.data(), compressed.size()),
+          &decoded_io));
+
+      EXPECT_LE(
+          ComputeDistance2(orig_io.Main(), decoded_io.Main(), jxl::GetJxlCms()),
+          3.5);
+    }
+  }
+}
+#endif  // JPEGXL_ENABLE_JPEG
diff --git a/third_party/jpeg-xl/lib/jxl/entropy_coder.cc b/third_party/jpeg-xl/lib/jxl/entropy_coder.cc
new file mode 100644
index 0000000000..0043c2d31e
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/entropy_coder.cc
@@ -0,0 +1,70 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/entropy_coder.h"
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <algorithm>
+#include <utility>
+#include <vector>
+
+#include "lib/jxl/ac_context.h"
+#include "lib/jxl/ac_strategy.h"
+#include "lib/jxl/base/bits.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/profiler.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/coeff_order.h"
+#include "lib/jxl/coeff_order_fwd.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/dec_ans.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/dec_context_map.h"
+#include "lib/jxl/epf.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/image_ops.h"
+
+namespace jxl {
+
+Status DecodeBlockCtxMap(BitReader* br, BlockCtxMap* block_ctx_map) {
+  auto& dct = block_ctx_map->dc_thresholds;
+  auto& qft = block_ctx_map->qf_thresholds;
+  auto& ctx_map = block_ctx_map->ctx_map;
+  bool is_default = br->ReadFixedBits<1>();
+  if (is_default) {
+    *block_ctx_map = BlockCtxMap();
+    return true;
+  }
+  block_ctx_map->num_dc_ctxs = 1;
+  for (int j : {0, 1, 2}) {
+    dct[j].resize(br->ReadFixedBits<4>());
+    block_ctx_map->num_dc_ctxs *= dct[j].size() + 1;
+    for (int& i : dct[j]) {
+      i = UnpackSigned(U32Coder::Read(kDCThresholdDist, br));
+    }
+  }
+  qft.resize(br->ReadFixedBits<4>());
+  for (uint32_t& i : qft) {
+    i = U32Coder::Read(kQFThresholdDist, br) + 1;
+  }
+
+  if (block_ctx_map->num_dc_ctxs * (qft.size() + 1) > 64) {
+    return JXL_FAILURE("Invalid block context map: too big");
+  }
+
+  ctx_map.resize(3 * kNumOrders * block_ctx_map->num_dc_ctxs *
+                 (qft.size() + 1));
+  JXL_RETURN_IF_ERROR(DecodeContextMap(&ctx_map, &block_ctx_map->num_ctxs, br));
+  if (block_ctx_map->num_ctxs > 16) {
+    return JXL_FAILURE("Invalid block context map: too many distinct contexts");
+  }
+  return true;
+}
+
+constexpr uint8_t BlockCtxMap::kDefaultCtxMap[];  // from ac_context.h
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/entropy_coder.h b/third_party/jpeg-xl/lib/jxl/entropy_coder.h
new file mode 100644
index 0000000000..e4afa7a631
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/entropy_coder.h
@@ -0,0 +1,45 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENTROPY_CODER_H_
+#define LIB_JXL_ENTROPY_CODER_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "lib/jxl/ac_context.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/field_encodings.h"
+
+// Entropy coding and context modeling of DC and AC coefficients, as well as AC
+// strategy and quantization field.
+
+namespace jxl {
+
+static JXL_INLINE int32_t PredictFromTopAndLeft(
+    const int32_t* const JXL_RESTRICT row_top,
+    const int32_t* const JXL_RESTRICT row, size_t x, int32_t default_val) {
+  if (x == 0) {
+    return row_top == nullptr ? default_val : row_top[x];
+  }
+  if (row_top == nullptr) {
+    return row[x - 1];
+  }
+  return (row_top[x] + row[x - 1] + 1) / 2;
+}
+
+static constexpr U32Enc kDCThresholdDist(Bits(4), BitsOffset(8, 16),
+                                         BitsOffset(16, 272),
+                                         BitsOffset(32, 65808));
+
+static constexpr U32Enc kQFThresholdDist(Bits(2), BitsOffset(3, 4),
+                                         BitsOffset(5, 12), BitsOffset(8, 44));
+
+Status DecodeBlockCtxMap(BitReader* br, BlockCtxMap* block_ctx_map);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENTROPY_CODER_H_
diff --git a/third_party/jpeg-xl/lib/jxl/entropy_coder_test.cc b/third_party/jpeg-xl/lib/jxl/entropy_coder_test.cc
new file mode 100644
index 0000000000..9dbeb137af
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/entropy_coder_test.cc
@@ -0,0 +1,68 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// TODO(deymo): Move these tests to dec_ans.h and common.h
+
+#include <stdint.h>
+
+#include "lib/jxl/base/random.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/dec_ans.h"
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+namespace {
+
+TEST(EntropyCoderTest, PackUnpack) {
+  for (int32_t i = -31; i < 32; ++i) {
+    uint32_t packed = PackSigned(i);
+    EXPECT_LT(packed, 63u);
+    int32_t unpacked = UnpackSigned(packed);
+    EXPECT_EQ(i, unpacked);
+  }
+}
+
+struct DummyBitReader {
+  uint32_t nbits, bits;
+  void Consume(uint32_t nbits) {}
+  uint32_t PeekBits(uint32_t n) {
+    EXPECT_EQ(n, nbits);
+    return bits;
+  }
+};
+
+void HybridUintRoundtrip(HybridUintConfig config, size_t limit = 1 << 24) {
+  Rng rng(0);
+  constexpr size_t kNumIntegers = 1 << 20;
+  std::vector<uint32_t> integers(kNumIntegers);
+  std::vector<uint32_t> token(kNumIntegers);
+  std::vector<uint32_t> nbits(kNumIntegers);
+  std::vector<uint32_t> bits(kNumIntegers);
+  for (size_t i = 0; i < kNumIntegers; i++) {
+    integers[i] = rng.UniformU(0, limit + 1);
+    config.Encode(integers[i], &token[i], &nbits[i], &bits[i]);
+  }
+  for (size_t i = 0; i < kNumIntegers; i++) {
+    DummyBitReader br{nbits[i], bits[i]};
+    EXPECT_EQ(integers[i],
+              ANSSymbolReader::ReadHybridUintConfig(config, token[i], &br));
+  }
+}
+
+TEST(HybridUintTest, Test000) {
+  HybridUintRoundtrip(HybridUintConfig{0, 0, 0});
+}
+TEST(HybridUintTest, Test411) {
+  HybridUintRoundtrip(HybridUintConfig{4, 1, 1});
+}
+TEST(HybridUintTest, Test420) {
+  HybridUintRoundtrip(HybridUintConfig{4, 2, 0});
+}
+TEST(HybridUintTest, Test421) {
+  HybridUintRoundtrip(HybridUintConfig{4, 2, 1}, 256);
+}
+
+}  // namespace
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/epf.cc b/third_party/jpeg-xl/lib/jxl/epf.cc
new file mode 100644
index 0000000000..7288ed9ca6
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/epf.cc
@@ -0,0 +1,146 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Edge-preserving smoothing: weighted average based on L1 patch similarity.
+
+#include "lib/jxl/epf.h"
+
+#include <math.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <algorithm>
+#include <atomic>
+#include <numeric>  // std::accumulate
+#include <vector>
+
+#include "lib/jxl/ac_strategy.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/convolve.h"
+#include "lib/jxl/dec_cache.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/image_bundle.h"
+#include "lib/jxl/image_ops.h"
+#include "lib/jxl/loop_filter.h"
+#include "lib/jxl/quant_weights.h"
+#include "lib/jxl/quantizer.h"
+
+namespace jxl {
+
+// Mirror n floats starting at *p and store them before p.
+JXL_INLINE void LeftMirror(float* p, size_t n) {
+  for (size_t i = 0; i < n; i++) {
+    *(p - 1 - i) = p[i];
+  }
+}
+
+// Mirror n floats starting at *(p - n) and store them at *p.
+JXL_INLINE void RightMirror(float* p, size_t n) {
+  for (size_t i = 0; i < n; i++) {
+    p[i] = *(p - 1 - i);
+  }
+}
+
+void ComputeSigma(const Rect& block_rect, PassesDecoderState* state) {
+  const LoopFilter& lf = state->shared->frame_header.loop_filter;
+  JXL_CHECK(lf.epf_iters > 0);
+  const AcStrategyImage& ac_strategy = state->shared->ac_strategy;
+  const float quant_scale = state->shared->quantizer.Scale();
+
+  const size_t sigma_stride = state->sigma.PixelsPerRow();
+  const size_t sharpness_stride = state->shared->epf_sharpness.PixelsPerRow();
+
+  for (size_t by = 0; by < block_rect.ysize(); ++by) {
+    float* JXL_RESTRICT sigma_row = block_rect.Row(&state->sigma, by);
+    const uint8_t* JXL_RESTRICT sharpness_row =
+        block_rect.ConstRow(state->shared->epf_sharpness, by);
+    AcStrategyRow acs_row = ac_strategy.ConstRow(block_rect, by);
+    const int32_t* const JXL_RESTRICT row_quant =
+        block_rect.ConstRow(state->shared->raw_quant_field, by);
+
+    for (size_t bx = 0; bx < block_rect.xsize(); bx++) {
+      AcStrategy acs = acs_row[bx];
+      size_t llf_x = acs.covered_blocks_x();
+      if (!acs.IsFirstBlock()) continue;
+      // quant_scale is smaller for low quality.
+      // quant_scale is roughly 0.08 / butteraugli score.
+      //
+      // row_quant is smaller for low quality.
+      // row_quant is a quantization multiplier of form 1.0 /
+      // row_quant[bx]
+      //
+      // lf.epf_quant_mul is a parameter in the format
+      // kInvSigmaNum is a constant
+      float sigma_quant =
+          lf.epf_quant_mul / (quant_scale * row_quant[bx] * kInvSigmaNum);
+      for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) {
+        for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) {
+          float sigma =
+              sigma_quant *
+              lf.epf_sharp_lut[sharpness_row[bx + ix + iy * sharpness_stride]];
+          // Avoid infinities.
+          sigma = std::min(-1e-4f, sigma);  // TODO(veluca): remove this.
+          sigma_row[bx + ix + kSigmaPadding +
+                    (iy + kSigmaPadding) * sigma_stride] = 1.0f / sigma;
+        }
+      }
+      // TODO(veluca): remove this padding.
+      // Left padding with mirroring.
+      if (bx + block_rect.x0() == 0) {
+        for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) {
+          LeftMirror(
+              sigma_row + kSigmaPadding + (iy + kSigmaPadding) * sigma_stride,
+              kSigmaBorder);
+        }
+      }
+      // Right padding with mirroring.
+      if (bx + block_rect.x0() + llf_x ==
+          state->shared->frame_dim.xsize_blocks) {
+        for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) {
+          RightMirror(sigma_row + kSigmaPadding + bx + llf_x +
+                          (iy + kSigmaPadding) * sigma_stride,
+                      kSigmaBorder);
+        }
+      }
+      // Offsets for row copying, in blocks.
+      size_t offset_before = bx + block_rect.x0() == 0 ? 1 : bx + kSigmaPadding;
+      size_t offset_after =
+          bx + block_rect.x0() + llf_x == state->shared->frame_dim.xsize_blocks
+              ? kSigmaPadding + llf_x + bx + kSigmaBorder
+              : kSigmaPadding + llf_x + bx;
+      size_t num = offset_after - offset_before;
+      // Above
+      if (by + block_rect.y0() == 0) {
+        for (size_t iy = 0; iy < kSigmaBorder; iy++) {
+          memcpy(
+              sigma_row + offset_before +
+                  (kSigmaPadding - 1 - iy) * sigma_stride,
+              sigma_row + offset_before + (kSigmaPadding + iy) * sigma_stride,
+              num * sizeof(*sigma_row));
+        }
+      }
+      // Below
+      if (by + block_rect.y0() + acs.covered_blocks_y() ==
+          state->shared->frame_dim.ysize_blocks) {
+        for (size_t iy = 0; iy < kSigmaBorder; iy++) {
+          memcpy(
+              sigma_row + offset_before +
+                  sigma_stride * (acs.covered_blocks_y() + kSigmaPadding + iy),
+              sigma_row + offset_before +
+                  sigma_stride *
+                      (acs.covered_blocks_y() + kSigmaPadding - 1 - iy),
+              num * sizeof(*sigma_row));
+        }
+      }
+    }
+  }
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/epf.h b/third_party/jpeg-xl/lib/jxl/epf.h
new file mode 100644
index 0000000000..7a0834ed97
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/epf.h
@@ -0,0 +1,33 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_EPF_H_
+#define LIB_JXL_EPF_H_
+
+// Fast SIMD "in-loop" edge preserving filter (adaptive, nonlinear).
+
+#include <stddef.h>
+
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/dec_cache.h"
+#include "lib/jxl/passes_state.h"
+
+namespace jxl {
+
+// 4 * (sqrt(0.5)-1), so that Weight(sigma) = 0.5.
+static constexpr float kInvSigmaNum = -1.1715728752538099024f;
+
+// kInvSigmaNum / 0.3
+constexpr float kMinSigma = -3.90524291751269967465540850526868f;
+
+// Fills the `state->filter_weights.sigma` image with the precomputed sigma
+// values in the area inside `block_rect`. Accesses the AC strategy, quant field
+// and epf_sharpness fields in the corresponding positions.
+void ComputeSigma(const Rect& block_rect, PassesDecoderState* state);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_EPF_H_
diff --git a/third_party/jpeg-xl/lib/jxl/exif.h b/third_party/jpeg-xl/lib/jxl/exif.h
new file mode 100644
index 0000000000..0cf493fc71
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/exif.h
@@ -0,0 +1,87 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_EXIF_H_
+#define LIB_JXL_EXIF_H_
+
+// Basic parsing of Exif (just enough for the render-impacting things
+// like orientation)
+
+#include <jxl/codestream_header.h>
+
+#include "lib/jxl/base/padded_bytes.h"
+#include "lib/jxl/image_metadata.h"
+
+namespace jxl {
+
+constexpr uint16_t kExifOrientationTag = 274;
+
+// Checks if a blob looks like Exif, and if so, sets bigendian
+// according to the tiff endianness
+inline bool IsExif(const std::vector<uint8_t>& exif, bool* bigendian) {
+  if (exif.size() < 12) return false;  // not enough bytes for a valid exif blob
+  const uint8_t* t = exif.data();
+  if (LoadLE32(t) == 0x2A004D4D) {
+    *bigendian = true;
+    return true;
+  } else if (LoadLE32(t) == 0x002A4949) {
+    *bigendian = false;
+    return true;
+  }
+  return false;  // not a valid tiff header
+}
+
+// Finds the position of an Exif tag, or 0 if it is not found
+inline size_t FindExifTagPosition(const std::vector<uint8_t>& exif,
+                                  uint16_t tagname) {
+  bool bigendian;
+  if (!IsExif(exif, &bigendian)) return 0;
+  const uint8_t* t = exif.data() + 4;
+  uint64_t offset = (bigendian ? LoadBE32(t) : LoadLE32(t));
+  if (exif.size() < 12 + offset + 2 || offset < 8) return 0;
+  t += offset - 4;
+  if (offset + 2 >= exif.size()) return 0;
+  uint16_t nb_tags = (bigendian ? LoadBE16(t) : LoadLE16(t));
+  t += 2;
+  while (nb_tags > 0) {
+    if (t + 12 >= exif.data() + exif.size()) return 0;
+    uint16_t tag = (bigendian ? LoadBE16(t) : LoadLE16(t));
+    t += 2;
+    if (tag == tagname) return static_cast<size_t>(t - exif.data());
+    t += 10;
+    nb_tags--;
+  }
+  return 0;
+}
+
+// TODO (jon): tag 1 can be used to represent Adobe RGB 1998 if it has value
+// "R03"
+// TODO (jon): set intrinsic dimensions according to
+// https://discourse.wicg.io/t/proposal-exif-image-resolution-auto-and-from-image/4326/24
+// Parses the Exif data just enough to extract any render-impacting info.
+// If the Exif data is invalid or could not be parsed, then it is treated
+// as a no-op.
+inline void InterpretExif(const std::vector<uint8_t>& exif,
+                          JxlOrientation* orientation) {
+  bool bigendian;
+  if (!IsExif(exif, &bigendian)) return;
+  size_t o_pos = FindExifTagPosition(exif, kExifOrientationTag);
+  if (o_pos) {
+    const uint8_t* t = exif.data() + o_pos;
+    uint16_t type = (bigendian ? LoadBE16(t) : LoadLE16(t));
+    t += 2;
+    uint32_t count = (bigendian ? LoadBE32(t) : LoadLE32(t));
+    t += 4;
+    uint16_t value = (bigendian ? LoadBE16(t) : LoadLE16(t));
+    t += 4;
+    if (type == 3 && count == 1 && value >= 1 && value <= 8) {
+      *orientation = static_cast<JxlOrientation>(value);
+    }
+  }
+}
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_EXIF_H_
diff --git a/third_party/jpeg-xl/lib/jxl/fake_parallel_runner_testonly.h b/third_party/jpeg-xl/lib/jxl/fake_parallel_runner_testonly.h
new file mode 100644
index 0000000000..508d808cc5
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/fake_parallel_runner_testonly.h
@@ -0,0 +1,79 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_FAKE_PARALLEL_RUNNER_TESTONLY_H_
+#define LIB_JXL_FAKE_PARALLEL_RUNNER_TESTONLY_H_
+
+#include <jxl/parallel_runner.h>
+#include <stdint.h>
+
+#include <vector>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/random.h"
+
+namespace jxl {
+
+// A parallel runner implementation that runs all the jobs in a single thread
+// (the caller thread) but runs them pretending to use multiple threads and
+// potentially out of order. This is useful for testing conditions that only
+// occur under heavy load where the order of operations is different.
+class FakeParallelRunner {
+ public:
+  FakeParallelRunner(uint32_t order_seed, uint32_t num_threads)
+      : order_seed_(order_seed), rng_(order_seed), num_threads_(num_threads) {
+    if (num_threads_ < 1) num_threads_ = 1;
+  }
+
+  JxlParallelRetCode Run(void* jxl_opaque, JxlParallelRunInit init,
+                         JxlParallelRunFunction func, uint32_t start,
+                         uint32_t end) {
+    JxlParallelRetCode ret = init(jxl_opaque, num_threads_);
+    if (ret != 0) return ret;
+
+    if (order_seed_ == 0) {
+      for (uint32_t i = start; i < end; i++) {
+        func(jxl_opaque, i, i % num_threads_);
+      }
+    } else {
+      std::vector<uint32_t> order(end - start);
+      for (uint32_t i = start; i < end; i++) {
+        order[i - start] = i;
+      }
+      rng_.Shuffle(order.data(), order.size());
+      for (uint32_t i = start; i < end; i++) {
+        func(jxl_opaque, order[i - start], i % num_threads_);
+      }
+    }
+    return ret;
+  }
+
+ private:
+  // Seed for the RNG for defining the execution order. A value of 0 means
+  // sequential order from start to end.
+  uint32_t order_seed_;
+
+  // The PRNG object, initialized with the order_seed_. Only used if the seed is
+  // not 0.
+  Rng rng_;
+
+  // Number of fake threads. All the tasks are run on the same thread, but using
+  // different thread_id values based on this num_threads.
+  uint32_t num_threads_;
+};
+
+}  // namespace jxl
+
+extern "C" {
+// Function to pass as the parallel runner.
+JXL_INLINE JxlParallelRetCode JxlFakeParallelRunner(
+    void* runner_opaque, void* jpegxl_opaque, JxlParallelRunInit init,
+    JxlParallelRunFunction func, uint32_t start_range, uint32_t end_range) {
+  return static_cast<jxl::FakeParallelRunner*>(runner_opaque)
+      ->Run(jpegxl_opaque, init, func, start_range, end_range);
+}
+}
+
+#endif  // LIB_JXL_FAKE_PARALLEL_RUNNER_TESTONLY_H_
diff --git a/third_party/jpeg-xl/lib/jxl/fast_dct-inl.h b/third_party/jpeg-xl/lib/jxl/fast_dct-inl.h
new file mode 100644
index 0000000000..d2453b0e10
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/fast_dct-inl.h
@@ -0,0 +1,238 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#if defined(LIB_JXL_FAST_DCT_INL_H_) == defined(HWY_TARGET_TOGGLE)
+#ifdef LIB_JXL_FAST_DCT_INL_H_
+#undef LIB_JXL_FAST_DCT_INL_H_
+#else
+#define LIB_JXL_FAST_DCT_INL_H_
+#endif
+
+#include <cmath>
+
+#include <hwy/aligned_allocator.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/base/status.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+namespace {
+
+#if HWY_TARGET == HWY_NEON
+HWY_NOINLINE void FastTransposeBlock(const int16_t* JXL_RESTRICT data_in,
+                                     size_t stride_in, size_t N, size_t M,
+                                     int16_t* JXL_RESTRICT data_out,
+                                     size_t stride_out) {
+  JXL_DASSERT(N % 8 == 0);
+  JXL_DASSERT(M % 8 == 0);
+  for (size_t i = 0; i < N; i += 8) {
+    for (size_t j = 0; j < M; j += 8) {
+      // TODO(veluca): one could optimize the M==8, stride_in==8 case further
+      // with vld4.
+      // This code is about 40% faster for N == M == stride_in ==
+      // stride_out == 8
+      // Using loads + stores to reshuffle things to be able to
+      // use vld4 doesn't help.
+      /*
+      auto a0 = vld4q_s16(data_in); auto a1 = vld4q_s16(data_in + 32);
+      int16x8x4_t out0;
+      int16x8x4_t out1;
+      out0.val[0] = vuzp1q_s16(a0.val[0], a1.val[0]);
+      out0.val[1] = vuzp1q_s16(a0.val[1], a1.val[1]);
+      out0.val[2] = vuzp1q_s16(a0.val[2], a1.val[2]);
+      out0.val[3] = vuzp1q_s16(a0.val[3], a1.val[3]);
+      out1.val[0] = vuzp2q_s16(a0.val[0], a1.val[0]);
+      out1.val[1] = vuzp2q_s16(a0.val[1], a1.val[1]);
+      out1.val[2] = vuzp2q_s16(a0.val[2], a1.val[2]);
+      out1.val[3] = vuzp2q_s16(a0.val[3], a1.val[3]);
+      vst1q_s16_x4(data_out, out0);
+      vst1q_s16_x4(data_out + 32, out1);
+      */
+      auto a0 = vld1q_s16(data_in + i * stride_in + j);
+      auto a1 = vld1q_s16(data_in + (i + 1) * stride_in + j);
+      auto a2 = vld1q_s16(data_in + (i + 2) * stride_in + j);
+      auto a3 = vld1q_s16(data_in + (i + 3) * stride_in + j);
+
+      auto a01 = vtrnq_s16(a0, a1);
+      auto a23 = vtrnq_s16(a2, a3);
+
+      auto four0 = vtrnq_s32(vreinterpretq_s32_s16(a01.val[0]),
+                             vreinterpretq_s32_s16(a23.val[0]));
+      auto four1 = vtrnq_s32(vreinterpretq_s32_s16(a01.val[1]),
+                             vreinterpretq_s32_s16(a23.val[1]));
+
+      auto a4 = vld1q_s16(data_in + (i + 4) * stride_in + j);
+      auto a5 = vld1q_s16(data_in + (i + 5) * stride_in + j);
+      auto a6 = vld1q_s16(data_in + (i + 6) * stride_in + j);
+      auto a7 = vld1q_s16(data_in + (i + 7) * stride_in + j);
+
+      auto a45 = vtrnq_s16(a4, a5);
+      auto a67 = vtrnq_s16(a6, a7);
+
+      auto four2 = vtrnq_s32(vreinterpretq_s32_s16(a45.val[0]),
+                             vreinterpretq_s32_s16(a67.val[0]));
+      auto four3 = vtrnq_s32(vreinterpretq_s32_s16(a45.val[1]),
+                             vreinterpretq_s32_s16(a67.val[1]));
+
+      auto out0 =
+          vcombine_s32(vget_low_s32(four0.val[0]), vget_low_s32(four2.val[0]));
+      auto out1 =
+          vcombine_s32(vget_low_s32(four1.val[0]), vget_low_s32(four3.val[0]));
+      auto out2 =
+          vcombine_s32(vget_low_s32(four0.val[1]), vget_low_s32(four2.val[1]));
+      auto out3 =
+          vcombine_s32(vget_low_s32(four1.val[1]), vget_low_s32(four3.val[1]));
+      auto out4 = vcombine_s32(vget_high_s32(four0.val[0]),
+                               vget_high_s32(four2.val[0]));
+      auto out5 = vcombine_s32(vget_high_s32(four1.val[0]),
+                               vget_high_s32(four3.val[0]));
+      auto out6 = vcombine_s32(vget_high_s32(four0.val[1]),
+                               vget_high_s32(four2.val[1]));
+      auto out7 = vcombine_s32(vget_high_s32(four1.val[1]),
+                               vget_high_s32(four3.val[1]));
+      vst1q_s16(data_out + j * stride_out + i, vreinterpretq_s16_s32(out0));
+      vst1q_s16(data_out + (j + 1) * stride_out + i,
+                vreinterpretq_s16_s32(out1));
+      vst1q_s16(data_out + (j + 2) * stride_out + i,
+                vreinterpretq_s16_s32(out2));
+      vst1q_s16(data_out + (j + 3) * stride_out + i,
+                vreinterpretq_s16_s32(out3));
+      vst1q_s16(data_out + (j + 4) * stride_out + i,
+                vreinterpretq_s16_s32(out4));
+      vst1q_s16(data_out + (j + 5) * stride_out + i,
+                vreinterpretq_s16_s32(out5));
+      vst1q_s16(data_out + (j + 6) * stride_out + i,
+                vreinterpretq_s16_s32(out6));
+      vst1q_s16(data_out + (j + 7) * stride_out + i,
+                vreinterpretq_s16_s32(out7));
+    }
+  }
+}
+
+template <size_t N>
+struct FastDCTTag {};
+
+#include "lib/jxl/fast_dct128-inl.h"
+#include "lib/jxl/fast_dct16-inl.h"
+#include "lib/jxl/fast_dct256-inl.h"
+#include "lib/jxl/fast_dct32-inl.h"
+#include "lib/jxl/fast_dct64-inl.h"
+#include "lib/jxl/fast_dct8-inl.h"
+
+template <size_t ROWS, size_t COLS>
+struct ComputeFastScaledIDCT {
+  // scratch_space must be aligned, and should have space for ROWS*COLS
+  // int16_ts.
+  HWY_MAYBE_UNUSED void operator()(int16_t* JXL_RESTRICT from, int16_t* to,
+                                   size_t to_stride,
+                                   int16_t* JXL_RESTRICT scratch_space) {
+    // Reverse the steps done in ComputeScaledDCT.
+    if (ROWS < COLS) {
+      FastTransposeBlock(from, COLS, ROWS, COLS, scratch_space, ROWS);
+      FastIDCT(FastDCTTag<COLS>(), scratch_space, ROWS, from, ROWS, ROWS);
+      FastTransposeBlock(from, ROWS, COLS, ROWS, scratch_space, COLS);
+      FastIDCT(FastDCTTag<ROWS>(), scratch_space, COLS, to, to_stride, COLS);
+    } else {
+      FastIDCT(FastDCTTag<COLS>(), from, ROWS, scratch_space, ROWS, ROWS);
+      FastTransposeBlock(scratch_space, ROWS, COLS, ROWS, from, COLS);
+      FastIDCT(FastDCTTag<ROWS>(), from, COLS, to, to_stride, COLS);
+    }
+  }
+};
+#endif
+
+template <size_t N, size_t M>
+HWY_NOINLINE void TestFastIDCT() {
+#if HWY_TARGET == HWY_NEON
+  auto pixels_mem = hwy::AllocateAligned<float>(N * M);
+  float* pixels = pixels_mem.get();
+  auto dct_mem = hwy::AllocateAligned<float>(N * M);
+  float* dct = dct_mem.get();
+  auto dct_i_mem = hwy::AllocateAligned<int16_t>(N * M);
+  int16_t* dct_i = dct_i_mem.get();
+  auto dct_in_mem = hwy::AllocateAligned<int16_t>(N * M);
+  int16_t* dct_in = dct_in_mem.get();
+  auto idct_mem = hwy::AllocateAligned<int16_t>(N * M);
+  int16_t* idct = idct_mem.get();
+
+  auto scratch_space_mem = hwy::AllocateAligned<float>(N * M * 2);
+  float* scratch_space = scratch_space_mem.get();
+  auto scratch_space_i_mem = hwy::AllocateAligned<int16_t>(N * M * 2);
+  int16_t* scratch_space_i = scratch_space_i_mem.get();
+
+  Rng rng(0);
+  for (size_t i = 0; i < N * M; i++) {
+    pixels[i] = rng.UniformF(-1, 1);
+  }
+  ComputeScaledDCT<M, N>()(DCTFrom(pixels, N), dct, scratch_space);
+  size_t integer_bits = std::max(FastIDCTIntegerBits(FastDCTTag<N>()),
+                                 FastIDCTIntegerBits(FastDCTTag<M>()));
+  // Enough range for [-2, 2] output values.
+  JXL_ASSERT(integer_bits <= 14);
+  float scale = (1 << (14 - integer_bits));
+  for (size_t i = 0; i < N * M; i++) {
+    dct_i[i] = std::round(dct[i] * scale);
+  }
+
+  for (size_t j = 0; j < 40000000 / (M * N); j++) {
+    memcpy(dct_in, dct_i, sizeof(*dct_i) * N * M);
+    ComputeFastScaledIDCT<M, N>()(dct_in, idct, N, scratch_space_i);
+  }
+  float max_error = 0;
+  for (size_t i = 0; i < M * N; i++) {
+    float err = std::abs(idct[i] * (1.0f / scale) - pixels[i]);
+    if (std::abs(err) > max_error) {
+      max_error = std::abs(err);
+    }
+  }
+  printf("max error: %f mantissa bits: %d\n", max_error,
+         14 - (int)integer_bits);
+#endif
+}
+
+template <size_t N, size_t M>
+HWY_NOINLINE void TestFloatIDCT() {
+  auto pixels_mem = hwy::AllocateAligned<float>(N * M);
+  float* pixels = pixels_mem.get();
+  auto dct_mem = hwy::AllocateAligned<float>(N * M);
+  float* dct = dct_mem.get();
+  auto idct_mem = hwy::AllocateAligned<float>(N * M);
+  float* idct = idct_mem.get();
+
+  auto dct_in_mem = hwy::AllocateAligned<float>(N * M);
+  float* dct_in = dct_mem.get();
+
+  auto scratch_space_mem = hwy::AllocateAligned<float>(N * M * 2);
+  float* scratch_space = scratch_space_mem.get();
+
+  Rng rng(0);
+  for (size_t i = 0; i < N * M; i++) {
+    pixels[i] = rng.UniformF(-1, 1);
+  }
+  ComputeScaledDCT<M, N>()(DCTFrom(pixels, N), dct, scratch_space);
+
+  for (size_t j = 0; j < 40000000 / (M * N); j++) {
+    memcpy(dct_in, dct, sizeof(*dct) * N * M);
+    ComputeScaledIDCT<M, N>()(dct_in, DCTTo(idct, N), scratch_space);
+  }
+  float max_error = 0;
+  for (size_t i = 0; i < M * N; i++) {
+    float err = std::abs(idct[i] - pixels[i]);
+    if (std::abs(err) > max_error) {
+      max_error = std::abs(err);
+    }
+  }
+  printf("max error: %e\n", max_error);
+}
+
+}  // namespace
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#endif  // LIB_JXL_FAST_DCT_INL_H_
diff --git a/third_party/jpeg-xl/lib/jxl/fast_dct.cc b/third_party/jpeg-xl/lib/jxl/fast_dct.cc
new file mode 100644
index 0000000000..d796018fd0
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/fast_dct.cc
@@ -0,0 +1,37 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/fast_dct.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/base/random.h"
+#include "lib/jxl/dct-inl.h"
+#include "lib/jxl/fast_dct-inl.h"
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+namespace {
+void BenchmarkFloatIDCT32x32() { TestFloatIDCT<32, 32>(); }
+void BenchmarkFastIDCT32x32() { TestFastIDCT<32, 32>(); }
+}  // namespace
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+HWY_EXPORT(BenchmarkFloatIDCT32x32);
+HWY_EXPORT(BenchmarkFastIDCT32x32);
+void BenchmarkFloatIDCT32x32() {
+  HWY_DYNAMIC_DISPATCH(BenchmarkFloatIDCT32x32)();
+}
+void BenchmarkFastIDCT32x32() {
+  HWY_DYNAMIC_DISPATCH(BenchmarkFastIDCT32x32)();
+}
+}  // namespace jxl
+#endif
diff --git a/third_party/jpeg-xl/lib/jxl/fast_dct.h b/third_party/jpeg-xl/lib/jxl/fast_dct.h
new file mode 100644
index 0000000000..641933d8a0
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/fast_dct.h
@@ -0,0 +1,9 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+namespace jxl {
+void BenchmarkFloatIDCT32x32();
+void BenchmarkFastIDCT32x32();
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/fast_dct128-inl.h b/third_party/jpeg-xl/lib/jxl/fast_dct128-inl.h
new file mode 100644
index 0000000000..1a94d3ee92
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/fast_dct128-inl.h
@@ -0,0 +1,2137 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+/* This file is automatically generated. Do not modify it directly. */
+#if HWY_TARGET != HWY_NEON
+#error "only include this file from fast_dct-inl.h"
+#endif
+
+constexpr size_t FastIDCTIntegerBits(FastDCTTag<128>) { return 2; }
+
+void FastIDCT(FastDCTTag<128>, const int16_t* in, size_t in_stride,
+              int16_t* out, size_t out_stride, size_t count) {
+  JXL_ASSERT(count % 8 == 0);
+  for (size_t i = 0; i < count; i += 8) {
+    int16x8_t v0 = vld1q_s16(in + in_stride * 0 + i);
+    int16x8_t v1 = vld1q_s16(in + in_stride * 64 + i);
+    int16x8_t v2 = vaddq_s16(v0, v1);
+    int16x8_t v3 = vld1q_s16(in + in_stride * 32 + i);
+    int16x8_t v4_tmp = vqrdmulhq_n_s16(v3, 13573);
+    int16x8_t v4 = vaddq_s16(v4_tmp, v3);
+    int16x8_t v5 = vld1q_s16(in + in_stride * 96 + i);
+    int16x8_t v6 = vaddq_s16(v5, v3);
+    int16x8_t v7 = vaddq_s16(v4, v6);
+    int16x8_t v8 = vqrdmulhq_n_s16(v7, 17734);
+    int16x8_t v9 = vaddq_s16(v2, v8);
+    int16x8_t v10 = vld1q_s16(in + in_stride * 16 + i);
+    int16x8_t v11_tmp = vqrdmulhq_n_s16(v10, 13573);
+    int16x8_t v11 = vaddq_s16(v11_tmp, v10);
+    int16x8_t v12 = vld1q_s16(in + in_stride * 80 + i);
+    int16x8_t v13 = vld1q_s16(in + in_stride * 48 + i);
+    int16x8_t v14 = vaddq_s16(v12, v13);
+    int16x8_t v15 = vaddq_s16(v11, v14);
+    int16x8_t v16 = vaddq_s16(v13, v10);
+    int16x8_t v17_tmp = vqrdmulhq_n_s16(v16, 13573);
+    int16x8_t v17 = vaddq_s16(v17_tmp, v16);
+    int16x8_t v18 = vld1q_s16(in + in_stride * 112 + i);
+    int16x8_t v19 = vaddq_s16(v18, v12);
+    int16x8_t v20 = vaddq_s16(v19, v16);
+    int16x8_t v21 = vaddq_s16(v17, v20);
+    int16x8_t v22 = vqrdmulhq_n_s16(v21, 17734);
+    int16x8_t v23 = vaddq_s16(v15, v22);
+    int16x8_t v24 = vqrdmulhq_n_s16(v23, 16705);
+    int16x8_t v25 = vaddq_s16(v9, v24);
+    int16x8_t v26 = vld1q_s16(in + in_stride * 8 + i);
+    int16x8_t v27_tmp = vqrdmulhq_n_s16(v26, 13573);
+    int16x8_t v27 = vaddq_s16(v27_tmp, v26);
+    int16x8_t v28 = vld1q_s16(in + in_stride * 72 + i);
+    int16x8_t v29 = vld1q_s16(in + in_stride * 56 + i);
+    int16x8_t v30 = vaddq_s16(v28, v29);
+    int16x8_t v31 = vaddq_s16(v27, v30);
+    int16x8_t v32 = vld1q_s16(in + in_stride * 40 + i);
+    int16x8_t v33 = vld1q_s16(in + in_stride * 24 + i);
+    int16x8_t v34 = vaddq_s16(v32, v33);
+    int16x8_t v35_tmp = vqrdmulhq_n_s16(v34, 13573);
+    int16x8_t v35 = vaddq_s16(v35_tmp, v34);
+    int16x8_t v36 = vld1q_s16(in + in_stride * 104 + i);
+    int16x8_t v37 = vld1q_s16(in + in_stride * 88 + i);
+    int16x8_t v38 = vaddq_s16(v36, v37);
+    int16x8_t v39 = vaddq_s16(v38, v34);
+    int16x8_t v40 = vaddq_s16(v35, v39);
+    int16x8_t v41 = vqrdmulhq_n_s16(v40, 17734);
+    int16x8_t v42 = vaddq_s16(v31, v41);
+    int16x8_t v43 = vaddq_s16(v33, v26);
+    int16x8_t v44_tmp = vqrdmulhq_n_s16(v43, 13573);
+    int16x8_t v44 = vaddq_s16(v44_tmp, v43);
+    int16x8_t v45 = vaddq_s16(v37, v28);
+    int16x8_t v46 = vaddq_s16(v29, v32);
+    int16x8_t v47 = vaddq_s16(v45, v46);
+    int16x8_t v48 = vaddq_s16(v44, v47);
+    int16x8_t v49 = vaddq_s16(v46, v43);
+    int16x8_t v50_tmp = vqrdmulhq_n_s16(v49, 13573);
+    int16x8_t v50 = vaddq_s16(v50_tmp, v49);
+    int16x8_t v51 = vld1q_s16(in + in_stride * 120 + i);
+    int16x8_t v52 = vaddq_s16(v51, v36);
+    int16x8_t v53 = vaddq_s16(v52, v45);
+    int16x8_t v54 = vaddq_s16(v53, v49);
+    int16x8_t v55 = vaddq_s16(v50, v54);
+    int16x8_t v56 = vqrdmulhq_n_s16(v55, 17734);
+    int16x8_t v57 = vaddq_s16(v48, v56);
+    int16x8_t v58 = vqrdmulhq_n_s16(v57, 16705);
+    int16x8_t v59 = vaddq_s16(v42, v58);
+    int16x8_t v60 = vqrdmulhq_n_s16(v59, 16463);
+    int16x8_t v61 = vaddq_s16(v25, v60);
+    int16x8_t v62 = vld1q_s16(in + in_stride * 4 + i);
+    int16x8_t v63_tmp = vqrdmulhq_n_s16(v62, 13573);
+    int16x8_t v63 = vaddq_s16(v63_tmp, v62);
+    int16x8_t v64 = vld1q_s16(in + in_stride * 68 + i);
+    int16x8_t v65 = vld1q_s16(in + in_stride * 60 + i);
+    int16x8_t v66 = vaddq_s16(v64, v65);
+    int16x8_t v67 = vaddq_s16(v63, v66);
+    int16x8_t v68 = vld1q_s16(in + in_stride * 36 + i);
+    int16x8_t v69 = vld1q_s16(in + in_stride * 28 + i);
+    int16x8_t v70 = vaddq_s16(v68, v69);
+    int16x8_t v71_tmp = vqrdmulhq_n_s16(v70, 13573);
+    int16x8_t v71 = vaddq_s16(v71_tmp, v70);
+    int16x8_t v72 = vld1q_s16(in + in_stride * 100 + i);
+    int16x8_t v73 = vld1q_s16(in + in_stride * 92 + i);
+    int16x8_t v74 = vaddq_s16(v72, v73);
+    int16x8_t v75 = vaddq_s16(v74, v70);
+    int16x8_t v76 = vaddq_s16(v71, v75);
+    int16x8_t v77 = vqrdmulhq_n_s16(v76, 17734);
+    int16x8_t v78 = vaddq_s16(v67, v77);
+    int16x8_t v79 = vld1q_s16(in + in_stride * 20 + i);
+    int16x8_t v80 = vld1q_s16(in + in_stride * 12 + i);
+    int16x8_t v81 = vaddq_s16(v79, v80);
+    int16x8_t v82_tmp = vqrdmulhq_n_s16(v81, 13573);
+    int16x8_t v82 = vaddq_s16(v82_tmp, v81);
+    int16x8_t v83 = vld1q_s16(in + in_stride * 84 + i);
+    int16x8_t v84 = vld1q_s16(in + in_stride * 76 + i);
+    int16x8_t v85 = vaddq_s16(v83, v84);
+    int16x8_t v86 = vld1q_s16(in + in_stride * 52 + i);
+    int16x8_t v87 = vld1q_s16(in + in_stride * 44 + i);
+    int16x8_t v88 = vaddq_s16(v86, v87);
+    int16x8_t v89 = vaddq_s16(v85, v88);
+    int16x8_t v90 = vaddq_s16(v82, v89);
+    int16x8_t v91 = vaddq_s16(v88, v81);
+    int16x8_t v92_tmp = vqrdmulhq_n_s16(v91, 13573);
+    int16x8_t v92 = vaddq_s16(v92_tmp, v91);
+    int16x8_t v93 = vld1q_s16(in + in_stride * 116 + i);
+    int16x8_t v94 = vld1q_s16(in + in_stride * 108 + i);
+    int16x8_t v95 = vaddq_s16(v93, v94);
+    int16x8_t v96 = vaddq_s16(v95, v85);
+    int16x8_t v97 = vaddq_s16(v96, v91);
+    int16x8_t v98 = vaddq_s16(v92, v97);
+    int16x8_t v99 = vqrdmulhq_n_s16(v98, 17734);
+    int16x8_t v100 = vaddq_s16(v90, v99);
+    int16x8_t v101 = vqrdmulhq_n_s16(v100, 16705);
+    int16x8_t v102 = vaddq_s16(v78, v101);
+    int16x8_t v103 = vaddq_s16(v80, v62);
+    int16x8_t v104_tmp = vqrdmulhq_n_s16(v103, 13573);
+    int16x8_t v104 = vaddq_s16(v104_tmp, v103);
+    int16x8_t v105 = vaddq_s16(v84, v64);
+    int16x8_t v106 = vaddq_s16(v65, v86);
+    int16x8_t v107 = vaddq_s16(v105, v106);
+    int16x8_t v108 = vaddq_s16(v104, v107);
+    int16x8_t v109 = vaddq_s16(v87, v68);
+    int16x8_t v110 = vaddq_s16(v69, v79);
+    int16x8_t v111 = vaddq_s16(v109, v110);
+    int16x8_t v112_tmp = vqrdmulhq_n_s16(v111, 13573);
+    int16x8_t v112 = vaddq_s16(v112_tmp, v111);
+    int16x8_t v113 = vaddq_s16(v94, v72);
+    int16x8_t v114 = vaddq_s16(v73, v83);
+    int16x8_t v115 = vaddq_s16(v113, v114);
+    int16x8_t v116 = vaddq_s16(v115, v111);
+    int16x8_t v117 = vaddq_s16(v112, v116);
+    int16x8_t v118 = vqrdmulhq_n_s16(v117, 17734);
+    int16x8_t v119 = vaddq_s16(v108, v118);
+    int16x8_t v120 = vaddq_s16(v110, v103);
+    int16x8_t v121_tmp = vqrdmulhq_n_s16(v120, 13573);
+    int16x8_t v121 = vaddq_s16(v121_tmp, v120);
+    int16x8_t v122 = vaddq_s16(v114, v105);
+    int16x8_t v123 = vaddq_s16(v106, v109);
+    int16x8_t v124 = vaddq_s16(v122, v123);
+    int16x8_t v125 = vaddq_s16(v121, v124);
+    int16x8_t v126 = vaddq_s16(v123, v120);
+    int16x8_t v127_tmp = vqrdmulhq_n_s16(v126, 13573);
+    int16x8_t v127 = vaddq_s16(v127_tmp, v126);
+    int16x8_t v128 = vld1q_s16(in + in_stride * 124 + i);
+    int16x8_t v129 = vaddq_s16(v128, v93);
+    int16x8_t v130 = vaddq_s16(v129, v113);
+    int16x8_t v131 = vaddq_s16(v130, v122);
+    int16x8_t v132 = vaddq_s16(v131, v126);
+    int16x8_t v133 = vaddq_s16(v127, v132);
+    int16x8_t v134 = vqrdmulhq_n_s16(v133, 17734);
+    int16x8_t v135 = vaddq_s16(v125, v134);
+    int16x8_t v136 = vqrdmulhq_n_s16(v135, 16705);
+    int16x8_t v137 = vaddq_s16(v119, v136);
+    int16x8_t v138 = vqrdmulhq_n_s16(v137, 16463);
+    int16x8_t v139 = vaddq_s16(v102, v138);
+    int16x8_t v140 = vqrdmulhq_n_s16(v139, 16404);
+    int16x8_t v141 = vaddq_s16(v61, v140);
+    int16x8_t v142 = vld1q_s16(in + in_stride * 2 + i);
+    int16x8_t v143_tmp = vqrdmulhq_n_s16(v142, 13573);
+    int16x8_t v143 = vaddq_s16(v143_tmp, v142);
+    int16x8_t v144 = vld1q_s16(in + in_stride * 66 + i);
+    int16x8_t v145 = vld1q_s16(in + in_stride * 62 + i);
+    int16x8_t v146 = vaddq_s16(v144, v145);
+    int16x8_t v147 = vaddq_s16(v143, v146);
+    int16x8_t v148 = vld1q_s16(in + in_stride * 34 + i);
+    int16x8_t v149 = vld1q_s16(in + in_stride * 30 + i);
+    int16x8_t v150 = vaddq_s16(v148, v149);
+    int16x8_t v151_tmp = vqrdmulhq_n_s16(v150, 13573);
+    int16x8_t v151 = vaddq_s16(v151_tmp, v150);
+    int16x8_t v152 = vld1q_s16(in + in_stride * 98 + i);
+    int16x8_t v153 = vld1q_s16(in + in_stride * 94 + i);
+    int16x8_t v154 = vaddq_s16(v152, v153);
+    int16x8_t v155 = vaddq_s16(v154, v150);
+    int16x8_t v156 = vaddq_s16(v151, v155);
+    int16x8_t v157 = vqrdmulhq_n_s16(v156, 17734);
+    int16x8_t v158 = vaddq_s16(v147, v157);
+    int16x8_t v159 = vld1q_s16(in + in_stride * 18 + i);
+    int16x8_t v160 = vld1q_s16(in + in_stride * 14 + i);
+    int16x8_t v161 = vaddq_s16(v159, v160);
+    int16x8_t v162_tmp = vqrdmulhq_n_s16(v161, 13573);
+    int16x8_t v162 = vaddq_s16(v162_tmp, v161);
+    int16x8_t v163 = vld1q_s16(in + in_stride * 82 + i);
+    int16x8_t v164 = vld1q_s16(in + in_stride * 78 + i);
+    int16x8_t v165 = vaddq_s16(v163, v164);
+    int16x8_t v166 = vld1q_s16(in + in_stride * 50 + i);
+    int16x8_t v167 = vld1q_s16(in + in_stride * 46 + i);
+    int16x8_t v168 = vaddq_s16(v166, v167);
+    int16x8_t v169 = vaddq_s16(v165, v168);
+    int16x8_t v170 = vaddq_s16(v162, v169);
+    int16x8_t v171 = vaddq_s16(v168, v161);
+    int16x8_t v172_tmp = vqrdmulhq_n_s16(v171, 13573);
+    int16x8_t v172 = vaddq_s16(v172_tmp, v171);
+    int16x8_t v173 = vld1q_s16(in + in_stride * 114 + i);
+    int16x8_t v174 = vld1q_s16(in + in_stride * 110 + i);
+    int16x8_t v175 = vaddq_s16(v173, v174);
+    int16x8_t v176 = vaddq_s16(v175, v165);
+    int16x8_t v177 = vaddq_s16(v176, v171);
+    int16x8_t v178 = vaddq_s16(v172, v177);
+    int16x8_t v179 = vqrdmulhq_n_s16(v178, 17734);
+    int16x8_t v180 = vaddq_s16(v170, v179);
+    int16x8_t v181 = vqrdmulhq_n_s16(v180, 16705);
+    int16x8_t v182 = vaddq_s16(v158, v181);
+    int16x8_t v183 = vld1q_s16(in + in_stride * 10 + i);
+    int16x8_t v184 = vld1q_s16(in + in_stride * 6 + i);
+    int16x8_t v185 = vaddq_s16(v183, v184);
+    int16x8_t v186_tmp = vqrdmulhq_n_s16(v185, 13573);
+    int16x8_t v186 = vaddq_s16(v186_tmp, v185);
+    int16x8_t v187 = vld1q_s16(in + in_stride * 74 + i);
+    int16x8_t v188 = vld1q_s16(in + in_stride * 70 + i);
+    int16x8_t v189 = vaddq_s16(v187, v188);
+    int16x8_t v190 = vld1q_s16(in + in_stride * 58 + i);
+    int16x8_t v191 = vld1q_s16(in + in_stride * 54 + i);
+    int16x8_t v192 = vaddq_s16(v190, v191);
+    int16x8_t v193 = vaddq_s16(v189, v192);
+    int16x8_t v194 = vaddq_s16(v186, v193);
+    int16x8_t v195 = vld1q_s16(in + in_stride * 42 + i);
+    int16x8_t v196 = vld1q_s16(in + in_stride * 38 + i);
+    int16x8_t v197 = vaddq_s16(v195, v196);
+    int16x8_t v198 = vld1q_s16(in + in_stride * 26 + i);
+    int16x8_t v199 = vld1q_s16(in + in_stride * 22 + i);
+    int16x8_t v200 = vaddq_s16(v198, v199);
+    int16x8_t v201 = vaddq_s16(v197, v200);
+    int16x8_t v202_tmp = vqrdmulhq_n_s16(v201, 13573);
+    int16x8_t v202 = vaddq_s16(v202_tmp, v201);
+    int16x8_t v203 = vld1q_s16(in + in_stride * 106 + i);
+    int16x8_t v204 = vld1q_s16(in + in_stride * 102 + i);
+    int16x8_t v205 = vaddq_s16(v203, v204);
+    int16x8_t v206 = vld1q_s16(in + in_stride * 90 + i);
+    int16x8_t v207 = vld1q_s16(in + in_stride * 86 + i);
+    int16x8_t v208 = vaddq_s16(v206, v207);
+    int16x8_t v209 = vaddq_s16(v205, v208);
+    int16x8_t v210 = vaddq_s16(v209, v201);
+    int16x8_t v211 = vaddq_s16(v202, v210);
+    int16x8_t v212 = vqrdmulhq_n_s16(v211, 17734);
+    int16x8_t v213 = vaddq_s16(v194, v212);
+    int16x8_t v214 = vaddq_s16(v200, v185);
+    int16x8_t v215_tmp = vqrdmulhq_n_s16(v214, 13573);
+    int16x8_t v215 = vaddq_s16(v215_tmp, v214);
+    int16x8_t v216 = vaddq_s16(v208, v189);
+    int16x8_t v217 = vaddq_s16(v192, v197);
+    int16x8_t v218 = vaddq_s16(v216, v217);
+    int16x8_t v219 = vaddq_s16(v215, v218);
+    int16x8_t v220 = vaddq_s16(v217, v214);
+    int16x8_t v221_tmp = vqrdmulhq_n_s16(v220, 13573);
+    int16x8_t v221 = vaddq_s16(v221_tmp, v220);
+    int16x8_t v222 = vld1q_s16(in + in_stride * 122 + i);
+    int16x8_t v223 = vld1q_s16(in + in_stride * 118 + i);
+    int16x8_t v224 = vaddq_s16(v222, v223);
+    int16x8_t v225 = vaddq_s16(v224, v205);
+    int16x8_t v226 = vaddq_s16(v225, v216);
+    int16x8_t v227 = vaddq_s16(v226, v220);
+    int16x8_t v228 = vaddq_s16(v221, v227);
+    int16x8_t v229 = vqrdmulhq_n_s16(v228, 17734);
+    int16x8_t v230 = vaddq_s16(v219, v229);
+    int16x8_t v231 = vqrdmulhq_n_s16(v230, 16705);
+    int16x8_t v232 = vaddq_s16(v213, v231);
+    int16x8_t v233 = vqrdmulhq_n_s16(v232, 16463);
+    int16x8_t v234 = vaddq_s16(v182, v233);
+    int16x8_t v235 = vaddq_s16(v184, v142);
+    int16x8_t v236_tmp = vqrdmulhq_n_s16(v235, 13573);
+    int16x8_t v236 = vaddq_s16(v236_tmp, v235);
+    int16x8_t v237 = vaddq_s16(v188, v144);
+    int16x8_t v238 = vaddq_s16(v145, v190);
+    int16x8_t v239 = vaddq_s16(v237, v238);
+    int16x8_t v240 = vaddq_s16(v236, v239);
+    int16x8_t v241 = vaddq_s16(v196, v148);
+    int16x8_t v242 = vaddq_s16(v149, v198);
+    int16x8_t v243 = vaddq_s16(v241, v242);
+    int16x8_t v244_tmp = vqrdmulhq_n_s16(v243, 13573);
+    int16x8_t v244 = vaddq_s16(v244_tmp, v243);
+    int16x8_t v245 = vaddq_s16(v204, v152);
+    int16x8_t v246 = vaddq_s16(v153, v206);
+    int16x8_t v247 = vaddq_s16(v245, v246);
+    int16x8_t v248 = vaddq_s16(v247, v243);
+    int16x8_t v249 = vaddq_s16(v244, v248);
+    int16x8_t v250 = vqrdmulhq_n_s16(v249, 17734);
+    int16x8_t v251 = vaddq_s16(v240, v250);
+    int16x8_t v252 = vaddq_s16(v199, v159);
+    int16x8_t v253 = vaddq_s16(v160, v183);
+    int16x8_t v254 = vaddq_s16(v252, v253);
+    int16x8_t v255_tmp = vqrdmulhq_n_s16(v254, 13573);
+    int16x8_t v255 = vaddq_s16(v255_tmp, v254);
+    int16x8_t v256 = vaddq_s16(v207, v163);
+    int16x8_t v257 = vaddq_s16(v164, v187);
+    int16x8_t v258 = vaddq_s16(v256, v257);
+    int16x8_t v259 = vaddq_s16(v191, v166);
+    int16x8_t v260 = vaddq_s16(v167, v195);
+    int16x8_t v261 = vaddq_s16(v259, v260);
+    int16x8_t v262 = vaddq_s16(v258, v261);
+    int16x8_t v263 = vaddq_s16(v255, v262);
+    int16x8_t v264 = vaddq_s16(v261, v254);
+    int16x8_t v265_tmp = vqrdmulhq_n_s16(v264, 13573);
+    int16x8_t v265 = vaddq_s16(v265_tmp, v264);
+    int16x8_t v266 = vaddq_s16(v223, v173);
+    int16x8_t v267 = vaddq_s16(v174, v203);
+    int16x8_t v268 = vaddq_s16(v266, v267);
+    int16x8_t v269 = vaddq_s16(v268, v258);
+    int16x8_t v270 = vaddq_s16(v269, v264);
+    int16x8_t v271 = vaddq_s16(v265, v270);
+    int16x8_t v272 = vqrdmulhq_n_s16(v271, 17734);
+    int16x8_t v273 = vaddq_s16(v263, v272);
+    int16x8_t v274 = vqrdmulhq_n_s16(v273, 16705);
+    int16x8_t v275 = vaddq_s16(v251, v274);
+    int16x8_t v276 = vaddq_s16(v253, v235);
+    int16x8_t v277_tmp = vqrdmulhq_n_s16(v276, 13573);
+    int16x8_t v277 = vaddq_s16(v277_tmp, v276);
+    int16x8_t v278 = vaddq_s16(v257, v237);
+    int16x8_t v279 = vaddq_s16(v238, v259);
+    int16x8_t v280 = vaddq_s16(v278, v279);
+    int16x8_t v281 = vaddq_s16(v277, v280);
+    int16x8_t v282 = vaddq_s16(v260, v241);
+    int16x8_t v283 = vaddq_s16(v242, v252);
+    int16x8_t v284 = vaddq_s16(v282, v283);
+    int16x8_t v285_tmp = vqrdmulhq_n_s16(v284, 13573);
+    int16x8_t v285 = vaddq_s16(v285_tmp, v284);
+    int16x8_t v286 = vaddq_s16(v267, v245);
+    int16x8_t v287 = vaddq_s16(v246, v256);
+    int16x8_t v288 = vaddq_s16(v286, v287);
+    int16x8_t v289 = vaddq_s16(v288, v284);
+    int16x8_t v290 = vaddq_s16(v285, v289);
+    int16x8_t v291 = vqrdmulhq_n_s16(v290, 17734);
+    int16x8_t v292 = vaddq_s16(v281, v291);
+    int16x8_t v293 = vaddq_s16(v283, v276);
+    int16x8_t v294_tmp = vqrdmulhq_n_s16(v293, 13573);
+    int16x8_t v294 = vaddq_s16(v294_tmp, v293);
+    int16x8_t v295 = vaddq_s16(v287, v278);
+    int16x8_t v296 = vaddq_s16(v279, v282);
+    int16x8_t v297 = vaddq_s16(v295, v296);
+    int16x8_t v298 = vaddq_s16(v294, v297);
+    int16x8_t v299 = vaddq_s16(v296, v293);
+    int16x8_t v300_tmp = vqrdmulhq_n_s16(v299, 13573);
+    int16x8_t v300 = vaddq_s16(v300_tmp, v299);
+    int16x8_t v301 = vld1q_s16(in + in_stride * 126 + i);
+    int16x8_t v302 = vaddq_s16(v301, v222);
+    int16x8_t v303 = vaddq_s16(v302, v266);
+    int16x8_t v304 = vaddq_s16(v303, v286);
+    int16x8_t v305 = vaddq_s16(v304, v295);
+    int16x8_t v306 = vaddq_s16(v305, v299);
+    int16x8_t v307 = vaddq_s16(v300, v306);
+    int16x8_t v308 = vqrdmulhq_n_s16(v307, 17734);
+    int16x8_t v309 = vaddq_s16(v298, v308);
+    int16x8_t v310 = vqrdmulhq_n_s16(v309, 16705);
+    int16x8_t v311 = vaddq_s16(v292, v310);
+    int16x8_t v312 = vqrdmulhq_n_s16(v311, 16463);
+    int16x8_t v313 = vaddq_s16(v275, v312);
+    int16x8_t v314 = vqrdmulhq_n_s16(v313, 16404);
+    int16x8_t v315 = vaddq_s16(v234, v314);
+    int16x8_t v316 = vqrdmulhq_n_s16(v315, 16389);
+    int16x8_t v317 = vaddq_s16(v141, v316);
+    int16x8_t v318 = vld1q_s16(in + in_stride * 1 + i);
+    int16x8_t v319_tmp = vqrdmulhq_n_s16(v318, 13573);
+    int16x8_t v319 = vaddq_s16(v319_tmp, v318);
+    int16x8_t v320 = vld1q_s16(in + in_stride * 65 + i);
+    int16x8_t v321 = vld1q_s16(in + in_stride * 63 + i);
+    int16x8_t v322 = vaddq_s16(v320, v321);
+    int16x8_t v323 = vaddq_s16(v319, v322);
+    int16x8_t v324 = vld1q_s16(in + in_stride * 33 + i);
+    int16x8_t v325 = vld1q_s16(in + in_stride * 31 + i);
+    int16x8_t v326 = vaddq_s16(v324, v325);
+    int16x8_t v327_tmp = vqrdmulhq_n_s16(v326, 13573);
+    int16x8_t v327 = vaddq_s16(v327_tmp, v326);
+    int16x8_t v328 = vld1q_s16(in + in_stride * 97 + i);
+    int16x8_t v329 = vld1q_s16(in + in_stride * 95 + i);
+    int16x8_t v330 = vaddq_s16(v328, v329);
+    int16x8_t v331 = vaddq_s16(v330, v326);
+    int16x8_t v332 = vaddq_s16(v327, v331);
+    int16x8_t v333 = vqrdmulhq_n_s16(v332, 17734);
+    int16x8_t v334 = vaddq_s16(v323, v333);
+    int16x8_t v335 = vld1q_s16(in + in_stride * 17 + i);
+    int16x8_t v336 = vld1q_s16(in + in_stride * 15 + i);
+    int16x8_t v337 = vaddq_s16(v335, v336);
+    int16x8_t v338_tmp = vqrdmulhq_n_s16(v337, 13573);
+    int16x8_t v338 = vaddq_s16(v338_tmp, v337);
+    int16x8_t v339 = vld1q_s16(in + in_stride * 81 + i);
+    int16x8_t v340 = vld1q_s16(in + in_stride * 79 + i);
+    int16x8_t v341 = vaddq_s16(v339, v340);
+    int16x8_t v342 = vld1q_s16(in + in_stride * 49 + i);
+    int16x8_t v343 = vld1q_s16(in + in_stride * 47 + i);
+    int16x8_t v344 = vaddq_s16(v342, v343);
+    int16x8_t v345 = vaddq_s16(v341, v344);
+    int16x8_t v346 = vaddq_s16(v338, v345);
+    int16x8_t v347 = vaddq_s16(v344, v337);
+    int16x8_t v348_tmp = vqrdmulhq_n_s16(v347, 13573);
+    int16x8_t v348 = vaddq_s16(v348_tmp, v347);
+    int16x8_t v349 = vld1q_s16(in + in_stride * 113 + i);
+    int16x8_t v350 = vld1q_s16(in + in_stride * 111 + i);
+    int16x8_t v351 = vaddq_s16(v349, v350);
+    int16x8_t v352 = vaddq_s16(v351, v341);
+    int16x8_t v353 = vaddq_s16(v352, v347);
+    int16x8_t v354 = vaddq_s16(v348, v353);
+    int16x8_t v355 = vqrdmulhq_n_s16(v354, 17734);
+    int16x8_t v356 = vaddq_s16(v346, v355);
+    int16x8_t v357 = vqrdmulhq_n_s16(v356, 16705);
+    int16x8_t v358 = vaddq_s16(v334, v357);
+    int16x8_t v359 = vld1q_s16(in + in_stride * 9 + i);
+    int16x8_t v360 = vld1q_s16(in + in_stride * 7 + i);
+    int16x8_t v361 = vaddq_s16(v359, v360);
+    int16x8_t v362_tmp = vqrdmulhq_n_s16(v361, 13573);
+    int16x8_t v362 = vaddq_s16(v362_tmp, v361);
+    int16x8_t v363 = vld1q_s16(in + in_stride * 73 + i);
+    int16x8_t v364 = vld1q_s16(in + in_stride * 71 + i);
+    int16x8_t v365 = vaddq_s16(v363, v364);
+    int16x8_t v366 = vld1q_s16(in + in_stride * 57 + i);
+    int16x8_t v367 = vld1q_s16(in + in_stride * 55 + i);
+    int16x8_t v368 = vaddq_s16(v366, v367);
+    int16x8_t v369 = vaddq_s16(v365, v368);
+    int16x8_t v370 = vaddq_s16(v362, v369);
+    int16x8_t v371 = vld1q_s16(in + in_stride * 41 + i);
+    int16x8_t v372 = vld1q_s16(in + in_stride * 39 + i);
+    int16x8_t v373 = vaddq_s16(v371, v372);
+    int16x8_t v374 = vld1q_s16(in + in_stride * 25 + i);
+    int16x8_t v375 = vld1q_s16(in + in_stride * 23 + i);
+    int16x8_t v376 = vaddq_s16(v374, v375);
+    int16x8_t v377 = vaddq_s16(v373, v376);
+    int16x8_t v378_tmp = vqrdmulhq_n_s16(v377, 13573);
+    int16x8_t v378 = vaddq_s16(v378_tmp, v377);
+    int16x8_t v379 = vld1q_s16(in + in_stride * 105 + i);
+    int16x8_t v380 = vld1q_s16(in + in_stride * 103 + i);
+    int16x8_t v381 = vaddq_s16(v379, v380);
+    int16x8_t v382 = vld1q_s16(in + in_stride * 89 + i);
+    int16x8_t v383 = vld1q_s16(in + in_stride * 87 + i);
+    int16x8_t v384 = vaddq_s16(v382, v383);
+    int16x8_t v385 = vaddq_s16(v381, v384);
+    int16x8_t v386 = vaddq_s16(v385, v377);
+    int16x8_t v387 = vaddq_s16(v378, v386);
+    int16x8_t v388 = vqrdmulhq_n_s16(v387, 17734);
+    int16x8_t v389 = vaddq_s16(v370, v388);
+    int16x8_t v390 = vaddq_s16(v376, v361);
+    int16x8_t v391_tmp = vqrdmulhq_n_s16(v390, 13573);
+    int16x8_t v391 = vaddq_s16(v391_tmp, v390);
+    int16x8_t v392 = vaddq_s16(v384, v365);
+    int16x8_t v393 = vaddq_s16(v368, v373);
+    int16x8_t v394 = vaddq_s16(v392, v393);
+    int16x8_t v395 = vaddq_s16(v391, v394);
+    int16x8_t v396 = vaddq_s16(v393, v390);
+    int16x8_t v397_tmp = vqrdmulhq_n_s16(v396, 13573);
+    int16x8_t v397 = vaddq_s16(v397_tmp, v396);
+    int16x8_t v398 = vld1q_s16(in + in_stride * 121 + i);
+    int16x8_t v399 = vld1q_s16(in + in_stride * 119 + i);
+    int16x8_t v400 = vaddq_s16(v398, v399);
+    int16x8_t v401 = vaddq_s16(v400, v381);
+    int16x8_t v402 = vaddq_s16(v401, v392);
+    int16x8_t v403 = vaddq_s16(v402, v396);
+    int16x8_t v404 = vaddq_s16(v397, v403);
+    int16x8_t v405 = vqrdmulhq_n_s16(v404, 17734);
+    int16x8_t v406 = vaddq_s16(v395, v405);
+    int16x8_t v407 = vqrdmulhq_n_s16(v406, 16705);
+    int16x8_t v408 = vaddq_s16(v389, v407);
+    int16x8_t v409 = vqrdmulhq_n_s16(v408, 16463);
+    int16x8_t v410 = vaddq_s16(v358, v409);
+    int16x8_t v411 = vld1q_s16(in + in_stride * 5 + i);
+    int16x8_t v412 = vld1q_s16(in + in_stride * 3 + i);
+    int16x8_t v413 = vaddq_s16(v411, v412);
+    int16x8_t v414_tmp = vqrdmulhq_n_s16(v413, 13573);
+    int16x8_t v414 = vaddq_s16(v414_tmp, v413);
+    int16x8_t v415 = vld1q_s16(in + in_stride * 69 + i);
+    int16x8_t v416 = vld1q_s16(in + in_stride * 67 + i);
+    int16x8_t v417 = vaddq_s16(v415, v416);
+    int16x8_t v418 = vld1q_s16(in + in_stride * 61 + i);
+    int16x8_t v419 = vld1q_s16(in + in_stride * 59 + i);
+    int16x8_t v420 = vaddq_s16(v418, v419);
+    int16x8_t v421 = vaddq_s16(v417, v420);
+    int16x8_t v422 = vaddq_s16(v414, v421);
+    int16x8_t v423 = vld1q_s16(in + in_stride * 37 + i);
+    int16x8_t v424 = vld1q_s16(in + in_stride * 35 + i);
+    int16x8_t v425 = vaddq_s16(v423, v424);
+    int16x8_t v426 = vld1q_s16(in + in_stride * 29 + i);
+    int16x8_t v427 = vld1q_s16(in + in_stride * 27 + i);
+    int16x8_t v428 = vaddq_s16(v426, v427);
+    int16x8_t v429 = vaddq_s16(v425, v428);
+    int16x8_t v430_tmp = vqrdmulhq_n_s16(v429, 13573);
+    int16x8_t v430 = vaddq_s16(v430_tmp, v429);
+    int16x8_t v431 = vld1q_s16(in + in_stride * 101 + i);
+    int16x8_t v432 = vld1q_s16(in + in_stride * 99 + i);
+    int16x8_t v433 = vaddq_s16(v431, v432);
+    int16x8_t v434 = vld1q_s16(in + in_stride * 93 + i);
+    int16x8_t v435 = vld1q_s16(in + in_stride * 91 + i);
+    int16x8_t v436 = vaddq_s16(v434, v435);
+    int16x8_t v437 = vaddq_s16(v433, v436);
+    int16x8_t v438 = vaddq_s16(v437, v429);
+    int16x8_t v439 = vaddq_s16(v430, v438);
+    int16x8_t v440 = vqrdmulhq_n_s16(v439, 17734);
+    int16x8_t v441 = vaddq_s16(v422, v440);
+    int16x8_t v442 = vld1q_s16(in + in_stride * 21 + i);
+    int16x8_t v443 = vld1q_s16(in + in_stride * 19 + i);
+    int16x8_t v444 = vaddq_s16(v442, v443);
+    int16x8_t v445 = vld1q_s16(in + in_stride * 13 + i);
+    int16x8_t v446 = vld1q_s16(in + in_stride * 11 + i);
+    int16x8_t v447 = vaddq_s16(v445, v446);
+    int16x8_t v448 = vaddq_s16(v444, v447);
+    int16x8_t v449_tmp = vqrdmulhq_n_s16(v448, 13573);
+    int16x8_t v449 = vaddq_s16(v449_tmp, v448);
+    int16x8_t v450 = vld1q_s16(in + in_stride * 85 + i);
+    int16x8_t v451 = vld1q_s16(in + in_stride * 83 + i);
+    int16x8_t v452 = vaddq_s16(v450, v451);
+    int16x8_t v453 = vld1q_s16(in + in_stride * 77 + i);
+    int16x8_t v454 = vld1q_s16(in + in_stride * 75 + i);
+    int16x8_t v455 = vaddq_s16(v453, v454);
+    int16x8_t v456 = vaddq_s16(v452, v455);
+    int16x8_t v457 = vld1q_s16(in + in_stride * 53 + i);
+    int16x8_t v458 = vld1q_s16(in + in_stride * 51 + i);
+    int16x8_t v459 = vaddq_s16(v457, v458);
+    int16x8_t v460 = vld1q_s16(in + in_stride * 45 + i);
+    int16x8_t v461 = vld1q_s16(in + in_stride * 43 + i);
+    int16x8_t v462 = vaddq_s16(v460, v461);
+    int16x8_t v463 = vaddq_s16(v459, v462);
+    int16x8_t v464 = vaddq_s16(v456, v463);
+    int16x8_t v465 = vaddq_s16(v449, v464);
+    int16x8_t v466 = vaddq_s16(v463, v448);
+    int16x8_t v467_tmp = vqrdmulhq_n_s16(v466, 13573);
+    int16x8_t v467 = vaddq_s16(v467_tmp, v466);
+    int16x8_t v468 = vld1q_s16(in + in_stride * 117 + i);
+    int16x8_t v469 = vld1q_s16(in + in_stride * 115 + i);
+    int16x8_t v470 = vaddq_s16(v468, v469);
+    int16x8_t v471 = vld1q_s16(in + in_stride * 109 + i);
+    int16x8_t v472 = vld1q_s16(in + in_stride * 107 + i);
+    int16x8_t v473 = vaddq_s16(v471, v472);
+    int16x8_t v474 = vaddq_s16(v470, v473);
+    int16x8_t v475 = vaddq_s16(v474, v456);
+    int16x8_t v476 = vaddq_s16(v475, v466);
+    int16x8_t v477 = vaddq_s16(v467, v476);
+    int16x8_t v478 = vqrdmulhq_n_s16(v477, 17734);
+    int16x8_t v479 = vaddq_s16(v465, v478);
+    int16x8_t v480 = vqrdmulhq_n_s16(v479, 16705);
+    int16x8_t v481 = vaddq_s16(v441, v480);
+    int16x8_t v482 = vaddq_s16(v447, v413);
+    int16x8_t v483_tmp = vqrdmulhq_n_s16(v482, 13573);
+    int16x8_t v483 = vaddq_s16(v483_tmp, v482);
+    int16x8_t v484 = vaddq_s16(v455, v417);
+    int16x8_t v485 = vaddq_s16(v420, v459);
+    int16x8_t v486 = vaddq_s16(v484, v485);
+    int16x8_t v487 = vaddq_s16(v483, v486);
+    int16x8_t v488 = vaddq_s16(v462, v425);
+    int16x8_t v489 = vaddq_s16(v428, v444);
+    int16x8_t v490 = vaddq_s16(v488, v489);
+    int16x8_t v491_tmp = vqrdmulhq_n_s16(v490, 13573);
+    int16x8_t v491 = vaddq_s16(v491_tmp, v490);
+    int16x8_t v492 = vaddq_s16(v473, v433);
+    int16x8_t v493 = vaddq_s16(v436, v452);
+    int16x8_t v494 = vaddq_s16(v492, v493);
+    int16x8_t v495 = vaddq_s16(v494, v490);
+    int16x8_t v496 = vaddq_s16(v491, v495);
+    int16x8_t v497 = vqrdmulhq_n_s16(v496, 17734);
+    int16x8_t v498 = vaddq_s16(v487, v497);
+    int16x8_t v499 = vaddq_s16(v489, v482);
+    int16x8_t v500_tmp = vqrdmulhq_n_s16(v499, 13573);
+    int16x8_t v500 = vaddq_s16(v500_tmp, v499);
+    int16x8_t v501 = vaddq_s16(v493, v484);
+    int16x8_t v502 = vaddq_s16(v485, v488);
+    int16x8_t v503 = vaddq_s16(v501, v502);
+    int16x8_t v504 = vaddq_s16(v500, v503);
+    int16x8_t v505 = vaddq_s16(v502, v499);
+    int16x8_t v506_tmp = vqrdmulhq_n_s16(v505, 13573);
+    int16x8_t v506 = vaddq_s16(v506_tmp, v505);
+    int16x8_t v507 = vld1q_s16(in + in_stride * 125 + i);
+    int16x8_t v508 = vld1q_s16(in + in_stride * 123 + i);
+    int16x8_t v509 = vaddq_s16(v507, v508);
+    int16x8_t v510 = vaddq_s16(v509, v470);
+    int16x8_t v511 = vaddq_s16(v510, v492);
+    int16x8_t v512 = vaddq_s16(v511, v501);
+    int16x8_t v513 = vaddq_s16(v512, v505);
+    int16x8_t v514 = vaddq_s16(v506, v513);
+    int16x8_t v515 = vqrdmulhq_n_s16(v514, 17734);
+    int16x8_t v516 = vaddq_s16(v504, v515);
+    int16x8_t v517 = vqrdmulhq_n_s16(v516, 16705);
+    int16x8_t v518 = vaddq_s16(v498, v517);
+    int16x8_t v519 = vqrdmulhq_n_s16(v518, 16463);
+    int16x8_t v520 = vaddq_s16(v481, v519);
+    int16x8_t v521 = vqrdmulhq_n_s16(v520, 16404);
+    int16x8_t v522 = vaddq_s16(v410, v521);
+    int16x8_t v523 = vaddq_s16(v412, v318);
+    int16x8_t v524_tmp = vqrdmulhq_n_s16(v523, 13573);
+    int16x8_t v524 = vaddq_s16(v524_tmp, v523);
+    int16x8_t v525 = vaddq_s16(v416, v320);
+    int16x8_t v526 = vaddq_s16(v321, v418);
+    int16x8_t v527 = vaddq_s16(v525, v526);
+    int16x8_t v528 = vaddq_s16(v524, v527);
+    int16x8_t v529 = vaddq_s16(v424, v324);
+    int16x8_t v530 = vaddq_s16(v325, v426);
+    int16x8_t v531 = vaddq_s16(v529, v530);
+    int16x8_t v532_tmp = vqrdmulhq_n_s16(v531, 13573);
+    int16x8_t v532 = vaddq_s16(v532_tmp, v531);
+    int16x8_t v533 = vaddq_s16(v432, v328);
+    int16x8_t v534 = vaddq_s16(v329, v434);
+    int16x8_t v535 = vaddq_s16(v533, v534);
+    int16x8_t v536 = vaddq_s16(v535, v531);
+    int16x8_t v537 = vaddq_s16(v532, v536);
+    int16x8_t v538 = vqrdmulhq_n_s16(v537, 17734);
+    int16x8_t v539 = vaddq_s16(v528, v538);
+    int16x8_t v540 = vaddq_s16(v443, v335);
+    int16x8_t v541 = vaddq_s16(v336, v445);
+    int16x8_t v542 = vaddq_s16(v540, v541);
+    int16x8_t v543_tmp = vqrdmulhq_n_s16(v542, 13573);
+    int16x8_t v543 = vaddq_s16(v543_tmp, v542);
+    int16x8_t v544 = vaddq_s16(v451, v339);
+    int16x8_t v545 = vaddq_s16(v340, v453);
+    int16x8_t v546 = vaddq_s16(v544, v545);
+    int16x8_t v547 = vaddq_s16(v458, v342);
+    int16x8_t v548 = vaddq_s16(v343, v460);
+    int16x8_t v549 = vaddq_s16(v547, v548);
+    int16x8_t v550 = vaddq_s16(v546, v549);
+    int16x8_t v551 = vaddq_s16(v543, v550);
+    int16x8_t v552 = vaddq_s16(v549, v542);
+    int16x8_t v553_tmp = vqrdmulhq_n_s16(v552, 13573);
+    int16x8_t v553 = vaddq_s16(v553_tmp, v552);
+    int16x8_t v554 = vaddq_s16(v469, v349);
+    int16x8_t v555 = vaddq_s16(v350, v471);
+    int16x8_t v556 = vaddq_s16(v554, v555);
+    int16x8_t v557 = vaddq_s16(v556, v546);
+    int16x8_t v558 = vaddq_s16(v557, v552);
+    int16x8_t v559 = vaddq_s16(v553, v558);
+    int16x8_t v560 = vqrdmulhq_n_s16(v559, 17734);
+    int16x8_t v561 = vaddq_s16(v551, v560);
+    int16x8_t v562 = vqrdmulhq_n_s16(v561, 16705);
+    int16x8_t v563 = vaddq_s16(v539, v562);
+    int16x8_t v564 = vaddq_s16(v446, v359);
+    int16x8_t v565 = vaddq_s16(v360, v411);
+    int16x8_t v566 = vaddq_s16(v564, v565);
+    int16x8_t v567_tmp = vqrdmulhq_n_s16(v566, 13573);
+    int16x8_t v567 = vaddq_s16(v567_tmp, v566);
+    int16x8_t v568 = vaddq_s16(v454, v363);
+    int16x8_t v569 = vaddq_s16(v364, v415);
+    int16x8_t v570 = vaddq_s16(v568, v569);
+    int16x8_t v571 = vaddq_s16(v419, v366);
+    int16x8_t v572 = vaddq_s16(v367, v457);
+    int16x8_t v573 = vaddq_s16(v571, v572);
+    int16x8_t v574 = vaddq_s16(v570, v573);
+    int16x8_t v575 = vaddq_s16(v567, v574);
+    int16x8_t v576 = vaddq_s16(v461, v371);
+    int16x8_t v577 = vaddq_s16(v372, v423);
+    int16x8_t v578 = vaddq_s16(v576, v577);
+    int16x8_t v579 = vaddq_s16(v427, v374);
+    int16x8_t v580 = vaddq_s16(v375, v442);
+    int16x8_t v581 = vaddq_s16(v579, v580);
+    int16x8_t v582 = vaddq_s16(v578, v581);
+    int16x8_t v583_tmp = vqrdmulhq_n_s16(v582, 13573);
+    int16x8_t v583 = vaddq_s16(v583_tmp, v582);
+    int16x8_t v584 = vaddq_s16(v472, v379);
+    int16x8_t v585 = vaddq_s16(v380, v431);
+    int16x8_t v586 = vaddq_s16(v584, v585);
+    int16x8_t v587 = vaddq_s16(v435, v382);
+    int16x8_t v588 = vaddq_s16(v383, v450);
+    int16x8_t v589 = vaddq_s16(v587, v588);
+    int16x8_t v590 = vaddq_s16(v586, v589);
+    int16x8_t v591 = vaddq_s16(v590, v582);
+    int16x8_t v592 = vaddq_s16(v583, v591);
+    int16x8_t v593 = vqrdmulhq_n_s16(v592, 17734);
+    int16x8_t v594 = vaddq_s16(v575, v593);
+    int16x8_t v595 = vaddq_s16(v581, v566);
+    int16x8_t v596_tmp = vqrdmulhq_n_s16(v595, 13573);
+    int16x8_t v596 = vaddq_s16(v596_tmp, v595);
+    int16x8_t v597 = vaddq_s16(v589, v570);
+    int16x8_t v598 = vaddq_s16(v573, v578);
+    int16x8_t v599 = vaddq_s16(v597, v598);
+    int16x8_t v600 = vaddq_s16(v596, v599);
+    int16x8_t v601 = vaddq_s16(v598, v595);
+    int16x8_t v602_tmp = vqrdmulhq_n_s16(v601, 13573);
+    int16x8_t v602 = vaddq_s16(v602_tmp, v601);
+    int16x8_t v603 = vaddq_s16(v508, v398);
+    int16x8_t v604 = vaddq_s16(v399, v468);
+    int16x8_t v605 = vaddq_s16(v603, v604);
+    int16x8_t v606 = vaddq_s16(v605, v586);
+    int16x8_t v607 = vaddq_s16(v606, v597);
+    int16x8_t v608 = vaddq_s16(v607, v601);
+    int16x8_t v609 = vaddq_s16(v602, v608);
+    int16x8_t v610 = vqrdmulhq_n_s16(v609, 17734);
+    int16x8_t v611 = vaddq_s16(v600, v610);
+    int16x8_t v612 = vqrdmulhq_n_s16(v611, 16705);
+    int16x8_t v613 = vaddq_s16(v594, v612);
+    int16x8_t v614 = vqrdmulhq_n_s16(v613, 16463);
+    int16x8_t v615 = vaddq_s16(v563, v614);
+    int16x8_t v616 = vaddq_s16(v565, v523);
+    int16x8_t v617_tmp = vqrdmulhq_n_s16(v616, 13573);
+    int16x8_t v617 = vaddq_s16(v617_tmp, v616);
+    int16x8_t v618 = vaddq_s16(v569, v525);
+    int16x8_t v619 = vaddq_s16(v526, v571);
+    int16x8_t v620 = vaddq_s16(v618, v619);
+    int16x8_t v621 = vaddq_s16(v617, v620);
+    int16x8_t v622 = vaddq_s16(v577, v529);
+    int16x8_t v623 = vaddq_s16(v530, v579);
+    int16x8_t v624 = vaddq_s16(v622, v623);
+    int16x8_t v625_tmp = vqrdmulhq_n_s16(v624, 13573);
+    int16x8_t v625 = vaddq_s16(v625_tmp, v624);
+    int16x8_t v626 = vaddq_s16(v585, v533);
+    int16x8_t v627 = vaddq_s16(v534, v587);
+    int16x8_t v628 = vaddq_s16(v626, v627);
+    int16x8_t v629 = vaddq_s16(v628, v624);
+    int16x8_t v630 = vaddq_s16(v625, v629);
+    int16x8_t v631 = vqrdmulhq_n_s16(v630, 17734);
+    int16x8_t v632 = vaddq_s16(v621, v631);
+    int16x8_t v633 = vaddq_s16(v580, v540);
+    int16x8_t v634 = vaddq_s16(v541, v564);
+    int16x8_t v635 = vaddq_s16(v633, v634);
+    int16x8_t v636_tmp = vqrdmulhq_n_s16(v635, 13573);
+    int16x8_t v636 = vaddq_s16(v636_tmp, v635);
+    int16x8_t v637 = vaddq_s16(v588, v544);
+    int16x8_t v638 = vaddq_s16(v545, v568);
+    int16x8_t v639 = vaddq_s16(v637, v638);
+    int16x8_t v640 = vaddq_s16(v572, v547);
+    int16x8_t v641 = vaddq_s16(v548, v576);
+    int16x8_t v642 = vaddq_s16(v640, v641);
+    int16x8_t v643 = vaddq_s16(v639, v642);
+    int16x8_t v644 = vaddq_s16(v636, v643);
+    int16x8_t v645 = vaddq_s16(v642, v635);
+    int16x8_t v646_tmp = vqrdmulhq_n_s16(v645, 13573);
+    int16x8_t v646 = vaddq_s16(v646_tmp, v645);
+    int16x8_t v647 = vaddq_s16(v604, v554);
+    int16x8_t v648 = vaddq_s16(v555, v584);
+    int16x8_t v649 = vaddq_s16(v647, v648);
+    int16x8_t v650 = vaddq_s16(v649, v639);
+    int16x8_t v651 = vaddq_s16(v650, v645);
+    int16x8_t v652 = vaddq_s16(v646, v651);
+    int16x8_t v653 = vqrdmulhq_n_s16(v652, 17734);
+    int16x8_t v654 = vaddq_s16(v644, v653);
+    int16x8_t v655 = vqrdmulhq_n_s16(v654, 16705);
+    int16x8_t v656 = vaddq_s16(v632, v655);
+    int16x8_t v657 = vaddq_s16(v634, v616);
+    int16x8_t v658_tmp = vqrdmulhq_n_s16(v657, 13573);
+    int16x8_t v658 = vaddq_s16(v658_tmp, v657);
+    int16x8_t v659 = vaddq_s16(v638, v618);
+    int16x8_t v660 = vaddq_s16(v619, v640);
+    int16x8_t v661 = vaddq_s16(v659, v660);
+    int16x8_t v662 = vaddq_s16(v658, v661);
+    int16x8_t v663 = vaddq_s16(v641, v622);
+    int16x8_t v664 = vaddq_s16(v623, v633);
+    int16x8_t v665 = vaddq_s16(v663, v664);
+    int16x8_t v666_tmp = vqrdmulhq_n_s16(v665, 13573);
+    int16x8_t v666 = vaddq_s16(v666_tmp, v665);
+    int16x8_t v667 = vaddq_s16(v648, v626);
+    int16x8_t v668 = vaddq_s16(v627, v637);
+    int16x8_t v669 = vaddq_s16(v667, v668);
+    int16x8_t v670 = vaddq_s16(v669, v665);
+    int16x8_t v671 = vaddq_s16(v666, v670);
+    int16x8_t v672 = vqrdmulhq_n_s16(v671, 17734);
+    int16x8_t v673 = vaddq_s16(v662, v672);
+    int16x8_t v674 = vaddq_s16(v664, v657);
+    int16x8_t v675_tmp = vqrdmulhq_n_s16(v674, 13573);
+    int16x8_t v675 = vaddq_s16(v675_tmp, v674);
+    int16x8_t v676 = vaddq_s16(v668, v659);
+    int16x8_t v677 = vaddq_s16(v660, v663);
+    int16x8_t v678 = vaddq_s16(v676, v677);
+    int16x8_t v679 = vaddq_s16(v675, v678);
+    int16x8_t v680 = vaddq_s16(v677, v674);
+    int16x8_t v681_tmp = vqrdmulhq_n_s16(v680, 13573);
+    int16x8_t v681 = vaddq_s16(v681_tmp, v680);
+    int16x8_t v682 = vld1q_s16(in + in_stride * 127 + i);
+    int16x8_t v683 = vaddq_s16(v682, v507);
+    int16x8_t v684 = vaddq_s16(v683, v603);
+    int16x8_t v685 = vaddq_s16(v684, v647);
+    int16x8_t v686 = vaddq_s16(v685, v667);
+    int16x8_t v687 = vaddq_s16(v686, v676);
+    int16x8_t v688 = vaddq_s16(v687, v680);
+    int16x8_t v689 = vaddq_s16(v681, v688);
+    int16x8_t v690 = vqrdmulhq_n_s16(v689, 17734);
+    int16x8_t v691 = vaddq_s16(v679, v690);
+    int16x8_t v692 = vqrdmulhq_n_s16(v691, 16705);
+    int16x8_t v693 = vaddq_s16(v673, v692);
+    int16x8_t v694 = vqrdmulhq_n_s16(v693, 16463);
+    int16x8_t v695 = vaddq_s16(v656, v694);
+    int16x8_t v696 = vqrdmulhq_n_s16(v695, 16404);
+    int16x8_t v697 = vaddq_s16(v615, v696);
+    int16x8_t v698 = vqrdmulhq_n_s16(v697, 16389);
+    int16x8_t v699 = vaddq_s16(v522, v698);
+    int16x8_t v700 = vqrdmulhq_n_s16(v699, 16385);
+    int16x8_t v701 = vaddq_s16(v317, v700);
+    int16x8_t v702 = vsubq_s16(v0, v1);
+    int16x8_t v703 = vsubq_s16(v4, v6);
+    int16x8_t v704_tmp = vqrdmulhq_n_s16(v703, 10045);
+    int16x8_t v704 = vaddq_s16(v704_tmp, v703);
+    int16x8_t v705 = vaddq_s16(v702, v704);
+    int16x8_t v706 = vsubq_s16(v11, v14);
+    int16x8_t v707 = vsubq_s16(v17, v20);
+    int16x8_t v708_tmp = vqrdmulhq_n_s16(v707, 10045);
+    int16x8_t v708 = vaddq_s16(v708_tmp, v707);
+    int16x8_t v709 = vaddq_s16(v706, v708);
+    int16x8_t v710 = vqrdmulhq_n_s16(v709, 19705);
+    int16x8_t v711 = vaddq_s16(v705, v710);
+    int16x8_t v712 = vsubq_s16(v27, v30);
+    int16x8_t v713 = vsubq_s16(v35, v39);
+    int16x8_t v714_tmp = vqrdmulhq_n_s16(v713, 10045);
+    int16x8_t v714 = vaddq_s16(v714_tmp, v713);
+    int16x8_t v715 = vaddq_s16(v712, v714);
+    int16x8_t v716 = vsubq_s16(v44, v47);
+    int16x8_t v717 = vsubq_s16(v50, v54);
+    int16x8_t v718_tmp = vqrdmulhq_n_s16(v717, 10045);
+    int16x8_t v718 = vaddq_s16(v718_tmp, v717);
+    int16x8_t v719 = vaddq_s16(v716, v718);
+    int16x8_t v720 = vqrdmulhq_n_s16(v719, 19705);
+    int16x8_t v721 = vaddq_s16(v715, v720);
+    int16x8_t v722 = vqrdmulhq_n_s16(v721, 17121);
+    int16x8_t v723 = vaddq_s16(v711, v722);
+    int16x8_t v724 = vsubq_s16(v63, v66);
+    int16x8_t v725 = vsubq_s16(v71, v75);
+    int16x8_t v726_tmp = vqrdmulhq_n_s16(v725, 10045);
+    int16x8_t v726 = vaddq_s16(v726_tmp, v725);
+    int16x8_t v727 = vaddq_s16(v724, v726);
+    int16x8_t v728 = vsubq_s16(v82, v89);
+    int16x8_t v729 = vsubq_s16(v92, v97);
+    int16x8_t v730_tmp = vqrdmulhq_n_s16(v729, 10045);
+    int16x8_t v730 = vaddq_s16(v730_tmp, v729);
+    int16x8_t v731 = vaddq_s16(v728, v730);
+    int16x8_t v732 = vqrdmulhq_n_s16(v731, 19705);
+    int16x8_t v733 = vaddq_s16(v727, v732);
+    int16x8_t v734 = vsubq_s16(v104, v107);
+    int16x8_t v735 = vsubq_s16(v112, v116);
+    int16x8_t v736_tmp = vqrdmulhq_n_s16(v735, 10045);
+    int16x8_t v736 = vaddq_s16(v736_tmp, v735);
+    int16x8_t v737 = vaddq_s16(v734, v736);
+    int16x8_t v738 = vsubq_s16(v121, v124);
+    int16x8_t v739 = vsubq_s16(v127, v132);
+    int16x8_t v740_tmp = vqrdmulhq_n_s16(v739, 10045);
+    int16x8_t v740 = vaddq_s16(v740_tmp, v739);
+    int16x8_t v741 = vaddq_s16(v738, v740);
+    int16x8_t v742 = vqrdmulhq_n_s16(v741, 19705);
+    int16x8_t v743 = vaddq_s16(v737, v742);
+    int16x8_t v744 = vqrdmulhq_n_s16(v743, 17121);
+    int16x8_t v745 = vaddq_s16(v733, v744);
+    int16x8_t v746 = vqrdmulhq_n_s16(v745, 16563);
+    int16x8_t v747 = vaddq_s16(v723, v746);
+    int16x8_t v748 = vsubq_s16(v143, v146);
+    int16x8_t v749 = vsubq_s16(v151, v155);
+    int16x8_t v750_tmp = vqrdmulhq_n_s16(v749, 10045);
+    int16x8_t v750 = vaddq_s16(v750_tmp, v749);
+    int16x8_t v751 = vaddq_s16(v748, v750);
+    int16x8_t v752 = vsubq_s16(v162, v169);
+    int16x8_t v753 = vqrdmulhq_n_s16(v752, 19705);
+    int16x8_t v754 = vsubq_s16(v172, v177);
+    int16x8_t v755 = vqrdmulhq_n_s16(v754, 25746);
+    int16x8_t v756 = vaddq_s16(v753, v755);
+    int16x8_t v757 = vaddq_s16(v751, v756);
+    int16x8_t v758 = vsubq_s16(v186, v193);
+    int16x8_t v759 = vsubq_s16(v202, v210);
+    int16x8_t v760_tmp = vqrdmulhq_n_s16(v759, 10045);
+    int16x8_t v760 = vaddq_s16(v760_tmp, v759);
+    int16x8_t v761 = vaddq_s16(v758, v760);
+    int16x8_t v762 = vsubq_s16(v215, v218);
+    int16x8_t v763 = vsubq_s16(v221, v227);
+    int16x8_t v764_tmp = vqrdmulhq_n_s16(v763, 10045);
+    int16x8_t v764 = vaddq_s16(v764_tmp, v763);
+    int16x8_t v765 = vaddq_s16(v762, v764);
+    int16x8_t v766 = vqrdmulhq_n_s16(v765, 19705);
+    int16x8_t v767 = vaddq_s16(v761, v766);
+    int16x8_t v768 = vqrdmulhq_n_s16(v767, 17121);
+    int16x8_t v769 = vaddq_s16(v757, v768);
+    int16x8_t v770 = vsubq_s16(v236, v239);
+    int16x8_t v771 = vsubq_s16(v244, v248);
+    int16x8_t v772_tmp = vqrdmulhq_n_s16(v771, 10045);
+    int16x8_t v772 = vaddq_s16(v772_tmp, v771);
+    int16x8_t v773 = vaddq_s16(v770, v772);
+    int16x8_t v774 = vsubq_s16(v255, v262);
+    int16x8_t v775 = vsubq_s16(v265, v270);
+    int16x8_t v776_tmp = vqrdmulhq_n_s16(v775, 10045);
+    int16x8_t v776 = vaddq_s16(v776_tmp, v775);
+    int16x8_t v777 = vaddq_s16(v774, v776);
+    int16x8_t v778 = vqrdmulhq_n_s16(v777, 19705);
+    int16x8_t v779 = vaddq_s16(v773, v778);
+    int16x8_t v780 = vsubq_s16(v277, v280);
+    int16x8_t v781 = vsubq_s16(v285, v289);
+    int16x8_t v782_tmp = vqrdmulhq_n_s16(v781, 10045);
+    int16x8_t v782 = vaddq_s16(v782_tmp, v781);
+    int16x8_t v783 = vaddq_s16(v780, v782);
+    int16x8_t v784 = vsubq_s16(v294, v297);
+    int16x8_t v785 = vsubq_s16(v300, v306);
+    int16x8_t v786_tmp = vqrdmulhq_n_s16(v785, 10045);
+    int16x8_t v786 = vaddq_s16(v786_tmp, v785);
+    int16x8_t v787 = vaddq_s16(v784, v786);
+    int16x8_t v788 = vqrdmulhq_n_s16(v787, 19705);
+    int16x8_t v789 = vaddq_s16(v783, v788);
+    int16x8_t v790 = vqrdmulhq_n_s16(v789, 17121);
+    int16x8_t v791 = vaddq_s16(v779, v790);
+    int16x8_t v792 = vqrdmulhq_n_s16(v791, 16563);
+    int16x8_t v793 = vaddq_s16(v769, v792);
+    int16x8_t v794 = vqrdmulhq_n_s16(v793, 16429);
+    int16x8_t v795 = vaddq_s16(v747, v794);
+    int16x8_t v796 = vsubq_s16(v319, v322);
+    int16x8_t v797 = vsubq_s16(v327, v331);
+    int16x8_t v798_tmp = vqrdmulhq_n_s16(v797, 10045);
+    int16x8_t v798 = vaddq_s16(v798_tmp, v797);
+    int16x8_t v799 = vaddq_s16(v796, v798);
+    int16x8_t v800 = vsubq_s16(v338, v345);
+    int16x8_t v801 = vsubq_s16(v348, v353);
+    int16x8_t v802_tmp = vqrdmulhq_n_s16(v801, 10045);
+    int16x8_t v802 = vaddq_s16(v802_tmp, v801);
+    int16x8_t v803 = vaddq_s16(v800, v802);
+    int16x8_t v804 = vqrdmulhq_n_s16(v803, 19705);
+    int16x8_t v805 = vaddq_s16(v799, v804);
+    int16x8_t v806 = vsubq_s16(v362, v369);
+    int16x8_t v807 = vsubq_s16(v378, v386);
+    int16x8_t v808_tmp = vqrdmulhq_n_s16(v807, 10045);
+    int16x8_t v808 = vaddq_s16(v808_tmp, v807);
+    int16x8_t v809 = vaddq_s16(v806, v808);
+    int16x8_t v810 = vsubq_s16(v391, v394);
+    int16x8_t v811 = vsubq_s16(v397, v403);
+    int16x8_t v812_tmp = vqrdmulhq_n_s16(v811, 10045);
+    int16x8_t v812 = vaddq_s16(v812_tmp, v811);
+    int16x8_t v813 = vaddq_s16(v810, v812);
+    int16x8_t v814 = vqrdmulhq_n_s16(v813, 19705);
+    int16x8_t v815 = vaddq_s16(v809, v814);
+    int16x8_t v816 = vqrdmulhq_n_s16(v815, 17121);
+    int16x8_t v817 = vaddq_s16(v805, v816);
+    int16x8_t v818 = vsubq_s16(v414, v421);
+    int16x8_t v819 = vsubq_s16(v430, v438);
+    int16x8_t v820_tmp = vqrdmulhq_n_s16(v819, 10045);
+    int16x8_t v820 = vaddq_s16(v820_tmp, v819);
+    int16x8_t v821 = vaddq_s16(v818, v820);
+    int16x8_t v822 = vsubq_s16(v449, v464);
+    int16x8_t v823 = vsubq_s16(v467, v476);
+    int16x8_t v824_tmp = vqrdmulhq_n_s16(v823, 10045);
+    int16x8_t v824 = vaddq_s16(v824_tmp, v823);
+    int16x8_t v825 = vaddq_s16(v822, v824);
+    int16x8_t v826 = vqrdmulhq_n_s16(v825, 19705);
+    int16x8_t v827 = vaddq_s16(v821, v826);
+    int16x8_t v828 = vsubq_s16(v483, v486);
+    int16x8_t v829 = vsubq_s16(v491, v495);
+    int16x8_t v830_tmp = vqrdmulhq_n_s16(v829, 10045);
+    int16x8_t v830 = vaddq_s16(v830_tmp, v829);
+    int16x8_t v831 = vaddq_s16(v828, v830);
+    int16x8_t v832 = vsubq_s16(v500, v503);
+    int16x8_t v833 = vsubq_s16(v506, v513);
+    int16x8_t v834_tmp = vqrdmulhq_n_s16(v833, 10045);
+    int16x8_t v834 = vaddq_s16(v834_tmp, v833);
+    int16x8_t v835 = vaddq_s16(v832, v834);
+    int16x8_t v836 = vqrdmulhq_n_s16(v835, 19705);
+    int16x8_t v837 = vaddq_s16(v831, v836);
+    int16x8_t v838 = vqrdmulhq_n_s16(v837, 17121);
+    int16x8_t v839 = vaddq_s16(v827, v838);
+    int16x8_t v840 = vqrdmulhq_n_s16(v839, 16563);
+    int16x8_t v841 = vaddq_s16(v817, v840);
+    int16x8_t v842 = vsubq_s16(v524, v527);
+    int16x8_t v843 = vsubq_s16(v532, v536);
+    int16x8_t v844_tmp = vqrdmulhq_n_s16(v843, 10045);
+    int16x8_t v844 = vaddq_s16(v844_tmp, v843);
+    int16x8_t v845 = vaddq_s16(v842, v844);
+    int16x8_t v846 = vsubq_s16(v543, v550);
+    int16x8_t v847 = vsubq_s16(v553, v558);
+    int16x8_t v848_tmp = vqrdmulhq_n_s16(v847, 10045);
+    int16x8_t v848 = vaddq_s16(v848_tmp, v847);
+    int16x8_t v849 = vaddq_s16(v846, v848);
+    int16x8_t v850 = vqrdmulhq_n_s16(v849, 19705);
+    int16x8_t v851 = vaddq_s16(v845, v850);
+    int16x8_t v852 = vsubq_s16(v567, v574);
+    int16x8_t v853 = vsubq_s16(v583, v591);
+    int16x8_t v854_tmp = vqrdmulhq_n_s16(v853, 10045);
+    int16x8_t v854 = vaddq_s16(v854_tmp, v853);
+    int16x8_t v855 = vaddq_s16(v852, v854);
+    int16x8_t v856 = vsubq_s16(v596, v599);
+    int16x8_t v857 = vsubq_s16(v602, v608);
+    int16x8_t v858_tmp = vqrdmulhq_n_s16(v857, 10045);
+    int16x8_t v858 = vaddq_s16(v858_tmp, v857);
+    int16x8_t v859 = vaddq_s16(v856, v858);
+    int16x8_t v860 = vqrdmulhq_n_s16(v859, 19705);
+    int16x8_t v861 = vaddq_s16(v855, v860);
+    int16x8_t v862 = vqrdmulhq_n_s16(v861, 17121);
+    int16x8_t v863 = vaddq_s16(v851, v862);
+    int16x8_t v864 = vsubq_s16(v617, v620);
+    int16x8_t v865 = vsubq_s16(v625, v629);
+    int16x8_t v866_tmp = vqrdmulhq_n_s16(v865, 10045);
+    int16x8_t v866 = vaddq_s16(v866_tmp, v865);
+    int16x8_t v867 = vaddq_s16(v864, v866);
+    int16x8_t v868 = vsubq_s16(v636, v643);
+    int16x8_t v869 = vsubq_s16(v646, v651);
+    int16x8_t v870_tmp = vqrdmulhq_n_s16(v869, 10045);
+    int16x8_t v870 = vaddq_s16(v870_tmp, v869);
+    int16x8_t v871 = vaddq_s16(v868, v870);
+    int16x8_t v872 = vqrdmulhq_n_s16(v871, 19705);
+    int16x8_t v873 = vaddq_s16(v867, v872);
+    int16x8_t v874 = vsubq_s16(v658, v661);
+    int16x8_t v875 = vsubq_s16(v666, v670);
+    int16x8_t v876_tmp = vqrdmulhq_n_s16(v875, 10045);
+    int16x8_t v876 = vaddq_s16(v876_tmp, v875);
+    int16x8_t v877 = vaddq_s16(v874, v876);
+    int16x8_t v878 = vsubq_s16(v675, v678);
+    int16x8_t v879 = vsubq_s16(v681, v688);
+    int16x8_t v880_tmp = vqrdmulhq_n_s16(v879, 10045);
+    int16x8_t v880 = vaddq_s16(v880_tmp, v879);
+    int16x8_t v881 = vaddq_s16(v878, v880);
+    int16x8_t v882 = vqrdmulhq_n_s16(v881, 19705);
+    int16x8_t v883 = vaddq_s16(v877, v882);
+    int16x8_t v884 = vqrdmulhq_n_s16(v883, 17121);
+    int16x8_t v885 = vaddq_s16(v873, v884);
+    int16x8_t v886 = vqrdmulhq_n_s16(v885, 16563);
+    int16x8_t v887 = vaddq_s16(v863, v886);
+    int16x8_t v888 = vqrdmulhq_n_s16(v887, 16429);
+    int16x8_t v889 = vaddq_s16(v841, v888);
+    int16x8_t v890 = vqrdmulhq_n_s16(v889, 16395);
+    int16x8_t v891 = vaddq_s16(v795, v890);
+    int16x8_t v892 = vsubq_s16(v702, v704);
+    int16x8_t v893 = vsubq_s16(v706, v708);
+    int16x8_t v894 = vqrdmulhq_n_s16(v893, 29490);
+    int16x8_t v895 = vaddq_s16(v892, v894);
+    int16x8_t v896 = vsubq_s16(v712, v714);
+    int16x8_t v897 = vsubq_s16(v716, v718);
+    int16x8_t v898 = vqrdmulhq_n_s16(v897, 29490);
+    int16x8_t v899 = vaddq_s16(v896, v898);
+    int16x8_t v900 = vqrdmulhq_n_s16(v899, 18578);
+    int16x8_t v901 = vaddq_s16(v895, v900);
+    int16x8_t v902 = vsubq_s16(v724, v726);
+    int16x8_t v903 = vsubq_s16(v728, v730);
+    int16x8_t v904 = vqrdmulhq_n_s16(v903, 29490);
+    int16x8_t v905 = vaddq_s16(v902, v904);
+    int16x8_t v906 = vsubq_s16(v734, v736);
+    int16x8_t v907 = vsubq_s16(v738, v740);
+    int16x8_t v908 = vqrdmulhq_n_s16(v907, 29490);
+    int16x8_t v909 = vaddq_s16(v906, v908);
+    int16x8_t v910 = vqrdmulhq_n_s16(v909, 18578);
+    int16x8_t v911 = vaddq_s16(v905, v910);
+    int16x8_t v912 = vqrdmulhq_n_s16(v911, 16890);
+    int16x8_t v913 = vaddq_s16(v901, v912);
+    int16x8_t v914 = vsubq_s16(v748, v750);
+    int16x8_t v915_tmp = vqrdmulhq_n_s16(v754, 10045);
+    int16x8_t v915 = vaddq_s16(v915_tmp, v754);
+    int16x8_t v916 = vsubq_s16(v752, v915);
+    int16x8_t v917 = vqrdmulhq_n_s16(v916, 29490);
+    int16x8_t v918 = vaddq_s16(v914, v917);
+    int16x8_t v919 = vsubq_s16(v758, v760);
+    int16x8_t v920 = vsubq_s16(v762, v764);
+    int16x8_t v921 = vqrdmulhq_n_s16(v920, 29490);
+    int16x8_t v922 = vaddq_s16(v919, v921);
+    int16x8_t v923 = vqrdmulhq_n_s16(v922, 18578);
+    int16x8_t v924 = vaddq_s16(v918, v923);
+    int16x8_t v925 = vsubq_s16(v770, v772);
+    int16x8_t v926 = vsubq_s16(v774, v776);
+    int16x8_t v927 = vqrdmulhq_n_s16(v926, 29490);
+    int16x8_t v928 = vaddq_s16(v925, v927);
+    int16x8_t v929 = vsubq_s16(v780, v782);
+    int16x8_t v930 = vsubq_s16(v784, v786);
+    int16x8_t v931 = vqrdmulhq_n_s16(v930, 29490);
+    int16x8_t v932 = vaddq_s16(v929, v931);
+    int16x8_t v933 = vqrdmulhq_n_s16(v932, 18578);
+    int16x8_t v934 = vaddq_s16(v928, v933);
+    int16x8_t v935 = vqrdmulhq_n_s16(v934, 16890);
+    int16x8_t v936 = vaddq_s16(v924, v935);
+    int16x8_t v937 = vqrdmulhq_n_s16(v936, 16508);
+    int16x8_t v938 = vaddq_s16(v913, v937);
+    int16x8_t v939 = vsubq_s16(v796, v798);
+    int16x8_t v940 = vsubq_s16(v800, v802);
+    int16x8_t v941 = vqrdmulhq_n_s16(v940, 29490);
+    int16x8_t v942 = vaddq_s16(v939, v941);
+    int16x8_t v943 = vsubq_s16(v806, v808);
+    int16x8_t v944 = vsubq_s16(v810, v812);
+    int16x8_t v945 = vqrdmulhq_n_s16(v944, 29490);
+    int16x8_t v946 = vaddq_s16(v943, v945);
+    int16x8_t v947 = vqrdmulhq_n_s16(v946, 18578);
+    int16x8_t v948 = vaddq_s16(v942, v947);
+    int16x8_t v949 = vsubq_s16(v818, v820);
+    int16x8_t v950 = vsubq_s16(v822, v824);
+    int16x8_t v951 = vqrdmulhq_n_s16(v950, 29490);
+    int16x8_t v952 = vaddq_s16(v949, v951);
+    int16x8_t v953 = vsubq_s16(v828, v830);
+    int16x8_t v954 = vsubq_s16(v832, v834);
+    int16x8_t v955 = vqrdmulhq_n_s16(v954, 29490);
+    int16x8_t v956 = vaddq_s16(v953, v955);
+    int16x8_t v957 = vqrdmulhq_n_s16(v956, 18578);
+    int16x8_t v958 = vaddq_s16(v952, v957);
+    int16x8_t v959 = vqrdmulhq_n_s16(v958, 16890);
+    int16x8_t v960 = vaddq_s16(v948, v959);
+    int16x8_t v961 = vsubq_s16(v842, v844);
+    int16x8_t v962 = vsubq_s16(v846, v848);
+    int16x8_t v963 = vqrdmulhq_n_s16(v962, 29490);
+    int16x8_t v964 = vaddq_s16(v961, v963);
+    int16x8_t v965 = vsubq_s16(v852, v854);
+    int16x8_t v966 = vsubq_s16(v856, v858);
+    int16x8_t v967 = vqrdmulhq_n_s16(v966, 29490);
+    int16x8_t v968 = vaddq_s16(v965, v967);
+    int16x8_t v969 = vqrdmulhq_n_s16(v968, 18578);
+    int16x8_t v970 = vaddq_s16(v964, v969);
+    int16x8_t v971 = vsubq_s16(v864, v866);
+    int16x8_t v972 = vsubq_s16(v868, v870);
+    int16x8_t v973 = vqrdmulhq_n_s16(v972, 29490);
+    int16x8_t v974 = vaddq_s16(v971, v973);
+    int16x8_t v975 = vsubq_s16(v874, v876);
+    int16x8_t v976 = vsubq_s16(v878, v880);
+    int16x8_t v977 = vqrdmulhq_n_s16(v976, 29490);
+    int16x8_t v978 = vaddq_s16(v975, v977);
+    int16x8_t v979 = vqrdmulhq_n_s16(v978, 18578);
+    int16x8_t v980 = vaddq_s16(v974, v979);
+    int16x8_t v981 = vqrdmulhq_n_s16(v980, 16890);
+    int16x8_t v982 = vaddq_s16(v970, v981);
+    int16x8_t v983 = vqrdmulhq_n_s16(v982, 16508);
+    int16x8_t v984 = vaddq_s16(v960, v983);
+    int16x8_t v985 = vqrdmulhq_n_s16(v984, 16415);
+    int16x8_t v986 = vaddq_s16(v938, v985);
+    int16x8_t v987 = vsubq_s16(v2, v8);
+    int16x8_t v988 = vsubq_s16(v15, v22);
+    int16x8_t v989_tmp = vqrdmulhq_n_s16(v988, 18446);
+    int16x8_t v989 = vmlaq_n_s16(v989_tmp, v988, 2);
+    int16x8_t v990 = vaddq_s16(v987, v989);
+    int16x8_t v991 = vsubq_s16(v31, v41);
+    int16x8_t v992 = vsubq_s16(v48, v56);
+    int16x8_t v993_tmp = vqrdmulhq_n_s16(v992, 18446);
+    int16x8_t v993 = vmlaq_n_s16(v993_tmp, v992, 2);
+    int16x8_t v994 = vaddq_s16(v991, v993);
+    int16x8_t v995 = vqrdmulhq_n_s16(v994, 21195);
+    int16x8_t v996 = vaddq_s16(v990, v995);
+    int16x8_t v997 = vsubq_s16(v67, v77);
+    int16x8_t v998 = vsubq_s16(v90, v99);
+    int16x8_t v999_tmp = vqrdmulhq_n_s16(v998, 18446);
+    int16x8_t v999 = vmlaq_n_s16(v999_tmp, v998, 2);
+    int16x8_t v1000 = vaddq_s16(v997, v999);
+    int16x8_t v1001 = vsubq_s16(v108, v118);
+    int16x8_t v1002 = vsubq_s16(v125, v134);
+    int16x8_t v1003_tmp = vqrdmulhq_n_s16(v1002, 18446);
+    int16x8_t v1003 = vmlaq_n_s16(v1003_tmp, v1002, 2);
+    int16x8_t v1004 = vaddq_s16(v1001, v1003);
+    int16x8_t v1005 = vqrdmulhq_n_s16(v1004, 21195);
+    int16x8_t v1006 = vaddq_s16(v1000, v1005);
+    int16x8_t v1007 = vqrdmulhq_n_s16(v1006, 17401);
+    int16x8_t v1008 = vaddq_s16(v996, v1007);
+    int16x8_t v1009 = vsubq_s16(v147, v157);
+    int16x8_t v1010 = vsubq_s16(v170, v179);
+    int16x8_t v1011_tmp = vqrdmulhq_n_s16(v1010, 18446);
+    int16x8_t v1011 = vmlaq_n_s16(v1011_tmp, v1010, 2);
+    int16x8_t v1012 = vaddq_s16(v1009, v1011);
+    int16x8_t v1013 = vsubq_s16(v194, v212);
+    int16x8_t v1014 = vsubq_s16(v219, v229);
+    int16x8_t v1015_tmp = vqrdmulhq_n_s16(v1014, 18446);
+    int16x8_t v1015 = vmlaq_n_s16(v1015_tmp, v1014, 2);
+    int16x8_t v1016 = vaddq_s16(v1013, v1015);
+    int16x8_t v1017 = vqrdmulhq_n_s16(v1016, 21195);
+    int16x8_t v1018 = vaddq_s16(v1012, v1017);
+    int16x8_t v1019 = vsubq_s16(v240, v250);
+    int16x8_t v1020 = vsubq_s16(v263, v272);
+    int16x8_t v1021_tmp = vqrdmulhq_n_s16(v1020, 18446);
+    int16x8_t v1021 = vmlaq_n_s16(v1021_tmp, v1020, 2);
+    int16x8_t v1022 = vaddq_s16(v1019, v1021);
+    int16x8_t v1023 = vsubq_s16(v281, v291);
+    int16x8_t v1024 = vsubq_s16(v298, v308);
+    int16x8_t v1025_tmp = vqrdmulhq_n_s16(v1024, 18446);
+    int16x8_t v1025 = vmlaq_n_s16(v1025_tmp, v1024, 2);
+    int16x8_t v1026 = vaddq_s16(v1023, v1025);
+    int16x8_t v1027 = vqrdmulhq_n_s16(v1026, 21195);
+    int16x8_t v1028 = vaddq_s16(v1022, v1027);
+    int16x8_t v1029 = vqrdmulhq_n_s16(v1028, 17401);
+    int16x8_t v1030 = vaddq_s16(v1018, v1029);
+    int16x8_t v1031 = vqrdmulhq_n_s16(v1030, 16629);
+    int16x8_t v1032 = vaddq_s16(v1008, v1031);
+    int16x8_t v1033 = vsubq_s16(v323, v333);
+    int16x8_t v1034 = vsubq_s16(v346, v355);
+    int16x8_t v1035_tmp = vqrdmulhq_n_s16(v1034, 18446);
+    int16x8_t v1035 = vmlaq_n_s16(v1035_tmp, v1034, 2);
+    int16x8_t v1036 = vaddq_s16(v1033, v1035);
+    int16x8_t v1037 = vsubq_s16(v370, v388);
+    int16x8_t v1038 = vsubq_s16(v395, v405);
+    int16x8_t v1039_tmp = vqrdmulhq_n_s16(v1038, 18446);
+    int16x8_t v1039 = vmlaq_n_s16(v1039_tmp, v1038, 2);
+    int16x8_t v1040 = vaddq_s16(v1037, v1039);
+    int16x8_t v1041 = vqrdmulhq_n_s16(v1040, 21195);
+    int16x8_t v1042 = vaddq_s16(v1036, v1041);
+    int16x8_t v1043 = vsubq_s16(v422, v440);
+    int16x8_t v1044 = vsubq_s16(v465, v478);
+    int16x8_t v1045_tmp = vqrdmulhq_n_s16(v1044, 18446);
+    int16x8_t v1045 = vmlaq_n_s16(v1045_tmp, v1044, 2);
+    int16x8_t v1046 = vaddq_s16(v1043, v1045);
+    int16x8_t v1047 = vsubq_s16(v487, v497);
+    int16x8_t v1048 = vsubq_s16(v504, v515);
+    int16x8_t v1049_tmp = vqrdmulhq_n_s16(v1048, 18446);
+    int16x8_t v1049 = vmlaq_n_s16(v1049_tmp, v1048, 2);
+    int16x8_t v1050 = vaddq_s16(v1047, v1049);
+    int16x8_t v1051 = vqrdmulhq_n_s16(v1050, 21195);
+    int16x8_t v1052 = vaddq_s16(v1046, v1051);
+    int16x8_t v1053 = vqrdmulhq_n_s16(v1052, 17401);
+    int16x8_t v1054 = vaddq_s16(v1042, v1053);
+    int16x8_t v1055 = vsubq_s16(v528, v538);
+    int16x8_t v1056 = vsubq_s16(v551, v560);
+    int16x8_t v1057_tmp = vqrdmulhq_n_s16(v1056, 18446);
+    int16x8_t v1057 = vmlaq_n_s16(v1057_tmp, v1056, 2);
+    int16x8_t v1058 = vaddq_s16(v1055, v1057);
+    int16x8_t v1059 = vsubq_s16(v575, v593);
+    int16x8_t v1060 = vsubq_s16(v600, v610);
+    int16x8_t v1061_tmp = vqrdmulhq_n_s16(v1060, 18446);
+    int16x8_t v1061 = vmlaq_n_s16(v1061_tmp, v1060, 2);
+    int16x8_t v1062 = vaddq_s16(v1059, v1061);
+    int16x8_t v1063 = vqrdmulhq_n_s16(v1062, 21195);
+    int16x8_t v1064 = vaddq_s16(v1058, v1063);
+    int16x8_t v1065 = vsubq_s16(v621, v631);
+    int16x8_t v1066 = vsubq_s16(v644, v653);
+    int16x8_t v1067_tmp = vqrdmulhq_n_s16(v1066, 18446);
+    int16x8_t v1067 = vmlaq_n_s16(v1067_tmp, v1066, 2);
+    int16x8_t v1068 = vaddq_s16(v1065, v1067);
+    int16x8_t v1069 = vsubq_s16(v662, v672);
+    int16x8_t v1070 = vsubq_s16(v679, v690);
+    int16x8_t v1071_tmp = vqrdmulhq_n_s16(v1070, 18446);
+    int16x8_t v1071 = vmlaq_n_s16(v1071_tmp, v1070, 2);
+    int16x8_t v1072 = vaddq_s16(v1069, v1071);
+    int16x8_t v1073 = vqrdmulhq_n_s16(v1072, 21195);
+    int16x8_t v1074 = vaddq_s16(v1068, v1073);
+    int16x8_t v1075 = vqrdmulhq_n_s16(v1074, 17401);
+    int16x8_t v1076 = vaddq_s16(v1064, v1075);
+    int16x8_t v1077 = vqrdmulhq_n_s16(v1076, 16629);
+    int16x8_t v1078 = vaddq_s16(v1054, v1077);
+    int16x8_t v1079 = vqrdmulhq_n_s16(v1078, 16445);
+    int16x8_t v1080 = vaddq_s16(v1032, v1079);
+    int16x8_t v1081 = vsubq_s16(v987, v989);
+    int16x8_t v1082 = vsubq_s16(v991, v993);
+    int16x8_t v1083 = vqrdmulhq_n_s16(v1082, 25826);
+    int16x8_t v1084 = vaddq_s16(v1081, v1083);
+    int16x8_t v1085 = vsubq_s16(v997, v999);
+    int16x8_t v1086 = vsubq_s16(v1001, v1003);
+    int16x8_t v1087 = vqrdmulhq_n_s16(v1086, 25826);
+    int16x8_t v1088 = vaddq_s16(v1085, v1087);
+    int16x8_t v1089 = vqrdmulhq_n_s16(v1088, 18124);
+    int16x8_t v1090 = vaddq_s16(v1084, v1089);
+    int16x8_t v1091 = vsubq_s16(v1009, v1011);
+    int16x8_t v1092 = vsubq_s16(v1013, v1015);
+    int16x8_t v1093 = vqrdmulhq_n_s16(v1092, 25826);
+    int16x8_t v1094 = vaddq_s16(v1091, v1093);
+    int16x8_t v1095 = vsubq_s16(v1019, v1021);
+    int16x8_t v1096 = vsubq_s16(v1023, v1025);
+    int16x8_t v1097 = vqrdmulhq_n_s16(v1096, 25826);
+    int16x8_t v1098 = vaddq_s16(v1095, v1097);
+    int16x8_t v1099 = vqrdmulhq_n_s16(v1098, 18124);
+    int16x8_t v1100 = vaddq_s16(v1094, v1099);
+    int16x8_t v1101 = vqrdmulhq_n_s16(v1100, 16792);
+    int16x8_t v1102 = vaddq_s16(v1090, v1101);
+    int16x8_t v1103 = vsubq_s16(v1033, v1035);
+    int16x8_t v1104 = vsubq_s16(v1037, v1039);
+    int16x8_t v1105 = vqrdmulhq_n_s16(v1104, 25826);
+    int16x8_t v1106 = vaddq_s16(v1103, v1105);
+    int16x8_t v1107 = vsubq_s16(v1043, v1045);
+    int16x8_t v1108 = vsubq_s16(v1047, v1049);
+    int16x8_t v1109 = vqrdmulhq_n_s16(v1108, 25826);
+    int16x8_t v1110 = vaddq_s16(v1107, v1109);
+    int16x8_t v1111 = vqrdmulhq_n_s16(v1110, 18124);
+    int16x8_t v1112 = vaddq_s16(v1106, v1111);
+    int16x8_t v1113 = vsubq_s16(v1055, v1057);
+    int16x8_t v1114 = vsubq_s16(v1059, v1061);
+    int16x8_t v1115 = vqrdmulhq_n_s16(v1114, 25826);
+    int16x8_t v1116 = vaddq_s16(v1113, v1115);
+    int16x8_t v1117 = vsubq_s16(v1065, v1067);
+    int16x8_t v1118 = vsubq_s16(v1069, v1071);
+    int16x8_t v1119 = vqrdmulhq_n_s16(v1118, 25826);
+    int16x8_t v1120 = vaddq_s16(v1117, v1119);
+    int16x8_t v1121 = vqrdmulhq_n_s16(v1120, 18124);
+    int16x8_t v1122 = vaddq_s16(v1116, v1121);
+    int16x8_t v1123 = vqrdmulhq_n_s16(v1122, 16792);
+    int16x8_t v1124 = vaddq_s16(v1112, v1123);
+    int16x8_t v1125 = vqrdmulhq_n_s16(v1124, 16484);
+    int16x8_t v1126 = vaddq_s16(v1102, v1125);
+    int16x8_t v1127 = vsubq_s16(v892, v894);
+    int16x8_t v1128 = vsubq_s16(v896, v898);
+    int16x8_t v1129_tmp = vqrdmulhq_n_s16(v1128, 1988);
+    int16x8_t v1129 = vaddq_s16(v1129_tmp, v1128);
+    int16x8_t v1130 = vaddq_s16(v1127, v1129);
+    int16x8_t v1131 = vsubq_s16(v902, v904);
+    int16x8_t v1132 = vsubq_s16(v906, v908);
+    int16x8_t v1133_tmp = vqrdmulhq_n_s16(v1132, 1988);
+    int16x8_t v1133 = vaddq_s16(v1133_tmp, v1132);
+    int16x8_t v1134 = vaddq_s16(v1131, v1133);
+    int16x8_t v1135 = vqrdmulhq_n_s16(v1134, 19102);
+    int16x8_t v1136 = vaddq_s16(v1130, v1135);
+    int16x8_t v1137 = vsubq_s16(v914, v917);
+    int16x8_t v1138 = vsubq_s16(v919, v921);
+    int16x8_t v1139_tmp = vqrdmulhq_n_s16(v1138, 1988);
+    int16x8_t v1139 = vaddq_s16(v1139_tmp, v1138);
+    int16x8_t v1140 = vaddq_s16(v1137, v1139);
+    int16x8_t v1141 = vsubq_s16(v925, v927);
+    int16x8_t v1142 = vsubq_s16(v929, v931);
+    int16x8_t v1143_tmp = vqrdmulhq_n_s16(v1142, 1988);
+    int16x8_t v1143 = vaddq_s16(v1143_tmp, v1142);
+    int16x8_t v1144 = vaddq_s16(v1141, v1143);
+    int16x8_t v1145 = vqrdmulhq_n_s16(v1144, 19102);
+    int16x8_t v1146 = vaddq_s16(v1140, v1145);
+    int16x8_t v1147 = vqrdmulhq_n_s16(v1146, 17000);
+    int16x8_t v1148 = vaddq_s16(v1136, v1147);
+    int16x8_t v1149 = vsubq_s16(v939, v941);
+    int16x8_t v1150 = vsubq_s16(v943, v945);
+    int16x8_t v1151_tmp = vqrdmulhq_n_s16(v1150, 1988);
+    int16x8_t v1151 = vaddq_s16(v1151_tmp, v1150);
+    int16x8_t v1152 = vaddq_s16(v1149, v1151);
+    int16x8_t v1153 = vsubq_s16(v949, v951);
+    int16x8_t v1154 = vsubq_s16(v953, v955);
+    int16x8_t v1155_tmp = vqrdmulhq_n_s16(v1154, 1988);
+    int16x8_t v1155 = vaddq_s16(v1155_tmp, v1154);
+    int16x8_t v1156 = vaddq_s16(v1153, v1155);
+    int16x8_t v1157 = vqrdmulhq_n_s16(v1156, 19102);
+    int16x8_t v1158 = vaddq_s16(v1152, v1157);
+    int16x8_t v1159 = vsubq_s16(v961, v963);
+    int16x8_t v1160 = vsubq_s16(v965, v967);
+    int16x8_t v1161_tmp = vqrdmulhq_n_s16(v1160, 1988);
+    int16x8_t v1161 = vaddq_s16(v1161_tmp, v1160);
+    int16x8_t v1162 = vaddq_s16(v1159, v1161);
+    int16x8_t v1163 = vsubq_s16(v971, v973);
+    int16x8_t v1164 = vsubq_s16(v975, v977);
+    int16x8_t v1165_tmp = vqrdmulhq_n_s16(v1164, 1988);
+    int16x8_t v1165 = vaddq_s16(v1165_tmp, v1164);
+    int16x8_t v1166 = vaddq_s16(v1163, v1165);
+    int16x8_t v1167 = vqrdmulhq_n_s16(v1166, 19102);
+    int16x8_t v1168 = vaddq_s16(v1162, v1167);
+    int16x8_t v1169 = vqrdmulhq_n_s16(v1168, 17000);
+    int16x8_t v1170 = vaddq_s16(v1158, v1169);
+    int16x8_t v1171 = vqrdmulhq_n_s16(v1170, 16534);
+    int16x8_t v1172 = vaddq_s16(v1148, v1171);
+    int16x8_t v1173 = vsubq_s16(v705, v710);
+    int16x8_t v1174 = vsubq_s16(v715, v720);
+    int16x8_t v1175_tmp = vqrdmulhq_n_s16(v1174, 23673);
+    int16x8_t v1175 = vaddq_s16(v1175_tmp, v1174);
+    int16x8_t v1176 = vaddq_s16(v1173, v1175);
+    int16x8_t v1177 = vsubq_s16(v727, v732);
+    int16x8_t v1178 = vsubq_s16(v737, v742);
+    int16x8_t v1179_tmp = vqrdmulhq_n_s16(v1178, 23673);
+    int16x8_t v1179 = vaddq_s16(v1179_tmp, v1178);
+    int16x8_t v1180 = vaddq_s16(v1177, v1179);
+    int16x8_t v1181 = vqrdmulhq_n_s16(v1180, 20398);
+    int16x8_t v1182 = vaddq_s16(v1176, v1181);
+    int16x8_t v1183 = vsubq_s16(v751, v756);
+    int16x8_t v1184 = vsubq_s16(v761, v766);
+    int16x8_t v1185_tmp = vqrdmulhq_n_s16(v1184, 23673);
+    int16x8_t v1185 = vaddq_s16(v1185_tmp, v1184);
+    int16x8_t v1186 = vaddq_s16(v1183, v1185);
+    int16x8_t v1187 = vsubq_s16(v773, v778);
+    int16x8_t v1188 = vsubq_s16(v783, v788);
+    int16x8_t v1189_tmp = vqrdmulhq_n_s16(v1188, 23673);
+    int16x8_t v1189 = vaddq_s16(v1189_tmp, v1188);
+    int16x8_t v1190 = vaddq_s16(v1187, v1189);
+    int16x8_t v1191 = vqrdmulhq_n_s16(v1190, 20398);
+    int16x8_t v1192 = vaddq_s16(v1186, v1191);
+    int16x8_t v1193 = vqrdmulhq_n_s16(v1192, 17255);
+    int16x8_t v1194 = vaddq_s16(v1182, v1193);
+    int16x8_t v1195 = vsubq_s16(v799, v804);
+    int16x8_t v1196 = vsubq_s16(v809, v814);
+    int16x8_t v1197_tmp = vqrdmulhq_n_s16(v1196, 23673);
+    int16x8_t v1197 = vaddq_s16(v1197_tmp, v1196);
+    int16x8_t v1198 = vaddq_s16(v1195, v1197);
+    int16x8_t v1199 = vsubq_s16(v821, v826);
+    int16x8_t v1200 = vsubq_s16(v831, v836);
+    int16x8_t v1201_tmp = vqrdmulhq_n_s16(v1200, 23673);
+    int16x8_t v1201 = vaddq_s16(v1201_tmp, v1200);
+    int16x8_t v1202 = vaddq_s16(v1199, v1201);
+    int16x8_t v1203 = vqrdmulhq_n_s16(v1202, 20398);
+    int16x8_t v1204 = vaddq_s16(v1198, v1203);
+    int16x8_t v1205 = vsubq_s16(v845, v850);
+    int16x8_t v1206 = vsubq_s16(v855, v860);
+    int16x8_t v1207_tmp = vqrdmulhq_n_s16(v1206, 23673);
+    int16x8_t v1207 = vaddq_s16(v1207_tmp, v1206);
+    int16x8_t v1208 = vaddq_s16(v1205, v1207);
+    int16x8_t v1209 = vsubq_s16(v867, v872);
+    int16x8_t v1210 = vsubq_s16(v877, v882);
+    int16x8_t v1211_tmp = vqrdmulhq_n_s16(v1210, 23673);
+    int16x8_t v1211 = vaddq_s16(v1211_tmp, v1210);
+    int16x8_t v1212 = vaddq_s16(v1209, v1211);
+    int16x8_t v1213 = vqrdmulhq_n_s16(v1212, 20398);
+    int16x8_t v1214 = vaddq_s16(v1208, v1213);
+    int16x8_t v1215 = vqrdmulhq_n_s16(v1214, 17255);
+    int16x8_t v1216 = vaddq_s16(v1204, v1215);
+    int16x8_t v1217 = vqrdmulhq_n_s16(v1216, 16595);
+    int16x8_t v1218 = vaddq_s16(v1194, v1217);
+    int16x8_t v1219 = vsubq_s16(v9, v24);
+    int16x8_t v1220 = vsubq_s16(v42, v58);
+    int16x8_t v1221_tmp = vqrdmulhq_n_s16(v1220, 3314);
+    int16x8_t v1221 = vmlaq_n_s16(v1221_tmp, v1220, 5);
+    int16x8_t v1222 = vaddq_s16(v1219, v1221);
+    int16x8_t v1223 = vsubq_s16(v78, v101);
+    int16x8_t v1224 = vsubq_s16(v119, v136);
+    int16x8_t v1225_tmp = vqrdmulhq_n_s16(v1224, 3314);
+    int16x8_t v1225 = vmlaq_n_s16(v1225_tmp, v1224, 5);
+    int16x8_t v1226 = vaddq_s16(v1223, v1225);
+    int16x8_t v1227 = vqrdmulhq_n_s16(v1226, 22112);
+    int16x8_t v1228 = vaddq_s16(v1222, v1227);
+    int16x8_t v1229 = vsubq_s16(v158, v181);
+    int16x8_t v1230 = vsubq_s16(v213, v231);
+    int16x8_t v1231_tmp = vqrdmulhq_n_s16(v1230, 3314);
+    int16x8_t v1231 = vmlaq_n_s16(v1231_tmp, v1230, 5);
+    int16x8_t v1232 = vaddq_s16(v1229, v1231);
+    int16x8_t v1233 = vsubq_s16(v251, v274);
+    int16x8_t v1234 = vsubq_s16(v292, v310);
+    int16x8_t v1235_tmp = vqrdmulhq_n_s16(v1234, 3314);
+    int16x8_t v1235 = vmlaq_n_s16(v1235_tmp, v1234, 5);
+    int16x8_t v1236 = vaddq_s16(v1233, v1235);
+    int16x8_t v1237 = vqrdmulhq_n_s16(v1236, 22112);
+    int16x8_t v1238 = vaddq_s16(v1232, v1237);
+    int16x8_t v1239 = vqrdmulhq_n_s16(v1238, 17561);
+    int16x8_t v1240 = vaddq_s16(v1228, v1239);
+    int16x8_t v1241 = vsubq_s16(v334, v357);
+    int16x8_t v1242 = vsubq_s16(v389, v407);
+    int16x8_t v1243_tmp = vqrdmulhq_n_s16(v1242, 3314);
+    int16x8_t v1243 = vmlaq_n_s16(v1243_tmp, v1242, 5);
+    int16x8_t v1244 = vaddq_s16(v1241, v1243);
+    int16x8_t v1245 = vsubq_s16(v441, v480);
+    int16x8_t v1246 = vsubq_s16(v498, v517);
+    int16x8_t v1247_tmp = vqrdmulhq_n_s16(v1246, 3314);
+    int16x8_t v1247 = vmlaq_n_s16(v1247_tmp, v1246, 5);
+    int16x8_t v1248 = vaddq_s16(v1245, v1247);
+    int16x8_t v1249 = vqrdmulhq_n_s16(v1248, 22112);
+    int16x8_t v1250 = vaddq_s16(v1244, v1249);
+    int16x8_t v1251 = vsubq_s16(v539, v562);
+    int16x8_t v1252 = vsubq_s16(v594, v612);
+    int16x8_t v1253_tmp = vqrdmulhq_n_s16(v1252, 3314);
+    int16x8_t v1253 = vmlaq_n_s16(v1253_tmp, v1252, 5);
+    int16x8_t v1254 = vaddq_s16(v1251, v1253);
+    int16x8_t v1255 = vsubq_s16(v632, v655);
+    int16x8_t v1256 = vsubq_s16(v673, v692);
+    int16x8_t v1257_tmp = vqrdmulhq_n_s16(v1256, 3314);
+    int16x8_t v1257 = vmlaq_n_s16(v1257_tmp, v1256, 5);
+    int16x8_t v1258 = vaddq_s16(v1255, v1257);
+    int16x8_t v1259 = vqrdmulhq_n_s16(v1258, 22112);
+    int16x8_t v1260 = vaddq_s16(v1254, v1259);
+    int16x8_t v1261 = vqrdmulhq_n_s16(v1260, 17561);
+    int16x8_t v1262 = vaddq_s16(v1250, v1261);
+    int16x8_t v1263 = vqrdmulhq_n_s16(v1262, 16666);
+    int16x8_t v1264 = vaddq_s16(v1240, v1263);
+    int16x8_t v1265 = vsubq_s16(v1219, v1221);
+    int16x8_t v1266 = vsubq_s16(v1223, v1225);
+    int16x8_t v1267 = vqrdmulhq_n_s16(v1266, 24397);
+    int16x8_t v1268 = vaddq_s16(v1265, v1267);
+    int16x8_t v1269 = vsubq_s16(v1229, v1231);
+    int16x8_t v1270 = vsubq_s16(v1233, v1235);
+    int16x8_t v1271 = vqrdmulhq_n_s16(v1270, 24397);
+    int16x8_t v1272 = vaddq_s16(v1269, v1271);
+    int16x8_t v1273 = vqrdmulhq_n_s16(v1272, 17921);
+    int16x8_t v1274 = vaddq_s16(v1268, v1273);
+    int16x8_t v1275 = vsubq_s16(v1241, v1243);
+    int16x8_t v1276 = vsubq_s16(v1245, v1247);
+    int16x8_t v1277 = vqrdmulhq_n_s16(v1276, 24397);
+    int16x8_t v1278 = vaddq_s16(v1275, v1277);
+    int16x8_t v1279 = vsubq_s16(v1251, v1253);
+    int16x8_t v1280 = vsubq_s16(v1255, v1257);
+    int16x8_t v1281 = vqrdmulhq_n_s16(v1280, 24397);
+    int16x8_t v1282 = vaddq_s16(v1279, v1281);
+    int16x8_t v1283 = vqrdmulhq_n_s16(v1282, 17921);
+    int16x8_t v1284 = vaddq_s16(v1278, v1283);
+    int16x8_t v1285 = vqrdmulhq_n_s16(v1284, 16747);
+    int16x8_t v1286 = vaddq_s16(v1274, v1285);
+    int16x8_t v1287 = vsubq_s16(v1173, v1175);
+    int16x8_t v1288 = vsubq_s16(v1177, v1179);
+    int16x8_t v1289 = vqrdmulhq_n_s16(v1288, 27504);
+    int16x8_t v1290 = vaddq_s16(v1287, v1289);
+    int16x8_t v1291 = vsubq_s16(v1183, v1185);
+    int16x8_t v1292 = vsubq_s16(v1187, v1189);
+    int16x8_t v1293 = vqrdmulhq_n_s16(v1292, 27504);
+    int16x8_t v1294 = vaddq_s16(v1291, v1293);
+    int16x8_t v1295 = vqrdmulhq_n_s16(v1294, 18343);
+    int16x8_t v1296 = vaddq_s16(v1290, v1295);
+    int16x8_t v1297 = vsubq_s16(v1195, v1197);
+    int16x8_t v1298 = vsubq_s16(v1199, v1201);
+    int16x8_t v1299 = vqrdmulhq_n_s16(v1298, 27504);
+    int16x8_t v1300 = vaddq_s16(v1297, v1299);
+    int16x8_t v1301 = vsubq_s16(v1205, v1207);
+    int16x8_t v1302 = vsubq_s16(v1209, v1211);
+    int16x8_t v1303 = vqrdmulhq_n_s16(v1302, 27504);
+    int16x8_t v1304 = vaddq_s16(v1301, v1303);
+    int16x8_t v1305 = vqrdmulhq_n_s16(v1304, 18343);
+    int16x8_t v1306 = vaddq_s16(v1300, v1305);
+    int16x8_t v1307 = vqrdmulhq_n_s16(v1306, 16840);
+    int16x8_t v1308 = vaddq_s16(v1296, v1307);
+    int16x8_t v1309 = vsubq_s16(v1127, v1129);
+    int16x8_t v1310 = vsubq_s16(v1131, v1133);
+    int16x8_t v1311 = vqrdmulhq_n_s16(v1310, 31869);
+    int16x8_t v1312 = vaddq_s16(v1309, v1311);
+    int16x8_t v1313 = vsubq_s16(v1137, v1139);
+    int16x8_t v1314 = vsubq_s16(v1141, v1143);
+    int16x8_t v1315 = vqrdmulhq_n_s16(v1314, 31869);
+    int16x8_t v1316 = vaddq_s16(v1313, v1315);
+    int16x8_t v1317 = vqrdmulhq_n_s16(v1316, 18830);
+    int16x8_t v1318 = vaddq_s16(v1312, v1317);
+    int16x8_t v1319 = vsubq_s16(v1149, v1151);
+    int16x8_t v1320 = vsubq_s16(v1153, v1155);
+    int16x8_t v1321 = vqrdmulhq_n_s16(v1320, 31869);
+    int16x8_t v1322 = vaddq_s16(v1319, v1321);
+    int16x8_t v1323 = vsubq_s16(v1159, v1161);
+    int16x8_t v1324 = vsubq_s16(v1163, v1165);
+    int16x8_t v1325 = vqrdmulhq_n_s16(v1324, 31869);
+    int16x8_t v1326 = vaddq_s16(v1323, v1325);
+    int16x8_t v1327 = vqrdmulhq_n_s16(v1326, 18830);
+    int16x8_t v1328 = vaddq_s16(v1322, v1327);
+    int16x8_t v1329 = vqrdmulhq_n_s16(v1328, 16944);
+    int16x8_t v1330 = vaddq_s16(v1318, v1329);
+    int16x8_t v1331 = vsubq_s16(v1081, v1083);
+    int16x8_t v1332 = vsubq_s16(v1085, v1087);
+    int16x8_t v1333_tmp = vqrdmulhq_n_s16(v1332, 5552);
+    int16x8_t v1333 = vaddq_s16(v1333_tmp, v1332);
+    int16x8_t v1334 = vaddq_s16(v1331, v1333);
+    int16x8_t v1335 = vsubq_s16(v1091, v1093);
+    int16x8_t v1336 = vsubq_s16(v1095, v1097);
+    int16x8_t v1337_tmp = vqrdmulhq_n_s16(v1336, 5552);
+    int16x8_t v1337 = vaddq_s16(v1337_tmp, v1336);
+    int16x8_t v1338 = vaddq_s16(v1335, v1337);
+    int16x8_t v1339 = vqrdmulhq_n_s16(v1338, 19393);
+    int16x8_t v1340 = vaddq_s16(v1334, v1339);
+    int16x8_t v1341 = vsubq_s16(v1103, v1105);
+    int16x8_t v1342 = vsubq_s16(v1107, v1109);
+    int16x8_t v1343_tmp = vqrdmulhq_n_s16(v1342, 5552);
+    int16x8_t v1343 = vaddq_s16(v1343_tmp, v1342);
+    int16x8_t v1344 = vaddq_s16(v1341, v1343);
+    int16x8_t v1345 = vsubq_s16(v1113, v1115);
+    int16x8_t v1346 = vsubq_s16(v1117, v1119);
+    int16x8_t v1347_tmp = vqrdmulhq_n_s16(v1346, 5552);
+    int16x8_t v1347 = vaddq_s16(v1347_tmp, v1346);
+    int16x8_t v1348 = vaddq_s16(v1345, v1347);
+    int16x8_t v1349 = vqrdmulhq_n_s16(v1348, 19393);
+    int16x8_t v1350 = vaddq_s16(v1344, v1349);
+    int16x8_t v1351 = vqrdmulhq_n_s16(v1350, 17059);
+    int16x8_t v1352 = vaddq_s16(v1340, v1351);
+    int16x8_t v1353 = vsubq_s16(v990, v995);
+    int16x8_t v1354 = vsubq_s16(v1000, v1005);
+    int16x8_t v1355_tmp = vqrdmulhq_n_s16(v1354, 15865);
+    int16x8_t v1355 = vaddq_s16(v1355_tmp, v1354);
+    int16x8_t v1356 = vaddq_s16(v1353, v1355);
+    int16x8_t v1357 = vsubq_s16(v1012, v1017);
+    int16x8_t v1358 = vsubq_s16(v1022, v1027);
+    int16x8_t v1359_tmp = vqrdmulhq_n_s16(v1358, 15865);
+    int16x8_t v1359 = vaddq_s16(v1359_tmp, v1358);
+    int16x8_t v1360 = vaddq_s16(v1357, v1359);
+    int16x8_t v1361 = vqrdmulhq_n_s16(v1360, 20040);
+    int16x8_t v1362 = vaddq_s16(v1356, v1361);
+    int16x8_t v1363 = vsubq_s16(v1036, v1041);
+    int16x8_t v1364 = vsubq_s16(v1046, v1051);
+    int16x8_t v1365_tmp = vqrdmulhq_n_s16(v1364, 15865);
+    int16x8_t v1365 = vaddq_s16(v1365_tmp, v1364);
+    int16x8_t v1366 = vaddq_s16(v1363, v1365);
+    int16x8_t v1367 = vsubq_s16(v1058, v1063);
+    int16x8_t v1368 = vsubq_s16(v1068, v1073);
+    int16x8_t v1369_tmp = vqrdmulhq_n_s16(v1368, 15865);
+    int16x8_t v1369 = vaddq_s16(v1369_tmp, v1368);
+    int16x8_t v1370 = vaddq_s16(v1367, v1369);
+    int16x8_t v1371 = vqrdmulhq_n_s16(v1370, 20040);
+    int16x8_t v1372 = vaddq_s16(v1366, v1371);
+    int16x8_t v1373 = vqrdmulhq_n_s16(v1372, 17187);
+    int16x8_t v1374 = vaddq_s16(v1362, v1373);
+    int16x8_t v1375 = vsubq_s16(v895, v900);
+    int16x8_t v1376 = vsubq_s16(v905, v910);
+    int16x8_t v1377_tmp = vqrdmulhq_n_s16(v1376, 1893);
+    int16x8_t v1377 = vmlaq_n_s16(v1377_tmp, v1376, 2);
+    int16x8_t v1378 = vaddq_s16(v1375, v1377);
+    int16x8_t v1379 = vsubq_s16(v918, v923);
+    int16x8_t v1380 = vsubq_s16(v928, v933);
+    int16x8_t v1381_tmp = vqrdmulhq_n_s16(v1380, 1893);
+    int16x8_t v1381 = vmlaq_n_s16(v1381_tmp, v1380, 2);
+    int16x8_t v1382 = vaddq_s16(v1379, v1381);
+    int16x8_t v1383 = vqrdmulhq_n_s16(v1382, 20783);
+    int16x8_t v1384 = vaddq_s16(v1378, v1383);
+    int16x8_t v1385 = vsubq_s16(v942, v947);
+    int16x8_t v1386 = vsubq_s16(v952, v957);
+    int16x8_t v1387_tmp = vqrdmulhq_n_s16(v1386, 1893);
+    int16x8_t v1387 = vmlaq_n_s16(v1387_tmp, v1386, 2);
+    int16x8_t v1388 = vaddq_s16(v1385, v1387);
+    int16x8_t v1389 = vsubq_s16(v964, v969);
+    int16x8_t v1390 = vsubq_s16(v974, v979);
+    int16x8_t v1391_tmp = vqrdmulhq_n_s16(v1390, 1893);
+    int16x8_t v1391 = vmlaq_n_s16(v1391_tmp, v1390, 2);
+    int16x8_t v1392 = vaddq_s16(v1389, v1391);
+    int16x8_t v1393 = vqrdmulhq_n_s16(v1392, 20783);
+    int16x8_t v1394 = vaddq_s16(v1388, v1393);
+    int16x8_t v1395 = vqrdmulhq_n_s16(v1394, 17326);
+    int16x8_t v1396 = vaddq_s16(v1384, v1395);
+    int16x8_t v1397 = vsubq_s16(v711, v722);
+    int16x8_t v1398 = vsubq_s16(v733, v744);
+    int16x8_t v1399_tmp = vqrdmulhq_n_s16(v1398, 13357);
+    int16x8_t v1399 = vmlaq_n_s16(v1399_tmp, v1398, 3);
+    int16x8_t v1400 = vaddq_s16(v1397, v1399);
+    int16x8_t v1401 = vsubq_s16(v757, v768);
+    int16x8_t v1402 = vsubq_s16(v779, v790);
+    int16x8_t v1403_tmp = vqrdmulhq_n_s16(v1402, 13357);
+    int16x8_t v1403 = vmlaq_n_s16(v1403_tmp, v1402, 3);
+    int16x8_t v1404 = vaddq_s16(v1401, v1403);
+    int16x8_t v1405 = vqrdmulhq_n_s16(v1404, 21637);
+    int16x8_t v1406 = vaddq_s16(v1400, v1405);
+    int16x8_t v1407 = vsubq_s16(v805, v816);
+    int16x8_t v1408 = vsubq_s16(v827, v838);
+    int16x8_t v1409_tmp = vqrdmulhq_n_s16(v1408, 13357);
+    int16x8_t v1409 = vmlaq_n_s16(v1409_tmp, v1408, 3);
+    int16x8_t v1410 = vaddq_s16(v1407, v1409);
+    int16x8_t v1411 = vsubq_s16(v851, v862);
+    int16x8_t v1412 = vsubq_s16(v873, v884);
+    int16x8_t v1413_tmp = vqrdmulhq_n_s16(v1412, 13357);
+    int16x8_t v1413 = vmlaq_n_s16(v1413_tmp, v1412, 3);
+    int16x8_t v1414 = vaddq_s16(v1411, v1413);
+    int16x8_t v1415 = vqrdmulhq_n_s16(v1414, 21637);
+    int16x8_t v1416 = vaddq_s16(v1410, v1415);
+    int16x8_t v1417 = vqrdmulhq_n_s16(v1416, 17479);
+    int16x8_t v1418 = vaddq_s16(v1406, v1417);
+    int16x8_t v1419 = vsubq_s16(v25, v60);
+    int16x8_t v1420 = vsubq_s16(v102, v138);
+    int16x8_t v1421_tmp = vqrdmulhq_n_s16(v1420, 6226);
+    int16x8_t v1421 = vmlaq_n_s16(v1421_tmp, v1420, 10);
+    int16x8_t v1422 = vaddq_s16(v1419, v1421);
+    int16x8_t v1423 = vsubq_s16(v182, v233);
+    int16x8_t v1424 = vsubq_s16(v275, v312);
+    int16x8_t v1425_tmp = vqrdmulhq_n_s16(v1424, 6226);
+    int16x8_t v1425 = vmlaq_n_s16(v1425_tmp, v1424, 10);
+    int16x8_t v1426 = vaddq_s16(v1423, v1425);
+    int16x8_t v1427 = vqrdmulhq_n_s16(v1426, 22622);
+    int16x8_t v1428 = vaddq_s16(v1422, v1427);
+    int16x8_t v1429 = vsubq_s16(v358, v409);
+    int16x8_t v1430 = vsubq_s16(v481, v519);
+    int16x8_t v1431_tmp = vqrdmulhq_n_s16(v1430, 6226);
+    int16x8_t v1431 = vmlaq_n_s16(v1431_tmp, v1430, 10);
+    int16x8_t v1432 = vaddq_s16(v1429, v1431);
+    int16x8_t v1433 = vsubq_s16(v563, v614);
+    int16x8_t v1434 = vsubq_s16(v656, v694);
+    int16x8_t v1435_tmp = vqrdmulhq_n_s16(v1434, 6226);
+    int16x8_t v1435 = vmlaq_n_s16(v1435_tmp, v1434, 10);
+    int16x8_t v1436 = vaddq_s16(v1433, v1435);
+    int16x8_t v1437 = vqrdmulhq_n_s16(v1436, 22622);
+    int16x8_t v1438 = vaddq_s16(v1432, v1437);
+    int16x8_t v1439 = vqrdmulhq_n_s16(v1438, 17646);
+    int16x8_t v1440 = vaddq_s16(v1428, v1439);
+    int16x8_t v1441 = vsubq_s16(v1419, v1421);
+    int16x8_t v1442 = vsubq_s16(v1423, v1425);
+    int16x8_t v1443 = vqrdmulhq_n_s16(v1442, 23761);
+    int16x8_t v1444 = vaddq_s16(v1441, v1443);
+    int16x8_t v1445 = vsubq_s16(v1429, v1431);
+    int16x8_t v1446 = vsubq_s16(v1433, v1435);
+    int16x8_t v1447 = vqrdmulhq_n_s16(v1446, 23761);
+    int16x8_t v1448 = vaddq_s16(v1445, v1447);
+    int16x8_t v1449 = vqrdmulhq_n_s16(v1448, 17826);
+    int16x8_t v1450 = vaddq_s16(v1444, v1449);
+    int16x8_t v1451 = vsubq_s16(v1397, v1399);
+    int16x8_t v1452 = vsubq_s16(v1401, v1403);
+    int16x8_t v1453 = vqrdmulhq_n_s16(v1452, 25084);
+    int16x8_t v1454 = vaddq_s16(v1451, v1453);
+    int16x8_t v1455 = vsubq_s16(v1407, v1409);
+    int16x8_t v1456 = vsubq_s16(v1411, v1413);
+    int16x8_t v1457 = vqrdmulhq_n_s16(v1456, 25084);
+    int16x8_t v1458 = vaddq_s16(v1455, v1457);
+    int16x8_t v1459 = vqrdmulhq_n_s16(v1458, 18021);
+    int16x8_t v1460 = vaddq_s16(v1454, v1459);
+    int16x8_t v1461 = vsubq_s16(v1375, v1377);
+    int16x8_t v1462 = vsubq_s16(v1379, v1381);
+    int16x8_t v1463 = vqrdmulhq_n_s16(v1462, 26631);
+    int16x8_t v1464 = vaddq_s16(v1461, v1463);
+    int16x8_t v1465 = vsubq_s16(v1385, v1387);
+    int16x8_t v1466 = vsubq_s16(v1389, v1391);
+    int16x8_t v1467 = vqrdmulhq_n_s16(v1466, 26631);
+    int16x8_t v1468 = vaddq_s16(v1465, v1467);
+    int16x8_t v1469 = vqrdmulhq_n_s16(v1468, 18231);
+    int16x8_t v1470 = vaddq_s16(v1464, v1469);
+    int16x8_t v1471 = vsubq_s16(v1353, v1355);
+    int16x8_t v1472 = vsubq_s16(v1357, v1359);
+    int16x8_t v1473 = vqrdmulhq_n_s16(v1472, 28454);
+    int16x8_t v1474 = vaddq_s16(v1471, v1473);
+    int16x8_t v1475 = vsubq_s16(v1363, v1365);
+    int16x8_t v1476 = vsubq_s16(v1367, v1369);
+    int16x8_t v1477 = vqrdmulhq_n_s16(v1476, 28454);
+    int16x8_t v1478 = vaddq_s16(v1475, v1477);
+    int16x8_t v1479 = vqrdmulhq_n_s16(v1478, 18458);
+    int16x8_t v1480 = vaddq_s16(v1474, v1479);
+    int16x8_t v1481 = vsubq_s16(v1331, v1333);
+    int16x8_t v1482 = vsubq_s16(v1335, v1337);
+    int16x8_t v1483 = vqrdmulhq_n_s16(v1482, 30624);
+    int16x8_t v1484 = vaddq_s16(v1481, v1483);
+    int16x8_t v1485 = vsubq_s16(v1341, v1343);
+    int16x8_t v1486 = vsubq_s16(v1345, v1347);
+    int16x8_t v1487 = vqrdmulhq_n_s16(v1486, 30624);
+    int16x8_t v1488 = vaddq_s16(v1485, v1487);
+    int16x8_t v1489 = vqrdmulhq_n_s16(v1488, 18702);
+    int16x8_t v1490 = vaddq_s16(v1484, v1489);
+    int16x8_t v1491 = vsubq_s16(v1309, v1311);
+    int16x8_t v1492 = vsubq_s16(v1313, v1315);
+    int16x8_t v1493_tmp = vqrdmulhq_n_s16(v1492, 472);
+    int16x8_t v1493 = vaddq_s16(v1493_tmp, v1492);
+    int16x8_t v1494 = vaddq_s16(v1491, v1493);
+    int16x8_t v1495 = vsubq_s16(v1319, v1321);
+    int16x8_t v1496 = vsubq_s16(v1323, v1325);
+    int16x8_t v1497_tmp = vqrdmulhq_n_s16(v1496, 472);
+    int16x8_t v1497 = vaddq_s16(v1497_tmp, v1496);
+    int16x8_t v1498 = vaddq_s16(v1495, v1497);
+    int16x8_t v1499 = vqrdmulhq_n_s16(v1498, 18964);
+    int16x8_t v1500 = vaddq_s16(v1494, v1499);
+    int16x8_t v1501 = vsubq_s16(v1287, v1289);
+    int16x8_t v1502 = vsubq_s16(v1291, v1293);
+    int16x8_t v1503_tmp = vqrdmulhq_n_s16(v1502, 3672);
+    int16x8_t v1503 = vaddq_s16(v1503_tmp, v1502);
+    int16x8_t v1504 = vaddq_s16(v1501, v1503);
+    int16x8_t v1505 = vsubq_s16(v1297, v1299);
+    int16x8_t v1506 = vsubq_s16(v1301, v1303);
+    int16x8_t v1507_tmp = vqrdmulhq_n_s16(v1506, 3672);
+    int16x8_t v1507 = vaddq_s16(v1507_tmp, v1506);
+    int16x8_t v1508 = vaddq_s16(v1505, v1507);
+    int16x8_t v1509 = vqrdmulhq_n_s16(v1508, 19245);
+    int16x8_t v1510 = vaddq_s16(v1504, v1509);
+    int16x8_t v1511 = vsubq_s16(v1265, v1267);
+    int16x8_t v1512 = vsubq_s16(v1269, v1271);
+    int16x8_t v1513_tmp = vqrdmulhq_n_s16(v1512, 7662);
+    int16x8_t v1513 = vaddq_s16(v1513_tmp, v1512);
+    int16x8_t v1514 = vaddq_s16(v1511, v1513);
+    int16x8_t v1515 = vsubq_s16(v1275, v1277);
+    int16x8_t v1516 = vsubq_s16(v1279, v1281);
+    int16x8_t v1517_tmp = vqrdmulhq_n_s16(v1516, 7662);
+    int16x8_t v1517 = vaddq_s16(v1517_tmp, v1516);
+    int16x8_t v1518 = vaddq_s16(v1515, v1517);
+    int16x8_t v1519 = vqrdmulhq_n_s16(v1518, 19546);
+    int16x8_t v1520 = vaddq_s16(v1514, v1519);
+    int16x8_t v1521 = vsubq_s16(v1222, v1227);
+    int16x8_t v1522 = vsubq_s16(v1232, v1237);
+    int16x8_t v1523_tmp = vqrdmulhq_n_s16(v1522, 12756);
+    int16x8_t v1523 = vaddq_s16(v1523_tmp, v1522);
+    int16x8_t v1524 = vaddq_s16(v1521, v1523);
+    int16x8_t v1525 = vsubq_s16(v1244, v1249);
+    int16x8_t v1526 = vsubq_s16(v1254, v1259);
+    int16x8_t v1527_tmp = vqrdmulhq_n_s16(v1526, 12756);
+    int16x8_t v1527 = vaddq_s16(v1527_tmp, v1526);
+    int16x8_t v1528 = vaddq_s16(v1525, v1527);
+    int16x8_t v1529 = vqrdmulhq_n_s16(v1528, 19869);
+    int16x8_t v1530 = vaddq_s16(v1524, v1529);
+    int16x8_t v1531 = vsubq_s16(v1176, v1181);
+    int16x8_t v1532 = vsubq_s16(v1186, v1191);
+    int16x8_t v1533_tmp = vqrdmulhq_n_s16(v1532, 19463);
+    int16x8_t v1533 = vaddq_s16(v1533_tmp, v1532);
+    int16x8_t v1534 = vaddq_s16(v1531, v1533);
+    int16x8_t v1535 = vsubq_s16(v1198, v1203);
+    int16x8_t v1536 = vsubq_s16(v1208, v1213);
+    int16x8_t v1537_tmp = vqrdmulhq_n_s16(v1536, 19463);
+    int16x8_t v1537 = vaddq_s16(v1537_tmp, v1536);
+    int16x8_t v1538 = vaddq_s16(v1535, v1537);
+    int16x8_t v1539 = vqrdmulhq_n_s16(v1538, 20216);
+    int16x8_t v1540 = vaddq_s16(v1534, v1539);
+    int16x8_t v1541 = vsubq_s16(v1130, v1135);
+    int16x8_t v1542 = vsubq_s16(v1140, v1145);
+    int16x8_t v1543_tmp = vqrdmulhq_n_s16(v1542, 28661);
+    int16x8_t v1543 = vaddq_s16(v1543_tmp, v1542);
+    int16x8_t v1544 = vaddq_s16(v1541, v1543);
+    int16x8_t v1545 = vsubq_s16(v1152, v1157);
+    int16x8_t v1546 = vsubq_s16(v1162, v1167);
+    int16x8_t v1547_tmp = vqrdmulhq_n_s16(v1546, 28661);
+    int16x8_t v1547 = vaddq_s16(v1547_tmp, v1546);
+    int16x8_t v1548 = vaddq_s16(v1545, v1547);
+    int16x8_t v1549 = vqrdmulhq_n_s16(v1548, 20587);
+    int16x8_t v1550 = vaddq_s16(v1544, v1549);
+    int16x8_t v1551 = vsubq_s16(v1084, v1089);
+    int16x8_t v1552 = vsubq_s16(v1094, v1099);
+    int16x8_t v1553_tmp = vqrdmulhq_n_s16(v1552, 9242);
+    int16x8_t v1553 = vmlaq_n_s16(v1553_tmp, v1552, 2);
+    int16x8_t v1554 = vaddq_s16(v1551, v1553);
+    int16x8_t v1555 = vsubq_s16(v1106, v1111);
+    int16x8_t v1556 = vsubq_s16(v1116, v1121);
+    int16x8_t v1557_tmp = vqrdmulhq_n_s16(v1556, 9242);
+    int16x8_t v1557 = vmlaq_n_s16(v1557_tmp, v1556, 2);
+    int16x8_t v1558 = vaddq_s16(v1555, v1557);
+    int16x8_t v1559 = vqrdmulhq_n_s16(v1558, 20985);
+    int16x8_t v1560 = vaddq_s16(v1554, v1559);
+    int16x8_t v1561 = vsubq_s16(v996, v1007);
+    int16x8_t v1562 = vsubq_s16(v1018, v1029);
+    int16x8_t v1563_tmp = vqrdmulhq_n_s16(v1562, 30298);
+    int16x8_t v1563 = vmlaq_n_s16(v1563_tmp, v1562, 2);
+    int16x8_t v1564 = vaddq_s16(v1561, v1563);
+    int16x8_t v1565 = vsubq_s16(v1042, v1053);
+    int16x8_t v1566 = vsubq_s16(v1064, v1075);
+    int16x8_t v1567_tmp = vqrdmulhq_n_s16(v1566, 30298);
+    int16x8_t v1567 = vmlaq_n_s16(v1567_tmp, v1566, 2);
+    int16x8_t v1568 = vaddq_s16(v1565, v1567);
+    int16x8_t v1569 = vqrdmulhq_n_s16(v1568, 21412);
+    int16x8_t v1570 = vaddq_s16(v1564, v1569);
+    int16x8_t v1571 = vsubq_s16(v901, v912);
+    int16x8_t v1572 = vsubq_s16(v924, v935);
+    int16x8_t v1573_tmp = vqrdmulhq_n_s16(v1572, 2773);
+    int16x8_t v1573 = vmlaq_n_s16(v1573_tmp, v1572, 4);
+    int16x8_t v1574 = vaddq_s16(v1571, v1573);
+    int16x8_t v1575 = vsubq_s16(v948, v959);
+    int16x8_t v1576 = vsubq_s16(v970, v981);
+    int16x8_t v1577_tmp = vqrdmulhq_n_s16(v1576, 2773);
+    int16x8_t v1577 = vmlaq_n_s16(v1577_tmp, v1576, 4);
+    int16x8_t v1578 = vaddq_s16(v1575, v1577);
+    int16x8_t v1579 = vqrdmulhq_n_s16(v1578, 21871);
+    int16x8_t v1580 = vaddq_s16(v1574, v1579);
+    int16x8_t v1581 = vsubq_s16(v723, v746);
+    int16x8_t v1582 = vsubq_s16(v769, v792);
+    int16x8_t v1583_tmp = vqrdmulhq_n_s16(v1582, 26108);
+    int16x8_t v1583 = vmlaq_n_s16(v1583_tmp, v1582, 6);
+    int16x8_t v1584 = vaddq_s16(v1581, v1583);
+    int16x8_t v1585 = vsubq_s16(v817, v840);
+    int16x8_t v1586 = vsubq_s16(v863, v886);
+    int16x8_t v1587_tmp = vqrdmulhq_n_s16(v1586, 26108);
+    int16x8_t v1587 = vmlaq_n_s16(v1587_tmp, v1586, 6);
+    int16x8_t v1588 = vaddq_s16(v1585, v1587);
+    int16x8_t v1589 = vqrdmulhq_n_s16(v1588, 22363);
+    int16x8_t v1590 = vaddq_s16(v1584, v1589);
+    int16x8_t v1591 = vsubq_s16(v61, v140);
+    int16x8_t v1592 = vsubq_s16(v234, v314);
+    int16x8_t v1593_tmp = vqrdmulhq_n_s16(v1592, 12251);
+    int16x8_t v1593 = vmlaq_n_s16(v1593_tmp, v1592, 20);
+    int16x8_t v1594 = vaddq_s16(v1591, v1593);
+    int16x8_t v1595 = vsubq_s16(v410, v521);
+    int16x8_t v1596 = vsubq_s16(v615, v696);
+    int16x8_t v1597_tmp = vqrdmulhq_n_s16(v1596, 12251);
+    int16x8_t v1597 = vmlaq_n_s16(v1597_tmp, v1596, 20);
+    int16x8_t v1598 = vaddq_s16(v1595, v1597);
+    int16x8_t v1599 = vqrdmulhq_n_s16(v1598, 22891);
+    int16x8_t v1600 = vaddq_s16(v1594, v1599);
+    int16x8_t v1601 = vsubq_s16(v1591, v1593);
+    int16x8_t v1602 = vsubq_s16(v1595, v1597);
+    int16x8_t v1603 = vqrdmulhq_n_s16(v1602, 23460);
+    int16x8_t v1604 = vaddq_s16(v1601, v1603);
+    int16x8_t v1605 = vsubq_s16(v1581, v1583);
+    int16x8_t v1606 = vsubq_s16(v1585, v1587);
+    int16x8_t v1607 = vqrdmulhq_n_s16(v1606, 24073);
+    int16x8_t v1608 = vaddq_s16(v1605, v1607);
+    int16x8_t v1609 = vsubq_s16(v1571, v1573);
+    int16x8_t v1610 = vsubq_s16(v1575, v1577);
+    int16x8_t v1611 = vqrdmulhq_n_s16(v1610, 24734);
+    int16x8_t v1612 = vaddq_s16(v1609, v1611);
+    int16x8_t v1613 = vsubq_s16(v1561, v1563);
+    int16x8_t v1614 = vsubq_s16(v1565, v1567);
+    int16x8_t v1615 = vqrdmulhq_n_s16(v1614, 25448);
+    int16x8_t v1616 = vaddq_s16(v1613, v1615);
+    int16x8_t v1617 = vsubq_s16(v1551, v1553);
+    int16x8_t v1618 = vsubq_s16(v1555, v1557);
+    int16x8_t v1619 = vqrdmulhq_n_s16(v1618, 26220);
+    int16x8_t v1620 = vaddq_s16(v1617, v1619);
+    int16x8_t v1621 = vsubq_s16(v1541, v1543);
+    int16x8_t v1622 = vsubq_s16(v1545, v1547);
+    int16x8_t v1623 = vqrdmulhq_n_s16(v1622, 27058);
+    int16x8_t v1624 = vaddq_s16(v1621, v1623);
+    int16x8_t v1625 = vsubq_s16(v1531, v1533);
+    int16x8_t v1626 = vsubq_s16(v1535, v1537);
+    int16x8_t v1627 = vqrdmulhq_n_s16(v1626, 27969);
+    int16x8_t v1628 = vaddq_s16(v1625, v1627);
+    int16x8_t v1629 = vsubq_s16(v1521, v1523);
+    int16x8_t v1630 = vsubq_s16(v1525, v1527);
+    int16x8_t v1631 = vqrdmulhq_n_s16(v1630, 28961);
+    int16x8_t v1632 = vaddq_s16(v1629, v1631);
+    int16x8_t v1633 = vsubq_s16(v1511, v1513);
+    int16x8_t v1634 = vsubq_s16(v1515, v1517);
+    int16x8_t v1635 = vqrdmulhq_n_s16(v1634, 30044);
+    int16x8_t v1636 = vaddq_s16(v1633, v1635);
+    int16x8_t v1637 = vsubq_s16(v1501, v1503);
+    int16x8_t v1638 = vsubq_s16(v1505, v1507);
+    int16x8_t v1639 = vqrdmulhq_n_s16(v1638, 31232);
+    int16x8_t v1640 = vaddq_s16(v1637, v1639);
+    int16x8_t v1641 = vsubq_s16(v1491, v1493);
+    int16x8_t v1642 = vsubq_s16(v1495, v1497);
+    int16x8_t v1643 = vqrdmulhq_n_s16(v1642, 32538);
+    int16x8_t v1644 = vaddq_s16(v1641, v1643);
+    int16x8_t v1645 = vsubq_s16(v1481, v1483);
+    int16x8_t v1646 = vsubq_s16(v1485, v1487);
+    int16x8_t v1647_tmp = vqrdmulhq_n_s16(v1646, 1211);
+    int16x8_t v1647 = vaddq_s16(v1647_tmp, v1646);
+    int16x8_t v1648 = vaddq_s16(v1645, v1647);
+    int16x8_t v1649 = vsubq_s16(v1471, v1473);
+    int16x8_t v1650 = vsubq_s16(v1475, v1477);
+    int16x8_t v1651_tmp = vqrdmulhq_n_s16(v1650, 2808);
+    int16x8_t v1651 = vaddq_s16(v1651_tmp, v1650);
+    int16x8_t v1652 = vaddq_s16(v1649, v1651);
+    int16x8_t v1653 = vsubq_s16(v1461, v1463);
+    int16x8_t v1654 = vsubq_s16(v1465, v1467);
+    int16x8_t v1655_tmp = vqrdmulhq_n_s16(v1654, 4586);
+    int16x8_t v1655 = vaddq_s16(v1655_tmp, v1654);
+    int16x8_t v1656 = vaddq_s16(v1653, v1655);
+    int16x8_t v1657 = vsubq_s16(v1451, v1453);
+    int16x8_t v1658 = vsubq_s16(v1455, v1457);
+    int16x8_t v1659_tmp = vqrdmulhq_n_s16(v1658, 6576);
+    int16x8_t v1659 = vaddq_s16(v1659_tmp, v1658);
+    int16x8_t v1660 = vaddq_s16(v1657, v1659);
+    int16x8_t v1661 = vsubq_s16(v1441, v1443);
+    int16x8_t v1662 = vsubq_s16(v1445, v1447);
+    int16x8_t v1663_tmp = vqrdmulhq_n_s16(v1662, 8817);
+    int16x8_t v1663 = vaddq_s16(v1663_tmp, v1662);
+    int16x8_t v1664 = vaddq_s16(v1661, v1663);
+    int16x8_t v1665 = vsubq_s16(v1422, v1427);
+    int16x8_t v1666 = vsubq_s16(v1432, v1437);
+    int16x8_t v1667_tmp = vqrdmulhq_n_s16(v1666, 11356);
+    int16x8_t v1667 = vaddq_s16(v1667_tmp, v1666);
+    int16x8_t v1668 = vaddq_s16(v1665, v1667);
+    int16x8_t v1669 = vsubq_s16(v1400, v1405);
+    int16x8_t v1670 = vsubq_s16(v1410, v1415);
+    int16x8_t v1671_tmp = vqrdmulhq_n_s16(v1670, 14256);
+    int16x8_t v1671 = vaddq_s16(v1671_tmp, v1670);
+    int16x8_t v1672 = vaddq_s16(v1669, v1671);
+    int16x8_t v1673 = vsubq_s16(v1378, v1383);
+    int16x8_t v1674 = vsubq_s16(v1388, v1393);
+    int16x8_t v1675_tmp = vqrdmulhq_n_s16(v1674, 17596);
+    int16x8_t v1675 = vaddq_s16(v1675_tmp, v1674);
+    int16x8_t v1676 = vaddq_s16(v1673, v1675);
+    int16x8_t v1677 = vsubq_s16(v1356, v1361);
+    int16x8_t v1678 = vsubq_s16(v1366, v1371);
+    int16x8_t v1679_tmp = vqrdmulhq_n_s16(v1678, 21483);
+    int16x8_t v1679 = vaddq_s16(v1679_tmp, v1678);
+    int16x8_t v1680 = vaddq_s16(v1677, v1679);
+    int16x8_t v1681 = vsubq_s16(v1334, v1339);
+    int16x8_t v1682 = vsubq_s16(v1344, v1349);
+    int16x8_t v1683_tmp = vqrdmulhq_n_s16(v1682, 26057);
+    int16x8_t v1683 = vaddq_s16(v1683_tmp, v1682);
+    int16x8_t v1684 = vaddq_s16(v1681, v1683);
+    int16x8_t v1685 = vsubq_s16(v1312, v1317);
+    int16x8_t v1686 = vsubq_s16(v1322, v1327);
+    int16x8_t v1687_tmp = vqrdmulhq_n_s16(v1686, 31517);
+    int16x8_t v1687 = vaddq_s16(v1687_tmp, v1686);
+    int16x8_t v1688 = vaddq_s16(v1685, v1687);
+    int16x8_t v1689 = vsubq_s16(v1290, v1295);
+    int16x8_t v1690 = vsubq_s16(v1300, v1305);
+    int16x8_t v1691_tmp = vqrdmulhq_n_s16(v1690, 5373);
+    int16x8_t v1691 = vmlaq_n_s16(v1691_tmp, v1690, 2);
+    int16x8_t v1692 = vaddq_s16(v1689, v1691);
+    int16x8_t v1693 = vsubq_s16(v1268, v1273);
+    int16x8_t v1694 = vsubq_s16(v1278, v1283);
+    int16x8_t v1695_tmp = vqrdmulhq_n_s16(v1694, 13571);
+    int16x8_t v1695 = vmlaq_n_s16(v1695_tmp, v1694, 2);
+    int16x8_t v1696 = vaddq_s16(v1693, v1695);
+    int16x8_t v1697 = vsubq_s16(v1228, v1239);
+    int16x8_t v1698 = vsubq_s16(v1250, v1261);
+    int16x8_t v1699_tmp = vqrdmulhq_n_s16(v1698, 23975);
+    int16x8_t v1699 = vmlaq_n_s16(v1699_tmp, v1698, 2);
+    int16x8_t v1700 = vaddq_s16(v1697, v1699);
+    int16x8_t v1701 = vsubq_s16(v1182, v1193);
+    int16x8_t v1702 = vsubq_s16(v1204, v1215);
+    int16x8_t v1703_tmp = vqrdmulhq_n_s16(v1702, 4832);
+    int16x8_t v1703 = vmlaq_n_s16(v1703_tmp, v1702, 3);
+    int16x8_t v1704 = vaddq_s16(v1701, v1703);
+    int16x8_t v1705 = vsubq_s16(v1136, v1147);
+    int16x8_t v1706 = vsubq_s16(v1158, v1169);
+    int16x8_t v1707_tmp = vqrdmulhq_n_s16(v1706, 23437);
+    int16x8_t v1707 = vmlaq_n_s16(v1707_tmp, v1706, 3);
+    int16x8_t v1708 = vaddq_s16(v1705, v1707);
+    int16x8_t v1709 = vsubq_s16(v1090, v1101);
+    int16x8_t v1710 = vsubq_s16(v1112, v1123);
+    int16x8_t v1711_tmp = vqrdmulhq_n_s16(v1710, 17573);
+    int16x8_t v1711 = vmlaq_n_s16(v1711_tmp, v1710, 4);
+    int16x8_t v1712 = vaddq_s16(v1709, v1711);
+    int16x8_t v1713 = vsubq_s16(v1008, v1031);
+    int16x8_t v1714 = vsubq_s16(v1054, v1077);
+    int16x8_t v1715_tmp = vqrdmulhq_n_s16(v1714, 27122);
+    int16x8_t v1715 = vmlaq_n_s16(v1715_tmp, v1714, 5);
+    int16x8_t v1716 = vaddq_s16(v1713, v1715);
+    int16x8_t v1717 = vsubq_s16(v913, v937);
+    int16x8_t v1718 = vsubq_s16(v960, v983);
+    int16x8_t v1719_tmp = vqrdmulhq_n_s16(v1718, 5041);
+    int16x8_t v1719 = vmlaq_n_s16(v1719_tmp, v1718, 8);
+    int16x8_t v1720 = vaddq_s16(v1717, v1719);
+    int16x8_t v1721 = vsubq_s16(v747, v794);
+    int16x8_t v1722 = vsubq_s16(v841, v888);
+    int16x8_t v1723_tmp = vqrdmulhq_n_s16(v1722, 19146);
+    int16x8_t v1723 = vmlaq_n_s16(v1723_tmp, v1722, 13);
+    int16x8_t v1724 = vaddq_s16(v1721, v1723);
+    int16x8_t v1725 = vsubq_s16(v141, v316);
+    int16x8_t v1726 = vsubq_s16(v522, v698);
+    int16x8_t v1727_tmp = vqrdmulhq_n_s16(v1726, 24402);
+    int16x8_t v1727 = vmlaq_n_s16(v1727_tmp, v1726, 40);
+    int16x8_t v1728 = vaddq_s16(v1725, v1727);
+    int16x8_t v1729 = vsubq_s16(v1725, v1727);
+    int16x8_t v1730 = vsubq_s16(v1721, v1723);
+    int16x8_t v1731 = vsubq_s16(v1717, v1719);
+    int16x8_t v1732 = vsubq_s16(v1713, v1715);
+    int16x8_t v1733 = vsubq_s16(v1709, v1711);
+    int16x8_t v1734 = vsubq_s16(v1705, v1707);
+    int16x8_t v1735 = vsubq_s16(v1701, v1703);
+    int16x8_t v1736 = vsubq_s16(v1697, v1699);
+    int16x8_t v1737 = vsubq_s16(v1693, v1695);
+    int16x8_t v1738 = vsubq_s16(v1689, v1691);
+    int16x8_t v1739 = vsubq_s16(v1685, v1687);
+    int16x8_t v1740 = vsubq_s16(v1681, v1683);
+    int16x8_t v1741 = vsubq_s16(v1677, v1679);
+    int16x8_t v1742 = vsubq_s16(v1673, v1675);
+    int16x8_t v1743 = vsubq_s16(v1669, v1671);
+    int16x8_t v1744 = vsubq_s16(v1665, v1667);
+    int16x8_t v1745 = vsubq_s16(v1661, v1663);
+    int16x8_t v1746 = vsubq_s16(v1657, v1659);
+    int16x8_t v1747 = vsubq_s16(v1653, v1655);
+    int16x8_t v1748 = vsubq_s16(v1649, v1651);
+    int16x8_t v1749 = vsubq_s16(v1645, v1647);
+    int16x8_t v1750 = vsubq_s16(v1641, v1643);
+    int16x8_t v1751 = vsubq_s16(v1637, v1639);
+    int16x8_t v1752 = vsubq_s16(v1633, v1635);
+    int16x8_t v1753 = vsubq_s16(v1629, v1631);
+    int16x8_t v1754 = vsubq_s16(v1625, v1627);
+    int16x8_t v1755 = vsubq_s16(v1621, v1623);
+    int16x8_t v1756 = vsubq_s16(v1617, v1619);
+    int16x8_t v1757 = vsubq_s16(v1613, v1615);
+    int16x8_t v1758 = vsubq_s16(v1609, v1611);
+    int16x8_t v1759 = vsubq_s16(v1605, v1607);
+    int16x8_t v1760 = vsubq_s16(v1601, v1603);
+    int16x8_t v1761 = vsubq_s16(v1594, v1599);
+    int16x8_t v1762 = vsubq_s16(v1584, v1589);
+    int16x8_t v1763 = vsubq_s16(v1574, v1579);
+    int16x8_t v1764 = vsubq_s16(v1564, v1569);
+    int16x8_t v1765 = vsubq_s16(v1554, v1559);
+    int16x8_t v1766 = vsubq_s16(v1544, v1549);
+    int16x8_t v1767 = vsubq_s16(v1534, v1539);
+    int16x8_t v1768 = vsubq_s16(v1524, v1529);
+    int16x8_t v1769 = vsubq_s16(v1514, v1519);
+    int16x8_t v1770 = vsubq_s16(v1504, v1509);
+    int16x8_t v1771 = vsubq_s16(v1494, v1499);
+    int16x8_t v1772 = vsubq_s16(v1484, v1489);
+    int16x8_t v1773 = vsubq_s16(v1474, v1479);
+    int16x8_t v1774 = vsubq_s16(v1464, v1469);
+    int16x8_t v1775 = vsubq_s16(v1454, v1459);
+    int16x8_t v1776 = vsubq_s16(v1444, v1449);
+    int16x8_t v1777 = vsubq_s16(v1428, v1439);
+    int16x8_t v1778 = vsubq_s16(v1406, v1417);
+    int16x8_t v1779 = vsubq_s16(v1384, v1395);
+    int16x8_t v1780 = vsubq_s16(v1362, v1373);
+    int16x8_t v1781 = vsubq_s16(v1340, v1351);
+    int16x8_t v1782 = vsubq_s16(v1318, v1329);
+    int16x8_t v1783 = vsubq_s16(v1296, v1307);
+    int16x8_t v1784 = vsubq_s16(v1274, v1285);
+    int16x8_t v1785 = vsubq_s16(v1240, v1263);
+    int16x8_t v1786 = vsubq_s16(v1194, v1217);
+    int16x8_t v1787 = vsubq_s16(v1148, v1171);
+    int16x8_t v1788 = vsubq_s16(v1102, v1125);
+    int16x8_t v1789 = vsubq_s16(v1032, v1079);
+    int16x8_t v1790 = vsubq_s16(v938, v985);
+    int16x8_t v1791 = vsubq_s16(v795, v890);
+    int16x8_t v1792 = vsubq_s16(v317, v700);
+    vst1q_s16(out + out_stride * 0 + i, v701);
+    vst1q_s16(out + out_stride * 1 + i, v891);
+    vst1q_s16(out + out_stride * 2 + i, v986);
+    vst1q_s16(out + out_stride * 3 + i, v1080);
+    vst1q_s16(out + out_stride * 4 + i, v1126);
+    vst1q_s16(out + out_stride * 5 + i, v1172);
+    vst1q_s16(out + out_stride * 6 + i, v1218);
+    vst1q_s16(out + out_stride * 7 + i, v1264);
+    vst1q_s16(out + out_stride * 8 + i, v1286);
+    vst1q_s16(out + out_stride * 9 + i, v1308);
+    vst1q_s16(out + out_stride * 10 + i, v1330);
+    vst1q_s16(out + out_stride * 11 + i, v1352);
+    vst1q_s16(out + out_stride * 12 + i, v1374);
+    vst1q_s16(out + out_stride * 13 + i, v1396);
+    vst1q_s16(out + out_stride * 14 + i, v1418);
+    vst1q_s16(out + out_stride * 15 + i, v1440);
+    vst1q_s16(out + out_stride * 16 + i, v1450);
+    vst1q_s16(out + out_stride * 17 + i, v1460);
+    vst1q_s16(out + out_stride * 18 + i, v1470);
+    vst1q_s16(out + out_stride * 19 + i, v1480);
+    vst1q_s16(out + out_stride * 20 + i, v1490);
+    vst1q_s16(out + out_stride * 21 + i, v1500);
+    vst1q_s16(out + out_stride * 22 + i, v1510);
+    vst1q_s16(out + out_stride * 23 + i, v1520);
+    vst1q_s16(out + out_stride * 24 + i, v1530);
+    vst1q_s16(out + out_stride * 25 + i, v1540);
+    vst1q_s16(out + out_stride * 26 + i, v1550);
+    vst1q_s16(out + out_stride * 27 + i, v1560);
+    vst1q_s16(out + out_stride * 28 + i, v1570);
+    vst1q_s16(out + out_stride * 29 + i, v1580);
+    vst1q_s16(out + out_stride * 30 + i, v1590);
+    vst1q_s16(out + out_stride * 31 + i, v1600);
+    vst1q_s16(out + out_stride * 32 + i, v1604);
+    vst1q_s16(out + out_stride * 33 + i, v1608);
+    vst1q_s16(out + out_stride * 34 + i, v1612);
+    vst1q_s16(out + out_stride * 35 + i, v1616);
+    vst1q_s16(out + out_stride * 36 + i, v1620);
+    vst1q_s16(out + out_stride * 37 + i, v1624);
+    vst1q_s16(out + out_stride * 38 + i, v1628);
+    vst1q_s16(out + out_stride * 39 + i, v1632);
+    vst1q_s16(out + out_stride * 40 + i, v1636);
+    vst1q_s16(out + out_stride * 41 + i, v1640);
+    vst1q_s16(out + out_stride * 42 + i, v1644);
+    vst1q_s16(out + out_stride * 43 + i, v1648);
+    vst1q_s16(out + out_stride * 44 + i, v1652);
+    vst1q_s16(out + out_stride * 45 + i, v1656);
+    vst1q_s16(out + out_stride * 46 + i, v1660);
+    vst1q_s16(out + out_stride * 47 + i, v1664);
+    vst1q_s16(out + out_stride * 48 + i, v1668);
+    vst1q_s16(out + out_stride * 49 + i, v1672);
+    vst1q_s16(out + out_stride * 50 + i, v1676);
+    vst1q_s16(out + out_stride * 51 + i, v1680);
+    vst1q_s16(out + out_stride * 52 + i, v1684);
+    vst1q_s16(out + out_stride * 53 + i, v1688);
+    vst1q_s16(out + out_stride * 54 + i, v1692);
+    vst1q_s16(out + out_stride * 55 + i, v1696);
+    vst1q_s16(out + out_stride * 56 + i, v1700);
+    vst1q_s16(out + out_stride * 57 + i, v1704);
+    vst1q_s16(out + out_stride * 58 + i, v1708);
+    vst1q_s16(out + out_stride * 59 + i, v1712);
+    vst1q_s16(out + out_stride * 60 + i, v1716);
+    vst1q_s16(out + out_stride * 61 + i, v1720);
+    vst1q_s16(out + out_stride * 62 + i, v1724);
+    vst1q_s16(out + out_stride * 63 + i, v1728);
+    vst1q_s16(out + out_stride * 64 + i, v1729);
+    vst1q_s16(out + out_stride * 65 + i, v1730);
+    vst1q_s16(out + out_stride * 66 + i, v1731);
+    vst1q_s16(out + out_stride * 67 + i, v1732);
+    vst1q_s16(out + out_stride * 68 + i, v1733);
+    vst1q_s16(out + out_stride * 69 + i, v1734);
+    vst1q_s16(out + out_stride * 70 + i, v1735);
+    vst1q_s16(out + out_stride * 71 + i, v1736);
+    vst1q_s16(out + out_stride * 72 + i, v1737);
+    vst1q_s16(out + out_stride * 73 + i, v1738);
+    vst1q_s16(out + out_stride * 74 + i, v1739);
+    vst1q_s16(out + out_stride * 75 + i, v1740);
+    vst1q_s16(out + out_stride * 76 + i, v1741);
+    vst1q_s16(out + out_stride * 77 + i, v1742);
+    vst1q_s16(out + out_stride * 78 + i, v1743);
+    vst1q_s16(out + out_stride * 79 + i, v1744);
+    vst1q_s16(out + out_stride * 80 + i, v1745);
+    vst1q_s16(out + out_stride * 81 + i, v1746);
+    vst1q_s16(out + out_stride * 82 + i, v1747);
+    vst1q_s16(out + out_stride * 83 + i, v1748);
+    vst1q_s16(out + out_stride * 84 + i, v1749);
+    vst1q_s16(out + out_stride * 85 + i, v1750);
+    vst1q_s16(out + out_stride * 86 + i, v1751);
+    vst1q_s16(out + out_stride * 87 + i, v1752);
+    vst1q_s16(out + out_stride * 88 + i, v1753);
+    vst1q_s16(out + out_stride * 89 + i, v1754);
+    vst1q_s16(out + out_stride * 90 + i, v1755);
+    vst1q_s16(out + out_stride * 91 + i, v1756);
+    vst1q_s16(out + out_stride * 92 + i, v1757);
+    vst1q_s16(out + out_stride * 93 + i, v1758);
+    vst1q_s16(out + out_stride * 94 + i, v1759);
+    vst1q_s16(out + out_stride * 95 + i, v1760);
+    vst1q_s16(out + out_stride * 96 + i, v1761);
+    vst1q_s16(out + out_stride * 97 + i, v1762);
+    vst1q_s16(out + out_stride * 98 + i, v1763);
+    vst1q_s16(out + out_stride * 99 + i, v1764);
+    vst1q_s16(out + out_stride * 100 + i, v1765);
+    vst1q_s16(out + out_stride * 101 + i, v1766);
+    vst1q_s16(out + out_stride * 102 + i, v1767);
+    vst1q_s16(out + out_stride * 103 + i, v1768);
+    vst1q_s16(out + out_stride * 104 + i, v1769);
+    vst1q_s16(out + out_stride * 105 + i, v1770);
+    vst1q_s16(out + out_stride * 106 + i, v1771);
+    vst1q_s16(out + out_stride * 107 + i, v1772);
+    vst1q_s16(out + out_stride * 108 + i, v1773);
+    vst1q_s16(out + out_stride * 109 + i, v1774);
+    vst1q_s16(out + out_stride * 110 + i, v1775);
+    vst1q_s16(out + out_stride * 111 + i, v1776);
+    vst1q_s16(out + out_stride * 112 + i, v1777);
+    vst1q_s16(out + out_stride * 113 + i, v1778);
+    vst1q_s16(out + out_stride * 114 + i, v1779);
+    vst1q_s16(out + out_stride * 115 + i, v1780);
+    vst1q_s16(out + out_stride * 116 + i, v1781);
+    vst1q_s16(out + out_stride * 117 + i, v1782);
+    vst1q_s16(out + out_stride * 118 + i, v1783);
+    vst1q_s16(out + out_stride * 119 + i, v1784);
+    vst1q_s16(out + out_stride * 120 + i, v1785);
+    vst1q_s16(out + out_stride * 121 + i, v1786);
+    vst1q_s16(out + out_stride * 122 + i, v1787);
+    vst1q_s16(out + out_stride * 123 + i, v1788);
+    vst1q_s16(out + out_stride * 124 + i, v1789);
+    vst1q_s16(out + out_stride * 125 + i, v1790);
+    vst1q_s16(out + out_stride * 126 + i, v1791);
+    vst1q_s16(out + out_stride * 127 + i, v1792);
+  }
+}
diff --git a/third_party/jpeg-xl/lib/jxl/fast_dct16-inl.h b/third_party/jpeg-xl/lib/jxl/fast_dct16-inl.h
new file mode 100644
index 0000000000..472ec20d42
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/fast_dct16-inl.h
@@ -0,0 +1,180 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+/* This file is automatically generated. Do not modify it directly. */
+#if HWY_TARGET != HWY_NEON
+#error "only include this file from fast_dct-inl.h"
+#endif
+
+constexpr size_t FastIDCTIntegerBits(FastDCTTag<16>) { return 1; }
+
+void FastIDCT(FastDCTTag<16>, const int16_t* in, size_t in_stride, int16_t* out,
+              size_t out_stride, size_t count) {
+  JXL_ASSERT(count % 8 == 0);
+  for (size_t i = 0; i < count; i += 8) {
+    int16x8_t v0 = vld1q_s16(in + in_stride * 0 + i);
+    int16x8_t v1 = vld1q_s16(in + in_stride * 8 + i);
+    int16x8_t v2 = vaddq_s16(v0, v1);
+    int16x8_t v3 = vld1q_s16(in + in_stride * 4 + i);
+    int16x8_t v4_tmp = vqrdmulhq_n_s16(v3, 13573);
+    int16x8_t v4 = vaddq_s16(v4_tmp, v3);
+    int16x8_t v5 = vld1q_s16(in + in_stride * 12 + i);
+    int16x8_t v6 = vaddq_s16(v5, v3);
+    int16x8_t v7 = vaddq_s16(v4, v6);
+    int16x8_t v8 = vqrdmulhq_n_s16(v7, 17734);
+    int16x8_t v9 = vaddq_s16(v2, v8);
+    int16x8_t v10 = vld1q_s16(in + in_stride * 2 + i);
+    int16x8_t v11_tmp = vqrdmulhq_n_s16(v10, 13573);
+    int16x8_t v11 = vaddq_s16(v11_tmp, v10);
+    int16x8_t v12 = vld1q_s16(in + in_stride * 10 + i);
+    int16x8_t v13 = vld1q_s16(in + in_stride * 6 + i);
+    int16x8_t v14 = vaddq_s16(v12, v13);
+    int16x8_t v15 = vaddq_s16(v11, v14);
+    int16x8_t v16 = vaddq_s16(v13, v10);
+    int16x8_t v17 = vqrdmulhq_n_s16(v16, 25080);
+    int16x8_t v18 = vld1q_s16(in + in_stride * 14 + i);
+    int16x8_t v19 = vaddq_s16(v18, v12);
+    int16x8_t v20 = vaddq_s16(v16, v19);
+    int16x8_t v21 = vqrdmulhq_n_s16(v20, 17734);
+    int16x8_t v22 = vaddq_s16(v17, v21);
+    int16x8_t v23 = vaddq_s16(v15, v22);
+    int16x8_t v24 = vqrdmulhq_n_s16(v23, 16705);
+    int16x8_t v25 = vaddq_s16(v9, v24);
+    int16x8_t v26 = vld1q_s16(in + in_stride * 15 + i);
+    int16x8_t v27 = vld1q_s16(in + in_stride * 13 + i);
+    int16x8_t v28 = vaddq_s16(v26, v27);
+    int16x8_t v29 = vld1q_s16(in + in_stride * 11 + i);
+    int16x8_t v30 = vld1q_s16(in + in_stride * 9 + i);
+    int16x8_t v31 = vaddq_s16(v29, v30);
+    int16x8_t v32 = vaddq_s16(v28, v31);
+    int16x8_t v33 = vqrdmulhq_n_s16(v32, 17734);
+    int16x8_t v34 = vld1q_s16(in + in_stride * 3 + i);
+    int16x8_t v35 = vld1q_s16(in + in_stride * 1 + i);
+    int16x8_t v36 = vaddq_s16(v34, v35);
+    int16x8_t v37 = vld1q_s16(in + in_stride * 7 + i);
+    int16x8_t v38 = vld1q_s16(in + in_stride * 5 + i);
+    int16x8_t v39 = vaddq_s16(v37, v38);
+    int16x8_t v40 = vaddq_s16(v36, v39);
+    int16x8_t v41_tmp = vqrdmulhq_n_s16(v40, 10045);
+    int16x8_t v41 = vaddq_s16(v41_tmp, v40);
+    int16x8_t v42 = vaddq_s16(v33, v41);
+    int16x8_t v43 = vqrdmulhq_n_s16(v42, 16705);
+    int16x8_t v44_tmp = vqrdmulhq_n_s16(v36, 13573);
+    int16x8_t v44 = vaddq_s16(v44_tmp, v36);
+    int16x8_t v45 = vaddq_s16(v39, v31);
+    int16x8_t v46 = vaddq_s16(v44, v45);
+    int16x8_t v47 = vqrdmulhq_n_s16(v46, 16705);
+    int16x8_t v48 = vaddq_s16(v43, v47);
+    int16x8_t v49_tmp = vqrdmulhq_n_s16(v35, 13573);
+    int16x8_t v49 = vaddq_s16(v49_tmp, v35);
+    int16x8_t v50 = vaddq_s16(v30, v37);
+    int16x8_t v51 = vaddq_s16(v49, v50);
+    int16x8_t v52 = vaddq_s16(v38, v34);
+    int16x8_t v53 = vaddq_s16(v27, v29);
+    int16x8_t v54 = vaddq_s16(v52, v53);
+    int16x8_t v55 = vqrdmulhq_n_s16(v54, 17734);
+    int16x8_t v56 = vqrdmulhq_n_s16(v52, 25080);
+    int16x8_t v57 = vaddq_s16(v55, v56);
+    int16x8_t v58 = vaddq_s16(v51, v57);
+    int16x8_t v59 = vaddq_s16(v48, v58);
+    int16x8_t v60 = vqrdmulhq_n_s16(v59, 16463);
+    int16x8_t v61 = vaddq_s16(v25, v60);
+    int16x8_t v62 = vsubq_s16(v0, v1);
+    int16x8_t v63 = vsubq_s16(v4, v6);
+    int16x8_t v64_tmp = vqrdmulhq_n_s16(v63, 10045);
+    int16x8_t v64 = vaddq_s16(v64_tmp, v63);
+    int16x8_t v65 = vaddq_s16(v62, v64);
+    int16x8_t v66 = vsubq_s16(v11, v14);
+    int16x8_t v67 = vqrdmulhq_n_s16(v16, 17734);
+    int16x8_t v68_tmp = vqrdmulhq_n_s16(v19, 10045);
+    int16x8_t v68 = vaddq_s16(v68_tmp, v19);
+    int16x8_t v69 = vsubq_s16(v67, v68);
+    int16x8_t v70 = vaddq_s16(v66, v69);
+    int16x8_t v71 = vqrdmulhq_n_s16(v70, 19705);
+    int16x8_t v72 = vaddq_s16(v65, v71);
+    int16x8_t v73 = vsubq_s16(v49, v50);
+    int16x8_t v74 = vqrdmulhq_n_s16(v52, 17734);
+    int16x8_t v75_tmp = vqrdmulhq_n_s16(v53, 10045);
+    int16x8_t v75 = vaddq_s16(v75_tmp, v53);
+    int16x8_t v76 = vsubq_s16(v74, v75);
+    int16x8_t v77 = vaddq_s16(v73, v76);
+    int16x8_t v78 = vsubq_s16(v44, v45);
+    int16x8_t v79 = vqrdmulhq_n_s16(v78, 19705);
+    int16x8_t v80 = vqrdmulhq_n_s16(v40, 13573);
+    int16x8_t v81 = vsubq_s16(v80, v32);
+    int16x8_t v82 = vqrdmulhq_n_s16(v81, 25746);
+    int16x8_t v83 = vaddq_s16(v79, v82);
+    int16x8_t v84 = vaddq_s16(v77, v83);
+    int16x8_t v85 = vqrdmulhq_n_s16(v84, 17121);
+    int16x8_t v86 = vaddq_s16(v72, v85);
+    int16x8_t v87 = vsubq_s16(v62, v64);
+    int16x8_t v88 = vsubq_s16(v66, v69);
+    int16x8_t v89 = vqrdmulhq_n_s16(v88, 29490);
+    int16x8_t v90 = vaddq_s16(v87, v89);
+    int16x8_t v91 = vsubq_s16(v73, v76);
+    int16x8_t v92 = vqrdmulhq_n_s16(v78, 29490);
+    int16x8_t v93_tmp = vqrdmulhq_n_s16(v81, 5763);
+    int16x8_t v93 = vaddq_s16(v93_tmp, v81);
+    int16x8_t v94 = vsubq_s16(v92, v93);
+    int16x8_t v95 = vaddq_s16(v91, v94);
+    int16x8_t v96 = vqrdmulhq_n_s16(v95, 18578);
+    int16x8_t v97 = vaddq_s16(v90, v96);
+    int16x8_t v98 = vsubq_s16(v46, v42);
+    int16x8_t v99_tmp = vqrdmulhq_n_s16(v98, 18446);
+    int16x8_t v99 = vmlaq_n_s16(v99_tmp, v98, 2);
+    int16x8_t v100 = vsubq_s16(v51, v57);
+    int16x8_t v101 = vaddq_s16(v99, v100);
+    int16x8_t v102 = vqrdmulhq_n_s16(v101, 21195);
+    int16x8_t v103 = vsubq_s16(v2, v8);
+    int16x8_t v104 = vsubq_s16(v15, v22);
+    int16x8_t v105_tmp = vqrdmulhq_n_s16(v104, 18446);
+    int16x8_t v105 = vmlaq_n_s16(v105_tmp, v104, 2);
+    int16x8_t v106 = vaddq_s16(v103, v105);
+    int16x8_t v107 = vaddq_s16(v102, v106);
+    int16x8_t v108 = vsubq_s16(v103, v105);
+    int16x8_t v109 = vsubq_s16(v100, v99);
+    int16x8_t v110 = vqrdmulhq_n_s16(v109, 25826);
+    int16x8_t v111 = vaddq_s16(v108, v110);
+    int16x8_t v112 = vsubq_s16(v87, v89);
+    int16x8_t v113 = vsubq_s16(v91, v94);
+    int16x8_t v114_tmp = vqrdmulhq_n_s16(v113, 1988);
+    int16x8_t v114 = vaddq_s16(v114_tmp, v113);
+    int16x8_t v115 = vaddq_s16(v112, v114);
+    int16x8_t v116 = vsubq_s16(v65, v71);
+    int16x8_t v117 = vsubq_s16(v77, v83);
+    int16x8_t v118_tmp = vqrdmulhq_n_s16(v117, 23673);
+    int16x8_t v118 = vaddq_s16(v118_tmp, v117);
+    int16x8_t v119 = vaddq_s16(v116, v118);
+    int16x8_t v120 = vsubq_s16(v58, v48);
+    int16x8_t v121_tmp = vqrdmulhq_n_s16(v120, 3314);
+    int16x8_t v121 = vmlaq_n_s16(v121_tmp, v120, 5);
+    int16x8_t v122 = vsubq_s16(v9, v24);
+    int16x8_t v123 = vaddq_s16(v121, v122);
+    int16x8_t v124 = vsubq_s16(v122, v121);
+    int16x8_t v125 = vsubq_s16(v116, v118);
+    int16x8_t v126 = vsubq_s16(v112, v114);
+    int16x8_t v127 = vsubq_s16(v108, v110);
+    int16x8_t v128 = vsubq_s16(v106, v102);
+    int16x8_t v129 = vsubq_s16(v90, v96);
+    int16x8_t v130 = vsubq_s16(v72, v85);
+    int16x8_t v131 = vsubq_s16(v25, v60);
+    vst1q_s16(out + out_stride * 0 + i, v61);
+    vst1q_s16(out + out_stride * 1 + i, v86);
+    vst1q_s16(out + out_stride * 2 + i, v97);
+    vst1q_s16(out + out_stride * 3 + i, v107);
+    vst1q_s16(out + out_stride * 4 + i, v111);
+    vst1q_s16(out + out_stride * 5 + i, v115);
+    vst1q_s16(out + out_stride * 6 + i, v119);
+    vst1q_s16(out + out_stride * 7 + i, v123);
+    vst1q_s16(out + out_stride * 8 + i, v124);
+    vst1q_s16(out + out_stride * 9 + i, v125);
+    vst1q_s16(out + out_stride * 10 + i, v126);
+    vst1q_s16(out + out_stride * 11 + i, v127);
+    vst1q_s16(out + out_stride * 12 + i, v128);
+    vst1q_s16(out + out_stride * 13 + i, v129);
+    vst1q_s16(out + out_stride * 14 + i, v130);
+    vst1q_s16(out + out_stride * 15 + i, v131);
+  }
+}
diff --git a/third_party/jpeg-xl/lib/jxl/fast_dct256-inl.h b/third_party/jpeg-xl/lib/jxl/fast_dct256-inl.h
new file mode 100644
index 0000000000..a823440af2
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/fast_dct256-inl.h
@@ -0,0 +1,4811 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+/* This file is automatically generated. Do not modify it directly. */
+#if HWY_TARGET != HWY_NEON
+#error "only include this file from fast_dct-inl.h"
+#endif
+
+constexpr size_t FastIDCTIntegerBits(FastDCTTag<256>) { return 3; }
+
+void FastIDCT(FastDCTTag<256>, const int16_t* in, size_t in_stride,
+              int16_t* out, size_t out_stride, size_t count) {
+  JXL_ASSERT(count % 8 == 0);
+  for (size_t i = 0; i < count; i += 8) {
+    int16x8_t v0 = vld1q_s16(in + in_stride * 0 + i);
+    int16x8_t v1 = vld1q_s16(in + in_stride * 128 + i);
+    int16x8_t v2 = vaddq_s16(v0, v1);
+    int16x8_t v3 = vld1q_s16(in + in_stride * 64 + i);
+    int16x8_t v4_tmp = vqrdmulhq_n_s16(v3, 13573);
+    int16x8_t v4 = vaddq_s16(v4_tmp, v3);
+    int16x8_t v5 = vld1q_s16(in + in_stride * 192 + i);
+    int16x8_t v6 = vaddq_s16(v5, v3);
+    int16x8_t v7 = vaddq_s16(v4, v6);
+    int16x8_t v8 = vqrdmulhq_n_s16(v7, 17734);
+    int16x8_t v9 = vaddq_s16(v2, v8);
+    int16x8_t v10 = vld1q_s16(in + in_stride * 32 + i);
+    int16x8_t v11_tmp = vqrdmulhq_n_s16(v10, 13573);
+    int16x8_t v11 = vaddq_s16(v11_tmp, v10);
+    int16x8_t v12 = vld1q_s16(in + in_stride * 160 + i);
+    int16x8_t v13 = vld1q_s16(in + in_stride * 96 + i);
+    int16x8_t v14 = vaddq_s16(v12, v13);
+    int16x8_t v15 = vaddq_s16(v11, v14);
+    int16x8_t v16 = vaddq_s16(v13, v10);
+    int16x8_t v17_tmp = vqrdmulhq_n_s16(v16, 13573);
+    int16x8_t v17 = vaddq_s16(v17_tmp, v16);
+    int16x8_t v18 = vld1q_s16(in + in_stride * 224 + i);
+    int16x8_t v19 = vaddq_s16(v18, v12);
+    int16x8_t v20 = vaddq_s16(v19, v16);
+    int16x8_t v21 = vaddq_s16(v17, v20);
+    int16x8_t v22 = vqrdmulhq_n_s16(v21, 17734);
+    int16x8_t v23 = vaddq_s16(v15, v22);
+    int16x8_t v24 = vqrdmulhq_n_s16(v23, 16705);
+    int16x8_t v25 = vaddq_s16(v9, v24);
+    int16x8_t v26 = vld1q_s16(in + in_stride * 16 + i);
+    int16x8_t v27_tmp = vqrdmulhq_n_s16(v26, 13573);
+    int16x8_t v27 = vaddq_s16(v27_tmp, v26);
+    int16x8_t v28 = vld1q_s16(in + in_stride * 144 + i);
+    int16x8_t v29 = vld1q_s16(in + in_stride * 112 + i);
+    int16x8_t v30 = vaddq_s16(v28, v29);
+    int16x8_t v31 = vaddq_s16(v27, v30);
+    int16x8_t v32 = vld1q_s16(in + in_stride * 80 + i);
+    int16x8_t v33 = vld1q_s16(in + in_stride * 48 + i);
+    int16x8_t v34 = vaddq_s16(v32, v33);
+    int16x8_t v35_tmp = vqrdmulhq_n_s16(v34, 13573);
+    int16x8_t v35 = vaddq_s16(v35_tmp, v34);
+    int16x8_t v36 = vld1q_s16(in + in_stride * 208 + i);
+    int16x8_t v37 = vld1q_s16(in + in_stride * 176 + i);
+    int16x8_t v38 = vaddq_s16(v36, v37);
+    int16x8_t v39 = vaddq_s16(v38, v34);
+    int16x8_t v40 = vaddq_s16(v35, v39);
+    int16x8_t v41 = vqrdmulhq_n_s16(v40, 17734);
+    int16x8_t v42 = vaddq_s16(v31, v41);
+    int16x8_t v43 = vaddq_s16(v33, v26);
+    int16x8_t v44_tmp = vqrdmulhq_n_s16(v43, 13573);
+    int16x8_t v44 = vaddq_s16(v44_tmp, v43);
+    int16x8_t v45 = vaddq_s16(v37, v28);
+    int16x8_t v46 = vaddq_s16(v29, v32);
+    int16x8_t v47 = vaddq_s16(v45, v46);
+    int16x8_t v48 = vaddq_s16(v44, v47);
+    int16x8_t v49 = vaddq_s16(v46, v43);
+    int16x8_t v50_tmp = vqrdmulhq_n_s16(v49, 13573);
+    int16x8_t v50 = vaddq_s16(v50_tmp, v49);
+    int16x8_t v51 = vld1q_s16(in + in_stride * 240 + i);
+    int16x8_t v52 = vaddq_s16(v51, v36);
+    int16x8_t v53 = vaddq_s16(v52, v45);
+    int16x8_t v54 = vaddq_s16(v53, v49);
+    int16x8_t v55 = vaddq_s16(v50, v54);
+    int16x8_t v56 = vqrdmulhq_n_s16(v55, 17734);
+    int16x8_t v57 = vaddq_s16(v48, v56);
+    int16x8_t v58 = vqrdmulhq_n_s16(v57, 16705);
+    int16x8_t v59 = vaddq_s16(v42, v58);
+    int16x8_t v60 = vqrdmulhq_n_s16(v59, 16463);
+    int16x8_t v61 = vaddq_s16(v25, v60);
+    int16x8_t v62 = vld1q_s16(in + in_stride * 8 + i);
+    int16x8_t v63_tmp = vqrdmulhq_n_s16(v62, 13573);
+    int16x8_t v63 = vaddq_s16(v63_tmp, v62);
+    int16x8_t v64 = vld1q_s16(in + in_stride * 136 + i);
+    int16x8_t v65 = vld1q_s16(in + in_stride * 120 + i);
+    int16x8_t v66 = vaddq_s16(v64, v65);
+    int16x8_t v67 = vaddq_s16(v63, v66);
+    int16x8_t v68 = vld1q_s16(in + in_stride * 72 + i);
+    int16x8_t v69 = vld1q_s16(in + in_stride * 56 + i);
+    int16x8_t v70 = vaddq_s16(v68, v69);
+    int16x8_t v71_tmp = vqrdmulhq_n_s16(v70, 13573);
+    int16x8_t v71 = vaddq_s16(v71_tmp, v70);
+    int16x8_t v72 = vld1q_s16(in + in_stride * 200 + i);
+    int16x8_t v73 = vld1q_s16(in + in_stride * 184 + i);
+    int16x8_t v74 = vaddq_s16(v72, v73);
+    int16x8_t v75 = vaddq_s16(v74, v70);
+    int16x8_t v76 = vaddq_s16(v71, v75);
+    int16x8_t v77 = vqrdmulhq_n_s16(v76, 17734);
+    int16x8_t v78 = vaddq_s16(v67, v77);
+    int16x8_t v79 = vld1q_s16(in + in_stride * 40 + i);
+    int16x8_t v80 = vld1q_s16(in + in_stride * 24 + i);
+    int16x8_t v81 = vaddq_s16(v79, v80);
+    int16x8_t v82_tmp = vqrdmulhq_n_s16(v81, 13573);
+    int16x8_t v82 = vaddq_s16(v82_tmp, v81);
+    int16x8_t v83 = vld1q_s16(in + in_stride * 168 + i);
+    int16x8_t v84 = vld1q_s16(in + in_stride * 152 + i);
+    int16x8_t v85 = vaddq_s16(v83, v84);
+    int16x8_t v86 = vld1q_s16(in + in_stride * 104 + i);
+    int16x8_t v87 = vld1q_s16(in + in_stride * 88 + i);
+    int16x8_t v88 = vaddq_s16(v86, v87);
+    int16x8_t v89 = vaddq_s16(v85, v88);
+    int16x8_t v90 = vaddq_s16(v82, v89);
+    int16x8_t v91 = vaddq_s16(v88, v81);
+    int16x8_t v92_tmp = vqrdmulhq_n_s16(v91, 13573);
+    int16x8_t v92 = vaddq_s16(v92_tmp, v91);
+    int16x8_t v93 = vld1q_s16(in + in_stride * 232 + i);
+    int16x8_t v94 = vld1q_s16(in + in_stride * 216 + i);
+    int16x8_t v95 = vaddq_s16(v93, v94);
+    int16x8_t v96 = vaddq_s16(v95, v85);
+    int16x8_t v97 = vaddq_s16(v96, v91);
+    int16x8_t v98 = vaddq_s16(v92, v97);
+    int16x8_t v99 = vqrdmulhq_n_s16(v98, 17734);
+    int16x8_t v100 = vaddq_s16(v90, v99);
+    int16x8_t v101 = vqrdmulhq_n_s16(v100, 16705);
+    int16x8_t v102 = vaddq_s16(v78, v101);
+    int16x8_t v103 = vaddq_s16(v80, v62);
+    int16x8_t v104_tmp = vqrdmulhq_n_s16(v103, 13573);
+    int16x8_t v104 = vaddq_s16(v104_tmp, v103);
+    int16x8_t v105 = vaddq_s16(v84, v64);
+    int16x8_t v106 = vaddq_s16(v65, v86);
+    int16x8_t v107 = vaddq_s16(v105, v106);
+    int16x8_t v108 = vaddq_s16(v104, v107);
+    int16x8_t v109 = vaddq_s16(v87, v68);
+    int16x8_t v110 = vaddq_s16(v69, v79);
+    int16x8_t v111 = vaddq_s16(v109, v110);
+    int16x8_t v112_tmp = vqrdmulhq_n_s16(v111, 13573);
+    int16x8_t v112 = vaddq_s16(v112_tmp, v111);
+    int16x8_t v113 = vaddq_s16(v94, v72);
+    int16x8_t v114 = vaddq_s16(v73, v83);
+    int16x8_t v115 = vaddq_s16(v113, v114);
+    int16x8_t v116 = vaddq_s16(v115, v111);
+    int16x8_t v117 = vaddq_s16(v112, v116);
+    int16x8_t v118 = vqrdmulhq_n_s16(v117, 17734);
+    int16x8_t v119 = vaddq_s16(v108, v118);
+    int16x8_t v120 = vaddq_s16(v110, v103);
+    int16x8_t v121_tmp = vqrdmulhq_n_s16(v120, 13573);
+    int16x8_t v121 = vaddq_s16(v121_tmp, v120);
+    int16x8_t v122 = vaddq_s16(v114, v105);
+    int16x8_t v123 = vaddq_s16(v106, v109);
+    int16x8_t v124 = vaddq_s16(v122, v123);
+    int16x8_t v125 = vaddq_s16(v121, v124);
+    int16x8_t v126 = vaddq_s16(v123, v120);
+    int16x8_t v127_tmp = vqrdmulhq_n_s16(v126, 13573);
+    int16x8_t v127 = vaddq_s16(v127_tmp, v126);
+    int16x8_t v128 = vld1q_s16(in + in_stride * 248 + i);
+    int16x8_t v129 = vaddq_s16(v128, v93);
+    int16x8_t v130 = vaddq_s16(v129, v113);
+    int16x8_t v131 = vaddq_s16(v130, v122);
+    int16x8_t v132 = vaddq_s16(v131, v126);
+    int16x8_t v133 = vaddq_s16(v127, v132);
+    int16x8_t v134 = vqrdmulhq_n_s16(v133, 17734);
+    int16x8_t v135 = vaddq_s16(v125, v134);
+    int16x8_t v136 = vqrdmulhq_n_s16(v135, 16705);
+    int16x8_t v137 = vaddq_s16(v119, v136);
+    int16x8_t v138 = vqrdmulhq_n_s16(v137, 16463);
+    int16x8_t v139 = vaddq_s16(v102, v138);
+    int16x8_t v140 = vqrdmulhq_n_s16(v139, 16404);
+    int16x8_t v141 = vaddq_s16(v61, v140);
+    int16x8_t v142 = vld1q_s16(in + in_stride * 4 + i);
+    int16x8_t v143_tmp = vqrdmulhq_n_s16(v142, 13573);
+    int16x8_t v143 = vaddq_s16(v143_tmp, v142);
+    int16x8_t v144 = vld1q_s16(in + in_stride * 132 + i);
+    int16x8_t v145 = vld1q_s16(in + in_stride * 124 + i);
+    int16x8_t v146 = vaddq_s16(v144, v145);
+    int16x8_t v147 = vaddq_s16(v143, v146);
+    int16x8_t v148 = vld1q_s16(in + in_stride * 68 + i);
+    int16x8_t v149 = vld1q_s16(in + in_stride * 60 + i);
+    int16x8_t v150 = vaddq_s16(v148, v149);
+    int16x8_t v151_tmp = vqrdmulhq_n_s16(v150, 13573);
+    int16x8_t v151 = vaddq_s16(v151_tmp, v150);
+    int16x8_t v152 = vld1q_s16(in + in_stride * 196 + i);
+    int16x8_t v153 = vld1q_s16(in + in_stride * 188 + i);
+    int16x8_t v154 = vaddq_s16(v152, v153);
+    int16x8_t v155 = vaddq_s16(v154, v150);
+    int16x8_t v156 = vaddq_s16(v151, v155);
+    int16x8_t v157 = vqrdmulhq_n_s16(v156, 17734);
+    int16x8_t v158 = vaddq_s16(v147, v157);
+    int16x8_t v159 = vld1q_s16(in + in_stride * 36 + i);
+    int16x8_t v160 = vld1q_s16(in + in_stride * 28 + i);
+    int16x8_t v161 = vaddq_s16(v159, v160);
+    int16x8_t v162_tmp = vqrdmulhq_n_s16(v161, 13573);
+    int16x8_t v162 = vaddq_s16(v162_tmp, v161);
+    int16x8_t v163 = vld1q_s16(in + in_stride * 164 + i);
+    int16x8_t v164 = vld1q_s16(in + in_stride * 156 + i);
+    int16x8_t v165 = vaddq_s16(v163, v164);
+    int16x8_t v166 = vld1q_s16(in + in_stride * 100 + i);
+    int16x8_t v167 = vld1q_s16(in + in_stride * 92 + i);
+    int16x8_t v168 = vaddq_s16(v166, v167);
+    int16x8_t v169 = vaddq_s16(v165, v168);
+    int16x8_t v170 = vaddq_s16(v162, v169);
+    int16x8_t v171 = vaddq_s16(v168, v161);
+    int16x8_t v172_tmp = vqrdmulhq_n_s16(v171, 13573);
+    int16x8_t v172 = vaddq_s16(v172_tmp, v171);
+    int16x8_t v173 = vld1q_s16(in + in_stride * 228 + i);
+    int16x8_t v174 = vld1q_s16(in + in_stride * 220 + i);
+    int16x8_t v175 = vaddq_s16(v173, v174);
+    int16x8_t v176 = vaddq_s16(v175, v165);
+    int16x8_t v177 = vaddq_s16(v176, v171);
+    int16x8_t v178 = vaddq_s16(v172, v177);
+    int16x8_t v179 = vqrdmulhq_n_s16(v178, 17734);
+    int16x8_t v180 = vaddq_s16(v170, v179);
+    int16x8_t v181 = vqrdmulhq_n_s16(v180, 16705);
+    int16x8_t v182 = vaddq_s16(v158, v181);
+    int16x8_t v183 = vld1q_s16(in + in_stride * 20 + i);
+    int16x8_t v184 = vld1q_s16(in + in_stride * 12 + i);
+    int16x8_t v185 = vaddq_s16(v183, v184);
+    int16x8_t v186_tmp = vqrdmulhq_n_s16(v185, 13573);
+    int16x8_t v186 = vaddq_s16(v186_tmp, v185);
+    int16x8_t v187 = vld1q_s16(in + in_stride * 148 + i);
+    int16x8_t v188 = vld1q_s16(in + in_stride * 140 + i);
+    int16x8_t v189 = vaddq_s16(v187, v188);
+    int16x8_t v190 = vld1q_s16(in + in_stride * 116 + i);
+    int16x8_t v191 = vld1q_s16(in + in_stride * 108 + i);
+    int16x8_t v192 = vaddq_s16(v190, v191);
+    int16x8_t v193 = vaddq_s16(v189, v192);
+    int16x8_t v194 = vaddq_s16(v186, v193);
+    int16x8_t v195 = vld1q_s16(in + in_stride * 84 + i);
+    int16x8_t v196 = vld1q_s16(in + in_stride * 76 + i);
+    int16x8_t v197 = vaddq_s16(v195, v196);
+    int16x8_t v198 = vld1q_s16(in + in_stride * 52 + i);
+    int16x8_t v199 = vld1q_s16(in + in_stride * 44 + i);
+    int16x8_t v200 = vaddq_s16(v198, v199);
+    int16x8_t v201 = vaddq_s16(v197, v200);
+    int16x8_t v202_tmp = vqrdmulhq_n_s16(v201, 13573);
+    int16x8_t v202 = vaddq_s16(v202_tmp, v201);
+    int16x8_t v203 = vld1q_s16(in + in_stride * 212 + i);
+    int16x8_t v204 = vld1q_s16(in + in_stride * 204 + i);
+    int16x8_t v205 = vaddq_s16(v203, v204);
+    int16x8_t v206 = vld1q_s16(in + in_stride * 180 + i);
+    int16x8_t v207 = vld1q_s16(in + in_stride * 172 + i);
+    int16x8_t v208 = vaddq_s16(v206, v207);
+    int16x8_t v209 = vaddq_s16(v205, v208);
+    int16x8_t v210 = vaddq_s16(v209, v201);
+    int16x8_t v211 = vaddq_s16(v202, v210);
+    int16x8_t v212 = vqrdmulhq_n_s16(v211, 17734);
+    int16x8_t v213 = vaddq_s16(v194, v212);
+    int16x8_t v214 = vaddq_s16(v200, v185);
+    int16x8_t v215_tmp = vqrdmulhq_n_s16(v214, 13573);
+    int16x8_t v215 = vaddq_s16(v215_tmp, v214);
+    int16x8_t v216 = vaddq_s16(v208, v189);
+    int16x8_t v217 = vaddq_s16(v192, v197);
+    int16x8_t v218 = vaddq_s16(v216, v217);
+    int16x8_t v219 = vaddq_s16(v215, v218);
+    int16x8_t v220 = vaddq_s16(v217, v214);
+    int16x8_t v221_tmp = vqrdmulhq_n_s16(v220, 13573);
+    int16x8_t v221 = vaddq_s16(v221_tmp, v220);
+    int16x8_t v222 = vld1q_s16(in + in_stride * 244 + i);
+    int16x8_t v223 = vld1q_s16(in + in_stride * 236 + i);
+    int16x8_t v224 = vaddq_s16(v222, v223);
+    int16x8_t v225 = vaddq_s16(v224, v205);
+    int16x8_t v226 = vaddq_s16(v225, v216);
+    int16x8_t v227 = vaddq_s16(v226, v220);
+    int16x8_t v228 = vaddq_s16(v221, v227);
+    int16x8_t v229 = vqrdmulhq_n_s16(v228, 17734);
+    int16x8_t v230 = vaddq_s16(v219, v229);
+    int16x8_t v231 = vqrdmulhq_n_s16(v230, 16705);
+    int16x8_t v232 = vaddq_s16(v213, v231);
+    int16x8_t v233 = vqrdmulhq_n_s16(v232, 16463);
+    int16x8_t v234 = vaddq_s16(v182, v233);
+    int16x8_t v235 = vaddq_s16(v184, v142);
+    int16x8_t v236_tmp = vqrdmulhq_n_s16(v235, 13573);
+    int16x8_t v236 = vaddq_s16(v236_tmp, v235);
+    int16x8_t v237 = vaddq_s16(v188, v144);
+    int16x8_t v238 = vaddq_s16(v145, v190);
+    int16x8_t v239 = vaddq_s16(v237, v238);
+    int16x8_t v240 = vaddq_s16(v236, v239);
+    int16x8_t v241 = vaddq_s16(v196, v148);
+    int16x8_t v242 = vaddq_s16(v149, v198);
+    int16x8_t v243 = vaddq_s16(v241, v242);
+    int16x8_t v244_tmp = vqrdmulhq_n_s16(v243, 13573);
+    int16x8_t v244 = vaddq_s16(v244_tmp, v243);
+    int16x8_t v245 = vaddq_s16(v204, v152);
+    int16x8_t v246 = vaddq_s16(v153, v206);
+    int16x8_t v247 = vaddq_s16(v245, v246);
+    int16x8_t v248 = vaddq_s16(v247, v243);
+    int16x8_t v249 = vaddq_s16(v244, v248);
+    int16x8_t v250 = vqrdmulhq_n_s16(v249, 17734);
+    int16x8_t v251 = vaddq_s16(v240, v250);
+    int16x8_t v252 = vaddq_s16(v199, v159);
+    int16x8_t v253 = vaddq_s16(v160, v183);
+    int16x8_t v254 = vaddq_s16(v252, v253);
+    int16x8_t v255_tmp = vqrdmulhq_n_s16(v254, 13573);
+    int16x8_t v255 = vaddq_s16(v255_tmp, v254);
+    int16x8_t v256 = vaddq_s16(v207, v163);
+    int16x8_t v257 = vaddq_s16(v164, v187);
+    int16x8_t v258 = vaddq_s16(v256, v257);
+    int16x8_t v259 = vaddq_s16(v191, v166);
+    int16x8_t v260 = vaddq_s16(v167, v195);
+    int16x8_t v261 = vaddq_s16(v259, v260);
+    int16x8_t v262 = vaddq_s16(v258, v261);
+    int16x8_t v263 = vaddq_s16(v255, v262);
+    int16x8_t v264 = vaddq_s16(v261, v254);
+    int16x8_t v265_tmp = vqrdmulhq_n_s16(v264, 13573);
+    int16x8_t v265 = vaddq_s16(v265_tmp, v264);
+    int16x8_t v266 = vaddq_s16(v223, v173);
+    int16x8_t v267 = vaddq_s16(v174, v203);
+    int16x8_t v268 = vaddq_s16(v266, v267);
+    int16x8_t v269 = vaddq_s16(v268, v258);
+    int16x8_t v270 = vaddq_s16(v269, v264);
+    int16x8_t v271 = vaddq_s16(v265, v270);
+    int16x8_t v272 = vqrdmulhq_n_s16(v271, 17734);
+    int16x8_t v273 = vaddq_s16(v263, v272);
+    int16x8_t v274 = vqrdmulhq_n_s16(v273, 16705);
+    int16x8_t v275 = vaddq_s16(v251, v274);
+    int16x8_t v276 = vaddq_s16(v253, v235);
+    int16x8_t v277_tmp = vqrdmulhq_n_s16(v276, 13573);
+    int16x8_t v277 = vaddq_s16(v277_tmp, v276);
+    int16x8_t v278 = vaddq_s16(v257, v237);
+    int16x8_t v279 = vaddq_s16(v238, v259);
+    int16x8_t v280 = vaddq_s16(v278, v279);
+    int16x8_t v281 = vaddq_s16(v277, v280);
+    int16x8_t v282 = vaddq_s16(v260, v241);
+    int16x8_t v283 = vaddq_s16(v242, v252);
+    int16x8_t v284 = vaddq_s16(v282, v283);
+    int16x8_t v285_tmp = vqrdmulhq_n_s16(v284, 13573);
+    int16x8_t v285 = vaddq_s16(v285_tmp, v284);
+    int16x8_t v286 = vaddq_s16(v267, v245);
+    int16x8_t v287 = vaddq_s16(v246, v256);
+    int16x8_t v288 = vaddq_s16(v286, v287);
+    int16x8_t v289 = vaddq_s16(v288, v284);
+    int16x8_t v290 = vaddq_s16(v285, v289);
+    int16x8_t v291 = vqrdmulhq_n_s16(v290, 17734);
+    int16x8_t v292 = vaddq_s16(v281, v291);
+    int16x8_t v293 = vaddq_s16(v283, v276);
+    int16x8_t v294_tmp = vqrdmulhq_n_s16(v293, 13573);
+    int16x8_t v294 = vaddq_s16(v294_tmp, v293);
+    int16x8_t v295 = vaddq_s16(v287, v278);
+    int16x8_t v296 = vaddq_s16(v279, v282);
+    int16x8_t v297 = vaddq_s16(v295, v296);
+    int16x8_t v298 = vaddq_s16(v294, v297);
+    int16x8_t v299 = vaddq_s16(v296, v293);
+    int16x8_t v300_tmp = vqrdmulhq_n_s16(v299, 13573);
+    int16x8_t v300 = vaddq_s16(v300_tmp, v299);
+    int16x8_t v301 = vld1q_s16(in + in_stride * 252 + i);
+    int16x8_t v302 = vaddq_s16(v301, v222);
+    int16x8_t v303 = vaddq_s16(v302, v266);
+    int16x8_t v304 = vaddq_s16(v303, v286);
+    int16x8_t v305 = vaddq_s16(v304, v295);
+    int16x8_t v306 = vaddq_s16(v305, v299);
+    int16x8_t v307 = vaddq_s16(v300, v306);
+    int16x8_t v308 = vqrdmulhq_n_s16(v307, 17734);
+    int16x8_t v309 = vaddq_s16(v298, v308);
+    int16x8_t v310 = vqrdmulhq_n_s16(v309, 16705);
+    int16x8_t v311 = vaddq_s16(v292, v310);
+    int16x8_t v312 = vqrdmulhq_n_s16(v311, 16463);
+    int16x8_t v313 = vaddq_s16(v275, v312);
+    int16x8_t v314 = vqrdmulhq_n_s16(v313, 16404);
+    int16x8_t v315 = vaddq_s16(v234, v314);
+    int16x8_t v316 = vqrdmulhq_n_s16(v315, 16389);
+    int16x8_t v317 = vaddq_s16(v141, v316);
+    int16x8_t v318 = vld1q_s16(in + in_stride * 2 + i);
+    int16x8_t v319_tmp = vqrdmulhq_n_s16(v318, 13573);
+    int16x8_t v319 = vaddq_s16(v319_tmp, v318);
+    int16x8_t v320 = vld1q_s16(in + in_stride * 130 + i);
+    int16x8_t v321 = vld1q_s16(in + in_stride * 126 + i);
+    int16x8_t v322 = vaddq_s16(v320, v321);
+    int16x8_t v323 = vaddq_s16(v319, v322);
+    int16x8_t v324 = vld1q_s16(in + in_stride * 66 + i);
+    int16x8_t v325 = vld1q_s16(in + in_stride * 62 + i);
+    int16x8_t v326 = vaddq_s16(v324, v325);
+    int16x8_t v327_tmp = vqrdmulhq_n_s16(v326, 13573);
+    int16x8_t v327 = vaddq_s16(v327_tmp, v326);
+    int16x8_t v328 = vld1q_s16(in + in_stride * 194 + i);
+    int16x8_t v329 = vld1q_s16(in + in_stride * 190 + i);
+    int16x8_t v330 = vaddq_s16(v328, v329);
+    int16x8_t v331 = vaddq_s16(v330, v326);
+    int16x8_t v332 = vaddq_s16(v327, v331);
+    int16x8_t v333 = vqrdmulhq_n_s16(v332, 17734);
+    int16x8_t v334 = vaddq_s16(v323, v333);
+    int16x8_t v335 = vld1q_s16(in + in_stride * 34 + i);
+    int16x8_t v336 = vld1q_s16(in + in_stride * 30 + i);
+    int16x8_t v337 = vaddq_s16(v335, v336);
+    int16x8_t v338_tmp = vqrdmulhq_n_s16(v337, 13573);
+    int16x8_t v338 = vaddq_s16(v338_tmp, v337);
+    int16x8_t v339 = vld1q_s16(in + in_stride * 162 + i);
+    int16x8_t v340 = vld1q_s16(in + in_stride * 158 + i);
+    int16x8_t v341 = vaddq_s16(v339, v340);
+    int16x8_t v342 = vld1q_s16(in + in_stride * 98 + i);
+    int16x8_t v343 = vld1q_s16(in + in_stride * 94 + i);
+    int16x8_t v344 = vaddq_s16(v342, v343);
+    int16x8_t v345 = vaddq_s16(v341, v344);
+    int16x8_t v346 = vaddq_s16(v338, v345);
+    int16x8_t v347 = vaddq_s16(v344, v337);
+    int16x8_t v348_tmp = vqrdmulhq_n_s16(v347, 13573);
+    int16x8_t v348 = vaddq_s16(v348_tmp, v347);
+    int16x8_t v349 = vld1q_s16(in + in_stride * 226 + i);
+    int16x8_t v350 = vld1q_s16(in + in_stride * 222 + i);
+    int16x8_t v351 = vaddq_s16(v349, v350);
+    int16x8_t v352 = vaddq_s16(v351, v341);
+    int16x8_t v353 = vaddq_s16(v352, v347);
+    int16x8_t v354 = vaddq_s16(v348, v353);
+    int16x8_t v355 = vqrdmulhq_n_s16(v354, 17734);
+    int16x8_t v356 = vaddq_s16(v346, v355);
+    int16x8_t v357 = vqrdmulhq_n_s16(v356, 16705);
+    int16x8_t v358 = vaddq_s16(v334, v357);
+    int16x8_t v359 = vld1q_s16(in + in_stride * 18 + i);
+    int16x8_t v360 = vld1q_s16(in + in_stride * 14 + i);
+    int16x8_t v361 = vaddq_s16(v359, v360);
+    int16x8_t v362_tmp = vqrdmulhq_n_s16(v361, 13573);
+    int16x8_t v362 = vaddq_s16(v362_tmp, v361);
+    int16x8_t v363 = vld1q_s16(in + in_stride * 146 + i);
+    int16x8_t v364 = vld1q_s16(in + in_stride * 142 + i);
+    int16x8_t v365 = vaddq_s16(v363, v364);
+    int16x8_t v366 = vld1q_s16(in + in_stride * 114 + i);
+    int16x8_t v367 = vld1q_s16(in + in_stride * 110 + i);
+    int16x8_t v368 = vaddq_s16(v366, v367);
+    int16x8_t v369 = vaddq_s16(v365, v368);
+    int16x8_t v370 = vaddq_s16(v362, v369);
+    int16x8_t v371 = vld1q_s16(in + in_stride * 82 + i);
+    int16x8_t v372 = vld1q_s16(in + in_stride * 78 + i);
+    int16x8_t v373 = vaddq_s16(v371, v372);
+    int16x8_t v374 = vld1q_s16(in + in_stride * 50 + i);
+    int16x8_t v375 = vld1q_s16(in + in_stride * 46 + i);
+    int16x8_t v376 = vaddq_s16(v374, v375);
+    int16x8_t v377 = vaddq_s16(v373, v376);
+    int16x8_t v378_tmp = vqrdmulhq_n_s16(v377, 13573);
+    int16x8_t v378 = vaddq_s16(v378_tmp, v377);
+    int16x8_t v379 = vld1q_s16(in + in_stride * 210 + i);
+    int16x8_t v380 = vld1q_s16(in + in_stride * 206 + i);
+    int16x8_t v381 = vaddq_s16(v379, v380);
+    int16x8_t v382 = vld1q_s16(in + in_stride * 178 + i);
+    int16x8_t v383 = vld1q_s16(in + in_stride * 174 + i);
+    int16x8_t v384 = vaddq_s16(v382, v383);
+    int16x8_t v385 = vaddq_s16(v381, v384);
+    int16x8_t v386 = vaddq_s16(v385, v377);
+    int16x8_t v387 = vaddq_s16(v378, v386);
+    int16x8_t v388 = vqrdmulhq_n_s16(v387, 17734);
+    int16x8_t v389 = vaddq_s16(v370, v388);
+    int16x8_t v390 = vaddq_s16(v376, v361);
+    int16x8_t v391_tmp = vqrdmulhq_n_s16(v390, 13573);
+    int16x8_t v391 = vaddq_s16(v391_tmp, v390);
+    int16x8_t v392 = vaddq_s16(v384, v365);
+    int16x8_t v393 = vaddq_s16(v368, v373);
+    int16x8_t v394 = vaddq_s16(v392, v393);
+    int16x8_t v395 = vaddq_s16(v391, v394);
+    int16x8_t v396 = vaddq_s16(v393, v390);
+    int16x8_t v397_tmp = vqrdmulhq_n_s16(v396, 13573);
+    int16x8_t v397 = vaddq_s16(v397_tmp, v396);
+    int16x8_t v398 = vld1q_s16(in + in_stride * 242 + i);
+    int16x8_t v399 = vld1q_s16(in + in_stride * 238 + i);
+    int16x8_t v400 = vaddq_s16(v398, v399);
+    int16x8_t v401 = vaddq_s16(v400, v381);
+    int16x8_t v402 = vaddq_s16(v401, v392);
+    int16x8_t v403 = vaddq_s16(v402, v396);
+    int16x8_t v404 = vaddq_s16(v397, v403);
+    int16x8_t v405 = vqrdmulhq_n_s16(v404, 17734);
+    int16x8_t v406 = vaddq_s16(v395, v405);
+    int16x8_t v407 = vqrdmulhq_n_s16(v406, 16705);
+    int16x8_t v408 = vaddq_s16(v389, v407);
+    int16x8_t v409 = vqrdmulhq_n_s16(v408, 16463);
+    int16x8_t v410 = vaddq_s16(v358, v409);
+    int16x8_t v411 = vld1q_s16(in + in_stride * 10 + i);
+    int16x8_t v412 = vld1q_s16(in + in_stride * 6 + i);
+    int16x8_t v413 = vaddq_s16(v411, v412);
+    int16x8_t v414_tmp = vqrdmulhq_n_s16(v413, 13573);
+    int16x8_t v414 = vaddq_s16(v414_tmp, v413);
+    int16x8_t v415 = vld1q_s16(in + in_stride * 138 + i);
+    int16x8_t v416 = vld1q_s16(in + in_stride * 134 + i);
+    int16x8_t v417 = vaddq_s16(v415, v416);
+    int16x8_t v418 = vld1q_s16(in + in_stride * 122 + i);
+    int16x8_t v419 = vld1q_s16(in + in_stride * 118 + i);
+    int16x8_t v420 = vaddq_s16(v418, v419);
+    int16x8_t v421 = vaddq_s16(v417, v420);
+    int16x8_t v422 = vaddq_s16(v414, v421);
+    int16x8_t v423 = vld1q_s16(in + in_stride * 74 + i);
+    int16x8_t v424 = vld1q_s16(in + in_stride * 70 + i);
+    int16x8_t v425 = vaddq_s16(v423, v424);
+    int16x8_t v426 = vld1q_s16(in + in_stride * 58 + i);
+    int16x8_t v427 = vld1q_s16(in + in_stride * 54 + i);
+    int16x8_t v428 = vaddq_s16(v426, v427);
+    int16x8_t v429 = vaddq_s16(v425, v428);
+    int16x8_t v430_tmp = vqrdmulhq_n_s16(v429, 13573);
+    int16x8_t v430 = vaddq_s16(v430_tmp, v429);
+    int16x8_t v431 = vld1q_s16(in + in_stride * 202 + i);
+    int16x8_t v432 = vld1q_s16(in + in_stride * 198 + i);
+    int16x8_t v433 = vaddq_s16(v431, v432);
+    int16x8_t v434 = vld1q_s16(in + in_stride * 186 + i);
+    int16x8_t v435 = vld1q_s16(in + in_stride * 182 + i);
+    int16x8_t v436 = vaddq_s16(v434, v435);
+    int16x8_t v437 = vaddq_s16(v433, v436);
+    int16x8_t v438 = vaddq_s16(v437, v429);
+    int16x8_t v439 = vaddq_s16(v430, v438);
+    int16x8_t v440 = vqrdmulhq_n_s16(v439, 17734);
+    int16x8_t v441 = vaddq_s16(v422, v440);
+    int16x8_t v442 = vld1q_s16(in + in_stride * 42 + i);
+    int16x8_t v443 = vld1q_s16(in + in_stride * 38 + i);
+    int16x8_t v444 = vaddq_s16(v442, v443);
+    int16x8_t v445 = vld1q_s16(in + in_stride * 26 + i);
+    int16x8_t v446 = vld1q_s16(in + in_stride * 22 + i);
+    int16x8_t v447 = vaddq_s16(v445, v446);
+    int16x8_t v448 = vaddq_s16(v444, v447);
+    int16x8_t v449_tmp = vqrdmulhq_n_s16(v448, 13573);
+    int16x8_t v449 = vaddq_s16(v449_tmp, v448);
+    int16x8_t v450 = vld1q_s16(in + in_stride * 170 + i);
+    int16x8_t v451 = vld1q_s16(in + in_stride * 166 + i);
+    int16x8_t v452 = vaddq_s16(v450, v451);
+    int16x8_t v453 = vld1q_s16(in + in_stride * 154 + i);
+    int16x8_t v454 = vld1q_s16(in + in_stride * 150 + i);
+    int16x8_t v455 = vaddq_s16(v453, v454);
+    int16x8_t v456 = vaddq_s16(v452, v455);
+    int16x8_t v457 = vld1q_s16(in + in_stride * 106 + i);
+    int16x8_t v458 = vld1q_s16(in + in_stride * 102 + i);
+    int16x8_t v459 = vaddq_s16(v457, v458);
+    int16x8_t v460 = vld1q_s16(in + in_stride * 90 + i);
+    int16x8_t v461 = vld1q_s16(in + in_stride * 86 + i);
+    int16x8_t v462 = vaddq_s16(v460, v461);
+    int16x8_t v463 = vaddq_s16(v459, v462);
+    int16x8_t v464 = vaddq_s16(v456, v463);
+    int16x8_t v465 = vaddq_s16(v449, v464);
+    int16x8_t v466 = vaddq_s16(v463, v448);
+    int16x8_t v467_tmp = vqrdmulhq_n_s16(v466, 13573);
+    int16x8_t v467 = vaddq_s16(v467_tmp, v466);
+    int16x8_t v468 = vld1q_s16(in + in_stride * 234 + i);
+    int16x8_t v469 = vld1q_s16(in + in_stride * 230 + i);
+    int16x8_t v470 = vaddq_s16(v468, v469);
+    int16x8_t v471 = vld1q_s16(in + in_stride * 218 + i);
+    int16x8_t v472 = vld1q_s16(in + in_stride * 214 + i);
+    int16x8_t v473 = vaddq_s16(v471, v472);
+    int16x8_t v474 = vaddq_s16(v470, v473);
+    int16x8_t v475 = vaddq_s16(v474, v456);
+    int16x8_t v476 = vaddq_s16(v475, v466);
+    int16x8_t v477 = vaddq_s16(v467, v476);
+    int16x8_t v478 = vqrdmulhq_n_s16(v477, 17734);
+    int16x8_t v479 = vaddq_s16(v465, v478);
+    int16x8_t v480 = vqrdmulhq_n_s16(v479, 16705);
+    int16x8_t v481 = vaddq_s16(v441, v480);
+    int16x8_t v482 = vaddq_s16(v447, v413);
+    int16x8_t v483_tmp = vqrdmulhq_n_s16(v482, 13573);
+    int16x8_t v483 = vaddq_s16(v483_tmp, v482);
+    int16x8_t v484 = vaddq_s16(v455, v417);
+    int16x8_t v485 = vaddq_s16(v420, v459);
+    int16x8_t v486 = vaddq_s16(v484, v485);
+    int16x8_t v487 = vaddq_s16(v483, v486);
+    int16x8_t v488 = vaddq_s16(v462, v425);
+    int16x8_t v489 = vaddq_s16(v428, v444);
+    int16x8_t v490 = vaddq_s16(v488, v489);
+    int16x8_t v491_tmp = vqrdmulhq_n_s16(v490, 13573);
+    int16x8_t v491 = vaddq_s16(v491_tmp, v490);
+    int16x8_t v492 = vaddq_s16(v473, v433);
+    int16x8_t v493 = vaddq_s16(v436, v452);
+    int16x8_t v494 = vaddq_s16(v492, v493);
+    int16x8_t v495 = vaddq_s16(v494, v490);
+    int16x8_t v496 = vaddq_s16(v491, v495);
+    int16x8_t v497 = vqrdmulhq_n_s16(v496, 17734);
+    int16x8_t v498 = vaddq_s16(v487, v497);
+    int16x8_t v499 = vaddq_s16(v489, v482);
+    int16x8_t v500_tmp = vqrdmulhq_n_s16(v499, 13573);
+    int16x8_t v500 = vaddq_s16(v500_tmp, v499);
+    int16x8_t v501 = vaddq_s16(v493, v484);
+    int16x8_t v502 = vaddq_s16(v485, v488);
+    int16x8_t v503 = vaddq_s16(v501, v502);
+    int16x8_t v504 = vaddq_s16(v500, v503);
+    int16x8_t v505 = vaddq_s16(v502, v499);
+    int16x8_t v506_tmp = vqrdmulhq_n_s16(v505, 13573);
+    int16x8_t v506 = vaddq_s16(v506_tmp, v505);
+    int16x8_t v507 = vld1q_s16(in + in_stride * 250 + i);
+    int16x8_t v508 = vld1q_s16(in + in_stride * 246 + i);
+    int16x8_t v509 = vaddq_s16(v507, v508);
+    int16x8_t v510 = vaddq_s16(v509, v470);
+    int16x8_t v511 = vaddq_s16(v510, v492);
+    int16x8_t v512 = vaddq_s16(v511, v501);
+    int16x8_t v513 = vaddq_s16(v512, v505);
+    int16x8_t v514 = vaddq_s16(v506, v513);
+    int16x8_t v515 = vqrdmulhq_n_s16(v514, 17734);
+    int16x8_t v516 = vaddq_s16(v504, v515);
+    int16x8_t v517 = vqrdmulhq_n_s16(v516, 16705);
+    int16x8_t v518 = vaddq_s16(v498, v517);
+    int16x8_t v519 = vqrdmulhq_n_s16(v518, 16463);
+    int16x8_t v520 = vaddq_s16(v481, v519);
+    int16x8_t v521 = vqrdmulhq_n_s16(v520, 16404);
+    int16x8_t v522 = vaddq_s16(v410, v521);
+    int16x8_t v523 = vaddq_s16(v412, v318);
+    int16x8_t v524_tmp = vqrdmulhq_n_s16(v523, 13573);
+    int16x8_t v524 = vaddq_s16(v524_tmp, v523);
+    int16x8_t v525 = vaddq_s16(v416, v320);
+    int16x8_t v526 = vaddq_s16(v321, v418);
+    int16x8_t v527 = vaddq_s16(v525, v526);
+    int16x8_t v528 = vaddq_s16(v524, v527);
+    int16x8_t v529 = vaddq_s16(v424, v324);
+    int16x8_t v530 = vaddq_s16(v325, v426);
+    int16x8_t v531 = vaddq_s16(v529, v530);
+    int16x8_t v532_tmp = vqrdmulhq_n_s16(v531, 13573);
+    int16x8_t v532 = vaddq_s16(v532_tmp, v531);
+    int16x8_t v533 = vaddq_s16(v432, v328);
+    int16x8_t v534 = vaddq_s16(v329, v434);
+    int16x8_t v535 = vaddq_s16(v533, v534);
+    int16x8_t v536 = vaddq_s16(v535, v531);
+    int16x8_t v537 = vaddq_s16(v532, v536);
+    int16x8_t v538 = vqrdmulhq_n_s16(v537, 17734);
+    int16x8_t v539 = vaddq_s16(v528, v538);
+    int16x8_t v540 = vaddq_s16(v443, v335);
+    int16x8_t v541 = vaddq_s16(v336, v445);
+    int16x8_t v542 = vaddq_s16(v540, v541);
+    int16x8_t v543_tmp = vqrdmulhq_n_s16(v542, 13573);
+    int16x8_t v543 = vaddq_s16(v543_tmp, v542);
+    int16x8_t v544 = vaddq_s16(v451, v339);
+    int16x8_t v545 = vaddq_s16(v340, v453);
+    int16x8_t v546 = vaddq_s16(v544, v545);
+    int16x8_t v547 = vaddq_s16(v458, v342);
+    int16x8_t v548 = vaddq_s16(v343, v460);
+    int16x8_t v549 = vaddq_s16(v547, v548);
+    int16x8_t v550 = vaddq_s16(v546, v549);
+    int16x8_t v551 = vaddq_s16(v543, v550);
+    int16x8_t v552 = vaddq_s16(v549, v542);
+    int16x8_t v553_tmp = vqrdmulhq_n_s16(v552, 13573);
+    int16x8_t v553 = vaddq_s16(v553_tmp, v552);
+    int16x8_t v554 = vaddq_s16(v469, v349);
+    int16x8_t v555 = vaddq_s16(v350, v471);
+    int16x8_t v556 = vaddq_s16(v554, v555);
+    int16x8_t v557 = vaddq_s16(v556, v546);
+    int16x8_t v558 = vaddq_s16(v557, v552);
+    int16x8_t v559 = vaddq_s16(v553, v558);
+    int16x8_t v560 = vqrdmulhq_n_s16(v559, 17734);
+    int16x8_t v561 = vaddq_s16(v551, v560);
+    int16x8_t v562 = vqrdmulhq_n_s16(v561, 16705);
+    int16x8_t v563 = vaddq_s16(v539, v562);
+    int16x8_t v564 = vaddq_s16(v446, v359);
+    int16x8_t v565 = vaddq_s16(v360, v411);
+    int16x8_t v566 = vaddq_s16(v564, v565);
+    int16x8_t v567_tmp = vqrdmulhq_n_s16(v566, 13573);
+    int16x8_t v567 = vaddq_s16(v567_tmp, v566);
+    int16x8_t v568 = vaddq_s16(v454, v363);
+    int16x8_t v569 = vaddq_s16(v364, v415);
+    int16x8_t v570 = vaddq_s16(v568, v569);
+    int16x8_t v571 = vaddq_s16(v419, v366);
+    int16x8_t v572 = vaddq_s16(v367, v457);
+    int16x8_t v573 = vaddq_s16(v571, v572);
+    int16x8_t v574 = vaddq_s16(v570, v573);
+    int16x8_t v575 = vaddq_s16(v567, v574);
+    int16x8_t v576 = vaddq_s16(v461, v371);
+    int16x8_t v577 = vaddq_s16(v372, v423);
+    int16x8_t v578 = vaddq_s16(v576, v577);
+    int16x8_t v579 = vaddq_s16(v427, v374);
+    int16x8_t v580 = vaddq_s16(v375, v442);
+    int16x8_t v581 = vaddq_s16(v579, v580);
+    int16x8_t v582 = vaddq_s16(v578, v581);
+    int16x8_t v583_tmp = vqrdmulhq_n_s16(v582, 13573);
+    int16x8_t v583 = vaddq_s16(v583_tmp, v582);
+    int16x8_t v584 = vaddq_s16(v472, v379);
+    int16x8_t v585 = vaddq_s16(v380, v431);
+    int16x8_t v586 = vaddq_s16(v584, v585);
+    int16x8_t v587 = vaddq_s16(v435, v382);
+    int16x8_t v588 = vaddq_s16(v383, v450);
+    int16x8_t v589 = vaddq_s16(v587, v588);
+    int16x8_t v590 = vaddq_s16(v586, v589);
+    int16x8_t v591 = vaddq_s16(v590, v582);
+    int16x8_t v592 = vaddq_s16(v583, v591);
+    int16x8_t v593 = vqrdmulhq_n_s16(v592, 17734);
+    int16x8_t v594 = vaddq_s16(v575, v593);
+    int16x8_t v595 = vaddq_s16(v581, v566);
+    int16x8_t v596_tmp = vqrdmulhq_n_s16(v595, 13573);
+    int16x8_t v596 = vaddq_s16(v596_tmp, v595);
+    int16x8_t v597 = vaddq_s16(v589, v570);
+    int16x8_t v598 = vaddq_s16(v573, v578);
+    int16x8_t v599 = vaddq_s16(v597, v598);
+    int16x8_t v600 = vaddq_s16(v596, v599);
+    int16x8_t v601 = vaddq_s16(v598, v595);
+    int16x8_t v602_tmp = vqrdmulhq_n_s16(v601, 13573);
+    int16x8_t v602 = vaddq_s16(v602_tmp, v601);
+    int16x8_t v603 = vaddq_s16(v508, v398);
+    int16x8_t v604 = vaddq_s16(v399, v468);
+    int16x8_t v605 = vaddq_s16(v603, v604);
+    int16x8_t v606 = vaddq_s16(v605, v586);
+    int16x8_t v607 = vaddq_s16(v606, v597);
+    int16x8_t v608 = vaddq_s16(v607, v601);
+    int16x8_t v609 = vaddq_s16(v602, v608);
+    int16x8_t v610 = vqrdmulhq_n_s16(v609, 17734);
+    int16x8_t v611 = vaddq_s16(v600, v610);
+    int16x8_t v612 = vqrdmulhq_n_s16(v611, 16705);
+    int16x8_t v613 = vaddq_s16(v594, v612);
+    int16x8_t v614 = vqrdmulhq_n_s16(v613, 16463);
+    int16x8_t v615 = vaddq_s16(v563, v614);
+    int16x8_t v616 = vaddq_s16(v565, v523);
+    int16x8_t v617_tmp = vqrdmulhq_n_s16(v616, 13573);
+    int16x8_t v617 = vaddq_s16(v617_tmp, v616);
+    int16x8_t v618 = vaddq_s16(v569, v525);
+    int16x8_t v619 = vaddq_s16(v526, v571);
+    int16x8_t v620 = vaddq_s16(v618, v619);
+    int16x8_t v621 = vaddq_s16(v617, v620);
+    int16x8_t v622 = vaddq_s16(v577, v529);
+    int16x8_t v623 = vaddq_s16(v530, v579);
+    int16x8_t v624 = vaddq_s16(v622, v623);
+    int16x8_t v625_tmp = vqrdmulhq_n_s16(v624, 13573);
+    int16x8_t v625 = vaddq_s16(v625_tmp, v624);
+    int16x8_t v626 = vaddq_s16(v585, v533);
+    int16x8_t v627 = vaddq_s16(v534, v587);
+    int16x8_t v628 = vaddq_s16(v626, v627);
+    int16x8_t v629 = vaddq_s16(v628, v624);
+    int16x8_t v630 = vaddq_s16(v625, v629);
+    int16x8_t v631 = vqrdmulhq_n_s16(v630, 17734);
+    int16x8_t v632 = vaddq_s16(v621, v631);
+    int16x8_t v633 = vaddq_s16(v580, v540);
+    int16x8_t v634 = vaddq_s16(v541, v564);
+    int16x8_t v635 = vaddq_s16(v633, v634);
+    int16x8_t v636_tmp = vqrdmulhq_n_s16(v635, 13573);
+    int16x8_t v636 = vaddq_s16(v636_tmp, v635);
+    int16x8_t v637 = vaddq_s16(v588, v544);
+    int16x8_t v638 = vaddq_s16(v545, v568);
+    int16x8_t v639 = vaddq_s16(v637, v638);
+    int16x8_t v640 = vaddq_s16(v572, v547);
+    int16x8_t v641 = vaddq_s16(v548, v576);
+    int16x8_t v642 = vaddq_s16(v640, v641);
+    int16x8_t v643 = vaddq_s16(v639, v642);
+    int16x8_t v644 = vaddq_s16(v636, v643);
+    int16x8_t v645 = vaddq_s16(v642, v635);
+    int16x8_t v646_tmp = vqrdmulhq_n_s16(v645, 13573);
+    int16x8_t v646 = vaddq_s16(v646_tmp, v645);
+    int16x8_t v647 = vaddq_s16(v604, v554);
+    int16x8_t v648 = vaddq_s16(v555, v584);
+    int16x8_t v649 = vaddq_s16(v647, v648);
+    int16x8_t v650 = vaddq_s16(v649, v639);
+    int16x8_t v651 = vaddq_s16(v650, v645);
+    int16x8_t v652 = vaddq_s16(v646, v651);
+    int16x8_t v653 = vqrdmulhq_n_s16(v652, 17734);
+    int16x8_t v654 = vaddq_s16(v644, v653);
+    int16x8_t v655 = vqrdmulhq_n_s16(v654, 16705);
+    int16x8_t v656 = vaddq_s16(v632, v655);
+    int16x8_t v657 = vaddq_s16(v634, v616);
+    int16x8_t v658_tmp = vqrdmulhq_n_s16(v657, 13573);
+    int16x8_t v658 = vaddq_s16(v658_tmp, v657);
+    int16x8_t v659 = vaddq_s16(v638, v618);
+    int16x8_t v660 = vaddq_s16(v619, v640);
+    int16x8_t v661 = vaddq_s16(v659, v660);
+    int16x8_t v662 = vaddq_s16(v658, v661);
+    int16x8_t v663 = vaddq_s16(v641, v622);
+    int16x8_t v664 = vaddq_s16(v623, v633);
+    int16x8_t v665 = vaddq_s16(v663, v664);
+    int16x8_t v666_tmp = vqrdmulhq_n_s16(v665, 13573);
+    int16x8_t v666 = vaddq_s16(v666_tmp, v665);
+    int16x8_t v667 = vaddq_s16(v648, v626);
+    int16x8_t v668 = vaddq_s16(v627, v637);
+    int16x8_t v669 = vaddq_s16(v667, v668);
+    int16x8_t v670 = vaddq_s16(v669, v665);
+    int16x8_t v671 = vaddq_s16(v666, v670);
+    int16x8_t v672 = vqrdmulhq_n_s16(v671, 17734);
+    int16x8_t v673 = vaddq_s16(v662, v672);
+    int16x8_t v674 = vaddq_s16(v664, v657);
+    int16x8_t v675_tmp = vqrdmulhq_n_s16(v674, 13573);
+    int16x8_t v675 = vaddq_s16(v675_tmp, v674);
+    int16x8_t v676 = vaddq_s16(v668, v659);
+    int16x8_t v677 = vaddq_s16(v660, v663);
+    int16x8_t v678 = vaddq_s16(v676, v677);
+    int16x8_t v679 = vaddq_s16(v675, v678);
+    int16x8_t v680 = vaddq_s16(v677, v674);
+    int16x8_t v681_tmp = vqrdmulhq_n_s16(v680, 13573);
+    int16x8_t v681 = vaddq_s16(v681_tmp, v680);
+    int16x8_t v682 = vld1q_s16(in + in_stride * 254 + i);
+    int16x8_t v683 = vaddq_s16(v682, v507);
+    int16x8_t v684 = vaddq_s16(v683, v603);
+    int16x8_t v685 = vaddq_s16(v684, v647);
+    int16x8_t v686 = vaddq_s16(v685, v667);
+    int16x8_t v687 = vaddq_s16(v686, v676);
+    int16x8_t v688 = vaddq_s16(v687, v680);
+    int16x8_t v689 = vaddq_s16(v681, v688);
+    int16x8_t v690 = vqrdmulhq_n_s16(v689, 17734);
+    int16x8_t v691 = vaddq_s16(v679, v690);
+    int16x8_t v692 = vqrdmulhq_n_s16(v691, 16705);
+    int16x8_t v693 = vaddq_s16(v673, v692);
+    int16x8_t v694 = vqrdmulhq_n_s16(v693, 16463);
+    int16x8_t v695 = vaddq_s16(v656, v694);
+    int16x8_t v696 = vqrdmulhq_n_s16(v695, 16404);
+    int16x8_t v697 = vaddq_s16(v615, v696);
+    int16x8_t v698 = vqrdmulhq_n_s16(v697, 16389);
+    int16x8_t v699 = vaddq_s16(v522, v698);
+    int16x8_t v700 = vqrdmulhq_n_s16(v699, 16385);
+    int16x8_t v701 = vaddq_s16(v317, v700);
+    int16x8_t v702 = vld1q_s16(in + in_stride * 1 + i);
+    int16x8_t v703_tmp = vqrdmulhq_n_s16(v702, 13573);
+    int16x8_t v703 = vaddq_s16(v703_tmp, v702);
+    int16x8_t v704 = vld1q_s16(in + in_stride * 129 + i);
+    int16x8_t v705 = vld1q_s16(in + in_stride * 127 + i);
+    int16x8_t v706 = vaddq_s16(v704, v705);
+    int16x8_t v707 = vaddq_s16(v703, v706);
+    int16x8_t v708 = vld1q_s16(in + in_stride * 65 + i);
+    int16x8_t v709 = vld1q_s16(in + in_stride * 63 + i);
+    int16x8_t v710 = vaddq_s16(v708, v709);
+    int16x8_t v711_tmp = vqrdmulhq_n_s16(v710, 13573);
+    int16x8_t v711 = vaddq_s16(v711_tmp, v710);
+    int16x8_t v712 = vld1q_s16(in + in_stride * 193 + i);
+    int16x8_t v713 = vld1q_s16(in + in_stride * 191 + i);
+    int16x8_t v714 = vaddq_s16(v712, v713);
+    int16x8_t v715 = vaddq_s16(v714, v710);
+    int16x8_t v716 = vaddq_s16(v711, v715);
+    int16x8_t v717 = vqrdmulhq_n_s16(v716, 17734);
+    int16x8_t v718 = vaddq_s16(v707, v717);
+    int16x8_t v719 = vld1q_s16(in + in_stride * 33 + i);
+    int16x8_t v720 = vld1q_s16(in + in_stride * 31 + i);
+    int16x8_t v721 = vaddq_s16(v719, v720);
+    int16x8_t v722_tmp = vqrdmulhq_n_s16(v721, 13573);
+    int16x8_t v722 = vaddq_s16(v722_tmp, v721);
+    int16x8_t v723 = vld1q_s16(in + in_stride * 161 + i);
+    int16x8_t v724 = vld1q_s16(in + in_stride * 159 + i);
+    int16x8_t v725 = vaddq_s16(v723, v724);
+    int16x8_t v726 = vld1q_s16(in + in_stride * 97 + i);
+    int16x8_t v727 = vld1q_s16(in + in_stride * 95 + i);
+    int16x8_t v728 = vaddq_s16(v726, v727);
+    int16x8_t v729 = vaddq_s16(v725, v728);
+    int16x8_t v730 = vaddq_s16(v722, v729);
+    int16x8_t v731 = vaddq_s16(v728, v721);
+    int16x8_t v732_tmp = vqrdmulhq_n_s16(v731, 13573);
+    int16x8_t v732 = vaddq_s16(v732_tmp, v731);
+    int16x8_t v733 = vld1q_s16(in + in_stride * 225 + i);
+    int16x8_t v734 = vld1q_s16(in + in_stride * 223 + i);
+    int16x8_t v735 = vaddq_s16(v733, v734);
+    int16x8_t v736 = vaddq_s16(v735, v725);
+    int16x8_t v737 = vaddq_s16(v736, v731);
+    int16x8_t v738 = vaddq_s16(v732, v737);
+    int16x8_t v739 = vqrdmulhq_n_s16(v738, 17734);
+    int16x8_t v740 = vaddq_s16(v730, v739);
+    int16x8_t v741 = vqrdmulhq_n_s16(v740, 16705);
+    int16x8_t v742 = vaddq_s16(v718, v741);
+    int16x8_t v743 = vld1q_s16(in + in_stride * 17 + i);
+    int16x8_t v744 = vld1q_s16(in + in_stride * 15 + i);
+    int16x8_t v745 = vaddq_s16(v743, v744);
+    int16x8_t v746_tmp = vqrdmulhq_n_s16(v745, 13573);
+    int16x8_t v746 = vaddq_s16(v746_tmp, v745);
+    int16x8_t v747 = vld1q_s16(in + in_stride * 145 + i);
+    int16x8_t v748 = vld1q_s16(in + in_stride * 143 + i);
+    int16x8_t v749 = vaddq_s16(v747, v748);
+    int16x8_t v750 = vld1q_s16(in + in_stride * 113 + i);
+    int16x8_t v751 = vld1q_s16(in + in_stride * 111 + i);
+    int16x8_t v752 = vaddq_s16(v750, v751);
+    int16x8_t v753 = vaddq_s16(v749, v752);
+    int16x8_t v754 = vaddq_s16(v746, v753);
+    int16x8_t v755 = vld1q_s16(in + in_stride * 81 + i);
+    int16x8_t v756 = vld1q_s16(in + in_stride * 79 + i);
+    int16x8_t v757 = vaddq_s16(v755, v756);
+    int16x8_t v758 = vld1q_s16(in + in_stride * 49 + i);
+    int16x8_t v759 = vld1q_s16(in + in_stride * 47 + i);
+    int16x8_t v760 = vaddq_s16(v758, v759);
+    int16x8_t v761 = vaddq_s16(v757, v760);
+    int16x8_t v762_tmp = vqrdmulhq_n_s16(v761, 13573);
+    int16x8_t v762 = vaddq_s16(v762_tmp, v761);
+    int16x8_t v763 = vld1q_s16(in + in_stride * 209 + i);
+    int16x8_t v764 = vld1q_s16(in + in_stride * 207 + i);
+    int16x8_t v765 = vaddq_s16(v763, v764);
+    int16x8_t v766 = vld1q_s16(in + in_stride * 177 + i);
+    int16x8_t v767 = vld1q_s16(in + in_stride * 175 + i);
+    int16x8_t v768 = vaddq_s16(v766, v767);
+    int16x8_t v769 = vaddq_s16(v765, v768);
+    int16x8_t v770 = vaddq_s16(v769, v761);
+    int16x8_t v771 = vaddq_s16(v762, v770);
+    int16x8_t v772 = vqrdmulhq_n_s16(v771, 17734);
+    int16x8_t v773 = vaddq_s16(v754, v772);
+    int16x8_t v774 = vaddq_s16(v760, v745);
+    int16x8_t v775_tmp = vqrdmulhq_n_s16(v774, 13573);
+    int16x8_t v775 = vaddq_s16(v775_tmp, v774);
+    int16x8_t v776 = vaddq_s16(v768, v749);
+    int16x8_t v777 = vaddq_s16(v752, v757);
+    int16x8_t v778 = vaddq_s16(v776, v777);
+    int16x8_t v779 = vaddq_s16(v775, v778);
+    int16x8_t v780 = vaddq_s16(v777, v774);
+    int16x8_t v781_tmp = vqrdmulhq_n_s16(v780, 13573);
+    int16x8_t v781 = vaddq_s16(v781_tmp, v780);
+    int16x8_t v782 = vld1q_s16(in + in_stride * 241 + i);
+    int16x8_t v783 = vld1q_s16(in + in_stride * 239 + i);
+    int16x8_t v784 = vaddq_s16(v782, v783);
+    int16x8_t v785 = vaddq_s16(v784, v765);
+    int16x8_t v786 = vaddq_s16(v785, v776);
+    int16x8_t v787 = vaddq_s16(v786, v780);
+    int16x8_t v788 = vaddq_s16(v781, v787);
+    int16x8_t v789 = vqrdmulhq_n_s16(v788, 17734);
+    int16x8_t v790 = vaddq_s16(v779, v789);
+    int16x8_t v791 = vqrdmulhq_n_s16(v790, 16705);
+    int16x8_t v792 = vaddq_s16(v773, v791);
+    int16x8_t v793 = vqrdmulhq_n_s16(v792, 16463);
+    int16x8_t v794 = vaddq_s16(v742, v793);
+    int16x8_t v795 = vld1q_s16(in + in_stride * 9 + i);
+    int16x8_t v796 = vld1q_s16(in + in_stride * 7 + i);
+    int16x8_t v797 = vaddq_s16(v795, v796);
+    int16x8_t v798_tmp = vqrdmulhq_n_s16(v797, 13573);
+    int16x8_t v798 = vaddq_s16(v798_tmp, v797);
+    int16x8_t v799 = vld1q_s16(in + in_stride * 137 + i);
+    int16x8_t v800 = vld1q_s16(in + in_stride * 135 + i);
+    int16x8_t v801 = vaddq_s16(v799, v800);
+    int16x8_t v802 = vld1q_s16(in + in_stride * 121 + i);
+    int16x8_t v803 = vld1q_s16(in + in_stride * 119 + i);
+    int16x8_t v804 = vaddq_s16(v802, v803);
+    int16x8_t v805 = vaddq_s16(v801, v804);
+    int16x8_t v806 = vaddq_s16(v798, v805);
+    int16x8_t v807 = vld1q_s16(in + in_stride * 73 + i);
+    int16x8_t v808 = vld1q_s16(in + in_stride * 71 + i);
+    int16x8_t v809 = vaddq_s16(v807, v808);
+    int16x8_t v810 = vld1q_s16(in + in_stride * 57 + i);
+    int16x8_t v811 = vld1q_s16(in + in_stride * 55 + i);
+    int16x8_t v812 = vaddq_s16(v810, v811);
+    int16x8_t v813 = vaddq_s16(v809, v812);
+    int16x8_t v814_tmp = vqrdmulhq_n_s16(v813, 13573);
+    int16x8_t v814 = vaddq_s16(v814_tmp, v813);
+    int16x8_t v815 = vld1q_s16(in + in_stride * 201 + i);
+    int16x8_t v816 = vld1q_s16(in + in_stride * 199 + i);
+    int16x8_t v817 = vaddq_s16(v815, v816);
+    int16x8_t v818 = vld1q_s16(in + in_stride * 185 + i);
+    int16x8_t v819 = vld1q_s16(in + in_stride * 183 + i);
+    int16x8_t v820 = vaddq_s16(v818, v819);
+    int16x8_t v821 = vaddq_s16(v817, v820);
+    int16x8_t v822 = vaddq_s16(v821, v813);
+    int16x8_t v823 = vaddq_s16(v814, v822);
+    int16x8_t v824 = vqrdmulhq_n_s16(v823, 17734);
+    int16x8_t v825 = vaddq_s16(v806, v824);
+    int16x8_t v826 = vld1q_s16(in + in_stride * 41 + i);
+    int16x8_t v827 = vld1q_s16(in + in_stride * 39 + i);
+    int16x8_t v828 = vaddq_s16(v826, v827);
+    int16x8_t v829 = vld1q_s16(in + in_stride * 25 + i);
+    int16x8_t v830 = vld1q_s16(in + in_stride * 23 + i);
+    int16x8_t v831 = vaddq_s16(v829, v830);
+    int16x8_t v832 = vaddq_s16(v828, v831);
+    int16x8_t v833_tmp = vqrdmulhq_n_s16(v832, 13573);
+    int16x8_t v833 = vaddq_s16(v833_tmp, v832);
+    int16x8_t v834 = vld1q_s16(in + in_stride * 169 + i);
+    int16x8_t v835 = vld1q_s16(in + in_stride * 167 + i);
+    int16x8_t v836 = vaddq_s16(v834, v835);
+    int16x8_t v837 = vld1q_s16(in + in_stride * 153 + i);
+    int16x8_t v838 = vld1q_s16(in + in_stride * 151 + i);
+    int16x8_t v839 = vaddq_s16(v837, v838);
+    int16x8_t v840 = vaddq_s16(v836, v839);
+    int16x8_t v841 = vld1q_s16(in + in_stride * 105 + i);
+    int16x8_t v842 = vld1q_s16(in + in_stride * 103 + i);
+    int16x8_t v843 = vaddq_s16(v841, v842);
+    int16x8_t v844 = vld1q_s16(in + in_stride * 89 + i);
+    int16x8_t v845 = vld1q_s16(in + in_stride * 87 + i);
+    int16x8_t v846 = vaddq_s16(v844, v845);
+    int16x8_t v847 = vaddq_s16(v843, v846);
+    int16x8_t v848 = vaddq_s16(v840, v847);
+    int16x8_t v849 = vaddq_s16(v833, v848);
+    int16x8_t v850 = vaddq_s16(v847, v832);
+    int16x8_t v851_tmp = vqrdmulhq_n_s16(v850, 13573);
+    int16x8_t v851 = vaddq_s16(v851_tmp, v850);
+    int16x8_t v852 = vld1q_s16(in + in_stride * 233 + i);
+    int16x8_t v853 = vld1q_s16(in + in_stride * 231 + i);
+    int16x8_t v854 = vaddq_s16(v852, v853);
+    int16x8_t v855 = vld1q_s16(in + in_stride * 217 + i);
+    int16x8_t v856 = vld1q_s16(in + in_stride * 215 + i);
+    int16x8_t v857 = vaddq_s16(v855, v856);
+    int16x8_t v858 = vaddq_s16(v854, v857);
+    int16x8_t v859 = vaddq_s16(v858, v840);
+    int16x8_t v860 = vaddq_s16(v859, v850);
+    int16x8_t v861 = vaddq_s16(v851, v860);
+    int16x8_t v862 = vqrdmulhq_n_s16(v861, 17734);
+    int16x8_t v863 = vaddq_s16(v849, v862);
+    int16x8_t v864 = vqrdmulhq_n_s16(v863, 16705);
+    int16x8_t v865 = vaddq_s16(v825, v864);
+    int16x8_t v866 = vaddq_s16(v831, v797);
+    int16x8_t v867_tmp = vqrdmulhq_n_s16(v866, 13573);
+    int16x8_t v867 = vaddq_s16(v867_tmp, v866);
+    int16x8_t v868 = vaddq_s16(v839, v801);
+    int16x8_t v869 = vaddq_s16(v804, v843);
+    int16x8_t v870 = vaddq_s16(v868, v869);
+    int16x8_t v871 = vaddq_s16(v867, v870);
+    int16x8_t v872 = vaddq_s16(v846, v809);
+    int16x8_t v873 = vaddq_s16(v812, v828);
+    int16x8_t v874 = vaddq_s16(v872, v873);
+    int16x8_t v875_tmp = vqrdmulhq_n_s16(v874, 13573);
+    int16x8_t v875 = vaddq_s16(v875_tmp, v874);
+    int16x8_t v876 = vaddq_s16(v857, v817);
+    int16x8_t v877 = vaddq_s16(v820, v836);
+    int16x8_t v878 = vaddq_s16(v876, v877);
+    int16x8_t v879 = vaddq_s16(v878, v874);
+    int16x8_t v880 = vaddq_s16(v875, v879);
+    int16x8_t v881 = vqrdmulhq_n_s16(v880, 17734);
+    int16x8_t v882 = vaddq_s16(v871, v881);
+    int16x8_t v883 = vaddq_s16(v873, v866);
+    int16x8_t v884_tmp = vqrdmulhq_n_s16(v883, 13573);
+    int16x8_t v884 = vaddq_s16(v884_tmp, v883);
+    int16x8_t v885 = vaddq_s16(v877, v868);
+    int16x8_t v886 = vaddq_s16(v869, v872);
+    int16x8_t v887 = vaddq_s16(v885, v886);
+    int16x8_t v888 = vaddq_s16(v884, v887);
+    int16x8_t v889 = vaddq_s16(v886, v883);
+    int16x8_t v890_tmp = vqrdmulhq_n_s16(v889, 13573);
+    int16x8_t v890 = vaddq_s16(v890_tmp, v889);
+    int16x8_t v891 = vld1q_s16(in + in_stride * 249 + i);
+    int16x8_t v892 = vld1q_s16(in + in_stride * 247 + i);
+    int16x8_t v893 = vaddq_s16(v891, v892);
+    int16x8_t v894 = vaddq_s16(v893, v854);
+    int16x8_t v895 = vaddq_s16(v894, v876);
+    int16x8_t v896 = vaddq_s16(v895, v885);
+    int16x8_t v897 = vaddq_s16(v896, v889);
+    int16x8_t v898 = vaddq_s16(v890, v897);
+    int16x8_t v899 = vqrdmulhq_n_s16(v898, 17734);
+    int16x8_t v900 = vaddq_s16(v888, v899);
+    int16x8_t v901 = vqrdmulhq_n_s16(v900, 16705);
+    int16x8_t v902 = vaddq_s16(v882, v901);
+    int16x8_t v903 = vqrdmulhq_n_s16(v902, 16463);
+    int16x8_t v904 = vaddq_s16(v865, v903);
+    int16x8_t v905 = vqrdmulhq_n_s16(v904, 16404);
+    int16x8_t v906 = vaddq_s16(v794, v905);
+    int16x8_t v907 = vld1q_s16(in + in_stride * 5 + i);
+    int16x8_t v908 = vld1q_s16(in + in_stride * 3 + i);
+    int16x8_t v909 = vaddq_s16(v907, v908);
+    int16x8_t v910_tmp = vqrdmulhq_n_s16(v909, 13573);
+    int16x8_t v910 = vaddq_s16(v910_tmp, v909);
+    int16x8_t v911 = vld1q_s16(in + in_stride * 133 + i);
+    int16x8_t v912 = vld1q_s16(in + in_stride * 131 + i);
+    int16x8_t v913 = vaddq_s16(v911, v912);
+    int16x8_t v914 = vld1q_s16(in + in_stride * 125 + i);
+    int16x8_t v915 = vld1q_s16(in + in_stride * 123 + i);
+    int16x8_t v916 = vaddq_s16(v914, v915);
+    int16x8_t v917 = vaddq_s16(v913, v916);
+    int16x8_t v918 = vaddq_s16(v910, v917);
+    int16x8_t v919 = vld1q_s16(in + in_stride * 69 + i);
+    int16x8_t v920 = vld1q_s16(in + in_stride * 67 + i);
+    int16x8_t v921 = vaddq_s16(v919, v920);
+    int16x8_t v922 = vld1q_s16(in + in_stride * 61 + i);
+    int16x8_t v923 = vld1q_s16(in + in_stride * 59 + i);
+    int16x8_t v924 = vaddq_s16(v922, v923);
+    int16x8_t v925 = vaddq_s16(v921, v924);
+    int16x8_t v926_tmp = vqrdmulhq_n_s16(v925, 13573);
+    int16x8_t v926 = vaddq_s16(v926_tmp, v925);
+    int16x8_t v927 = vld1q_s16(in + in_stride * 197 + i);
+    int16x8_t v928 = vld1q_s16(in + in_stride * 195 + i);
+    int16x8_t v929 = vaddq_s16(v927, v928);
+    int16x8_t v930 = vld1q_s16(in + in_stride * 189 + i);
+    int16x8_t v931 = vld1q_s16(in + in_stride * 187 + i);
+    int16x8_t v932 = vaddq_s16(v930, v931);
+    int16x8_t v933 = vaddq_s16(v929, v932);
+    int16x8_t v934 = vaddq_s16(v933, v925);
+    int16x8_t v935 = vaddq_s16(v926, v934);
+    int16x8_t v936 = vqrdmulhq_n_s16(v935, 17734);
+    int16x8_t v937 = vaddq_s16(v918, v936);
+    int16x8_t v938 = vld1q_s16(in + in_stride * 37 + i);
+    int16x8_t v939 = vld1q_s16(in + in_stride * 35 + i);
+    int16x8_t v940 = vaddq_s16(v938, v939);
+    int16x8_t v941 = vld1q_s16(in + in_stride * 29 + i);
+    int16x8_t v942 = vld1q_s16(in + in_stride * 27 + i);
+    int16x8_t v943 = vaddq_s16(v941, v942);
+    int16x8_t v944 = vaddq_s16(v940, v943);
+    int16x8_t v945_tmp = vqrdmulhq_n_s16(v944, 13573);
+    int16x8_t v945 = vaddq_s16(v945_tmp, v944);
+    int16x8_t v946 = vld1q_s16(in + in_stride * 165 + i);
+    int16x8_t v947 = vld1q_s16(in + in_stride * 163 + i);
+    int16x8_t v948 = vaddq_s16(v946, v947);
+    int16x8_t v949 = vld1q_s16(in + in_stride * 157 + i);
+    int16x8_t v950 = vld1q_s16(in + in_stride * 155 + i);
+    int16x8_t v951 = vaddq_s16(v949, v950);
+    int16x8_t v952 = vaddq_s16(v948, v951);
+    int16x8_t v953 = vld1q_s16(in + in_stride * 101 + i);
+    int16x8_t v954 = vld1q_s16(in + in_stride * 99 + i);
+    int16x8_t v955 = vaddq_s16(v953, v954);
+    int16x8_t v956 = vld1q_s16(in + in_stride * 93 + i);
+    int16x8_t v957 = vld1q_s16(in + in_stride * 91 + i);
+    int16x8_t v958 = vaddq_s16(v956, v957);
+    int16x8_t v959 = vaddq_s16(v955, v958);
+    int16x8_t v960 = vaddq_s16(v952, v959);
+    int16x8_t v961 = vaddq_s16(v945, v960);
+    int16x8_t v962 = vaddq_s16(v959, v944);
+    int16x8_t v963_tmp = vqrdmulhq_n_s16(v962, 13573);
+    int16x8_t v963 = vaddq_s16(v963_tmp, v962);
+    int16x8_t v964 = vld1q_s16(in + in_stride * 229 + i);
+    int16x8_t v965 = vld1q_s16(in + in_stride * 227 + i);
+    int16x8_t v966 = vaddq_s16(v964, v965);
+    int16x8_t v967 = vld1q_s16(in + in_stride * 221 + i);
+    int16x8_t v968 = vld1q_s16(in + in_stride * 219 + i);
+    int16x8_t v969 = vaddq_s16(v967, v968);
+    int16x8_t v970 = vaddq_s16(v966, v969);
+    int16x8_t v971 = vaddq_s16(v970, v952);
+    int16x8_t v972 = vaddq_s16(v971, v962);
+    int16x8_t v973 = vaddq_s16(v963, v972);
+    int16x8_t v974 = vqrdmulhq_n_s16(v973, 17734);
+    int16x8_t v975 = vaddq_s16(v961, v974);
+    int16x8_t v976 = vqrdmulhq_n_s16(v975, 16705);
+    int16x8_t v977 = vaddq_s16(v937, v976);
+    int16x8_t v978 = vld1q_s16(in + in_stride * 21 + i);
+    int16x8_t v979 = vld1q_s16(in + in_stride * 19 + i);
+    int16x8_t v980 = vaddq_s16(v978, v979);
+    int16x8_t v981 = vld1q_s16(in + in_stride * 13 + i);
+    int16x8_t v982 = vld1q_s16(in + in_stride * 11 + i);
+    int16x8_t v983 = vaddq_s16(v981, v982);
+    int16x8_t v984 = vaddq_s16(v980, v983);
+    int16x8_t v985_tmp = vqrdmulhq_n_s16(v984, 13573);
+    int16x8_t v985 = vaddq_s16(v985_tmp, v984);
+    int16x8_t v986 = vld1q_s16(in + in_stride * 149 + i);
+    int16x8_t v987 = vld1q_s16(in + in_stride * 147 + i);
+    int16x8_t v988 = vaddq_s16(v986, v987);
+    int16x8_t v989 = vld1q_s16(in + in_stride * 141 + i);
+    int16x8_t v990 = vld1q_s16(in + in_stride * 139 + i);
+    int16x8_t v991 = vaddq_s16(v989, v990);
+    int16x8_t v992 = vaddq_s16(v988, v991);
+    int16x8_t v993 = vld1q_s16(in + in_stride * 117 + i);
+    int16x8_t v994 = vld1q_s16(in + in_stride * 115 + i);
+    int16x8_t v995 = vaddq_s16(v993, v994);
+    int16x8_t v996 = vld1q_s16(in + in_stride * 109 + i);
+    int16x8_t v997 = vld1q_s16(in + in_stride * 107 + i);
+    int16x8_t v998 = vaddq_s16(v996, v997);
+    int16x8_t v999 = vaddq_s16(v995, v998);
+    int16x8_t v1000 = vaddq_s16(v992, v999);
+    int16x8_t v1001 = vaddq_s16(v985, v1000);
+    int16x8_t v1002 = vld1q_s16(in + in_stride * 85 + i);
+    int16x8_t v1003 = vld1q_s16(in + in_stride * 83 + i);
+    int16x8_t v1004 = vaddq_s16(v1002, v1003);
+    int16x8_t v1005 = vld1q_s16(in + in_stride * 77 + i);
+    int16x8_t v1006 = vld1q_s16(in + in_stride * 75 + i);
+    int16x8_t v1007 = vaddq_s16(v1005, v1006);
+    int16x8_t v1008 = vaddq_s16(v1004, v1007);
+    int16x8_t v1009 = vld1q_s16(in + in_stride * 53 + i);
+    int16x8_t v1010 = vld1q_s16(in + in_stride * 51 + i);
+    int16x8_t v1011 = vaddq_s16(v1009, v1010);
+    int16x8_t v1012 = vld1q_s16(in + in_stride * 45 + i);
+    int16x8_t v1013 = vld1q_s16(in + in_stride * 43 + i);
+    int16x8_t v1014 = vaddq_s16(v1012, v1013);
+    int16x8_t v1015 = vaddq_s16(v1011, v1014);
+    int16x8_t v1016 = vaddq_s16(v1008, v1015);
+    int16x8_t v1017_tmp = vqrdmulhq_n_s16(v1016, 13573);
+    int16x8_t v1017 = vaddq_s16(v1017_tmp, v1016);
+    int16x8_t v1018 = vld1q_s16(in + in_stride * 213 + i);
+    int16x8_t v1019 = vld1q_s16(in + in_stride * 211 + i);
+    int16x8_t v1020 = vaddq_s16(v1018, v1019);
+    int16x8_t v1021 = vld1q_s16(in + in_stride * 205 + i);
+    int16x8_t v1022 = vld1q_s16(in + in_stride * 203 + i);
+    int16x8_t v1023 = vaddq_s16(v1021, v1022);
+    int16x8_t v1024 = vaddq_s16(v1020, v1023);
+    int16x8_t v1025 = vld1q_s16(in + in_stride * 181 + i);
+    int16x8_t v1026 = vld1q_s16(in + in_stride * 179 + i);
+    int16x8_t v1027 = vaddq_s16(v1025, v1026);
+    int16x8_t v1028 = vld1q_s16(in + in_stride * 173 + i);
+    int16x8_t v1029 = vld1q_s16(in + in_stride * 171 + i);
+    int16x8_t v1030 = vaddq_s16(v1028, v1029);
+    int16x8_t v1031 = vaddq_s16(v1027, v1030);
+    int16x8_t v1032 = vaddq_s16(v1024, v1031);
+    int16x8_t v1033 = vaddq_s16(v1032, v1016);
+    int16x8_t v1034 = vaddq_s16(v1017, v1033);
+    int16x8_t v1035 = vqrdmulhq_n_s16(v1034, 17734);
+    int16x8_t v1036 = vaddq_s16(v1001, v1035);
+    int16x8_t v1037 = vaddq_s16(v1015, v984);
+    int16x8_t v1038_tmp = vqrdmulhq_n_s16(v1037, 13573);
+    int16x8_t v1038 = vaddq_s16(v1038_tmp, v1037);
+    int16x8_t v1039 = vaddq_s16(v1031, v992);
+    int16x8_t v1040 = vaddq_s16(v999, v1008);
+    int16x8_t v1041 = vaddq_s16(v1039, v1040);
+    int16x8_t v1042 = vaddq_s16(v1038, v1041);
+    int16x8_t v1043 = vaddq_s16(v1040, v1037);
+    int16x8_t v1044_tmp = vqrdmulhq_n_s16(v1043, 13573);
+    int16x8_t v1044 = vaddq_s16(v1044_tmp, v1043);
+    int16x8_t v1045 = vld1q_s16(in + in_stride * 245 + i);
+    int16x8_t v1046 = vld1q_s16(in + in_stride * 243 + i);
+    int16x8_t v1047 = vaddq_s16(v1045, v1046);
+    int16x8_t v1048 = vld1q_s16(in + in_stride * 237 + i);
+    int16x8_t v1049 = vld1q_s16(in + in_stride * 235 + i);
+    int16x8_t v1050 = vaddq_s16(v1048, v1049);
+    int16x8_t v1051 = vaddq_s16(v1047, v1050);
+    int16x8_t v1052 = vaddq_s16(v1051, v1024);
+    int16x8_t v1053 = vaddq_s16(v1052, v1039);
+    int16x8_t v1054 = vaddq_s16(v1053, v1043);
+    int16x8_t v1055 = vaddq_s16(v1044, v1054);
+    int16x8_t v1056 = vqrdmulhq_n_s16(v1055, 17734);
+    int16x8_t v1057 = vaddq_s16(v1042, v1056);
+    int16x8_t v1058 = vqrdmulhq_n_s16(v1057, 16705);
+    int16x8_t v1059 = vaddq_s16(v1036, v1058);
+    int16x8_t v1060 = vqrdmulhq_n_s16(v1059, 16463);
+    int16x8_t v1061 = vaddq_s16(v977, v1060);
+    int16x8_t v1062 = vaddq_s16(v983, v909);
+    int16x8_t v1063_tmp = vqrdmulhq_n_s16(v1062, 13573);
+    int16x8_t v1063 = vaddq_s16(v1063_tmp, v1062);
+    int16x8_t v1064 = vaddq_s16(v991, v913);
+    int16x8_t v1065 = vaddq_s16(v916, v995);
+    int16x8_t v1066 = vaddq_s16(v1064, v1065);
+    int16x8_t v1067 = vaddq_s16(v1063, v1066);
+    int16x8_t v1068 = vaddq_s16(v1007, v921);
+    int16x8_t v1069 = vaddq_s16(v924, v1011);
+    int16x8_t v1070 = vaddq_s16(v1068, v1069);
+    int16x8_t v1071_tmp = vqrdmulhq_n_s16(v1070, 13573);
+    int16x8_t v1071 = vaddq_s16(v1071_tmp, v1070);
+    int16x8_t v1072 = vaddq_s16(v1023, v929);
+    int16x8_t v1073 = vaddq_s16(v932, v1027);
+    int16x8_t v1074 = vaddq_s16(v1072, v1073);
+    int16x8_t v1075 = vaddq_s16(v1074, v1070);
+    int16x8_t v1076 = vaddq_s16(v1071, v1075);
+    int16x8_t v1077 = vqrdmulhq_n_s16(v1076, 17734);
+    int16x8_t v1078 = vaddq_s16(v1067, v1077);
+    int16x8_t v1079 = vaddq_s16(v1014, v940);
+    int16x8_t v1080 = vaddq_s16(v943, v980);
+    int16x8_t v1081 = vaddq_s16(v1079, v1080);
+    int16x8_t v1082_tmp = vqrdmulhq_n_s16(v1081, 13573);
+    int16x8_t v1082 = vaddq_s16(v1082_tmp, v1081);
+    int16x8_t v1083 = vaddq_s16(v1030, v948);
+    int16x8_t v1084 = vaddq_s16(v951, v988);
+    int16x8_t v1085 = vaddq_s16(v1083, v1084);
+    int16x8_t v1086 = vaddq_s16(v998, v955);
+    int16x8_t v1087 = vaddq_s16(v958, v1004);
+    int16x8_t v1088 = vaddq_s16(v1086, v1087);
+    int16x8_t v1089 = vaddq_s16(v1085, v1088);
+    int16x8_t v1090 = vaddq_s16(v1082, v1089);
+    int16x8_t v1091 = vaddq_s16(v1088, v1081);
+    int16x8_t v1092_tmp = vqrdmulhq_n_s16(v1091, 13573);
+    int16x8_t v1092 = vaddq_s16(v1092_tmp, v1091);
+    int16x8_t v1093 = vaddq_s16(v1050, v966);
+    int16x8_t v1094 = vaddq_s16(v969, v1020);
+    int16x8_t v1095 = vaddq_s16(v1093, v1094);
+    int16x8_t v1096 = vaddq_s16(v1095, v1085);
+    int16x8_t v1097 = vaddq_s16(v1096, v1091);
+    int16x8_t v1098 = vaddq_s16(v1092, v1097);
+    int16x8_t v1099 = vqrdmulhq_n_s16(v1098, 17734);
+    int16x8_t v1100 = vaddq_s16(v1090, v1099);
+    int16x8_t v1101 = vqrdmulhq_n_s16(v1100, 16705);
+    int16x8_t v1102 = vaddq_s16(v1078, v1101);
+    int16x8_t v1103 = vaddq_s16(v1080, v1062);
+    int16x8_t v1104_tmp = vqrdmulhq_n_s16(v1103, 13573);
+    int16x8_t v1104 = vaddq_s16(v1104_tmp, v1103);
+    int16x8_t v1105 = vaddq_s16(v1084, v1064);
+    int16x8_t v1106 = vaddq_s16(v1065, v1086);
+    int16x8_t v1107 = vaddq_s16(v1105, v1106);
+    int16x8_t v1108 = vaddq_s16(v1104, v1107);
+    int16x8_t v1109 = vaddq_s16(v1087, v1068);
+    int16x8_t v1110 = vaddq_s16(v1069, v1079);
+    int16x8_t v1111 = vaddq_s16(v1109, v1110);
+    int16x8_t v1112_tmp = vqrdmulhq_n_s16(v1111, 13573);
+    int16x8_t v1112 = vaddq_s16(v1112_tmp, v1111);
+    int16x8_t v1113 = vaddq_s16(v1094, v1072);
+    int16x8_t v1114 = vaddq_s16(v1073, v1083);
+    int16x8_t v1115 = vaddq_s16(v1113, v1114);
+    int16x8_t v1116 = vaddq_s16(v1115, v1111);
+    int16x8_t v1117 = vaddq_s16(v1112, v1116);
+    int16x8_t v1118 = vqrdmulhq_n_s16(v1117, 17734);
+    int16x8_t v1119 = vaddq_s16(v1108, v1118);
+    int16x8_t v1120 = vaddq_s16(v1110, v1103);
+    int16x8_t v1121_tmp = vqrdmulhq_n_s16(v1120, 13573);
+    int16x8_t v1121 = vaddq_s16(v1121_tmp, v1120);
+    int16x8_t v1122 = vaddq_s16(v1114, v1105);
+    int16x8_t v1123 = vaddq_s16(v1106, v1109);
+    int16x8_t v1124 = vaddq_s16(v1122, v1123);
+    int16x8_t v1125 = vaddq_s16(v1121, v1124);
+    int16x8_t v1126 = vaddq_s16(v1123, v1120);
+    int16x8_t v1127_tmp = vqrdmulhq_n_s16(v1126, 13573);
+    int16x8_t v1127 = vaddq_s16(v1127_tmp, v1126);
+    int16x8_t v1128 = vld1q_s16(in + in_stride * 253 + i);
+    int16x8_t v1129 = vld1q_s16(in + in_stride * 251 + i);
+    int16x8_t v1130 = vaddq_s16(v1128, v1129);
+    int16x8_t v1131 = vaddq_s16(v1130, v1047);
+    int16x8_t v1132 = vaddq_s16(v1131, v1093);
+    int16x8_t v1133 = vaddq_s16(v1132, v1113);
+    int16x8_t v1134 = vaddq_s16(v1133, v1122);
+    int16x8_t v1135 = vaddq_s16(v1134, v1126);
+    int16x8_t v1136 = vaddq_s16(v1127, v1135);
+    int16x8_t v1137 = vqrdmulhq_n_s16(v1136, 17734);
+    int16x8_t v1138 = vaddq_s16(v1125, v1137);
+    int16x8_t v1139 = vqrdmulhq_n_s16(v1138, 16705);
+    int16x8_t v1140 = vaddq_s16(v1119, v1139);
+    int16x8_t v1141 = vqrdmulhq_n_s16(v1140, 16463);
+    int16x8_t v1142 = vaddq_s16(v1102, v1141);
+    int16x8_t v1143 = vqrdmulhq_n_s16(v1142, 16404);
+    int16x8_t v1144 = vaddq_s16(v1061, v1143);
+    int16x8_t v1145 = vqrdmulhq_n_s16(v1144, 16389);
+    int16x8_t v1146 = vaddq_s16(v906, v1145);
+    int16x8_t v1147 = vaddq_s16(v908, v702);
+    int16x8_t v1148_tmp = vqrdmulhq_n_s16(v1147, 13573);
+    int16x8_t v1148 = vaddq_s16(v1148_tmp, v1147);
+    int16x8_t v1149 = vaddq_s16(v912, v704);
+    int16x8_t v1150 = vaddq_s16(v705, v914);
+    int16x8_t v1151 = vaddq_s16(v1149, v1150);
+    int16x8_t v1152 = vaddq_s16(v1148, v1151);
+    int16x8_t v1153 = vaddq_s16(v920, v708);
+    int16x8_t v1154 = vaddq_s16(v709, v922);
+    int16x8_t v1155 = vaddq_s16(v1153, v1154);
+    int16x8_t v1156_tmp = vqrdmulhq_n_s16(v1155, 13573);
+    int16x8_t v1156 = vaddq_s16(v1156_tmp, v1155);
+    int16x8_t v1157 = vaddq_s16(v928, v712);
+    int16x8_t v1158 = vaddq_s16(v713, v930);
+    int16x8_t v1159 = vaddq_s16(v1157, v1158);
+    int16x8_t v1160 = vaddq_s16(v1159, v1155);
+    int16x8_t v1161 = vaddq_s16(v1156, v1160);
+    int16x8_t v1162 = vqrdmulhq_n_s16(v1161, 17734);
+    int16x8_t v1163 = vaddq_s16(v1152, v1162);
+    int16x8_t v1164 = vaddq_s16(v939, v719);
+    int16x8_t v1165 = vaddq_s16(v720, v941);
+    int16x8_t v1166 = vaddq_s16(v1164, v1165);
+    int16x8_t v1167_tmp = vqrdmulhq_n_s16(v1166, 13573);
+    int16x8_t v1167 = vaddq_s16(v1167_tmp, v1166);
+    int16x8_t v1168 = vaddq_s16(v947, v723);
+    int16x8_t v1169 = vaddq_s16(v724, v949);
+    int16x8_t v1170 = vaddq_s16(v1168, v1169);
+    int16x8_t v1171 = vaddq_s16(v954, v726);
+    int16x8_t v1172 = vaddq_s16(v727, v956);
+    int16x8_t v1173 = vaddq_s16(v1171, v1172);
+    int16x8_t v1174 = vaddq_s16(v1170, v1173);
+    int16x8_t v1175 = vaddq_s16(v1167, v1174);
+    int16x8_t v1176 = vaddq_s16(v1173, v1166);
+    int16x8_t v1177_tmp = vqrdmulhq_n_s16(v1176, 13573);
+    int16x8_t v1177 = vaddq_s16(v1177_tmp, v1176);
+    int16x8_t v1178 = vaddq_s16(v965, v733);
+    int16x8_t v1179 = vaddq_s16(v734, v967);
+    int16x8_t v1180 = vaddq_s16(v1178, v1179);
+    int16x8_t v1181 = vaddq_s16(v1180, v1170);
+    int16x8_t v1182 = vaddq_s16(v1181, v1176);
+    int16x8_t v1183 = vaddq_s16(v1177, v1182);
+    int16x8_t v1184 = vqrdmulhq_n_s16(v1183, 17734);
+    int16x8_t v1185 = vaddq_s16(v1175, v1184);
+    int16x8_t v1186 = vqrdmulhq_n_s16(v1185, 16705);
+    int16x8_t v1187 = vaddq_s16(v1163, v1186);
+    int16x8_t v1188 = vaddq_s16(v979, v743);
+    int16x8_t v1189 = vaddq_s16(v744, v981);
+    int16x8_t v1190 = vaddq_s16(v1188, v1189);
+    int16x8_t v1191_tmp = vqrdmulhq_n_s16(v1190, 13573);
+    int16x8_t v1191 = vaddq_s16(v1191_tmp, v1190);
+    int16x8_t v1192 = vaddq_s16(v987, v747);
+    int16x8_t v1193 = vaddq_s16(v748, v989);
+    int16x8_t v1194 = vaddq_s16(v1192, v1193);
+    int16x8_t v1195 = vaddq_s16(v994, v750);
+    int16x8_t v1196 = vaddq_s16(v751, v996);
+    int16x8_t v1197 = vaddq_s16(v1195, v1196);
+    int16x8_t v1198 = vaddq_s16(v1194, v1197);
+    int16x8_t v1199 = vaddq_s16(v1191, v1198);
+    int16x8_t v1200 = vaddq_s16(v1003, v755);
+    int16x8_t v1201 = vaddq_s16(v756, v1005);
+    int16x8_t v1202 = vaddq_s16(v1200, v1201);
+    int16x8_t v1203 = vaddq_s16(v1010, v758);
+    int16x8_t v1204 = vaddq_s16(v759, v1012);
+    int16x8_t v1205 = vaddq_s16(v1203, v1204);
+    int16x8_t v1206 = vaddq_s16(v1202, v1205);
+    int16x8_t v1207_tmp = vqrdmulhq_n_s16(v1206, 13573);
+    int16x8_t v1207 = vaddq_s16(v1207_tmp, v1206);
+    int16x8_t v1208 = vaddq_s16(v1019, v763);
+    int16x8_t v1209 = vaddq_s16(v764, v1021);
+    int16x8_t v1210 = vaddq_s16(v1208, v1209);
+    int16x8_t v1211 = vaddq_s16(v1026, v766);
+    int16x8_t v1212 = vaddq_s16(v767, v1028);
+    int16x8_t v1213 = vaddq_s16(v1211, v1212);
+    int16x8_t v1214 = vaddq_s16(v1210, v1213);
+    int16x8_t v1215 = vaddq_s16(v1214, v1206);
+    int16x8_t v1216 = vaddq_s16(v1207, v1215);
+    int16x8_t v1217 = vqrdmulhq_n_s16(v1216, 17734);
+    int16x8_t v1218 = vaddq_s16(v1199, v1217);
+    int16x8_t v1219 = vaddq_s16(v1205, v1190);
+    int16x8_t v1220_tmp = vqrdmulhq_n_s16(v1219, 13573);
+    int16x8_t v1220 = vaddq_s16(v1220_tmp, v1219);
+    int16x8_t v1221 = vaddq_s16(v1213, v1194);
+    int16x8_t v1222 = vaddq_s16(v1197, v1202);
+    int16x8_t v1223 = vaddq_s16(v1221, v1222);
+    int16x8_t v1224 = vaddq_s16(v1220, v1223);
+    int16x8_t v1225 = vaddq_s16(v1222, v1219);
+    int16x8_t v1226_tmp = vqrdmulhq_n_s16(v1225, 13573);
+    int16x8_t v1226 = vaddq_s16(v1226_tmp, v1225);
+    int16x8_t v1227 = vaddq_s16(v1046, v782);
+    int16x8_t v1228 = vaddq_s16(v783, v1048);
+    int16x8_t v1229 = vaddq_s16(v1227, v1228);
+    int16x8_t v1230 = vaddq_s16(v1229, v1210);
+    int16x8_t v1231 = vaddq_s16(v1230, v1221);
+    int16x8_t v1232 = vaddq_s16(v1231, v1225);
+    int16x8_t v1233 = vaddq_s16(v1226, v1232);
+    int16x8_t v1234 = vqrdmulhq_n_s16(v1233, 17734);
+    int16x8_t v1235 = vaddq_s16(v1224, v1234);
+    int16x8_t v1236 = vqrdmulhq_n_s16(v1235, 16705);
+    int16x8_t v1237 = vaddq_s16(v1218, v1236);
+    int16x8_t v1238 = vqrdmulhq_n_s16(v1237, 16463);
+    int16x8_t v1239 = vaddq_s16(v1187, v1238);
+    int16x8_t v1240 = vaddq_s16(v982, v795);
+    int16x8_t v1241 = vaddq_s16(v796, v907);
+    int16x8_t v1242 = vaddq_s16(v1240, v1241);
+    int16x8_t v1243_tmp = vqrdmulhq_n_s16(v1242, 13573);
+    int16x8_t v1243 = vaddq_s16(v1243_tmp, v1242);
+    int16x8_t v1244 = vaddq_s16(v990, v799);
+    int16x8_t v1245 = vaddq_s16(v800, v911);
+    int16x8_t v1246 = vaddq_s16(v1244, v1245);
+    int16x8_t v1247 = vaddq_s16(v915, v802);
+    int16x8_t v1248 = vaddq_s16(v803, v993);
+    int16x8_t v1249 = vaddq_s16(v1247, v1248);
+    int16x8_t v1250 = vaddq_s16(v1246, v1249);
+    int16x8_t v1251 = vaddq_s16(v1243, v1250);
+    int16x8_t v1252 = vaddq_s16(v1006, v807);
+    int16x8_t v1253 = vaddq_s16(v808, v919);
+    int16x8_t v1254 = vaddq_s16(v1252, v1253);
+    int16x8_t v1255 = vaddq_s16(v923, v810);
+    int16x8_t v1256 = vaddq_s16(v811, v1009);
+    int16x8_t v1257 = vaddq_s16(v1255, v1256);
+    int16x8_t v1258 = vaddq_s16(v1254, v1257);
+    int16x8_t v1259_tmp = vqrdmulhq_n_s16(v1258, 13573);
+    int16x8_t v1259 = vaddq_s16(v1259_tmp, v1258);
+    int16x8_t v1260 = vaddq_s16(v1022, v815);
+    int16x8_t v1261 = vaddq_s16(v816, v927);
+    int16x8_t v1262 = vaddq_s16(v1260, v1261);
+    int16x8_t v1263 = vaddq_s16(v931, v818);
+    int16x8_t v1264 = vaddq_s16(v819, v1025);
+    int16x8_t v1265 = vaddq_s16(v1263, v1264);
+    int16x8_t v1266 = vaddq_s16(v1262, v1265);
+    int16x8_t v1267 = vaddq_s16(v1266, v1258);
+    int16x8_t v1268 = vaddq_s16(v1259, v1267);
+    int16x8_t v1269 = vqrdmulhq_n_s16(v1268, 17734);
+    int16x8_t v1270 = vaddq_s16(v1251, v1269);
+    int16x8_t v1271 = vaddq_s16(v1013, v826);
+    int16x8_t v1272 = vaddq_s16(v827, v938);
+    int16x8_t v1273 = vaddq_s16(v1271, v1272);
+    int16x8_t v1274 = vaddq_s16(v942, v829);
+    int16x8_t v1275 = vaddq_s16(v830, v978);
+    int16x8_t v1276 = vaddq_s16(v1274, v1275);
+    int16x8_t v1277 = vaddq_s16(v1273, v1276);
+    int16x8_t v1278_tmp = vqrdmulhq_n_s16(v1277, 13573);
+    int16x8_t v1278 = vaddq_s16(v1278_tmp, v1277);
+    int16x8_t v1279 = vaddq_s16(v1029, v834);
+    int16x8_t v1280 = vaddq_s16(v835, v946);
+    int16x8_t v1281 = vaddq_s16(v1279, v1280);
+    int16x8_t v1282 = vaddq_s16(v950, v837);
+    int16x8_t v1283 = vaddq_s16(v838, v986);
+    int16x8_t v1284 = vaddq_s16(v1282, v1283);
+    int16x8_t v1285 = vaddq_s16(v1281, v1284);
+    int16x8_t v1286 = vaddq_s16(v997, v841);
+    int16x8_t v1287 = vaddq_s16(v842, v953);
+    int16x8_t v1288 = vaddq_s16(v1286, v1287);
+    int16x8_t v1289 = vaddq_s16(v957, v844);
+    int16x8_t v1290 = vaddq_s16(v845, v1002);
+    int16x8_t v1291 = vaddq_s16(v1289, v1290);
+    int16x8_t v1292 = vaddq_s16(v1288, v1291);
+    int16x8_t v1293 = vaddq_s16(v1285, v1292);
+    int16x8_t v1294 = vaddq_s16(v1278, v1293);
+    int16x8_t v1295 = vaddq_s16(v1292, v1277);
+    int16x8_t v1296_tmp = vqrdmulhq_n_s16(v1295, 13573);
+    int16x8_t v1296 = vaddq_s16(v1296_tmp, v1295);
+    int16x8_t v1297 = vaddq_s16(v1049, v852);
+    int16x8_t v1298 = vaddq_s16(v853, v964);
+    int16x8_t v1299 = vaddq_s16(v1297, v1298);
+    int16x8_t v1300 = vaddq_s16(v968, v855);
+    int16x8_t v1301 = vaddq_s16(v856, v1018);
+    int16x8_t v1302 = vaddq_s16(v1300, v1301);
+    int16x8_t v1303 = vaddq_s16(v1299, v1302);
+    int16x8_t v1304 = vaddq_s16(v1303, v1285);
+    int16x8_t v1305 = vaddq_s16(v1304, v1295);
+    int16x8_t v1306 = vaddq_s16(v1296, v1305);
+    int16x8_t v1307 = vqrdmulhq_n_s16(v1306, 17734);
+    int16x8_t v1308 = vaddq_s16(v1294, v1307);
+    int16x8_t v1309 = vqrdmulhq_n_s16(v1308, 16705);
+    int16x8_t v1310 = vaddq_s16(v1270, v1309);
+    int16x8_t v1311 = vaddq_s16(v1276, v1242);
+    int16x8_t v1312_tmp = vqrdmulhq_n_s16(v1311, 13573);
+    int16x8_t v1312 = vaddq_s16(v1312_tmp, v1311);
+    int16x8_t v1313 = vaddq_s16(v1284, v1246);
+    int16x8_t v1314 = vaddq_s16(v1249, v1288);
+    int16x8_t v1315 = vaddq_s16(v1313, v1314);
+    int16x8_t v1316 = vaddq_s16(v1312, v1315);
+    int16x8_t v1317 = vaddq_s16(v1291, v1254);
+    int16x8_t v1318 = vaddq_s16(v1257, v1273);
+    int16x8_t v1319 = vaddq_s16(v1317, v1318);
+    int16x8_t v1320_tmp = vqrdmulhq_n_s16(v1319, 13573);
+    int16x8_t v1320 = vaddq_s16(v1320_tmp, v1319);
+    int16x8_t v1321 = vaddq_s16(v1302, v1262);
+    int16x8_t v1322 = vaddq_s16(v1265, v1281);
+    int16x8_t v1323 = vaddq_s16(v1321, v1322);
+    int16x8_t v1324 = vaddq_s16(v1323, v1319);
+    int16x8_t v1325 = vaddq_s16(v1320, v1324);
+    int16x8_t v1326 = vqrdmulhq_n_s16(v1325, 17734);
+    int16x8_t v1327 = vaddq_s16(v1316, v1326);
+    int16x8_t v1328 = vaddq_s16(v1318, v1311);
+    int16x8_t v1329_tmp = vqrdmulhq_n_s16(v1328, 13573);
+    int16x8_t v1329 = vaddq_s16(v1329_tmp, v1328);
+    int16x8_t v1330 = vaddq_s16(v1322, v1313);
+    int16x8_t v1331 = vaddq_s16(v1314, v1317);
+    int16x8_t v1332 = vaddq_s16(v1330, v1331);
+    int16x8_t v1333 = vaddq_s16(v1329, v1332);
+    int16x8_t v1334 = vaddq_s16(v1331, v1328);
+    int16x8_t v1335_tmp = vqrdmulhq_n_s16(v1334, 13573);
+    int16x8_t v1335 = vaddq_s16(v1335_tmp, v1334);
+    int16x8_t v1336 = vaddq_s16(v1129, v891);
+    int16x8_t v1337 = vaddq_s16(v892, v1045);
+    int16x8_t v1338 = vaddq_s16(v1336, v1337);
+    int16x8_t v1339 = vaddq_s16(v1338, v1299);
+    int16x8_t v1340 = vaddq_s16(v1339, v1321);
+    int16x8_t v1341 = vaddq_s16(v1340, v1330);
+    int16x8_t v1342 = vaddq_s16(v1341, v1334);
+    int16x8_t v1343 = vaddq_s16(v1335, v1342);
+    int16x8_t v1344 = vqrdmulhq_n_s16(v1343, 17734);
+    int16x8_t v1345 = vaddq_s16(v1333, v1344);
+    int16x8_t v1346 = vqrdmulhq_n_s16(v1345, 16705);
+    int16x8_t v1347 = vaddq_s16(v1327, v1346);
+    int16x8_t v1348 = vqrdmulhq_n_s16(v1347, 16463);
+    int16x8_t v1349 = vaddq_s16(v1310, v1348);
+    int16x8_t v1350 = vqrdmulhq_n_s16(v1349, 16404);
+    int16x8_t v1351 = vaddq_s16(v1239, v1350);
+    int16x8_t v1352 = vaddq_s16(v1241, v1147);
+    int16x8_t v1353_tmp = vqrdmulhq_n_s16(v1352, 13573);
+    int16x8_t v1353 = vaddq_s16(v1353_tmp, v1352);
+    int16x8_t v1354 = vaddq_s16(v1245, v1149);
+    int16x8_t v1355 = vaddq_s16(v1150, v1247);
+    int16x8_t v1356 = vaddq_s16(v1354, v1355);
+    int16x8_t v1357 = vaddq_s16(v1353, v1356);
+    int16x8_t v1358 = vaddq_s16(v1253, v1153);
+    int16x8_t v1359 = vaddq_s16(v1154, v1255);
+    int16x8_t v1360 = vaddq_s16(v1358, v1359);
+    int16x8_t v1361_tmp = vqrdmulhq_n_s16(v1360, 13573);
+    int16x8_t v1361 = vaddq_s16(v1361_tmp, v1360);
+    int16x8_t v1362 = vaddq_s16(v1261, v1157);
+    int16x8_t v1363 = vaddq_s16(v1158, v1263);
+    int16x8_t v1364 = vaddq_s16(v1362, v1363);
+    int16x8_t v1365 = vaddq_s16(v1364, v1360);
+    int16x8_t v1366 = vaddq_s16(v1361, v1365);
+    int16x8_t v1367 = vqrdmulhq_n_s16(v1366, 17734);
+    int16x8_t v1368 = vaddq_s16(v1357, v1367);
+    int16x8_t v1369 = vaddq_s16(v1272, v1164);
+    int16x8_t v1370 = vaddq_s16(v1165, v1274);
+    int16x8_t v1371 = vaddq_s16(v1369, v1370);
+    int16x8_t v1372_tmp = vqrdmulhq_n_s16(v1371, 13573);
+    int16x8_t v1372 = vaddq_s16(v1372_tmp, v1371);
+    int16x8_t v1373 = vaddq_s16(v1280, v1168);
+    int16x8_t v1374 = vaddq_s16(v1169, v1282);
+    int16x8_t v1375 = vaddq_s16(v1373, v1374);
+    int16x8_t v1376 = vaddq_s16(v1287, v1171);
+    int16x8_t v1377 = vaddq_s16(v1172, v1289);
+    int16x8_t v1378 = vaddq_s16(v1376, v1377);
+    int16x8_t v1379 = vaddq_s16(v1375, v1378);
+    int16x8_t v1380 = vaddq_s16(v1372, v1379);
+    int16x8_t v1381 = vaddq_s16(v1378, v1371);
+    int16x8_t v1382_tmp = vqrdmulhq_n_s16(v1381, 13573);
+    int16x8_t v1382 = vaddq_s16(v1382_tmp, v1381);
+    int16x8_t v1383 = vaddq_s16(v1298, v1178);
+    int16x8_t v1384 = vaddq_s16(v1179, v1300);
+    int16x8_t v1385 = vaddq_s16(v1383, v1384);
+    int16x8_t v1386 = vaddq_s16(v1385, v1375);
+    int16x8_t v1387 = vaddq_s16(v1386, v1381);
+    int16x8_t v1388 = vaddq_s16(v1382, v1387);
+    int16x8_t v1389 = vqrdmulhq_n_s16(v1388, 17734);
+    int16x8_t v1390 = vaddq_s16(v1380, v1389);
+    int16x8_t v1391 = vqrdmulhq_n_s16(v1390, 16705);
+    int16x8_t v1392 = vaddq_s16(v1368, v1391);
+    int16x8_t v1393 = vaddq_s16(v1275, v1188);
+    int16x8_t v1394 = vaddq_s16(v1189, v1240);
+    int16x8_t v1395 = vaddq_s16(v1393, v1394);
+    int16x8_t v1396_tmp = vqrdmulhq_n_s16(v1395, 13573);
+    int16x8_t v1396 = vaddq_s16(v1396_tmp, v1395);
+    int16x8_t v1397 = vaddq_s16(v1283, v1192);
+    int16x8_t v1398 = vaddq_s16(v1193, v1244);
+    int16x8_t v1399 = vaddq_s16(v1397, v1398);
+    int16x8_t v1400 = vaddq_s16(v1248, v1195);
+    int16x8_t v1401 = vaddq_s16(v1196, v1286);
+    int16x8_t v1402 = vaddq_s16(v1400, v1401);
+    int16x8_t v1403 = vaddq_s16(v1399, v1402);
+    int16x8_t v1404 = vaddq_s16(v1396, v1403);
+    int16x8_t v1405 = vaddq_s16(v1290, v1200);
+    int16x8_t v1406 = vaddq_s16(v1201, v1252);
+    int16x8_t v1407 = vaddq_s16(v1405, v1406);
+    int16x8_t v1408 = vaddq_s16(v1256, v1203);
+    int16x8_t v1409 = vaddq_s16(v1204, v1271);
+    int16x8_t v1410 = vaddq_s16(v1408, v1409);
+    int16x8_t v1411 = vaddq_s16(v1407, v1410);
+    int16x8_t v1412_tmp = vqrdmulhq_n_s16(v1411, 13573);
+    int16x8_t v1412 = vaddq_s16(v1412_tmp, v1411);
+    int16x8_t v1413 = vaddq_s16(v1301, v1208);
+    int16x8_t v1414 = vaddq_s16(v1209, v1260);
+    int16x8_t v1415 = vaddq_s16(v1413, v1414);
+    int16x8_t v1416 = vaddq_s16(v1264, v1211);
+    int16x8_t v1417 = vaddq_s16(v1212, v1279);
+    int16x8_t v1418 = vaddq_s16(v1416, v1417);
+    int16x8_t v1419 = vaddq_s16(v1415, v1418);
+    int16x8_t v1420 = vaddq_s16(v1419, v1411);
+    int16x8_t v1421 = vaddq_s16(v1412, v1420);
+    int16x8_t v1422 = vqrdmulhq_n_s16(v1421, 17734);
+    int16x8_t v1423 = vaddq_s16(v1404, v1422);
+    int16x8_t v1424 = vaddq_s16(v1410, v1395);
+    int16x8_t v1425_tmp = vqrdmulhq_n_s16(v1424, 13573);
+    int16x8_t v1425 = vaddq_s16(v1425_tmp, v1424);
+    int16x8_t v1426 = vaddq_s16(v1418, v1399);
+    int16x8_t v1427 = vaddq_s16(v1402, v1407);
+    int16x8_t v1428 = vaddq_s16(v1426, v1427);
+    int16x8_t v1429 = vaddq_s16(v1425, v1428);
+    int16x8_t v1430 = vaddq_s16(v1427, v1424);
+    int16x8_t v1431_tmp = vqrdmulhq_n_s16(v1430, 13573);
+    int16x8_t v1431 = vaddq_s16(v1431_tmp, v1430);
+    int16x8_t v1432 = vaddq_s16(v1337, v1227);
+    int16x8_t v1433 = vaddq_s16(v1228, v1297);
+    int16x8_t v1434 = vaddq_s16(v1432, v1433);
+    int16x8_t v1435 = vaddq_s16(v1434, v1415);
+    int16x8_t v1436 = vaddq_s16(v1435, v1426);
+    int16x8_t v1437 = vaddq_s16(v1436, v1430);
+    int16x8_t v1438 = vaddq_s16(v1431, v1437);
+    int16x8_t v1439 = vqrdmulhq_n_s16(v1438, 17734);
+    int16x8_t v1440 = vaddq_s16(v1429, v1439);
+    int16x8_t v1441 = vqrdmulhq_n_s16(v1440, 16705);
+    int16x8_t v1442 = vaddq_s16(v1423, v1441);
+    int16x8_t v1443 = vqrdmulhq_n_s16(v1442, 16463);
+    int16x8_t v1444 = vaddq_s16(v1392, v1443);
+    int16x8_t v1445 = vaddq_s16(v1394, v1352);
+    int16x8_t v1446_tmp = vqrdmulhq_n_s16(v1445, 13573);
+    int16x8_t v1446 = vaddq_s16(v1446_tmp, v1445);
+    int16x8_t v1447 = vaddq_s16(v1398, v1354);
+    int16x8_t v1448 = vaddq_s16(v1355, v1400);
+    int16x8_t v1449 = vaddq_s16(v1447, v1448);
+    int16x8_t v1450 = vaddq_s16(v1446, v1449);
+    int16x8_t v1451 = vaddq_s16(v1406, v1358);
+    int16x8_t v1452 = vaddq_s16(v1359, v1408);
+    int16x8_t v1453 = vaddq_s16(v1451, v1452);
+    int16x8_t v1454_tmp = vqrdmulhq_n_s16(v1453, 13573);
+    int16x8_t v1454 = vaddq_s16(v1454_tmp, v1453);
+    int16x8_t v1455 = vaddq_s16(v1414, v1362);
+    int16x8_t v1456 = vaddq_s16(v1363, v1416);
+    int16x8_t v1457 = vaddq_s16(v1455, v1456);
+    int16x8_t v1458 = vaddq_s16(v1457, v1453);
+    int16x8_t v1459 = vaddq_s16(v1454, v1458);
+    int16x8_t v1460 = vqrdmulhq_n_s16(v1459, 17734);
+    int16x8_t v1461 = vaddq_s16(v1450, v1460);
+    int16x8_t v1462 = vaddq_s16(v1409, v1369);
+    int16x8_t v1463 = vaddq_s16(v1370, v1393);
+    int16x8_t v1464 = vaddq_s16(v1462, v1463);
+    int16x8_t v1465_tmp = vqrdmulhq_n_s16(v1464, 13573);
+    int16x8_t v1465 = vaddq_s16(v1465_tmp, v1464);
+    int16x8_t v1466 = vaddq_s16(v1417, v1373);
+    int16x8_t v1467 = vaddq_s16(v1374, v1397);
+    int16x8_t v1468 = vaddq_s16(v1466, v1467);
+    int16x8_t v1469 = vaddq_s16(v1401, v1376);
+    int16x8_t v1470 = vaddq_s16(v1377, v1405);
+    int16x8_t v1471 = vaddq_s16(v1469, v1470);
+    int16x8_t v1472 = vaddq_s16(v1468, v1471);
+    int16x8_t v1473 = vaddq_s16(v1465, v1472);
+    int16x8_t v1474 = vaddq_s16(v1471, v1464);
+    int16x8_t v1475_tmp = vqrdmulhq_n_s16(v1474, 13573);
+    int16x8_t v1475 = vaddq_s16(v1475_tmp, v1474);
+    int16x8_t v1476 = vaddq_s16(v1433, v1383);
+    int16x8_t v1477 = vaddq_s16(v1384, v1413);
+    int16x8_t v1478 = vaddq_s16(v1476, v1477);
+    int16x8_t v1479 = vaddq_s16(v1478, v1468);
+    int16x8_t v1480 = vaddq_s16(v1479, v1474);
+    int16x8_t v1481 = vaddq_s16(v1475, v1480);
+    int16x8_t v1482 = vqrdmulhq_n_s16(v1481, 17734);
+    int16x8_t v1483 = vaddq_s16(v1473, v1482);
+    int16x8_t v1484 = vqrdmulhq_n_s16(v1483, 16705);
+    int16x8_t v1485 = vaddq_s16(v1461, v1484);
+    int16x8_t v1486 = vaddq_s16(v1463, v1445);
+    int16x8_t v1487_tmp = vqrdmulhq_n_s16(v1486, 13573);
+    int16x8_t v1487 = vaddq_s16(v1487_tmp, v1486);
+    int16x8_t v1488 = vaddq_s16(v1467, v1447);
+    int16x8_t v1489 = vaddq_s16(v1448, v1469);
+    int16x8_t v1490 = vaddq_s16(v1488, v1489);
+    int16x8_t v1491 = vaddq_s16(v1487, v1490);
+    int16x8_t v1492 = vaddq_s16(v1470, v1451);
+    int16x8_t v1493 = vaddq_s16(v1452, v1462);
+    int16x8_t v1494 = vaddq_s16(v1492, v1493);
+    int16x8_t v1495_tmp = vqrdmulhq_n_s16(v1494, 13573);
+    int16x8_t v1495 = vaddq_s16(v1495_tmp, v1494);
+    int16x8_t v1496 = vaddq_s16(v1477, v1455);
+    int16x8_t v1497 = vaddq_s16(v1456, v1466);
+    int16x8_t v1498 = vaddq_s16(v1496, v1497);
+    int16x8_t v1499 = vaddq_s16(v1498, v1494);
+    int16x8_t v1500 = vaddq_s16(v1495, v1499);
+    int16x8_t v1501 = vqrdmulhq_n_s16(v1500, 17734);
+    int16x8_t v1502 = vaddq_s16(v1491, v1501);
+    int16x8_t v1503 = vaddq_s16(v1493, v1486);
+    int16x8_t v1504_tmp = vqrdmulhq_n_s16(v1503, 13573);
+    int16x8_t v1504 = vaddq_s16(v1504_tmp, v1503);
+    int16x8_t v1505 = vaddq_s16(v1497, v1488);
+    int16x8_t v1506 = vaddq_s16(v1489, v1492);
+    int16x8_t v1507 = vaddq_s16(v1505, v1506);
+    int16x8_t v1508 = vaddq_s16(v1504, v1507);
+    int16x8_t v1509 = vaddq_s16(v1506, v1503);
+    int16x8_t v1510_tmp = vqrdmulhq_n_s16(v1509, 13573);
+    int16x8_t v1510 = vaddq_s16(v1510_tmp, v1509);
+    int16x8_t v1511 = vld1q_s16(in + in_stride * 255 + i);
+    int16x8_t v1512 = vaddq_s16(v1511, v1128);
+    int16x8_t v1513 = vaddq_s16(v1512, v1336);
+    int16x8_t v1514 = vaddq_s16(v1513, v1432);
+    int16x8_t v1515 = vaddq_s16(v1514, v1476);
+    int16x8_t v1516 = vaddq_s16(v1515, v1496);
+    int16x8_t v1517 = vaddq_s16(v1516, v1505);
+    int16x8_t v1518 = vaddq_s16(v1517, v1509);
+    int16x8_t v1519 = vaddq_s16(v1510, v1518);
+    int16x8_t v1520 = vqrdmulhq_n_s16(v1519, 17734);
+    int16x8_t v1521 = vaddq_s16(v1508, v1520);
+    int16x8_t v1522 = vqrdmulhq_n_s16(v1521, 16705);
+    int16x8_t v1523 = vaddq_s16(v1502, v1522);
+    int16x8_t v1524 = vqrdmulhq_n_s16(v1523, 16463);
+    int16x8_t v1525 = vaddq_s16(v1485, v1524);
+    int16x8_t v1526 = vqrdmulhq_n_s16(v1525, 16404);
+    int16x8_t v1527 = vaddq_s16(v1444, v1526);
+    int16x8_t v1528 = vqrdmulhq_n_s16(v1527, 16389);
+    int16x8_t v1529 = vaddq_s16(v1351, v1528);
+    int16x8_t v1530 = vqrdmulhq_n_s16(v1529, 16385);
+    int16x8_t v1531 = vaddq_s16(v1146, v1530);
+    int16x8_t v1532 = vqrdmulhq_n_s16(v1531, 16384);
+    int16x8_t v1533 = vaddq_s16(v701, v1532);
+    int16x8_t v1534 = vsubq_s16(v0, v1);
+    int16x8_t v1535 = vsubq_s16(v4, v6);
+    int16x8_t v1536_tmp = vqrdmulhq_n_s16(v1535, 10045);
+    int16x8_t v1536 = vaddq_s16(v1536_tmp, v1535);
+    int16x8_t v1537 = vaddq_s16(v1534, v1536);
+    int16x8_t v1538 = vsubq_s16(v11, v14);
+    int16x8_t v1539 = vsubq_s16(v17, v20);
+    int16x8_t v1540_tmp = vqrdmulhq_n_s16(v1539, 10045);
+    int16x8_t v1540 = vaddq_s16(v1540_tmp, v1539);
+    int16x8_t v1541 = vaddq_s16(v1538, v1540);
+    int16x8_t v1542 = vqrdmulhq_n_s16(v1541, 19705);
+    int16x8_t v1543 = vaddq_s16(v1537, v1542);
+    int16x8_t v1544 = vsubq_s16(v27, v30);
+    int16x8_t v1545 = vsubq_s16(v35, v39);
+    int16x8_t v1546_tmp = vqrdmulhq_n_s16(v1545, 10045);
+    int16x8_t v1546 = vaddq_s16(v1546_tmp, v1545);
+    int16x8_t v1547 = vaddq_s16(v1544, v1546);
+    int16x8_t v1548 = vsubq_s16(v44, v47);
+    int16x8_t v1549 = vsubq_s16(v50, v54);
+    int16x8_t v1550_tmp = vqrdmulhq_n_s16(v1549, 10045);
+    int16x8_t v1550 = vaddq_s16(v1550_tmp, v1549);
+    int16x8_t v1551 = vaddq_s16(v1548, v1550);
+    int16x8_t v1552 = vqrdmulhq_n_s16(v1551, 19705);
+    int16x8_t v1553 = vaddq_s16(v1547, v1552);
+    int16x8_t v1554 = vqrdmulhq_n_s16(v1553, 17121);
+    int16x8_t v1555 = vaddq_s16(v1543, v1554);
+    int16x8_t v1556 = vsubq_s16(v63, v66);
+    int16x8_t v1557 = vsubq_s16(v71, v75);
+    int16x8_t v1558_tmp = vqrdmulhq_n_s16(v1557, 10045);
+    int16x8_t v1558 = vaddq_s16(v1558_tmp, v1557);
+    int16x8_t v1559 = vaddq_s16(v1556, v1558);
+    int16x8_t v1560 = vsubq_s16(v82, v89);
+    int16x8_t v1561 = vsubq_s16(v92, v97);
+    int16x8_t v1562_tmp = vqrdmulhq_n_s16(v1561, 10045);
+    int16x8_t v1562 = vaddq_s16(v1562_tmp, v1561);
+    int16x8_t v1563 = vaddq_s16(v1560, v1562);
+    int16x8_t v1564 = vqrdmulhq_n_s16(v1563, 19705);
+    int16x8_t v1565 = vaddq_s16(v1559, v1564);
+    int16x8_t v1566 = vsubq_s16(v104, v107);
+    int16x8_t v1567 = vsubq_s16(v112, v116);
+    int16x8_t v1568_tmp = vqrdmulhq_n_s16(v1567, 10045);
+    int16x8_t v1568 = vaddq_s16(v1568_tmp, v1567);
+    int16x8_t v1569 = vaddq_s16(v1566, v1568);
+    int16x8_t v1570 = vsubq_s16(v121, v124);
+    int16x8_t v1571 = vsubq_s16(v127, v132);
+    int16x8_t v1572_tmp = vqrdmulhq_n_s16(v1571, 10045);
+    int16x8_t v1572 = vaddq_s16(v1572_tmp, v1571);
+    int16x8_t v1573 = vaddq_s16(v1570, v1572);
+    int16x8_t v1574 = vqrdmulhq_n_s16(v1573, 19705);
+    int16x8_t v1575 = vaddq_s16(v1569, v1574);
+    int16x8_t v1576 = vqrdmulhq_n_s16(v1575, 17121);
+    int16x8_t v1577 = vaddq_s16(v1565, v1576);
+    int16x8_t v1578 = vqrdmulhq_n_s16(v1577, 16563);
+    int16x8_t v1579 = vaddq_s16(v1555, v1578);
+    int16x8_t v1580 = vsubq_s16(v143, v146);
+    int16x8_t v1581 = vsubq_s16(v151, v155);
+    int16x8_t v1582_tmp = vqrdmulhq_n_s16(v1581, 10045);
+    int16x8_t v1582 = vaddq_s16(v1582_tmp, v1581);
+    int16x8_t v1583 = vaddq_s16(v1580, v1582);
+    int16x8_t v1584 = vsubq_s16(v162, v169);
+    int16x8_t v1585 = vsubq_s16(v172, v177);
+    int16x8_t v1586_tmp = vqrdmulhq_n_s16(v1585, 10045);
+    int16x8_t v1586 = vaddq_s16(v1586_tmp, v1585);
+    int16x8_t v1587 = vaddq_s16(v1584, v1586);
+    int16x8_t v1588 = vqrdmulhq_n_s16(v1587, 19705);
+    int16x8_t v1589 = vaddq_s16(v1583, v1588);
+    int16x8_t v1590 = vsubq_s16(v186, v193);
+    int16x8_t v1591 = vsubq_s16(v202, v210);
+    int16x8_t v1592_tmp = vqrdmulhq_n_s16(v1591, 10045);
+    int16x8_t v1592 = vaddq_s16(v1592_tmp, v1591);
+    int16x8_t v1593 = vaddq_s16(v1590, v1592);
+    int16x8_t v1594 = vsubq_s16(v215, v218);
+    int16x8_t v1595 = vsubq_s16(v221, v227);
+    int16x8_t v1596_tmp = vqrdmulhq_n_s16(v1595, 10045);
+    int16x8_t v1596 = vaddq_s16(v1596_tmp, v1595);
+    int16x8_t v1597 = vaddq_s16(v1594, v1596);
+    int16x8_t v1598 = vqrdmulhq_n_s16(v1597, 19705);
+    int16x8_t v1599 = vaddq_s16(v1593, v1598);
+    int16x8_t v1600 = vqrdmulhq_n_s16(v1599, 17121);
+    int16x8_t v1601 = vaddq_s16(v1589, v1600);
+    int16x8_t v1602 = vsubq_s16(v236, v239);
+    int16x8_t v1603 = vsubq_s16(v244, v248);
+    int16x8_t v1604_tmp = vqrdmulhq_n_s16(v1603, 10045);
+    int16x8_t v1604 = vaddq_s16(v1604_tmp, v1603);
+    int16x8_t v1605 = vaddq_s16(v1602, v1604);
+    int16x8_t v1606 = vsubq_s16(v255, v262);
+    int16x8_t v1607 = vsubq_s16(v265, v270);
+    int16x8_t v1608_tmp = vqrdmulhq_n_s16(v1607, 10045);
+    int16x8_t v1608 = vaddq_s16(v1608_tmp, v1607);
+    int16x8_t v1609 = vaddq_s16(v1606, v1608);
+    int16x8_t v1610 = vqrdmulhq_n_s16(v1609, 19705);
+    int16x8_t v1611 = vaddq_s16(v1605, v1610);
+    int16x8_t v1612 = vsubq_s16(v277, v280);
+    int16x8_t v1613 = vsubq_s16(v285, v289);
+    int16x8_t v1614_tmp = vqrdmulhq_n_s16(v1613, 10045);
+    int16x8_t v1614 = vaddq_s16(v1614_tmp, v1613);
+    int16x8_t v1615 = vaddq_s16(v1612, v1614);
+    int16x8_t v1616 = vsubq_s16(v294, v297);
+    int16x8_t v1617 = vsubq_s16(v300, v306);
+    int16x8_t v1618_tmp = vqrdmulhq_n_s16(v1617, 10045);
+    int16x8_t v1618 = vaddq_s16(v1618_tmp, v1617);
+    int16x8_t v1619 = vaddq_s16(v1616, v1618);
+    int16x8_t v1620 = vqrdmulhq_n_s16(v1619, 19705);
+    int16x8_t v1621 = vaddq_s16(v1615, v1620);
+    int16x8_t v1622 = vqrdmulhq_n_s16(v1621, 17121);
+    int16x8_t v1623 = vaddq_s16(v1611, v1622);
+    int16x8_t v1624 = vqrdmulhq_n_s16(v1623, 16563);
+    int16x8_t v1625 = vaddq_s16(v1601, v1624);
+    int16x8_t v1626 = vqrdmulhq_n_s16(v1625, 16429);
+    int16x8_t v1627 = vaddq_s16(v1579, v1626);
+    int16x8_t v1628 = vsubq_s16(v319, v322);
+    int16x8_t v1629 = vsubq_s16(v327, v331);
+    int16x8_t v1630_tmp = vqrdmulhq_n_s16(v1629, 10045);
+    int16x8_t v1630 = vaddq_s16(v1630_tmp, v1629);
+    int16x8_t v1631 = vaddq_s16(v1628, v1630);
+    int16x8_t v1632 = vsubq_s16(v338, v345);
+    int16x8_t v1633 = vsubq_s16(v348, v353);
+    int16x8_t v1634_tmp = vqrdmulhq_n_s16(v1633, 10045);
+    int16x8_t v1634 = vaddq_s16(v1634_tmp, v1633);
+    int16x8_t v1635 = vaddq_s16(v1632, v1634);
+    int16x8_t v1636 = vqrdmulhq_n_s16(v1635, 19705);
+    int16x8_t v1637 = vaddq_s16(v1631, v1636);
+    int16x8_t v1638 = vsubq_s16(v362, v369);
+    int16x8_t v1639 = vsubq_s16(v378, v386);
+    int16x8_t v1640_tmp = vqrdmulhq_n_s16(v1639, 10045);
+    int16x8_t v1640 = vaddq_s16(v1640_tmp, v1639);
+    int16x8_t v1641 = vaddq_s16(v1638, v1640);
+    int16x8_t v1642 = vsubq_s16(v391, v394);
+    int16x8_t v1643 = vsubq_s16(v397, v403);
+    int16x8_t v1644_tmp = vqrdmulhq_n_s16(v1643, 10045);
+    int16x8_t v1644 = vaddq_s16(v1644_tmp, v1643);
+    int16x8_t v1645 = vaddq_s16(v1642, v1644);
+    int16x8_t v1646 = vqrdmulhq_n_s16(v1645, 19705);
+    int16x8_t v1647 = vaddq_s16(v1641, v1646);
+    int16x8_t v1648 = vqrdmulhq_n_s16(v1647, 17121);
+    int16x8_t v1649 = vaddq_s16(v1637, v1648);
+    int16x8_t v1650 = vsubq_s16(v414, v421);
+    int16x8_t v1651 = vsubq_s16(v430, v438);
+    int16x8_t v1652_tmp = vqrdmulhq_n_s16(v1651, 10045);
+    int16x8_t v1652 = vaddq_s16(v1652_tmp, v1651);
+    int16x8_t v1653 = vaddq_s16(v1650, v1652);
+    int16x8_t v1654 = vsubq_s16(v449, v464);
+    int16x8_t v1655 = vsubq_s16(v467, v476);
+    int16x8_t v1656_tmp = vqrdmulhq_n_s16(v1655, 10045);
+    int16x8_t v1656 = vaddq_s16(v1656_tmp, v1655);
+    int16x8_t v1657 = vaddq_s16(v1654, v1656);
+    int16x8_t v1658 = vqrdmulhq_n_s16(v1657, 19705);
+    int16x8_t v1659 = vaddq_s16(v1653, v1658);
+    int16x8_t v1660 = vsubq_s16(v483, v486);
+    int16x8_t v1661 = vsubq_s16(v491, v495);
+    int16x8_t v1662_tmp = vqrdmulhq_n_s16(v1661, 10045);
+    int16x8_t v1662 = vaddq_s16(v1662_tmp, v1661);
+    int16x8_t v1663 = vaddq_s16(v1660, v1662);
+    int16x8_t v1664 = vsubq_s16(v500, v503);
+    int16x8_t v1665 = vsubq_s16(v506, v513);
+    int16x8_t v1666_tmp = vqrdmulhq_n_s16(v1665, 10045);
+    int16x8_t v1666 = vaddq_s16(v1666_tmp, v1665);
+    int16x8_t v1667 = vaddq_s16(v1664, v1666);
+    int16x8_t v1668 = vqrdmulhq_n_s16(v1667, 19705);
+    int16x8_t v1669 = vaddq_s16(v1663, v1668);
+    int16x8_t v1670 = vqrdmulhq_n_s16(v1669, 17121);
+    int16x8_t v1671 = vaddq_s16(v1659, v1670);
+    int16x8_t v1672 = vqrdmulhq_n_s16(v1671, 16563);
+    int16x8_t v1673 = vaddq_s16(v1649, v1672);
+    int16x8_t v1674 = vsubq_s16(v524, v527);
+    int16x8_t v1675 = vsubq_s16(v532, v536);
+    int16x8_t v1676_tmp = vqrdmulhq_n_s16(v1675, 10045);
+    int16x8_t v1676 = vaddq_s16(v1676_tmp, v1675);
+    int16x8_t v1677 = vaddq_s16(v1674, v1676);
+    int16x8_t v1678 = vsubq_s16(v543, v550);
+    int16x8_t v1679 = vsubq_s16(v553, v558);
+    int16x8_t v1680_tmp = vqrdmulhq_n_s16(v1679, 10045);
+    int16x8_t v1680 = vaddq_s16(v1680_tmp, v1679);
+    int16x8_t v1681 = vaddq_s16(v1678, v1680);
+    int16x8_t v1682 = vqrdmulhq_n_s16(v1681, 19705);
+    int16x8_t v1683 = vaddq_s16(v1677, v1682);
+    int16x8_t v1684 = vsubq_s16(v567, v574);
+    int16x8_t v1685 = vsubq_s16(v583, v591);
+    int16x8_t v1686_tmp = vqrdmulhq_n_s16(v1685, 10045);
+    int16x8_t v1686 = vaddq_s16(v1686_tmp, v1685);
+    int16x8_t v1687 = vaddq_s16(v1684, v1686);
+    int16x8_t v1688 = vsubq_s16(v596, v599);
+    int16x8_t v1689 = vsubq_s16(v602, v608);
+    int16x8_t v1690_tmp = vqrdmulhq_n_s16(v1689, 10045);
+    int16x8_t v1690 = vaddq_s16(v1690_tmp, v1689);
+    int16x8_t v1691 = vaddq_s16(v1688, v1690);
+    int16x8_t v1692 = vqrdmulhq_n_s16(v1691, 19705);
+    int16x8_t v1693 = vaddq_s16(v1687, v1692);
+    int16x8_t v1694 = vqrdmulhq_n_s16(v1693, 17121);
+    int16x8_t v1695 = vaddq_s16(v1683, v1694);
+    int16x8_t v1696 = vsubq_s16(v617, v620);
+    int16x8_t v1697 = vsubq_s16(v625, v629);
+    int16x8_t v1698_tmp = vqrdmulhq_n_s16(v1697, 10045);
+    int16x8_t v1698 = vaddq_s16(v1698_tmp, v1697);
+    int16x8_t v1699 = vaddq_s16(v1696, v1698);
+    int16x8_t v1700 = vsubq_s16(v636, v643);
+    int16x8_t v1701 = vsubq_s16(v646, v651);
+    int16x8_t v1702_tmp = vqrdmulhq_n_s16(v1701, 10045);
+    int16x8_t v1702 = vaddq_s16(v1702_tmp, v1701);
+    int16x8_t v1703 = vaddq_s16(v1700, v1702);
+    int16x8_t v1704 = vqrdmulhq_n_s16(v1703, 19705);
+    int16x8_t v1705 = vaddq_s16(v1699, v1704);
+    int16x8_t v1706 = vsubq_s16(v658, v661);
+    int16x8_t v1707 = vsubq_s16(v666, v670);
+    int16x8_t v1708_tmp = vqrdmulhq_n_s16(v1707, 10045);
+    int16x8_t v1708 = vaddq_s16(v1708_tmp, v1707);
+    int16x8_t v1709 = vaddq_s16(v1706, v1708);
+    int16x8_t v1710 = vsubq_s16(v675, v678);
+    int16x8_t v1711 = vsubq_s16(v681, v688);
+    int16x8_t v1712_tmp = vqrdmulhq_n_s16(v1711, 10045);
+    int16x8_t v1712 = vaddq_s16(v1712_tmp, v1711);
+    int16x8_t v1713 = vaddq_s16(v1710, v1712);
+    int16x8_t v1714 = vqrdmulhq_n_s16(v1713, 19705);
+    int16x8_t v1715 = vaddq_s16(v1709, v1714);
+    int16x8_t v1716 = vqrdmulhq_n_s16(v1715, 17121);
+    int16x8_t v1717 = vaddq_s16(v1705, v1716);
+    int16x8_t v1718 = vqrdmulhq_n_s16(v1717, 16563);
+    int16x8_t v1719 = vaddq_s16(v1695, v1718);
+    int16x8_t v1720 = vqrdmulhq_n_s16(v1719, 16429);
+    int16x8_t v1721 = vaddq_s16(v1673, v1720);
+    int16x8_t v1722 = vqrdmulhq_n_s16(v1721, 16395);
+    int16x8_t v1723 = vaddq_s16(v1627, v1722);
+    int16x8_t v1724 = vsubq_s16(v703, v706);
+    int16x8_t v1725 = vsubq_s16(v711, v715);
+    int16x8_t v1726_tmp = vqrdmulhq_n_s16(v1725, 10045);
+    int16x8_t v1726 = vaddq_s16(v1726_tmp, v1725);
+    int16x8_t v1727 = vaddq_s16(v1724, v1726);
+    int16x8_t v1728 = vsubq_s16(v722, v729);
+    int16x8_t v1729 = vsubq_s16(v732, v737);
+    int16x8_t v1730_tmp = vqrdmulhq_n_s16(v1729, 10045);
+    int16x8_t v1730 = vaddq_s16(v1730_tmp, v1729);
+    int16x8_t v1731 = vaddq_s16(v1728, v1730);
+    int16x8_t v1732 = vqrdmulhq_n_s16(v1731, 19705);
+    int16x8_t v1733 = vaddq_s16(v1727, v1732);
+    int16x8_t v1734 = vsubq_s16(v746, v753);
+    int16x8_t v1735 = vsubq_s16(v762, v770);
+    int16x8_t v1736_tmp = vqrdmulhq_n_s16(v1735, 10045);
+    int16x8_t v1736 = vaddq_s16(v1736_tmp, v1735);
+    int16x8_t v1737 = vaddq_s16(v1734, v1736);
+    int16x8_t v1738 = vsubq_s16(v775, v778);
+    int16x8_t v1739 = vsubq_s16(v781, v787);
+    int16x8_t v1740_tmp = vqrdmulhq_n_s16(v1739, 10045);
+    int16x8_t v1740 = vaddq_s16(v1740_tmp, v1739);
+    int16x8_t v1741 = vaddq_s16(v1738, v1740);
+    int16x8_t v1742 = vqrdmulhq_n_s16(v1741, 19705);
+    int16x8_t v1743 = vaddq_s16(v1737, v1742);
+    int16x8_t v1744 = vqrdmulhq_n_s16(v1743, 17121);
+    int16x8_t v1745 = vaddq_s16(v1733, v1744);
+    int16x8_t v1746 = vsubq_s16(v798, v805);
+    int16x8_t v1747 = vsubq_s16(v814, v822);
+    int16x8_t v1748_tmp = vqrdmulhq_n_s16(v1747, 10045);
+    int16x8_t v1748 = vaddq_s16(v1748_tmp, v1747);
+    int16x8_t v1749 = vaddq_s16(v1746, v1748);
+    int16x8_t v1750 = vsubq_s16(v833, v848);
+    int16x8_t v1751 = vsubq_s16(v851, v860);
+    int16x8_t v1752_tmp = vqrdmulhq_n_s16(v1751, 10045);
+    int16x8_t v1752 = vaddq_s16(v1752_tmp, v1751);
+    int16x8_t v1753 = vaddq_s16(v1750, v1752);
+    int16x8_t v1754 = vqrdmulhq_n_s16(v1753, 19705);
+    int16x8_t v1755 = vaddq_s16(v1749, v1754);
+    int16x8_t v1756 = vsubq_s16(v867, v870);
+    int16x8_t v1757 = vsubq_s16(v875, v879);
+    int16x8_t v1758_tmp = vqrdmulhq_n_s16(v1757, 10045);
+    int16x8_t v1758 = vaddq_s16(v1758_tmp, v1757);
+    int16x8_t v1759 = vaddq_s16(v1756, v1758);
+    int16x8_t v1760 = vsubq_s16(v884, v887);
+    int16x8_t v1761 = vsubq_s16(v890, v897);
+    int16x8_t v1762_tmp = vqrdmulhq_n_s16(v1761, 10045);
+    int16x8_t v1762 = vaddq_s16(v1762_tmp, v1761);
+    int16x8_t v1763 = vaddq_s16(v1760, v1762);
+    int16x8_t v1764 = vqrdmulhq_n_s16(v1763, 19705);
+    int16x8_t v1765 = vaddq_s16(v1759, v1764);
+    int16x8_t v1766 = vqrdmulhq_n_s16(v1765, 17121);
+    int16x8_t v1767 = vaddq_s16(v1755, v1766);
+    int16x8_t v1768 = vqrdmulhq_n_s16(v1767, 16563);
+    int16x8_t v1769 = vaddq_s16(v1745, v1768);
+    int16x8_t v1770 = vsubq_s16(v910, v917);
+    int16x8_t v1771 = vsubq_s16(v926, v934);
+    int16x8_t v1772_tmp = vqrdmulhq_n_s16(v1771, 10045);
+    int16x8_t v1772 = vaddq_s16(v1772_tmp, v1771);
+    int16x8_t v1773 = vaddq_s16(v1770, v1772);
+    int16x8_t v1774 = vsubq_s16(v945, v960);
+    int16x8_t v1775 = vsubq_s16(v963, v972);
+    int16x8_t v1776_tmp = vqrdmulhq_n_s16(v1775, 10045);
+    int16x8_t v1776 = vaddq_s16(v1776_tmp, v1775);
+    int16x8_t v1777 = vaddq_s16(v1774, v1776);
+    int16x8_t v1778 = vqrdmulhq_n_s16(v1777, 19705);
+    int16x8_t v1779 = vaddq_s16(v1773, v1778);
+    int16x8_t v1780 = vsubq_s16(v985, v1000);
+    int16x8_t v1781 = vsubq_s16(v1017, v1033);
+    int16x8_t v1782_tmp = vqrdmulhq_n_s16(v1781, 10045);
+    int16x8_t v1782 = vaddq_s16(v1782_tmp, v1781);
+    int16x8_t v1783 = vaddq_s16(v1780, v1782);
+    int16x8_t v1784 = vsubq_s16(v1038, v1041);
+    int16x8_t v1785 = vsubq_s16(v1044, v1054);
+    int16x8_t v1786_tmp = vqrdmulhq_n_s16(v1785, 10045);
+    int16x8_t v1786 = vaddq_s16(v1786_tmp, v1785);
+    int16x8_t v1787 = vaddq_s16(v1784, v1786);
+    int16x8_t v1788 = vqrdmulhq_n_s16(v1787, 19705);
+    int16x8_t v1789 = vaddq_s16(v1783, v1788);
+    int16x8_t v1790 = vqrdmulhq_n_s16(v1789, 17121);
+    int16x8_t v1791 = vaddq_s16(v1779, v1790);
+    int16x8_t v1792 = vsubq_s16(v1063, v1066);
+    int16x8_t v1793 = vsubq_s16(v1071, v1075);
+    int16x8_t v1794_tmp = vqrdmulhq_n_s16(v1793, 10045);
+    int16x8_t v1794 = vaddq_s16(v1794_tmp, v1793);
+    int16x8_t v1795 = vaddq_s16(v1792, v1794);
+    int16x8_t v1796 = vsubq_s16(v1082, v1089);
+    int16x8_t v1797 = vsubq_s16(v1092, v1097);
+    int16x8_t v1798_tmp = vqrdmulhq_n_s16(v1797, 10045);
+    int16x8_t v1798 = vaddq_s16(v1798_tmp, v1797);
+    int16x8_t v1799 = vaddq_s16(v1796, v1798);
+    int16x8_t v1800 = vqrdmulhq_n_s16(v1799, 19705);
+    int16x8_t v1801 = vaddq_s16(v1795, v1800);
+    int16x8_t v1802 = vsubq_s16(v1104, v1107);
+    int16x8_t v1803 = vsubq_s16(v1112, v1116);
+    int16x8_t v1804_tmp = vqrdmulhq_n_s16(v1803, 10045);
+    int16x8_t v1804 = vaddq_s16(v1804_tmp, v1803);
+    int16x8_t v1805 = vaddq_s16(v1802, v1804);
+    int16x8_t v1806 = vsubq_s16(v1121, v1124);
+    int16x8_t v1807 = vsubq_s16(v1127, v1135);
+    int16x8_t v1808_tmp = vqrdmulhq_n_s16(v1807, 10045);
+    int16x8_t v1808 = vaddq_s16(v1808_tmp, v1807);
+    int16x8_t v1809 = vaddq_s16(v1806, v1808);
+    int16x8_t v1810 = vqrdmulhq_n_s16(v1809, 19705);
+    int16x8_t v1811 = vaddq_s16(v1805, v1810);
+    int16x8_t v1812 = vqrdmulhq_n_s16(v1811, 17121);
+    int16x8_t v1813 = vaddq_s16(v1801, v1812);
+    int16x8_t v1814 = vqrdmulhq_n_s16(v1813, 16563);
+    int16x8_t v1815 = vaddq_s16(v1791, v1814);
+    int16x8_t v1816 = vqrdmulhq_n_s16(v1815, 16429);
+    int16x8_t v1817 = vaddq_s16(v1769, v1816);
+    int16x8_t v1818 = vsubq_s16(v1148, v1151);
+    int16x8_t v1819 = vsubq_s16(v1156, v1160);
+    int16x8_t v1820_tmp = vqrdmulhq_n_s16(v1819, 10045);
+    int16x8_t v1820 = vaddq_s16(v1820_tmp, v1819);
+    int16x8_t v1821 = vaddq_s16(v1818, v1820);
+    int16x8_t v1822 = vsubq_s16(v1167, v1174);
+    int16x8_t v1823 = vsubq_s16(v1177, v1182);
+    int16x8_t v1824_tmp = vqrdmulhq_n_s16(v1823, 10045);
+    int16x8_t v1824 = vaddq_s16(v1824_tmp, v1823);
+    int16x8_t v1825 = vaddq_s16(v1822, v1824);
+    int16x8_t v1826 = vqrdmulhq_n_s16(v1825, 19705);
+    int16x8_t v1827 = vaddq_s16(v1821, v1826);
+    int16x8_t v1828 = vsubq_s16(v1191, v1198);
+    int16x8_t v1829 = vsubq_s16(v1207, v1215);
+    int16x8_t v1830_tmp = vqrdmulhq_n_s16(v1829, 10045);
+    int16x8_t v1830 = vaddq_s16(v1830_tmp, v1829);
+    int16x8_t v1831 = vaddq_s16(v1828, v1830);
+    int16x8_t v1832 = vsubq_s16(v1220, v1223);
+    int16x8_t v1833 = vsubq_s16(v1226, v1232);
+    int16x8_t v1834_tmp = vqrdmulhq_n_s16(v1833, 10045);
+    int16x8_t v1834 = vaddq_s16(v1834_tmp, v1833);
+    int16x8_t v1835 = vaddq_s16(v1832, v1834);
+    int16x8_t v1836 = vqrdmulhq_n_s16(v1835, 19705);
+    int16x8_t v1837 = vaddq_s16(v1831, v1836);
+    int16x8_t v1838 = vqrdmulhq_n_s16(v1837, 17121);
+    int16x8_t v1839 = vaddq_s16(v1827, v1838);
+    int16x8_t v1840 = vsubq_s16(v1243, v1250);
+    int16x8_t v1841 = vsubq_s16(v1259, v1267);
+    int16x8_t v1842_tmp = vqrdmulhq_n_s16(v1841, 10045);
+    int16x8_t v1842 = vaddq_s16(v1842_tmp, v1841);
+    int16x8_t v1843 = vaddq_s16(v1840, v1842);
+    int16x8_t v1844 = vsubq_s16(v1278, v1293);
+    int16x8_t v1845 = vsubq_s16(v1296, v1305);
+    int16x8_t v1846_tmp = vqrdmulhq_n_s16(v1845, 10045);
+    int16x8_t v1846 = vaddq_s16(v1846_tmp, v1845);
+    int16x8_t v1847 = vaddq_s16(v1844, v1846);
+    int16x8_t v1848 = vqrdmulhq_n_s16(v1847, 19705);
+    int16x8_t v1849 = vaddq_s16(v1843, v1848);
+    int16x8_t v1850 = vsubq_s16(v1312, v1315);
+    int16x8_t v1851 = vsubq_s16(v1320, v1324);
+    int16x8_t v1852_tmp = vqrdmulhq_n_s16(v1851, 10045);
+    int16x8_t v1852 = vaddq_s16(v1852_tmp, v1851);
+    int16x8_t v1853 = vaddq_s16(v1850, v1852);
+    int16x8_t v1854 = vsubq_s16(v1329, v1332);
+    int16x8_t v1855 = vsubq_s16(v1335, v1342);
+    int16x8_t v1856_tmp = vqrdmulhq_n_s16(v1855, 10045);
+    int16x8_t v1856 = vaddq_s16(v1856_tmp, v1855);
+    int16x8_t v1857 = vaddq_s16(v1854, v1856);
+    int16x8_t v1858 = vqrdmulhq_n_s16(v1857, 19705);
+    int16x8_t v1859 = vaddq_s16(v1853, v1858);
+    int16x8_t v1860 = vqrdmulhq_n_s16(v1859, 17121);
+    int16x8_t v1861 = vaddq_s16(v1849, v1860);
+    int16x8_t v1862 = vqrdmulhq_n_s16(v1861, 16563);
+    int16x8_t v1863 = vaddq_s16(v1839, v1862);
+    int16x8_t v1864 = vsubq_s16(v1353, v1356);
+    int16x8_t v1865 = vsubq_s16(v1361, v1365);
+    int16x8_t v1866_tmp = vqrdmulhq_n_s16(v1865, 10045);
+    int16x8_t v1866 = vaddq_s16(v1866_tmp, v1865);
+    int16x8_t v1867 = vaddq_s16(v1864, v1866);
+    int16x8_t v1868 = vsubq_s16(v1372, v1379);
+    int16x8_t v1869 = vsubq_s16(v1382, v1387);
+    int16x8_t v1870_tmp = vqrdmulhq_n_s16(v1869, 10045);
+    int16x8_t v1870 = vaddq_s16(v1870_tmp, v1869);
+    int16x8_t v1871 = vaddq_s16(v1868, v1870);
+    int16x8_t v1872 = vqrdmulhq_n_s16(v1871, 19705);
+    int16x8_t v1873 = vaddq_s16(v1867, v1872);
+    int16x8_t v1874 = vsubq_s16(v1396, v1403);
+    int16x8_t v1875 = vsubq_s16(v1412, v1420);
+    int16x8_t v1876_tmp = vqrdmulhq_n_s16(v1875, 10045);
+    int16x8_t v1876 = vaddq_s16(v1876_tmp, v1875);
+    int16x8_t v1877 = vaddq_s16(v1874, v1876);
+    int16x8_t v1878 = vsubq_s16(v1425, v1428);
+    int16x8_t v1879 = vsubq_s16(v1431, v1437);
+    int16x8_t v1880_tmp = vqrdmulhq_n_s16(v1879, 10045);
+    int16x8_t v1880 = vaddq_s16(v1880_tmp, v1879);
+    int16x8_t v1881 = vaddq_s16(v1878, v1880);
+    int16x8_t v1882 = vqrdmulhq_n_s16(v1881, 19705);
+    int16x8_t v1883 = vaddq_s16(v1877, v1882);
+    int16x8_t v1884 = vqrdmulhq_n_s16(v1883, 17121);
+    int16x8_t v1885 = vaddq_s16(v1873, v1884);
+    int16x8_t v1886 = vsubq_s16(v1446, v1449);
+    int16x8_t v1887 = vsubq_s16(v1454, v1458);
+    int16x8_t v1888_tmp = vqrdmulhq_n_s16(v1887, 10045);
+    int16x8_t v1888 = vaddq_s16(v1888_tmp, v1887);
+    int16x8_t v1889 = vaddq_s16(v1886, v1888);
+    int16x8_t v1890 = vsubq_s16(v1465, v1472);
+    int16x8_t v1891 = vsubq_s16(v1475, v1480);
+    int16x8_t v1892_tmp = vqrdmulhq_n_s16(v1891, 10045);
+    int16x8_t v1892 = vaddq_s16(v1892_tmp, v1891);
+    int16x8_t v1893 = vaddq_s16(v1890, v1892);
+    int16x8_t v1894 = vqrdmulhq_n_s16(v1893, 19705);
+    int16x8_t v1895 = vaddq_s16(v1889, v1894);
+    int16x8_t v1896 = vsubq_s16(v1487, v1490);
+    int16x8_t v1897 = vsubq_s16(v1495, v1499);
+    int16x8_t v1898_tmp = vqrdmulhq_n_s16(v1897, 10045);
+    int16x8_t v1898 = vaddq_s16(v1898_tmp, v1897);
+    int16x8_t v1899 = vaddq_s16(v1896, v1898);
+    int16x8_t v1900 = vsubq_s16(v1504, v1507);
+    int16x8_t v1901 = vsubq_s16(v1510, v1518);
+    int16x8_t v1902_tmp = vqrdmulhq_n_s16(v1901, 10045);
+    int16x8_t v1902 = vaddq_s16(v1902_tmp, v1901);
+    int16x8_t v1903 = vaddq_s16(v1900, v1902);
+    int16x8_t v1904 = vqrdmulhq_n_s16(v1903, 19705);
+    int16x8_t v1905 = vaddq_s16(v1899, v1904);
+    int16x8_t v1906 = vqrdmulhq_n_s16(v1905, 17121);
+    int16x8_t v1907 = vaddq_s16(v1895, v1906);
+    int16x8_t v1908 = vqrdmulhq_n_s16(v1907, 16563);
+    int16x8_t v1909 = vaddq_s16(v1885, v1908);
+    int16x8_t v1910 = vqrdmulhq_n_s16(v1909, 16429);
+    int16x8_t v1911 = vaddq_s16(v1863, v1910);
+    int16x8_t v1912 = vqrdmulhq_n_s16(v1911, 16395);
+    int16x8_t v1913 = vaddq_s16(v1817, v1912);
+    int16x8_t v1914 = vqrdmulhq_n_s16(v1913, 16387);
+    int16x8_t v1915 = vaddq_s16(v1723, v1914);
+    int16x8_t v1916 = vsubq_s16(v1534, v1536);
+    int16x8_t v1917 = vsubq_s16(v1538, v1540);
+    int16x8_t v1918 = vqrdmulhq_n_s16(v1917, 29490);
+    int16x8_t v1919 = vaddq_s16(v1916, v1918);
+    int16x8_t v1920 = vsubq_s16(v1544, v1546);
+    int16x8_t v1921 = vsubq_s16(v1548, v1550);
+    int16x8_t v1922 = vqrdmulhq_n_s16(v1921, 29490);
+    int16x8_t v1923 = vaddq_s16(v1920, v1922);
+    int16x8_t v1924 = vqrdmulhq_n_s16(v1923, 18578);
+    int16x8_t v1925 = vaddq_s16(v1919, v1924);
+    int16x8_t v1926 = vsubq_s16(v1556, v1558);
+    int16x8_t v1927 = vsubq_s16(v1560, v1562);
+    int16x8_t v1928 = vqrdmulhq_n_s16(v1927, 29490);
+    int16x8_t v1929 = vaddq_s16(v1926, v1928);
+    int16x8_t v1930 = vsubq_s16(v1566, v1568);
+    int16x8_t v1931 = vsubq_s16(v1570, v1572);
+    int16x8_t v1932 = vqrdmulhq_n_s16(v1931, 29490);
+    int16x8_t v1933 = vaddq_s16(v1930, v1932);
+    int16x8_t v1934 = vqrdmulhq_n_s16(v1933, 18578);
+    int16x8_t v1935 = vaddq_s16(v1929, v1934);
+    int16x8_t v1936 = vqrdmulhq_n_s16(v1935, 16890);
+    int16x8_t v1937 = vaddq_s16(v1925, v1936);
+    int16x8_t v1938 = vsubq_s16(v1580, v1582);
+    int16x8_t v1939 = vsubq_s16(v1584, v1586);
+    int16x8_t v1940 = vqrdmulhq_n_s16(v1939, 29490);
+    int16x8_t v1941 = vaddq_s16(v1938, v1940);
+    int16x8_t v1942 = vsubq_s16(v1590, v1592);
+    int16x8_t v1943 = vsubq_s16(v1594, v1596);
+    int16x8_t v1944 = vqrdmulhq_n_s16(v1943, 29490);
+    int16x8_t v1945 = vaddq_s16(v1942, v1944);
+    int16x8_t v1946 = vqrdmulhq_n_s16(v1945, 18578);
+    int16x8_t v1947 = vaddq_s16(v1941, v1946);
+    int16x8_t v1948 = vsubq_s16(v1602, v1604);
+    int16x8_t v1949 = vsubq_s16(v1606, v1608);
+    int16x8_t v1950 = vqrdmulhq_n_s16(v1949, 29490);
+    int16x8_t v1951 = vaddq_s16(v1948, v1950);
+    int16x8_t v1952 = vsubq_s16(v1612, v1614);
+    int16x8_t v1953 = vsubq_s16(v1616, v1618);
+    int16x8_t v1954 = vqrdmulhq_n_s16(v1953, 29490);
+    int16x8_t v1955 = vaddq_s16(v1952, v1954);
+    int16x8_t v1956 = vqrdmulhq_n_s16(v1955, 18578);
+    int16x8_t v1957 = vaddq_s16(v1951, v1956);
+    int16x8_t v1958 = vqrdmulhq_n_s16(v1957, 16890);
+    int16x8_t v1959 = vaddq_s16(v1947, v1958);
+    int16x8_t v1960 = vqrdmulhq_n_s16(v1959, 16508);
+    int16x8_t v1961 = vaddq_s16(v1937, v1960);
+    int16x8_t v1962 = vsubq_s16(v1628, v1630);
+    int16x8_t v1963 = vsubq_s16(v1632, v1634);
+    int16x8_t v1964 = vqrdmulhq_n_s16(v1963, 29490);
+    int16x8_t v1965 = vaddq_s16(v1962, v1964);
+    int16x8_t v1966 = vsubq_s16(v1638, v1640);
+    int16x8_t v1967 = vsubq_s16(v1642, v1644);
+    int16x8_t v1968 = vqrdmulhq_n_s16(v1967, 29490);
+    int16x8_t v1969 = vaddq_s16(v1966, v1968);
+    int16x8_t v1970 = vqrdmulhq_n_s16(v1969, 18578);
+    int16x8_t v1971 = vaddq_s16(v1965, v1970);
+    int16x8_t v1972 = vsubq_s16(v1650, v1652);
+    int16x8_t v1973 = vsubq_s16(v1654, v1656);
+    int16x8_t v1974 = vqrdmulhq_n_s16(v1973, 29490);
+    int16x8_t v1975 = vaddq_s16(v1972, v1974);
+    int16x8_t v1976 = vsubq_s16(v1660, v1662);
+    int16x8_t v1977 = vsubq_s16(v1664, v1666);
+    int16x8_t v1978 = vqrdmulhq_n_s16(v1977, 29490);
+    int16x8_t v1979 = vaddq_s16(v1976, v1978);
+    int16x8_t v1980 = vqrdmulhq_n_s16(v1979, 18578);
+    int16x8_t v1981 = vaddq_s16(v1975, v1980);
+    int16x8_t v1982 = vqrdmulhq_n_s16(v1981, 16890);
+    int16x8_t v1983 = vaddq_s16(v1971, v1982);
+    int16x8_t v1984 = vsubq_s16(v1674, v1676);
+    int16x8_t v1985 = vsubq_s16(v1678, v1680);
+    int16x8_t v1986 = vqrdmulhq_n_s16(v1985, 29490);
+    int16x8_t v1987 = vaddq_s16(v1984, v1986);
+    int16x8_t v1988 = vsubq_s16(v1684, v1686);
+    int16x8_t v1989 = vsubq_s16(v1688, v1690);
+    int16x8_t v1990 = vqrdmulhq_n_s16(v1989, 29490);
+    int16x8_t v1991 = vaddq_s16(v1988, v1990);
+    int16x8_t v1992 = vqrdmulhq_n_s16(v1991, 18578);
+    int16x8_t v1993 = vaddq_s16(v1987, v1992);
+    int16x8_t v1994 = vsubq_s16(v1696, v1698);
+    int16x8_t v1995 = vsubq_s16(v1700, v1702);
+    int16x8_t v1996 = vqrdmulhq_n_s16(v1995, 29490);
+    int16x8_t v1997 = vaddq_s16(v1994, v1996);
+    int16x8_t v1998 = vsubq_s16(v1706, v1708);
+    int16x8_t v1999 = vsubq_s16(v1710, v1712);
+    int16x8_t v2000 = vqrdmulhq_n_s16(v1999, 29490);
+    int16x8_t v2001 = vaddq_s16(v1998, v2000);
+    int16x8_t v2002 = vqrdmulhq_n_s16(v2001, 18578);
+    int16x8_t v2003 = vaddq_s16(v1997, v2002);
+    int16x8_t v2004 = vqrdmulhq_n_s16(v2003, 16890);
+    int16x8_t v2005 = vaddq_s16(v1993, v2004);
+    int16x8_t v2006 = vqrdmulhq_n_s16(v2005, 16508);
+    int16x8_t v2007 = vaddq_s16(v1983, v2006);
+    int16x8_t v2008 = vqrdmulhq_n_s16(v2007, 16415);
+    int16x8_t v2009 = vaddq_s16(v1961, v2008);
+    int16x8_t v2010 = vsubq_s16(v1724, v1726);
+    int16x8_t v2011 = vsubq_s16(v1728, v1730);
+    int16x8_t v2012 = vqrdmulhq_n_s16(v2011, 29490);
+    int16x8_t v2013 = vaddq_s16(v2010, v2012);
+    int16x8_t v2014 = vsubq_s16(v1734, v1736);
+    int16x8_t v2015 = vsubq_s16(v1738, v1740);
+    int16x8_t v2016 = vqrdmulhq_n_s16(v2015, 29490);
+    int16x8_t v2017 = vaddq_s16(v2014, v2016);
+    int16x8_t v2018 = vqrdmulhq_n_s16(v2017, 18578);
+    int16x8_t v2019 = vaddq_s16(v2013, v2018);
+    int16x8_t v2020 = vsubq_s16(v1746, v1748);
+    int16x8_t v2021 = vsubq_s16(v1750, v1752);
+    int16x8_t v2022 = vqrdmulhq_n_s16(v2021, 29490);
+    int16x8_t v2023 = vaddq_s16(v2020, v2022);
+    int16x8_t v2024 = vsubq_s16(v1756, v1758);
+    int16x8_t v2025 = vsubq_s16(v1760, v1762);
+    int16x8_t v2026 = vqrdmulhq_n_s16(v2025, 29490);
+    int16x8_t v2027 = vaddq_s16(v2024, v2026);
+    int16x8_t v2028 = vqrdmulhq_n_s16(v2027, 18578);
+    int16x8_t v2029 = vaddq_s16(v2023, v2028);
+    int16x8_t v2030 = vqrdmulhq_n_s16(v2029, 16890);
+    int16x8_t v2031 = vaddq_s16(v2019, v2030);
+    int16x8_t v2032 = vsubq_s16(v1770, v1772);
+    int16x8_t v2033 = vsubq_s16(v1774, v1776);
+    int16x8_t v2034 = vqrdmulhq_n_s16(v2033, 29490);
+    int16x8_t v2035 = vaddq_s16(v2032, v2034);
+    int16x8_t v2036 = vsubq_s16(v1780, v1782);
+    int16x8_t v2037 = vsubq_s16(v1784, v1786);
+    int16x8_t v2038 = vqrdmulhq_n_s16(v2037, 29490);
+    int16x8_t v2039 = vaddq_s16(v2036, v2038);
+    int16x8_t v2040 = vqrdmulhq_n_s16(v2039, 18578);
+    int16x8_t v2041 = vaddq_s16(v2035, v2040);
+    int16x8_t v2042 = vsubq_s16(v1792, v1794);
+    int16x8_t v2043 = vsubq_s16(v1796, v1798);
+    int16x8_t v2044 = vqrdmulhq_n_s16(v2043, 29490);
+    int16x8_t v2045 = vaddq_s16(v2042, v2044);
+    int16x8_t v2046 = vsubq_s16(v1802, v1804);
+    int16x8_t v2047 = vsubq_s16(v1806, v1808);
+    int16x8_t v2048 = vqrdmulhq_n_s16(v2047, 29490);
+    int16x8_t v2049 = vaddq_s16(v2046, v2048);
+    int16x8_t v2050 = vqrdmulhq_n_s16(v2049, 18578);
+    int16x8_t v2051 = vaddq_s16(v2045, v2050);
+    int16x8_t v2052 = vqrdmulhq_n_s16(v2051, 16890);
+    int16x8_t v2053 = vaddq_s16(v2041, v2052);
+    int16x8_t v2054 = vqrdmulhq_n_s16(v2053, 16508);
+    int16x8_t v2055 = vaddq_s16(v2031, v2054);
+    int16x8_t v2056 = vsubq_s16(v1818, v1820);
+    int16x8_t v2057 = vsubq_s16(v1822, v1824);
+    int16x8_t v2058 = vqrdmulhq_n_s16(v2057, 29490);
+    int16x8_t v2059 = vaddq_s16(v2056, v2058);
+    int16x8_t v2060 = vsubq_s16(v1828, v1830);
+    int16x8_t v2061 = vsubq_s16(v1832, v1834);
+    int16x8_t v2062 = vqrdmulhq_n_s16(v2061, 29490);
+    int16x8_t v2063 = vaddq_s16(v2060, v2062);
+    int16x8_t v2064 = vqrdmulhq_n_s16(v2063, 18578);
+    int16x8_t v2065 = vaddq_s16(v2059, v2064);
+    int16x8_t v2066 = vsubq_s16(v1840, v1842);
+    int16x8_t v2067 = vsubq_s16(v1844, v1846);
+    int16x8_t v2068 = vqrdmulhq_n_s16(v2067, 29490);
+    int16x8_t v2069 = vaddq_s16(v2066, v2068);
+    int16x8_t v2070 = vsubq_s16(v1850, v1852);
+    int16x8_t v2071 = vqrdmulhq_n_s16(v2070, 18578);
+    int16x8_t v2072 = vsubq_s16(v1854, v1856);
+    int16x8_t v2073 = vqrdmulhq_n_s16(v2072, 16719);
+    int16x8_t v2074 = vaddq_s16(v2071, v2073);
+    int16x8_t v2075 = vaddq_s16(v2069, v2074);
+    int16x8_t v2076 = vqrdmulhq_n_s16(v2075, 16890);
+    int16x8_t v2077 = vaddq_s16(v2065, v2076);
+    int16x8_t v2078 = vsubq_s16(v1864, v1866);
+    int16x8_t v2079 = vsubq_s16(v1868, v1870);
+    int16x8_t v2080 = vqrdmulhq_n_s16(v2079, 29490);
+    int16x8_t v2081 = vaddq_s16(v2078, v2080);
+    int16x8_t v2082 = vsubq_s16(v1874, v1876);
+    int16x8_t v2083 = vsubq_s16(v1878, v1880);
+    int16x8_t v2084 = vqrdmulhq_n_s16(v2083, 29490);
+    int16x8_t v2085 = vaddq_s16(v2082, v2084);
+    int16x8_t v2086 = vqrdmulhq_n_s16(v2085, 18578);
+    int16x8_t v2087 = vaddq_s16(v2081, v2086);
+    int16x8_t v2088 = vsubq_s16(v1886, v1888);
+    int16x8_t v2089 = vsubq_s16(v1890, v1892);
+    int16x8_t v2090 = vqrdmulhq_n_s16(v2089, 29490);
+    int16x8_t v2091 = vaddq_s16(v2088, v2090);
+    int16x8_t v2092 = vsubq_s16(v1896, v1898);
+    int16x8_t v2093 = vsubq_s16(v1900, v1902);
+    int16x8_t v2094 = vqrdmulhq_n_s16(v2093, 29490);
+    int16x8_t v2095 = vaddq_s16(v2092, v2094);
+    int16x8_t v2096 = vqrdmulhq_n_s16(v2095, 18578);
+    int16x8_t v2097 = vaddq_s16(v2091, v2096);
+    int16x8_t v2098 = vqrdmulhq_n_s16(v2097, 16890);
+    int16x8_t v2099 = vaddq_s16(v2087, v2098);
+    int16x8_t v2100 = vqrdmulhq_n_s16(v2099, 16508);
+    int16x8_t v2101 = vaddq_s16(v2077, v2100);
+    int16x8_t v2102 = vqrdmulhq_n_s16(v2101, 16415);
+    int16x8_t v2103 = vaddq_s16(v2055, v2102);
+    int16x8_t v2104 = vqrdmulhq_n_s16(v2103, 16392);
+    int16x8_t v2105 = vaddq_s16(v2009, v2104);
+    int16x8_t v2106 = vsubq_s16(v2, v8);
+    int16x8_t v2107 = vsubq_s16(v15, v22);
+    int16x8_t v2108_tmp = vqrdmulhq_n_s16(v2107, 18446);
+    int16x8_t v2108 = vmlaq_n_s16(v2108_tmp, v2107, 2);
+    int16x8_t v2109 = vaddq_s16(v2106, v2108);
+    int16x8_t v2110 = vsubq_s16(v31, v41);
+    int16x8_t v2111 = vsubq_s16(v48, v56);
+    int16x8_t v2112_tmp = vqrdmulhq_n_s16(v2111, 18446);
+    int16x8_t v2112 = vmlaq_n_s16(v2112_tmp, v2111, 2);
+    int16x8_t v2113 = vaddq_s16(v2110, v2112);
+    int16x8_t v2114 = vqrdmulhq_n_s16(v2113, 21195);
+    int16x8_t v2115 = vaddq_s16(v2109, v2114);
+    int16x8_t v2116 = vsubq_s16(v67, v77);
+    int16x8_t v2117 = vsubq_s16(v90, v99);
+    int16x8_t v2118_tmp = vqrdmulhq_n_s16(v2117, 18446);
+    int16x8_t v2118 = vmlaq_n_s16(v2118_tmp, v2117, 2);
+    int16x8_t v2119 = vaddq_s16(v2116, v2118);
+    int16x8_t v2120 = vsubq_s16(v108, v118);
+    int16x8_t v2121 = vsubq_s16(v125, v134);
+    int16x8_t v2122_tmp = vqrdmulhq_n_s16(v2121, 18446);
+    int16x8_t v2122 = vmlaq_n_s16(v2122_tmp, v2121, 2);
+    int16x8_t v2123 = vaddq_s16(v2120, v2122);
+    int16x8_t v2124 = vqrdmulhq_n_s16(v2123, 21195);
+    int16x8_t v2125 = vaddq_s16(v2119, v2124);
+    int16x8_t v2126 = vqrdmulhq_n_s16(v2125, 17401);
+    int16x8_t v2127 = vaddq_s16(v2115, v2126);
+    int16x8_t v2128 = vsubq_s16(v147, v157);
+    int16x8_t v2129 = vsubq_s16(v170, v179);
+    int16x8_t v2130_tmp = vqrdmulhq_n_s16(v2129, 18446);
+    int16x8_t v2130 = vmlaq_n_s16(v2130_tmp, v2129, 2);
+    int16x8_t v2131 = vaddq_s16(v2128, v2130);
+    int16x8_t v2132 = vsubq_s16(v194, v212);
+    int16x8_t v2133 = vsubq_s16(v219, v229);
+    int16x8_t v2134_tmp = vqrdmulhq_n_s16(v2133, 18446);
+    int16x8_t v2134 = vmlaq_n_s16(v2134_tmp, v2133, 2);
+    int16x8_t v2135 = vaddq_s16(v2132, v2134);
+    int16x8_t v2136 = vqrdmulhq_n_s16(v2135, 21195);
+    int16x8_t v2137 = vaddq_s16(v2131, v2136);
+    int16x8_t v2138 = vsubq_s16(v240, v250);
+    int16x8_t v2139 = vsubq_s16(v263, v272);
+    int16x8_t v2140_tmp = vqrdmulhq_n_s16(v2139, 18446);
+    int16x8_t v2140 = vmlaq_n_s16(v2140_tmp, v2139, 2);
+    int16x8_t v2141 = vaddq_s16(v2138, v2140);
+    int16x8_t v2142 = vsubq_s16(v281, v291);
+    int16x8_t v2143 = vsubq_s16(v298, v308);
+    int16x8_t v2144_tmp = vqrdmulhq_n_s16(v2143, 18446);
+    int16x8_t v2144 = vmlaq_n_s16(v2144_tmp, v2143, 2);
+    int16x8_t v2145 = vaddq_s16(v2142, v2144);
+    int16x8_t v2146 = vqrdmulhq_n_s16(v2145, 21195);
+    int16x8_t v2147 = vaddq_s16(v2141, v2146);
+    int16x8_t v2148 = vqrdmulhq_n_s16(v2147, 17401);
+    int16x8_t v2149 = vaddq_s16(v2137, v2148);
+    int16x8_t v2150 = vqrdmulhq_n_s16(v2149, 16629);
+    int16x8_t v2151 = vaddq_s16(v2127, v2150);
+    int16x8_t v2152 = vsubq_s16(v323, v333);
+    int16x8_t v2153 = vsubq_s16(v346, v355);
+    int16x8_t v2154_tmp = vqrdmulhq_n_s16(v2153, 18446);
+    int16x8_t v2154 = vmlaq_n_s16(v2154_tmp, v2153, 2);
+    int16x8_t v2155 = vaddq_s16(v2152, v2154);
+    int16x8_t v2156 = vsubq_s16(v370, v388);
+    int16x8_t v2157 = vsubq_s16(v395, v405);
+    int16x8_t v2158_tmp = vqrdmulhq_n_s16(v2157, 18446);
+    int16x8_t v2158 = vmlaq_n_s16(v2158_tmp, v2157, 2);
+    int16x8_t v2159 = vaddq_s16(v2156, v2158);
+    int16x8_t v2160 = vqrdmulhq_n_s16(v2159, 21195);
+    int16x8_t v2161 = vaddq_s16(v2155, v2160);
+    int16x8_t v2162 = vsubq_s16(v422, v440);
+    int16x8_t v2163 = vsubq_s16(v465, v478);
+    int16x8_t v2164_tmp = vqrdmulhq_n_s16(v2163, 18446);
+    int16x8_t v2164 = vmlaq_n_s16(v2164_tmp, v2163, 2);
+    int16x8_t v2165 = vaddq_s16(v2162, v2164);
+    int16x8_t v2166 = vsubq_s16(v487, v497);
+    int16x8_t v2167 = vsubq_s16(v504, v515);
+    int16x8_t v2168_tmp = vqrdmulhq_n_s16(v2167, 18446);
+    int16x8_t v2168 = vmlaq_n_s16(v2168_tmp, v2167, 2);
+    int16x8_t v2169 = vaddq_s16(v2166, v2168);
+    int16x8_t v2170 = vqrdmulhq_n_s16(v2169, 21195);
+    int16x8_t v2171 = vaddq_s16(v2165, v2170);
+    int16x8_t v2172 = vqrdmulhq_n_s16(v2171, 17401);
+    int16x8_t v2173 = vaddq_s16(v2161, v2172);
+    int16x8_t v2174 = vsubq_s16(v528, v538);
+    int16x8_t v2175 = vsubq_s16(v551, v560);
+    int16x8_t v2176_tmp = vqrdmulhq_n_s16(v2175, 18446);
+    int16x8_t v2176 = vmlaq_n_s16(v2176_tmp, v2175, 2);
+    int16x8_t v2177 = vaddq_s16(v2174, v2176);
+    int16x8_t v2178 = vsubq_s16(v575, v593);
+    int16x8_t v2179 = vsubq_s16(v600, v610);
+    int16x8_t v2180_tmp = vqrdmulhq_n_s16(v2179, 18446);
+    int16x8_t v2180 = vmlaq_n_s16(v2180_tmp, v2179, 2);
+    int16x8_t v2181 = vaddq_s16(v2178, v2180);
+    int16x8_t v2182 = vqrdmulhq_n_s16(v2181, 21195);
+    int16x8_t v2183 = vaddq_s16(v2177, v2182);
+    int16x8_t v2184 = vsubq_s16(v621, v631);
+    int16x8_t v2185 = vsubq_s16(v644, v653);
+    int16x8_t v2186_tmp = vqrdmulhq_n_s16(v2185, 18446);
+    int16x8_t v2186 = vmlaq_n_s16(v2186_tmp, v2185, 2);
+    int16x8_t v2187 = vaddq_s16(v2184, v2186);
+    int16x8_t v2188 = vsubq_s16(v662, v672);
+    int16x8_t v2189 = vsubq_s16(v679, v690);
+    int16x8_t v2190_tmp = vqrdmulhq_n_s16(v2189, 18446);
+    int16x8_t v2190 = vmlaq_n_s16(v2190_tmp, v2189, 2);
+    int16x8_t v2191 = vaddq_s16(v2188, v2190);
+    int16x8_t v2192 = vqrdmulhq_n_s16(v2191, 21195);
+    int16x8_t v2193 = vaddq_s16(v2187, v2192);
+    int16x8_t v2194 = vqrdmulhq_n_s16(v2193, 17401);
+    int16x8_t v2195 = vaddq_s16(v2183, v2194);
+    int16x8_t v2196 = vqrdmulhq_n_s16(v2195, 16629);
+    int16x8_t v2197 = vaddq_s16(v2173, v2196);
+    int16x8_t v2198 = vqrdmulhq_n_s16(v2197, 16445);
+    int16x8_t v2199 = vaddq_s16(v2151, v2198);
+    int16x8_t v2200 = vsubq_s16(v707, v717);
+    int16x8_t v2201 = vsubq_s16(v730, v739);
+    int16x8_t v2202_tmp = vqrdmulhq_n_s16(v2201, 18446);
+    int16x8_t v2202 = vmlaq_n_s16(v2202_tmp, v2201, 2);
+    int16x8_t v2203 = vaddq_s16(v2200, v2202);
+    int16x8_t v2204 = vsubq_s16(v754, v772);
+    int16x8_t v2205 = vsubq_s16(v779, v789);
+    int16x8_t v2206_tmp = vqrdmulhq_n_s16(v2205, 18446);
+    int16x8_t v2206 = vmlaq_n_s16(v2206_tmp, v2205, 2);
+    int16x8_t v2207 = vaddq_s16(v2204, v2206);
+    int16x8_t v2208 = vqrdmulhq_n_s16(v2207, 21195);
+    int16x8_t v2209 = vaddq_s16(v2203, v2208);
+    int16x8_t v2210 = vsubq_s16(v806, v824);
+    int16x8_t v2211 = vsubq_s16(v849, v862);
+    int16x8_t v2212_tmp = vqrdmulhq_n_s16(v2211, 18446);
+    int16x8_t v2212 = vmlaq_n_s16(v2212_tmp, v2211, 2);
+    int16x8_t v2213 = vaddq_s16(v2210, v2212);
+    int16x8_t v2214 = vsubq_s16(v871, v881);
+    int16x8_t v2215 = vsubq_s16(v888, v899);
+    int16x8_t v2216_tmp = vqrdmulhq_n_s16(v2215, 18446);
+    int16x8_t v2216 = vmlaq_n_s16(v2216_tmp, v2215, 2);
+    int16x8_t v2217 = vaddq_s16(v2214, v2216);
+    int16x8_t v2218 = vqrdmulhq_n_s16(v2217, 21195);
+    int16x8_t v2219 = vaddq_s16(v2213, v2218);
+    int16x8_t v2220 = vqrdmulhq_n_s16(v2219, 17401);
+    int16x8_t v2221 = vaddq_s16(v2209, v2220);
+    int16x8_t v2222 = vsubq_s16(v918, v936);
+    int16x8_t v2223 = vsubq_s16(v961, v974);
+    int16x8_t v2224_tmp = vqrdmulhq_n_s16(v2223, 18446);
+    int16x8_t v2224 = vmlaq_n_s16(v2224_tmp, v2223, 2);
+    int16x8_t v2225 = vaddq_s16(v2222, v2224);
+    int16x8_t v2226 = vsubq_s16(v1001, v1035);
+    int16x8_t v2227 = vsubq_s16(v1042, v1056);
+    int16x8_t v2228_tmp = vqrdmulhq_n_s16(v2227, 18446);
+    int16x8_t v2228 = vmlaq_n_s16(v2228_tmp, v2227, 2);
+    int16x8_t v2229 = vaddq_s16(v2226, v2228);
+    int16x8_t v2230 = vqrdmulhq_n_s16(v2229, 21195);
+    int16x8_t v2231 = vaddq_s16(v2225, v2230);
+    int16x8_t v2232 = vsubq_s16(v1067, v1077);
+    int16x8_t v2233 = vsubq_s16(v1090, v1099);
+    int16x8_t v2234_tmp = vqrdmulhq_n_s16(v2233, 18446);
+    int16x8_t v2234 = vmlaq_n_s16(v2234_tmp, v2233, 2);
+    int16x8_t v2235 = vaddq_s16(v2232, v2234);
+    int16x8_t v2236 = vsubq_s16(v1108, v1118);
+    int16x8_t v2237 = vsubq_s16(v1125, v1137);
+    int16x8_t v2238_tmp = vqrdmulhq_n_s16(v2237, 18446);
+    int16x8_t v2238 = vmlaq_n_s16(v2238_tmp, v2237, 2);
+    int16x8_t v2239 = vaddq_s16(v2236, v2238);
+    int16x8_t v2240 = vqrdmulhq_n_s16(v2239, 21195);
+    int16x8_t v2241 = vaddq_s16(v2235, v2240);
+    int16x8_t v2242 = vqrdmulhq_n_s16(v2241, 17401);
+    int16x8_t v2243 = vaddq_s16(v2231, v2242);
+    int16x8_t v2244 = vqrdmulhq_n_s16(v2243, 16629);
+    int16x8_t v2245 = vaddq_s16(v2221, v2244);
+    int16x8_t v2246 = vsubq_s16(v1152, v1162);
+    int16x8_t v2247 = vsubq_s16(v1175, v1184);
+    int16x8_t v2248_tmp = vqrdmulhq_n_s16(v2247, 18446);
+    int16x8_t v2248 = vmlaq_n_s16(v2248_tmp, v2247, 2);
+    int16x8_t v2249 = vaddq_s16(v2246, v2248);
+    int16x8_t v2250 = vsubq_s16(v1199, v1217);
+    int16x8_t v2251 = vsubq_s16(v1224, v1234);
+    int16x8_t v2252_tmp = vqrdmulhq_n_s16(v2251, 18446);
+    int16x8_t v2252 = vmlaq_n_s16(v2252_tmp, v2251, 2);
+    int16x8_t v2253 = vaddq_s16(v2250, v2252);
+    int16x8_t v2254 = vqrdmulhq_n_s16(v2253, 21195);
+    int16x8_t v2255 = vaddq_s16(v2249, v2254);
+    int16x8_t v2256 = vsubq_s16(v1251, v1269);
+    int16x8_t v2257 = vsubq_s16(v1294, v1307);
+    int16x8_t v2258_tmp = vqrdmulhq_n_s16(v2257, 18446);
+    int16x8_t v2258 = vmlaq_n_s16(v2258_tmp, v2257, 2);
+    int16x8_t v2259 = vaddq_s16(v2256, v2258);
+    int16x8_t v2260 = vsubq_s16(v1316, v1326);
+    int16x8_t v2261 = vsubq_s16(v1333, v1344);
+    int16x8_t v2262_tmp = vqrdmulhq_n_s16(v2261, 18446);
+    int16x8_t v2262 = vmlaq_n_s16(v2262_tmp, v2261, 2);
+    int16x8_t v2263 = vaddq_s16(v2260, v2262);
+    int16x8_t v2264 = vqrdmulhq_n_s16(v2263, 21195);
+    int16x8_t v2265 = vaddq_s16(v2259, v2264);
+    int16x8_t v2266 = vqrdmulhq_n_s16(v2265, 17401);
+    int16x8_t v2267 = vaddq_s16(v2255, v2266);
+    int16x8_t v2268 = vsubq_s16(v1357, v1367);
+    int16x8_t v2269 = vsubq_s16(v1380, v1389);
+    int16x8_t v2270_tmp = vqrdmulhq_n_s16(v2269, 18446);
+    int16x8_t v2270 = vmlaq_n_s16(v2270_tmp, v2269, 2);
+    int16x8_t v2271 = vaddq_s16(v2268, v2270);
+    int16x8_t v2272 = vsubq_s16(v1404, v1422);
+    int16x8_t v2273 = vsubq_s16(v1429, v1439);
+    int16x8_t v2274_tmp = vqrdmulhq_n_s16(v2273, 18446);
+    int16x8_t v2274 = vmlaq_n_s16(v2274_tmp, v2273, 2);
+    int16x8_t v2275 = vaddq_s16(v2272, v2274);
+    int16x8_t v2276 = vqrdmulhq_n_s16(v2275, 21195);
+    int16x8_t v2277 = vaddq_s16(v2271, v2276);
+    int16x8_t v2278 = vsubq_s16(v1450, v1460);
+    int16x8_t v2279 = vsubq_s16(v1473, v1482);
+    int16x8_t v2280_tmp = vqrdmulhq_n_s16(v2279, 18446);
+    int16x8_t v2280 = vmlaq_n_s16(v2280_tmp, v2279, 2);
+    int16x8_t v2281 = vaddq_s16(v2278, v2280);
+    int16x8_t v2282 = vsubq_s16(v1491, v1501);
+    int16x8_t v2283 = vsubq_s16(v1508, v1520);
+    int16x8_t v2284_tmp = vqrdmulhq_n_s16(v2283, 18446);
+    int16x8_t v2284 = vmlaq_n_s16(v2284_tmp, v2283, 2);
+    int16x8_t v2285 = vaddq_s16(v2282, v2284);
+    int16x8_t v2286 = vqrdmulhq_n_s16(v2285, 21195);
+    int16x8_t v2287 = vaddq_s16(v2281, v2286);
+    int16x8_t v2288 = vqrdmulhq_n_s16(v2287, 17401);
+    int16x8_t v2289 = vaddq_s16(v2277, v2288);
+    int16x8_t v2290 = vqrdmulhq_n_s16(v2289, 16629);
+    int16x8_t v2291 = vaddq_s16(v2267, v2290);
+    int16x8_t v2292 = vqrdmulhq_n_s16(v2291, 16445);
+    int16x8_t v2293 = vaddq_s16(v2245, v2292);
+    int16x8_t v2294 = vqrdmulhq_n_s16(v2293, 16399);
+    int16x8_t v2295 = vaddq_s16(v2199, v2294);
+    int16x8_t v2296 = vsubq_s16(v2106, v2108);
+    int16x8_t v2297 = vsubq_s16(v2110, v2112);
+    int16x8_t v2298 = vqrdmulhq_n_s16(v2297, 25826);
+    int16x8_t v2299 = vaddq_s16(v2296, v2298);
+    int16x8_t v2300 = vsubq_s16(v2116, v2118);
+    int16x8_t v2301 = vsubq_s16(v2120, v2122);
+    int16x8_t v2302 = vqrdmulhq_n_s16(v2301, 25826);
+    int16x8_t v2303 = vaddq_s16(v2300, v2302);
+    int16x8_t v2304 = vqrdmulhq_n_s16(v2303, 18124);
+    int16x8_t v2305 = vaddq_s16(v2299, v2304);
+    int16x8_t v2306 = vsubq_s16(v2128, v2130);
+    int16x8_t v2307 = vsubq_s16(v2132, v2134);
+    int16x8_t v2308 = vqrdmulhq_n_s16(v2307, 25826);
+    int16x8_t v2309 = vaddq_s16(v2306, v2308);
+    int16x8_t v2310 = vsubq_s16(v2138, v2140);
+    int16x8_t v2311 = vsubq_s16(v2142, v2144);
+    int16x8_t v2312 = vqrdmulhq_n_s16(v2311, 25826);
+    int16x8_t v2313 = vaddq_s16(v2310, v2312);
+    int16x8_t v2314 = vqrdmulhq_n_s16(v2313, 18124);
+    int16x8_t v2315 = vaddq_s16(v2309, v2314);
+    int16x8_t v2316 = vqrdmulhq_n_s16(v2315, 16792);
+    int16x8_t v2317 = vaddq_s16(v2305, v2316);
+    int16x8_t v2318 = vsubq_s16(v2152, v2154);
+    int16x8_t v2319 = vsubq_s16(v2156, v2158);
+    int16x8_t v2320 = vqrdmulhq_n_s16(v2319, 25826);
+    int16x8_t v2321 = vaddq_s16(v2318, v2320);
+    int16x8_t v2322 = vsubq_s16(v2162, v2164);
+    int16x8_t v2323 = vsubq_s16(v2166, v2168);
+    int16x8_t v2324 = vqrdmulhq_n_s16(v2323, 25826);
+    int16x8_t v2325 = vaddq_s16(v2322, v2324);
+    int16x8_t v2326 = vqrdmulhq_n_s16(v2325, 18124);
+    int16x8_t v2327 = vaddq_s16(v2321, v2326);
+    int16x8_t v2328 = vsubq_s16(v2174, v2176);
+    int16x8_t v2329 = vsubq_s16(v2178, v2180);
+    int16x8_t v2330 = vqrdmulhq_n_s16(v2329, 25826);
+    int16x8_t v2331 = vaddq_s16(v2328, v2330);
+    int16x8_t v2332 = vsubq_s16(v2184, v2186);
+    int16x8_t v2333 = vsubq_s16(v2188, v2190);
+    int16x8_t v2334 = vqrdmulhq_n_s16(v2333, 25826);
+    int16x8_t v2335 = vaddq_s16(v2332, v2334);
+    int16x8_t v2336 = vqrdmulhq_n_s16(v2335, 18124);
+    int16x8_t v2337 = vaddq_s16(v2331, v2336);
+    int16x8_t v2338 = vqrdmulhq_n_s16(v2337, 16792);
+    int16x8_t v2339 = vaddq_s16(v2327, v2338);
+    int16x8_t v2340 = vqrdmulhq_n_s16(v2339, 16484);
+    int16x8_t v2341 = vaddq_s16(v2317, v2340);
+    int16x8_t v2342 = vsubq_s16(v2200, v2202);
+    int16x8_t v2343 = vsubq_s16(v2204, v2206);
+    int16x8_t v2344 = vqrdmulhq_n_s16(v2343, 25826);
+    int16x8_t v2345 = vaddq_s16(v2342, v2344);
+    int16x8_t v2346 = vsubq_s16(v2210, v2212);
+    int16x8_t v2347 = vsubq_s16(v2214, v2216);
+    int16x8_t v2348 = vqrdmulhq_n_s16(v2347, 25826);
+    int16x8_t v2349 = vaddq_s16(v2346, v2348);
+    int16x8_t v2350 = vqrdmulhq_n_s16(v2349, 18124);
+    int16x8_t v2351 = vaddq_s16(v2345, v2350);
+    int16x8_t v2352 = vsubq_s16(v2222, v2224);
+    int16x8_t v2353 = vsubq_s16(v2226, v2228);
+    int16x8_t v2354 = vqrdmulhq_n_s16(v2353, 25826);
+    int16x8_t v2355 = vaddq_s16(v2352, v2354);
+    int16x8_t v2356 = vsubq_s16(v2232, v2234);
+    int16x8_t v2357 = vsubq_s16(v2236, v2238);
+    int16x8_t v2358 = vqrdmulhq_n_s16(v2357, 25826);
+    int16x8_t v2359 = vaddq_s16(v2356, v2358);
+    int16x8_t v2360 = vqrdmulhq_n_s16(v2359, 18124);
+    int16x8_t v2361 = vaddq_s16(v2355, v2360);
+    int16x8_t v2362 = vqrdmulhq_n_s16(v2361, 16792);
+    int16x8_t v2363 = vaddq_s16(v2351, v2362);
+    int16x8_t v2364 = vsubq_s16(v2246, v2248);
+    int16x8_t v2365 = vsubq_s16(v2250, v2252);
+    int16x8_t v2366 = vqrdmulhq_n_s16(v2365, 25826);
+    int16x8_t v2367 = vaddq_s16(v2364, v2366);
+    int16x8_t v2368 = vsubq_s16(v2256, v2258);
+    int16x8_t v2369 = vsubq_s16(v2260, v2262);
+    int16x8_t v2370 = vqrdmulhq_n_s16(v2369, 25826);
+    int16x8_t v2371 = vaddq_s16(v2368, v2370);
+    int16x8_t v2372 = vqrdmulhq_n_s16(v2371, 18124);
+    int16x8_t v2373 = vaddq_s16(v2367, v2372);
+    int16x8_t v2374 = vsubq_s16(v2268, v2270);
+    int16x8_t v2375 = vsubq_s16(v2272, v2274);
+    int16x8_t v2376 = vqrdmulhq_n_s16(v2375, 25826);
+    int16x8_t v2377 = vaddq_s16(v2374, v2376);
+    int16x8_t v2378 = vsubq_s16(v2278, v2280);
+    int16x8_t v2379 = vsubq_s16(v2282, v2284);
+    int16x8_t v2380 = vqrdmulhq_n_s16(v2379, 25826);
+    int16x8_t v2381 = vaddq_s16(v2378, v2380);
+    int16x8_t v2382 = vqrdmulhq_n_s16(v2381, 18124);
+    int16x8_t v2383 = vaddq_s16(v2377, v2382);
+    int16x8_t v2384 = vqrdmulhq_n_s16(v2383, 16792);
+    int16x8_t v2385 = vaddq_s16(v2373, v2384);
+    int16x8_t v2386 = vqrdmulhq_n_s16(v2385, 16484);
+    int16x8_t v2387 = vaddq_s16(v2363, v2386);
+    int16x8_t v2388 = vqrdmulhq_n_s16(v2387, 16409);
+    int16x8_t v2389 = vaddq_s16(v2341, v2388);
+    int16x8_t v2390 = vsubq_s16(v1916, v1918);
+    int16x8_t v2391 = vsubq_s16(v1920, v1922);
+    int16x8_t v2392_tmp = vqrdmulhq_n_s16(v2391, 1988);
+    int16x8_t v2392 = vaddq_s16(v2392_tmp, v2391);
+    int16x8_t v2393 = vaddq_s16(v2390, v2392);
+    int16x8_t v2394 = vsubq_s16(v1926, v1928);
+    int16x8_t v2395 = vsubq_s16(v1930, v1932);
+    int16x8_t v2396_tmp = vqrdmulhq_n_s16(v2395, 1988);
+    int16x8_t v2396 = vaddq_s16(v2396_tmp, v2395);
+    int16x8_t v2397 = vaddq_s16(v2394, v2396);
+    int16x8_t v2398 = vqrdmulhq_n_s16(v2397, 19102);
+    int16x8_t v2399 = vaddq_s16(v2393, v2398);
+    int16x8_t v2400 = vsubq_s16(v1938, v1940);
+    int16x8_t v2401 = vsubq_s16(v1942, v1944);
+    int16x8_t v2402_tmp = vqrdmulhq_n_s16(v2401, 1988);
+    int16x8_t v2402 = vaddq_s16(v2402_tmp, v2401);
+    int16x8_t v2403 = vaddq_s16(v2400, v2402);
+    int16x8_t v2404 = vsubq_s16(v1948, v1950);
+    int16x8_t v2405 = vsubq_s16(v1952, v1954);
+    int16x8_t v2406_tmp = vqrdmulhq_n_s16(v2405, 1988);
+    int16x8_t v2406 = vaddq_s16(v2406_tmp, v2405);
+    int16x8_t v2407 = vaddq_s16(v2404, v2406);
+    int16x8_t v2408 = vqrdmulhq_n_s16(v2407, 19102);
+    int16x8_t v2409 = vaddq_s16(v2403, v2408);
+    int16x8_t v2410 = vqrdmulhq_n_s16(v2409, 17000);
+    int16x8_t v2411 = vaddq_s16(v2399, v2410);
+    int16x8_t v2412 = vsubq_s16(v1962, v1964);
+    int16x8_t v2413 = vsubq_s16(v1966, v1968);
+    int16x8_t v2414_tmp = vqrdmulhq_n_s16(v2413, 1988);
+    int16x8_t v2414 = vaddq_s16(v2414_tmp, v2413);
+    int16x8_t v2415 = vaddq_s16(v2412, v2414);
+    int16x8_t v2416 = vsubq_s16(v1972, v1974);
+    int16x8_t v2417 = vsubq_s16(v1976, v1978);
+    int16x8_t v2418_tmp = vqrdmulhq_n_s16(v2417, 1988);
+    int16x8_t v2418 = vaddq_s16(v2418_tmp, v2417);
+    int16x8_t v2419 = vaddq_s16(v2416, v2418);
+    int16x8_t v2420 = vqrdmulhq_n_s16(v2419, 19102);
+    int16x8_t v2421 = vaddq_s16(v2415, v2420);
+    int16x8_t v2422 = vsubq_s16(v1984, v1986);
+    int16x8_t v2423 = vsubq_s16(v1988, v1990);
+    int16x8_t v2424_tmp = vqrdmulhq_n_s16(v2423, 1988);
+    int16x8_t v2424 = vaddq_s16(v2424_tmp, v2423);
+    int16x8_t v2425 = vaddq_s16(v2422, v2424);
+    int16x8_t v2426 = vsubq_s16(v1994, v1996);
+    int16x8_t v2427 = vsubq_s16(v1998, v2000);
+    int16x8_t v2428_tmp = vqrdmulhq_n_s16(v2427, 1988);
+    int16x8_t v2428 = vaddq_s16(v2428_tmp, v2427);
+    int16x8_t v2429 = vaddq_s16(v2426, v2428);
+    int16x8_t v2430 = vqrdmulhq_n_s16(v2429, 19102);
+    int16x8_t v2431 = vaddq_s16(v2425, v2430);
+    int16x8_t v2432 = vqrdmulhq_n_s16(v2431, 17000);
+    int16x8_t v2433 = vaddq_s16(v2421, v2432);
+    int16x8_t v2434 = vqrdmulhq_n_s16(v2433, 16534);
+    int16x8_t v2435 = vaddq_s16(v2411, v2434);
+    int16x8_t v2436 = vsubq_s16(v2010, v2012);
+    int16x8_t v2437 = vsubq_s16(v2014, v2016);
+    int16x8_t v2438_tmp = vqrdmulhq_n_s16(v2437, 1988);
+    int16x8_t v2438 = vaddq_s16(v2438_tmp, v2437);
+    int16x8_t v2439 = vaddq_s16(v2436, v2438);
+    int16x8_t v2440 = vsubq_s16(v2020, v2022);
+    int16x8_t v2441 = vsubq_s16(v2024, v2026);
+    int16x8_t v2442_tmp = vqrdmulhq_n_s16(v2441, 1988);
+    int16x8_t v2442 = vaddq_s16(v2442_tmp, v2441);
+    int16x8_t v2443 = vaddq_s16(v2440, v2442);
+    int16x8_t v2444 = vqrdmulhq_n_s16(v2443, 19102);
+    int16x8_t v2445 = vaddq_s16(v2439, v2444);
+    int16x8_t v2446 = vsubq_s16(v2032, v2034);
+    int16x8_t v2447 = vsubq_s16(v2036, v2038);
+    int16x8_t v2448_tmp = vqrdmulhq_n_s16(v2447, 1988);
+    int16x8_t v2448 = vaddq_s16(v2448_tmp, v2447);
+    int16x8_t v2449 = vaddq_s16(v2446, v2448);
+    int16x8_t v2450 = vsubq_s16(v2042, v2044);
+    int16x8_t v2451 = vsubq_s16(v2046, v2048);
+    int16x8_t v2452_tmp = vqrdmulhq_n_s16(v2451, 1988);
+    int16x8_t v2452 = vaddq_s16(v2452_tmp, v2451);
+    int16x8_t v2453 = vaddq_s16(v2450, v2452);
+    int16x8_t v2454 = vqrdmulhq_n_s16(v2453, 19102);
+    int16x8_t v2455 = vaddq_s16(v2449, v2454);
+    int16x8_t v2456 = vqrdmulhq_n_s16(v2455, 17000);
+    int16x8_t v2457 = vaddq_s16(v2445, v2456);
+    int16x8_t v2458 = vsubq_s16(v2056, v2058);
+    int16x8_t v2459 = vsubq_s16(v2060, v2062);
+    int16x8_t v2460_tmp = vqrdmulhq_n_s16(v2459, 1988);
+    int16x8_t v2460 = vaddq_s16(v2460_tmp, v2459);
+    int16x8_t v2461 = vaddq_s16(v2458, v2460);
+    int16x8_t v2462 = vsubq_s16(v2066, v2068);
+    int16x8_t v2463 = vqrdmulhq_n_s16(v2072, 29490);
+    int16x8_t v2464 = vsubq_s16(v2070, v2463);
+    int16x8_t v2465_tmp = vqrdmulhq_n_s16(v2464, 1988);
+    int16x8_t v2465 = vaddq_s16(v2465_tmp, v2464);
+    int16x8_t v2466 = vaddq_s16(v2462, v2465);
+    int16x8_t v2467 = vqrdmulhq_n_s16(v2466, 19102);
+    int16x8_t v2468 = vaddq_s16(v2461, v2467);
+    int16x8_t v2469 = vsubq_s16(v2078, v2080);
+    int16x8_t v2470 = vsubq_s16(v2082, v2084);
+    int16x8_t v2471_tmp = vqrdmulhq_n_s16(v2470, 1988);
+    int16x8_t v2471 = vaddq_s16(v2471_tmp, v2470);
+    int16x8_t v2472 = vaddq_s16(v2469, v2471);
+    int16x8_t v2473 = vsubq_s16(v2088, v2090);
+    int16x8_t v2474 = vsubq_s16(v2092, v2094);
+    int16x8_t v2475_tmp = vqrdmulhq_n_s16(v2474, 1988);
+    int16x8_t v2475 = vaddq_s16(v2475_tmp, v2474);
+    int16x8_t v2476 = vaddq_s16(v2473, v2475);
+    int16x8_t v2477 = vqrdmulhq_n_s16(v2476, 19102);
+    int16x8_t v2478 = vaddq_s16(v2472, v2477);
+    int16x8_t v2479 = vqrdmulhq_n_s16(v2478, 17000);
+    int16x8_t v2480 = vaddq_s16(v2468, v2479);
+    int16x8_t v2481 = vqrdmulhq_n_s16(v2480, 16534);
+    int16x8_t v2482 = vaddq_s16(v2457, v2481);
+    int16x8_t v2483 = vqrdmulhq_n_s16(v2482, 16421);
+    int16x8_t v2484 = vaddq_s16(v2435, v2483);
+    int16x8_t v2485 = vsubq_s16(v1537, v1542);
+    int16x8_t v2486 = vsubq_s16(v1547, v1552);
+    int16x8_t v2487_tmp = vqrdmulhq_n_s16(v2486, 23673);
+    int16x8_t v2487 = vaddq_s16(v2487_tmp, v2486);
+    int16x8_t v2488 = vaddq_s16(v2485, v2487);
+    int16x8_t v2489 = vsubq_s16(v1559, v1564);
+    int16x8_t v2490 = vsubq_s16(v1569, v1574);
+    int16x8_t v2491_tmp = vqrdmulhq_n_s16(v2490, 23673);
+    int16x8_t v2491 = vaddq_s16(v2491_tmp, v2490);
+    int16x8_t v2492 = vaddq_s16(v2489, v2491);
+    int16x8_t v2493 = vqrdmulhq_n_s16(v2492, 20398);
+    int16x8_t v2494 = vaddq_s16(v2488, v2493);
+    int16x8_t v2495 = vsubq_s16(v1583, v1588);
+    int16x8_t v2496 = vsubq_s16(v1593, v1598);
+    int16x8_t v2497_tmp = vqrdmulhq_n_s16(v2496, 23673);
+    int16x8_t v2497 = vaddq_s16(v2497_tmp, v2496);
+    int16x8_t v2498 = vaddq_s16(v2495, v2497);
+    int16x8_t v2499 = vsubq_s16(v1605, v1610);
+    int16x8_t v2500 = vsubq_s16(v1615, v1620);
+    int16x8_t v2501_tmp = vqrdmulhq_n_s16(v2500, 23673);
+    int16x8_t v2501 = vaddq_s16(v2501_tmp, v2500);
+    int16x8_t v2502 = vaddq_s16(v2499, v2501);
+    int16x8_t v2503 = vqrdmulhq_n_s16(v2502, 20398);
+    int16x8_t v2504 = vaddq_s16(v2498, v2503);
+    int16x8_t v2505 = vqrdmulhq_n_s16(v2504, 17255);
+    int16x8_t v2506 = vaddq_s16(v2494, v2505);
+    int16x8_t v2507 = vsubq_s16(v1631, v1636);
+    int16x8_t v2508 = vsubq_s16(v1641, v1646);
+    int16x8_t v2509_tmp = vqrdmulhq_n_s16(v2508, 23673);
+    int16x8_t v2509 = vaddq_s16(v2509_tmp, v2508);
+    int16x8_t v2510 = vaddq_s16(v2507, v2509);
+    int16x8_t v2511 = vsubq_s16(v1653, v1658);
+    int16x8_t v2512 = vsubq_s16(v1663, v1668);
+    int16x8_t v2513_tmp = vqrdmulhq_n_s16(v2512, 23673);
+    int16x8_t v2513 = vaddq_s16(v2513_tmp, v2512);
+    int16x8_t v2514 = vaddq_s16(v2511, v2513);
+    int16x8_t v2515 = vqrdmulhq_n_s16(v2514, 20398);
+    int16x8_t v2516 = vaddq_s16(v2510, v2515);
+    int16x8_t v2517 = vsubq_s16(v1677, v1682);
+    int16x8_t v2518 = vsubq_s16(v1687, v1692);
+    int16x8_t v2519_tmp = vqrdmulhq_n_s16(v2518, 23673);
+    int16x8_t v2519 = vaddq_s16(v2519_tmp, v2518);
+    int16x8_t v2520 = vaddq_s16(v2517, v2519);
+    int16x8_t v2521 = vsubq_s16(v1699, v1704);
+    int16x8_t v2522 = vsubq_s16(v1709, v1714);
+    int16x8_t v2523_tmp = vqrdmulhq_n_s16(v2522, 23673);
+    int16x8_t v2523 = vaddq_s16(v2523_tmp, v2522);
+    int16x8_t v2524 = vaddq_s16(v2521, v2523);
+    int16x8_t v2525 = vqrdmulhq_n_s16(v2524, 20398);
+    int16x8_t v2526 = vaddq_s16(v2520, v2525);
+    int16x8_t v2527 = vqrdmulhq_n_s16(v2526, 17255);
+    int16x8_t v2528 = vaddq_s16(v2516, v2527);
+    int16x8_t v2529 = vqrdmulhq_n_s16(v2528, 16595);
+    int16x8_t v2530 = vaddq_s16(v2506, v2529);
+    int16x8_t v2531 = vsubq_s16(v1727, v1732);
+    int16x8_t v2532 = vsubq_s16(v1737, v1742);
+    int16x8_t v2533_tmp = vqrdmulhq_n_s16(v2532, 23673);
+    int16x8_t v2533 = vaddq_s16(v2533_tmp, v2532);
+    int16x8_t v2534 = vaddq_s16(v2531, v2533);
+    int16x8_t v2535 = vsubq_s16(v1749, v1754);
+    int16x8_t v2536 = vsubq_s16(v1759, v1764);
+    int16x8_t v2537_tmp = vqrdmulhq_n_s16(v2536, 23673);
+    int16x8_t v2537 = vaddq_s16(v2537_tmp, v2536);
+    int16x8_t v2538 = vaddq_s16(v2535, v2537);
+    int16x8_t v2539 = vqrdmulhq_n_s16(v2538, 20398);
+    int16x8_t v2540 = vaddq_s16(v2534, v2539);
+    int16x8_t v2541 = vsubq_s16(v1773, v1778);
+    int16x8_t v2542 = vsubq_s16(v1783, v1788);
+    int16x8_t v2543_tmp = vqrdmulhq_n_s16(v2542, 23673);
+    int16x8_t v2543 = vaddq_s16(v2543_tmp, v2542);
+    int16x8_t v2544 = vaddq_s16(v2541, v2543);
+    int16x8_t v2545 = vsubq_s16(v1795, v1800);
+    int16x8_t v2546 = vsubq_s16(v1805, v1810);
+    int16x8_t v2547_tmp = vqrdmulhq_n_s16(v2546, 23673);
+    int16x8_t v2547 = vaddq_s16(v2547_tmp, v2546);
+    int16x8_t v2548 = vaddq_s16(v2545, v2547);
+    int16x8_t v2549 = vqrdmulhq_n_s16(v2548, 20398);
+    int16x8_t v2550 = vaddq_s16(v2544, v2549);
+    int16x8_t v2551 = vqrdmulhq_n_s16(v2550, 17255);
+    int16x8_t v2552 = vaddq_s16(v2540, v2551);
+    int16x8_t v2553 = vsubq_s16(v1821, v1826);
+    int16x8_t v2554 = vsubq_s16(v1831, v1836);
+    int16x8_t v2555_tmp = vqrdmulhq_n_s16(v2554, 23673);
+    int16x8_t v2555 = vaddq_s16(v2555_tmp, v2554);
+    int16x8_t v2556 = vaddq_s16(v2553, v2555);
+    int16x8_t v2557 = vsubq_s16(v1843, v1848);
+    int16x8_t v2558 = vsubq_s16(v1853, v1858);
+    int16x8_t v2559_tmp = vqrdmulhq_n_s16(v2558, 23673);
+    int16x8_t v2559 = vaddq_s16(v2559_tmp, v2558);
+    int16x8_t v2560 = vaddq_s16(v2557, v2559);
+    int16x8_t v2561 = vqrdmulhq_n_s16(v2560, 20398);
+    int16x8_t v2562 = vaddq_s16(v2556, v2561);
+    int16x8_t v2563 = vsubq_s16(v1867, v1872);
+    int16x8_t v2564 = vsubq_s16(v1877, v1882);
+    int16x8_t v2565_tmp = vqrdmulhq_n_s16(v2564, 23673);
+    int16x8_t v2565 = vaddq_s16(v2565_tmp, v2564);
+    int16x8_t v2566 = vaddq_s16(v2563, v2565);
+    int16x8_t v2567 = vsubq_s16(v1889, v1894);
+    int16x8_t v2568 = vsubq_s16(v1899, v1904);
+    int16x8_t v2569_tmp = vqrdmulhq_n_s16(v2568, 23673);
+    int16x8_t v2569 = vaddq_s16(v2569_tmp, v2568);
+    int16x8_t v2570 = vaddq_s16(v2567, v2569);
+    int16x8_t v2571 = vqrdmulhq_n_s16(v2570, 20398);
+    int16x8_t v2572 = vaddq_s16(v2566, v2571);
+    int16x8_t v2573 = vqrdmulhq_n_s16(v2572, 17255);
+    int16x8_t v2574 = vaddq_s16(v2562, v2573);
+    int16x8_t v2575 = vqrdmulhq_n_s16(v2574, 16595);
+    int16x8_t v2576 = vaddq_s16(v2552, v2575);
+    int16x8_t v2577 = vqrdmulhq_n_s16(v2576, 16436);
+    int16x8_t v2578 = vaddq_s16(v2530, v2577);
+    int16x8_t v2579 = vsubq_s16(v9, v24);
+    int16x8_t v2580 = vsubq_s16(v42, v58);
+    int16x8_t v2581_tmp = vqrdmulhq_n_s16(v2580, 3314);
+    int16x8_t v2581 = vmlaq_n_s16(v2581_tmp, v2580, 5);
+    int16x8_t v2582 = vaddq_s16(v2579, v2581);
+    int16x8_t v2583 = vsubq_s16(v78, v101);
+    int16x8_t v2584 = vsubq_s16(v119, v136);
+    int16x8_t v2585_tmp = vqrdmulhq_n_s16(v2584, 3314);
+    int16x8_t v2585 = vmlaq_n_s16(v2585_tmp, v2584, 5);
+    int16x8_t v2586 = vaddq_s16(v2583, v2585);
+    int16x8_t v2587 = vqrdmulhq_n_s16(v2586, 22112);
+    int16x8_t v2588 = vaddq_s16(v2582, v2587);
+    int16x8_t v2589 = vsubq_s16(v158, v181);
+    int16x8_t v2590 = vsubq_s16(v213, v231);
+    int16x8_t v2591_tmp = vqrdmulhq_n_s16(v2590, 3314);
+    int16x8_t v2591 = vmlaq_n_s16(v2591_tmp, v2590, 5);
+    int16x8_t v2592 = vaddq_s16(v2589, v2591);
+    int16x8_t v2593 = vsubq_s16(v251, v274);
+    int16x8_t v2594 = vsubq_s16(v292, v310);
+    int16x8_t v2595_tmp = vqrdmulhq_n_s16(v2594, 3314);
+    int16x8_t v2595 = vmlaq_n_s16(v2595_tmp, v2594, 5);
+    int16x8_t v2596 = vaddq_s16(v2593, v2595);
+    int16x8_t v2597 = vqrdmulhq_n_s16(v2596, 22112);
+    int16x8_t v2598 = vaddq_s16(v2592, v2597);
+    int16x8_t v2599 = vqrdmulhq_n_s16(v2598, 17561);
+    int16x8_t v2600 = vaddq_s16(v2588, v2599);
+    int16x8_t v2601 = vsubq_s16(v334, v357);
+    int16x8_t v2602 = vsubq_s16(v389, v407);
+    int16x8_t v2603_tmp = vqrdmulhq_n_s16(v2602, 3314);
+    int16x8_t v2603 = vmlaq_n_s16(v2603_tmp, v2602, 5);
+    int16x8_t v2604 = vaddq_s16(v2601, v2603);
+    int16x8_t v2605 = vsubq_s16(v441, v480);
+    int16x8_t v2606 = vsubq_s16(v498, v517);
+    int16x8_t v2607_tmp = vqrdmulhq_n_s16(v2606, 3314);
+    int16x8_t v2607 = vmlaq_n_s16(v2607_tmp, v2606, 5);
+    int16x8_t v2608 = vaddq_s16(v2605, v2607);
+    int16x8_t v2609 = vqrdmulhq_n_s16(v2608, 22112);
+    int16x8_t v2610 = vaddq_s16(v2604, v2609);
+    int16x8_t v2611 = vsubq_s16(v539, v562);
+    int16x8_t v2612 = vsubq_s16(v594, v612);
+    int16x8_t v2613_tmp = vqrdmulhq_n_s16(v2612, 3314);
+    int16x8_t v2613 = vmlaq_n_s16(v2613_tmp, v2612, 5);
+    int16x8_t v2614 = vaddq_s16(v2611, v2613);
+    int16x8_t v2615 = vsubq_s16(v632, v655);
+    int16x8_t v2616 = vsubq_s16(v673, v692);
+    int16x8_t v2617_tmp = vqrdmulhq_n_s16(v2616, 3314);
+    int16x8_t v2617 = vmlaq_n_s16(v2617_tmp, v2616, 5);
+    int16x8_t v2618 = vaddq_s16(v2615, v2617);
+    int16x8_t v2619 = vqrdmulhq_n_s16(v2618, 22112);
+    int16x8_t v2620 = vaddq_s16(v2614, v2619);
+    int16x8_t v2621 = vqrdmulhq_n_s16(v2620, 17561);
+    int16x8_t v2622 = vaddq_s16(v2610, v2621);
+    int16x8_t v2623 = vqrdmulhq_n_s16(v2622, 16666);
+    int16x8_t v2624 = vaddq_s16(v2600, v2623);
+    int16x8_t v2625 = vsubq_s16(v718, v741);
+    int16x8_t v2626 = vsubq_s16(v773, v791);
+    int16x8_t v2627_tmp = vqrdmulhq_n_s16(v2626, 3314);
+    int16x8_t v2627 = vmlaq_n_s16(v2627_tmp, v2626, 5);
+    int16x8_t v2628 = vaddq_s16(v2625, v2627);
+    int16x8_t v2629 = vsubq_s16(v825, v864);
+    int16x8_t v2630 = vsubq_s16(v882, v901);
+    int16x8_t v2631_tmp = vqrdmulhq_n_s16(v2630, 3314);
+    int16x8_t v2631 = vmlaq_n_s16(v2631_tmp, v2630, 5);
+    int16x8_t v2632 = vaddq_s16(v2629, v2631);
+    int16x8_t v2633 = vqrdmulhq_n_s16(v2632, 22112);
+    int16x8_t v2634 = vaddq_s16(v2628, v2633);
+    int16x8_t v2635 = vsubq_s16(v937, v976);
+    int16x8_t v2636 = vsubq_s16(v1036, v1058);
+    int16x8_t v2637_tmp = vqrdmulhq_n_s16(v2636, 3314);
+    int16x8_t v2637 = vmlaq_n_s16(v2637_tmp, v2636, 5);
+    int16x8_t v2638 = vaddq_s16(v2635, v2637);
+    int16x8_t v2639 = vsubq_s16(v1078, v1101);
+    int16x8_t v2640 = vsubq_s16(v1119, v1139);
+    int16x8_t v2641_tmp = vqrdmulhq_n_s16(v2640, 3314);
+    int16x8_t v2641 = vmlaq_n_s16(v2641_tmp, v2640, 5);
+    int16x8_t v2642 = vaddq_s16(v2639, v2641);
+    int16x8_t v2643 = vqrdmulhq_n_s16(v2642, 22112);
+    int16x8_t v2644 = vaddq_s16(v2638, v2643);
+    int16x8_t v2645 = vqrdmulhq_n_s16(v2644, 17561);
+    int16x8_t v2646 = vaddq_s16(v2634, v2645);
+    int16x8_t v2647 = vsubq_s16(v1163, v1186);
+    int16x8_t v2648 = vsubq_s16(v1218, v1236);
+    int16x8_t v2649_tmp = vqrdmulhq_n_s16(v2648, 3314);
+    int16x8_t v2649 = vmlaq_n_s16(v2649_tmp, v2648, 5);
+    int16x8_t v2650 = vaddq_s16(v2647, v2649);
+    int16x8_t v2651 = vsubq_s16(v1270, v1309);
+    int16x8_t v2652 = vsubq_s16(v1327, v1346);
+    int16x8_t v2653_tmp = vqrdmulhq_n_s16(v2652, 3314);
+    int16x8_t v2653 = vmlaq_n_s16(v2653_tmp, v2652, 5);
+    int16x8_t v2654 = vaddq_s16(v2651, v2653);
+    int16x8_t v2655 = vqrdmulhq_n_s16(v2654, 22112);
+    int16x8_t v2656 = vaddq_s16(v2650, v2655);
+    int16x8_t v2657 = vsubq_s16(v1368, v1391);
+    int16x8_t v2658 = vsubq_s16(v1423, v1441);
+    int16x8_t v2659_tmp = vqrdmulhq_n_s16(v2658, 3314);
+    int16x8_t v2659 = vmlaq_n_s16(v2659_tmp, v2658, 5);
+    int16x8_t v2660 = vaddq_s16(v2657, v2659);
+    int16x8_t v2661 = vsubq_s16(v1461, v1484);
+    int16x8_t v2662 = vsubq_s16(v1502, v1522);
+    int16x8_t v2663_tmp = vqrdmulhq_n_s16(v2662, 3314);
+    int16x8_t v2663 = vmlaq_n_s16(v2663_tmp, v2662, 5);
+    int16x8_t v2664 = vaddq_s16(v2661, v2663);
+    int16x8_t v2665 = vqrdmulhq_n_s16(v2664, 22112);
+    int16x8_t v2666 = vaddq_s16(v2660, v2665);
+    int16x8_t v2667 = vqrdmulhq_n_s16(v2666, 17561);
+    int16x8_t v2668 = vaddq_s16(v2656, v2667);
+    int16x8_t v2669 = vqrdmulhq_n_s16(v2668, 16666);
+    int16x8_t v2670 = vaddq_s16(v2646, v2669);
+    int16x8_t v2671 = vqrdmulhq_n_s16(v2670, 16454);
+    int16x8_t v2672 = vaddq_s16(v2624, v2671);
+    int16x8_t v2673 = vsubq_s16(v2579, v2581);
+    int16x8_t v2674 = vsubq_s16(v2583, v2585);
+    int16x8_t v2675 = vqrdmulhq_n_s16(v2674, 24397);
+    int16x8_t v2676 = vaddq_s16(v2673, v2675);
+    int16x8_t v2677 = vsubq_s16(v2589, v2591);
+    int16x8_t v2678 = vsubq_s16(v2593, v2595);
+    int16x8_t v2679 = vqrdmulhq_n_s16(v2678, 24397);
+    int16x8_t v2680 = vaddq_s16(v2677, v2679);
+    int16x8_t v2681 = vqrdmulhq_n_s16(v2680, 17921);
+    int16x8_t v2682 = vaddq_s16(v2676, v2681);
+    int16x8_t v2683 = vsubq_s16(v2601, v2603);
+    int16x8_t v2684 = vsubq_s16(v2605, v2607);
+    int16x8_t v2685 = vqrdmulhq_n_s16(v2684, 24397);
+    int16x8_t v2686 = vaddq_s16(v2683, v2685);
+    int16x8_t v2687 = vsubq_s16(v2611, v2613);
+    int16x8_t v2688 = vsubq_s16(v2615, v2617);
+    int16x8_t v2689 = vqrdmulhq_n_s16(v2688, 24397);
+    int16x8_t v2690 = vaddq_s16(v2687, v2689);
+    int16x8_t v2691 = vqrdmulhq_n_s16(v2690, 17921);
+    int16x8_t v2692 = vaddq_s16(v2686, v2691);
+    int16x8_t v2693 = vqrdmulhq_n_s16(v2692, 16747);
+    int16x8_t v2694 = vaddq_s16(v2682, v2693);
+    int16x8_t v2695 = vsubq_s16(v2625, v2627);
+    int16x8_t v2696 = vsubq_s16(v2629, v2631);
+    int16x8_t v2697 = vqrdmulhq_n_s16(v2696, 24397);
+    int16x8_t v2698 = vaddq_s16(v2695, v2697);
+    int16x8_t v2699 = vsubq_s16(v2635, v2637);
+    int16x8_t v2700 = vsubq_s16(v2639, v2641);
+    int16x8_t v2701 = vqrdmulhq_n_s16(v2700, 24397);
+    int16x8_t v2702 = vaddq_s16(v2699, v2701);
+    int16x8_t v2703 = vqrdmulhq_n_s16(v2702, 17921);
+    int16x8_t v2704 = vaddq_s16(v2698, v2703);
+    int16x8_t v2705 = vsubq_s16(v2647, v2649);
+    int16x8_t v2706 = vsubq_s16(v2651, v2653);
+    int16x8_t v2707 = vqrdmulhq_n_s16(v2706, 24397);
+    int16x8_t v2708 = vaddq_s16(v2705, v2707);
+    int16x8_t v2709 = vsubq_s16(v2657, v2659);
+    int16x8_t v2710 = vsubq_s16(v2661, v2663);
+    int16x8_t v2711 = vqrdmulhq_n_s16(v2710, 24397);
+    int16x8_t v2712 = vaddq_s16(v2709, v2711);
+    int16x8_t v2713 = vqrdmulhq_n_s16(v2712, 17921);
+    int16x8_t v2714 = vaddq_s16(v2708, v2713);
+    int16x8_t v2715 = vqrdmulhq_n_s16(v2714, 16747);
+    int16x8_t v2716 = vaddq_s16(v2704, v2715);
+    int16x8_t v2717 = vqrdmulhq_n_s16(v2716, 16474);
+    int16x8_t v2718 = vaddq_s16(v2694, v2717);
+    int16x8_t v2719 = vsubq_s16(v2485, v2487);
+    int16x8_t v2720 = vsubq_s16(v2489, v2491);
+    int16x8_t v2721 = vqrdmulhq_n_s16(v2720, 27504);
+    int16x8_t v2722 = vaddq_s16(v2719, v2721);
+    int16x8_t v2723 = vsubq_s16(v2495, v2497);
+    int16x8_t v2724 = vsubq_s16(v2499, v2501);
+    int16x8_t v2725 = vqrdmulhq_n_s16(v2724, 27504);
+    int16x8_t v2726 = vaddq_s16(v2723, v2725);
+    int16x8_t v2727 = vqrdmulhq_n_s16(v2726, 18343);
+    int16x8_t v2728 = vaddq_s16(v2722, v2727);
+    int16x8_t v2729 = vsubq_s16(v2507, v2509);
+    int16x8_t v2730 = vsubq_s16(v2511, v2513);
+    int16x8_t v2731 = vqrdmulhq_n_s16(v2730, 27504);
+    int16x8_t v2732 = vaddq_s16(v2729, v2731);
+    int16x8_t v2733 = vsubq_s16(v2517, v2519);
+    int16x8_t v2734 = vsubq_s16(v2521, v2523);
+    int16x8_t v2735 = vqrdmulhq_n_s16(v2734, 27504);
+    int16x8_t v2736 = vaddq_s16(v2733, v2735);
+    int16x8_t v2737 = vqrdmulhq_n_s16(v2736, 18343);
+    int16x8_t v2738 = vaddq_s16(v2732, v2737);
+    int16x8_t v2739 = vqrdmulhq_n_s16(v2738, 16840);
+    int16x8_t v2740 = vaddq_s16(v2728, v2739);
+    int16x8_t v2741 = vsubq_s16(v2531, v2533);
+    int16x8_t v2742 = vsubq_s16(v2535, v2537);
+    int16x8_t v2743 = vqrdmulhq_n_s16(v2742, 27504);
+    int16x8_t v2744 = vaddq_s16(v2741, v2743);
+    int16x8_t v2745 = vsubq_s16(v2541, v2543);
+    int16x8_t v2746 = vsubq_s16(v2545, v2547);
+    int16x8_t v2747 = vqrdmulhq_n_s16(v2746, 27504);
+    int16x8_t v2748 = vaddq_s16(v2745, v2747);
+    int16x8_t v2749 = vqrdmulhq_n_s16(v2748, 18343);
+    int16x8_t v2750 = vaddq_s16(v2744, v2749);
+    int16x8_t v2751 = vsubq_s16(v2553, v2555);
+    int16x8_t v2752 = vsubq_s16(v2557, v2559);
+    int16x8_t v2753 = vqrdmulhq_n_s16(v2752, 27504);
+    int16x8_t v2754 = vaddq_s16(v2751, v2753);
+    int16x8_t v2755 = vsubq_s16(v2563, v2565);
+    int16x8_t v2756 = vsubq_s16(v2567, v2569);
+    int16x8_t v2757 = vqrdmulhq_n_s16(v2756, 27504);
+    int16x8_t v2758 = vaddq_s16(v2755, v2757);
+    int16x8_t v2759 = vqrdmulhq_n_s16(v2758, 18343);
+    int16x8_t v2760 = vaddq_s16(v2754, v2759);
+    int16x8_t v2761 = vqrdmulhq_n_s16(v2760, 16840);
+    int16x8_t v2762 = vaddq_s16(v2750, v2761);
+    int16x8_t v2763 = vqrdmulhq_n_s16(v2762, 16496);
+    int16x8_t v2764 = vaddq_s16(v2740, v2763);
+    int16x8_t v2765 = vsubq_s16(v2390, v2392);
+    int16x8_t v2766 = vsubq_s16(v2394, v2396);
+    int16x8_t v2767 = vqrdmulhq_n_s16(v2766, 31869);
+    int16x8_t v2768 = vaddq_s16(v2765, v2767);
+    int16x8_t v2769 = vsubq_s16(v2400, v2402);
+    int16x8_t v2770 = vsubq_s16(v2404, v2406);
+    int16x8_t v2771 = vqrdmulhq_n_s16(v2770, 31869);
+    int16x8_t v2772 = vaddq_s16(v2769, v2771);
+    int16x8_t v2773 = vqrdmulhq_n_s16(v2772, 18830);
+    int16x8_t v2774 = vaddq_s16(v2768, v2773);
+    int16x8_t v2775 = vsubq_s16(v2412, v2414);
+    int16x8_t v2776 = vsubq_s16(v2416, v2418);
+    int16x8_t v2777 = vqrdmulhq_n_s16(v2776, 31869);
+    int16x8_t v2778 = vaddq_s16(v2775, v2777);
+    int16x8_t v2779 = vsubq_s16(v2422, v2424);
+    int16x8_t v2780 = vsubq_s16(v2426, v2428);
+    int16x8_t v2781 = vqrdmulhq_n_s16(v2780, 31869);
+    int16x8_t v2782 = vaddq_s16(v2779, v2781);
+    int16x8_t v2783 = vqrdmulhq_n_s16(v2782, 18830);
+    int16x8_t v2784 = vaddq_s16(v2778, v2783);
+    int16x8_t v2785 = vqrdmulhq_n_s16(v2784, 16944);
+    int16x8_t v2786 = vaddq_s16(v2774, v2785);
+    int16x8_t v2787 = vsubq_s16(v2436, v2438);
+    int16x8_t v2788 = vsubq_s16(v2440, v2442);
+    int16x8_t v2789 = vqrdmulhq_n_s16(v2788, 31869);
+    int16x8_t v2790 = vaddq_s16(v2787, v2789);
+    int16x8_t v2791 = vsubq_s16(v2446, v2448);
+    int16x8_t v2792 = vsubq_s16(v2450, v2452);
+    int16x8_t v2793 = vqrdmulhq_n_s16(v2792, 31869);
+    int16x8_t v2794 = vaddq_s16(v2791, v2793);
+    int16x8_t v2795 = vqrdmulhq_n_s16(v2794, 18830);
+    int16x8_t v2796 = vaddq_s16(v2790, v2795);
+    int16x8_t v2797 = vsubq_s16(v2458, v2460);
+    int16x8_t v2798 = vsubq_s16(v2462, v2465);
+    int16x8_t v2799 = vqrdmulhq_n_s16(v2798, 31869);
+    int16x8_t v2800 = vaddq_s16(v2797, v2799);
+    int16x8_t v2801 = vsubq_s16(v2469, v2471);
+    int16x8_t v2802 = vsubq_s16(v2473, v2475);
+    int16x8_t v2803 = vqrdmulhq_n_s16(v2802, 31869);
+    int16x8_t v2804 = vaddq_s16(v2801, v2803);
+    int16x8_t v2805 = vqrdmulhq_n_s16(v2804, 18830);
+    int16x8_t v2806 = vaddq_s16(v2800, v2805);
+    int16x8_t v2807 = vqrdmulhq_n_s16(v2806, 16944);
+    int16x8_t v2808 = vaddq_s16(v2796, v2807);
+    int16x8_t v2809 = vqrdmulhq_n_s16(v2808, 16521);
+    int16x8_t v2810 = vaddq_s16(v2786, v2809);
+    int16x8_t v2811 = vsubq_s16(v2296, v2298);
+    int16x8_t v2812 = vsubq_s16(v2300, v2302);
+    int16x8_t v2813_tmp = vqrdmulhq_n_s16(v2812, 5552);
+    int16x8_t v2813 = vaddq_s16(v2813_tmp, v2812);
+    int16x8_t v2814 = vaddq_s16(v2811, v2813);
+    int16x8_t v2815 = vsubq_s16(v2306, v2308);
+    int16x8_t v2816 = vsubq_s16(v2310, v2312);
+    int16x8_t v2817_tmp = vqrdmulhq_n_s16(v2816, 5552);
+    int16x8_t v2817 = vaddq_s16(v2817_tmp, v2816);
+    int16x8_t v2818 = vaddq_s16(v2815, v2817);
+    int16x8_t v2819 = vqrdmulhq_n_s16(v2818, 19393);
+    int16x8_t v2820 = vaddq_s16(v2814, v2819);
+    int16x8_t v2821 = vsubq_s16(v2318, v2320);
+    int16x8_t v2822 = vsubq_s16(v2322, v2324);
+    int16x8_t v2823_tmp = vqrdmulhq_n_s16(v2822, 5552);
+    int16x8_t v2823 = vaddq_s16(v2823_tmp, v2822);
+    int16x8_t v2824 = vaddq_s16(v2821, v2823);
+    int16x8_t v2825 = vsubq_s16(v2328, v2330);
+    int16x8_t v2826 = vsubq_s16(v2332, v2334);
+    int16x8_t v2827_tmp = vqrdmulhq_n_s16(v2826, 5552);
+    int16x8_t v2827 = vaddq_s16(v2827_tmp, v2826);
+    int16x8_t v2828 = vaddq_s16(v2825, v2827);
+    int16x8_t v2829 = vqrdmulhq_n_s16(v2828, 19393);
+    int16x8_t v2830 = vaddq_s16(v2824, v2829);
+    int16x8_t v2831 = vqrdmulhq_n_s16(v2830, 17059);
+    int16x8_t v2832 = vaddq_s16(v2820, v2831);
+    int16x8_t v2833 = vsubq_s16(v2342, v2344);
+    int16x8_t v2834 = vsubq_s16(v2346, v2348);
+    int16x8_t v2835_tmp = vqrdmulhq_n_s16(v2834, 5552);
+    int16x8_t v2835 = vaddq_s16(v2835_tmp, v2834);
+    int16x8_t v2836 = vaddq_s16(v2833, v2835);
+    int16x8_t v2837 = vsubq_s16(v2352, v2354);
+    int16x8_t v2838 = vsubq_s16(v2356, v2358);
+    int16x8_t v2839_tmp = vqrdmulhq_n_s16(v2838, 5552);
+    int16x8_t v2839 = vaddq_s16(v2839_tmp, v2838);
+    int16x8_t v2840 = vaddq_s16(v2837, v2839);
+    int16x8_t v2841 = vqrdmulhq_n_s16(v2840, 19393);
+    int16x8_t v2842 = vaddq_s16(v2836, v2841);
+    int16x8_t v2843 = vsubq_s16(v2364, v2366);
+    int16x8_t v2844 = vsubq_s16(v2368, v2370);
+    int16x8_t v2845_tmp = vqrdmulhq_n_s16(v2844, 5552);
+    int16x8_t v2845 = vaddq_s16(v2845_tmp, v2844);
+    int16x8_t v2846 = vaddq_s16(v2843, v2845);
+    int16x8_t v2847 = vsubq_s16(v2374, v2376);
+    int16x8_t v2848 = vsubq_s16(v2378, v2380);
+    int16x8_t v2849_tmp = vqrdmulhq_n_s16(v2848, 5552);
+    int16x8_t v2849 = vaddq_s16(v2849_tmp, v2848);
+    int16x8_t v2850 = vaddq_s16(v2847, v2849);
+    int16x8_t v2851 = vqrdmulhq_n_s16(v2850, 19393);
+    int16x8_t v2852 = vaddq_s16(v2846, v2851);
+    int16x8_t v2853 = vqrdmulhq_n_s16(v2852, 17059);
+    int16x8_t v2854 = vaddq_s16(v2842, v2853);
+    int16x8_t v2855 = vqrdmulhq_n_s16(v2854, 16549);
+    int16x8_t v2856 = vaddq_s16(v2832, v2855);
+    int16x8_t v2857 = vsubq_s16(v2109, v2114);
+    int16x8_t v2858 = vsubq_s16(v2119, v2124);
+    int16x8_t v2859_tmp = vqrdmulhq_n_s16(v2858, 15865);
+    int16x8_t v2859 = vaddq_s16(v2859_tmp, v2858);
+    int16x8_t v2860 = vaddq_s16(v2857, v2859);
+    int16x8_t v2861 = vsubq_s16(v2131, v2136);
+    int16x8_t v2862 = vsubq_s16(v2141, v2146);
+    int16x8_t v2863_tmp = vqrdmulhq_n_s16(v2862, 15865);
+    int16x8_t v2863 = vaddq_s16(v2863_tmp, v2862);
+    int16x8_t v2864 = vaddq_s16(v2861, v2863);
+    int16x8_t v2865 = vqrdmulhq_n_s16(v2864, 20040);
+    int16x8_t v2866 = vaddq_s16(v2860, v2865);
+    int16x8_t v2867 = vsubq_s16(v2155, v2160);
+    int16x8_t v2868 = vsubq_s16(v2165, v2170);
+    int16x8_t v2869_tmp = vqrdmulhq_n_s16(v2868, 15865);
+    int16x8_t v2869 = vaddq_s16(v2869_tmp, v2868);
+    int16x8_t v2870 = vaddq_s16(v2867, v2869);
+    int16x8_t v2871 = vsubq_s16(v2177, v2182);
+    int16x8_t v2872 = vsubq_s16(v2187, v2192);
+    int16x8_t v2873_tmp = vqrdmulhq_n_s16(v2872, 15865);
+    int16x8_t v2873 = vaddq_s16(v2873_tmp, v2872);
+    int16x8_t v2874 = vaddq_s16(v2871, v2873);
+    int16x8_t v2875 = vqrdmulhq_n_s16(v2874, 20040);
+    int16x8_t v2876 = vaddq_s16(v2870, v2875);
+    int16x8_t v2877 = vqrdmulhq_n_s16(v2876, 17187);
+    int16x8_t v2878 = vaddq_s16(v2866, v2877);
+    int16x8_t v2879 = vsubq_s16(v2203, v2208);
+    int16x8_t v2880 = vsubq_s16(v2213, v2218);
+    int16x8_t v2881_tmp = vqrdmulhq_n_s16(v2880, 15865);
+    int16x8_t v2881 = vaddq_s16(v2881_tmp, v2880);
+    int16x8_t v2882 = vaddq_s16(v2879, v2881);
+    int16x8_t v2883 = vsubq_s16(v2225, v2230);
+    int16x8_t v2884 = vsubq_s16(v2235, v2240);
+    int16x8_t v2885_tmp = vqrdmulhq_n_s16(v2884, 15865);
+    int16x8_t v2885 = vaddq_s16(v2885_tmp, v2884);
+    int16x8_t v2886 = vaddq_s16(v2883, v2885);
+    int16x8_t v2887 = vqrdmulhq_n_s16(v2886, 20040);
+    int16x8_t v2888 = vaddq_s16(v2882, v2887);
+    int16x8_t v2889 = vsubq_s16(v2249, v2254);
+    int16x8_t v2890 = vsubq_s16(v2259, v2264);
+    int16x8_t v2891_tmp = vqrdmulhq_n_s16(v2890, 15865);
+    int16x8_t v2891 = vaddq_s16(v2891_tmp, v2890);
+    int16x8_t v2892 = vaddq_s16(v2889, v2891);
+    int16x8_t v2893 = vsubq_s16(v2271, v2276);
+    int16x8_t v2894 = vsubq_s16(v2281, v2286);
+    int16x8_t v2895_tmp = vqrdmulhq_n_s16(v2894, 15865);
+    int16x8_t v2895 = vaddq_s16(v2895_tmp, v2894);
+    int16x8_t v2896 = vaddq_s16(v2893, v2895);
+    int16x8_t v2897 = vqrdmulhq_n_s16(v2896, 20040);
+    int16x8_t v2898 = vaddq_s16(v2892, v2897);
+    int16x8_t v2899 = vqrdmulhq_n_s16(v2898, 17187);
+    int16x8_t v2900 = vaddq_s16(v2888, v2899);
+    int16x8_t v2901 = vqrdmulhq_n_s16(v2900, 16579);
+    int16x8_t v2902 = vaddq_s16(v2878, v2901);
+    int16x8_t v2903 = vsubq_s16(v1919, v1924);
+    int16x8_t v2904 = vsubq_s16(v1929, v1934);
+    int16x8_t v2905_tmp = vqrdmulhq_n_s16(v2904, 1893);
+    int16x8_t v2905 = vmlaq_n_s16(v2905_tmp, v2904, 2);
+    int16x8_t v2906 = vaddq_s16(v2903, v2905);
+    int16x8_t v2907 = vsubq_s16(v1941, v1946);
+    int16x8_t v2908 = vsubq_s16(v1951, v1956);
+    int16x8_t v2909_tmp = vqrdmulhq_n_s16(v2908, 1893);
+    int16x8_t v2909 = vmlaq_n_s16(v2909_tmp, v2908, 2);
+    int16x8_t v2910 = vaddq_s16(v2907, v2909);
+    int16x8_t v2911 = vqrdmulhq_n_s16(v2910, 20783);
+    int16x8_t v2912 = vaddq_s16(v2906, v2911);
+    int16x8_t v2913 = vsubq_s16(v1965, v1970);
+    int16x8_t v2914 = vsubq_s16(v1975, v1980);
+    int16x8_t v2915_tmp = vqrdmulhq_n_s16(v2914, 1893);
+    int16x8_t v2915 = vmlaq_n_s16(v2915_tmp, v2914, 2);
+    int16x8_t v2916 = vaddq_s16(v2913, v2915);
+    int16x8_t v2917 = vsubq_s16(v1987, v1992);
+    int16x8_t v2918 = vsubq_s16(v1997, v2002);
+    int16x8_t v2919_tmp = vqrdmulhq_n_s16(v2918, 1893);
+    int16x8_t v2919 = vmlaq_n_s16(v2919_tmp, v2918, 2);
+    int16x8_t v2920 = vaddq_s16(v2917, v2919);
+    int16x8_t v2921 = vqrdmulhq_n_s16(v2920, 20783);
+    int16x8_t v2922 = vaddq_s16(v2916, v2921);
+    int16x8_t v2923 = vqrdmulhq_n_s16(v2922, 17326);
+    int16x8_t v2924 = vaddq_s16(v2912, v2923);
+    int16x8_t v2925 = vsubq_s16(v2013, v2018);
+    int16x8_t v2926 = vsubq_s16(v2023, v2028);
+    int16x8_t v2927_tmp = vqrdmulhq_n_s16(v2926, 1893);
+    int16x8_t v2927 = vmlaq_n_s16(v2927_tmp, v2926, 2);
+    int16x8_t v2928 = vaddq_s16(v2925, v2927);
+    int16x8_t v2929 = vsubq_s16(v2035, v2040);
+    int16x8_t v2930 = vsubq_s16(v2045, v2050);
+    int16x8_t v2931_tmp = vqrdmulhq_n_s16(v2930, 1893);
+    int16x8_t v2931 = vmlaq_n_s16(v2931_tmp, v2930, 2);
+    int16x8_t v2932 = vaddq_s16(v2929, v2931);
+    int16x8_t v2933 = vqrdmulhq_n_s16(v2932, 20783);
+    int16x8_t v2934 = vaddq_s16(v2928, v2933);
+    int16x8_t v2935 = vsubq_s16(v2059, v2064);
+    int16x8_t v2936 = vsubq_s16(v2069, v2074);
+    int16x8_t v2937_tmp = vqrdmulhq_n_s16(v2936, 1893);
+    int16x8_t v2937 = vmlaq_n_s16(v2937_tmp, v2936, 2);
+    int16x8_t v2938 = vaddq_s16(v2935, v2937);
+    int16x8_t v2939 = vsubq_s16(v2081, v2086);
+    int16x8_t v2940 = vsubq_s16(v2091, v2096);
+    int16x8_t v2941_tmp = vqrdmulhq_n_s16(v2940, 1893);
+    int16x8_t v2941 = vmlaq_n_s16(v2941_tmp, v2940, 2);
+    int16x8_t v2942 = vaddq_s16(v2939, v2941);
+    int16x8_t v2943 = vqrdmulhq_n_s16(v2942, 20783);
+    int16x8_t v2944 = vaddq_s16(v2938, v2943);
+    int16x8_t v2945 = vqrdmulhq_n_s16(v2944, 17326);
+    int16x8_t v2946 = vaddq_s16(v2934, v2945);
+    int16x8_t v2947 = vqrdmulhq_n_s16(v2946, 16611);
+    int16x8_t v2948 = vaddq_s16(v2924, v2947);
+    int16x8_t v2949 = vsubq_s16(v1543, v1554);
+    int16x8_t v2950 = vsubq_s16(v1565, v1576);
+    int16x8_t v2951_tmp = vqrdmulhq_n_s16(v2950, 13357);
+    int16x8_t v2951 = vmlaq_n_s16(v2951_tmp, v2950, 3);
+    int16x8_t v2952 = vaddq_s16(v2949, v2951);
+    int16x8_t v2953 = vsubq_s16(v1589, v1600);
+    int16x8_t v2954 = vsubq_s16(v1611, v1622);
+    int16x8_t v2955_tmp = vqrdmulhq_n_s16(v2954, 13357);
+    int16x8_t v2955 = vmlaq_n_s16(v2955_tmp, v2954, 3);
+    int16x8_t v2956 = vaddq_s16(v2953, v2955);
+    int16x8_t v2957 = vqrdmulhq_n_s16(v2956, 21637);
+    int16x8_t v2958 = vaddq_s16(v2952, v2957);
+    int16x8_t v2959 = vsubq_s16(v1637, v1648);
+    int16x8_t v2960 = vsubq_s16(v1659, v1670);
+    int16x8_t v2961_tmp = vqrdmulhq_n_s16(v2960, 13357);
+    int16x8_t v2961 = vmlaq_n_s16(v2961_tmp, v2960, 3);
+    int16x8_t v2962 = vaddq_s16(v2959, v2961);
+    int16x8_t v2963 = vsubq_s16(v1683, v1694);
+    int16x8_t v2964 = vsubq_s16(v1705, v1716);
+    int16x8_t v2965_tmp = vqrdmulhq_n_s16(v2964, 13357);
+    int16x8_t v2965 = vmlaq_n_s16(v2965_tmp, v2964, 3);
+    int16x8_t v2966 = vaddq_s16(v2963, v2965);
+    int16x8_t v2967 = vqrdmulhq_n_s16(v2966, 21637);
+    int16x8_t v2968 = vaddq_s16(v2962, v2967);
+    int16x8_t v2969 = vqrdmulhq_n_s16(v2968, 17479);
+    int16x8_t v2970 = vaddq_s16(v2958, v2969);
+    int16x8_t v2971 = vsubq_s16(v1733, v1744);
+    int16x8_t v2972 = vsubq_s16(v1755, v1766);
+    int16x8_t v2973_tmp = vqrdmulhq_n_s16(v2972, 13357);
+    int16x8_t v2973 = vmlaq_n_s16(v2973_tmp, v2972, 3);
+    int16x8_t v2974 = vaddq_s16(v2971, v2973);
+    int16x8_t v2975 = vsubq_s16(v1779, v1790);
+    int16x8_t v2976 = vsubq_s16(v1801, v1812);
+    int16x8_t v2977_tmp = vqrdmulhq_n_s16(v2976, 13357);
+    int16x8_t v2977 = vmlaq_n_s16(v2977_tmp, v2976, 3);
+    int16x8_t v2978 = vaddq_s16(v2975, v2977);
+    int16x8_t v2979 = vqrdmulhq_n_s16(v2978, 21637);
+    int16x8_t v2980 = vaddq_s16(v2974, v2979);
+    int16x8_t v2981 = vsubq_s16(v1827, v1838);
+    int16x8_t v2982 = vsubq_s16(v1849, v1860);
+    int16x8_t v2983_tmp = vqrdmulhq_n_s16(v2982, 13357);
+    int16x8_t v2983 = vmlaq_n_s16(v2983_tmp, v2982, 3);
+    int16x8_t v2984 = vaddq_s16(v2981, v2983);
+    int16x8_t v2985 = vsubq_s16(v1873, v1884);
+    int16x8_t v2986 = vsubq_s16(v1895, v1906);
+    int16x8_t v2987_tmp = vqrdmulhq_n_s16(v2986, 13357);
+    int16x8_t v2987 = vmlaq_n_s16(v2987_tmp, v2986, 3);
+    int16x8_t v2988 = vaddq_s16(v2985, v2987);
+    int16x8_t v2989 = vqrdmulhq_n_s16(v2988, 21637);
+    int16x8_t v2990 = vaddq_s16(v2984, v2989);
+    int16x8_t v2991 = vqrdmulhq_n_s16(v2990, 17479);
+    int16x8_t v2992 = vaddq_s16(v2980, v2991);
+    int16x8_t v2993 = vqrdmulhq_n_s16(v2992, 16647);
+    int16x8_t v2994 = vaddq_s16(v2970, v2993);
+    int16x8_t v2995 = vsubq_s16(v25, v60);
+    int16x8_t v2996 = vsubq_s16(v102, v138);
+    int16x8_t v2997_tmp = vqrdmulhq_n_s16(v2996, 6226);
+    int16x8_t v2997 = vmlaq_n_s16(v2997_tmp, v2996, 10);
+    int16x8_t v2998 = vaddq_s16(v2995, v2997);
+    int16x8_t v2999 = vsubq_s16(v182, v233);
+    int16x8_t v3000 = vsubq_s16(v275, v312);
+    int16x8_t v3001_tmp = vqrdmulhq_n_s16(v3000, 6226);
+    int16x8_t v3001 = vmlaq_n_s16(v3001_tmp, v3000, 10);
+    int16x8_t v3002 = vaddq_s16(v2999, v3001);
+    int16x8_t v3003 = vqrdmulhq_n_s16(v3002, 22622);
+    int16x8_t v3004 = vaddq_s16(v2998, v3003);
+    int16x8_t v3005 = vsubq_s16(v358, v409);
+    int16x8_t v3006 = vsubq_s16(v481, v519);
+    int16x8_t v3007_tmp = vqrdmulhq_n_s16(v3006, 6226);
+    int16x8_t v3007 = vmlaq_n_s16(v3007_tmp, v3006, 10);
+    int16x8_t v3008 = vaddq_s16(v3005, v3007);
+    int16x8_t v3009 = vsubq_s16(v563, v614);
+    int16x8_t v3010 = vsubq_s16(v656, v694);
+    int16x8_t v3011_tmp = vqrdmulhq_n_s16(v3010, 6226);
+    int16x8_t v3011 = vmlaq_n_s16(v3011_tmp, v3010, 10);
+    int16x8_t v3012 = vaddq_s16(v3009, v3011);
+    int16x8_t v3013 = vqrdmulhq_n_s16(v3012, 22622);
+    int16x8_t v3014 = vaddq_s16(v3008, v3013);
+    int16x8_t v3015 = vqrdmulhq_n_s16(v3014, 17646);
+    int16x8_t v3016 = vaddq_s16(v3004, v3015);
+    int16x8_t v3017 = vsubq_s16(v742, v793);
+    int16x8_t v3018 = vsubq_s16(v865, v903);
+    int16x8_t v3019_tmp = vqrdmulhq_n_s16(v3018, 6226);
+    int16x8_t v3019 = vmlaq_n_s16(v3019_tmp, v3018, 10);
+    int16x8_t v3020 = vaddq_s16(v3017, v3019);
+    int16x8_t v3021 = vsubq_s16(v977, v1060);
+    int16x8_t v3022 = vsubq_s16(v1102, v1141);
+    int16x8_t v3023_tmp = vqrdmulhq_n_s16(v3022, 6226);
+    int16x8_t v3023 = vmlaq_n_s16(v3023_tmp, v3022, 10);
+    int16x8_t v3024 = vaddq_s16(v3021, v3023);
+    int16x8_t v3025 = vqrdmulhq_n_s16(v3024, 22622);
+    int16x8_t v3026 = vaddq_s16(v3020, v3025);
+    int16x8_t v3027 = vsubq_s16(v1187, v1238);
+    int16x8_t v3028 = vsubq_s16(v1310, v1348);
+    int16x8_t v3029_tmp = vqrdmulhq_n_s16(v3028, 6226);
+    int16x8_t v3029 = vmlaq_n_s16(v3029_tmp, v3028, 10);
+    int16x8_t v3030 = vaddq_s16(v3027, v3029);
+    int16x8_t v3031 = vsubq_s16(v1392, v1443);
+    int16x8_t v3032 = vsubq_s16(v1485, v1524);
+    int16x8_t v3033_tmp = vqrdmulhq_n_s16(v3032, 6226);
+    int16x8_t v3033 = vmlaq_n_s16(v3033_tmp, v3032, 10);
+    int16x8_t v3034 = vaddq_s16(v3031, v3033);
+    int16x8_t v3035 = vqrdmulhq_n_s16(v3034, 22622);
+    int16x8_t v3036 = vaddq_s16(v3030, v3035);
+    int16x8_t v3037 = vqrdmulhq_n_s16(v3036, 17646);
+    int16x8_t v3038 = vaddq_s16(v3026, v3037);
+    int16x8_t v3039 = vqrdmulhq_n_s16(v3038, 16685);
+    int16x8_t v3040 = vaddq_s16(v3016, v3039);
+    int16x8_t v3041 = vsubq_s16(v2995, v2997);
+    int16x8_t v3042 = vsubq_s16(v2999, v3001);
+    int16x8_t v3043 = vqrdmulhq_n_s16(v3042, 23761);
+    int16x8_t v3044 = vaddq_s16(v3041, v3043);
+    int16x8_t v3045 = vsubq_s16(v3005, v3007);
+    int16x8_t v3046 = vsubq_s16(v3009, v3011);
+    int16x8_t v3047 = vqrdmulhq_n_s16(v3046, 23761);
+    int16x8_t v3048 = vaddq_s16(v3045, v3047);
+    int16x8_t v3049 = vqrdmulhq_n_s16(v3048, 17826);
+    int16x8_t v3050 = vaddq_s16(v3044, v3049);
+    int16x8_t v3051 = vsubq_s16(v3017, v3019);
+    int16x8_t v3052 = vsubq_s16(v3021, v3023);
+    int16x8_t v3053 = vqrdmulhq_n_s16(v3052, 23761);
+    int16x8_t v3054 = vaddq_s16(v3051, v3053);
+    int16x8_t v3055 = vsubq_s16(v3027, v3029);
+    int16x8_t v3056 = vsubq_s16(v3031, v3033);
+    int16x8_t v3057 = vqrdmulhq_n_s16(v3056, 23761);
+    int16x8_t v3058 = vaddq_s16(v3055, v3057);
+    int16x8_t v3059 = vqrdmulhq_n_s16(v3058, 17826);
+    int16x8_t v3060 = vaddq_s16(v3054, v3059);
+    int16x8_t v3061 = vqrdmulhq_n_s16(v3060, 16726);
+    int16x8_t v3062 = vaddq_s16(v3050, v3061);
+    int16x8_t v3063 = vsubq_s16(v2949, v2951);
+    int16x8_t v3064 = vsubq_s16(v2953, v2955);
+    int16x8_t v3065 = vqrdmulhq_n_s16(v3064, 25084);
+    int16x8_t v3066 = vaddq_s16(v3063, v3065);
+    int16x8_t v3067 = vsubq_s16(v2959, v2961);
+    int16x8_t v3068 = vsubq_s16(v2963, v2965);
+    int16x8_t v3069 = vqrdmulhq_n_s16(v3068, 25084);
+    int16x8_t v3070 = vaddq_s16(v3067, v3069);
+    int16x8_t v3071 = vqrdmulhq_n_s16(v3070, 18021);
+    int16x8_t v3072 = vaddq_s16(v3066, v3071);
+    int16x8_t v3073 = vsubq_s16(v2971, v2973);
+    int16x8_t v3074 = vsubq_s16(v2975, v2977);
+    int16x8_t v3075 = vqrdmulhq_n_s16(v3074, 25084);
+    int16x8_t v3076 = vaddq_s16(v3073, v3075);
+    int16x8_t v3077 = vsubq_s16(v2981, v2983);
+    int16x8_t v3078 = vsubq_s16(v2985, v2987);
+    int16x8_t v3079 = vqrdmulhq_n_s16(v3078, 25084);
+    int16x8_t v3080 = vaddq_s16(v3077, v3079);
+    int16x8_t v3081 = vqrdmulhq_n_s16(v3080, 18021);
+    int16x8_t v3082 = vaddq_s16(v3076, v3081);
+    int16x8_t v3083 = vqrdmulhq_n_s16(v3082, 16769);
+    int16x8_t v3084 = vaddq_s16(v3072, v3083);
+    int16x8_t v3085 = vsubq_s16(v2903, v2905);
+    int16x8_t v3086 = vsubq_s16(v2907, v2909);
+    int16x8_t v3087 = vqrdmulhq_n_s16(v3086, 26631);
+    int16x8_t v3088 = vaddq_s16(v3085, v3087);
+    int16x8_t v3089 = vsubq_s16(v2913, v2915);
+    int16x8_t v3090 = vsubq_s16(v2917, v2919);
+    int16x8_t v3091 = vqrdmulhq_n_s16(v3090, 26631);
+    int16x8_t v3092 = vaddq_s16(v3089, v3091);
+    int16x8_t v3093 = vqrdmulhq_n_s16(v3092, 18231);
+    int16x8_t v3094 = vaddq_s16(v3088, v3093);
+    int16x8_t v3095 = vsubq_s16(v2925, v2927);
+    int16x8_t v3096 = vsubq_s16(v2929, v2931);
+    int16x8_t v3097 = vqrdmulhq_n_s16(v3096, 26631);
+    int16x8_t v3098 = vaddq_s16(v3095, v3097);
+    int16x8_t v3099 = vsubq_s16(v2935, v2937);
+    int16x8_t v3100 = vsubq_s16(v2939, v2941);
+    int16x8_t v3101 = vqrdmulhq_n_s16(v3100, 26631);
+    int16x8_t v3102 = vaddq_s16(v3099, v3101);
+    int16x8_t v3103 = vqrdmulhq_n_s16(v3102, 18231);
+    int16x8_t v3104 = vaddq_s16(v3098, v3103);
+    int16x8_t v3105 = vqrdmulhq_n_s16(v3104, 16815);
+    int16x8_t v3106 = vaddq_s16(v3094, v3105);
+    int16x8_t v3107 = vsubq_s16(v2857, v2859);
+    int16x8_t v3108 = vsubq_s16(v2861, v2863);
+    int16x8_t v3109 = vqrdmulhq_n_s16(v3108, 28454);
+    int16x8_t v3110 = vaddq_s16(v3107, v3109);
+    int16x8_t v3111 = vsubq_s16(v2867, v2869);
+    int16x8_t v3112 = vsubq_s16(v2871, v2873);
+    int16x8_t v3113 = vqrdmulhq_n_s16(v3112, 28454);
+    int16x8_t v3114 = vaddq_s16(v3111, v3113);
+    int16x8_t v3115 = vqrdmulhq_n_s16(v3114, 18458);
+    int16x8_t v3116 = vaddq_s16(v3110, v3115);
+    int16x8_t v3117 = vsubq_s16(v2879, v2881);
+    int16x8_t v3118 = vsubq_s16(v2883, v2885);
+    int16x8_t v3119 = vqrdmulhq_n_s16(v3118, 28454);
+    int16x8_t v3120 = vaddq_s16(v3117, v3119);
+    int16x8_t v3121 = vsubq_s16(v2889, v2891);
+    int16x8_t v3122 = vsubq_s16(v2893, v2895);
+    int16x8_t v3123 = vqrdmulhq_n_s16(v3122, 28454);
+    int16x8_t v3124 = vaddq_s16(v3121, v3123);
+    int16x8_t v3125 = vqrdmulhq_n_s16(v3124, 18458);
+    int16x8_t v3126 = vaddq_s16(v3120, v3125);
+    int16x8_t v3127 = vqrdmulhq_n_s16(v3126, 16865);
+    int16x8_t v3128 = vaddq_s16(v3116, v3127);
+    int16x8_t v3129 = vsubq_s16(v2811, v2813);
+    int16x8_t v3130 = vsubq_s16(v2815, v2817);
+    int16x8_t v3131 = vqrdmulhq_n_s16(v3130, 30624);
+    int16x8_t v3132 = vaddq_s16(v3129, v3131);
+    int16x8_t v3133 = vsubq_s16(v2821, v2823);
+    int16x8_t v3134 = vsubq_s16(v2825, v2827);
+    int16x8_t v3135 = vqrdmulhq_n_s16(v3134, 30624);
+    int16x8_t v3136 = vaddq_s16(v3133, v3135);
+    int16x8_t v3137 = vqrdmulhq_n_s16(v3136, 18702);
+    int16x8_t v3138 = vaddq_s16(v3132, v3137);
+    int16x8_t v3139 = vsubq_s16(v2833, v2835);
+    int16x8_t v3140 = vsubq_s16(v2837, v2839);
+    int16x8_t v3141 = vqrdmulhq_n_s16(v3140, 30624);
+    int16x8_t v3142 = vaddq_s16(v3139, v3141);
+    int16x8_t v3143 = vsubq_s16(v2843, v2845);
+    int16x8_t v3144 = vsubq_s16(v2847, v2849);
+    int16x8_t v3145 = vqrdmulhq_n_s16(v3144, 30624);
+    int16x8_t v3146 = vaddq_s16(v3143, v3145);
+    int16x8_t v3147 = vqrdmulhq_n_s16(v3146, 18702);
+    int16x8_t v3148 = vaddq_s16(v3142, v3147);
+    int16x8_t v3149 = vqrdmulhq_n_s16(v3148, 16916);
+    int16x8_t v3150 = vaddq_s16(v3138, v3149);
+    int16x8_t v3151 = vsubq_s16(v2765, v2767);
+    int16x8_t v3152 = vsubq_s16(v2769, v2771);
+    int16x8_t v3153_tmp = vqrdmulhq_n_s16(v3152, 472);
+    int16x8_t v3153 = vaddq_s16(v3153_tmp, v3152);
+    int16x8_t v3154 = vaddq_s16(v3151, v3153);
+    int16x8_t v3155 = vsubq_s16(v2775, v2777);
+    int16x8_t v3156 = vsubq_s16(v2779, v2781);
+    int16x8_t v3157_tmp = vqrdmulhq_n_s16(v3156, 472);
+    int16x8_t v3157 = vaddq_s16(v3157_tmp, v3156);
+    int16x8_t v3158 = vaddq_s16(v3155, v3157);
+    int16x8_t v3159 = vqrdmulhq_n_s16(v3158, 18964);
+    int16x8_t v3160 = vaddq_s16(v3154, v3159);
+    int16x8_t v3161 = vsubq_s16(v2787, v2789);
+    int16x8_t v3162 = vsubq_s16(v2791, v2793);
+    int16x8_t v3163_tmp = vqrdmulhq_n_s16(v3162, 472);
+    int16x8_t v3163 = vaddq_s16(v3163_tmp, v3162);
+    int16x8_t v3164 = vaddq_s16(v3161, v3163);
+    int16x8_t v3165 = vsubq_s16(v2797, v2799);
+    int16x8_t v3166 = vsubq_s16(v2801, v2803);
+    int16x8_t v3167_tmp = vqrdmulhq_n_s16(v3166, 472);
+    int16x8_t v3167 = vaddq_s16(v3167_tmp, v3166);
+    int16x8_t v3168 = vaddq_s16(v3165, v3167);
+    int16x8_t v3169 = vqrdmulhq_n_s16(v3168, 18964);
+    int16x8_t v3170 = vaddq_s16(v3164, v3169);
+    int16x8_t v3171 = vqrdmulhq_n_s16(v3170, 16971);
+    int16x8_t v3172 = vaddq_s16(v3160, v3171);
+    int16x8_t v3173 = vsubq_s16(v2719, v2721);
+    int16x8_t v3174 = vsubq_s16(v2723, v2725);
+    int16x8_t v3175_tmp = vqrdmulhq_n_s16(v3174, 3672);
+    int16x8_t v3175 = vaddq_s16(v3175_tmp, v3174);
+    int16x8_t v3176 = vaddq_s16(v3173, v3175);
+    int16x8_t v3177 = vsubq_s16(v2729, v2731);
+    int16x8_t v3178 = vsubq_s16(v2733, v2735);
+    int16x8_t v3179_tmp = vqrdmulhq_n_s16(v3178, 3672);
+    int16x8_t v3179 = vaddq_s16(v3179_tmp, v3178);
+    int16x8_t v3180 = vaddq_s16(v3177, v3179);
+    int16x8_t v3181 = vqrdmulhq_n_s16(v3180, 19245);
+    int16x8_t v3182 = vaddq_s16(v3176, v3181);
+    int16x8_t v3183 = vsubq_s16(v2741, v2743);
+    int16x8_t v3184 = vsubq_s16(v2745, v2747);
+    int16x8_t v3185_tmp = vqrdmulhq_n_s16(v3184, 3672);
+    int16x8_t v3185 = vaddq_s16(v3185_tmp, v3184);
+    int16x8_t v3186 = vaddq_s16(v3183, v3185);
+    int16x8_t v3187 = vsubq_s16(v2751, v2753);
+    int16x8_t v3188 = vsubq_s16(v2755, v2757);
+    int16x8_t v3189_tmp = vqrdmulhq_n_s16(v3188, 3672);
+    int16x8_t v3189 = vaddq_s16(v3189_tmp, v3188);
+    int16x8_t v3190 = vaddq_s16(v3187, v3189);
+    int16x8_t v3191 = vqrdmulhq_n_s16(v3190, 19245);
+    int16x8_t v3192 = vaddq_s16(v3186, v3191);
+    int16x8_t v3193 = vqrdmulhq_n_s16(v3192, 17029);
+    int16x8_t v3194 = vaddq_s16(v3182, v3193);
+    int16x8_t v3195 = vsubq_s16(v2673, v2675);
+    int16x8_t v3196 = vsubq_s16(v2677, v2679);
+    int16x8_t v3197_tmp = vqrdmulhq_n_s16(v3196, 7662);
+    int16x8_t v3197 = vaddq_s16(v3197_tmp, v3196);
+    int16x8_t v3198 = vaddq_s16(v3195, v3197);
+    int16x8_t v3199 = vsubq_s16(v2683, v2685);
+    int16x8_t v3200 = vsubq_s16(v2687, v2689);
+    int16x8_t v3201_tmp = vqrdmulhq_n_s16(v3200, 7662);
+    int16x8_t v3201 = vaddq_s16(v3201_tmp, v3200);
+    int16x8_t v3202 = vaddq_s16(v3199, v3201);
+    int16x8_t v3203 = vqrdmulhq_n_s16(v3202, 19546);
+    int16x8_t v3204 = vaddq_s16(v3198, v3203);
+    int16x8_t v3205 = vsubq_s16(v2695, v2697);
+    int16x8_t v3206 = vsubq_s16(v2699, v2701);
+    int16x8_t v3207_tmp = vqrdmulhq_n_s16(v3206, 7662);
+    int16x8_t v3207 = vaddq_s16(v3207_tmp, v3206);
+    int16x8_t v3208 = vaddq_s16(v3205, v3207);
+    int16x8_t v3209 = vsubq_s16(v2705, v2707);
+    int16x8_t v3210 = vsubq_s16(v2709, v2711);
+    int16x8_t v3211_tmp = vqrdmulhq_n_s16(v3210, 7662);
+    int16x8_t v3211 = vaddq_s16(v3211_tmp, v3210);
+    int16x8_t v3212 = vaddq_s16(v3209, v3211);
+    int16x8_t v3213 = vqrdmulhq_n_s16(v3212, 19546);
+    int16x8_t v3214 = vaddq_s16(v3208, v3213);
+    int16x8_t v3215 = vqrdmulhq_n_s16(v3214, 17090);
+    int16x8_t v3216 = vaddq_s16(v3204, v3215);
+    int16x8_t v3217 = vsubq_s16(v2582, v2587);
+    int16x8_t v3218 = vsubq_s16(v2592, v2597);
+    int16x8_t v3219_tmp = vqrdmulhq_n_s16(v3218, 12756);
+    int16x8_t v3219 = vaddq_s16(v3219_tmp, v3218);
+    int16x8_t v3220 = vaddq_s16(v3217, v3219);
+    int16x8_t v3221 = vsubq_s16(v2604, v2609);
+    int16x8_t v3222 = vsubq_s16(v2614, v2619);
+    int16x8_t v3223_tmp = vqrdmulhq_n_s16(v3222, 12756);
+    int16x8_t v3223 = vaddq_s16(v3223_tmp, v3222);
+    int16x8_t v3224 = vaddq_s16(v3221, v3223);
+    int16x8_t v3225 = vqrdmulhq_n_s16(v3224, 19869);
+    int16x8_t v3226 = vaddq_s16(v3220, v3225);
+    int16x8_t v3227 = vsubq_s16(v2628, v2633);
+    int16x8_t v3228 = vsubq_s16(v2638, v2643);
+    int16x8_t v3229_tmp = vqrdmulhq_n_s16(v3228, 12756);
+    int16x8_t v3229 = vaddq_s16(v3229_tmp, v3228);
+    int16x8_t v3230 = vaddq_s16(v3227, v3229);
+    int16x8_t v3231 = vsubq_s16(v2650, v2655);
+    int16x8_t v3232 = vsubq_s16(v2660, v2665);
+    int16x8_t v3233_tmp = vqrdmulhq_n_s16(v3232, 12756);
+    int16x8_t v3233 = vaddq_s16(v3233_tmp, v3232);
+    int16x8_t v3234 = vaddq_s16(v3231, v3233);
+    int16x8_t v3235 = vqrdmulhq_n_s16(v3234, 19869);
+    int16x8_t v3236 = vaddq_s16(v3230, v3235);
+    int16x8_t v3237 = vqrdmulhq_n_s16(v3236, 17153);
+    int16x8_t v3238 = vaddq_s16(v3226, v3237);
+    int16x8_t v3239 = vsubq_s16(v2488, v2493);
+    int16x8_t v3240 = vsubq_s16(v2498, v2503);
+    int16x8_t v3241_tmp = vqrdmulhq_n_s16(v3240, 19463);
+    int16x8_t v3241 = vaddq_s16(v3241_tmp, v3240);
+    int16x8_t v3242 = vaddq_s16(v3239, v3241);
+    int16x8_t v3243 = vsubq_s16(v2510, v2515);
+    int16x8_t v3244 = vsubq_s16(v2520, v2525);
+    int16x8_t v3245_tmp = vqrdmulhq_n_s16(v3244, 19463);
+    int16x8_t v3245 = vaddq_s16(v3245_tmp, v3244);
+    int16x8_t v3246 = vaddq_s16(v3243, v3245);
+    int16x8_t v3247 = vqrdmulhq_n_s16(v3246, 20216);
+    int16x8_t v3248 = vaddq_s16(v3242, v3247);
+    int16x8_t v3249 = vsubq_s16(v2534, v2539);
+    int16x8_t v3250 = vsubq_s16(v2544, v2549);
+    int16x8_t v3251_tmp = vqrdmulhq_n_s16(v3250, 19463);
+    int16x8_t v3251 = vaddq_s16(v3251_tmp, v3250);
+    int16x8_t v3252 = vaddq_s16(v3249, v3251);
+    int16x8_t v3253 = vsubq_s16(v2556, v2561);
+    int16x8_t v3254 = vsubq_s16(v2566, v2571);
+    int16x8_t v3255_tmp = vqrdmulhq_n_s16(v3254, 19463);
+    int16x8_t v3255 = vaddq_s16(v3255_tmp, v3254);
+    int16x8_t v3256 = vaddq_s16(v3253, v3255);
+    int16x8_t v3257 = vqrdmulhq_n_s16(v3256, 20216);
+    int16x8_t v3258 = vaddq_s16(v3252, v3257);
+    int16x8_t v3259 = vqrdmulhq_n_s16(v3258, 17220);
+    int16x8_t v3260 = vaddq_s16(v3248, v3259);
+    int16x8_t v3261 = vsubq_s16(v2393, v2398);
+    int16x8_t v3262 = vsubq_s16(v2403, v2408);
+    int16x8_t v3263_tmp = vqrdmulhq_n_s16(v3262, 28661);
+    int16x8_t v3263 = vaddq_s16(v3263_tmp, v3262);
+    int16x8_t v3264 = vaddq_s16(v3261, v3263);
+    int16x8_t v3265 = vsubq_s16(v2415, v2420);
+    int16x8_t v3266 = vsubq_s16(v2425, v2430);
+    int16x8_t v3267_tmp = vqrdmulhq_n_s16(v3266, 28661);
+    int16x8_t v3267 = vaddq_s16(v3267_tmp, v3266);
+    int16x8_t v3268 = vaddq_s16(v3265, v3267);
+    int16x8_t v3269 = vqrdmulhq_n_s16(v3268, 20587);
+    int16x8_t v3270 = vaddq_s16(v3264, v3269);
+    int16x8_t v3271 = vsubq_s16(v2439, v2444);
+    int16x8_t v3272 = vsubq_s16(v2449, v2454);
+    int16x8_t v3273_tmp = vqrdmulhq_n_s16(v3272, 28661);
+    int16x8_t v3273 = vaddq_s16(v3273_tmp, v3272);
+    int16x8_t v3274 = vaddq_s16(v3271, v3273);
+    int16x8_t v3275 = vsubq_s16(v2461, v2467);
+    int16x8_t v3276 = vsubq_s16(v2472, v2477);
+    int16x8_t v3277_tmp = vqrdmulhq_n_s16(v3276, 28661);
+    int16x8_t v3277 = vaddq_s16(v3277_tmp, v3276);
+    int16x8_t v3278 = vaddq_s16(v3275, v3277);
+    int16x8_t v3279 = vqrdmulhq_n_s16(v3278, 20587);
+    int16x8_t v3280 = vaddq_s16(v3274, v3279);
+    int16x8_t v3281 = vqrdmulhq_n_s16(v3280, 17290);
+    int16x8_t v3282 = vaddq_s16(v3270, v3281);
+    int16x8_t v3283 = vsubq_s16(v2299, v2304);
+    int16x8_t v3284 = vsubq_s16(v2309, v2314);
+    int16x8_t v3285_tmp = vqrdmulhq_n_s16(v3284, 9242);
+    int16x8_t v3285 = vmlaq_n_s16(v3285_tmp, v3284, 2);
+    int16x8_t v3286 = vaddq_s16(v3283, v3285);
+    int16x8_t v3287 = vsubq_s16(v2321, v2326);
+    int16x8_t v3288 = vsubq_s16(v2331, v2336);
+    int16x8_t v3289_tmp = vqrdmulhq_n_s16(v3288, 9242);
+    int16x8_t v3289 = vmlaq_n_s16(v3289_tmp, v3288, 2);
+    int16x8_t v3290 = vaddq_s16(v3287, v3289);
+    int16x8_t v3291 = vqrdmulhq_n_s16(v3290, 20985);
+    int16x8_t v3292 = vaddq_s16(v3286, v3291);
+    int16x8_t v3293 = vsubq_s16(v2345, v2350);
+    int16x8_t v3294 = vsubq_s16(v2355, v2360);
+    int16x8_t v3295_tmp = vqrdmulhq_n_s16(v3294, 9242);
+    int16x8_t v3295 = vmlaq_n_s16(v3295_tmp, v3294, 2);
+    int16x8_t v3296 = vaddq_s16(v3293, v3295);
+    int16x8_t v3297 = vsubq_s16(v2367, v2372);
+    int16x8_t v3298 = vsubq_s16(v2377, v2382);
+    int16x8_t v3299_tmp = vqrdmulhq_n_s16(v3298, 9242);
+    int16x8_t v3299 = vmlaq_n_s16(v3299_tmp, v3298, 2);
+    int16x8_t v3300 = vaddq_s16(v3297, v3299);
+    int16x8_t v3301 = vqrdmulhq_n_s16(v3300, 20985);
+    int16x8_t v3302 = vaddq_s16(v3296, v3301);
+    int16x8_t v3303 = vqrdmulhq_n_s16(v3302, 17363);
+    int16x8_t v3304 = vaddq_s16(v3292, v3303);
+    int16x8_t v3305 = vsubq_s16(v2115, v2126);
+    int16x8_t v3306 = vsubq_s16(v2137, v2148);
+    int16x8_t v3307_tmp = vqrdmulhq_n_s16(v3306, 30298);
+    int16x8_t v3307 = vmlaq_n_s16(v3307_tmp, v3306, 2);
+    int16x8_t v3308 = vaddq_s16(v3305, v3307);
+    int16x8_t v3309 = vsubq_s16(v2161, v2172);
+    int16x8_t v3310 = vsubq_s16(v2183, v2194);
+    int16x8_t v3311_tmp = vqrdmulhq_n_s16(v3310, 30298);
+    int16x8_t v3311 = vmlaq_n_s16(v3311_tmp, v3310, 2);
+    int16x8_t v3312 = vaddq_s16(v3309, v3311);
+    int16x8_t v3313 = vqrdmulhq_n_s16(v3312, 21412);
+    int16x8_t v3314 = vaddq_s16(v3308, v3313);
+    int16x8_t v3315 = vsubq_s16(v2209, v2220);
+    int16x8_t v3316 = vsubq_s16(v2231, v2242);
+    int16x8_t v3317_tmp = vqrdmulhq_n_s16(v3316, 30298);
+    int16x8_t v3317 = vmlaq_n_s16(v3317_tmp, v3316, 2);
+    int16x8_t v3318 = vaddq_s16(v3315, v3317);
+    int16x8_t v3319 = vsubq_s16(v2255, v2266);
+    int16x8_t v3320 = vsubq_s16(v2277, v2288);
+    int16x8_t v3321_tmp = vqrdmulhq_n_s16(v3320, 30298);
+    int16x8_t v3321 = vmlaq_n_s16(v3321_tmp, v3320, 2);
+    int16x8_t v3322 = vaddq_s16(v3319, v3321);
+    int16x8_t v3323 = vqrdmulhq_n_s16(v3322, 21412);
+    int16x8_t v3324 = vaddq_s16(v3318, v3323);
+    int16x8_t v3325 = vqrdmulhq_n_s16(v3324, 17440);
+    int16x8_t v3326 = vaddq_s16(v3314, v3325);
+    int16x8_t v3327 = vsubq_s16(v1925, v1936);
+    int16x8_t v3328 = vsubq_s16(v1947, v1958);
+    int16x8_t v3329_tmp = vqrdmulhq_n_s16(v3328, 2773);
+    int16x8_t v3329 = vmlaq_n_s16(v3329_tmp, v3328, 4);
+    int16x8_t v3330 = vaddq_s16(v3327, v3329);
+    int16x8_t v3331 = vsubq_s16(v1971, v1982);
+    int16x8_t v3332 = vsubq_s16(v1993, v2004);
+    int16x8_t v3333_tmp = vqrdmulhq_n_s16(v3332, 2773);
+    int16x8_t v3333 = vmlaq_n_s16(v3333_tmp, v3332, 4);
+    int16x8_t v3334 = vaddq_s16(v3331, v3333);
+    int16x8_t v3335 = vqrdmulhq_n_s16(v3334, 21871);
+    int16x8_t v3336 = vaddq_s16(v3330, v3335);
+    int16x8_t v3337 = vsubq_s16(v2019, v2030);
+    int16x8_t v3338 = vsubq_s16(v2041, v2052);
+    int16x8_t v3339_tmp = vqrdmulhq_n_s16(v3338, 2773);
+    int16x8_t v3339 = vmlaq_n_s16(v3339_tmp, v3338, 4);
+    int16x8_t v3340 = vaddq_s16(v3337, v3339);
+    int16x8_t v3341 = vsubq_s16(v2065, v2076);
+    int16x8_t v3342 = vsubq_s16(v2087, v2098);
+    int16x8_t v3343_tmp = vqrdmulhq_n_s16(v3342, 2773);
+    int16x8_t v3343 = vmlaq_n_s16(v3343_tmp, v3342, 4);
+    int16x8_t v3344 = vaddq_s16(v3341, v3343);
+    int16x8_t v3345 = vqrdmulhq_n_s16(v3344, 21871);
+    int16x8_t v3346 = vaddq_s16(v3340, v3345);
+    int16x8_t v3347 = vqrdmulhq_n_s16(v3346, 17520);
+    int16x8_t v3348 = vaddq_s16(v3336, v3347);
+    int16x8_t v3349 = vsubq_s16(v1555, v1578);
+    int16x8_t v3350 = vsubq_s16(v1601, v1624);
+    int16x8_t v3351_tmp = vqrdmulhq_n_s16(v3350, 26108);
+    int16x8_t v3351 = vmlaq_n_s16(v3351_tmp, v3350, 6);
+    int16x8_t v3352 = vaddq_s16(v3349, v3351);
+    int16x8_t v3353 = vsubq_s16(v1649, v1672);
+    int16x8_t v3354 = vsubq_s16(v1695, v1718);
+    int16x8_t v3355_tmp = vqrdmulhq_n_s16(v3354, 26108);
+    int16x8_t v3355 = vmlaq_n_s16(v3355_tmp, v3354, 6);
+    int16x8_t v3356 = vaddq_s16(v3353, v3355);
+    int16x8_t v3357 = vqrdmulhq_n_s16(v3356, 22363);
+    int16x8_t v3358 = vaddq_s16(v3352, v3357);
+    int16x8_t v3359 = vsubq_s16(v1745, v1768);
+    int16x8_t v3360 = vsubq_s16(v1791, v1814);
+    int16x8_t v3361_tmp = vqrdmulhq_n_s16(v3360, 26108);
+    int16x8_t v3361 = vmlaq_n_s16(v3361_tmp, v3360, 6);
+    int16x8_t v3362 = vaddq_s16(v3359, v3361);
+    int16x8_t v3363 = vsubq_s16(v1839, v1862);
+    int16x8_t v3364 = vsubq_s16(v1885, v1908);
+    int16x8_t v3365_tmp = vqrdmulhq_n_s16(v3364, 26108);
+    int16x8_t v3365 = vmlaq_n_s16(v3365_tmp, v3364, 6);
+    int16x8_t v3366 = vaddq_s16(v3363, v3365);
+    int16x8_t v3367 = vqrdmulhq_n_s16(v3366, 22363);
+    int16x8_t v3368 = vaddq_s16(v3362, v3367);
+    int16x8_t v3369 = vqrdmulhq_n_s16(v3368, 17603);
+    int16x8_t v3370 = vaddq_s16(v3358, v3369);
+    int16x8_t v3371 = vsubq_s16(v61, v140);
+    int16x8_t v3372 = vsubq_s16(v234, v314);
+    int16x8_t v3373_tmp = vqrdmulhq_n_s16(v3372, 12251);
+    int16x8_t v3373 = vmlaq_n_s16(v3373_tmp, v3372, 20);
+    int16x8_t v3374 = vaddq_s16(v3371, v3373);
+    int16x8_t v3375 = vsubq_s16(v410, v521);
+    int16x8_t v3376 = vsubq_s16(v615, v696);
+    int16x8_t v3377_tmp = vqrdmulhq_n_s16(v3376, 12251);
+    int16x8_t v3377 = vmlaq_n_s16(v3377_tmp, v3376, 20);
+    int16x8_t v3378 = vaddq_s16(v3375, v3377);
+    int16x8_t v3379 = vqrdmulhq_n_s16(v3378, 22891);
+    int16x8_t v3380 = vaddq_s16(v3374, v3379);
+    int16x8_t v3381 = vsubq_s16(v794, v905);
+    int16x8_t v3382 = vsubq_s16(v1061, v1143);
+    int16x8_t v3383_tmp = vqrdmulhq_n_s16(v3382, 12251);
+    int16x8_t v3383 = vmlaq_n_s16(v3383_tmp, v3382, 20);
+    int16x8_t v3384 = vaddq_s16(v3381, v3383);
+    int16x8_t v3385 = vsubq_s16(v1239, v1350);
+    int16x8_t v3386 = vsubq_s16(v1444, v1526);
+    int16x8_t v3387_tmp = vqrdmulhq_n_s16(v3386, 12251);
+    int16x8_t v3387 = vmlaq_n_s16(v3387_tmp, v3386, 20);
+    int16x8_t v3388 = vaddq_s16(v3385, v3387);
+    int16x8_t v3389 = vqrdmulhq_n_s16(v3388, 22891);
+    int16x8_t v3390 = vaddq_s16(v3384, v3389);
+    int16x8_t v3391 = vqrdmulhq_n_s16(v3390, 17689);
+    int16x8_t v3392 = vaddq_s16(v3380, v3391);
+    int16x8_t v3393 = vsubq_s16(v3371, v3373);
+    int16x8_t v3394 = vsubq_s16(v3375, v3377);
+    int16x8_t v3395 = vqrdmulhq_n_s16(v3394, 23460);
+    int16x8_t v3396 = vaddq_s16(v3393, v3395);
+    int16x8_t v3397 = vsubq_s16(v3381, v3383);
+    int16x8_t v3398 = vsubq_s16(v3385, v3387);
+    int16x8_t v3399 = vqrdmulhq_n_s16(v3398, 23460);
+    int16x8_t v3400 = vaddq_s16(v3397, v3399);
+    int16x8_t v3401 = vqrdmulhq_n_s16(v3400, 17779);
+    int16x8_t v3402 = vaddq_s16(v3396, v3401);
+    int16x8_t v3403 = vsubq_s16(v3349, v3351);
+    int16x8_t v3404 = vsubq_s16(v3353, v3355);
+    int16x8_t v3405 = vqrdmulhq_n_s16(v3404, 24073);
+    int16x8_t v3406 = vaddq_s16(v3403, v3405);
+    int16x8_t v3407 = vsubq_s16(v3359, v3361);
+    int16x8_t v3408 = vsubq_s16(v3363, v3365);
+    int16x8_t v3409 = vqrdmulhq_n_s16(v3408, 24073);
+    int16x8_t v3410 = vaddq_s16(v3407, v3409);
+    int16x8_t v3411 = vqrdmulhq_n_s16(v3410, 17873);
+    int16x8_t v3412 = vaddq_s16(v3406, v3411);
+    int16x8_t v3413 = vsubq_s16(v3327, v3329);
+    int16x8_t v3414 = vsubq_s16(v3331, v3333);
+    int16x8_t v3415 = vqrdmulhq_n_s16(v3414, 24734);
+    int16x8_t v3416 = vaddq_s16(v3413, v3415);
+    int16x8_t v3417 = vsubq_s16(v3337, v3339);
+    int16x8_t v3418 = vsubq_s16(v3341, v3343);
+    int16x8_t v3419 = vqrdmulhq_n_s16(v3418, 24734);
+    int16x8_t v3420 = vaddq_s16(v3417, v3419);
+    int16x8_t v3421 = vqrdmulhq_n_s16(v3420, 17971);
+    int16x8_t v3422 = vaddq_s16(v3416, v3421);
+    int16x8_t v3423 = vsubq_s16(v3305, v3307);
+    int16x8_t v3424 = vsubq_s16(v3309, v3311);
+    int16x8_t v3425 = vqrdmulhq_n_s16(v3424, 25448);
+    int16x8_t v3426 = vaddq_s16(v3423, v3425);
+    int16x8_t v3427 = vsubq_s16(v3315, v3317);
+    int16x8_t v3428 = vsubq_s16(v3319, v3321);
+    int16x8_t v3429 = vqrdmulhq_n_s16(v3428, 25448);
+    int16x8_t v3430 = vaddq_s16(v3427, v3429);
+    int16x8_t v3431 = vqrdmulhq_n_s16(v3430, 18072);
+    int16x8_t v3432 = vaddq_s16(v3426, v3431);
+    int16x8_t v3433 = vsubq_s16(v3283, v3285);
+    int16x8_t v3434 = vsubq_s16(v3287, v3289);
+    int16x8_t v3435 = vqrdmulhq_n_s16(v3434, 26220);
+    int16x8_t v3436 = vaddq_s16(v3433, v3435);
+    int16x8_t v3437 = vsubq_s16(v3293, v3295);
+    int16x8_t v3438 = vsubq_s16(v3297, v3299);
+    int16x8_t v3439 = vqrdmulhq_n_s16(v3438, 26220);
+    int16x8_t v3440 = vaddq_s16(v3437, v3439);
+    int16x8_t v3441 = vqrdmulhq_n_s16(v3440, 18177);
+    int16x8_t v3442 = vaddq_s16(v3436, v3441);
+    int16x8_t v3443 = vsubq_s16(v3261, v3263);
+    int16x8_t v3444 = vsubq_s16(v3265, v3267);
+    int16x8_t v3445 = vqrdmulhq_n_s16(v3444, 27058);
+    int16x8_t v3446 = vaddq_s16(v3443, v3445);
+    int16x8_t v3447 = vsubq_s16(v3271, v3273);
+    int16x8_t v3448 = vsubq_s16(v3275, v3277);
+    int16x8_t v3449 = vqrdmulhq_n_s16(v3448, 27058);
+    int16x8_t v3450 = vaddq_s16(v3447, v3449);
+    int16x8_t v3451 = vqrdmulhq_n_s16(v3450, 18286);
+    int16x8_t v3452 = vaddq_s16(v3446, v3451);
+    int16x8_t v3453 = vsubq_s16(v3239, v3241);
+    int16x8_t v3454 = vsubq_s16(v3243, v3245);
+    int16x8_t v3455 = vqrdmulhq_n_s16(v3454, 27969);
+    int16x8_t v3456 = vaddq_s16(v3453, v3455);
+    int16x8_t v3457 = vsubq_s16(v3249, v3251);
+    int16x8_t v3458 = vsubq_s16(v3253, v3255);
+    int16x8_t v3459 = vqrdmulhq_n_s16(v3458, 27969);
+    int16x8_t v3460 = vaddq_s16(v3457, v3459);
+    int16x8_t v3461 = vqrdmulhq_n_s16(v3460, 18400);
+    int16x8_t v3462 = vaddq_s16(v3456, v3461);
+    int16x8_t v3463 = vsubq_s16(v3217, v3219);
+    int16x8_t v3464 = vsubq_s16(v3221, v3223);
+    int16x8_t v3465 = vqrdmulhq_n_s16(v3464, 28961);
+    int16x8_t v3466 = vaddq_s16(v3463, v3465);
+    int16x8_t v3467 = vsubq_s16(v3227, v3229);
+    int16x8_t v3468 = vsubq_s16(v3231, v3233);
+    int16x8_t v3469 = vqrdmulhq_n_s16(v3468, 28961);
+    int16x8_t v3470 = vaddq_s16(v3467, v3469);
+    int16x8_t v3471 = vqrdmulhq_n_s16(v3470, 18517);
+    int16x8_t v3472 = vaddq_s16(v3466, v3471);
+    int16x8_t v3473 = vsubq_s16(v3195, v3197);
+    int16x8_t v3474 = vsubq_s16(v3199, v3201);
+    int16x8_t v3475 = vqrdmulhq_n_s16(v3474, 30044);
+    int16x8_t v3476 = vaddq_s16(v3473, v3475);
+    int16x8_t v3477 = vsubq_s16(v3205, v3207);
+    int16x8_t v3478 = vsubq_s16(v3209, v3211);
+    int16x8_t v3479 = vqrdmulhq_n_s16(v3478, 30044);
+    int16x8_t v3480 = vaddq_s16(v3477, v3479);
+    int16x8_t v3481 = vqrdmulhq_n_s16(v3480, 18639);
+    int16x8_t v3482 = vaddq_s16(v3476, v3481);
+    int16x8_t v3483 = vsubq_s16(v3173, v3175);
+    int16x8_t v3484 = vsubq_s16(v3177, v3179);
+    int16x8_t v3485 = vqrdmulhq_n_s16(v3484, 31232);
+    int16x8_t v3486 = vaddq_s16(v3483, v3485);
+    int16x8_t v3487 = vsubq_s16(v3183, v3185);
+    int16x8_t v3488 = vsubq_s16(v3187, v3189);
+    int16x8_t v3489 = vqrdmulhq_n_s16(v3488, 31232);
+    int16x8_t v3490 = vaddq_s16(v3487, v3489);
+    int16x8_t v3491 = vqrdmulhq_n_s16(v3490, 18765);
+    int16x8_t v3492 = vaddq_s16(v3486, v3491);
+    int16x8_t v3493 = vsubq_s16(v3151, v3153);
+    int16x8_t v3494 = vsubq_s16(v3155, v3157);
+    int16x8_t v3495 = vqrdmulhq_n_s16(v3494, 32538);
+    int16x8_t v3496 = vaddq_s16(v3493, v3495);
+    int16x8_t v3497 = vsubq_s16(v3161, v3163);
+    int16x8_t v3498 = vsubq_s16(v3165, v3167);
+    int16x8_t v3499 = vqrdmulhq_n_s16(v3498, 32538);
+    int16x8_t v3500 = vaddq_s16(v3497, v3499);
+    int16x8_t v3501 = vqrdmulhq_n_s16(v3500, 18896);
+    int16x8_t v3502 = vaddq_s16(v3496, v3501);
+    int16x8_t v3503 = vsubq_s16(v3129, v3131);
+    int16x8_t v3504 = vsubq_s16(v3133, v3135);
+    int16x8_t v3505_tmp = vqrdmulhq_n_s16(v3504, 1211);
+    int16x8_t v3505 = vaddq_s16(v3505_tmp, v3504);
+    int16x8_t v3506 = vaddq_s16(v3503, v3505);
+    int16x8_t v3507 = vsubq_s16(v3139, v3141);
+    int16x8_t v3508 = vsubq_s16(v3143, v3145);
+    int16x8_t v3509_tmp = vqrdmulhq_n_s16(v3508, 1211);
+    int16x8_t v3509 = vaddq_s16(v3509_tmp, v3508);
+    int16x8_t v3510 = vaddq_s16(v3507, v3509);
+    int16x8_t v3511 = vqrdmulhq_n_s16(v3510, 19032);
+    int16x8_t v3512 = vaddq_s16(v3506, v3511);
+    int16x8_t v3513 = vsubq_s16(v3107, v3109);
+    int16x8_t v3514 = vsubq_s16(v3111, v3113);
+    int16x8_t v3515_tmp = vqrdmulhq_n_s16(v3514, 2808);
+    int16x8_t v3515 = vaddq_s16(v3515_tmp, v3514);
+    int16x8_t v3516 = vaddq_s16(v3513, v3515);
+    int16x8_t v3517 = vsubq_s16(v3117, v3119);
+    int16x8_t v3518 = vsubq_s16(v3121, v3123);
+    int16x8_t v3519_tmp = vqrdmulhq_n_s16(v3518, 2808);
+    int16x8_t v3519 = vaddq_s16(v3519_tmp, v3518);
+    int16x8_t v3520 = vaddq_s16(v3517, v3519);
+    int16x8_t v3521 = vqrdmulhq_n_s16(v3520, 19172);
+    int16x8_t v3522 = vaddq_s16(v3516, v3521);
+    int16x8_t v3523 = vsubq_s16(v3085, v3087);
+    int16x8_t v3524 = vsubq_s16(v3089, v3091);
+    int16x8_t v3525_tmp = vqrdmulhq_n_s16(v3524, 4586);
+    int16x8_t v3525 = vaddq_s16(v3525_tmp, v3524);
+    int16x8_t v3526 = vaddq_s16(v3523, v3525);
+    int16x8_t v3527 = vsubq_s16(v3095, v3097);
+    int16x8_t v3528 = vsubq_s16(v3099, v3101);
+    int16x8_t v3529_tmp = vqrdmulhq_n_s16(v3528, 4586);
+    int16x8_t v3529 = vaddq_s16(v3529_tmp, v3528);
+    int16x8_t v3530 = vaddq_s16(v3527, v3529);
+    int16x8_t v3531 = vqrdmulhq_n_s16(v3530, 19318);
+    int16x8_t v3532 = vaddq_s16(v3526, v3531);
+    int16x8_t v3533 = vsubq_s16(v3063, v3065);
+    int16x8_t v3534 = vsubq_s16(v3067, v3069);
+    int16x8_t v3535_tmp = vqrdmulhq_n_s16(v3534, 6576);
+    int16x8_t v3535 = vaddq_s16(v3535_tmp, v3534);
+    int16x8_t v3536 = vaddq_s16(v3533, v3535);
+    int16x8_t v3537 = vsubq_s16(v3073, v3075);
+    int16x8_t v3538 = vsubq_s16(v3077, v3079);
+    int16x8_t v3539_tmp = vqrdmulhq_n_s16(v3538, 6576);
+    int16x8_t v3539 = vaddq_s16(v3539_tmp, v3538);
+    int16x8_t v3540 = vaddq_s16(v3537, v3539);
+    int16x8_t v3541 = vqrdmulhq_n_s16(v3540, 19469);
+    int16x8_t v3542 = vaddq_s16(v3536, v3541);
+    int16x8_t v3543 = vsubq_s16(v3041, v3043);
+    int16x8_t v3544 = vsubq_s16(v3045, v3047);
+    int16x8_t v3545_tmp = vqrdmulhq_n_s16(v3544, 8817);
+    int16x8_t v3545 = vaddq_s16(v3545_tmp, v3544);
+    int16x8_t v3546 = vaddq_s16(v3543, v3545);
+    int16x8_t v3547 = vsubq_s16(v3051, v3053);
+    int16x8_t v3548 = vsubq_s16(v3055, v3057);
+    int16x8_t v3549_tmp = vqrdmulhq_n_s16(v3548, 8817);
+    int16x8_t v3549 = vaddq_s16(v3549_tmp, v3548);
+    int16x8_t v3550 = vaddq_s16(v3547, v3549);
+    int16x8_t v3551 = vqrdmulhq_n_s16(v3550, 19625);
+    int16x8_t v3552 = vaddq_s16(v3546, v3551);
+    int16x8_t v3553 = vsubq_s16(v2998, v3003);
+    int16x8_t v3554 = vsubq_s16(v3008, v3013);
+    int16x8_t v3555_tmp = vqrdmulhq_n_s16(v3554, 11356);
+    int16x8_t v3555 = vaddq_s16(v3555_tmp, v3554);
+    int16x8_t v3556 = vaddq_s16(v3553, v3555);
+    int16x8_t v3557 = vsubq_s16(v3020, v3025);
+    int16x8_t v3558 = vsubq_s16(v3030, v3035);
+    int16x8_t v3559_tmp = vqrdmulhq_n_s16(v3558, 11356);
+    int16x8_t v3559 = vaddq_s16(v3559_tmp, v3558);
+    int16x8_t v3560 = vaddq_s16(v3557, v3559);
+    int16x8_t v3561 = vqrdmulhq_n_s16(v3560, 19786);
+    int16x8_t v3562 = vaddq_s16(v3556, v3561);
+    int16x8_t v3563 = vsubq_s16(v2952, v2957);
+    int16x8_t v3564 = vsubq_s16(v2962, v2967);
+    int16x8_t v3565_tmp = vqrdmulhq_n_s16(v3564, 14256);
+    int16x8_t v3565 = vaddq_s16(v3565_tmp, v3564);
+    int16x8_t v3566 = vaddq_s16(v3563, v3565);
+    int16x8_t v3567 = vsubq_s16(v2974, v2979);
+    int16x8_t v3568 = vsubq_s16(v2984, v2989);
+    int16x8_t v3569_tmp = vqrdmulhq_n_s16(v3568, 14256);
+    int16x8_t v3569 = vaddq_s16(v3569_tmp, v3568);
+    int16x8_t v3570 = vaddq_s16(v3567, v3569);
+    int16x8_t v3571 = vqrdmulhq_n_s16(v3570, 19954);
+    int16x8_t v3572 = vaddq_s16(v3566, v3571);
+    int16x8_t v3573 = vsubq_s16(v2906, v2911);
+    int16x8_t v3574 = vsubq_s16(v2916, v2921);
+    int16x8_t v3575_tmp = vqrdmulhq_n_s16(v3574, 17596);
+    int16x8_t v3575 = vaddq_s16(v3575_tmp, v3574);
+    int16x8_t v3576 = vaddq_s16(v3573, v3575);
+    int16x8_t v3577 = vsubq_s16(v2928, v2933);
+    int16x8_t v3578 = vsubq_s16(v2938, v2943);
+    int16x8_t v3579_tmp = vqrdmulhq_n_s16(v3578, 17596);
+    int16x8_t v3579 = vaddq_s16(v3579_tmp, v3578);
+    int16x8_t v3580 = vaddq_s16(v3577, v3579);
+    int16x8_t v3581 = vqrdmulhq_n_s16(v3580, 20127);
+    int16x8_t v3582 = vaddq_s16(v3576, v3581);
+    int16x8_t v3583 = vsubq_s16(v2860, v2865);
+    int16x8_t v3584 = vsubq_s16(v2870, v2875);
+    int16x8_t v3585_tmp = vqrdmulhq_n_s16(v3584, 21483);
+    int16x8_t v3585 = vaddq_s16(v3585_tmp, v3584);
+    int16x8_t v3586 = vaddq_s16(v3583, v3585);
+    int16x8_t v3587 = vsubq_s16(v2882, v2887);
+    int16x8_t v3588 = vsubq_s16(v2892, v2897);
+    int16x8_t v3589_tmp = vqrdmulhq_n_s16(v3588, 21483);
+    int16x8_t v3589 = vaddq_s16(v3589_tmp, v3588);
+    int16x8_t v3590 = vaddq_s16(v3587, v3589);
+    int16x8_t v3591 = vqrdmulhq_n_s16(v3590, 20306);
+    int16x8_t v3592 = vaddq_s16(v3586, v3591);
+    int16x8_t v3593 = vsubq_s16(v2814, v2819);
+    int16x8_t v3594 = vsubq_s16(v2824, v2829);
+    int16x8_t v3595_tmp = vqrdmulhq_n_s16(v3594, 26057);
+    int16x8_t v3595 = vaddq_s16(v3595_tmp, v3594);
+    int16x8_t v3596 = vaddq_s16(v3593, v3595);
+    int16x8_t v3597 = vsubq_s16(v2836, v2841);
+    int16x8_t v3598 = vsubq_s16(v2846, v2851);
+    int16x8_t v3599_tmp = vqrdmulhq_n_s16(v3598, 26057);
+    int16x8_t v3599 = vaddq_s16(v3599_tmp, v3598);
+    int16x8_t v3600 = vaddq_s16(v3597, v3599);
+    int16x8_t v3601 = vqrdmulhq_n_s16(v3600, 20492);
+    int16x8_t v3602 = vaddq_s16(v3596, v3601);
+    int16x8_t v3603 = vsubq_s16(v2768, v2773);
+    int16x8_t v3604 = vsubq_s16(v2778, v2783);
+    int16x8_t v3605_tmp = vqrdmulhq_n_s16(v3604, 31517);
+    int16x8_t v3605 = vaddq_s16(v3605_tmp, v3604);
+    int16x8_t v3606 = vaddq_s16(v3603, v3605);
+    int16x8_t v3607 = vsubq_s16(v2790, v2795);
+    int16x8_t v3608 = vsubq_s16(v2800, v2805);
+    int16x8_t v3609_tmp = vqrdmulhq_n_s16(v3608, 31517);
+    int16x8_t v3609 = vaddq_s16(v3609_tmp, v3608);
+    int16x8_t v3610 = vaddq_s16(v3607, v3609);
+    int16x8_t v3611 = vqrdmulhq_n_s16(v3610, 20684);
+    int16x8_t v3612 = vaddq_s16(v3606, v3611);
+    int16x8_t v3613 = vsubq_s16(v2722, v2727);
+    int16x8_t v3614 = vsubq_s16(v2732, v2737);
+    int16x8_t v3615_tmp = vqrdmulhq_n_s16(v3614, 5373);
+    int16x8_t v3615 = vmlaq_n_s16(v3615_tmp, v3614, 2);
+    int16x8_t v3616 = vaddq_s16(v3613, v3615);
+    int16x8_t v3617 = vsubq_s16(v2744, v2749);
+    int16x8_t v3618 = vsubq_s16(v2754, v2759);
+    int16x8_t v3619_tmp = vqrdmulhq_n_s16(v3618, 5373);
+    int16x8_t v3619 = vmlaq_n_s16(v3619_tmp, v3618, 2);
+    int16x8_t v3620 = vaddq_s16(v3617, v3619);
+    int16x8_t v3621 = vqrdmulhq_n_s16(v3620, 20883);
+    int16x8_t v3622 = vaddq_s16(v3616, v3621);
+    int16x8_t v3623 = vsubq_s16(v2676, v2681);
+    int16x8_t v3624 = vsubq_s16(v2686, v2691);
+    int16x8_t v3625_tmp = vqrdmulhq_n_s16(v3624, 13571);
+    int16x8_t v3625 = vmlaq_n_s16(v3625_tmp, v3624, 2);
+    int16x8_t v3626 = vaddq_s16(v3623, v3625);
+    int16x8_t v3627 = vsubq_s16(v2698, v2703);
+    int16x8_t v3628 = vsubq_s16(v2708, v2713);
+    int16x8_t v3629_tmp = vqrdmulhq_n_s16(v3628, 13571);
+    int16x8_t v3629 = vmlaq_n_s16(v3629_tmp, v3628, 2);
+    int16x8_t v3630 = vaddq_s16(v3627, v3629);
+    int16x8_t v3631 = vqrdmulhq_n_s16(v3630, 21089);
+    int16x8_t v3632 = vaddq_s16(v3626, v3631);
+    int16x8_t v3633 = vsubq_s16(v2588, v2599);
+    int16x8_t v3634 = vsubq_s16(v2610, v2621);
+    int16x8_t v3635_tmp = vqrdmulhq_n_s16(v3634, 23975);
+    int16x8_t v3635 = vmlaq_n_s16(v3635_tmp, v3634, 2);
+    int16x8_t v3636 = vaddq_s16(v3633, v3635);
+    int16x8_t v3637 = vsubq_s16(v2634, v2645);
+    int16x8_t v3638 = vsubq_s16(v2656, v2667);
+    int16x8_t v3639_tmp = vqrdmulhq_n_s16(v3638, 23975);
+    int16x8_t v3639 = vmlaq_n_s16(v3639_tmp, v3638, 2);
+    int16x8_t v3640 = vaddq_s16(v3637, v3639);
+    int16x8_t v3641 = vqrdmulhq_n_s16(v3640, 21303);
+    int16x8_t v3642 = vaddq_s16(v3636, v3641);
+    int16x8_t v3643 = vsubq_s16(v2494, v2505);
+    int16x8_t v3644 = vsubq_s16(v2516, v2527);
+    int16x8_t v3645_tmp = vqrdmulhq_n_s16(v3644, 4832);
+    int16x8_t v3645 = vmlaq_n_s16(v3645_tmp, v3644, 3);
+    int16x8_t v3646 = vaddq_s16(v3643, v3645);
+    int16x8_t v3647 = vsubq_s16(v2540, v2551);
+    int16x8_t v3648 = vsubq_s16(v2562, v2573);
+    int16x8_t v3649_tmp = vqrdmulhq_n_s16(v3648, 4832);
+    int16x8_t v3649 = vmlaq_n_s16(v3649_tmp, v3648, 3);
+    int16x8_t v3650 = vaddq_s16(v3647, v3649);
+    int16x8_t v3651 = vqrdmulhq_n_s16(v3650, 21524);
+    int16x8_t v3652 = vaddq_s16(v3646, v3651);
+    int16x8_t v3653 = vsubq_s16(v2399, v2410);
+    int16x8_t v3654 = vsubq_s16(v2421, v2432);
+    int16x8_t v3655_tmp = vqrdmulhq_n_s16(v3654, 23437);
+    int16x8_t v3655 = vmlaq_n_s16(v3655_tmp, v3654, 3);
+    int16x8_t v3656 = vaddq_s16(v3653, v3655);
+    int16x8_t v3657 = vsubq_s16(v2445, v2456);
+    int16x8_t v3658 = vsubq_s16(v2468, v2479);
+    int16x8_t v3659_tmp = vqrdmulhq_n_s16(v3658, 23437);
+    int16x8_t v3659 = vmlaq_n_s16(v3659_tmp, v3658, 3);
+    int16x8_t v3660 = vaddq_s16(v3657, v3659);
+    int16x8_t v3661 = vqrdmulhq_n_s16(v3660, 21753);
+    int16x8_t v3662 = vaddq_s16(v3656, v3661);
+    int16x8_t v3663 = vsubq_s16(v2305, v2316);
+    int16x8_t v3664 = vsubq_s16(v2327, v2338);
+    int16x8_t v3665_tmp = vqrdmulhq_n_s16(v3664, 17573);
+    int16x8_t v3665 = vmlaq_n_s16(v3665_tmp, v3664, 4);
+    int16x8_t v3666 = vaddq_s16(v3663, v3665);
+    int16x8_t v3667 = vsubq_s16(v2351, v2362);
+    int16x8_t v3668 = vsubq_s16(v2373, v2384);
+    int16x8_t v3669_tmp = vqrdmulhq_n_s16(v3668, 17573);
+    int16x8_t v3669 = vmlaq_n_s16(v3669_tmp, v3668, 4);
+    int16x8_t v3670 = vaddq_s16(v3667, v3669);
+    int16x8_t v3671 = vqrdmulhq_n_s16(v3670, 21990);
+    int16x8_t v3672 = vaddq_s16(v3666, v3671);
+    int16x8_t v3673 = vsubq_s16(v2127, v2150);
+    int16x8_t v3674 = vsubq_s16(v2173, v2196);
+    int16x8_t v3675_tmp = vqrdmulhq_n_s16(v3674, 27122);
+    int16x8_t v3675 = vmlaq_n_s16(v3675_tmp, v3674, 5);
+    int16x8_t v3676 = vaddq_s16(v3673, v3675);
+    int16x8_t v3677 = vsubq_s16(v2221, v2244);
+    int16x8_t v3678 = vsubq_s16(v2267, v2290);
+    int16x8_t v3679_tmp = vqrdmulhq_n_s16(v3678, 27122);
+    int16x8_t v3679 = vmlaq_n_s16(v3679_tmp, v3678, 5);
+    int16x8_t v3680 = vaddq_s16(v3677, v3679);
+    int16x8_t v3681 = vqrdmulhq_n_s16(v3680, 22236);
+    int16x8_t v3682 = vaddq_s16(v3676, v3681);
+    int16x8_t v3683 = vsubq_s16(v1937, v1960);
+    int16x8_t v3684 = vsubq_s16(v1983, v2006);
+    int16x8_t v3685_tmp = vqrdmulhq_n_s16(v3684, 5041);
+    int16x8_t v3685 = vmlaq_n_s16(v3685_tmp, v3684, 8);
+    int16x8_t v3686 = vaddq_s16(v3683, v3685);
+    int16x8_t v3687 = vsubq_s16(v2031, v2054);
+    int16x8_t v3688 = vsubq_s16(v2077, v2100);
+    int16x8_t v3689_tmp = vqrdmulhq_n_s16(v3688, 5041);
+    int16x8_t v3689 = vmlaq_n_s16(v3689_tmp, v3688, 8);
+    int16x8_t v3690 = vaddq_s16(v3687, v3689);
+    int16x8_t v3691 = vqrdmulhq_n_s16(v3690, 22491);
+    int16x8_t v3692 = vaddq_s16(v3686, v3691);
+    int16x8_t v3693 = vsubq_s16(v1579, v1626);
+    int16x8_t v3694 = vsubq_s16(v1673, v1720);
+    int16x8_t v3695_tmp = vqrdmulhq_n_s16(v3694, 19146);
+    int16x8_t v3695 = vmlaq_n_s16(v3695_tmp, v3694, 13);
+    int16x8_t v3696 = vaddq_s16(v3693, v3695);
+    int16x8_t v3697 = vsubq_s16(v1769, v1816);
+    int16x8_t v3698 = vsubq_s16(v1863, v1910);
+    int16x8_t v3699_tmp = vqrdmulhq_n_s16(v3698, 19146);
+    int16x8_t v3699 = vmlaq_n_s16(v3699_tmp, v3698, 13);
+    int16x8_t v3700 = vaddq_s16(v3697, v3699);
+    int16x8_t v3701 = vqrdmulhq_n_s16(v3700, 22755);
+    int16x8_t v3702 = vaddq_s16(v3696, v3701);
+    int16x8_t v3703 = vsubq_s16(v141, v316);
+    int16x8_t v3704 = vsubq_s16(v522, v698);
+    int16x8_t v3705_tmp = vqrdmulhq_n_s16(v3704, 24402);
+    int16x8_t v3705 = vmlaq_n_s16(v3705_tmp, v3704, 40);
+    int16x8_t v3706 = vaddq_s16(v3703, v3705);
+    int16x8_t v3707 = vsubq_s16(v906, v1145);
+    int16x8_t v3708 = vsubq_s16(v1351, v1528);
+    int16x8_t v3709_tmp = vqrdmulhq_n_s16(v3708, 24402);
+    int16x8_t v3709 = vmlaq_n_s16(v3709_tmp, v3708, 40);
+    int16x8_t v3710 = vaddq_s16(v3707, v3709);
+    int16x8_t v3711 = vqrdmulhq_n_s16(v3710, 23030);
+    int16x8_t v3712 = vaddq_s16(v3706, v3711);
+    int16x8_t v3713 = vsubq_s16(v3703, v3705);
+    int16x8_t v3714 = vsubq_s16(v3707, v3709);
+    int16x8_t v3715 = vqrdmulhq_n_s16(v3714, 23314);
+    int16x8_t v3716 = vaddq_s16(v3713, v3715);
+    int16x8_t v3717 = vsubq_s16(v3693, v3695);
+    int16x8_t v3718 = vsubq_s16(v3697, v3699);
+    int16x8_t v3719 = vqrdmulhq_n_s16(v3718, 23609);
+    int16x8_t v3720 = vaddq_s16(v3717, v3719);
+    int16x8_t v3721 = vsubq_s16(v3683, v3685);
+    int16x8_t v3722 = vsubq_s16(v3687, v3689);
+    int16x8_t v3723 = vqrdmulhq_n_s16(v3722, 23915);
+    int16x8_t v3724 = vaddq_s16(v3721, v3723);
+    int16x8_t v3725 = vsubq_s16(v3673, v3675);
+    int16x8_t v3726 = vsubq_s16(v3677, v3679);
+    int16x8_t v3727 = vqrdmulhq_n_s16(v3726, 24233);
+    int16x8_t v3728 = vaddq_s16(v3725, v3727);
+    int16x8_t v3729 = vsubq_s16(v3663, v3665);
+    int16x8_t v3730 = vsubq_s16(v3667, v3669);
+    int16x8_t v3731 = vqrdmulhq_n_s16(v3730, 24564);
+    int16x8_t v3732 = vaddq_s16(v3729, v3731);
+    int16x8_t v3733 = vsubq_s16(v3653, v3655);
+    int16x8_t v3734 = vsubq_s16(v3657, v3659);
+    int16x8_t v3735 = vqrdmulhq_n_s16(v3734, 24907);
+    int16x8_t v3736 = vaddq_s16(v3733, v3735);
+    int16x8_t v3737 = vsubq_s16(v3643, v3645);
+    int16x8_t v3738 = vsubq_s16(v3647, v3649);
+    int16x8_t v3739 = vqrdmulhq_n_s16(v3738, 25264);
+    int16x8_t v3740 = vaddq_s16(v3737, v3739);
+    int16x8_t v3741 = vsubq_s16(v3633, v3635);
+    int16x8_t v3742 = vsubq_s16(v3637, v3639);
+    int16x8_t v3743 = vqrdmulhq_n_s16(v3742, 25635);
+    int16x8_t v3744 = vaddq_s16(v3741, v3743);
+    int16x8_t v3745 = vsubq_s16(v3623, v3625);
+    int16x8_t v3746 = vsubq_s16(v3627, v3629);
+    int16x8_t v3747 = vqrdmulhq_n_s16(v3746, 26021);
+    int16x8_t v3748 = vaddq_s16(v3745, v3747);
+    int16x8_t v3749 = vsubq_s16(v3613, v3615);
+    int16x8_t v3750 = vsubq_s16(v3617, v3619);
+    int16x8_t v3751 = vqrdmulhq_n_s16(v3750, 26423);
+    int16x8_t v3752 = vaddq_s16(v3749, v3751);
+    int16x8_t v3753 = vsubq_s16(v3603, v3605);
+    int16x8_t v3754 = vsubq_s16(v3607, v3609);
+    int16x8_t v3755 = vqrdmulhq_n_s16(v3754, 26842);
+    int16x8_t v3756 = vaddq_s16(v3753, v3755);
+    int16x8_t v3757 = vsubq_s16(v3593, v3595);
+    int16x8_t v3758 = vsubq_s16(v3597, v3599);
+    int16x8_t v3759 = vqrdmulhq_n_s16(v3758, 27279);
+    int16x8_t v3760 = vaddq_s16(v3757, v3759);
+    int16x8_t v3761 = vsubq_s16(v3583, v3585);
+    int16x8_t v3762 = vsubq_s16(v3587, v3589);
+    int16x8_t v3763 = vqrdmulhq_n_s16(v3762, 27734);
+    int16x8_t v3764 = vaddq_s16(v3761, v3763);
+    int16x8_t v3765 = vsubq_s16(v3573, v3575);
+    int16x8_t v3766 = vsubq_s16(v3577, v3579);
+    int16x8_t v3767 = vqrdmulhq_n_s16(v3766, 28209);
+    int16x8_t v3768 = vaddq_s16(v3765, v3767);
+    int16x8_t v3769 = vsubq_s16(v3563, v3565);
+    int16x8_t v3770 = vsubq_s16(v3567, v3569);
+    int16x8_t v3771 = vqrdmulhq_n_s16(v3770, 28705);
+    int16x8_t v3772 = vaddq_s16(v3769, v3771);
+    int16x8_t v3773 = vsubq_s16(v3553, v3555);
+    int16x8_t v3774 = vsubq_s16(v3557, v3559);
+    int16x8_t v3775 = vqrdmulhq_n_s16(v3774, 29223);
+    int16x8_t v3776 = vaddq_s16(v3773, v3775);
+    int16x8_t v3777 = vsubq_s16(v3543, v3545);
+    int16x8_t v3778 = vsubq_s16(v3547, v3549);
+    int16x8_t v3779 = vqrdmulhq_n_s16(v3778, 29764);
+    int16x8_t v3780 = vaddq_s16(v3777, v3779);
+    int16x8_t v3781 = vsubq_s16(v3533, v3535);
+    int16x8_t v3782 = vsubq_s16(v3537, v3539);
+    int16x8_t v3783 = vqrdmulhq_n_s16(v3782, 30331);
+    int16x8_t v3784 = vaddq_s16(v3781, v3783);
+    int16x8_t v3785 = vsubq_s16(v3523, v3525);
+    int16x8_t v3786 = vsubq_s16(v3527, v3529);
+    int16x8_t v3787 = vqrdmulhq_n_s16(v3786, 30925);
+    int16x8_t v3788 = vaddq_s16(v3785, v3787);
+    int16x8_t v3789 = vsubq_s16(v3513, v3515);
+    int16x8_t v3790 = vsubq_s16(v3517, v3519);
+    int16x8_t v3791 = vqrdmulhq_n_s16(v3790, 31547);
+    int16x8_t v3792 = vaddq_s16(v3789, v3791);
+    int16x8_t v3793 = vsubq_s16(v3503, v3505);
+    int16x8_t v3794 = vsubq_s16(v3507, v3509);
+    int16x8_t v3795 = vqrdmulhq_n_s16(v3794, 32199);
+    int16x8_t v3796 = vaddq_s16(v3793, v3795);
+    int16x8_t v3797 = vsubq_s16(v3493, v3495);
+    int16x8_t v3798 = vsubq_s16(v3497, v3499);
+    int16x8_t v3799_tmp = vqrdmulhq_n_s16(v3798, 117);
+    int16x8_t v3799 = vaddq_s16(v3799_tmp, v3798);
+    int16x8_t v3800 = vaddq_s16(v3797, v3799);
+    int16x8_t v3801 = vsubq_s16(v3483, v3485);
+    int16x8_t v3802 = vsubq_s16(v3487, v3489);
+    int16x8_t v3803_tmp = vqrdmulhq_n_s16(v3802, 837);
+    int16x8_t v3803 = vaddq_s16(v3803_tmp, v3802);
+    int16x8_t v3804 = vaddq_s16(v3801, v3803);
+    int16x8_t v3805 = vsubq_s16(v3473, v3475);
+    int16x8_t v3806 = vsubq_s16(v3477, v3479);
+    int16x8_t v3807_tmp = vqrdmulhq_n_s16(v3806, 1594);
+    int16x8_t v3807 = vaddq_s16(v3807_tmp, v3806);
+    int16x8_t v3808 = vaddq_s16(v3805, v3807);
+    int16x8_t v3809 = vsubq_s16(v3463, v3465);
+    int16x8_t v3810 = vsubq_s16(v3467, v3469);
+    int16x8_t v3811_tmp = vqrdmulhq_n_s16(v3810, 2393);
+    int16x8_t v3811 = vaddq_s16(v3811_tmp, v3810);
+    int16x8_t v3812 = vaddq_s16(v3809, v3811);
+    int16x8_t v3813 = vsubq_s16(v3453, v3455);
+    int16x8_t v3814 = vsubq_s16(v3457, v3459);
+    int16x8_t v3815_tmp = vqrdmulhq_n_s16(v3814, 3234);
+    int16x8_t v3815 = vaddq_s16(v3815_tmp, v3814);
+    int16x8_t v3816 = vaddq_s16(v3813, v3815);
+    int16x8_t v3817 = vsubq_s16(v3443, v3445);
+    int16x8_t v3818 = vsubq_s16(v3447, v3449);
+    int16x8_t v3819_tmp = vqrdmulhq_n_s16(v3818, 4123);
+    int16x8_t v3819 = vaddq_s16(v3819_tmp, v3818);
+    int16x8_t v3820 = vaddq_s16(v3817, v3819);
+    int16x8_t v3821 = vsubq_s16(v3433, v3435);
+    int16x8_t v3822 = vsubq_s16(v3437, v3439);
+    int16x8_t v3823_tmp = vqrdmulhq_n_s16(v3822, 5062);
+    int16x8_t v3823 = vaddq_s16(v3823_tmp, v3822);
+    int16x8_t v3824 = vaddq_s16(v3821, v3823);
+    int16x8_t v3825 = vsubq_s16(v3423, v3425);
+    int16x8_t v3826 = vsubq_s16(v3427, v3429);
+    int16x8_t v3827_tmp = vqrdmulhq_n_s16(v3826, 6057);
+    int16x8_t v3827 = vaddq_s16(v3827_tmp, v3826);
+    int16x8_t v3828 = vaddq_s16(v3825, v3827);
+    int16x8_t v3829 = vsubq_s16(v3413, v3415);
+    int16x8_t v3830 = vsubq_s16(v3417, v3419);
+    int16x8_t v3831_tmp = vqrdmulhq_n_s16(v3830, 7111);
+    int16x8_t v3831 = vaddq_s16(v3831_tmp, v3830);
+    int16x8_t v3832 = vaddq_s16(v3829, v3831);
+    int16x8_t v3833 = vsubq_s16(v3403, v3405);
+    int16x8_t v3834 = vsubq_s16(v3407, v3409);
+    int16x8_t v3835_tmp = vqrdmulhq_n_s16(v3834, 8231);
+    int16x8_t v3835 = vaddq_s16(v3835_tmp, v3834);
+    int16x8_t v3836 = vaddq_s16(v3833, v3835);
+    int16x8_t v3837 = vsubq_s16(v3393, v3395);
+    int16x8_t v3838 = vsubq_s16(v3397, v3399);
+    int16x8_t v3839_tmp = vqrdmulhq_n_s16(v3838, 9421);
+    int16x8_t v3839 = vaddq_s16(v3839_tmp, v3838);
+    int16x8_t v3840 = vaddq_s16(v3837, v3839);
+    int16x8_t v3841 = vsubq_s16(v3374, v3379);
+    int16x8_t v3842 = vsubq_s16(v3384, v3389);
+    int16x8_t v3843_tmp = vqrdmulhq_n_s16(v3842, 10690);
+    int16x8_t v3843 = vaddq_s16(v3843_tmp, v3842);
+    int16x8_t v3844 = vaddq_s16(v3841, v3843);
+    int16x8_t v3845 = vsubq_s16(v3352, v3357);
+    int16x8_t v3846 = vsubq_s16(v3362, v3367);
+    int16x8_t v3847_tmp = vqrdmulhq_n_s16(v3846, 12044);
+    int16x8_t v3847 = vaddq_s16(v3847_tmp, v3846);
+    int16x8_t v3848 = vaddq_s16(v3845, v3847);
+    int16x8_t v3849 = vsubq_s16(v3330, v3335);
+    int16x8_t v3850 = vsubq_s16(v3340, v3345);
+    int16x8_t v3851_tmp = vqrdmulhq_n_s16(v3850, 13493);
+    int16x8_t v3851 = vaddq_s16(v3851_tmp, v3850);
+    int16x8_t v3852 = vaddq_s16(v3849, v3851);
+    int16x8_t v3853 = vsubq_s16(v3308, v3313);
+    int16x8_t v3854 = vsubq_s16(v3318, v3323);
+    int16x8_t v3855_tmp = vqrdmulhq_n_s16(v3854, 15046);
+    int16x8_t v3855 = vaddq_s16(v3855_tmp, v3854);
+    int16x8_t v3856 = vaddq_s16(v3853, v3855);
+    int16x8_t v3857 = vsubq_s16(v3286, v3291);
+    int16x8_t v3858 = vsubq_s16(v3296, v3301);
+    int16x8_t v3859_tmp = vqrdmulhq_n_s16(v3858, 16715);
+    int16x8_t v3859 = vaddq_s16(v3859_tmp, v3858);
+    int16x8_t v3860 = vaddq_s16(v3857, v3859);
+    int16x8_t v3861 = vsubq_s16(v3264, v3269);
+    int16x8_t v3862 = vsubq_s16(v3274, v3279);
+    int16x8_t v3863_tmp = vqrdmulhq_n_s16(v3862, 18512);
+    int16x8_t v3863 = vaddq_s16(v3863_tmp, v3862);
+    int16x8_t v3864 = vaddq_s16(v3861, v3863);
+    int16x8_t v3865 = vsubq_s16(v3242, v3247);
+    int16x8_t v3866 = vsubq_s16(v3252, v3257);
+    int16x8_t v3867_tmp = vqrdmulhq_n_s16(v3866, 20453);
+    int16x8_t v3867 = vaddq_s16(v3867_tmp, v3866);
+    int16x8_t v3868 = vaddq_s16(v3865, v3867);
+    int16x8_t v3869 = vsubq_s16(v3220, v3225);
+    int16x8_t v3870 = vsubq_s16(v3230, v3235);
+    int16x8_t v3871_tmp = vqrdmulhq_n_s16(v3870, 22555);
+    int16x8_t v3871 = vaddq_s16(v3871_tmp, v3870);
+    int16x8_t v3872 = vaddq_s16(v3869, v3871);
+    int16x8_t v3873 = vsubq_s16(v3198, v3203);
+    int16x8_t v3874 = vsubq_s16(v3208, v3213);
+    int16x8_t v3875_tmp = vqrdmulhq_n_s16(v3874, 24839);
+    int16x8_t v3875 = vaddq_s16(v3875_tmp, v3874);
+    int16x8_t v3876 = vaddq_s16(v3873, v3875);
+    int16x8_t v3877 = vsubq_s16(v3176, v3181);
+    int16x8_t v3878 = vsubq_s16(v3186, v3191);
+    int16x8_t v3879_tmp = vqrdmulhq_n_s16(v3878, 27330);
+    int16x8_t v3879 = vaddq_s16(v3879_tmp, v3878);
+    int16x8_t v3880 = vaddq_s16(v3877, v3879);
+    int16x8_t v3881 = vsubq_s16(v3154, v3159);
+    int16x8_t v3882 = vsubq_s16(v3164, v3169);
+    int16x8_t v3883_tmp = vqrdmulhq_n_s16(v3882, 30056);
+    int16x8_t v3883 = vaddq_s16(v3883_tmp, v3882);
+    int16x8_t v3884 = vaddq_s16(v3881, v3883);
+    int16x8_t v3885 = vsubq_s16(v3132, v3137);
+    int16x8_t v3886 = vsubq_s16(v3142, v3147);
+    int16x8_t v3887_tmp = vqrdmulhq_n_s16(v3886, 282);
+    int16x8_t v3887 = vmlaq_n_s16(v3887_tmp, v3886, 2);
+    int16x8_t v3888 = vaddq_s16(v3885, v3887);
+    int16x8_t v3889 = vsubq_s16(v3110, v3115);
+    int16x8_t v3890 = vsubq_s16(v3120, v3125);
+    int16x8_t v3891_tmp = vqrdmulhq_n_s16(v3890, 3588);
+    int16x8_t v3891 = vmlaq_n_s16(v3891_tmp, v3890, 2);
+    int16x8_t v3892 = vaddq_s16(v3889, v3891);
+    int16x8_t v3893 = vsubq_s16(v3088, v3093);
+    int16x8_t v3894 = vsubq_s16(v3098, v3103);
+    int16x8_t v3895_tmp = vqrdmulhq_n_s16(v3894, 7255);
+    int16x8_t v3895 = vmlaq_n_s16(v3895_tmp, v3894, 2);
+    int16x8_t v3896 = vaddq_s16(v3893, v3895);
+    int16x8_t v3897 = vsubq_s16(v3066, v3071);
+    int16x8_t v3898 = vsubq_s16(v3076, v3081);
+    int16x8_t v3899_tmp = vqrdmulhq_n_s16(v3898, 11344);
+    int16x8_t v3899 = vmlaq_n_s16(v3899_tmp, v3898, 2);
+    int16x8_t v3900 = vaddq_s16(v3897, v3899);
+    int16x8_t v3901 = vsubq_s16(v3044, v3049);
+    int16x8_t v3902 = vsubq_s16(v3054, v3059);
+    int16x8_t v3903_tmp = vqrdmulhq_n_s16(v3902, 15934);
+    int16x8_t v3903 = vmlaq_n_s16(v3903_tmp, v3902, 2);
+    int16x8_t v3904 = vaddq_s16(v3901, v3903);
+    int16x8_t v3905 = vsubq_s16(v3004, v3015);
+    int16x8_t v3906 = vsubq_s16(v3026, v3037);
+    int16x8_t v3907_tmp = vqrdmulhq_n_s16(v3906, 21120);
+    int16x8_t v3907 = vmlaq_n_s16(v3907_tmp, v3906, 2);
+    int16x8_t v3908 = vaddq_s16(v3905, v3907);
+    int16x8_t v3909 = vsubq_s16(v2958, v2969);
+    int16x8_t v3910 = vsubq_s16(v2980, v2991);
+    int16x8_t v3911_tmp = vqrdmulhq_n_s16(v3910, 27027);
+    int16x8_t v3911 = vmlaq_n_s16(v3911_tmp, v3910, 2);
+    int16x8_t v3912 = vaddq_s16(v3909, v3911);
+    int16x8_t v3913 = vsubq_s16(v2912, v2923);
+    int16x8_t v3914 = vsubq_s16(v2934, v2945);
+    int16x8_t v3915_tmp = vqrdmulhq_n_s16(v3914, 1045);
+    int16x8_t v3915 = vmlaq_n_s16(v3915_tmp, v3914, 3);
+    int16x8_t v3916 = vaddq_s16(v3913, v3915);
+    int16x8_t v3917 = vsubq_s16(v2866, v2877);
+    int16x8_t v3918 = vsubq_s16(v2888, v2899);
+    int16x8_t v3919_tmp = vqrdmulhq_n_s16(v3918, 8923);
+    int16x8_t v3919 = vmlaq_n_s16(v3919_tmp, v3918, 3);
+    int16x8_t v3920 = vaddq_s16(v3917, v3919);
+    int16x8_t v3921 = vsubq_s16(v2820, v2831);
+    int16x8_t v3922 = vsubq_s16(v2842, v2853);
+    int16x8_t v3923_tmp = vqrdmulhq_n_s16(v3922, 18177);
+    int16x8_t v3923 = vmlaq_n_s16(v3923_tmp, v3922, 3);
+    int16x8_t v3924 = vaddq_s16(v3921, v3923);
+    int16x8_t v3925 = vsubq_s16(v2774, v2785);
+    int16x8_t v3926 = vsubq_s16(v2796, v2807);
+    int16x8_t v3927_tmp = vqrdmulhq_n_s16(v3926, 29200);
+    int16x8_t v3927 = vmlaq_n_s16(v3927_tmp, v3926, 3);
+    int16x8_t v3928 = vaddq_s16(v3925, v3927);
+    int16x8_t v3929 = vsubq_s16(v2728, v2739);
+    int16x8_t v3930 = vsubq_s16(v2750, v2761);
+    int16x8_t v3931_tmp = vqrdmulhq_n_s16(v3930, 9782);
+    int16x8_t v3931 = vmlaq_n_s16(v3931_tmp, v3930, 4);
+    int16x8_t v3932 = vaddq_s16(v3929, v3931);
+    int16x8_t v3933 = vsubq_s16(v2682, v2693);
+    int16x8_t v3934 = vsubq_s16(v2704, v2715);
+    int16x8_t v3935_tmp = vqrdmulhq_n_s16(v3934, 26282);
+    int16x8_t v3935 = vmlaq_n_s16(v3935_tmp, v3934, 4);
+    int16x8_t v3936 = vaddq_s16(v3933, v3935);
+    int16x8_t v3937 = vsubq_s16(v2600, v2623);
+    int16x8_t v3938 = vsubq_s16(v2646, v2669);
+    int16x8_t v3939_tmp = vqrdmulhq_n_s16(v3938, 14423);
+    int16x8_t v3939 = vmlaq_n_s16(v3939_tmp, v3938, 5);
+    int16x8_t v3940 = vaddq_s16(v3937, v3939);
+    int16x8_t v3941 = vsubq_s16(v2506, v2529);
+    int16x8_t v3942 = vsubq_s16(v2552, v2575);
+    int16x8_t v3943_tmp = vqrdmulhq_n_s16(v3942, 9008);
+    int16x8_t v3943 = vmlaq_n_s16(v3943_tmp, v3942, 6);
+    int16x8_t v3944 = vaddq_s16(v3941, v3943);
+    int16x8_t v3945 = vsubq_s16(v2411, v2434);
+    int16x8_t v3946 = vsubq_s16(v2457, v2481);
+    int16x8_t v3947_tmp = vqrdmulhq_n_s16(v3946, 13552);
+    int16x8_t v3947 = vmlaq_n_s16(v3947_tmp, v3946, 7);
+    int16x8_t v3948 = vaddq_s16(v3945, v3947);
+    int16x8_t v3949 = vsubq_s16(v2317, v2340);
+    int16x8_t v3950 = vsubq_s16(v2363, v2386);
+    int16x8_t v3951_tmp = vqrdmulhq_n_s16(v3950, 1925);
+    int16x8_t v3951 = vmlaq_n_s16(v3951_tmp, v3950, 9);
+    int16x8_t v3952 = vaddq_s16(v3949, v3951);
+    int16x8_t v3953 = vsubq_s16(v2151, v2198);
+    int16x8_t v3954 = vsubq_s16(v2245, v2292);
+    int16x8_t v3955_tmp = vqrdmulhq_n_s16(v3954, 21123);
+    int16x8_t v3955 = vmlaq_n_s16(v3955_tmp, v3954, 11);
+    int16x8_t v3956 = vaddq_s16(v3953, v3955);
+    int16x8_t v3957 = vsubq_s16(v1961, v2008);
+    int16x8_t v3958 = vsubq_s16(v2055, v2102);
+    int16x8_t v3959_tmp = vqrdmulhq_n_s16(v3958, 9831);
+    int16x8_t v3959 = vmlaq_n_s16(v3959_tmp, v3958, 16);
+    int16x8_t v3960 = vaddq_s16(v3957, v3959);
+    int16x8_t v3961 = vsubq_s16(v1627, v1722);
+    int16x8_t v3962 = vsubq_s16(v1817, v1912);
+    int16x8_t v3963_tmp = vqrdmulhq_n_s16(v3962, 5373);
+    int16x8_t v3963 = vmlaq_n_s16(v3963_tmp, v3962, 27);
+    int16x8_t v3964 = vaddq_s16(v3961, v3963);
+    int16x8_t v3965 = vsubq_s16(v317, v700);
+    int16x8_t v3966 = vsubq_s16(v1146, v1530);
+    int16x8_t v3967_tmp = vqrdmulhq_n_s16(v3966, 15986);
+    int16x8_t v3967 = vmlaq_n_s16(v3967_tmp, v3966, 81);
+    int16x8_t v3968 = vaddq_s16(v3965, v3967);
+    int16x8_t v3969 = vsubq_s16(v3965, v3967);
+    int16x8_t v3970 = vsubq_s16(v3961, v3963);
+    int16x8_t v3971 = vsubq_s16(v3957, v3959);
+    int16x8_t v3972 = vsubq_s16(v3953, v3955);
+    int16x8_t v3973 = vsubq_s16(v3949, v3951);
+    int16x8_t v3974 = vsubq_s16(v3945, v3947);
+    int16x8_t v3975 = vsubq_s16(v3941, v3943);
+    int16x8_t v3976 = vsubq_s16(v3937, v3939);
+    int16x8_t v3977 = vsubq_s16(v3933, v3935);
+    int16x8_t v3978 = vsubq_s16(v3929, v3931);
+    int16x8_t v3979 = vsubq_s16(v3925, v3927);
+    int16x8_t v3980 = vsubq_s16(v3921, v3923);
+    int16x8_t v3981 = vsubq_s16(v3917, v3919);
+    int16x8_t v3982 = vsubq_s16(v3913, v3915);
+    int16x8_t v3983 = vsubq_s16(v3909, v3911);
+    int16x8_t v3984 = vsubq_s16(v3905, v3907);
+    int16x8_t v3985 = vsubq_s16(v3901, v3903);
+    int16x8_t v3986 = vsubq_s16(v3897, v3899);
+    int16x8_t v3987 = vsubq_s16(v3893, v3895);
+    int16x8_t v3988 = vsubq_s16(v3889, v3891);
+    int16x8_t v3989 = vsubq_s16(v3885, v3887);
+    int16x8_t v3990 = vsubq_s16(v3881, v3883);
+    int16x8_t v3991 = vsubq_s16(v3877, v3879);
+    int16x8_t v3992 = vsubq_s16(v3873, v3875);
+    int16x8_t v3993 = vsubq_s16(v3869, v3871);
+    int16x8_t v3994 = vsubq_s16(v3865, v3867);
+    int16x8_t v3995 = vsubq_s16(v3861, v3863);
+    int16x8_t v3996 = vsubq_s16(v3857, v3859);
+    int16x8_t v3997 = vsubq_s16(v3853, v3855);
+    int16x8_t v3998 = vsubq_s16(v3849, v3851);
+    int16x8_t v3999 = vsubq_s16(v3845, v3847);
+    int16x8_t v4000 = vsubq_s16(v3841, v3843);
+    int16x8_t v4001 = vsubq_s16(v3837, v3839);
+    int16x8_t v4002 = vsubq_s16(v3833, v3835);
+    int16x8_t v4003 = vsubq_s16(v3829, v3831);
+    int16x8_t v4004 = vsubq_s16(v3825, v3827);
+    int16x8_t v4005 = vsubq_s16(v3821, v3823);
+    int16x8_t v4006 = vsubq_s16(v3817, v3819);
+    int16x8_t v4007 = vsubq_s16(v3813, v3815);
+    int16x8_t v4008 = vsubq_s16(v3809, v3811);
+    int16x8_t v4009 = vsubq_s16(v3805, v3807);
+    int16x8_t v4010 = vsubq_s16(v3801, v3803);
+    int16x8_t v4011 = vsubq_s16(v3797, v3799);
+    int16x8_t v4012 = vsubq_s16(v3793, v3795);
+    int16x8_t v4013 = vsubq_s16(v3789, v3791);
+    int16x8_t v4014 = vsubq_s16(v3785, v3787);
+    int16x8_t v4015 = vsubq_s16(v3781, v3783);
+    int16x8_t v4016 = vsubq_s16(v3777, v3779);
+    int16x8_t v4017 = vsubq_s16(v3773, v3775);
+    int16x8_t v4018 = vsubq_s16(v3769, v3771);
+    int16x8_t v4019 = vsubq_s16(v3765, v3767);
+    int16x8_t v4020 = vsubq_s16(v3761, v3763);
+    int16x8_t v4021 = vsubq_s16(v3757, v3759);
+    int16x8_t v4022 = vsubq_s16(v3753, v3755);
+    int16x8_t v4023 = vsubq_s16(v3749, v3751);
+    int16x8_t v4024 = vsubq_s16(v3745, v3747);
+    int16x8_t v4025 = vsubq_s16(v3741, v3743);
+    int16x8_t v4026 = vsubq_s16(v3737, v3739);
+    int16x8_t v4027 = vsubq_s16(v3733, v3735);
+    int16x8_t v4028 = vsubq_s16(v3729, v3731);
+    int16x8_t v4029 = vsubq_s16(v3725, v3727);
+    int16x8_t v4030 = vsubq_s16(v3721, v3723);
+    int16x8_t v4031 = vsubq_s16(v3717, v3719);
+    int16x8_t v4032 = vsubq_s16(v3713, v3715);
+    int16x8_t v4033 = vsubq_s16(v3706, v3711);
+    int16x8_t v4034 = vsubq_s16(v3696, v3701);
+    int16x8_t v4035 = vsubq_s16(v3686, v3691);
+    int16x8_t v4036 = vsubq_s16(v3676, v3681);
+    int16x8_t v4037 = vsubq_s16(v3666, v3671);
+    int16x8_t v4038 = vsubq_s16(v3656, v3661);
+    int16x8_t v4039 = vsubq_s16(v3646, v3651);
+    int16x8_t v4040 = vsubq_s16(v3636, v3641);
+    int16x8_t v4041 = vsubq_s16(v3626, v3631);
+    int16x8_t v4042 = vsubq_s16(v3616, v3621);
+    int16x8_t v4043 = vsubq_s16(v3606, v3611);
+    int16x8_t v4044 = vsubq_s16(v3596, v3601);
+    int16x8_t v4045 = vsubq_s16(v3586, v3591);
+    int16x8_t v4046 = vsubq_s16(v3576, v3581);
+    int16x8_t v4047 = vsubq_s16(v3566, v3571);
+    int16x8_t v4048 = vsubq_s16(v3556, v3561);
+    int16x8_t v4049 = vsubq_s16(v3546, v3551);
+    int16x8_t v4050 = vsubq_s16(v3536, v3541);
+    int16x8_t v4051 = vsubq_s16(v3526, v3531);
+    int16x8_t v4052 = vsubq_s16(v3516, v3521);
+    int16x8_t v4053 = vsubq_s16(v3506, v3511);
+    int16x8_t v4054 = vsubq_s16(v3496, v3501);
+    int16x8_t v4055 = vsubq_s16(v3486, v3491);
+    int16x8_t v4056 = vsubq_s16(v3476, v3481);
+    int16x8_t v4057 = vsubq_s16(v3466, v3471);
+    int16x8_t v4058 = vsubq_s16(v3456, v3461);
+    int16x8_t v4059 = vsubq_s16(v3446, v3451);
+    int16x8_t v4060 = vsubq_s16(v3436, v3441);
+    int16x8_t v4061 = vsubq_s16(v3426, v3431);
+    int16x8_t v4062 = vsubq_s16(v3416, v3421);
+    int16x8_t v4063 = vsubq_s16(v3406, v3411);
+    int16x8_t v4064 = vsubq_s16(v3396, v3401);
+    int16x8_t v4065 = vsubq_s16(v3380, v3391);
+    int16x8_t v4066 = vsubq_s16(v3358, v3369);
+    int16x8_t v4067 = vsubq_s16(v3336, v3347);
+    int16x8_t v4068 = vsubq_s16(v3314, v3325);
+    int16x8_t v4069 = vsubq_s16(v3292, v3303);
+    int16x8_t v4070 = vsubq_s16(v3270, v3281);
+    int16x8_t v4071 = vsubq_s16(v3248, v3259);
+    int16x8_t v4072 = vsubq_s16(v3226, v3237);
+    int16x8_t v4073 = vsubq_s16(v3204, v3215);
+    int16x8_t v4074 = vsubq_s16(v3182, v3193);
+    int16x8_t v4075 = vsubq_s16(v3160, v3171);
+    int16x8_t v4076 = vsubq_s16(v3138, v3149);
+    int16x8_t v4077 = vsubq_s16(v3116, v3127);
+    int16x8_t v4078 = vsubq_s16(v3094, v3105);
+    int16x8_t v4079 = vsubq_s16(v3072, v3083);
+    int16x8_t v4080 = vsubq_s16(v3050, v3061);
+    int16x8_t v4081 = vsubq_s16(v3016, v3039);
+    int16x8_t v4082 = vsubq_s16(v2970, v2993);
+    int16x8_t v4083 = vsubq_s16(v2924, v2947);
+    int16x8_t v4084 = vsubq_s16(v2878, v2901);
+    int16x8_t v4085 = vsubq_s16(v2832, v2855);
+    int16x8_t v4086 = vsubq_s16(v2786, v2809);
+    int16x8_t v4087 = vsubq_s16(v2740, v2763);
+    int16x8_t v4088 = vsubq_s16(v2694, v2717);
+    int16x8_t v4089 = vsubq_s16(v2624, v2671);
+    int16x8_t v4090 = vsubq_s16(v2530, v2577);
+    int16x8_t v4091 = vsubq_s16(v2435, v2483);
+    int16x8_t v4092 = vsubq_s16(v2341, v2388);
+    int16x8_t v4093 = vsubq_s16(v2199, v2294);
+    int16x8_t v4094 = vsubq_s16(v2009, v2104);
+    int16x8_t v4095 = vsubq_s16(v1723, v1914);
+    int16x8_t v4096 = vsubq_s16(v701, v1532);
+    vst1q_s16(out + out_stride * 0 + i, v1533);
+    vst1q_s16(out + out_stride * 1 + i, v1915);
+    vst1q_s16(out + out_stride * 2 + i, v2105);
+    vst1q_s16(out + out_stride * 3 + i, v2295);
+    vst1q_s16(out + out_stride * 4 + i, v2389);
+    vst1q_s16(out + out_stride * 5 + i, v2484);
+    vst1q_s16(out + out_stride * 6 + i, v2578);
+    vst1q_s16(out + out_stride * 7 + i, v2672);
+    vst1q_s16(out + out_stride * 8 + i, v2718);
+    vst1q_s16(out + out_stride * 9 + i, v2764);
+    vst1q_s16(out + out_stride * 10 + i, v2810);
+    vst1q_s16(out + out_stride * 11 + i, v2856);
+    vst1q_s16(out + out_stride * 12 + i, v2902);
+    vst1q_s16(out + out_stride * 13 + i, v2948);
+    vst1q_s16(out + out_stride * 14 + i, v2994);
+    vst1q_s16(out + out_stride * 15 + i, v3040);
+    vst1q_s16(out + out_stride * 16 + i, v3062);
+    vst1q_s16(out + out_stride * 17 + i, v3084);
+    vst1q_s16(out + out_stride * 18 + i, v3106);
+    vst1q_s16(out + out_stride * 19 + i, v3128);
+    vst1q_s16(out + out_stride * 20 + i, v3150);
+    vst1q_s16(out + out_stride * 21 + i, v3172);
+    vst1q_s16(out + out_stride * 22 + i, v3194);
+    vst1q_s16(out + out_stride * 23 + i, v3216);
+    vst1q_s16(out + out_stride * 24 + i, v3238);
+    vst1q_s16(out + out_stride * 25 + i, v3260);
+    vst1q_s16(out + out_stride * 26 + i, v3282);
+    vst1q_s16(out + out_stride * 27 + i, v3304);
+    vst1q_s16(out + out_stride * 28 + i, v3326);
+    vst1q_s16(out + out_stride * 29 + i, v3348);
+    vst1q_s16(out + out_stride * 30 + i, v3370);
+    vst1q_s16(out + out_stride * 31 + i, v3392);
+    vst1q_s16(out + out_stride * 32 + i, v3402);
+    vst1q_s16(out + out_stride * 33 + i, v3412);
+    vst1q_s16(out + out_stride * 34 + i, v3422);
+    vst1q_s16(out + out_stride * 35 + i, v3432);
+    vst1q_s16(out + out_stride * 36 + i, v3442);
+    vst1q_s16(out + out_stride * 37 + i, v3452);
+    vst1q_s16(out + out_stride * 38 + i, v3462);
+    vst1q_s16(out + out_stride * 39 + i, v3472);
+    vst1q_s16(out + out_stride * 40 + i, v3482);
+    vst1q_s16(out + out_stride * 41 + i, v3492);
+    vst1q_s16(out + out_stride * 42 + i, v3502);
+    vst1q_s16(out + out_stride * 43 + i, v3512);
+    vst1q_s16(out + out_stride * 44 + i, v3522);
+    vst1q_s16(out + out_stride * 45 + i, v3532);
+    vst1q_s16(out + out_stride * 46 + i, v3542);
+    vst1q_s16(out + out_stride * 47 + i, v3552);
+    vst1q_s16(out + out_stride * 48 + i, v3562);
+    vst1q_s16(out + out_stride * 49 + i, v3572);
+    vst1q_s16(out + out_stride * 50 + i, v3582);
+    vst1q_s16(out + out_stride * 51 + i, v3592);
+    vst1q_s16(out + out_stride * 52 + i, v3602);
+    vst1q_s16(out + out_stride * 53 + i, v3612);
+    vst1q_s16(out + out_stride * 54 + i, v3622);
+    vst1q_s16(out + out_stride * 55 + i, v3632);
+    vst1q_s16(out + out_stride * 56 + i, v3642);
+    vst1q_s16(out + out_stride * 57 + i, v3652);
+    vst1q_s16(out + out_stride * 58 + i, v3662);
+    vst1q_s16(out + out_stride * 59 + i, v3672);
+    vst1q_s16(out + out_stride * 60 + i, v3682);
+    vst1q_s16(out + out_stride * 61 + i, v3692);
+    vst1q_s16(out + out_stride * 62 + i, v3702);
+    vst1q_s16(out + out_stride * 63 + i, v3712);
+    vst1q_s16(out + out_stride * 64 + i, v3716);
+    vst1q_s16(out + out_stride * 65 + i, v3720);
+    vst1q_s16(out + out_stride * 66 + i, v3724);
+    vst1q_s16(out + out_stride * 67 + i, v3728);
+    vst1q_s16(out + out_stride * 68 + i, v3732);
+    vst1q_s16(out + out_stride * 69 + i, v3736);
+    vst1q_s16(out + out_stride * 70 + i, v3740);
+    vst1q_s16(out + out_stride * 71 + i, v3744);
+    vst1q_s16(out + out_stride * 72 + i, v3748);
+    vst1q_s16(out + out_stride * 73 + i, v3752);
+    vst1q_s16(out + out_stride * 74 + i, v3756);
+    vst1q_s16(out + out_stride * 75 + i, v3760);
+    vst1q_s16(out + out_stride * 76 + i, v3764);
+    vst1q_s16(out + out_stride * 77 + i, v3768);
+    vst1q_s16(out + out_stride * 78 + i, v3772);
+    vst1q_s16(out + out_stride * 79 + i, v3776);
+    vst1q_s16(out + out_stride * 80 + i, v3780);
+    vst1q_s16(out + out_stride * 81 + i, v3784);
+    vst1q_s16(out + out_stride * 82 + i, v3788);
+    vst1q_s16(out + out_stride * 83 + i, v3792);
+    vst1q_s16(out + out_stride * 84 + i, v3796);
+    vst1q_s16(out + out_stride * 85 + i, v3800);
+    vst1q_s16(out + out_stride * 86 + i, v3804);
+    vst1q_s16(out + out_stride * 87 + i, v3808);
+    vst1q_s16(out + out_stride * 88 + i, v3812);
+    vst1q_s16(out + out_stride * 89 + i, v3816);
+    vst1q_s16(out + out_stride * 90 + i, v3820);
+    vst1q_s16(out + out_stride * 91 + i, v3824);
+    vst1q_s16(out + out_stride * 92 + i, v3828);
+    vst1q_s16(out + out_stride * 93 + i, v3832);
+    vst1q_s16(out + out_stride * 94 + i, v3836);
+    vst1q_s16(out + out_stride * 95 + i, v3840);
+    vst1q_s16(out + out_stride * 96 + i, v3844);
+    vst1q_s16(out + out_stride * 97 + i, v3848);
+    vst1q_s16(out + out_stride * 98 + i, v3852);
+    vst1q_s16(out + out_stride * 99 + i, v3856);
+    vst1q_s16(out + out_stride * 100 + i, v3860);
+    vst1q_s16(out + out_stride * 101 + i, v3864);
+    vst1q_s16(out + out_stride * 102 + i, v3868);
+    vst1q_s16(out + out_stride * 103 + i, v3872);
+    vst1q_s16(out + out_stride * 104 + i, v3876);
+    vst1q_s16(out + out_stride * 105 + i, v3880);
+    vst1q_s16(out + out_stride * 106 + i, v3884);
+    vst1q_s16(out + out_stride * 107 + i, v3888);
+    vst1q_s16(out + out_stride * 108 + i, v3892);
+    vst1q_s16(out + out_stride * 109 + i, v3896);
+    vst1q_s16(out + out_stride * 110 + i, v3900);
+    vst1q_s16(out + out_stride * 111 + i, v3904);
+    vst1q_s16(out + out_stride * 112 + i, v3908);
+    vst1q_s16(out + out_stride * 113 + i, v3912);
+    vst1q_s16(out + out_stride * 114 + i, v3916);
+    vst1q_s16(out + out_stride * 115 + i, v3920);
+    vst1q_s16(out + out_stride * 116 + i, v3924);
+    vst1q_s16(out + out_stride * 117 + i, v3928);
+    vst1q_s16(out + out_stride * 118 + i, v3932);
+    vst1q_s16(out + out_stride * 119 + i, v3936);
+    vst1q_s16(out + out_stride * 120 + i, v3940);
+    vst1q_s16(out + out_stride * 121 + i, v3944);
+    vst1q_s16(out + out_stride * 122 + i, v3948);
+    vst1q_s16(out + out_stride * 123 + i, v3952);
+    vst1q_s16(out + out_stride * 124 + i, v3956);
+    vst1q_s16(out + out_stride * 125 + i, v3960);
+    vst1q_s16(out + out_stride * 126 + i, v3964);
+    vst1q_s16(out + out_stride * 127 + i, v3968);
+    vst1q_s16(out + out_stride * 128 + i, v3969);
+    vst1q_s16(out + out_stride * 129 + i, v3970);
+    vst1q_s16(out + out_stride * 130 + i, v3971);
+    vst1q_s16(out + out_stride * 131 + i, v3972);
+    vst1q_s16(out + out_stride * 132 + i, v3973);
+    vst1q_s16(out + out_stride * 133 + i, v3974);
+    vst1q_s16(out + out_stride * 134 + i, v3975);
+    vst1q_s16(out + out_stride * 135 + i, v3976);
+    vst1q_s16(out + out_stride * 136 + i, v3977);
+    vst1q_s16(out + out_stride * 137 + i, v3978);
+    vst1q_s16(out + out_stride * 138 + i, v3979);
+    vst1q_s16(out + out_stride * 139 + i, v3980);
+    vst1q_s16(out + out_stride * 140 + i, v3981);
+    vst1q_s16(out + out_stride * 141 + i, v3982);
+    vst1q_s16(out + out_stride * 142 + i, v3983);
+    vst1q_s16(out + out_stride * 143 + i, v3984);
+    vst1q_s16(out + out_stride * 144 + i, v3985);
+    vst1q_s16(out + out_stride * 145 + i, v3986);
+    vst1q_s16(out + out_stride * 146 + i, v3987);
+    vst1q_s16(out + out_stride * 147 + i, v3988);
+    vst1q_s16(out + out_stride * 148 + i, v3989);
+    vst1q_s16(out + out_stride * 149 + i, v3990);
+    vst1q_s16(out + out_stride * 150 + i, v3991);
+    vst1q_s16(out + out_stride * 151 + i, v3992);
+    vst1q_s16(out + out_stride * 152 + i, v3993);
+    vst1q_s16(out + out_stride * 153 + i, v3994);
+    vst1q_s16(out + out_stride * 154 + i, v3995);
+    vst1q_s16(out + out_stride * 155 + i, v3996);
+    vst1q_s16(out + out_stride * 156 + i, v3997);
+    vst1q_s16(out + out_stride * 157 + i, v3998);
+    vst1q_s16(out + out_stride * 158 + i, v3999);
+    vst1q_s16(out + out_stride * 159 + i, v4000);
+    vst1q_s16(out + out_stride * 160 + i, v4001);
+    vst1q_s16(out + out_stride * 161 + i, v4002);
+    vst1q_s16(out + out_stride * 162 + i, v4003);
+    vst1q_s16(out + out_stride * 163 + i, v4004);
+    vst1q_s16(out + out_stride * 164 + i, v4005);
+    vst1q_s16(out + out_stride * 165 + i, v4006);
+    vst1q_s16(out + out_stride * 166 + i, v4007);
+    vst1q_s16(out + out_stride * 167 + i, v4008);
+    vst1q_s16(out + out_stride * 168 + i, v4009);
+    vst1q_s16(out + out_stride * 169 + i, v4010);
+    vst1q_s16(out + out_stride * 170 + i, v4011);
+    vst1q_s16(out + out_stride * 171 + i, v4012);
+    vst1q_s16(out + out_stride * 172 + i, v4013);
+    vst1q_s16(out + out_stride * 173 + i, v4014);
+    vst1q_s16(out + out_stride * 174 + i, v4015);
+    vst1q_s16(out + out_stride * 175 + i, v4016);
+    vst1q_s16(out + out_stride * 176 + i, v4017);
+    vst1q_s16(out + out_stride * 177 + i, v4018);
+    vst1q_s16(out + out_stride * 178 + i, v4019);
+    vst1q_s16(out + out_stride * 179 + i, v4020);
+    vst1q_s16(out + out_stride * 180 + i, v4021);
+    vst1q_s16(out + out_stride * 181 + i, v4022);
+    vst1q_s16(out + out_stride * 182 + i, v4023);
+    vst1q_s16(out + out_stride * 183 + i, v4024);
+    vst1q_s16(out + out_stride * 184 + i, v4025);
+    vst1q_s16(out + out_stride * 185 + i, v4026);
+    vst1q_s16(out + out_stride * 186 + i, v4027);
+    vst1q_s16(out + out_stride * 187 + i, v4028);
+    vst1q_s16(out + out_stride * 188 + i, v4029);
+    vst1q_s16(out + out_stride * 189 + i, v4030);
+    vst1q_s16(out + out_stride * 190 + i, v4031);
+    vst1q_s16(out + out_stride * 191 + i, v4032);
+    vst1q_s16(out + out_stride * 192 + i, v4033);
+    vst1q_s16(out + out_stride * 193 + i, v4034);
+    vst1q_s16(out + out_stride * 194 + i, v4035);
+    vst1q_s16(out + out_stride * 195 + i, v4036);
+    vst1q_s16(out + out_stride * 196 + i, v4037);
+    vst1q_s16(out + out_stride * 197 + i, v4038);
+    vst1q_s16(out + out_stride * 198 + i, v4039);
+    vst1q_s16(out + out_stride * 199 + i, v4040);
+    vst1q_s16(out + out_stride * 200 + i, v4041);
+    vst1q_s16(out + out_stride * 201 + i, v4042);
+    vst1q_s16(out + out_stride * 202 + i, v4043);
+    vst1q_s16(out + out_stride * 203 + i, v4044);
+    vst1q_s16(out + out_stride * 204 + i, v4045);
+    vst1q_s16(out + out_stride * 205 + i, v4046);
+    vst1q_s16(out + out_stride * 206 + i, v4047);
+    vst1q_s16(out + out_stride * 207 + i, v4048);
+    vst1q_s16(out + out_stride * 208 + i, v4049);
+    vst1q_s16(out + out_stride * 209 + i, v4050);
+    vst1q_s16(out + out_stride * 210 + i, v4051);
+    vst1q_s16(out + out_stride * 211 + i, v4052);
+    vst1q_s16(out + out_stride * 212 + i, v4053);
+    vst1q_s16(out + out_stride * 213 + i, v4054);
+    vst1q_s16(out + out_stride * 214 + i, v4055);
+    vst1q_s16(out + out_stride * 215 + i, v4056);
+    vst1q_s16(out + out_stride * 216 + i, v4057);
+    vst1q_s16(out + out_stride * 217 + i, v4058);
+    vst1q_s16(out + out_stride * 218 + i, v4059);
+    vst1q_s16(out + out_stride * 219 + i, v4060);
+    vst1q_s16(out + out_stride * 220 + i, v4061);
+    vst1q_s16(out + out_stride * 221 + i, v4062);
+    vst1q_s16(out + out_stride * 222 + i, v4063);
+    vst1q_s16(out + out_stride * 223 + i, v4064);
+    vst1q_s16(out + out_stride * 224 + i, v4065);
+    vst1q_s16(out + out_stride * 225 + i, v4066);
+    vst1q_s16(out + out_stride * 226 + i, v4067);
+    vst1q_s16(out + out_stride * 227 + i, v4068);
+    vst1q_s16(out + out_stride * 228 + i, v4069);
+    vst1q_s16(out + out_stride * 229 + i, v4070);
+    vst1q_s16(out + out_stride * 230 + i, v4071);
+    vst1q_s16(out + out_stride * 231 + i, v4072);
+    vst1q_s16(out + out_stride * 232 + i, v4073);
+    vst1q_s16(out + out_stride * 233 + i, v4074);
+    vst1q_s16(out + out_stride * 234 + i, v4075);
+    vst1q_s16(out + out_stride * 235 + i, v4076);
+    vst1q_s16(out + out_stride * 236 + i, v4077);
+    vst1q_s16(out + out_stride * 237 + i, v4078);
+    vst1q_s16(out + out_stride * 238 + i, v4079);
+    vst1q_s16(out + out_stride * 239 + i, v4080);
+    vst1q_s16(out + out_stride * 240 + i, v4081);
+    vst1q_s16(out + out_stride * 241 + i, v4082);
+    vst1q_s16(out + out_stride * 242 + i, v4083);
+    vst1q_s16(out + out_stride * 243 + i, v4084);
+    vst1q_s16(out + out_stride * 244 + i, v4085);
+    vst1q_s16(out + out_stride * 245 + i, v4086);
+    vst1q_s16(out + out_stride * 246 + i, v4087);
+    vst1q_s16(out + out_stride * 247 + i, v4088);
+    vst1q_s16(out + out_stride * 248 + i, v4089);
+    vst1q_s16(out + out_stride * 249 + i, v4090);
+    vst1q_s16(out + out_stride * 250 + i, v4091);
+    vst1q_s16(out + out_stride * 251 + i, v4092);
+    vst1q_s16(out + out_stride * 252 + i, v4093);
+    vst1q_s16(out + out_stride * 253 + i, v4094);
+    vst1q_s16(out + out_stride * 254 + i, v4095);
+    vst1q_s16(out + out_stride * 255 + i, v4096);
+  }
+}
diff --git a/third_party/jpeg-xl/lib/jxl/fast_dct32-inl.h b/third_party/jpeg-xl/lib/jxl/fast_dct32-inl.h
new file mode 100644
index 0000000000..0f3b31cfea
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/fast_dct32-inl.h
@@ -0,0 +1,419 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+/* This file is automatically generated. Do not modify it directly. */
+#if HWY_TARGET != HWY_NEON
+#error "only include this file from fast_dct-inl.h"
+#endif
+
+constexpr size_t FastIDCTIntegerBits(FastDCTTag<32>) { return 1; }
+
+void FastIDCT(FastDCTTag<32>, const int16_t* in, size_t in_stride, int16_t* out,
+              size_t out_stride, size_t count) {
+  JXL_ASSERT(count % 8 == 0);
+  for (size_t i = 0; i < count; i += 8) {
+    int16x8_t v0 = vld1q_s16(in + in_stride * 0 + i);
+    int16x8_t v1 = vld1q_s16(in + in_stride * 16 + i);
+    int16x8_t v2 = vaddq_s16(v0, v1);
+    int16x8_t v3 = vld1q_s16(in + in_stride * 8 + i);
+    int16x8_t v4_tmp = vqrdmulhq_n_s16(v3, 13573);
+    int16x8_t v4 = vaddq_s16(v4_tmp, v3);
+    int16x8_t v5 = vld1q_s16(in + in_stride * 24 + i);
+    int16x8_t v6 = vaddq_s16(v5, v3);
+    int16x8_t v7 = vaddq_s16(v4, v6);
+    int16x8_t v8 = vqrdmulhq_n_s16(v7, 17734);
+    int16x8_t v9 = vaddq_s16(v2, v8);
+    int16x8_t v10 = vld1q_s16(in + in_stride * 4 + i);
+    int16x8_t v11_tmp = vqrdmulhq_n_s16(v10, 13573);
+    int16x8_t v11 = vaddq_s16(v11_tmp, v10);
+    int16x8_t v12 = vld1q_s16(in + in_stride * 20 + i);
+    int16x8_t v13 = vld1q_s16(in + in_stride * 12 + i);
+    int16x8_t v14 = vaddq_s16(v12, v13);
+    int16x8_t v15 = vaddq_s16(v11, v14);
+    int16x8_t v16 = vld1q_s16(in + in_stride * 28 + i);
+    int16x8_t v17 = vaddq_s16(v16, v12);
+    int16x8_t v18 = vaddq_s16(v13, v10);
+    int16x8_t v19 = vaddq_s16(v17, v18);
+    int16x8_t v20 = vqrdmulhq_n_s16(v19, 17734);
+    int16x8_t v21 = vqrdmulhq_n_s16(v18, 25080);
+    int16x8_t v22 = vaddq_s16(v20, v21);
+    int16x8_t v23 = vaddq_s16(v15, v22);
+    int16x8_t v24 = vqrdmulhq_n_s16(v23, 16705);
+    int16x8_t v25 = vaddq_s16(v9, v24);
+    int16x8_t v26 = vld1q_s16(in + in_stride * 2 + i);
+    int16x8_t v27_tmp = vqrdmulhq_n_s16(v26, 13573);
+    int16x8_t v27 = vaddq_s16(v27_tmp, v26);
+    int16x8_t v28 = vld1q_s16(in + in_stride * 18 + i);
+    int16x8_t v29 = vld1q_s16(in + in_stride * 14 + i);
+    int16x8_t v30 = vaddq_s16(v28, v29);
+    int16x8_t v31 = vaddq_s16(v27, v30);
+    int16x8_t v32 = vld1q_s16(in + in_stride * 10 + i);
+    int16x8_t v33 = vld1q_s16(in + in_stride * 6 + i);
+    int16x8_t v34 = vaddq_s16(v32, v33);
+    int16x8_t v35 = vqrdmulhq_n_s16(v34, 25080);
+    int16x8_t v36 = vld1q_s16(in + in_stride * 26 + i);
+    int16x8_t v37 = vld1q_s16(in + in_stride * 22 + i);
+    int16x8_t v38 = vaddq_s16(v36, v37);
+    int16x8_t v39 = vaddq_s16(v38, v34);
+    int16x8_t v40 = vqrdmulhq_n_s16(v39, 17734);
+    int16x8_t v41 = vaddq_s16(v35, v40);
+    int16x8_t v42 = vaddq_s16(v31, v41);
+    int16x8_t v43 = vaddq_s16(v33, v26);
+    int16x8_t v44_tmp = vqrdmulhq_n_s16(v43, 13573);
+    int16x8_t v44 = vaddq_s16(v44_tmp, v43);
+    int16x8_t v45 = vaddq_s16(v29, v32);
+    int16x8_t v46 = vaddq_s16(v37, v28);
+    int16x8_t v47 = vaddq_s16(v45, v46);
+    int16x8_t v48 = vaddq_s16(v44, v47);
+    int16x8_t v49 = vqrdmulhq_n_s16(v48, 16705);
+    int16x8_t v50 = vld1q_s16(in + in_stride * 30 + i);
+    int16x8_t v51 = vaddq_s16(v50, v36);
+    int16x8_t v52 = vaddq_s16(v51, v46);
+    int16x8_t v53 = vqrdmulhq_n_s16(v52, 17734);
+    int16x8_t v54 = vaddq_s16(v45, v43);
+    int16x8_t v55_tmp = vqrdmulhq_n_s16(v54, 10045);
+    int16x8_t v55 = vaddq_s16(v55_tmp, v54);
+    int16x8_t v56 = vaddq_s16(v53, v55);
+    int16x8_t v57 = vqrdmulhq_n_s16(v56, 16705);
+    int16x8_t v58 = vaddq_s16(v49, v57);
+    int16x8_t v59 = vaddq_s16(v42, v58);
+    int16x8_t v60 = vqrdmulhq_n_s16(v59, 16463);
+    int16x8_t v61 = vaddq_s16(v25, v60);
+    int16x8_t v62 = vld1q_s16(in + in_stride * 13 + i);
+    int16x8_t v63 = vld1q_s16(in + in_stride * 11 + i);
+    int16x8_t v64 = vaddq_s16(v62, v63);
+    int16x8_t v65 = vld1q_s16(in + in_stride * 5 + i);
+    int16x8_t v66 = vld1q_s16(in + in_stride * 3 + i);
+    int16x8_t v67 = vaddq_s16(v65, v66);
+    int16x8_t v68 = vaddq_s16(v64, v67);
+    int16x8_t v69_tmp = vqrdmulhq_n_s16(v68, 10045);
+    int16x8_t v69 = vaddq_s16(v69_tmp, v68);
+    int16x8_t v70 = vld1q_s16(in + in_stride * 21 + i);
+    int16x8_t v71 = vld1q_s16(in + in_stride * 19 + i);
+    int16x8_t v72 = vaddq_s16(v70, v71);
+    int16x8_t v73 = vld1q_s16(in + in_stride * 29 + i);
+    int16x8_t v74 = vld1q_s16(in + in_stride * 27 + i);
+    int16x8_t v75 = vaddq_s16(v73, v74);
+    int16x8_t v76 = vaddq_s16(v72, v75);
+    int16x8_t v77 = vqrdmulhq_n_s16(v76, 17734);
+    int16x8_t v78 = vaddq_s16(v69, v77);
+    int16x8_t v79 = vqrdmulhq_n_s16(v78, 16705);
+    int16x8_t v80_tmp = vqrdmulhq_n_s16(v67, 13573);
+    int16x8_t v80 = vaddq_s16(v80_tmp, v67);
+    int16x8_t v81 = vaddq_s16(v64, v72);
+    int16x8_t v82 = vaddq_s16(v80, v81);
+    int16x8_t v83 = vqrdmulhq_n_s16(v82, 16705);
+    int16x8_t v84 = vaddq_s16(v79, v83);
+    int16x8_t v85 = vld1q_s16(in + in_stride * 1 + i);
+    int16x8_t v86_tmp = vqrdmulhq_n_s16(v85, 13573);
+    int16x8_t v86 = vaddq_s16(v86_tmp, v85);
+    int16x8_t v87 = vld1q_s16(in + in_stride * 17 + i);
+    int16x8_t v88 = vld1q_s16(in + in_stride * 15 + i);
+    int16x8_t v89 = vaddq_s16(v87, v88);
+    int16x8_t v90 = vaddq_s16(v86, v89);
+    int16x8_t v91 = vld1q_s16(in + in_stride * 9 + i);
+    int16x8_t v92 = vld1q_s16(in + in_stride * 7 + i);
+    int16x8_t v93 = vaddq_s16(v91, v92);
+    int16x8_t v94 = vqrdmulhq_n_s16(v93, 25080);
+    int16x8_t v95 = vld1q_s16(in + in_stride * 25 + i);
+    int16x8_t v96 = vld1q_s16(in + in_stride * 23 + i);
+    int16x8_t v97 = vaddq_s16(v95, v96);
+    int16x8_t v98 = vaddq_s16(v97, v93);
+    int16x8_t v99 = vqrdmulhq_n_s16(v98, 17734);
+    int16x8_t v100 = vaddq_s16(v94, v99);
+    int16x8_t v101 = vaddq_s16(v90, v100);
+    int16x8_t v102 = vaddq_s16(v84, v101);
+    int16x8_t v103 = vaddq_s16(v92, v65);
+    int16x8_t v104 = vaddq_s16(v66, v85);
+    int16x8_t v105 = vaddq_s16(v103, v104);
+    int16x8_t v106_tmp = vqrdmulhq_n_s16(v105, 13573);
+    int16x8_t v106 = vaddq_s16(v106_tmp, v105);
+    int16x8_t v107 = vaddq_s16(v96, v70);
+    int16x8_t v108 = vaddq_s16(v71, v87);
+    int16x8_t v109 = vaddq_s16(v107, v108);
+    int16x8_t v110 = vaddq_s16(v63, v91);
+    int16x8_t v111 = vaddq_s16(v88, v62);
+    int16x8_t v112 = vaddq_s16(v110, v111);
+    int16x8_t v113 = vaddq_s16(v109, v112);
+    int16x8_t v114 = vaddq_s16(v106, v113);
+    int16x8_t v115 = vqrdmulhq_n_s16(v114, 16705);
+    int16x8_t v116 = vaddq_s16(v112, v105);
+    int16x8_t v117 = vqrdmulhq_n_s16(v116, 25080);
+    int16x8_t v118 = vqrdmulhq_n_s16(v116, 17734);
+    int16x8_t v119 = vaddq_s16(v74, v95);
+    int16x8_t v120 = vld1q_s16(in + in_stride * 31 + i);
+    int16x8_t v121 = vaddq_s16(v120, v73);
+    int16x8_t v122 = vaddq_s16(v119, v121);
+    int16x8_t v123 = vaddq_s16(v122, v109);
+    int16x8_t v124 = vqrdmulhq_n_s16(v123, 17734);
+    int16x8_t v125 = vaddq_s16(v118, v124);
+    int16x8_t v126 = vaddq_s16(v117, v125);
+    int16x8_t v127 = vqrdmulhq_n_s16(v126, 16705);
+    int16x8_t v128 = vaddq_s16(v115, v127);
+    int16x8_t v129 = vqrdmulhq_n_s16(v128, 16463);
+    int16x8_t v130_tmp = vqrdmulhq_n_s16(v104, 13573);
+    int16x8_t v130 = vaddq_s16(v130_tmp, v104);
+    int16x8_t v131 = vaddq_s16(v108, v111);
+    int16x8_t v132 = vaddq_s16(v130, v131);
+    int16x8_t v133 = vaddq_s16(v119, v107);
+    int16x8_t v134 = vqrdmulhq_n_s16(v133, 17734);
+    int16x8_t v135 = vaddq_s16(v110, v103);
+    int16x8_t v136_tmp = vqrdmulhq_n_s16(v135, 10045);
+    int16x8_t v136 = vaddq_s16(v136_tmp, v135);
+    int16x8_t v137 = vaddq_s16(v134, v136);
+    int16x8_t v138 = vaddq_s16(v132, v137);
+    int16x8_t v139 = vqrdmulhq_n_s16(v138, 16463);
+    int16x8_t v140 = vaddq_s16(v129, v139);
+    int16x8_t v141 = vaddq_s16(v102, v140);
+    int16x8_t v142 = vqrdmulhq_n_s16(v141, 16404);
+    int16x8_t v143 = vaddq_s16(v61, v142);
+    int16x8_t v144 = vsubq_s16(v0, v1);
+    int16x8_t v145 = vsubq_s16(v4, v6);
+    int16x8_t v146_tmp = vqrdmulhq_n_s16(v145, 10045);
+    int16x8_t v146 = vaddq_s16(v146_tmp, v145);
+    int16x8_t v147 = vaddq_s16(v144, v146);
+    int16x8_t v148 = vsubq_s16(v11, v14);
+    int16x8_t v149 = vqrdmulhq_n_s16(v18, 17734);
+    int16x8_t v150_tmp = vqrdmulhq_n_s16(v17, 10045);
+    int16x8_t v150 = vaddq_s16(v150_tmp, v17);
+    int16x8_t v151 = vsubq_s16(v149, v150);
+    int16x8_t v152 = vaddq_s16(v148, v151);
+    int16x8_t v153 = vqrdmulhq_n_s16(v152, 19705);
+    int16x8_t v154 = vaddq_s16(v147, v153);
+    int16x8_t v155 = vsubq_s16(v27, v30);
+    int16x8_t v156 = vqrdmulhq_n_s16(v34, 17734);
+    int16x8_t v157_tmp = vqrdmulhq_n_s16(v38, 10045);
+    int16x8_t v157 = vaddq_s16(v157_tmp, v38);
+    int16x8_t v158 = vsubq_s16(v156, v157);
+    int16x8_t v159 = vaddq_s16(v155, v158);
+    int16x8_t v160 = vqrdmulhq_n_s16(v54, 13573);
+    int16x8_t v161 = vsubq_s16(v160, v52);
+    int16x8_t v162 = vqrdmulhq_n_s16(v161, 25746);
+    int16x8_t v163 = vsubq_s16(v44, v47);
+    int16x8_t v164 = vqrdmulhq_n_s16(v163, 19705);
+    int16x8_t v165 = vaddq_s16(v162, v164);
+    int16x8_t v166 = vaddq_s16(v159, v165);
+    int16x8_t v167 = vqrdmulhq_n_s16(v166, 17121);
+    int16x8_t v168 = vaddq_s16(v154, v167);
+    int16x8_t v169 = vsubq_s16(v86, v89);
+    int16x8_t v170 = vqrdmulhq_n_s16(v93, 17734);
+    int16x8_t v171_tmp = vqrdmulhq_n_s16(v97, 10045);
+    int16x8_t v171 = vaddq_s16(v171_tmp, v97);
+    int16x8_t v172 = vsubq_s16(v170, v171);
+    int16x8_t v173 = vaddq_s16(v169, v172);
+    int16x8_t v174 = vsubq_s16(v80, v81);
+    int16x8_t v175 = vqrdmulhq_n_s16(v174, 19705);
+    int16x8_t v176 = vqrdmulhq_n_s16(v68, 13573);
+    int16x8_t v177 = vsubq_s16(v176, v76);
+    int16x8_t v178 = vqrdmulhq_n_s16(v177, 25746);
+    int16x8_t v179 = vaddq_s16(v175, v178);
+    int16x8_t v180 = vaddq_s16(v173, v179);
+    int16x8_t v181 = vsubq_s16(v130, v131);
+    int16x8_t v182 = vqrdmulhq_n_s16(v135, 13573);
+    int16x8_t v183 = vsubq_s16(v182, v133);
+    int16x8_t v184_tmp = vqrdmulhq_n_s16(v183, 10045);
+    int16x8_t v184 = vaddq_s16(v184_tmp, v183);
+    int16x8_t v185 = vaddq_s16(v181, v184);
+    int16x8_t v186 = vqrdmulhq_n_s16(v185, 17121);
+    int16x8_t v187 = vqrdmulhq_n_s16(v105, 27867);
+    int16x8_t v188 = vqrdmulhq_n_s16(v113, 19705);
+    int16x8_t v189 = vsubq_s16(v187, v188);
+    int16x8_t v190 = vqrdmulhq_n_s16(v116, 13573);
+    int16x8_t v191 = vsubq_s16(v190, v123);
+    int16x8_t v192 = vqrdmulhq_n_s16(v191, 25746);
+    int16x8_t v193 = vaddq_s16(v189, v192);
+    int16x8_t v194 = vqrdmulhq_n_s16(v193, 17121);
+    int16x8_t v195 = vaddq_s16(v186, v194);
+    int16x8_t v196 = vaddq_s16(v180, v195);
+    int16x8_t v197 = vqrdmulhq_n_s16(v196, 16563);
+    int16x8_t v198 = vaddq_s16(v168, v197);
+    int16x8_t v199 = vsubq_s16(v144, v146);
+    int16x8_t v200 = vsubq_s16(v148, v151);
+    int16x8_t v201 = vqrdmulhq_n_s16(v200, 29490);
+    int16x8_t v202 = vaddq_s16(v199, v201);
+    int16x8_t v203 = vsubq_s16(v155, v158);
+    int16x8_t v204 = vqrdmulhq_n_s16(v163, 29490);
+    int16x8_t v205_tmp = vqrdmulhq_n_s16(v161, 5763);
+    int16x8_t v205 = vaddq_s16(v205_tmp, v161);
+    int16x8_t v206 = vsubq_s16(v204, v205);
+    int16x8_t v207 = vaddq_s16(v203, v206);
+    int16x8_t v208 = vqrdmulhq_n_s16(v207, 18578);
+    int16x8_t v209 = vaddq_s16(v202, v208);
+    int16x8_t v210 = vsubq_s16(v169, v172);
+    int16x8_t v211 = vqrdmulhq_n_s16(v174, 29490);
+    int16x8_t v212_tmp = vqrdmulhq_n_s16(v177, 5763);
+    int16x8_t v212 = vaddq_s16(v212_tmp, v177);
+    int16x8_t v213 = vsubq_s16(v211, v212);
+    int16x8_t v214 = vaddq_s16(v210, v213);
+    int16x8_t v215 = vsubq_s16(v181, v184);
+    int16x8_t v216 = vqrdmulhq_n_s16(v215, 18578);
+    int16x8_t v217 = vqrdmulhq_n_s16(v189, 27803);
+    int16x8_t v218 = vqrdmulhq_n_s16(v191, 21845);
+    int16x8_t v219 = vsubq_s16(v217, v218);
+    int16x8_t v220 = vaddq_s16(v216, v219);
+    int16x8_t v221 = vaddq_s16(v214, v220);
+    int16x8_t v222 = vqrdmulhq_n_s16(v221, 16890);
+    int16x8_t v223 = vaddq_s16(v209, v222);
+    int16x8_t v224 = vsubq_s16(v2, v8);
+    int16x8_t v225 = vsubq_s16(v15, v22);
+    int16x8_t v226_tmp = vqrdmulhq_n_s16(v225, 18446);
+    int16x8_t v226 = vmlaq_n_s16(v226_tmp, v225, 2);
+    int16x8_t v227 = vaddq_s16(v224, v226);
+    int16x8_t v228 = vsubq_s16(v31, v41);
+    int16x8_t v229 = vsubq_s16(v48, v56);
+    int16x8_t v230_tmp = vqrdmulhq_n_s16(v229, 18446);
+    int16x8_t v230 = vmlaq_n_s16(v230_tmp, v229, 2);
+    int16x8_t v231 = vaddq_s16(v228, v230);
+    int16x8_t v232 = vqrdmulhq_n_s16(v231, 21195);
+    int16x8_t v233 = vaddq_s16(v227, v232);
+    int16x8_t v234 = vsubq_s16(v82, v78);
+    int16x8_t v235_tmp = vqrdmulhq_n_s16(v234, 18446);
+    int16x8_t v235 = vmlaq_n_s16(v235_tmp, v234, 2);
+    int16x8_t v236 = vsubq_s16(v90, v100);
+    int16x8_t v237 = vaddq_s16(v235, v236);
+    int16x8_t v238 = vsubq_s16(v132, v137);
+    int16x8_t v239 = vsubq_s16(v114, v126);
+    int16x8_t v240_tmp = vqrdmulhq_n_s16(v239, 18446);
+    int16x8_t v240 = vmlaq_n_s16(v240_tmp, v239, 2);
+    int16x8_t v241 = vaddq_s16(v238, v240);
+    int16x8_t v242 = vqrdmulhq_n_s16(v241, 21195);
+    int16x8_t v243 = vaddq_s16(v237, v242);
+    int16x8_t v244 = vqrdmulhq_n_s16(v243, 17401);
+    int16x8_t v245 = vaddq_s16(v233, v244);
+    int16x8_t v246 = vsubq_s16(v228, v230);
+    int16x8_t v247 = vqrdmulhq_n_s16(v246, 25826);
+    int16x8_t v248 = vsubq_s16(v224, v226);
+    int16x8_t v249 = vaddq_s16(v247, v248);
+    int16x8_t v250 = vsubq_s16(v238, v240);
+    int16x8_t v251 = vqrdmulhq_n_s16(v250, 25826);
+    int16x8_t v252 = vsubq_s16(v236, v235);
+    int16x8_t v253 = vaddq_s16(v251, v252);
+    int16x8_t v254 = vqrdmulhq_n_s16(v253, 18124);
+    int16x8_t v255 = vaddq_s16(v249, v254);
+    int16x8_t v256 = vsubq_s16(v199, v201);
+    int16x8_t v257 = vsubq_s16(v203, v206);
+    int16x8_t v258_tmp = vqrdmulhq_n_s16(v257, 1988);
+    int16x8_t v258 = vaddq_s16(v258_tmp, v257);
+    int16x8_t v259 = vaddq_s16(v256, v258);
+    int16x8_t v260 = vsubq_s16(v210, v213);
+    int16x8_t v261_tmp = vqrdmulhq_n_s16(v219, 25030);
+    int16x8_t v261 = vaddq_s16(v261_tmp, v219);
+    int16x8_t v262 = vsubq_s16(v215, v261);
+    int16x8_t v263_tmp = vqrdmulhq_n_s16(v262, 1988);
+    int16x8_t v263 = vaddq_s16(v263_tmp, v262);
+    int16x8_t v264 = vaddq_s16(v260, v263);
+    int16x8_t v265 = vqrdmulhq_n_s16(v264, 19102);
+    int16x8_t v266 = vaddq_s16(v259, v265);
+    int16x8_t v267 = vsubq_s16(v147, v153);
+    int16x8_t v268 = vsubq_s16(v159, v165);
+    int16x8_t v269_tmp = vqrdmulhq_n_s16(v268, 23673);
+    int16x8_t v269 = vaddq_s16(v269_tmp, v268);
+    int16x8_t v270 = vaddq_s16(v267, v269);
+    int16x8_t v271 = vsubq_s16(v173, v179);
+    int16x8_t v272 = vsubq_s16(v185, v193);
+    int16x8_t v273_tmp = vqrdmulhq_n_s16(v272, 23673);
+    int16x8_t v273 = vaddq_s16(v273_tmp, v272);
+    int16x8_t v274 = vaddq_s16(v271, v273);
+    int16x8_t v275 = vqrdmulhq_n_s16(v274, 20398);
+    int16x8_t v276 = vaddq_s16(v270, v275);
+    int16x8_t v277 = vsubq_s16(v9, v24);
+    int16x8_t v278 = vsubq_s16(v42, v58);
+    int16x8_t v279_tmp = vqrdmulhq_n_s16(v278, 3314);
+    int16x8_t v279 = vmlaq_n_s16(v279_tmp, v278, 5);
+    int16x8_t v280 = vaddq_s16(v277, v279);
+    int16x8_t v281 = vsubq_s16(v138, v128);
+    int16x8_t v282_tmp = vqrdmulhq_n_s16(v281, 3314);
+    int16x8_t v282 = vmlaq_n_s16(v282_tmp, v281, 5);
+    int16x8_t v283 = vsubq_s16(v101, v84);
+    int16x8_t v284 = vaddq_s16(v282, v283);
+    int16x8_t v285 = vqrdmulhq_n_s16(v284, 22112);
+    int16x8_t v286 = vaddq_s16(v280, v285);
+    int16x8_t v287 = vsubq_s16(v277, v279);
+    int16x8_t v288 = vsubq_s16(v283, v282);
+    int16x8_t v289 = vqrdmulhq_n_s16(v288, 24397);
+    int16x8_t v290 = vaddq_s16(v287, v289);
+    int16x8_t v291 = vsubq_s16(v267, v269);
+    int16x8_t v292 = vsubq_s16(v271, v273);
+    int16x8_t v293 = vqrdmulhq_n_s16(v292, 27504);
+    int16x8_t v294 = vaddq_s16(v291, v293);
+    int16x8_t v295 = vsubq_s16(v260, v263);
+    int16x8_t v296 = vqrdmulhq_n_s16(v295, 31869);
+    int16x8_t v297 = vsubq_s16(v256, v258);
+    int16x8_t v298 = vaddq_s16(v296, v297);
+    int16x8_t v299 = vsubq_s16(v248, v247);
+    int16x8_t v300 = vsubq_s16(v252, v251);
+    int16x8_t v301_tmp = vqrdmulhq_n_s16(v300, 5552);
+    int16x8_t v301 = vaddq_s16(v301_tmp, v300);
+    int16x8_t v302 = vaddq_s16(v299, v301);
+    int16x8_t v303 = vsubq_s16(v227, v232);
+    int16x8_t v304 = vsubq_s16(v237, v242);
+    int16x8_t v305_tmp = vqrdmulhq_n_s16(v304, 15865);
+    int16x8_t v305 = vaddq_s16(v305_tmp, v304);
+    int16x8_t v306 = vaddq_s16(v303, v305);
+    int16x8_t v307 = vsubq_s16(v202, v208);
+    int16x8_t v308 = vsubq_s16(v214, v220);
+    int16x8_t v309_tmp = vqrdmulhq_n_s16(v308, 1893);
+    int16x8_t v309 = vmlaq_n_s16(v309_tmp, v308, 2);
+    int16x8_t v310 = vaddq_s16(v307, v309);
+    int16x8_t v311 = vsubq_s16(v154, v167);
+    int16x8_t v312 = vsubq_s16(v180, v195);
+    int16x8_t v313_tmp = vqrdmulhq_n_s16(v312, 13357);
+    int16x8_t v313 = vmlaq_n_s16(v313_tmp, v312, 3);
+    int16x8_t v314 = vaddq_s16(v311, v313);
+    int16x8_t v315 = vsubq_s16(v102, v140);
+    int16x8_t v316_tmp = vqrdmulhq_n_s16(v315, 6226);
+    int16x8_t v316 = vmlaq_n_s16(v316_tmp, v315, 10);
+    int16x8_t v317 = vsubq_s16(v25, v60);
+    int16x8_t v318 = vaddq_s16(v316, v317);
+    int16x8_t v319 = vsubq_s16(v317, v316);
+    int16x8_t v320 = vsubq_s16(v311, v313);
+    int16x8_t v321 = vsubq_s16(v307, v309);
+    int16x8_t v322 = vsubq_s16(v303, v305);
+    int16x8_t v323 = vsubq_s16(v299, v301);
+    int16x8_t v324 = vsubq_s16(v297, v296);
+    int16x8_t v325 = vsubq_s16(v291, v293);
+    int16x8_t v326 = vsubq_s16(v287, v289);
+    int16x8_t v327 = vsubq_s16(v280, v285);
+    int16x8_t v328 = vsubq_s16(v270, v275);
+    int16x8_t v329 = vsubq_s16(v259, v265);
+    int16x8_t v330 = vsubq_s16(v249, v254);
+    int16x8_t v331 = vsubq_s16(v233, v244);
+    int16x8_t v332 = vsubq_s16(v209, v222);
+    int16x8_t v333 = vsubq_s16(v168, v197);
+    int16x8_t v334 = vsubq_s16(v61, v142);
+    vst1q_s16(out + out_stride * 0 + i, v143);
+    vst1q_s16(out + out_stride * 1 + i, v198);
+    vst1q_s16(out + out_stride * 2 + i, v223);
+    vst1q_s16(out + out_stride * 3 + i, v245);
+    vst1q_s16(out + out_stride * 4 + i, v255);
+    vst1q_s16(out + out_stride * 5 + i, v266);
+    vst1q_s16(out + out_stride * 6 + i, v276);
+    vst1q_s16(out + out_stride * 7 + i, v286);
+    vst1q_s16(out + out_stride * 8 + i, v290);
+    vst1q_s16(out + out_stride * 9 + i, v294);
+    vst1q_s16(out + out_stride * 10 + i, v298);
+    vst1q_s16(out + out_stride * 11 + i, v302);
+    vst1q_s16(out + out_stride * 12 + i, v306);
+    vst1q_s16(out + out_stride * 13 + i, v310);
+    vst1q_s16(out + out_stride * 14 + i, v314);
+    vst1q_s16(out + out_stride * 15 + i, v318);
+    vst1q_s16(out + out_stride * 16 + i, v319);
+    vst1q_s16(out + out_stride * 17 + i, v320);
+    vst1q_s16(out + out_stride * 18 + i, v321);
+    vst1q_s16(out + out_stride * 19 + i, v322);
+    vst1q_s16(out + out_stride * 20 + i, v323);
+    vst1q_s16(out + out_stride * 21 + i, v324);
+    vst1q_s16(out + out_stride * 22 + i, v325);
+    vst1q_s16(out + out_stride * 23 + i, v326);
+    vst1q_s16(out + out_stride * 24 + i, v327);
+    vst1q_s16(out + out_stride * 25 + i, v328);
+    vst1q_s16(out + out_stride * 26 + i, v329);
+    vst1q_s16(out + out_stride * 27 + i, v330);
+    vst1q_s16(out + out_stride * 28 + i, v331);
+    vst1q_s16(out + out_stride * 29 + i, v332);
+    vst1q_s16(out + out_stride * 30 + i, v333);
+    vst1q_s16(out + out_stride * 31 + i, v334);
+  }
+}
diff --git a/third_party/jpeg-xl/lib/jxl/fast_dct64-inl.h b/third_party/jpeg-xl/lib/jxl/fast_dct64-inl.h
new file mode 100644
index 0000000000..400da1a9de
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/fast_dct64-inl.h
@@ -0,0 +1,985 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+/* This file is automatically generated. Do not modify it directly. */
+#if HWY_TARGET != HWY_NEON
+#error "only include this file from fast_dct-inl.h"
+#endif
+
+constexpr size_t FastIDCTIntegerBits(FastDCTTag<64>) { return 1; }
+
+void FastIDCT(FastDCTTag<64>, const int16_t* in, size_t in_stride, int16_t* out,
+              size_t out_stride, size_t count) {
+  JXL_ASSERT(count % 8 == 0);
+  for (size_t i = 0; i < count; i += 8) {
+    int16x8_t v0 = vld1q_s16(in + in_stride * 0 + i);
+    int16x8_t v1 = vld1q_s16(in + in_stride * 32 + i);
+    int16x8_t v2 = vaddq_s16(v0, v1);
+    int16x8_t v3 = vld1q_s16(in + in_stride * 16 + i);
+    int16x8_t v4_tmp = vqrdmulhq_n_s16(v3, 13573);
+    int16x8_t v4 = vaddq_s16(v4_tmp, v3);
+    int16x8_t v5 = vld1q_s16(in + in_stride * 48 + i);
+    int16x8_t v6 = vaddq_s16(v5, v3);
+    int16x8_t v7 = vaddq_s16(v4, v6);
+    int16x8_t v8 = vqrdmulhq_n_s16(v7, 17734);
+    int16x8_t v9 = vaddq_s16(v2, v8);
+    int16x8_t v10 = vld1q_s16(in + in_stride * 8 + i);
+    int16x8_t v11_tmp = vqrdmulhq_n_s16(v10, 13573);
+    int16x8_t v11 = vaddq_s16(v11_tmp, v10);
+    int16x8_t v12 = vld1q_s16(in + in_stride * 40 + i);
+    int16x8_t v13 = vld1q_s16(in + in_stride * 24 + i);
+    int16x8_t v14 = vaddq_s16(v12, v13);
+    int16x8_t v15 = vaddq_s16(v11, v14);
+    int16x8_t v16 = vld1q_s16(in + in_stride * 56 + i);
+    int16x8_t v17 = vaddq_s16(v16, v12);
+    int16x8_t v18 = vaddq_s16(v13, v10);
+    int16x8_t v19 = vaddq_s16(v17, v18);
+    int16x8_t v20 = vqrdmulhq_n_s16(v19, 17734);
+    int16x8_t v21 = vqrdmulhq_n_s16(v18, 25080);
+    int16x8_t v22 = vaddq_s16(v20, v21);
+    int16x8_t v23 = vaddq_s16(v15, v22);
+    int16x8_t v24 = vqrdmulhq_n_s16(v23, 16705);
+    int16x8_t v25 = vaddq_s16(v9, v24);
+    int16x8_t v26 = vld1q_s16(in + in_stride * 4 + i);
+    int16x8_t v27_tmp = vqrdmulhq_n_s16(v26, 13573);
+    int16x8_t v27 = vaddq_s16(v27_tmp, v26);
+    int16x8_t v28 = vld1q_s16(in + in_stride * 36 + i);
+    int16x8_t v29 = vld1q_s16(in + in_stride * 28 + i);
+    int16x8_t v30 = vaddq_s16(v28, v29);
+    int16x8_t v31 = vaddq_s16(v27, v30);
+    int16x8_t v32 = vld1q_s16(in + in_stride * 20 + i);
+    int16x8_t v33 = vld1q_s16(in + in_stride * 12 + i);
+    int16x8_t v34 = vaddq_s16(v32, v33);
+    int16x8_t v35 = vqrdmulhq_n_s16(v34, 25080);
+    int16x8_t v36 = vld1q_s16(in + in_stride * 52 + i);
+    int16x8_t v37 = vld1q_s16(in + in_stride * 44 + i);
+    int16x8_t v38 = vaddq_s16(v36, v37);
+    int16x8_t v39 = vaddq_s16(v38, v34);
+    int16x8_t v40 = vqrdmulhq_n_s16(v39, 17734);
+    int16x8_t v41 = vaddq_s16(v35, v40);
+    int16x8_t v42 = vaddq_s16(v31, v41);
+    int16x8_t v43 = vaddq_s16(v33, v26);
+    int16x8_t v44_tmp = vqrdmulhq_n_s16(v43, 13573);
+    int16x8_t v44 = vaddq_s16(v44_tmp, v43);
+    int16x8_t v45 = vaddq_s16(v37, v28);
+    int16x8_t v46 = vaddq_s16(v29, v32);
+    int16x8_t v47 = vaddq_s16(v45, v46);
+    int16x8_t v48 = vaddq_s16(v44, v47);
+    int16x8_t v49 = vqrdmulhq_n_s16(v48, 16705);
+    int16x8_t v50 = vaddq_s16(v46, v43);
+    int16x8_t v51_tmp = vqrdmulhq_n_s16(v50, 10045);
+    int16x8_t v51 = vaddq_s16(v51_tmp, v50);
+    int16x8_t v52 = vld1q_s16(in + in_stride * 60 + i);
+    int16x8_t v53 = vaddq_s16(v52, v36);
+    int16x8_t v54 = vaddq_s16(v53, v45);
+    int16x8_t v55 = vqrdmulhq_n_s16(v54, 17734);
+    int16x8_t v56 = vaddq_s16(v51, v55);
+    int16x8_t v57 = vqrdmulhq_n_s16(v56, 16705);
+    int16x8_t v58 = vaddq_s16(v49, v57);
+    int16x8_t v59 = vaddq_s16(v42, v58);
+    int16x8_t v60 = vqrdmulhq_n_s16(v59, 16463);
+    int16x8_t v61 = vaddq_s16(v25, v60);
+    int16x8_t v62 = vld1q_s16(in + in_stride * 2 + i);
+    int16x8_t v63_tmp = vqrdmulhq_n_s16(v62, 13573);
+    int16x8_t v63 = vaddq_s16(v63_tmp, v62);
+    int16x8_t v64 = vld1q_s16(in + in_stride * 34 + i);
+    int16x8_t v65 = vld1q_s16(in + in_stride * 30 + i);
+    int16x8_t v66 = vaddq_s16(v64, v65);
+    int16x8_t v67 = vaddq_s16(v63, v66);
+    int16x8_t v68 = vld1q_s16(in + in_stride * 18 + i);
+    int16x8_t v69 = vld1q_s16(in + in_stride * 14 + i);
+    int16x8_t v70 = vaddq_s16(v68, v69);
+    int16x8_t v71 = vqrdmulhq_n_s16(v70, 25080);
+    int16x8_t v72 = vld1q_s16(in + in_stride * 50 + i);
+    int16x8_t v73 = vld1q_s16(in + in_stride * 46 + i);
+    int16x8_t v74 = vaddq_s16(v72, v73);
+    int16x8_t v75 = vaddq_s16(v74, v70);
+    int16x8_t v76 = vqrdmulhq_n_s16(v75, 17734);
+    int16x8_t v77 = vaddq_s16(v71, v76);
+    int16x8_t v78 = vaddq_s16(v67, v77);
+    int16x8_t v79 = vld1q_s16(in + in_stride * 10 + i);
+    int16x8_t v80 = vld1q_s16(in + in_stride * 6 + i);
+    int16x8_t v81 = vaddq_s16(v79, v80);
+    int16x8_t v82_tmp = vqrdmulhq_n_s16(v81, 13573);
+    int16x8_t v82 = vaddq_s16(v82_tmp, v81);
+    int16x8_t v83 = vld1q_s16(in + in_stride * 42 + i);
+    int16x8_t v84 = vld1q_s16(in + in_stride * 38 + i);
+    int16x8_t v85 = vaddq_s16(v83, v84);
+    int16x8_t v86 = vld1q_s16(in + in_stride * 26 + i);
+    int16x8_t v87 = vld1q_s16(in + in_stride * 22 + i);
+    int16x8_t v88 = vaddq_s16(v86, v87);
+    int16x8_t v89 = vaddq_s16(v85, v88);
+    int16x8_t v90 = vaddq_s16(v82, v89);
+    int16x8_t v91 = vqrdmulhq_n_s16(v90, 16705);
+    int16x8_t v92 = vaddq_s16(v88, v81);
+    int16x8_t v93_tmp = vqrdmulhq_n_s16(v92, 10045);
+    int16x8_t v93 = vaddq_s16(v93_tmp, v92);
+    int16x8_t v94 = vld1q_s16(in + in_stride * 58 + i);
+    int16x8_t v95 = vld1q_s16(in + in_stride * 54 + i);
+    int16x8_t v96 = vaddq_s16(v94, v95);
+    int16x8_t v97 = vaddq_s16(v96, v85);
+    int16x8_t v98 = vqrdmulhq_n_s16(v97, 17734);
+    int16x8_t v99 = vaddq_s16(v93, v98);
+    int16x8_t v100 = vqrdmulhq_n_s16(v99, 16705);
+    int16x8_t v101 = vaddq_s16(v91, v100);
+    int16x8_t v102 = vaddq_s16(v78, v101);
+    int16x8_t v103 = vaddq_s16(v69, v79);
+    int16x8_t v104 = vaddq_s16(v80, v62);
+    int16x8_t v105 = vaddq_s16(v103, v104);
+    int16x8_t v106_tmp = vqrdmulhq_n_s16(v105, 13573);
+    int16x8_t v106 = vaddq_s16(v106_tmp, v105);
+    int16x8_t v107 = vaddq_s16(v73, v83);
+    int16x8_t v108 = vaddq_s16(v84, v64);
+    int16x8_t v109 = vaddq_s16(v107, v108);
+    int16x8_t v110 = vaddq_s16(v65, v86);
+    int16x8_t v111 = vaddq_s16(v87, v68);
+    int16x8_t v112 = vaddq_s16(v110, v111);
+    int16x8_t v113 = vaddq_s16(v109, v112);
+    int16x8_t v114 = vaddq_s16(v106, v113);
+    int16x8_t v115 = vqrdmulhq_n_s16(v114, 16705);
+    int16x8_t v116 = vaddq_s16(v112, v105);
+    int16x8_t v117 = vqrdmulhq_n_s16(v116, 25080);
+    int16x8_t v118 = vqrdmulhq_n_s16(v116, 17734);
+    int16x8_t v119 = vld1q_s16(in + in_stride * 62 + i);
+    int16x8_t v120 = vaddq_s16(v119, v94);
+    int16x8_t v121 = vaddq_s16(v95, v72);
+    int16x8_t v122 = vaddq_s16(v120, v121);
+    int16x8_t v123 = vaddq_s16(v122, v109);
+    int16x8_t v124 = vqrdmulhq_n_s16(v123, 17734);
+    int16x8_t v125 = vaddq_s16(v118, v124);
+    int16x8_t v126 = vaddq_s16(v117, v125);
+    int16x8_t v127 = vqrdmulhq_n_s16(v126, 16705);
+    int16x8_t v128 = vaddq_s16(v115, v127);
+    int16x8_t v129 = vqrdmulhq_n_s16(v128, 16463);
+    int16x8_t v130_tmp = vqrdmulhq_n_s16(v104, 13573);
+    int16x8_t v130 = vaddq_s16(v130_tmp, v104);
+    int16x8_t v131 = vaddq_s16(v108, v110);
+    int16x8_t v132 = vaddq_s16(v130, v131);
+    int16x8_t v133 = vaddq_s16(v111, v103);
+    int16x8_t v134_tmp = vqrdmulhq_n_s16(v133, 10045);
+    int16x8_t v134 = vaddq_s16(v134_tmp, v133);
+    int16x8_t v135 = vaddq_s16(v121, v107);
+    int16x8_t v136 = vqrdmulhq_n_s16(v135, 17734);
+    int16x8_t v137 = vaddq_s16(v134, v136);
+    int16x8_t v138 = vaddq_s16(v132, v137);
+    int16x8_t v139 = vqrdmulhq_n_s16(v138, 16463);
+    int16x8_t v140 = vaddq_s16(v129, v139);
+    int16x8_t v141 = vaddq_s16(v102, v140);
+    int16x8_t v142 = vqrdmulhq_n_s16(v141, 16404);
+    int16x8_t v143 = vaddq_s16(v61, v142);
+    int16x8_t v144 = vld1q_s16(in + in_stride * 1 + i);
+    int16x8_t v145_tmp = vqrdmulhq_n_s16(v144, 13573);
+    int16x8_t v145 = vaddq_s16(v145_tmp, v144);
+    int16x8_t v146 = vld1q_s16(in + in_stride * 33 + i);
+    int16x8_t v147 = vld1q_s16(in + in_stride * 31 + i);
+    int16x8_t v148 = vaddq_s16(v146, v147);
+    int16x8_t v149 = vaddq_s16(v145, v148);
+    int16x8_t v150 = vld1q_s16(in + in_stride * 17 + i);
+    int16x8_t v151 = vld1q_s16(in + in_stride * 15 + i);
+    int16x8_t v152 = vaddq_s16(v150, v151);
+    int16x8_t v153 = vqrdmulhq_n_s16(v152, 25080);
+    int16x8_t v154 = vld1q_s16(in + in_stride * 49 + i);
+    int16x8_t v155 = vld1q_s16(in + in_stride * 47 + i);
+    int16x8_t v156 = vaddq_s16(v154, v155);
+    int16x8_t v157 = vaddq_s16(v156, v152);
+    int16x8_t v158 = vqrdmulhq_n_s16(v157, 17734);
+    int16x8_t v159 = vaddq_s16(v153, v158);
+    int16x8_t v160 = vaddq_s16(v149, v159);
+    int16x8_t v161 = vld1q_s16(in + in_stride * 9 + i);
+    int16x8_t v162 = vld1q_s16(in + in_stride * 7 + i);
+    int16x8_t v163 = vaddq_s16(v161, v162);
+    int16x8_t v164_tmp = vqrdmulhq_n_s16(v163, 13573);
+    int16x8_t v164 = vaddq_s16(v164_tmp, v163);
+    int16x8_t v165 = vld1q_s16(in + in_stride * 41 + i);
+    int16x8_t v166 = vld1q_s16(in + in_stride * 39 + i);
+    int16x8_t v167 = vaddq_s16(v165, v166);
+    int16x8_t v168 = vld1q_s16(in + in_stride * 25 + i);
+    int16x8_t v169 = vld1q_s16(in + in_stride * 23 + i);
+    int16x8_t v170 = vaddq_s16(v168, v169);
+    int16x8_t v171 = vaddq_s16(v167, v170);
+    int16x8_t v172 = vaddq_s16(v164, v171);
+    int16x8_t v173 = vqrdmulhq_n_s16(v172, 16705);
+    int16x8_t v174 = vaddq_s16(v170, v163);
+    int16x8_t v175_tmp = vqrdmulhq_n_s16(v174, 10045);
+    int16x8_t v175 = vaddq_s16(v175_tmp, v174);
+    int16x8_t v176 = vld1q_s16(in + in_stride * 57 + i);
+    int16x8_t v177 = vld1q_s16(in + in_stride * 55 + i);
+    int16x8_t v178 = vaddq_s16(v176, v177);
+    int16x8_t v179 = vaddq_s16(v178, v167);
+    int16x8_t v180 = vqrdmulhq_n_s16(v179, 17734);
+    int16x8_t v181 = vaddq_s16(v175, v180);
+    int16x8_t v182 = vqrdmulhq_n_s16(v181, 16705);
+    int16x8_t v183 = vaddq_s16(v173, v182);
+    int16x8_t v184 = vaddq_s16(v160, v183);
+    int16x8_t v185 = vld1q_s16(in + in_stride * 37 + i);
+    int16x8_t v186 = vld1q_s16(in + in_stride * 35 + i);
+    int16x8_t v187 = vaddq_s16(v185, v186);
+    int16x8_t v188 = vld1q_s16(in + in_stride * 45 + i);
+    int16x8_t v189 = vld1q_s16(in + in_stride * 43 + i);
+    int16x8_t v190 = vaddq_s16(v188, v189);
+    int16x8_t v191 = vaddq_s16(v187, v190);
+    int16x8_t v192 = vld1q_s16(in + in_stride * 29 + i);
+    int16x8_t v193 = vld1q_s16(in + in_stride * 27 + i);
+    int16x8_t v194 = vaddq_s16(v192, v193);
+    int16x8_t v195 = vld1q_s16(in + in_stride * 21 + i);
+    int16x8_t v196 = vld1q_s16(in + in_stride * 19 + i);
+    int16x8_t v197 = vaddq_s16(v195, v196);
+    int16x8_t v198 = vaddq_s16(v194, v197);
+    int16x8_t v199 = vaddq_s16(v191, v198);
+    int16x8_t v200 = vld1q_s16(in + in_stride * 5 + i);
+    int16x8_t v201 = vld1q_s16(in + in_stride * 3 + i);
+    int16x8_t v202 = vaddq_s16(v200, v201);
+    int16x8_t v203 = vld1q_s16(in + in_stride * 13 + i);
+    int16x8_t v204 = vld1q_s16(in + in_stride * 11 + i);
+    int16x8_t v205 = vaddq_s16(v203, v204);
+    int16x8_t v206 = vaddq_s16(v202, v205);
+    int16x8_t v207_tmp = vqrdmulhq_n_s16(v206, 13573);
+    int16x8_t v207 = vaddq_s16(v207_tmp, v206);
+    int16x8_t v208 = vaddq_s16(v199, v207);
+    int16x8_t v209 = vqrdmulhq_n_s16(v208, 16705);
+    int16x8_t v210 = vaddq_s16(v198, v206);
+    int16x8_t v211 = vqrdmulhq_n_s16(v210, 25080);
+    int16x8_t v212 = vqrdmulhq_n_s16(v210, 17734);
+    int16x8_t v213 = vld1q_s16(in + in_stride * 53 + i);
+    int16x8_t v214 = vld1q_s16(in + in_stride * 51 + i);
+    int16x8_t v215 = vaddq_s16(v213, v214);
+    int16x8_t v216 = vld1q_s16(in + in_stride * 61 + i);
+    int16x8_t v217 = vld1q_s16(in + in_stride * 59 + i);
+    int16x8_t v218 = vaddq_s16(v216, v217);
+    int16x8_t v219 = vaddq_s16(v215, v218);
+    int16x8_t v220 = vaddq_s16(v219, v191);
+    int16x8_t v221 = vqrdmulhq_n_s16(v220, 17734);
+    int16x8_t v222 = vaddq_s16(v212, v221);
+    int16x8_t v223 = vaddq_s16(v211, v222);
+    int16x8_t v224 = vqrdmulhq_n_s16(v223, 16705);
+    int16x8_t v225 = vaddq_s16(v209, v224);
+    int16x8_t v226 = vqrdmulhq_n_s16(v225, 16463);
+    int16x8_t v227_tmp = vqrdmulhq_n_s16(v202, 13573);
+    int16x8_t v227 = vaddq_s16(v227_tmp, v202);
+    int16x8_t v228 = vaddq_s16(v187, v194);
+    int16x8_t v229 = vaddq_s16(v227, v228);
+    int16x8_t v230 = vaddq_s16(v215, v190);
+    int16x8_t v231 = vqrdmulhq_n_s16(v230, 17734);
+    int16x8_t v232 = vaddq_s16(v197, v205);
+    int16x8_t v233_tmp = vqrdmulhq_n_s16(v232, 10045);
+    int16x8_t v233 = vaddq_s16(v233_tmp, v232);
+    int16x8_t v234 = vaddq_s16(v231, v233);
+    int16x8_t v235 = vaddq_s16(v229, v234);
+    int16x8_t v236 = vqrdmulhq_n_s16(v235, 16463);
+    int16x8_t v237 = vaddq_s16(v226, v236);
+    int16x8_t v238 = vaddq_s16(v184, v237);
+    int16x8_t v239 = vaddq_s16(v201, v144);
+    int16x8_t v240_tmp = vqrdmulhq_n_s16(v239, 13573);
+    int16x8_t v240 = vaddq_s16(v240_tmp, v239);
+    int16x8_t v241 = vaddq_s16(v186, v146);
+    int16x8_t v242 = vaddq_s16(v147, v192);
+    int16x8_t v243 = vaddq_s16(v241, v242);
+    int16x8_t v244 = vaddq_s16(v240, v243);
+    int16x8_t v245 = vaddq_s16(v196, v150);
+    int16x8_t v246 = vaddq_s16(v151, v203);
+    int16x8_t v247 = vaddq_s16(v245, v246);
+    int16x8_t v248_tmp = vqrdmulhq_n_s16(v247, 10045);
+    int16x8_t v248 = vaddq_s16(v248_tmp, v247);
+    int16x8_t v249 = vaddq_s16(v155, v188);
+    int16x8_t v250 = vaddq_s16(v214, v154);
+    int16x8_t v251 = vaddq_s16(v249, v250);
+    int16x8_t v252 = vqrdmulhq_n_s16(v251, 17734);
+    int16x8_t v253 = vaddq_s16(v248, v252);
+    int16x8_t v254 = vaddq_s16(v244, v253);
+    int16x8_t v255 = vaddq_s16(v204, v161);
+    int16x8_t v256 = vaddq_s16(v162, v200);
+    int16x8_t v257 = vaddq_s16(v255, v256);
+    int16x8_t v258_tmp = vqrdmulhq_n_s16(v257, 13573);
+    int16x8_t v258 = vaddq_s16(v258_tmp, v257);
+    int16x8_t v259 = vaddq_s16(v189, v165);
+    int16x8_t v260 = vaddq_s16(v166, v185);
+    int16x8_t v261 = vaddq_s16(v259, v260);
+    int16x8_t v262 = vaddq_s16(v169, v195);
+    int16x8_t v263 = vaddq_s16(v193, v168);
+    int16x8_t v264 = vaddq_s16(v262, v263);
+    int16x8_t v265 = vaddq_s16(v261, v264);
+    int16x8_t v266 = vaddq_s16(v258, v265);
+    int16x8_t v267 = vqrdmulhq_n_s16(v266, 16705);
+    int16x8_t v268 = vaddq_s16(v264, v257);
+    int16x8_t v269 = vqrdmulhq_n_s16(v268, 25080);
+    int16x8_t v270 = vaddq_s16(v217, v176);
+    int16x8_t v271 = vaddq_s16(v177, v213);
+    int16x8_t v272 = vaddq_s16(v270, v271);
+    int16x8_t v273 = vaddq_s16(v272, v261);
+    int16x8_t v274 = vqrdmulhq_n_s16(v273, 17734);
+    int16x8_t v275 = vqrdmulhq_n_s16(v268, 17734);
+    int16x8_t v276 = vaddq_s16(v274, v275);
+    int16x8_t v277 = vaddq_s16(v269, v276);
+    int16x8_t v278 = vqrdmulhq_n_s16(v277, 16705);
+    int16x8_t v279 = vaddq_s16(v267, v278);
+    int16x8_t v280 = vaddq_s16(v254, v279);
+    int16x8_t v281 = vqrdmulhq_n_s16(v280, 16404);
+    int16x8_t v282 = vaddq_s16(v256, v239);
+    int16x8_t v283_tmp = vqrdmulhq_n_s16(v282, 13573);
+    int16x8_t v283 = vaddq_s16(v283_tmp, v282);
+    int16x8_t v284 = vaddq_s16(v260, v241);
+    int16x8_t v285 = vaddq_s16(v242, v263);
+    int16x8_t v286 = vaddq_s16(v284, v285);
+    int16x8_t v287 = vaddq_s16(v283, v286);
+    int16x8_t v288 = vaddq_s16(v262, v245);
+    int16x8_t v289 = vaddq_s16(v246, v255);
+    int16x8_t v290 = vaddq_s16(v288, v289);
+    int16x8_t v291 = vqrdmulhq_n_s16(v290, 25080);
+    int16x8_t v292 = vqrdmulhq_n_s16(v290, 17734);
+    int16x8_t v293 = vaddq_s16(v271, v250);
+    int16x8_t v294 = vaddq_s16(v249, v259);
+    int16x8_t v295 = vaddq_s16(v293, v294);
+    int16x8_t v296 = vqrdmulhq_n_s16(v295, 17734);
+    int16x8_t v297 = vaddq_s16(v292, v296);
+    int16x8_t v298 = vaddq_s16(v291, v297);
+    int16x8_t v299 = vaddq_s16(v287, v298);
+    int16x8_t v300 = vqrdmulhq_n_s16(v299, 16463);
+    int16x8_t v301 = vaddq_s16(v289, v282);
+    int16x8_t v302 = vqrdmulhq_n_s16(v301, 23624);
+    int16x8_t v303 = vaddq_s16(v294, v284);
+    int16x8_t v304 = vqrdmulhq_n_s16(v303, 19705);
+    int16x8_t v305 = vaddq_s16(v285, v288);
+    int16x8_t v306 = vqrdmulhq_n_s16(v305, 19705);
+    int16x8_t v307 = vaddq_s16(v304, v306);
+    int16x8_t v308 = vqrdmulhq_n_s16(v307, 27779);
+    int16x8_t v309 = vaddq_s16(v302, v308);
+    int16x8_t v310 = vaddq_s16(v305, v301);
+    int16x8_t v311 = vqrdmulhq_n_s16(v310, 25080);
+    int16x8_t v312 = vqrdmulhq_n_s16(v310, 17734);
+    int16x8_t v313 = vld1q_s16(in + in_stride * 63 + i);
+    int16x8_t v314 = vaddq_s16(v313, v216);
+    int16x8_t v315 = vaddq_s16(v314, v270);
+    int16x8_t v316 = vaddq_s16(v315, v293);
+    int16x8_t v317 = vqrdmulhq_n_s16(v316, 25746);
+    int16x8_t v318 = vqrdmulhq_n_s16(v303, 25746);
+    int16x8_t v319 = vaddq_s16(v317, v318);
+    int16x8_t v320 = vqrdmulhq_n_s16(v319, 22571);
+    int16x8_t v321 = vaddq_s16(v312, v320);
+    int16x8_t v322 = vaddq_s16(v311, v321);
+    int16x8_t v323 = vqrdmulhq_n_s16(v322, 16705);
+    int16x8_t v324 = vaddq_s16(v309, v323);
+    int16x8_t v325 = vqrdmulhq_n_s16(v324, 16463);
+    int16x8_t v326 = vaddq_s16(v300, v325);
+    int16x8_t v327 = vqrdmulhq_n_s16(v326, 16404);
+    int16x8_t v328 = vaddq_s16(v281, v327);
+    int16x8_t v329 = vaddq_s16(v238, v328);
+    int16x8_t v330 = vqrdmulhq_n_s16(v329, 16389);
+    int16x8_t v331 = vaddq_s16(v143, v330);
+    int16x8_t v332 = vsubq_s16(v82, v89);
+    int16x8_t v333 = vqrdmulhq_n_s16(v332, 19705);
+    int16x8_t v334 = vqrdmulhq_n_s16(v92, 13573);
+    int16x8_t v335 = vsubq_s16(v334, v97);
+    int16x8_t v336 = vqrdmulhq_n_s16(v335, 25746);
+    int16x8_t v337 = vaddq_s16(v333, v336);
+    int16x8_t v338 = vsubq_s16(v63, v66);
+    int16x8_t v339 = vqrdmulhq_n_s16(v70, 17734);
+    int16x8_t v340_tmp = vqrdmulhq_n_s16(v74, 10045);
+    int16x8_t v340 = vaddq_s16(v340_tmp, v74);
+    int16x8_t v341 = vsubq_s16(v339, v340);
+    int16x8_t v342 = vaddq_s16(v338, v341);
+    int16x8_t v343 = vaddq_s16(v337, v342);
+    int16x8_t v344 = vsubq_s16(v130, v131);
+    int16x8_t v345 = vqrdmulhq_n_s16(v133, 13573);
+    int16x8_t v346 = vsubq_s16(v345, v135);
+    int16x8_t v347_tmp = vqrdmulhq_n_s16(v346, 10045);
+    int16x8_t v347 = vaddq_s16(v347_tmp, v346);
+    int16x8_t v348 = vaddq_s16(v344, v347);
+    int16x8_t v349 = vqrdmulhq_n_s16(v348, 17121);
+    int16x8_t v350 = vqrdmulhq_n_s16(v105, 27867);
+    int16x8_t v351 = vqrdmulhq_n_s16(v113, 19705);
+    int16x8_t v352 = vsubq_s16(v350, v351);
+    int16x8_t v353 = vqrdmulhq_n_s16(v116, 13573);
+    int16x8_t v354 = vsubq_s16(v353, v123);
+    int16x8_t v355 = vqrdmulhq_n_s16(v354, 25746);
+    int16x8_t v356 = vaddq_s16(v352, v355);
+    int16x8_t v357 = vqrdmulhq_n_s16(v356, 17121);
+    int16x8_t v358 = vaddq_s16(v349, v357);
+    int16x8_t v359 = vaddq_s16(v343, v358);
+    int16x8_t v360 = vqrdmulhq_n_s16(v359, 16563);
+    int16x8_t v361 = vsubq_s16(v27, v30);
+    int16x8_t v362 = vqrdmulhq_n_s16(v34, 17734);
+    int16x8_t v363_tmp = vqrdmulhq_n_s16(v38, 10045);
+    int16x8_t v363 = vaddq_s16(v363_tmp, v38);
+    int16x8_t v364 = vsubq_s16(v362, v363);
+    int16x8_t v365 = vaddq_s16(v361, v364);
+    int16x8_t v366 = vsubq_s16(v44, v47);
+    int16x8_t v367 = vqrdmulhq_n_s16(v366, 19705);
+    int16x8_t v368 = vqrdmulhq_n_s16(v50, 13573);
+    int16x8_t v369 = vsubq_s16(v368, v54);
+    int16x8_t v370 = vqrdmulhq_n_s16(v369, 25746);
+    int16x8_t v371 = vaddq_s16(v367, v370);
+    int16x8_t v372 = vaddq_s16(v365, v371);
+    int16x8_t v373 = vqrdmulhq_n_s16(v372, 17121);
+    int16x8_t v374 = vsubq_s16(v0, v1);
+    int16x8_t v375 = vsubq_s16(v4, v6);
+    int16x8_t v376_tmp = vqrdmulhq_n_s16(v375, 10045);
+    int16x8_t v376 = vaddq_s16(v376_tmp, v375);
+    int16x8_t v377 = vaddq_s16(v374, v376);
+    int16x8_t v378 = vsubq_s16(v11, v14);
+    int16x8_t v379 = vqrdmulhq_n_s16(v18, 17734);
+    int16x8_t v380_tmp = vqrdmulhq_n_s16(v17, 10045);
+    int16x8_t v380 = vaddq_s16(v380_tmp, v17);
+    int16x8_t v381 = vsubq_s16(v379, v380);
+    int16x8_t v382 = vaddq_s16(v378, v381);
+    int16x8_t v383 = vqrdmulhq_n_s16(v382, 19705);
+    int16x8_t v384 = vaddq_s16(v377, v383);
+    int16x8_t v385 = vaddq_s16(v373, v384);
+    int16x8_t v386 = vaddq_s16(v360, v385);
+    int16x8_t v387 = vsubq_s16(v145, v148);
+    int16x8_t v388 = vqrdmulhq_n_s16(v152, 17734);
+    int16x8_t v389_tmp = vqrdmulhq_n_s16(v156, 10045);
+    int16x8_t v389 = vaddq_s16(v389_tmp, v156);
+    int16x8_t v390 = vsubq_s16(v388, v389);
+    int16x8_t v391 = vaddq_s16(v387, v390);
+    int16x8_t v392 = vsubq_s16(v164, v171);
+    int16x8_t v393 = vqrdmulhq_n_s16(v392, 19705);
+    int16x8_t v394 = vqrdmulhq_n_s16(v174, 13573);
+    int16x8_t v395 = vsubq_s16(v394, v179);
+    int16x8_t v396 = vqrdmulhq_n_s16(v395, 25746);
+    int16x8_t v397 = vaddq_s16(v393, v396);
+    int16x8_t v398 = vaddq_s16(v391, v397);
+    int16x8_t v399 = vsubq_s16(v227, v228);
+    int16x8_t v400 = vqrdmulhq_n_s16(v232, 13573);
+    int16x8_t v401 = vsubq_s16(v400, v230);
+    int16x8_t v402_tmp = vqrdmulhq_n_s16(v401, 10045);
+    int16x8_t v402 = vaddq_s16(v402_tmp, v401);
+    int16x8_t v403 = vaddq_s16(v399, v402);
+    int16x8_t v404 = vqrdmulhq_n_s16(v403, 17121);
+    int16x8_t v405 = vqrdmulhq_n_s16(v206, 27867);
+    int16x8_t v406 = vqrdmulhq_n_s16(v199, 19705);
+    int16x8_t v407 = vsubq_s16(v405, v406);
+    int16x8_t v408 = vqrdmulhq_n_s16(v210, 13573);
+    int16x8_t v409 = vsubq_s16(v408, v220);
+    int16x8_t v410 = vqrdmulhq_n_s16(v409, 25746);
+    int16x8_t v411 = vaddq_s16(v407, v410);
+    int16x8_t v412 = vqrdmulhq_n_s16(v411, 17121);
+    int16x8_t v413 = vaddq_s16(v404, v412);
+    int16x8_t v414 = vaddq_s16(v398, v413);
+    int16x8_t v415 = vsubq_s16(v240, v243);
+    int16x8_t v416 = vqrdmulhq_n_s16(v247, 13573);
+    int16x8_t v417 = vsubq_s16(v416, v251);
+    int16x8_t v418_tmp = vqrdmulhq_n_s16(v417, 10045);
+    int16x8_t v418 = vaddq_s16(v418_tmp, v417);
+    int16x8_t v419 = vaddq_s16(v415, v418);
+    int16x8_t v420 = vqrdmulhq_n_s16(v257, 27867);
+    int16x8_t v421 = vqrdmulhq_n_s16(v265, 19705);
+    int16x8_t v422 = vsubq_s16(v420, v421);
+    int16x8_t v423 = vqrdmulhq_n_s16(v268, 13573);
+    int16x8_t v424 = vsubq_s16(v423, v273);
+    int16x8_t v425 = vqrdmulhq_n_s16(v424, 25746);
+    int16x8_t v426 = vaddq_s16(v422, v425);
+    int16x8_t v427 = vaddq_s16(v419, v426);
+    int16x8_t v428 = vqrdmulhq_n_s16(v427, 16563);
+    int16x8_t v429 = vqrdmulhq_n_s16(v301, 27867);
+    int16x8_t v430 = vsubq_s16(v429, v307);
+    int16x8_t v431 = vqrdmulhq_n_s16(v310, 10664);
+    int16x8_t v432 = vsubq_s16(v431, v319);
+    int16x8_t v433 = vaddq_s16(v430, v432);
+    int16x8_t v434 = vqrdmulhq_n_s16(v433, 17121);
+    int16x8_t v435 = vsubq_s16(v283, v286);
+    int16x8_t v436 = vqrdmulhq_n_s16(v290, 13573);
+    int16x8_t v437 = vsubq_s16(v436, v295);
+    int16x8_t v438_tmp = vqrdmulhq_n_s16(v437, 10045);
+    int16x8_t v438 = vaddq_s16(v438_tmp, v437);
+    int16x8_t v439 = vaddq_s16(v435, v438);
+    int16x8_t v440 = vqrdmulhq_n_s16(v439, 17121);
+    int16x8_t v441 = vaddq_s16(v434, v440);
+    int16x8_t v442 = vqrdmulhq_n_s16(v441, 16563);
+    int16x8_t v443 = vaddq_s16(v428, v442);
+    int16x8_t v444 = vaddq_s16(v414, v443);
+    int16x8_t v445 = vqrdmulhq_n_s16(v444, 16429);
+    int16x8_t v446 = vaddq_s16(v386, v445);
+    int16x8_t v447 = vsubq_s16(v374, v376);
+    int16x8_t v448 = vsubq_s16(v378, v381);
+    int16x8_t v449 = vqrdmulhq_n_s16(v448, 29490);
+    int16x8_t v450 = vaddq_s16(v447, v449);
+    int16x8_t v451 = vsubq_s16(v361, v364);
+    int16x8_t v452 = vqrdmulhq_n_s16(v366, 29490);
+    int16x8_t v453_tmp = vqrdmulhq_n_s16(v369, 5763);
+    int16x8_t v453 = vaddq_s16(v453_tmp, v369);
+    int16x8_t v454 = vsubq_s16(v452, v453);
+    int16x8_t v455 = vaddq_s16(v451, v454);
+    int16x8_t v456 = vqrdmulhq_n_s16(v455, 18578);
+    int16x8_t v457 = vaddq_s16(v450, v456);
+    int16x8_t v458 = vsubq_s16(v338, v341);
+    int16x8_t v459 = vqrdmulhq_n_s16(v332, 29490);
+    int16x8_t v460_tmp = vqrdmulhq_n_s16(v335, 5763);
+    int16x8_t v460 = vaddq_s16(v460_tmp, v335);
+    int16x8_t v461 = vsubq_s16(v459, v460);
+    int16x8_t v462 = vaddq_s16(v458, v461);
+    int16x8_t v463 = vqrdmulhq_n_s16(v352, 27803);
+    int16x8_t v464 = vqrdmulhq_n_s16(v354, 21845);
+    int16x8_t v465 = vsubq_s16(v463, v464);
+    int16x8_t v466 = vsubq_s16(v344, v347);
+    int16x8_t v467 = vqrdmulhq_n_s16(v466, 18578);
+    int16x8_t v468 = vaddq_s16(v465, v467);
+    int16x8_t v469 = vaddq_s16(v462, v468);
+    int16x8_t v470 = vqrdmulhq_n_s16(v469, 16890);
+    int16x8_t v471 = vaddq_s16(v457, v470);
+    int16x8_t v472 = vsubq_s16(v415, v418);
+    int16x8_t v473_tmp = vqrdmulhq_n_s16(v422, 16273);
+    int16x8_t v473 = vaddq_s16(v473_tmp, v422);
+    int16x8_t v474_tmp = vqrdmulhq_n_s16(v424, 5763);
+    int16x8_t v474 = vaddq_s16(v474_tmp, v424);
+    int16x8_t v475 = vsubq_s16(v473, v474);
+    int16x8_t v476 = vaddq_s16(v472, v475);
+    int16x8_t v477 = vqrdmulhq_n_s16(v476, 16890);
+    int16x8_t v478 = vqrdmulhq_n_s16(v435, 20261);
+    int16x8_t v479 = vqrdmulhq_n_s16(v437, 26472);
+    int16x8_t v480 = vsubq_s16(v478, v479);
+    int16x8_t v481 = vqrdmulhq_n_s16(v480, 30046);
+    int16x8_t v482 = vqrdmulhq_n_s16(v430, 30322);
+    int16x8_t v483 = vqrdmulhq_n_s16(v432, 30322);
+    int16x8_t v484 = vsubq_s16(v482, v483);
+    int16x8_t v485 = vqrdmulhq_n_s16(v484, 30046);
+    int16x8_t v486 = vaddq_s16(v481, v485);
+    int16x8_t v487 = vqrdmulhq_n_s16(v486, 16890);
+    int16x8_t v488 = vaddq_s16(v477, v487);
+    int16x8_t v489 = vsubq_s16(v387, v390);
+    int16x8_t v490 = vqrdmulhq_n_s16(v392, 29490);
+    int16x8_t v491_tmp = vqrdmulhq_n_s16(v395, 5763);
+    int16x8_t v491 = vaddq_s16(v491_tmp, v395);
+    int16x8_t v492 = vsubq_s16(v490, v491);
+    int16x8_t v493 = vaddq_s16(v489, v492);
+    int16x8_t v494 = vsubq_s16(v399, v402);
+    int16x8_t v495 = vqrdmulhq_n_s16(v494, 18578);
+    int16x8_t v496 = vqrdmulhq_n_s16(v407, 27803);
+    int16x8_t v497 = vqrdmulhq_n_s16(v409, 21845);
+    int16x8_t v498 = vsubq_s16(v496, v497);
+    int16x8_t v499 = vaddq_s16(v495, v498);
+    int16x8_t v500 = vaddq_s16(v493, v499);
+    int16x8_t v501 = vaddq_s16(v488, v500);
+    int16x8_t v502 = vqrdmulhq_n_s16(v501, 16508);
+    int16x8_t v503 = vaddq_s16(v471, v502);
+    int16x8_t v504 = vsubq_s16(v2, v8);
+    int16x8_t v505 = vsubq_s16(v15, v22);
+    int16x8_t v506_tmp = vqrdmulhq_n_s16(v505, 18446);
+    int16x8_t v506 = vmlaq_n_s16(v506_tmp, v505, 2);
+    int16x8_t v507 = vaddq_s16(v504, v506);
+    int16x8_t v508 = vsubq_s16(v31, v41);
+    int16x8_t v509 = vsubq_s16(v48, v56);
+    int16x8_t v510_tmp = vqrdmulhq_n_s16(v509, 18446);
+    int16x8_t v510 = vmlaq_n_s16(v510_tmp, v509, 2);
+    int16x8_t v511 = vaddq_s16(v508, v510);
+    int16x8_t v512 = vqrdmulhq_n_s16(v511, 21195);
+    int16x8_t v513 = vaddq_s16(v507, v512);
+    int16x8_t v514 = vsubq_s16(v67, v77);
+    int16x8_t v515 = vsubq_s16(v90, v99);
+    int16x8_t v516_tmp = vqrdmulhq_n_s16(v515, 18446);
+    int16x8_t v516 = vmlaq_n_s16(v516_tmp, v515, 2);
+    int16x8_t v517 = vaddq_s16(v514, v516);
+    int16x8_t v518 = vsubq_s16(v114, v126);
+    int16x8_t v519_tmp = vqrdmulhq_n_s16(v518, 18446);
+    int16x8_t v519 = vmlaq_n_s16(v519_tmp, v518, 2);
+    int16x8_t v520 = vsubq_s16(v132, v137);
+    int16x8_t v521 = vaddq_s16(v519, v520);
+    int16x8_t v522 = vqrdmulhq_n_s16(v521, 21195);
+    int16x8_t v523 = vaddq_s16(v517, v522);
+    int16x8_t v524 = vqrdmulhq_n_s16(v523, 17401);
+    int16x8_t v525 = vaddq_s16(v513, v524);
+    int16x8_t v526 = vsubq_s16(v172, v181);
+    int16x8_t v527_tmp = vqrdmulhq_n_s16(v526, 18446);
+    int16x8_t v527 = vmlaq_n_s16(v527_tmp, v526, 2);
+    int16x8_t v528 = vsubq_s16(v149, v159);
+    int16x8_t v529 = vaddq_s16(v527, v528);
+    int16x8_t v530 = vsubq_s16(v229, v234);
+    int16x8_t v531 = vsubq_s16(v208, v223);
+    int16x8_t v532_tmp = vqrdmulhq_n_s16(v531, 18446);
+    int16x8_t v532 = vmlaq_n_s16(v532_tmp, v531, 2);
+    int16x8_t v533 = vaddq_s16(v530, v532);
+    int16x8_t v534 = vqrdmulhq_n_s16(v533, 21195);
+    int16x8_t v535 = vaddq_s16(v529, v534);
+    int16x8_t v536 = vsubq_s16(v244, v253);
+    int16x8_t v537 = vsubq_s16(v266, v277);
+    int16x8_t v538_tmp = vqrdmulhq_n_s16(v537, 18446);
+    int16x8_t v538 = vmlaq_n_s16(v538_tmp, v537, 2);
+    int16x8_t v539 = vaddq_s16(v536, v538);
+    int16x8_t v540 = vqrdmulhq_n_s16(v539, 17401);
+    int16x8_t v541 = vqrdmulhq_n_s16(v287, 25826);
+    int16x8_t v542 = vqrdmulhq_n_s16(v298, 25826);
+    int16x8_t v543 = vsubq_s16(v541, v542);
+    int16x8_t v544 = vqrdmulhq_n_s16(v543, 14281);
+    int16x8_t v545_tmp = vqrdmulhq_n_s16(v309, 31509);
+    int16x8_t v545 = vaddq_s16(v545_tmp, v309);
+    int16x8_t v546 = vsubq_s16(v545, v322);
+    int16x8_t v547 = vqrdmulhq_n_s16(v546, 28847);
+    int16x8_t v548 = vaddq_s16(v544, v547);
+    int16x8_t v549 = vaddq_s16(v540, v548);
+    int16x8_t v550 = vaddq_s16(v535, v549);
+    int16x8_t v551 = vqrdmulhq_n_s16(v550, 16629);
+    int16x8_t v552 = vaddq_s16(v525, v551);
+    int16x8_t v553 = vsubq_s16(v504, v506);
+    int16x8_t v554 = vsubq_s16(v508, v510);
+    int16x8_t v555 = vqrdmulhq_n_s16(v554, 25826);
+    int16x8_t v556 = vaddq_s16(v553, v555);
+    int16x8_t v557 = vsubq_s16(v514, v516);
+    int16x8_t v558 = vsubq_s16(v520, v519);
+    int16x8_t v559 = vqrdmulhq_n_s16(v558, 25826);
+    int16x8_t v560 = vaddq_s16(v557, v559);
+    int16x8_t v561 = vqrdmulhq_n_s16(v560, 18124);
+    int16x8_t v562 = vaddq_s16(v556, v561);
+    int16x8_t v563 = vsubq_s16(v528, v527);
+    int16x8_t v564 = vsubq_s16(v530, v532);
+    int16x8_t v565 = vqrdmulhq_n_s16(v564, 25826);
+    int16x8_t v566 = vaddq_s16(v563, v565);
+    int16x8_t v567 = vsubq_s16(v536, v538);
+    int16x8_t v568 = vqrdmulhq_n_s16(v567, 18124);
+    int16x8_t v569_tmp = vqrdmulhq_n_s16(v546, 654);
+    int16x8_t v569 = vmlaq_n_s16(v569_tmp, v546, 2);
+    int16x8_t v570 = vsubq_s16(v543, v569);
+    int16x8_t v571 = vqrdmulhq_n_s16(v570, 18124);
+    int16x8_t v572 = vaddq_s16(v568, v571);
+    int16x8_t v573 = vaddq_s16(v566, v572);
+    int16x8_t v574 = vqrdmulhq_n_s16(v573, 16792);
+    int16x8_t v575 = vaddq_s16(v562, v574);
+    int16x8_t v576 = vsubq_s16(v458, v461);
+    int16x8_t v577_tmp = vqrdmulhq_n_s16(v465, 25030);
+    int16x8_t v577 = vaddq_s16(v577_tmp, v465);
+    int16x8_t v578 = vsubq_s16(v466, v577);
+    int16x8_t v579_tmp = vqrdmulhq_n_s16(v578, 1988);
+    int16x8_t v579 = vaddq_s16(v579_tmp, v578);
+    int16x8_t v580 = vaddq_s16(v576, v579);
+    int16x8_t v581 = vqrdmulhq_n_s16(v580, 19102);
+    int16x8_t v582 = vsubq_s16(v447, v449);
+    int16x8_t v583 = vsubq_s16(v451, v454);
+    int16x8_t v584_tmp = vqrdmulhq_n_s16(v583, 1988);
+    int16x8_t v584 = vaddq_s16(v584_tmp, v583);
+    int16x8_t v585 = vaddq_s16(v582, v584);
+    int16x8_t v586 = vaddq_s16(v581, v585);
+    int16x8_t v587 = vsubq_s16(v489, v492);
+    int16x8_t v588_tmp = vqrdmulhq_n_s16(v498, 25030);
+    int16x8_t v588 = vaddq_s16(v588_tmp, v498);
+    int16x8_t v589 = vsubq_s16(v494, v588);
+    int16x8_t v590_tmp = vqrdmulhq_n_s16(v589, 1988);
+    int16x8_t v590 = vaddq_s16(v590_tmp, v589);
+    int16x8_t v591 = vaddq_s16(v587, v590);
+    int16x8_t v592 = vsubq_s16(v472, v475);
+    int16x8_t v593 = vqrdmulhq_n_s16(v592, 19102);
+    int16x8_t v594 = vsubq_s16(v480, v484);
+    int16x8_t v595 = vaddq_s16(v593, v594);
+    int16x8_t v596 = vaddq_s16(v591, v595);
+    int16x8_t v597 = vqrdmulhq_n_s16(v596, 17000);
+    int16x8_t v598 = vaddq_s16(v586, v597);
+    int16x8_t v599 = vsubq_s16(v365, v371);
+    int16x8_t v600_tmp = vqrdmulhq_n_s16(v599, 23673);
+    int16x8_t v600 = vaddq_s16(v600_tmp, v599);
+    int16x8_t v601 = vsubq_s16(v377, v383);
+    int16x8_t v602 = vaddq_s16(v600, v601);
+    int16x8_t v603 = vsubq_s16(v348, v356);
+    int16x8_t v604_tmp = vqrdmulhq_n_s16(v603, 23673);
+    int16x8_t v604 = vaddq_s16(v604_tmp, v603);
+    int16x8_t v605 = vsubq_s16(v342, v337);
+    int16x8_t v606 = vaddq_s16(v604, v605);
+    int16x8_t v607 = vqrdmulhq_n_s16(v606, 20398);
+    int16x8_t v608 = vaddq_s16(v602, v607);
+    int16x8_t v609 = vsubq_s16(v391, v397);
+    int16x8_t v610 = vsubq_s16(v403, v411);
+    int16x8_t v611_tmp = vqrdmulhq_n_s16(v610, 23673);
+    int16x8_t v611 = vaddq_s16(v611_tmp, v610);
+    int16x8_t v612 = vaddq_s16(v609, v611);
+    int16x8_t v613 = vsubq_s16(v419, v426);
+    int16x8_t v614 = vqrdmulhq_n_s16(v613, 20398);
+    int16x8_t v615 = vsubq_s16(v439, v433);
+    int16x8_t v616_tmp = vqrdmulhq_n_s16(v615, 2367);
+    int16x8_t v616 = vaddq_s16(v616_tmp, v615);
+    int16x8_t v617 = vaddq_s16(v614, v616);
+    int16x8_t v618 = vaddq_s16(v612, v617);
+    int16x8_t v619 = vqrdmulhq_n_s16(v618, 17255);
+    int16x8_t v620 = vaddq_s16(v608, v619);
+    int16x8_t v621 = vsubq_s16(v160, v183);
+    int16x8_t v622 = vsubq_s16(v235, v225);
+    int16x8_t v623_tmp = vqrdmulhq_n_s16(v622, 3314);
+    int16x8_t v623 = vmlaq_n_s16(v623_tmp, v622, 5);
+    int16x8_t v624 = vaddq_s16(v621, v623);
+    int16x8_t v625 = vsubq_s16(v254, v279);
+    int16x8_t v626 = vsubq_s16(v299, v324);
+    int16x8_t v627_tmp = vqrdmulhq_n_s16(v626, 3314);
+    int16x8_t v627 = vmlaq_n_s16(v627_tmp, v626, 5);
+    int16x8_t v628 = vaddq_s16(v625, v627);
+    int16x8_t v629 = vqrdmulhq_n_s16(v628, 22112);
+    int16x8_t v630 = vaddq_s16(v624, v629);
+    int16x8_t v631 = vqrdmulhq_n_s16(v630, 17561);
+    int16x8_t v632 = vsubq_s16(v9, v24);
+    int16x8_t v633 = vsubq_s16(v42, v58);
+    int16x8_t v634_tmp = vqrdmulhq_n_s16(v633, 3314);
+    int16x8_t v634 = vmlaq_n_s16(v634_tmp, v633, 5);
+    int16x8_t v635 = vaddq_s16(v632, v634);
+    int16x8_t v636 = vsubq_s16(v78, v101);
+    int16x8_t v637 = vsubq_s16(v138, v128);
+    int16x8_t v638_tmp = vqrdmulhq_n_s16(v637, 3314);
+    int16x8_t v638 = vmlaq_n_s16(v638_tmp, v637, 5);
+    int16x8_t v639 = vaddq_s16(v636, v638);
+    int16x8_t v640 = vqrdmulhq_n_s16(v639, 22112);
+    int16x8_t v641 = vaddq_s16(v635, v640);
+    int16x8_t v642 = vaddq_s16(v631, v641);
+    int16x8_t v643 = vsubq_s16(v632, v634);
+    int16x8_t v644 = vsubq_s16(v636, v638);
+    int16x8_t v645 = vqrdmulhq_n_s16(v644, 24397);
+    int16x8_t v646 = vaddq_s16(v643, v645);
+    int16x8_t v647 = vsubq_s16(v621, v623);
+    int16x8_t v648 = vsubq_s16(v625, v627);
+    int16x8_t v649 = vqrdmulhq_n_s16(v648, 24397);
+    int16x8_t v650 = vaddq_s16(v647, v649);
+    int16x8_t v651 = vqrdmulhq_n_s16(v650, 17921);
+    int16x8_t v652 = vaddq_s16(v646, v651);
+    int16x8_t v653 = vsubq_s16(v601, v600);
+    int16x8_t v654 = vsubq_s16(v605, v604);
+    int16x8_t v655 = vqrdmulhq_n_s16(v654, 27504);
+    int16x8_t v656 = vaddq_s16(v653, v655);
+    int16x8_t v657 = vsubq_s16(v609, v611);
+    int16x8_t v658 = vqrdmulhq_n_s16(v613, 27504);
+    int16x8_t v659_tmp = vqrdmulhq_n_s16(v615, 14606);
+    int16x8_t v659 = vaddq_s16(v659_tmp, v615);
+    int16x8_t v660 = vsubq_s16(v658, v659);
+    int16x8_t v661 = vaddq_s16(v657, v660);
+    int16x8_t v662 = vqrdmulhq_n_s16(v661, 18343);
+    int16x8_t v663 = vaddq_s16(v656, v662);
+    int16x8_t v664 = vsubq_s16(v582, v584);
+    int16x8_t v665 = vsubq_s16(v576, v579);
+    int16x8_t v666 = vqrdmulhq_n_s16(v665, 31869);
+    int16x8_t v667 = vaddq_s16(v664, v666);
+    int16x8_t v668 = vsubq_s16(v587, v590);
+    int16x8_t v669_tmp = vqrdmulhq_n_s16(v594, 23444);
+    int16x8_t v669 = vaddq_s16(v669_tmp, v594);
+    int16x8_t v670 = vsubq_s16(v592, v669);
+    int16x8_t v671 = vqrdmulhq_n_s16(v670, 31869);
+    int16x8_t v672 = vaddq_s16(v668, v671);
+    int16x8_t v673 = vqrdmulhq_n_s16(v672, 18830);
+    int16x8_t v674 = vaddq_s16(v667, v673);
+    int16x8_t v675 = vsubq_s16(v553, v555);
+    int16x8_t v676 = vsubq_s16(v557, v559);
+    int16x8_t v677_tmp = vqrdmulhq_n_s16(v676, 5552);
+    int16x8_t v677 = vaddq_s16(v677_tmp, v676);
+    int16x8_t v678 = vaddq_s16(v675, v677);
+    int16x8_t v679 = vsubq_s16(v563, v565);
+    int16x8_t v680 = vsubq_s16(v567, v570);
+    int16x8_t v681_tmp = vqrdmulhq_n_s16(v680, 5552);
+    int16x8_t v681 = vaddq_s16(v681_tmp, v680);
+    int16x8_t v682 = vaddq_s16(v679, v681);
+    int16x8_t v683 = vqrdmulhq_n_s16(v682, 19393);
+    int16x8_t v684 = vaddq_s16(v678, v683);
+    int16x8_t v685 = vsubq_s16(v507, v512);
+    int16x8_t v686 = vsubq_s16(v517, v522);
+    int16x8_t v687_tmp = vqrdmulhq_n_s16(v686, 15865);
+    int16x8_t v687 = vaddq_s16(v687_tmp, v686);
+    int16x8_t v688 = vaddq_s16(v685, v687);
+    int16x8_t v689 = vsubq_s16(v529, v534);
+    int16x8_t v690_tmp = vqrdmulhq_n_s16(v548, 28937);
+    int16x8_t v690 = vaddq_s16(v690_tmp, v548);
+    int16x8_t v691 = vsubq_s16(v539, v690);
+    int16x8_t v692_tmp = vqrdmulhq_n_s16(v691, 15865);
+    int16x8_t v692 = vaddq_s16(v692_tmp, v691);
+    int16x8_t v693 = vaddq_s16(v689, v692);
+    int16x8_t v694 = vqrdmulhq_n_s16(v693, 20040);
+    int16x8_t v695 = vaddq_s16(v688, v694);
+    int16x8_t v696 = vsubq_s16(v476, v486);
+    int16x8_t v697_tmp = vqrdmulhq_n_s16(v696, 1893);
+    int16x8_t v697 = vmlaq_n_s16(v697_tmp, v696, 2);
+    int16x8_t v698 = vsubq_s16(v493, v499);
+    int16x8_t v699 = vaddq_s16(v697, v698);
+    int16x8_t v700 = vqrdmulhq_n_s16(v699, 20783);
+    int16x8_t v701 = vsubq_s16(v450, v456);
+    int16x8_t v702 = vsubq_s16(v462, v468);
+    int16x8_t v703_tmp = vqrdmulhq_n_s16(v702, 1893);
+    int16x8_t v703 = vmlaq_n_s16(v703_tmp, v702, 2);
+    int16x8_t v704 = vaddq_s16(v701, v703);
+    int16x8_t v705 = vaddq_s16(v700, v704);
+    int16x8_t v706 = vsubq_s16(v384, v373);
+    int16x8_t v707 = vsubq_s16(v343, v358);
+    int16x8_t v708_tmp = vqrdmulhq_n_s16(v707, 13357);
+    int16x8_t v708 = vmlaq_n_s16(v708_tmp, v707, 3);
+    int16x8_t v709 = vaddq_s16(v706, v708);
+    int16x8_t v710 = vsubq_s16(v398, v413);
+    int16x8_t v711 = vsubq_s16(v427, v441);
+    int16x8_t v712_tmp = vqrdmulhq_n_s16(v711, 13357);
+    int16x8_t v712 = vmlaq_n_s16(v712_tmp, v711, 3);
+    int16x8_t v713 = vaddq_s16(v710, v712);
+    int16x8_t v714 = vqrdmulhq_n_s16(v713, 21637);
+    int16x8_t v715 = vaddq_s16(v709, v714);
+    int16x8_t v716 = vsubq_s16(v25, v60);
+    int16x8_t v717 = vsubq_s16(v102, v140);
+    int16x8_t v718_tmp = vqrdmulhq_n_s16(v717, 6226);
+    int16x8_t v718 = vmlaq_n_s16(v718_tmp, v717, 10);
+    int16x8_t v719 = vaddq_s16(v716, v718);
+    int16x8_t v720 = vsubq_s16(v280, v326);
+    int16x8_t v721_tmp = vqrdmulhq_n_s16(v720, 6226);
+    int16x8_t v721 = vmlaq_n_s16(v721_tmp, v720, 10);
+    int16x8_t v722 = vsubq_s16(v184, v237);
+    int16x8_t v723 = vaddq_s16(v721, v722);
+    int16x8_t v724 = vqrdmulhq_n_s16(v723, 22622);
+    int16x8_t v725 = vaddq_s16(v719, v724);
+    int16x8_t v726 = vsubq_s16(v716, v718);
+    int16x8_t v727 = vsubq_s16(v722, v721);
+    int16x8_t v728 = vqrdmulhq_n_s16(v727, 23761);
+    int16x8_t v729 = vaddq_s16(v726, v728);
+    int16x8_t v730 = vsubq_s16(v706, v708);
+    int16x8_t v731 = vsubq_s16(v710, v712);
+    int16x8_t v732 = vqrdmulhq_n_s16(v731, 25084);
+    int16x8_t v733 = vaddq_s16(v730, v732);
+    int16x8_t v734 = vsubq_s16(v701, v703);
+    int16x8_t v735 = vsubq_s16(v698, v697);
+    int16x8_t v736 = vqrdmulhq_n_s16(v735, 26631);
+    int16x8_t v737 = vaddq_s16(v734, v736);
+    int16x8_t v738 = vsubq_s16(v685, v687);
+    int16x8_t v739 = vsubq_s16(v689, v692);
+    int16x8_t v740 = vqrdmulhq_n_s16(v739, 28454);
+    int16x8_t v741 = vaddq_s16(v738, v740);
+    int16x8_t v742 = vsubq_s16(v675, v677);
+    int16x8_t v743 = vsubq_s16(v679, v681);
+    int16x8_t v744 = vqrdmulhq_n_s16(v743, 30624);
+    int16x8_t v745 = vaddq_s16(v742, v744);
+    int16x8_t v746 = vsubq_s16(v664, v666);
+    int16x8_t v747 = vsubq_s16(v668, v671);
+    int16x8_t v748_tmp = vqrdmulhq_n_s16(v747, 472);
+    int16x8_t v748 = vaddq_s16(v748_tmp, v747);
+    int16x8_t v749 = vaddq_s16(v746, v748);
+    int16x8_t v750 = vsubq_s16(v653, v655);
+    int16x8_t v751 = vsubq_s16(v657, v660);
+    int16x8_t v752_tmp = vqrdmulhq_n_s16(v751, 3672);
+    int16x8_t v752 = vaddq_s16(v752_tmp, v751);
+    int16x8_t v753 = vaddq_s16(v750, v752);
+    int16x8_t v754 = vsubq_s16(v643, v645);
+    int16x8_t v755 = vsubq_s16(v647, v649);
+    int16x8_t v756_tmp = vqrdmulhq_n_s16(v755, 7662);
+    int16x8_t v756 = vaddq_s16(v756_tmp, v755);
+    int16x8_t v757 = vaddq_s16(v754, v756);
+    int16x8_t v758 = vsubq_s16(v635, v640);
+    int16x8_t v759 = vsubq_s16(v624, v629);
+    int16x8_t v760_tmp = vqrdmulhq_n_s16(v759, 12756);
+    int16x8_t v760 = vaddq_s16(v760_tmp, v759);
+    int16x8_t v761 = vaddq_s16(v758, v760);
+    int16x8_t v762 = vsubq_s16(v602, v607);
+    int16x8_t v763 = vsubq_s16(v612, v617);
+    int16x8_t v764_tmp = vqrdmulhq_n_s16(v763, 19463);
+    int16x8_t v764 = vaddq_s16(v764_tmp, v763);
+    int16x8_t v765 = vaddq_s16(v762, v764);
+    int16x8_t v766 = vsubq_s16(v585, v581);
+    int16x8_t v767 = vsubq_s16(v591, v595);
+    int16x8_t v768_tmp = vqrdmulhq_n_s16(v767, 28661);
+    int16x8_t v768 = vaddq_s16(v768_tmp, v767);
+    int16x8_t v769 = vaddq_s16(v766, v768);
+    int16x8_t v770 = vsubq_s16(v556, v561);
+    int16x8_t v771 = vsubq_s16(v566, v572);
+    int16x8_t v772_tmp = vqrdmulhq_n_s16(v771, 9242);
+    int16x8_t v772 = vmlaq_n_s16(v772_tmp, v771, 2);
+    int16x8_t v773 = vaddq_s16(v770, v772);
+    int16x8_t v774 = vsubq_s16(v513, v524);
+    int16x8_t v775 = vsubq_s16(v535, v549);
+    int16x8_t v776_tmp = vqrdmulhq_n_s16(v775, 30298);
+    int16x8_t v776 = vmlaq_n_s16(v776_tmp, v775, 2);
+    int16x8_t v777 = vaddq_s16(v774, v776);
+    int16x8_t v778 = vsubq_s16(v457, v470);
+    int16x8_t v779 = vsubq_s16(v500, v488);
+    int16x8_t v780_tmp = vqrdmulhq_n_s16(v779, 2773);
+    int16x8_t v780 = vmlaq_n_s16(v780_tmp, v779, 4);
+    int16x8_t v781 = vaddq_s16(v778, v780);
+    int16x8_t v782 = vsubq_s16(v385, v360);
+    int16x8_t v783 = vsubq_s16(v414, v443);
+    int16x8_t v784_tmp = vqrdmulhq_n_s16(v783, 26108);
+    int16x8_t v784 = vmlaq_n_s16(v784_tmp, v783, 6);
+    int16x8_t v785 = vaddq_s16(v782, v784);
+    int16x8_t v786 = vsubq_s16(v61, v142);
+    int16x8_t v787 = vsubq_s16(v238, v328);
+    int16x8_t v788_tmp = vqrdmulhq_n_s16(v787, 12251);
+    int16x8_t v788 = vmlaq_n_s16(v788_tmp, v787, 20);
+    int16x8_t v789 = vaddq_s16(v786, v788);
+    int16x8_t v790 = vsubq_s16(v786, v788);
+    int16x8_t v791 = vsubq_s16(v782, v784);
+    int16x8_t v792 = vsubq_s16(v778, v780);
+    int16x8_t v793 = vsubq_s16(v774, v776);
+    int16x8_t v794 = vsubq_s16(v770, v772);
+    int16x8_t v795 = vsubq_s16(v766, v768);
+    int16x8_t v796 = vsubq_s16(v762, v764);
+    int16x8_t v797 = vsubq_s16(v758, v760);
+    int16x8_t v798 = vsubq_s16(v754, v756);
+    int16x8_t v799 = vsubq_s16(v750, v752);
+    int16x8_t v800 = vsubq_s16(v746, v748);
+    int16x8_t v801 = vsubq_s16(v742, v744);
+    int16x8_t v802 = vsubq_s16(v738, v740);
+    int16x8_t v803 = vsubq_s16(v734, v736);
+    int16x8_t v804 = vsubq_s16(v730, v732);
+    int16x8_t v805 = vsubq_s16(v726, v728);
+    int16x8_t v806 = vsubq_s16(v719, v724);
+    int16x8_t v807 = vsubq_s16(v709, v714);
+    int16x8_t v808 = vsubq_s16(v704, v700);
+    int16x8_t v809 = vsubq_s16(v688, v694);
+    int16x8_t v810 = vsubq_s16(v678, v683);
+    int16x8_t v811 = vsubq_s16(v667, v673);
+    int16x8_t v812 = vsubq_s16(v656, v662);
+    int16x8_t v813 = vsubq_s16(v646, v651);
+    int16x8_t v814 = vsubq_s16(v641, v631);
+    int16x8_t v815 = vsubq_s16(v608, v619);
+    int16x8_t v816 = vsubq_s16(v586, v597);
+    int16x8_t v817 = vsubq_s16(v562, v574);
+    int16x8_t v818 = vsubq_s16(v525, v551);
+    int16x8_t v819 = vsubq_s16(v471, v502);
+    int16x8_t v820 = vsubq_s16(v386, v445);
+    int16x8_t v821 = vsubq_s16(v143, v330);
+    vst1q_s16(out + out_stride * 0 + i, v331);
+    vst1q_s16(out + out_stride * 1 + i, v446);
+    vst1q_s16(out + out_stride * 2 + i, v503);
+    vst1q_s16(out + out_stride * 3 + i, v552);
+    vst1q_s16(out + out_stride * 4 + i, v575);
+    vst1q_s16(out + out_stride * 5 + i, v598);
+    vst1q_s16(out + out_stride * 6 + i, v620);
+    vst1q_s16(out + out_stride * 7 + i, v642);
+    vst1q_s16(out + out_stride * 8 + i, v652);
+    vst1q_s16(out + out_stride * 9 + i, v663);
+    vst1q_s16(out + out_stride * 10 + i, v674);
+    vst1q_s16(out + out_stride * 11 + i, v684);
+    vst1q_s16(out + out_stride * 12 + i, v695);
+    vst1q_s16(out + out_stride * 13 + i, v705);
+    vst1q_s16(out + out_stride * 14 + i, v715);
+    vst1q_s16(out + out_stride * 15 + i, v725);
+    vst1q_s16(out + out_stride * 16 + i, v729);
+    vst1q_s16(out + out_stride * 17 + i, v733);
+    vst1q_s16(out + out_stride * 18 + i, v737);
+    vst1q_s16(out + out_stride * 19 + i, v741);
+    vst1q_s16(out + out_stride * 20 + i, v745);
+    vst1q_s16(out + out_stride * 21 + i, v749);
+    vst1q_s16(out + out_stride * 22 + i, v753);
+    vst1q_s16(out + out_stride * 23 + i, v757);
+    vst1q_s16(out + out_stride * 24 + i, v761);
+    vst1q_s16(out + out_stride * 25 + i, v765);
+    vst1q_s16(out + out_stride * 26 + i, v769);
+    vst1q_s16(out + out_stride * 27 + i, v773);
+    vst1q_s16(out + out_stride * 28 + i, v777);
+    vst1q_s16(out + out_stride * 29 + i, v781);
+    vst1q_s16(out + out_stride * 30 + i, v785);
+    vst1q_s16(out + out_stride * 31 + i, v789);
+    vst1q_s16(out + out_stride * 32 + i, v790);
+    vst1q_s16(out + out_stride * 33 + i, v791);
+    vst1q_s16(out + out_stride * 34 + i, v792);
+    vst1q_s16(out + out_stride * 35 + i, v793);
+    vst1q_s16(out + out_stride * 36 + i, v794);
+    vst1q_s16(out + out_stride * 37 + i, v795);
+    vst1q_s16(out + out_stride * 38 + i, v796);
+    vst1q_s16(out + out_stride * 39 + i, v797);
+    vst1q_s16(out + out_stride * 40 + i, v798);
+    vst1q_s16(out + out_stride * 41 + i, v799);
+    vst1q_s16(out + out_stride * 42 + i, v800);
+    vst1q_s16(out + out_stride * 43 + i, v801);
+    vst1q_s16(out + out_stride * 44 + i, v802);
+    vst1q_s16(out + out_stride * 45 + i, v803);
+    vst1q_s16(out + out_stride * 46 + i, v804);
+    vst1q_s16(out + out_stride * 47 + i, v805);
+    vst1q_s16(out + out_stride * 48 + i, v806);
+    vst1q_s16(out + out_stride * 49 + i, v807);
+    vst1q_s16(out + out_stride * 50 + i, v808);
+    vst1q_s16(out + out_stride * 51 + i, v809);
+    vst1q_s16(out + out_stride * 52 + i, v810);
+    vst1q_s16(out + out_stride * 53 + i, v811);
+    vst1q_s16(out + out_stride * 54 + i, v812);
+    vst1q_s16(out + out_stride * 55 + i, v813);
+    vst1q_s16(out + out_stride * 56 + i, v814);
+    vst1q_s16(out + out_stride * 57 + i, v815);
+    vst1q_s16(out + out_stride * 58 + i, v816);
+    vst1q_s16(out + out_stride * 59 + i, v817);
+    vst1q_s16(out + out_stride * 60 + i, v818);
+    vst1q_s16(out + out_stride * 61 + i, v819);
+    vst1q_s16(out + out_stride * 62 + i, v820);
+    vst1q_s16(out + out_stride * 63 + i, v821);
+  }
+}
diff --git a/third_party/jpeg-xl/lib/jxl/fast_dct8-inl.h b/third_party/jpeg-xl/lib/jxl/fast_dct8-inl.h
new file mode 100644
index 0000000000..946ace4a0c
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/fast_dct8-inl.h
@@ -0,0 +1,80 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+/* This file is automatically generated. Do not modify it directly. */
+#if HWY_TARGET != HWY_NEON
+#error "only include this file from fast_dct-inl.h"
+#endif
+
+constexpr size_t FastIDCTIntegerBits(FastDCTTag<8>) { return 1; }
+
+void FastIDCT(FastDCTTag<8>, const int16_t* in, size_t in_stride, int16_t* out,
+              size_t out_stride, size_t count) {
+  JXL_ASSERT(count % 8 == 0);
+  for (size_t i = 0; i < count; i += 8) {
+    int16x8_t v0 = vld1q_s16(in + in_stride * 0 + i);
+    int16x8_t v1 = vld1q_s16(in + in_stride * 4 + i);
+    int16x8_t v2 = vaddq_s16(v0, v1);
+    int16x8_t v3 = vld1q_s16(in + in_stride * 2 + i);
+    int16x8_t v4_tmp = vqrdmulhq_n_s16(v3, 13573);
+    int16x8_t v4 = vaddq_s16(v4_tmp, v3);
+    int16x8_t v5 = vld1q_s16(in + in_stride * 6 + i);
+    int16x8_t v6 = vaddq_s16(v5, v3);
+    int16x8_t v7 = vaddq_s16(v4, v6);
+    int16x8_t v8 = vqrdmulhq_n_s16(v7, 17734);
+    int16x8_t v9 = vaddq_s16(v2, v8);
+    int16x8_t v10 = vld1q_s16(in + in_stride * 1 + i);
+    int16x8_t v11_tmp = vqrdmulhq_n_s16(v10, 13573);
+    int16x8_t v11 = vaddq_s16(v11_tmp, v10);
+    int16x8_t v12 = vld1q_s16(in + in_stride * 5 + i);
+    int16x8_t v13 = vld1q_s16(in + in_stride * 3 + i);
+    int16x8_t v14 = vaddq_s16(v12, v13);
+    int16x8_t v15 = vaddq_s16(v11, v14);
+    int16x8_t v16 = vaddq_s16(v13, v10);
+    int16x8_t v17 = vqrdmulhq_n_s16(v16, 25080);
+    int16x8_t v18 = vld1q_s16(in + in_stride * 7 + i);
+    int16x8_t v19 = vaddq_s16(v18, v12);
+    int16x8_t v20 = vaddq_s16(v16, v19);
+    int16x8_t v21 = vqrdmulhq_n_s16(v20, 17734);
+    int16x8_t v22 = vaddq_s16(v17, v21);
+    int16x8_t v23 = vaddq_s16(v15, v22);
+    int16x8_t v24 = vqrdmulhq_n_s16(v23, 16705);
+    int16x8_t v25 = vaddq_s16(v9, v24);
+    int16x8_t v26 = vsubq_s16(v0, v1);
+    int16x8_t v27 = vsubq_s16(v4, v6);
+    int16x8_t v28_tmp = vqrdmulhq_n_s16(v27, 10045);
+    int16x8_t v28 = vaddq_s16(v28_tmp, v27);
+    int16x8_t v29 = vaddq_s16(v26, v28);
+    int16x8_t v30 = vsubq_s16(v11, v14);
+    int16x8_t v31 = vqrdmulhq_n_s16(v16, 17734);
+    int16x8_t v32_tmp = vqrdmulhq_n_s16(v19, 10045);
+    int16x8_t v32 = vaddq_s16(v32_tmp, v19);
+    int16x8_t v33 = vsubq_s16(v31, v32);
+    int16x8_t v34 = vaddq_s16(v30, v33);
+    int16x8_t v35 = vqrdmulhq_n_s16(v34, 19705);
+    int16x8_t v36 = vaddq_s16(v29, v35);
+    int16x8_t v37 = vsubq_s16(v26, v28);
+    int16x8_t v38 = vsubq_s16(v30, v33);
+    int16x8_t v39 = vqrdmulhq_n_s16(v38, 29490);
+    int16x8_t v40 = vaddq_s16(v37, v39);
+    int16x8_t v41 = vsubq_s16(v2, v8);
+    int16x8_t v42 = vsubq_s16(v15, v22);
+    int16x8_t v43_tmp = vqrdmulhq_n_s16(v42, 18446);
+    int16x8_t v43 = vmlaq_n_s16(v43_tmp, v42, 2);
+    int16x8_t v44 = vaddq_s16(v41, v43);
+    int16x8_t v45 = vsubq_s16(v41, v43);
+    int16x8_t v46 = vsubq_s16(v37, v39);
+    int16x8_t v47 = vsubq_s16(v29, v35);
+    int16x8_t v48 = vsubq_s16(v9, v24);
+    vst1q_s16(out + out_stride * 0 + i, v25);
+    vst1q_s16(out + out_stride * 1 + i, v36);
+    vst1q_s16(out + out_stride * 2 + i, v40);
+    vst1q_s16(out + out_stride * 3 + i, v44);
+    vst1q_s16(out + out_stride * 4 + i, v45);
+    vst1q_s16(out + out_stride * 5 + i, v46);
+    vst1q_s16(out + out_stride * 6 + i, v47);
+    vst1q_s16(out + out_stride * 7 + i, v48);
+  }
+}
diff --git a/third_party/jpeg-xl/lib/jxl/fast_dct_test.cc b/third_party/jpeg-xl/lib/jxl/fast_dct_test.cc
new file mode 100644
index 0000000000..5bb1a79cc5
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/fast_dct_test.cc
@@ -0,0 +1,378 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <stdio.h>
+
+#include <numeric>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/fast_dct_test.cc"
+#include <hwy/foreach_target.h>
+
+#include "lib/jxl/base/random.h"
+#include "lib/jxl/dct-inl.h"
+#include "lib/jxl/fast_dct-inl.h"
+#include "lib/jxl/fast_dct.h"
+#include "lib/jxl/transpose-inl.h"
+
+// Test utils
+#include <hwy/highway.h>
+#include <hwy/tests/test_util-inl.h>
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+namespace {
+
+template <size_t N, size_t M>
+HWY_NOINLINE void TestFastTranspose() {
+#if HWY_TARGET == HWY_NEON
+  auto array_mem = hwy::AllocateAligned<int16_t>(N * M);
+  int16_t* array = array_mem.get();
+  auto transposed_mem = hwy::AllocateAligned<int16_t>(N * M);
+  int16_t* transposed = transposed_mem.get();
+  std::iota(array, array + N * M, 0);
+  for (size_t j = 0; j < 100000000 / (N * M); j++) {
+    FastTransposeBlock(array, M, N, M, transposed, N);
+  }
+  for (size_t i = 0; i < M; i++) {
+    for (size_t j = 0; j < N; j++) {
+      EXPECT_EQ(array[j * M + i], transposed[i * N + j]);
+    }
+  }
+#endif
+}
+
+template <size_t N, size_t M>
+HWY_NOINLINE void TestFloatTranspose() {
+  auto array_mem = hwy::AllocateAligned<float>(N * M);
+  float* array = array_mem.get();
+  auto transposed_mem = hwy::AllocateAligned<float>(N * M);
+  float* transposed = transposed_mem.get();
+  std::iota(array, array + N * M, 0);
+  for (size_t j = 0; j < 100000000 / (N * M); j++) {
+    Transpose<N, M>::Run(DCTFrom(array, M), DCTTo(transposed, N));
+  }
+  for (size_t i = 0; i < M; i++) {
+    for (size_t j = 0; j < N; j++) {
+      EXPECT_EQ(array[j * M + i], transposed[i * N + j]);
+    }
+  }
+}
+
+// TODO(sboukortt): re-enable the FloatIDCT tests once we find out why they fail
+// in ASAN mode in the CI runners and seemingly not locally.
+
+HWY_NOINLINE void TestFastTranspose8x8() { TestFastTranspose<8, 8>(); }
+HWY_NOINLINE void TestFloatTranspose8x8() { TestFloatTranspose<8, 8>(); }
+HWY_NOINLINE void TestFastIDCT8x8() { TestFastIDCT<8, 8>(); }
+HWY_NOINLINE void TestFloatIDCT8x8() {
+#if HWY_TARGET == HWY_SCALAR && \
+    (defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER))
+  GTEST_SKIP();
+#else
+  TestFloatIDCT<8, 8>();
+#endif
+}
+HWY_NOINLINE void TestFastTranspose8x16() { TestFastTranspose<8, 16>(); }
+HWY_NOINLINE void TestFloatTranspose8x16() { TestFloatTranspose<8, 16>(); }
+HWY_NOINLINE void TestFastIDCT8x16() { TestFastIDCT<8, 16>(); }
+HWY_NOINLINE void TestFloatIDCT8x16() {
+#if HWY_TARGET == HWY_SCALAR && \
+    (defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER))
+  GTEST_SKIP();
+#else
+  TestFloatIDCT<8, 16>();
+#endif
+}
+HWY_NOINLINE void TestFastTranspose8x32() { TestFastTranspose<8, 32>(); }
+HWY_NOINLINE void TestFloatTranspose8x32() { TestFloatTranspose<8, 32>(); }
+HWY_NOINLINE void TestFastIDCT8x32() { TestFastIDCT<8, 32>(); }
+HWY_NOINLINE void TestFloatIDCT8x32() {
+#if HWY_TARGET == HWY_SCALAR && \
+    (defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER))
+  GTEST_SKIP();
+#else
+  TestFloatIDCT<8, 32>();
+#endif
+}
+HWY_NOINLINE void TestFastTranspose16x8() { TestFastTranspose<16, 8>(); }
+HWY_NOINLINE void TestFloatTranspose16x8() { TestFloatTranspose<16, 8>(); }
+HWY_NOINLINE void TestFastIDCT16x8() { TestFastIDCT<16, 8>(); }
+HWY_NOINLINE void TestFloatIDCT16x8() {
+#if HWY_TARGET == HWY_SCALAR && \
+    (defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER))
+  GTEST_SKIP();
+#else
+  TestFloatIDCT<16, 8>();
+#endif
+}
+HWY_NOINLINE void TestFastTranspose16x16() { TestFastTranspose<16, 16>(); }
+HWY_NOINLINE void TestFloatTranspose16x16() { TestFloatTranspose<16, 16>(); }
+HWY_NOINLINE void TestFastIDCT16x16() { TestFastIDCT<16, 16>(); }
+HWY_NOINLINE void TestFloatIDCT16x16() {
+#if HWY_TARGET == HWY_SCALAR && \
+    (defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER))
+  GTEST_SKIP();
+#else
+  TestFloatIDCT<16, 16>();
+#endif
+}
+HWY_NOINLINE void TestFastTranspose16x32() { TestFastTranspose<16, 32>(); }
+HWY_NOINLINE void TestFloatTranspose16x32() { TestFloatTranspose<16, 32>(); }
+HWY_NOINLINE void TestFastIDCT16x32() { TestFastIDCT<16, 32>(); }
+HWY_NOINLINE void TestFloatIDCT16x32() {
+#if HWY_TARGET == HWY_SCALAR && \
+    (defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER))
+  GTEST_SKIP();
+#else
+  TestFloatIDCT<16, 32>();
+#endif
+}
+HWY_NOINLINE void TestFastTranspose32x8() { TestFastTranspose<32, 8>(); }
+HWY_NOINLINE void TestFloatTranspose32x8() { TestFloatTranspose<32, 8>(); }
+HWY_NOINLINE void TestFastIDCT32x8() { TestFastIDCT<32, 8>(); }
+HWY_NOINLINE void TestFloatIDCT32x8() {
+#if HWY_TARGET == HWY_SCALAR && \
+    (defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER))
+  GTEST_SKIP();
+#else
+  TestFloatIDCT<32, 8>();
+#endif
+}
+HWY_NOINLINE void TestFastTranspose32x16() { TestFastTranspose<32, 16>(); }
+HWY_NOINLINE void TestFloatTranspose32x16() { TestFloatTranspose<32, 16>(); }
+HWY_NOINLINE void TestFastIDCT32x16() { TestFastIDCT<32, 16>(); }
+HWY_NOINLINE void TestFloatIDCT32x16() {
+#if HWY_TARGET == HWY_SCALAR && \
+    (defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER))
+  GTEST_SKIP();
+#else
+  TestFloatIDCT<32, 16>();
+#endif
+}
+HWY_NOINLINE void TestFastTranspose32x32() { TestFastTranspose<32, 32>(); }
+HWY_NOINLINE void TestFloatTranspose32x32() { TestFloatTranspose<32, 32>(); }
+HWY_NOINLINE void TestFastIDCT32x32() { TestFastIDCT<32, 32>(); }
+HWY_NOINLINE void TestFloatIDCT32x32() {
+#if HWY_TARGET == HWY_SCALAR && \
+    (defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER))
+  GTEST_SKIP();
+#else
+  TestFloatIDCT<32, 32>();
+#endif
+}
+HWY_NOINLINE void TestFastTranspose32x64() { TestFastTranspose<32, 64>(); }
+HWY_NOINLINE void TestFloatTranspose32x64() { TestFloatTranspose<32, 64>(); }
+HWY_NOINLINE void TestFastIDCT32x64() { TestFastIDCT<32, 64>(); }
+HWY_NOINLINE void TestFloatIDCT32x64() {
+#if HWY_TARGET == HWY_SCALAR && \
+    (defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER))
+  GTEST_SKIP();
+#else
+  TestFloatIDCT<32, 64>();
+#endif
+}
+HWY_NOINLINE void TestFastTranspose64x32() { TestFastTranspose<64, 32>(); }
+HWY_NOINLINE void TestFloatTranspose64x32() { TestFloatTranspose<64, 32>(); }
+HWY_NOINLINE void TestFastIDCT64x32() { TestFastIDCT<64, 32>(); }
+HWY_NOINLINE void TestFloatIDCT64x32() {
+#if HWY_TARGET == HWY_SCALAR && \
+    (defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER))
+  GTEST_SKIP();
+#else
+  TestFloatIDCT<64, 32>();
+#endif
+}
+HWY_NOINLINE void TestFastTranspose64x64() { TestFastTranspose<64, 64>(); }
+HWY_NOINLINE void TestFloatTranspose64x64() { TestFloatTranspose<64, 64>(); }
+HWY_NOINLINE void TestFastIDCT64x64() { TestFastIDCT<64, 64>(); }
+HWY_NOINLINE void TestFloatIDCT64x64() {
+#if HWY_TARGET == HWY_SCALAR && \
+    (defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER))
+  GTEST_SKIP();
+#else
+  TestFloatIDCT<64, 64>();
+#endif
+}
+HWY_NOINLINE void TestFastTranspose64x128() { TestFastTranspose<64, 128>(); }
+HWY_NOINLINE void TestFloatTranspose64x128() { TestFloatTranspose<64, 128>(); }
+/*
+HWY_NOINLINE void TestFastIDCT64x128() { TestFastIDCT<64, 128>(); }
+HWY_NOINLINE void TestFloatIDCT64x128() {
+#if HWY_TARGET == HWY_SCALAR && \
+    (defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER))
+  GTEST_SKIP();
+#else
+  TestFloatIDCT<64, 128>();
+#endif
+}
+*/
+HWY_NOINLINE void TestFastTranspose128x64() { TestFastTranspose<128, 64>(); }
+HWY_NOINLINE void TestFloatTranspose128x64() { TestFloatTranspose<128, 64>(); }
+/*
+HWY_NOINLINE void TestFastIDCT128x64() { TestFastIDCT<128, 64>(); }
+HWY_NOINLINE void TestFloatIDCT128x64() {
+#if HWY_TARGET == HWY_SCALAR && \
+    (defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER))
+  GTEST_SKIP();
+#else
+  TestFloatIDCT<128, 64>();
+#endif
+}
+*/
+HWY_NOINLINE void TestFastTranspose128x128() { TestFastTranspose<128, 128>(); }
+HWY_NOINLINE void TestFloatTranspose128x128() {
+  TestFloatTranspose<128, 128>();
+}
+/*
+HWY_NOINLINE void TestFastIDCT128x128() { TestFastIDCT<128, 128>(); }
+HWY_NOINLINE void TestFloatIDCT128x128() {
+#if HWY_TARGET == HWY_SCALAR && \
+    (defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER))
+  GTEST_SKIP();
+#else
+  TestFloatIDCT<128, 128>();
+#endif
+}
+*/
+HWY_NOINLINE void TestFastTranspose128x256() { TestFastTranspose<128, 256>(); }
+HWY_NOINLINE void TestFloatTranspose128x256() {
+  TestFloatTranspose<128, 256>();
+}
+/*
+HWY_NOINLINE void TestFastIDCT128x256() { TestFastIDCT<128, 256>(); }
+HWY_NOINLINE void TestFloatIDCT128x256() {
+#if HWY_TARGET == HWY_SCALAR && \
+    (defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER))
+  GTEST_SKIP();
+#else
+  TestFloatIDCT<128, 256>();
+#endif
+}
+*/
+HWY_NOINLINE void TestFastTranspose256x128() { TestFastTranspose<256, 128>(); }
+HWY_NOINLINE void TestFloatTranspose256x128() {
+  TestFloatTranspose<256, 128>();
+}
+/*
+HWY_NOINLINE void TestFastIDCT256x128() { TestFastIDCT<256, 128>(); }
+HWY_NOINLINE void TestFloatIDCT256x128() {
+#if HWY_TARGET == HWY_SCALAR && \
+    (defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER))
+  GTEST_SKIP();
+#else
+  TestFloatIDCT<256, 128>();
+#endif
+}
+*/
+HWY_NOINLINE void TestFastTranspose256x256() { TestFastTranspose<256, 256>(); }
+HWY_NOINLINE void TestFloatTranspose256x256() {
+  TestFloatTranspose<256, 256>();
+}
+/*
+HWY_NOINLINE void TestFastIDCT256x256() { TestFastIDCT<256, 256>(); }
+HWY_NOINLINE void TestFloatIDCT256x256() {
+#if HWY_TARGET == HWY_SCALAR && \
+    (defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER))
+  GTEST_SKIP();
+#else
+  TestFloatIDCT<256, 256>();
+#endif
+}
+*/
+
+}  // namespace
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+class FastDCTTargetTest : public hwy::TestWithParamTarget {};
+HWY_TARGET_INSTANTIATE_TEST_SUITE_P(FastDCTTargetTest);
+
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatTranspose8x8);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastTranspose8x8);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatTranspose8x16);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastTranspose8x16);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatTranspose8x32);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastTranspose8x32);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatTranspose16x8);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastTranspose16x8);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatTranspose16x16);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastTranspose16x16);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatTranspose16x32);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastTranspose16x32);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatTranspose32x8);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastTranspose32x8);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatTranspose32x16);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastTranspose32x16);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatTranspose32x32);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastTranspose32x32);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatTranspose32x64);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastTranspose32x64);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatTranspose64x32);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastTranspose64x32);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatTranspose64x64);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastTranspose64x64);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatTranspose64x128);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastTranspose64x128);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatTranspose128x64);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastTranspose128x64);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatTranspose128x128);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastTranspose128x128);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatTranspose128x256);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastTranspose128x256);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatTranspose256x128);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastTranspose256x128);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatTranspose256x256);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastTranspose256x256);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatIDCT8x8);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastIDCT8x8);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatIDCT8x16);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastIDCT8x16);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatIDCT8x32);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastIDCT8x32);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatIDCT16x8);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastIDCT16x8);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatIDCT16x16);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastIDCT16x16);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatIDCT16x32);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastIDCT16x32);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatIDCT32x8);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastIDCT32x8);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatIDCT32x16);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastIDCT32x16);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatIDCT32x32);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastIDCT32x32);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatIDCT32x64);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastIDCT32x64);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatIDCT64x32);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastIDCT64x32);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatIDCT64x64);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastIDCT64x64);
+/*
+ * DCT-128 and above have very large errors just by rounding inputs.
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatIDCT64x128);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastIDCT64x128);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatIDCT128x64);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastIDCT128x64);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatIDCT128x128);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastIDCT128x128);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatIDCT128x256);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastIDCT128x256);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatIDCT256x128);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastIDCT256x128);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFloatIDCT256x256);
+HWY_EXPORT_AND_TEST_P(FastDCTTargetTest, TestFastIDCT256x256);
+*/
+
+TEST(FastDCTTest, TestWrapperFloat) { BenchmarkFloatIDCT32x32(); }
+TEST(FastDCTTest, TestWrapperFast) { BenchmarkFastIDCT32x32(); }
+
+}  // namespace jxl
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jxl/fast_math-inl.h b/third_party/jpeg-xl/lib/jxl/fast_math-inl.h
new file mode 100644
index 0000000000..5c48034290
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/fast_math-inl.h
@@ -0,0 +1,236 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Fast SIMD math ops (log2, encoder only, cos, erf for splines)
+
+#if defined(LIB_JXL_FAST_MATH_INL_H_) == defined(HWY_TARGET_TOGGLE)
+#ifdef LIB_JXL_FAST_MATH_INL_H_
+#undef LIB_JXL_FAST_MATH_INL_H_
+#else
+#define LIB_JXL_FAST_MATH_INL_H_
+#endif
+
+#include <hwy/highway.h>
+
+#include "lib/jxl/common.h"
+#include "lib/jxl/rational_polynomial-inl.h"
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Abs;
+using hwy::HWY_NAMESPACE::Add;
+using hwy::HWY_NAMESPACE::Eq;
+using hwy::HWY_NAMESPACE::Floor;
+using hwy::HWY_NAMESPACE::Ge;
+using hwy::HWY_NAMESPACE::GetLane;
+using hwy::HWY_NAMESPACE::IfThenElse;
+using hwy::HWY_NAMESPACE::IfThenZeroElse;
+using hwy::HWY_NAMESPACE::Le;
+using hwy::HWY_NAMESPACE::Min;
+using hwy::HWY_NAMESPACE::Mul;
+using hwy::HWY_NAMESPACE::MulAdd;
+using hwy::HWY_NAMESPACE::NegMulAdd;
+using hwy::HWY_NAMESPACE::Rebind;
+using hwy::HWY_NAMESPACE::ShiftLeft;
+using hwy::HWY_NAMESPACE::ShiftRight;
+using hwy::HWY_NAMESPACE::Sub;
+using hwy::HWY_NAMESPACE::Xor;
+
+// Computes base-2 logarithm like std::log2. Undefined if negative / NaN.
+// L1 error ~3.9E-6
+template <class DF, class V>
+V FastLog2f(const DF df, V x) {
+  // 2,2 rational polynomial approximation of std::log1p(x) / std::log(2).
+  HWY_ALIGN const float p[4 * (2 + 1)] = {HWY_REP4(-1.8503833400518310E-06f),
+                                          HWY_REP4(1.4287160470083755E+00f),
+                                          HWY_REP4(7.4245873327820566E-01f)};
+  HWY_ALIGN const float q[4 * (2 + 1)] = {HWY_REP4(9.9032814277590719E-01f),
+                                          HWY_REP4(1.0096718572241148E+00f),
+                                          HWY_REP4(1.7409343003366853E-01f)};
+
+  const Rebind<int32_t, DF> di;
+  const auto x_bits = BitCast(di, x);
+
+  // Range reduction to [-1/3, 1/3] - 3 integer, 2 float ops
+  const auto exp_bits = Sub(x_bits, Set(di, 0x3f2aaaab));  // = 2/3
+  // Shifted exponent = log2; also used to clear mantissa.
+  const auto exp_shifted = ShiftRight<23>(exp_bits);
+  const auto mantissa = BitCast(df, Sub(x_bits, ShiftLeft<23>(exp_shifted)));
+  const auto exp_val = ConvertTo(df, exp_shifted);
+  return Add(EvalRationalPolynomial(df, Sub(mantissa, Set(df, 1.0f)), p, q),
+             exp_val);
+}
+
+// max relative error ~3e-7
+template <class DF, class V>
+V FastPow2f(const DF df, V x) {
+  const Rebind<int32_t, DF> di;
+  auto floorx = Floor(x);
+  auto exp =
+      BitCast(df, ShiftLeft<23>(Add(ConvertTo(di, floorx), Set(di, 127))));
+  auto frac = Sub(x, floorx);
+  auto num = Add(frac, Set(df, 1.01749063e+01));
+  num = MulAdd(num, frac, Set(df, 4.88687798e+01));
+  num = MulAdd(num, frac, Set(df, 9.85506591e+01));
+  num = Mul(num, exp);
+  auto den = MulAdd(frac, Set(df, 2.10242958e-01), Set(df, -2.22328856e-02));
+  den = MulAdd(den, frac, Set(df, -1.94414990e+01));
+  den = MulAdd(den, frac, Set(df, 9.85506633e+01));
+  return Div(num, den);
+}
+
+// max relative error ~3e-5
+template <class DF, class V>
+V FastPowf(const DF df, V base, V exponent) {
+  return FastPow2f(df, Mul(FastLog2f(df, base), exponent));
+}
+
+// Computes cosine like std::cos.
+// L1 error 7e-5.
+template <class DF, class V>
+V FastCosf(const DF df, V x) {
+  // Step 1: range reduction to [0, 2pi)
+  const auto pi2 = Set(df, kPi * 2.0f);
+  const auto pi2_inv = Set(df, 0.5f / kPi);
+  const auto npi2 = Mul(Floor(Mul(x, pi2_inv)), pi2);
+  const auto xmodpi2 = Sub(x, npi2);
+  // Step 2: range reduction to [0, pi]
+  const auto x_pi = Min(xmodpi2, Sub(pi2, xmodpi2));
+  // Step 3: range reduction to [0, pi/2]
+  const auto above_pihalf = Ge(x_pi, Set(df, kPi / 2.0f));
+  const auto x_pihalf = IfThenElse(above_pihalf, Sub(Set(df, kPi), x_pi), x_pi);
+  // Step 4: Taylor-like approximation, scaled by 2**0.75 to make angle
+  // duplication steps faster, on x/4.
+  const auto xs = Mul(x_pihalf, Set(df, 0.25f));
+  const auto x2 = Mul(xs, xs);
+  const auto x4 = Mul(x2, x2);
+  const auto cosx_prescaling =
+      MulAdd(x4, Set(df, 0.06960438),
+             MulAdd(x2, Set(df, -0.84087373), Set(df, 1.68179268)));
+  // Step 5: angle duplication.
+  const auto cosx_scale1 =
+      MulAdd(cosx_prescaling, cosx_prescaling, Set(df, -1.414213562));
+  const auto cosx_scale2 = MulAdd(cosx_scale1, cosx_scale1, Set(df, -1));
+  // Step 6: change sign if needed.
+  const Rebind<uint32_t, DF> du;
+  auto signbit = ShiftLeft<31>(BitCast(du, VecFromMask(df, above_pihalf)));
+  return BitCast(df, Xor(signbit, BitCast(du, cosx_scale2)));
+}
+
+// Computes the error function like std::erf.
+// L1 error 7e-4.
+template <class DF, class V>
+V FastErff(const DF df, V x) {
+  // Formula from
+  // https://en.wikipedia.org/wiki/Error_function#Numerical_approximations
+  // but constants have been recomputed.
+  const auto xle0 = Le(x, Zero(df));
+  const auto absx = Abs(x);
+  // Compute 1 - 1 / ((((x * a + b) * x + c) * x + d) * x + 1)**4
+  const auto denom1 =
+      MulAdd(absx, Set(df, 7.77394369e-02), Set(df, 2.05260015e-04));
+  const auto denom2 = MulAdd(denom1, absx, Set(df, 2.32120216e-01));
+  const auto denom3 = MulAdd(denom2, absx, Set(df, 2.77820801e-01));
+  const auto denom4 = MulAdd(denom3, absx, Set(df, 1.0f));
+  const auto denom5 = Mul(denom4, denom4);
+  const auto inv_denom5 = Div(Set(df, 1.0f), denom5);
+  const auto result = NegMulAdd(inv_denom5, inv_denom5, Set(df, 1.0f));
+  // Change sign if needed.
+  const Rebind<uint32_t, DF> du;
+  auto signbit = ShiftLeft<31>(BitCast(du, VecFromMask(df, xle0)));
+  return BitCast(df, Xor(signbit, BitCast(du, result)));
+}
+
+inline float FastLog2f(float f) {
+  HWY_CAPPED(float, 1) D;
+  return GetLane(FastLog2f(D, Set(D, f)));
+}
+
+inline float FastPow2f(float f) {
+  HWY_CAPPED(float, 1) D;
+  return GetLane(FastPow2f(D, Set(D, f)));
+}
+
+inline float FastPowf(float b, float e) {
+  HWY_CAPPED(float, 1) D;
+  return GetLane(FastPowf(D, Set(D, b), Set(D, e)));
+}
+
+inline float FastCosf(float f) {
+  HWY_CAPPED(float, 1) D;
+  return GetLane(FastCosf(D, Set(D, f)));
+}
+
+inline float FastErff(float f) {
+  HWY_CAPPED(float, 1) D;
+  return GetLane(FastErff(D, Set(D, f)));
+}
+
+// Returns cbrt(x) + add with 6 ulp max error.
+// Modified from vectormath_exp.h, Apache 2 license.
+// https://www.agner.org/optimize/vectorclass.zip
+template <class V>
+V CubeRootAndAdd(const V x, const V add) {
+  const HWY_FULL(float) df;
+  const HWY_FULL(int32_t) di;
+
+  const auto kExpBias = Set(di, 0x54800000);  // cast(1.) + cast(1.) / 3
+  const auto kExpMul = Set(di, 0x002AAAAA);   // shifted 1/3
+  const auto k1_3 = Set(df, 1.0f / 3);
+  const auto k4_3 = Set(df, 4.0f / 3);
+
+  const auto xa = x;  // assume inputs never negative
+  const auto xa_3 = Mul(k1_3, xa);
+
+  // Multiply exponent by -1/3
+  const auto m1 = BitCast(di, xa);
+  // Special case for 0. 0 is represented with an exponent of 0, so the
+  // "kExpBias - 1/3 * exp" below gives the wrong result. The IfThenZeroElse()
+  // sets those values as 0, which prevents having NaNs in the computations
+  // below.
+  // TODO(eustas): use fused op
+  const auto m2 = IfThenZeroElse(
+      Eq(m1, Zero(di)), Sub(kExpBias, Mul((ShiftRight<23>(m1)), kExpMul)));
+  auto r = BitCast(df, m2);
+
+  // Newton-Raphson iterations
+  for (int i = 0; i < 3; i++) {
+    const auto r2 = Mul(r, r);
+    r = NegMulAdd(xa_3, Mul(r2, r2), Mul(k4_3, r));
+  }
+  // Final iteration
+  auto r2 = Mul(r, r);
+  r = MulAdd(k1_3, NegMulAdd(xa, Mul(r2, r2), r), r);
+  r2 = Mul(r, r);
+  r = MulAdd(r2, x, add);
+
+  return r;
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#endif  // LIB_JXL_FAST_MATH_INL_H_
+
+#if HWY_ONCE
+#ifndef FAST_MATH_ONCE
+#define FAST_MATH_ONCE
+
+namespace jxl {
+inline float FastLog2f(float f) { return HWY_STATIC_DISPATCH(FastLog2f)(f); }
+inline float FastPow2f(float f) { return HWY_STATIC_DISPATCH(FastPow2f)(f); }
+inline float FastPowf(float b, float e) {
+  return HWY_STATIC_DISPATCH(FastPowf)(b, e);
+}
+inline float FastCosf(float f) { return HWY_STATIC_DISPATCH(FastCosf)(f); }
+inline float FastErff(float f) { return HWY_STATIC_DISPATCH(FastErff)(f); }
+}  // namespace jxl
+
+#endif  // FAST_MATH_ONCE
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jxl/fast_math_test.cc b/third_party/jpeg-xl/lib/jxl/fast_math_test.cc
new file mode 100644
index 0000000000..897aadc120
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/fast_math_test.cc
@@ -0,0 +1,288 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <stdio.h>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/fast_math_test.cc"
+#include <hwy/foreach_target.h>
+
+#include "lib/jxl/base/random.h"
+#include "lib/jxl/dec_xyb-inl.h"
+#include "lib/jxl/enc_color_management.h"
+#include "lib/jxl/enc_xyb.h"
+#include "lib/jxl/transfer_functions-inl.h"
+
+// Test utils
+#include <hwy/highway.h>
+#include <hwy/tests/test_util-inl.h>
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+namespace {
+
+HWY_NOINLINE void TestFastLog2() {
+  constexpr size_t kNumTrials = 1 << 23;
+  Rng rng(1);
+  float max_abs_err = 0;
+  HWY_FULL(float) d;
+  for (size_t i = 0; i < kNumTrials; i++) {
+    const float f = rng.UniformF(1e-7f, 1e3f);
+    const auto actual_v = FastLog2f(d, Set(d, f));
+    const float actual = GetLane(actual_v);
+    const float abs_err = std::abs(std::log2(f) - actual);
+    EXPECT_LT(abs_err, 3.1E-6) << "f = " << f;
+    max_abs_err = std::max(max_abs_err, abs_err);
+  }
+  printf("max abs err %e\n", static_cast<double>(max_abs_err));
+}
+
+HWY_NOINLINE void TestFastPow2() {
+  constexpr size_t kNumTrials = 1 << 23;
+  Rng rng(1);
+  float max_rel_err = 0;
+  HWY_FULL(float) d;
+  for (size_t i = 0; i < kNumTrials; i++) {
+    const float f = rng.UniformF(-100, 100);
+    const auto actual_v = FastPow2f(d, Set(d, f));
+    const float actual = GetLane(actual_v);
+    const float expected = std::pow(2, f);
+    const float rel_err = std::abs(expected - actual) / expected;
+    EXPECT_LT(rel_err, 3.1E-6) << "f = " << f;
+    max_rel_err = std::max(max_rel_err, rel_err);
+  }
+  printf("max rel err %e\n", static_cast<double>(max_rel_err));
+}
+
+HWY_NOINLINE void TestFastPow() {
+  constexpr size_t kNumTrials = 1 << 23;
+  Rng rng(1);
+  float max_rel_err = 0;
+  HWY_FULL(float) d;
+  for (size_t i = 0; i < kNumTrials; i++) {
+    const float b = rng.UniformF(1e-3f, 1e3f);
+    const float e = rng.UniformF(-10, 10);
+    const auto actual_v = FastPowf(d, Set(d, b), Set(d, e));
+    const float actual = GetLane(actual_v);
+    const float expected = std::pow(b, e);
+    const float rel_err = std::abs(expected - actual) / expected;
+    EXPECT_LT(rel_err, 3E-5) << "b = " << b << " e = " << e;
+    max_rel_err = std::max(max_rel_err, rel_err);
+  }
+  printf("max rel err %e\n", static_cast<double>(max_rel_err));
+}
+
+HWY_NOINLINE void TestFastCos() {
+  constexpr size_t kNumTrials = 1 << 23;
+  Rng rng(1);
+  float max_abs_err = 0;
+  HWY_FULL(float) d;
+  for (size_t i = 0; i < kNumTrials; i++) {
+    const float f = rng.UniformF(-1e3f, 1e3f);
+    const auto actual_v = FastCosf(d, Set(d, f));
+    const float actual = GetLane(actual_v);
+    const float abs_err = std::abs(std::cos(f) - actual);
+    EXPECT_LT(abs_err, 7E-5) << "f = " << f;
+    max_abs_err = std::max(max_abs_err, abs_err);
+  }
+  printf("max abs err %e\n", static_cast<double>(max_abs_err));
+}
+
+HWY_NOINLINE void TestFastErf() {
+  constexpr size_t kNumTrials = 1 << 23;
+  Rng rng(1);
+  float max_abs_err = 0;
+  HWY_FULL(float) d;
+  for (size_t i = 0; i < kNumTrials; i++) {
+    const float f = rng.UniformF(-5.f, 5.f);
+    const auto actual_v = FastErff(d, Set(d, f));
+    const float actual = GetLane(actual_v);
+    const float abs_err = std::abs(std::erf(f) - actual);
+    EXPECT_LT(abs_err, 7E-4) << "f = " << f;
+    max_abs_err = std::max(max_abs_err, abs_err);
+  }
+  printf("max abs err %e\n", static_cast<double>(max_abs_err));
+}
+
+HWY_NOINLINE void TestCubeRoot() {
+  const HWY_FULL(float) d;
+  for (uint64_t x5 = 0; x5 < 2000000; x5++) {
+    const float x = x5 * 1E-5f;
+    const float expected = cbrtf(x);
+    HWY_ALIGN float approx[MaxLanes(d)];
+    Store(CubeRootAndAdd(Set(d, x), Zero(d)), d, approx);
+
+    // All lanes are same
+    for (size_t i = 1; i < Lanes(d); ++i) {
+      EXPECT_NEAR(approx[0], approx[i], 5E-7f);
+    }
+    EXPECT_NEAR(approx[0], expected, 8E-7f);
+  }
+}
+
+HWY_NOINLINE void TestFastSRGB() {
+  constexpr size_t kNumTrials = 1 << 23;
+  Rng rng(1);
+  float max_abs_err = 0;
+  HWY_FULL(float) d;
+  for (size_t i = 0; i < kNumTrials; i++) {
+    const float f = rng.UniformF(0.0f, 1.0f);
+    const auto actual_v = FastLinearToSRGB(d, Set(d, f));
+    const float actual = GetLane(actual_v);
+    const float expected = GetLane(TF_SRGB().EncodedFromDisplay(d, Set(d, f)));
+    const float abs_err = std::abs(expected - actual);
+    EXPECT_LT(abs_err, 1.2E-4) << "f = " << f;
+    max_abs_err = std::max(max_abs_err, abs_err);
+  }
+  printf("max abs err %e\n", static_cast<double>(max_abs_err));
+}
+
+HWY_NOINLINE void TestFastPQEFD() {
+  constexpr size_t kNumTrials = 1 << 23;
+  Rng rng(1);
+  float max_abs_err = 0;
+  HWY_FULL(float) d;
+  for (size_t i = 0; i < kNumTrials; i++) {
+    const float f = rng.UniformF(0.0f, 1.0f);
+    const float actual = GetLane(TF_PQ().EncodedFromDisplay(d, Set(d, f)));
+    const float expected = TF_PQ().EncodedFromDisplay(f);
+    const float abs_err = std::abs(expected - actual);
+    EXPECT_LT(abs_err, 7e-7) << "f = " << f;
+    max_abs_err = std::max(max_abs_err, abs_err);
+  }
+  printf("max abs err %e\n", static_cast<double>(max_abs_err));
+}
+
+HWY_NOINLINE void TestFastHLGEFD() {
+  constexpr size_t kNumTrials = 1 << 23;
+  Rng rng(1);
+  float max_abs_err = 0;
+  HWY_FULL(float) d;
+  for (size_t i = 0; i < kNumTrials; i++) {
+    const float f = rng.UniformF(0.0f, 1.0f);
+    const float actual = GetLane(TF_HLG().EncodedFromDisplay(d, Set(d, f)));
+    const float expected = TF_HLG().EncodedFromDisplay(f);
+    const float abs_err = std::abs(expected - actual);
+    EXPECT_LT(abs_err, 5e-7) << "f = " << f;
+    max_abs_err = std::max(max_abs_err, abs_err);
+  }
+  printf("max abs err %e\n", static_cast<double>(max_abs_err));
+}
+
+HWY_NOINLINE void TestFast709EFD() {
+  constexpr size_t kNumTrials = 1 << 23;
+  Rng rng(1);
+  float max_abs_err = 0;
+  HWY_FULL(float) d;
+  for (size_t i = 0; i < kNumTrials; i++) {
+    const float f = rng.UniformF(0.0f, 1.0f);
+    const float actual = GetLane(TF_709().EncodedFromDisplay(d, Set(d, f)));
+    const float expected = TF_709().EncodedFromDisplay(f);
+    const float abs_err = std::abs(expected - actual);
+    EXPECT_LT(abs_err, 2e-6) << "f = " << f;
+    max_abs_err = std::max(max_abs_err, abs_err);
+  }
+  printf("max abs err %e\n", static_cast<double>(max_abs_err));
+}
+
+HWY_NOINLINE void TestFastPQDFE() {
+  constexpr size_t kNumTrials = 1 << 23;
+  Rng rng(1);
+  float max_abs_err = 0;
+  HWY_FULL(float) d;
+  for (size_t i = 0; i < kNumTrials; i++) {
+    const float f = rng.UniformF(0.0f, 1.0f);
+    const float actual = GetLane(TF_PQ().DisplayFromEncoded(d, Set(d, f)));
+    const float expected = TF_PQ().DisplayFromEncoded(f);
+    const float abs_err = std::abs(expected - actual);
+    EXPECT_LT(abs_err, 3E-6) << "f = " << f;
+    max_abs_err = std::max(max_abs_err, abs_err);
+  }
+  printf("max abs err %e\n", static_cast<double>(max_abs_err));
+}
+
+HWY_NOINLINE void TestFastXYB() {
+  if (!HasFastXYBTosRGB8()) return;
+  ImageMetadata metadata;
+  ImageBundle ib(&metadata);
+  int scaling = 1;
+  int n = 256 * scaling;
+  float inv_scaling = 1.0f / scaling;
+  int kChunk = 32;
+  // The image is divided in chunks to reduce total memory usage.
+  for (int cr = 0; cr < n; cr += kChunk) {
+    for (int cg = 0; cg < n; cg += kChunk) {
+      for (int cb = 0; cb < n; cb += kChunk) {
+        Image3F chunk(kChunk * kChunk, kChunk);
+        for (int ir = 0; ir < kChunk; ir++) {
+          for (int ig = 0; ig < kChunk; ig++) {
+            for (int ib = 0; ib < kChunk; ib++) {
+              float r = (cr + ir) * inv_scaling;
+              float g = (cg + ig) * inv_scaling;
+              float b = (cb + ib) * inv_scaling;
+              chunk.PlaneRow(0, ir)[ig * kChunk + ib] = r * (1.0f / 255);
+              chunk.PlaneRow(1, ir)[ig * kChunk + ib] = g * (1.0f / 255);
+              chunk.PlaneRow(2, ir)[ig * kChunk + ib] = b * (1.0f / 255);
+            }
+          }
+        }
+        ib.SetFromImage(std::move(chunk), ColorEncoding::SRGB());
+        Image3F xyb(kChunk * kChunk, kChunk);
+        std::vector<uint8_t> roundtrip(kChunk * kChunk * kChunk * 3);
+        ToXYB(ib, nullptr, &xyb, GetJxlCms());
+        for (int y = 0; y < kChunk; y++) {
+          const float* xyba[4] = {xyb.PlaneRow(0, y), xyb.PlaneRow(1, y),
+                                  xyb.PlaneRow(2, y), nullptr};
+          jxl::HWY_NAMESPACE::FastXYBTosRGB8(
+              xyba, roundtrip.data() + 3 * xyb.xsize() * y, false, xyb.xsize());
+        }
+        for (int ir = 0; ir < kChunk; ir++) {
+          for (int ig = 0; ig < kChunk; ig++) {
+            for (int ib = 0; ib < kChunk; ib++) {
+              float r = (cr + ir) * inv_scaling;
+              float g = (cg + ig) * inv_scaling;
+              float b = (cb + ib) * inv_scaling;
+              size_t idx = ir * kChunk * kChunk + ig * kChunk + ib;
+              int rr = roundtrip[3 * idx];
+              int rg = roundtrip[3 * idx + 1];
+              int rb = roundtrip[3 * idx + 2];
+              EXPECT_LT(abs(r - rr), 2) << "expected " << r << " got " << rr;
+              EXPECT_LT(abs(g - rg), 2) << "expected " << g << " got " << rg;
+              EXPECT_LT(abs(b - rb), 2) << "expected " << b << " got " << rb;
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+}  // namespace
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+class FastMathTargetTest : public hwy::TestWithParamTarget {};
+HWY_TARGET_INSTANTIATE_TEST_SUITE_P(FastMathTargetTest);
+
+HWY_EXPORT_AND_TEST_P(FastMathTargetTest, TestFastLog2);
+HWY_EXPORT_AND_TEST_P(FastMathTargetTest, TestFastPow2);
+HWY_EXPORT_AND_TEST_P(FastMathTargetTest, TestFastPow);
+HWY_EXPORT_AND_TEST_P(FastMathTargetTest, TestFastCos);
+HWY_EXPORT_AND_TEST_P(FastMathTargetTest, TestFastErf);
+HWY_EXPORT_AND_TEST_P(FastMathTargetTest, TestCubeRoot);
+HWY_EXPORT_AND_TEST_P(FastMathTargetTest, TestFastSRGB);
+HWY_EXPORT_AND_TEST_P(FastMathTargetTest, TestFastPQDFE);
+HWY_EXPORT_AND_TEST_P(FastMathTargetTest, TestFastPQEFD);
+HWY_EXPORT_AND_TEST_P(FastMathTargetTest, TestFastHLGEFD);
+HWY_EXPORT_AND_TEST_P(FastMathTargetTest, TestFast709EFD);
+HWY_EXPORT_AND_TEST_P(FastMathTargetTest, TestFastXYB);
+
+}  // namespace jxl
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jxl/field_encodings.h b/third_party/jpeg-xl/lib/jxl/field_encodings.h
new file mode 100644
index 0000000000..613e8fad33
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/field_encodings.h
@@ -0,0 +1,134 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_FIELD_ENCODINGS_H_
+#define LIB_JXL_FIELD_ENCODINGS_H_
+
+// Constants needed to encode/decode fields; avoids including the full fields.h.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <hwy/base.h>
+#include <vector>
+
+#include "lib/jxl/base/bits.h"
+#include "lib/jxl/base/status.h"
+
+namespace jxl {
+
+// Macro to define the Fields' derived class Name when compiling with debug
+// names.
+#if JXL_IS_DEBUG_BUILD
+#define JXL_FIELDS_NAME(X) \
+  const char* Name() const override { return #X; }
+#else
+#define JXL_FIELDS_NAME(X)
+#endif  // JXL_IS_DEBUG_BUILD
+
+class Visitor;
+class Fields {
+ public:
+  virtual ~Fields() = default;
+#if JXL_IS_DEBUG_BUILD
+  virtual const char* Name() const = 0;
+#endif  // JXL_IS_DEBUG_BUILD
+  virtual Status VisitFields(Visitor* JXL_RESTRICT visitor) = 0;
+};
+
+// Distribution of U32 values for one particular selector. Represents either a
+// power of two-sized range, or a single value. A separate type ensures this is
+// only passed to the U32Enc ctor.
+struct U32Distr {
+  // No need to validate - all `d` are legitimate.
+  constexpr explicit U32Distr(uint32_t d) : d(d) {}
+
+  static constexpr uint32_t kDirect = 0x80000000u;
+
+  constexpr bool IsDirect() const { return (d & kDirect) != 0; }
+
+  // Only call if IsDirect().
+  constexpr uint32_t Direct() const { return d & (kDirect - 1); }
+
+  // Only call if !IsDirect().
+  constexpr size_t ExtraBits() const { return (d & 0x1F) + 1; }
+  uint32_t Offset() const { return (d >> 5) & 0x3FFFFFF; }
+
+  uint32_t d;
+};
+
+// A direct-coded 31-bit value occupying 2 bits in the bitstream.
+constexpr U32Distr Val(uint32_t value) {
+  return U32Distr(value | U32Distr::kDirect);
+}
+
+// Value - `offset` will be signaled in `bits` extra bits.
+constexpr U32Distr BitsOffset(uint32_t bits, uint32_t offset) {
+  return U32Distr(((bits - 1) & 0x1F) + ((offset & 0x3FFFFFF) << 5));
+}
+
+// Value will be signaled in `bits` extra bits.
+constexpr U32Distr Bits(uint32_t bits) { return BitsOffset(bits, 0); }
+
+// See U32Coder documentation in fields.h.
+class U32Enc {
+ public:
+  constexpr U32Enc(const U32Distr d0, const U32Distr d1, const U32Distr d2,
+                   const U32Distr d3)
+      : d_{d0, d1, d2, d3} {}
+
+  // Returns the U32Distr at `selector` = 0..3, least-significant first.
+  U32Distr GetDistr(const uint32_t selector) const {
+    JXL_ASSERT(selector < 4);
+    return d_[selector];
+  }
+
+ private:
+  U32Distr d_[4];
+};
+
+// Returns bit with the given `index` (0 = least significant).
+template <typename T>
+static inline constexpr uint64_t MakeBit(T index) {
+  return 1ULL << static_cast<uint32_t>(index);
+}
+
+// Returns vector of all possible values of an Enum type. Relies on each Enum
+// providing an overload of EnumBits() that returns a bit array of its values,
+// which implies values must be in [0, 64).
+template <typename Enum>
+std::vector<Enum> Values() {
+  uint64_t bits = EnumBits(Enum());
+
+  std::vector<Enum> values;
+  values.reserve(hwy::PopCount(bits));
+
+  // For each 1-bit in bits: add its index as value
+  while (bits != 0) {
+    const int index = Num0BitsBelowLS1Bit_Nonzero(bits);
+    values.push_back(static_cast<Enum>(index));
+    bits &= bits - 1;  // clear least-significant bit
+  }
+  return values;
+}
+
+// Returns true if value is one of Values<Enum>().
+template <class Enum>
+Status EnumValid(const Enum value) {
+  if (static_cast<uint32_t>(value) >= 64) {
+    return JXL_FAILURE("Value %u too large for %s\n",
+                       static_cast<uint32_t>(value), EnumName(Enum()));
+  }
+  const uint64_t bit = MakeBit(value);
+  if ((EnumBits(Enum()) & bit) == 0) {
+    return JXL_FAILURE("Invalid value %u for %s\n",
+                       static_cast<uint32_t>(value), EnumName(Enum()));
+  }
+  return true;
+}
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_FIELD_ENCODINGS_H_
diff --git a/third_party/jpeg-xl/lib/jxl/fields.cc b/third_party/jpeg-xl/lib/jxl/fields.cc
new file mode 100644
index 0000000000..cd1e72bd94
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/fields.cc
@@ -0,0 +1,642 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/fields.h"
+
+#include <stddef.h>
+
+#include <algorithm>
+#include <cmath>
+#include <hwy/base.h>
+
+#include "lib/jxl/base/bits.h"
+#include "lib/jxl/base/printf_macros.h"
+
+namespace jxl {
+
+namespace {
+
+using ::jxl::fields_internal::VisitorBase;
+
+struct InitVisitor : public VisitorBase {
+  Status Bits(const size_t /*unused*/, const uint32_t default_value,
+              uint32_t* JXL_RESTRICT value) override {
+    *value = default_value;
+    return true;
+  }
+
+  Status U32(const U32Enc /*unused*/, const uint32_t default_value,
+             uint32_t* JXL_RESTRICT value) override {
+    *value = default_value;
+    return true;
+  }
+
+  Status U64(const uint64_t default_value,
+             uint64_t* JXL_RESTRICT value) override {
+    *value = default_value;
+    return true;
+  }
+
+  Status Bool(bool default_value, bool* JXL_RESTRICT value) override {
+    *value = default_value;
+    return true;
+  }
+
+  Status F16(const float default_value, float* JXL_RESTRICT value) override {
+    *value = default_value;
+    return true;
+  }
+
+  // Always visit conditional fields to ensure they are initialized.
+  Status Conditional(bool /*condition*/) override { return true; }
+
+  Status AllDefault(const Fields& /*fields*/,
+                    bool* JXL_RESTRICT all_default) override {
+    // Just initialize this field and don't skip initializing others.
+    JXL_RETURN_IF_ERROR(Bool(true, all_default));
+    return false;
+  }
+
+  Status VisitNested(Fields* /*fields*/) override {
+    // Avoid re-initializing nested bundles (their ctors already called
+    // Bundle::Init for their fields).
+    return true;
+  }
+};
+
+// Similar to InitVisitor, but also initializes nested fields.
+struct SetDefaultVisitor : public VisitorBase {
+  Status Bits(const size_t /*unused*/, const uint32_t default_value,
+              uint32_t* JXL_RESTRICT value) override {
+    *value = default_value;
+    return true;
+  }
+
+  Status U32(const U32Enc /*unused*/, const uint32_t default_value,
+             uint32_t* JXL_RESTRICT value) override {
+    *value = default_value;
+    return true;
+  }
+
+  Status U64(const uint64_t default_value,
+             uint64_t* JXL_RESTRICT value) override {
+    *value = default_value;
+    return true;
+  }
+
+  Status Bool(bool default_value, bool* JXL_RESTRICT value) override {
+    *value = default_value;
+    return true;
+  }
+
+  Status F16(const float default_value, float* JXL_RESTRICT value) override {
+    *value = default_value;
+    return true;
+  }
+
+  // Always visit conditional fields to ensure they are initialized.
+  Status Conditional(bool /*condition*/) override { return true; }
+
+  Status AllDefault(const Fields& /*fields*/,
+                    bool* JXL_RESTRICT all_default) override {
+    // Just initialize this field and don't skip initializing others.
+    JXL_RETURN_IF_ERROR(Bool(true, all_default));
+    return false;
+  }
+};
+
+class AllDefaultVisitor : public VisitorBase {
+ public:
+  explicit AllDefaultVisitor() : VisitorBase() {}
+
+  Status Bits(const size_t bits, const uint32_t default_value,
+              uint32_t* JXL_RESTRICT value) override {
+    all_default_ &= *value == default_value;
+    return true;
+  }
+
+  Status U32(const U32Enc /*unused*/, const uint32_t default_value,
+             uint32_t* JXL_RESTRICT value) override {
+    all_default_ &= *value == default_value;
+    return true;
+  }
+
+  Status U64(const uint64_t default_value,
+             uint64_t* JXL_RESTRICT value) override {
+    all_default_ &= *value == default_value;
+    return true;
+  }
+
+  Status F16(const float default_value, float* JXL_RESTRICT value) override {
+    all_default_ &= std::abs(*value - default_value) < 1E-6f;
+    return true;
+  }
+
+  Status AllDefault(const Fields& /*fields*/,
+                    bool* JXL_RESTRICT /*all_default*/) override {
+    // Visit all fields so we can compute the actual all_default_ value.
+    return false;
+  }
+
+  bool AllDefault() const { return all_default_; }
+
+ private:
+  bool all_default_ = true;
+};
+
+class ReadVisitor : public VisitorBase {
+ public:
+  explicit ReadVisitor(BitReader* reader) : VisitorBase(), reader_(reader) {}
+
+  Status Bits(const size_t bits, const uint32_t /*default_value*/,
+              uint32_t* JXL_RESTRICT value) override {
+    *value = BitsCoder::Read(bits, reader_);
+    if (!reader_->AllReadsWithinBounds()) {
+      return JXL_STATUS(StatusCode::kNotEnoughBytes,
+                        "Not enough bytes for header");
+    }
+    return true;
+  }
+
+  Status U32(const U32Enc dist, const uint32_t /*default_value*/,
+             uint32_t* JXL_RESTRICT value) override {
+    *value = U32Coder::Read(dist, reader_);
+    if (!reader_->AllReadsWithinBounds()) {
+      return JXL_STATUS(StatusCode::kNotEnoughBytes,
+                        "Not enough bytes for header");
+    }
+    return true;
+  }
+
+  Status U64(const uint64_t /*default_value*/,
+             uint64_t* JXL_RESTRICT value) override {
+    *value = U64Coder::Read(reader_);
+    if (!reader_->AllReadsWithinBounds()) {
+      return JXL_STATUS(StatusCode::kNotEnoughBytes,
+                        "Not enough bytes for header");
+    }
+    return true;
+  }
+
+  Status F16(const float /*default_value*/,
+             float* JXL_RESTRICT value) override {
+    ok_ &= F16Coder::Read(reader_, value);
+    if (!reader_->AllReadsWithinBounds()) {
+      return JXL_STATUS(StatusCode::kNotEnoughBytes,
+                        "Not enough bytes for header");
+    }
+    return true;
+  }
+
+  void SetDefault(Fields* fields) override { Bundle::SetDefault(fields); }
+
+  bool IsReading() const override { return true; }
+
+  // This never fails because visitors are expected to keep reading until
+  // EndExtensions, see comment there.
+  Status BeginExtensions(uint64_t* JXL_RESTRICT extensions) override {
+    JXL_QUIET_RETURN_IF_ERROR(VisitorBase::BeginExtensions(extensions));
+    if (*extensions == 0) return true;
+
+    // For each nonzero bit, i.e. extension that is present:
+    for (uint64_t remaining_extensions = *extensions; remaining_extensions != 0;
+         remaining_extensions &= remaining_extensions - 1) {
+      const size_t idx_extension =
+          Num0BitsBelowLS1Bit_Nonzero(remaining_extensions);
+      // Read additional U64 (one per extension) indicating the number of bits
+      // (allows skipping individual extensions).
+      JXL_RETURN_IF_ERROR(U64(0, &extension_bits_[idx_extension]));
+      if (!SafeAdd(total_extension_bits_, extension_bits_[idx_extension],
+                   total_extension_bits_)) {
+        return JXL_FAILURE("Extension bits overflowed, invalid codestream");
+      }
+    }
+    // Used by EndExtensions to skip past any _remaining_ extensions.
+    pos_after_ext_size_ = reader_->TotalBitsConsumed();
+    JXL_ASSERT(pos_after_ext_size_ != 0);
+    return true;
+  }
+
+  Status EndExtensions() override {
+    JXL_QUIET_RETURN_IF_ERROR(VisitorBase::EndExtensions());
+    // Happens if extensions == 0: don't read size, done.
+    if (pos_after_ext_size_ == 0) return true;
+
+    // Not enough bytes as set by BeginExtensions or earlier. Do not return
+    // this as a JXL_FAILURE or false (which can also propagate to error
+    // through e.g. JXL_RETURN_IF_ERROR), since this may be used while
+    // silently checking whether there are enough bytes. If this case must be
+    // treated as an error, reader_>Close() will do this, just like is already
+    // done for non-extension fields.
+    if (!enough_bytes_) return true;
+
+    // Skip new fields this (old?) decoder didn't know about, if any.
+    const size_t bits_read = reader_->TotalBitsConsumed();
+    uint64_t end;
+    if (!SafeAdd(pos_after_ext_size_, total_extension_bits_, end)) {
+      return JXL_FAILURE("Invalid extension size, caused overflow");
+    }
+    if (bits_read > end) {
+      return JXL_FAILURE("Read more extension bits than budgeted");
+    }
+    const size_t remaining_bits = end - bits_read;
+    if (remaining_bits != 0) {
+      JXL_WARNING("Skipping %" PRIuS "-bit extension(s)", remaining_bits);
+      reader_->SkipBits(remaining_bits);
+      if (!reader_->AllReadsWithinBounds()) {
+        return JXL_STATUS(StatusCode::kNotEnoughBytes,
+                          "Not enough bytes for header");
+      }
+    }
+    return true;
+  }
+
+  Status OK() const { return ok_; }
+
+ private:
+  // Whether any error other than not enough bytes occurred.
+  bool ok_ = true;
+
+  // Whether there are enough input bytes to read from.
+  bool enough_bytes_ = true;
+  BitReader* const reader_;
+  // May be 0 even if the corresponding extension is present.
+  uint64_t extension_bits_[Bundle::kMaxExtensions] = {0};
+  uint64_t total_extension_bits_ = 0;
+  size_t pos_after_ext_size_ = 0;  // 0 iff extensions == 0.
+};
+
+class MaxBitsVisitor : public VisitorBase {
+ public:
+  Status Bits(const size_t bits, const uint32_t /*default_value*/,
+              uint32_t* JXL_RESTRICT /*value*/) override {
+    max_bits_ += BitsCoder::MaxEncodedBits(bits);
+    return true;
+  }
+
+  Status U32(const U32Enc enc, const uint32_t /*default_value*/,
+             uint32_t* JXL_RESTRICT /*value*/) override {
+    max_bits_ += U32Coder::MaxEncodedBits(enc);
+    return true;
+  }
+
+  Status U64(const uint64_t /*default_value*/,
+             uint64_t* JXL_RESTRICT /*value*/) override {
+    max_bits_ += U64Coder::MaxEncodedBits();
+    return true;
+  }
+
+  Status F16(const float /*default_value*/,
+             float* JXL_RESTRICT /*value*/) override {
+    max_bits_ += F16Coder::MaxEncodedBits();
+    return true;
+  }
+
+  Status AllDefault(const Fields& /*fields*/,
+                    bool* JXL_RESTRICT all_default) override {
+    JXL_RETURN_IF_ERROR(Bool(true, all_default));
+    return false;  // For max bits, assume nothing is default
+  }
+
+  // Always visit conditional fields to get a (loose) upper bound.
+  Status Conditional(bool /*condition*/) override { return true; }
+
+  Status BeginExtensions(uint64_t* JXL_RESTRICT /*extensions*/) override {
+    // Skip - extensions are not included in "MaxBits" because their length
+    // is potentially unbounded.
+    return true;
+  }
+
+  Status EndExtensions() override { return true; }
+
+  size_t MaxBits() const { return max_bits_; }
+
+ private:
+  size_t max_bits_ = 0;
+};
+
+class CanEncodeVisitor : public VisitorBase {
+ public:
+  explicit CanEncodeVisitor() : VisitorBase() {}
+
+  Status Bits(const size_t bits, const uint32_t /*default_value*/,
+              uint32_t* JXL_RESTRICT value) override {
+    size_t encoded_bits = 0;
+    ok_ &= BitsCoder::CanEncode(bits, *value, &encoded_bits);
+    encoded_bits_ += encoded_bits;
+    return true;
+  }
+
+  Status U32(const U32Enc enc, const uint32_t /*default_value*/,
+             uint32_t* JXL_RESTRICT value) override {
+    size_t encoded_bits = 0;
+    ok_ &= U32Coder::CanEncode(enc, *value, &encoded_bits);
+    encoded_bits_ += encoded_bits;
+    return true;
+  }
+
+  Status U64(const uint64_t /*default_value*/,
+             uint64_t* JXL_RESTRICT value) override {
+    size_t encoded_bits = 0;
+    ok_ &= U64Coder::CanEncode(*value, &encoded_bits);
+    encoded_bits_ += encoded_bits;
+    return true;
+  }
+
+  Status F16(const float /*default_value*/,
+             float* JXL_RESTRICT value) override {
+    size_t encoded_bits = 0;
+    ok_ &= F16Coder::CanEncode(*value, &encoded_bits);
+    encoded_bits_ += encoded_bits;
+    return true;
+  }
+
+  Status AllDefault(const Fields& fields,
+                    bool* JXL_RESTRICT all_default) override {
+    *all_default = Bundle::AllDefault(fields);
+    JXL_RETURN_IF_ERROR(Bool(true, all_default));
+    return *all_default;
+  }
+
+  Status BeginExtensions(uint64_t* JXL_RESTRICT extensions) override {
+    JXL_QUIET_RETURN_IF_ERROR(VisitorBase::BeginExtensions(extensions));
+    extensions_ = *extensions;
+    if (*extensions != 0) {
+      JXL_ASSERT(pos_after_ext_ == 0);
+      pos_after_ext_ = encoded_bits_;
+      JXL_ASSERT(pos_after_ext_ != 0);  // visited "extensions"
+    }
+    return true;
+  }
+  // EndExtensions = default.
+
+  Status GetSizes(size_t* JXL_RESTRICT extension_bits,
+                  size_t* JXL_RESTRICT total_bits) {
+    JXL_RETURN_IF_ERROR(ok_);
+    *extension_bits = 0;
+    *total_bits = encoded_bits_;
+    // Only if extension field was nonzero will we encode their sizes.
+    if (pos_after_ext_ != 0) {
+      JXL_ASSERT(encoded_bits_ >= pos_after_ext_);
+      *extension_bits = encoded_bits_ - pos_after_ext_;
+      // Also need to encode *extension_bits and bill it to *total_bits.
+      size_t encoded_bits = 0;
+      ok_ &= U64Coder::CanEncode(*extension_bits, &encoded_bits);
+      *total_bits += encoded_bits;
+
+      // TODO(janwas): support encoding individual extension sizes. We
+      // currently ascribe all bits to the first and send zeros for the
+      // others.
+      for (size_t i = 1; i < hwy::PopCount(extensions_); ++i) {
+        encoded_bits = 0;
+        ok_ &= U64Coder::CanEncode(0, &encoded_bits);
+        *total_bits += encoded_bits;
+      }
+    }
+    return true;
+  }
+
+ private:
+  bool ok_ = true;
+  size_t encoded_bits_ = 0;
+  uint64_t extensions_ = 0;
+  // Snapshot of encoded_bits_ after visiting the extension field, but NOT
+  // including the hidden extension sizes.
+  uint64_t pos_after_ext_ = 0;
+};
+}  // namespace
+
+void Bundle::Init(Fields* fields) {
+  InitVisitor visitor;
+  if (!visitor.Visit(fields)) {
+    JXL_ABORT("Init should never fail");
+  }
+}
+void Bundle::SetDefault(Fields* fields) {
+  SetDefaultVisitor visitor;
+  if (!visitor.Visit(fields)) {
+    JXL_ABORT("SetDefault should never fail");
+  }
+}
+bool Bundle::AllDefault(const Fields& fields) {
+  AllDefaultVisitor visitor;
+  if (!visitor.VisitConst(fields)) {
+    JXL_ABORT("AllDefault should never fail");
+  }
+  return visitor.AllDefault();
+}
+size_t Bundle::MaxBits(const Fields& fields) {
+  MaxBitsVisitor visitor;
+#if JXL_ENABLE_ASSERT
+  Status ret =
+#else
+  (void)
+#endif  // JXL_ENABLE_ASSERT
+      visitor.VisitConst(fields);
+  JXL_ASSERT(ret);
+  return visitor.MaxBits();
+}
+Status Bundle::CanEncode(const Fields& fields, size_t* extension_bits,
+                         size_t* total_bits) {
+  CanEncodeVisitor visitor;
+  JXL_QUIET_RETURN_IF_ERROR(visitor.VisitConst(fields));
+  JXL_QUIET_RETURN_IF_ERROR(visitor.GetSizes(extension_bits, total_bits));
+  return true;
+}
+Status Bundle::Read(BitReader* reader, Fields* fields) {
+  ReadVisitor visitor(reader);
+  JXL_RETURN_IF_ERROR(visitor.Visit(fields));
+  return visitor.OK();
+}
+bool Bundle::CanRead(BitReader* reader, Fields* fields) {
+  ReadVisitor visitor(reader);
+  Status status = visitor.Visit(fields);
+  // We are only checking here whether there are enough bytes. We still return
+  // true for other errors because it means there are enough bytes to determine
+  // there's an error. Use Read() to determine which error it is.
+  return status.code() != StatusCode::kNotEnoughBytes;
+}
+
+size_t BitsCoder::MaxEncodedBits(const size_t bits) { return bits; }
+
+Status BitsCoder::CanEncode(const size_t bits, const uint32_t value,
+                            size_t* JXL_RESTRICT encoded_bits) {
+  *encoded_bits = bits;
+  if (value >= (1ULL << bits)) {
+    return JXL_FAILURE("Value %u too large for %" PRIu64 " bits", value,
+                       static_cast<uint64_t>(bits));
+  }
+  return true;
+}
+
+uint32_t BitsCoder::Read(const size_t bits, BitReader* JXL_RESTRICT reader) {
+  return reader->ReadBits(bits);
+}
+
+size_t U32Coder::MaxEncodedBits(const U32Enc enc) {
+  size_t extra_bits = 0;
+  for (uint32_t selector = 0; selector < 4; ++selector) {
+    const U32Distr d = enc.GetDistr(selector);
+    if (d.IsDirect()) {
+      continue;
+    } else {
+      extra_bits = std::max<size_t>(extra_bits, d.ExtraBits());
+    }
+  }
+  return 2 + extra_bits;
+}
+
+Status U32Coder::CanEncode(const U32Enc enc, const uint32_t value,
+                           size_t* JXL_RESTRICT encoded_bits) {
+  uint32_t selector;
+  size_t total_bits;
+  const Status ok = ChooseSelector(enc, value, &selector, &total_bits);
+  *encoded_bits = ok ? total_bits : 0;
+  return ok;
+}
+
+uint32_t U32Coder::Read(const U32Enc enc, BitReader* JXL_RESTRICT reader) {
+  const uint32_t selector = reader->ReadFixedBits<2>();
+  const U32Distr d = enc.GetDistr(selector);
+  if (d.IsDirect()) {
+    return d.Direct();
+  } else {
+    return reader->ReadBits(d.ExtraBits()) + d.Offset();
+  }
+}
+
+Status U32Coder::ChooseSelector(const U32Enc enc, const uint32_t value,
+                                uint32_t* JXL_RESTRICT selector,
+                                size_t* JXL_RESTRICT total_bits) {
+#if JXL_ENABLE_ASSERT
+  const size_t bits_required = 32 - Num0BitsAboveMS1Bit(value);
+#endif  // JXL_ENABLE_ASSERT
+  JXL_ASSERT(bits_required <= 32);
+
+  *selector = 0;
+  *total_bits = 0;
+
+  // It is difficult to verify whether Dist32Byte are sorted, so check all
+  // selectors and keep the one with the fewest total_bits.
+  *total_bits = 64;  // more than any valid encoding
+  for (uint32_t s = 0; s < 4; ++s) {
+    const U32Distr d = enc.GetDistr(s);
+    if (d.IsDirect()) {
+      if (d.Direct() == value) {
+        *selector = s;
+        *total_bits = 2;
+        return true;  // Done, direct is always the best possible.
+      }
+      continue;
+    }
+    const size_t extra_bits = d.ExtraBits();
+    const uint32_t offset = d.Offset();
+    if (value < offset || value >= offset + (1ULL << extra_bits)) continue;
+
+    // Better than prior encoding, remember it:
+    if (2 + extra_bits < *total_bits) {
+      *selector = s;
+      *total_bits = 2 + extra_bits;
+    }
+  }
+
+  if (*total_bits == 64) {
+    return JXL_FAILURE("No feasible selector for %u", value);
+  }
+
+  return true;
+}
+
+uint64_t U64Coder::Read(BitReader* JXL_RESTRICT reader) {
+  uint64_t selector = reader->ReadFixedBits<2>();
+  if (selector == 0) {
+    return 0;
+  }
+  if (selector == 1) {
+    return 1 + reader->ReadFixedBits<4>();
+  }
+  if (selector == 2) {
+    return 17 + reader->ReadFixedBits<8>();
+  }
+
+  // selector 3, varint, groups have first 12, then 8, and last 4 bits.
+  uint64_t result = reader->ReadFixedBits<12>();
+
+  uint64_t shift = 12;
+  while (reader->ReadFixedBits<1>()) {
+    if (shift == 60) {
+      result |= static_cast<uint64_t>(reader->ReadFixedBits<4>()) << shift;
+      break;
+    }
+    result |= static_cast<uint64_t>(reader->ReadFixedBits<8>()) << shift;
+    shift += 8;
+  }
+
+  return result;
+}
+
+// Can always encode, but useful because it also returns bit size.
+Status U64Coder::CanEncode(uint64_t value, size_t* JXL_RESTRICT encoded_bits) {
+  if (value == 0) {
+    *encoded_bits = 2;  // 2 selector bits
+  } else if (value <= 16) {
+    *encoded_bits = 2 + 4;  // 2 selector bits + 4 payload bits
+  } else if (value <= 272) {
+    *encoded_bits = 2 + 8;  // 2 selector bits + 8 payload bits
+  } else {
+    *encoded_bits = 2 + 12;  // 2 selector bits + 12 payload bits
+    value >>= 12;
+    int shift = 12;
+    while (value > 0 && shift < 60) {
+      *encoded_bits += 1 + 8;  // 1 continuation bit + 8 payload bits
+      value >>= 8;
+      shift += 8;
+    }
+    if (value > 0) {
+      // This only could happen if shift == N - 4.
+      *encoded_bits += 1 + 4;  // 1 continuation bit + 4 payload bits
+    } else {
+      *encoded_bits += 1;  // 1 stop bit
+    }
+  }
+
+  return true;
+}
+
+Status F16Coder::Read(BitReader* JXL_RESTRICT reader,
+                      float* JXL_RESTRICT value) {
+  const uint32_t bits16 = reader->ReadFixedBits<16>();
+  const uint32_t sign = bits16 >> 15;
+  const uint32_t biased_exp = (bits16 >> 10) & 0x1F;
+  const uint32_t mantissa = bits16 & 0x3FF;
+
+  if (JXL_UNLIKELY(biased_exp == 31)) {
+    return JXL_FAILURE("F16 infinity or NaN are not supported");
+  }
+
+  // Subnormal or zero
+  if (JXL_UNLIKELY(biased_exp == 0)) {
+    *value = (1.0f / 16384) * (mantissa * (1.0f / 1024));
+    if (sign) *value = -*value;
+    return true;
+  }
+
+  // Normalized: convert the representation directly (faster than ldexp/tables).
+  const uint32_t biased_exp32 = biased_exp + (127 - 15);
+  const uint32_t mantissa32 = mantissa << (23 - 10);
+  const uint32_t bits32 = (sign << 31) | (biased_exp32 << 23) | mantissa32;
+  memcpy(value, &bits32, sizeof(bits32));
+  return true;
+}
+
+Status F16Coder::CanEncode(float value, size_t* JXL_RESTRICT encoded_bits) {
+  *encoded_bits = MaxEncodedBits();
+  if (std::isnan(value) || std::isinf(value)) {
+    return JXL_FAILURE("Should not attempt to store NaN and infinity");
+  }
+  return std::abs(value) <= 65504.0f;
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/fields.h b/third_party/jpeg-xl/lib/jxl/fields.h
new file mode 100644
index 0000000000..10d0b7aa30
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/fields.h
@@ -0,0 +1,377 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_FIELDS_H_
+#define LIB_JXL_FIELDS_H_
+
+// Forward/backward-compatible 'bundles' with auto-serialized 'fields'.
+
+#include <inttypes.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <cinttypes>
+#include <cmath>  // abs
+#include <cstdarg>
+
+#include "lib/jxl/base/bits.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/field_encodings.h"
+
+namespace jxl {
+
+struct AuxOut;
+struct BitWriter;
+
+// Integer coders: BitsCoder (raw), U32Coder (table), U64Coder (varint).
+
+// Reads/writes a given (fixed) number of bits <= 32.
+namespace BitsCoder {
+size_t MaxEncodedBits(size_t bits);
+
+Status CanEncode(size_t bits, uint32_t value,
+                 size_t* JXL_RESTRICT encoded_bits);
+
+uint32_t Read(size_t bits, BitReader* JXL_RESTRICT reader);
+
+// Returns false if the value is too large to encode.
+Status Write(size_t bits, uint32_t value, BitWriter* JXL_RESTRICT writer);
+}  // namespace BitsCoder
+
+// Encodes u32 using a lookup table and/or extra bits, governed by a per-field
+// encoding `enc` which consists of four distributions `d` chosen via a 2-bit
+// selector (least significant = 0). Each d may have two modes:
+// - direct: if d.IsDirect(), the value is d.Direct();
+// - offset: the value is derived from d.ExtraBits() extra bits plus d.Offset();
+// This encoding is denser than Exp-Golomb or Gamma codes when both small and
+// large values occur.
+//
+// Examples:
+// Direct: U32Enc(Val(8), Val(16), Val(32), Bits(6)), value 32 => 10b.
+// Offset: U32Enc(Val(0), BitsOffset(1, 1), BitsOffset(2, 3), BitsOffset(8, 8))
+//   defines the following prefix code:
+//   00 -> 0
+//   01x -> 1..2
+//   10xx -> 3..7
+//   11xxxxxxxx -> 8..263
+namespace U32Coder {
+size_t MaxEncodedBits(U32Enc enc);
+Status CanEncode(U32Enc enc, uint32_t value, size_t* JXL_RESTRICT encoded_bits);
+uint32_t Read(U32Enc enc, BitReader* JXL_RESTRICT reader);
+
+// Returns false if the value is too large to encode.
+Status Write(U32Enc enc, uint32_t value, BitWriter* JXL_RESTRICT writer);
+
+// "private"
+Status ChooseSelector(U32Enc enc, uint32_t value,
+                      uint32_t* JXL_RESTRICT selector,
+                      size_t* JXL_RESTRICT total_bits);
+}  // namespace U32Coder
+
+// Encodes 64-bit unsigned integers with a fixed distribution, taking 2 bits
+// to encode 0, 6 bits to encode 1 to 16, 10 bits to encode 17 to 272, 15 bits
+// to encode up to 4095, and on the order of log2(value) * 1.125 bits for
+// larger values.
+namespace U64Coder {
+constexpr size_t MaxEncodedBits() { return 2 + 12 + 6 * (8 + 1) + (4 + 1); }
+
+uint64_t Read(BitReader* JXL_RESTRICT reader);
+
+// Returns false if the value is too large to encode.
+Status Write(uint64_t value, BitWriter* JXL_RESTRICT writer);
+
+// Can always encode, but useful because it also returns bit size.
+Status CanEncode(uint64_t value, size_t* JXL_RESTRICT encoded_bits);
+}  // namespace U64Coder
+
+// IEEE 754 half-precision (binary16). Refuses to read/write NaN/Inf.
+namespace F16Coder {
+constexpr size_t MaxEncodedBits() { return 16; }
+
+// Returns false if the bit representation is NaN or infinity
+Status Read(BitReader* JXL_RESTRICT reader, float* JXL_RESTRICT value);
+
+// Returns false if the value is too large to encode.
+Status Write(float value, BitWriter* JXL_RESTRICT writer);
+Status CanEncode(float value, size_t* JXL_RESTRICT encoded_bits);
+}  // namespace F16Coder
+
+// A "bundle" is a forward- and backward compatible collection of fields.
+// They are used for SizeHeader/FrameHeader/GroupHeader. Bundles can be
+// extended by appending(!) fields. Optional fields may be omitted from the
+// bitstream by conditionally visiting them. When reading new bitstreams with
+// old code, we skip unknown fields at the end of the bundle. This requires
+// storing the amount of extra appended bits, and that fields are visited in
+// chronological order of being added to the format, because old decoders
+// cannot skip some future fields and resume reading old fields. Similarly,
+// new readers query bits in an "extensions" field to skip (groups of) fields
+// not present in old bitstreams. Note that each bundle must include an
+// "extensions" field prior to freezing the format, otherwise it cannot be
+// extended.
+//
+// To ensure interoperability, there will be no opaque fields.
+//
+// HOWTO:
+// - basic usage: define a struct with member variables ("fields") and a
+//   VisitFields(v) member function that calls v->U32/Bool etc. for each
+//   field, specifying their default values. The ctor must call
+//   Bundle::Init(this).
+//
+// - print a trace of visitors: ensure each bundle has a static Name() member
+//   function, and change Bundle::Print* to return true.
+//
+// - optional fields: in VisitFields, add if (v->Conditional(your_condition))
+//   { v->Bool(default, &field); }. This prevents reading/writing field
+//   if !your_condition, which is typically computed from a prior field.
+//   WARNING: to ensure all fields are initialized, do not add an else branch;
+//   instead add another if (v->Conditional(!your_condition)).
+//
+// - repeated fields: for dynamic sizes, use e.g. std::vector and in
+//   VisitFields, if (v->IsReading()) field.resize(size) before accessing field.
+//   For static or bounded sizes, use an array or std::array. In all cases,
+//   simply visit each array element as if it were a normal field.
+//
+// - nested bundles: add a bundle as a normal field and in VisitFields call
+//   JXL_RETURN_IF_ERROR(v->VisitNested(&nested));
+//
+// - allow future extensions: define a "uint64_t extensions" field and call
+//   v->BeginExtensions(&extensions) after visiting all non-extension fields,
+//   and `return v->EndExtensions();` after the last extension field.
+//
+// - encode an entire bundle in one bit if ALL its fields equal their default
+//   values: add a "mutable bool all_default" field and as the first visitor:
+//   if (v->AllDefault(*this, &all_default)) {
+//     // Overwrite all serialized fields, but not any nonserialized_*.
+//     v->SetDefault(this);
+//     return true;
+//   }
+//   Note: if extensions are present, AllDefault() == false.
+
+namespace Bundle {
+constexpr size_t kMaxExtensions = 64;  // bits in u64
+
+// Initializes fields to the default values. It is not recursive to nested
+// fields, this function is intended to be called in the constructors so
+// each nested field will already Init itself.
+void Init(Fields* JXL_RESTRICT fields);
+
+// Similar to Init, but recursive to nested fields.
+void SetDefault(Fields* JXL_RESTRICT fields);
+
+// Returns whether ALL fields (including `extensions`, if present) are equal
+// to their default value.
+bool AllDefault(const Fields& fields);
+
+// Returns max number of bits required to encode a T.
+size_t MaxBits(const Fields& fields);
+
+// Returns whether a header's fields can all be encoded, i.e. they have a
+// valid representation. If so, "*total_bits" is the exact number of bits
+// required. Called by Write.
+Status CanEncode(const Fields& fields, size_t* JXL_RESTRICT extension_bits,
+                 size_t* JXL_RESTRICT total_bits);
+
+Status Read(BitReader* reader, Fields* JXL_RESTRICT fields);
+
+// Returns whether enough bits are available to fully read this bundle using
+// Read. Also returns true in case of a codestream error (other than not being
+// large enough): that means enough bits are available to determine there's an
+// error, use Read to get such error status.
+// NOTE: this advances the BitReader, a different one pointing back at the
+// original bit position in the codestream must be created to use Read after
+// this.
+bool CanRead(BitReader* reader, Fields* JXL_RESTRICT fields);
+
+Status Write(const Fields& fields, BitWriter* JXL_RESTRICT writer, size_t layer,
+             AuxOut* aux_out);
+}  // namespace Bundle
+
+// Different subclasses of Visitor are passed to implementations of Fields
+// throughout their lifetime. Templates used to be used for this but dynamic
+// polymorphism produces more compact executables than template reification did.
+class Visitor {
+ public:
+  virtual ~Visitor() = default;
+  virtual Status Visit(Fields* fields) = 0;
+
+  virtual Status Bool(bool default_value, bool* JXL_RESTRICT value) = 0;
+  virtual Status U32(U32Enc, uint32_t, uint32_t*) = 0;
+
+  // Helper to construct U32Enc from U32Distr.
+  Status U32(const U32Distr d0, const U32Distr d1, const U32Distr d2,
+             const U32Distr d3, const uint32_t default_value,
+             uint32_t* JXL_RESTRICT value) {
+    return U32(U32Enc(d0, d1, d2, d3), default_value, value);
+  }
+
+  template <typename EnumT>
+  Status Enum(const EnumT default_value, EnumT* JXL_RESTRICT value) {
+    uint32_t u32 = static_cast<uint32_t>(*value);
+    // 00 -> 0
+    // 01 -> 1
+    // 10xxxx -> 2..17
+    // 11yyyyyy -> 18..81
+    JXL_RETURN_IF_ERROR(U32(Val(0), Val(1), BitsOffset(4, 2), BitsOffset(6, 18),
+                            static_cast<uint32_t>(default_value), &u32));
+    *value = static_cast<EnumT>(u32);
+    return EnumValid(*value);
+  }
+
+  virtual Status Bits(size_t bits, uint32_t default_value,
+                      uint32_t* JXL_RESTRICT value) = 0;
+  virtual Status U64(uint64_t default_value, uint64_t* JXL_RESTRICT value) = 0;
+  virtual Status F16(float default_value, float* JXL_RESTRICT value) = 0;
+
+  // Returns whether VisitFields should visit some subsequent fields.
+  // "condition" is typically from prior fields, e.g. flags.
+  // Overridden by InitVisitor and MaxBitsVisitor.
+  virtual Status Conditional(bool condition) { return condition; }
+
+  // Overridden by InitVisitor, AllDefaultVisitor and CanEncodeVisitor.
+  virtual Status AllDefault(const Fields& /*fields*/,
+                            bool* JXL_RESTRICT all_default) {
+    JXL_RETURN_IF_ERROR(Bool(true, all_default));
+    return *all_default;
+  }
+
+  virtual void SetDefault(Fields* /*fields*/) {
+    // Do nothing by default, this is overridden by ReadVisitor.
+  }
+
+  // Returns the result of visiting a nested Bundle.
+  // Overridden by InitVisitor.
+  virtual Status VisitNested(Fields* fields) { return Visit(fields); }
+
+  // Overridden by ReadVisitor. Enables dynamically-sized fields.
+  virtual bool IsReading() const { return false; }
+
+  virtual Status BeginExtensions(uint64_t* JXL_RESTRICT extensions) = 0;
+  virtual Status EndExtensions() = 0;
+};
+
+namespace fields_internal {
+// A bundle can be in one of three states concerning extensions: not-begun,
+// active, ended. Bundles may be nested, so we need a stack of states.
+class ExtensionStates {
+ public:
+  void Push() {
+    // Initial state = not-begun.
+    begun_ <<= 1;
+    ended_ <<= 1;
+  }
+
+  // Clears current state; caller must check IsEnded beforehand.
+  void Pop() {
+    begun_ >>= 1;
+    ended_ >>= 1;
+  }
+
+  // Returns true if state == active || state == ended.
+  Status IsBegun() const { return (begun_ & 1) != 0; }
+  // Returns true if state != not-begun && state != active.
+  Status IsEnded() const { return (ended_ & 1) != 0; }
+
+  void Begin() {
+    JXL_ASSERT(!IsBegun());
+    JXL_ASSERT(!IsEnded());
+    begun_ += 1;
+  }
+
+  void End() {
+    JXL_ASSERT(IsBegun());
+    JXL_ASSERT(!IsEnded());
+    ended_ += 1;
+  }
+
+ private:
+  // Current state := least-significant bit of begun_ and ended_.
+  uint64_t begun_ = 0;
+  uint64_t ended_ = 0;
+};
+
+// Visitors generate Init/AllDefault/Read/Write logic for all fields. Each
+// bundle's VisitFields member function calls visitor->U32 etc. We do not
+// overload operator() because a function name is easier to search for.
+
+class VisitorBase : public Visitor {
+ public:
+  explicit VisitorBase() {}
+  ~VisitorBase() override { JXL_ASSERT(depth_ == 0); }
+
+  // This is the only call site of Fields::VisitFields.
+  // Ensures EndExtensions was called.
+  Status Visit(Fields* fields) override {
+    depth_ += 1;
+    JXL_ASSERT(depth_ <= Bundle::kMaxExtensions);
+    extension_states_.Push();
+
+    const Status ok = fields->VisitFields(this);
+
+    if (ok) {
+      // If VisitFields called BeginExtensions, must also call
+      // EndExtensions.
+      JXL_ASSERT(!extension_states_.IsBegun() || extension_states_.IsEnded());
+    } else {
+      // Failed, undefined state: don't care whether EndExtensions was
+      // called.
+    }
+
+    extension_states_.Pop();
+    JXL_ASSERT(depth_ != 0);
+    depth_ -= 1;
+
+    return ok;
+  }
+
+  // For visitors accepting a const Visitor, need to const-cast so we can call
+  // the non-const Visitor::VisitFields. NOTE: C is not modified except the
+  // `all_default` field by CanEncodeVisitor.
+  Status VisitConst(const Fields& t) { return Visit(const_cast<Fields*>(&t)); }
+
+  // Derived types (overridden by InitVisitor because it is unsafe to read
+  // from *value there)
+
+  Status Bool(bool default_value, bool* JXL_RESTRICT value) override {
+    uint32_t bits = *value ? 1 : 0;
+    JXL_RETURN_IF_ERROR(Bits(1, static_cast<uint32_t>(default_value), &bits));
+    JXL_DASSERT(bits <= 1);
+    *value = bits == 1;
+    return true;
+  }
+
+  // Overridden by ReadVisitor and WriteVisitor.
+  // Called before any conditional visit based on "extensions".
+  // Overridden by ReadVisitor, CanEncodeVisitor and WriteVisitor.
+  Status BeginExtensions(uint64_t* JXL_RESTRICT extensions) override {
+    JXL_RETURN_IF_ERROR(U64(0, extensions));
+
+    extension_states_.Begin();
+    return true;
+  }
+
+  // Called after all extension fields (if any). Although non-extension
+  // fields could be visited afterward, we prefer the convention that
+  // extension fields are always the last to be visited. Overridden by
+  // ReadVisitor.
+  Status EndExtensions() override {
+    extension_states_.End();
+    return true;
+  }
+
+ private:
+  size_t depth_ = 0;  // to check nesting
+  ExtensionStates extension_states_;
+};
+}  // namespace fields_internal
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_FIELDS_H_
diff --git a/third_party/jpeg-xl/lib/jxl/fields_test.cc b/third_party/jpeg-xl/lib/jxl/fields_test.cc
new file mode 100644
index 0000000000..cf54c780ea
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/fields_test.cc
@@ -0,0 +1,429 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/fields.h"
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <array>
+#include <utility>
+
+#include "lib/jxl/base/span.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/enc_aux_out.h"
+#include "lib/jxl/enc_fields.h"
+#include "lib/jxl/frame_header.h"
+#include "lib/jxl/headers.h"
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+namespace {
+
+// Ensures `value` round-trips and in exactly `expected_bits_written`.
+void TestU32Coder(const uint32_t value, const size_t expected_bits_written) {
+  const U32Enc enc(Val(0), Bits(4), Val(0x7FFFFFFF), Bits(32));
+
+  BitWriter writer;
+  BitWriter::Allotment allotment(
+      &writer, RoundUpBitsToByteMultiple(U32Coder::MaxEncodedBits(enc)));
+
+  size_t precheck_pos;
+  EXPECT_TRUE(U32Coder::CanEncode(enc, value, &precheck_pos));
+  EXPECT_EQ(expected_bits_written, precheck_pos);
+
+  EXPECT_TRUE(U32Coder::Write(enc, value, &writer));
+  EXPECT_EQ(expected_bits_written, writer.BitsWritten());
+  writer.ZeroPadToByte();
+  allotment.ReclaimAndCharge(&writer, 0, nullptr);
+
+  BitReader reader(writer.GetSpan());
+  const uint32_t decoded_value = U32Coder::Read(enc, &reader);
+  EXPECT_EQ(value, decoded_value);
+  EXPECT_TRUE(reader.Close());
+}
+
+TEST(FieldsTest, U32CoderTest) {
+  TestU32Coder(0, 2);
+  TestU32Coder(1, 6);
+  TestU32Coder(15, 6);
+  TestU32Coder(0x7FFFFFFF, 2);
+  TestU32Coder(128, 34);
+  TestU32Coder(0x7FFFFFFEu, 34);
+  TestU32Coder(0x80000000u, 34);
+  TestU32Coder(0xFFFFFFFFu, 34);
+}
+
+void TestU64Coder(const uint64_t value, const size_t expected_bits_written) {
+  BitWriter writer;
+  BitWriter::Allotment allotment(
+      &writer, RoundUpBitsToByteMultiple(U64Coder::MaxEncodedBits()));
+
+  size_t precheck_pos;
+  EXPECT_TRUE(U64Coder::CanEncode(value, &precheck_pos));
+  EXPECT_EQ(expected_bits_written, precheck_pos);
+
+  EXPECT_TRUE(U64Coder::Write(value, &writer));
+  EXPECT_EQ(expected_bits_written, writer.BitsWritten());
+
+  writer.ZeroPadToByte();
+  allotment.ReclaimAndCharge(&writer, 0, nullptr);
+
+  BitReader reader(writer.GetSpan());
+  const uint64_t decoded_value = U64Coder::Read(&reader);
+  EXPECT_EQ(value, decoded_value);
+  EXPECT_TRUE(reader.Close());
+}
+
+TEST(FieldsTest, U64CoderTest) {
+  // Values that should take 2 bits (selector 00): 0
+  TestU64Coder(0, 2);
+
+  // Values that should take 6 bits (2 for selector, 4 for value): 1..16
+  TestU64Coder(1, 6);
+  TestU64Coder(2, 6);
+  TestU64Coder(8, 6);
+  TestU64Coder(15, 6);
+  TestU64Coder(16, 6);
+
+  // Values that should take 10 bits (2 for selector, 8 for value): 17..272
+  TestU64Coder(17, 10);
+  TestU64Coder(18, 10);
+  TestU64Coder(100, 10);
+  TestU64Coder(271, 10);
+  TestU64Coder(272, 10);
+
+  // Values that should take 15 bits (2 for selector, 12 for value, 1 for varint
+  // end): (0)..273..4095
+  TestU64Coder(273, 15);
+  TestU64Coder(274, 15);
+  TestU64Coder(1000, 15);
+  TestU64Coder(4094, 15);
+  TestU64Coder(4095, 15);
+
+  // Take 24 bits (of which 20 actual value): (0)..4096..1048575
+  TestU64Coder(4096, 24);
+  TestU64Coder(4097, 24);
+  TestU64Coder(10000, 24);
+  TestU64Coder(1048574, 24);
+  TestU64Coder(1048575, 24);
+
+  // Take 33 bits (of which 28 actual value): (0)..1048576..268435455
+  TestU64Coder(1048576, 33);
+  TestU64Coder(1048577, 33);
+  TestU64Coder(10000000, 33);
+  TestU64Coder(268435454, 33);
+  TestU64Coder(268435455, 33);
+
+  // Take 42 bits (of which 36 actual value): (0)..268435456..68719476735
+  TestU64Coder(268435456ull, 42);
+  TestU64Coder(268435457ull, 42);
+  TestU64Coder(1000000000ull, 42);
+  TestU64Coder(68719476734ull, 42);
+  TestU64Coder(68719476735ull, 42);
+
+  // Take 51 bits (of which 44 actual value): (0)..68719476736..17592186044415
+  TestU64Coder(68719476736ull, 51);
+  TestU64Coder(68719476737ull, 51);
+  TestU64Coder(1000000000000ull, 51);
+  TestU64Coder(17592186044414ull, 51);
+  TestU64Coder(17592186044415ull, 51);
+
+  // Take 60 bits (of which 52 actual value):
+  // (0)..17592186044416..4503599627370495
+  TestU64Coder(17592186044416ull, 60);
+  TestU64Coder(17592186044417ull, 60);
+  TestU64Coder(100000000000000ull, 60);
+  TestU64Coder(4503599627370494ull, 60);
+  TestU64Coder(4503599627370495ull, 60);
+
+  // Take 69 bits (of which 60 actual value):
+  // (0)..4503599627370496..1152921504606846975
+  TestU64Coder(4503599627370496ull, 69);
+  TestU64Coder(4503599627370497ull, 69);
+  TestU64Coder(10000000000000000ull, 69);
+  TestU64Coder(1152921504606846974ull, 69);
+  TestU64Coder(1152921504606846975ull, 69);
+
+  // Take 73 bits (of which 64 actual value):
+  // (0)..1152921504606846976..18446744073709551615
+  TestU64Coder(1152921504606846976ull, 73);
+  TestU64Coder(1152921504606846977ull, 73);
+  TestU64Coder(10000000000000000000ull, 73);
+  TestU64Coder(18446744073709551614ull, 73);
+  TestU64Coder(18446744073709551615ull, 73);
+}
+
+Status TestF16Coder(const float value) {
+  size_t max_encoded_bits;
+  // It is not a fatal error if it can't be encoded.
+  if (!F16Coder::CanEncode(value, &max_encoded_bits)) return false;
+  EXPECT_EQ(F16Coder::MaxEncodedBits(), max_encoded_bits);
+
+  BitWriter writer;
+  BitWriter::Allotment allotment(&writer,
+                                 RoundUpBitsToByteMultiple(max_encoded_bits));
+
+  EXPECT_TRUE(F16Coder::Write(value, &writer));
+  EXPECT_EQ(F16Coder::MaxEncodedBits(), writer.BitsWritten());
+  writer.ZeroPadToByte();
+  allotment.ReclaimAndCharge(&writer, 0, nullptr);
+
+  BitReader reader(writer.GetSpan());
+  float decoded_value;
+  EXPECT_TRUE(F16Coder::Read(&reader, &decoded_value));
+  // All values we test can be represented exactly.
+  EXPECT_EQ(value, decoded_value);
+  EXPECT_TRUE(reader.Close());
+  return true;
+}
+
+TEST(FieldsTest, F16CoderTest) {
+  for (float sign : {-1.0f, 1.0f}) {
+    // (anything less than 1E-3 are subnormals)
+    for (float mag : {0.0f, 0.5f, 1.0f, 2.0f, 2.5f, 16.015625f, 1.0f / 4096,
+                      1.0f / 16384, 65504.0f}) {
+      EXPECT_TRUE(TestF16Coder(sign * mag));
+    }
+  }
+
+  // Out of range
+  EXPECT_FALSE(TestF16Coder(65504.01f));
+  EXPECT_FALSE(TestF16Coder(-65505.0f));
+}
+
+// Ensures Read(Write()) returns the same fields.
+TEST(FieldsTest, TestRoundtripSize) {
+  for (int i = 0; i < 8; i++) {
+    SizeHeader size;
+    ASSERT_TRUE(size.Set(123 + 77 * i, 7 + i));
+
+    size_t extension_bits = 999, total_bits = 999;  // Initialize as garbage.
+    ASSERT_TRUE(Bundle::CanEncode(size, &extension_bits, &total_bits));
+    EXPECT_EQ(0u, extension_bits);
+
+    BitWriter writer;
+    ASSERT_TRUE(WriteSizeHeader(size, &writer, 0, nullptr));
+    EXPECT_EQ(total_bits, writer.BitsWritten());
+    writer.ZeroPadToByte();
+
+    SizeHeader size2;
+    BitReader reader(writer.GetSpan());
+    ASSERT_TRUE(ReadSizeHeader(&reader, &size2));
+    EXPECT_EQ(total_bits, reader.TotalBitsConsumed());
+    EXPECT_TRUE(reader.Close());
+
+    EXPECT_EQ(size.xsize(), size2.xsize());
+    EXPECT_EQ(size.ysize(), size2.ysize());
+  }
+}
+
+// Ensure all values can be reached by the encoding.
+TEST(FieldsTest, TestCropRect) {
+  CodecMetadata metadata;
+  for (int32_t i = -999; i < 19000; ++i) {
+    FrameHeader f(&metadata);
+    f.custom_size_or_origin = true;
+    f.frame_origin.x0 = i;
+    f.frame_origin.y0 = i;
+    f.frame_size.xsize = 1000 + i;
+    f.frame_size.ysize = 1000 + i;
+    size_t extension_bits = 0, total_bits = 0;
+    ASSERT_TRUE(Bundle::CanEncode(f, &extension_bits, &total_bits));
+    EXPECT_EQ(0u, extension_bits);
+    EXPECT_GE(total_bits, 9u);
+  }
+}
+TEST(FieldsTest, TestPreview) {
+  // (div8 cannot represent 4360, but !div8 can go a little higher)
+  for (uint32_t i = 1; i < 4360; ++i) {
+    PreviewHeader p;
+    ASSERT_TRUE(p.Set(i, i));
+    size_t extension_bits = 0, total_bits = 0;
+    ASSERT_TRUE(Bundle::CanEncode(p, &extension_bits, &total_bits));
+    EXPECT_EQ(0u, extension_bits);
+    EXPECT_GE(total_bits, 6u);
+  }
+}
+
+// Ensures Read(Write()) returns the same fields.
+TEST(FieldsTest, TestRoundtripFrame) {
+  CodecMetadata metadata;
+  FrameHeader h(&metadata);
+  h.extensions = 0x800;
+
+  size_t extension_bits = 999, total_bits = 999;  // Initialize as garbage.
+  ASSERT_TRUE(Bundle::CanEncode(h, &extension_bits, &total_bits));
+  EXPECT_EQ(0u, extension_bits);
+  BitWriter writer;
+  ASSERT_TRUE(WriteFrameHeader(h, &writer, nullptr));
+  EXPECT_EQ(total_bits, writer.BitsWritten());
+  writer.ZeroPadToByte();
+
+  FrameHeader h2(&metadata);
+  BitReader reader(writer.GetSpan());
+  ASSERT_TRUE(ReadFrameHeader(&reader, &h2));
+  EXPECT_EQ(total_bits, reader.TotalBitsConsumed());
+  EXPECT_TRUE(reader.Close());
+
+  EXPECT_EQ(h.extensions, h2.extensions);
+  EXPECT_EQ(h.flags, h2.flags);
+}
+
+#ifndef JXL_CRASH_ON_ERROR
+// Ensure out-of-bounds values cause an error.
+TEST(FieldsTest, TestOutOfRange) {
+  SizeHeader h;
+  ASSERT_TRUE(h.Set(0xFFFFFFFFull, 0xFFFFFFFFull));
+  size_t extension_bits = 999, total_bits = 999;  // Initialize as garbage.
+  ASSERT_FALSE(Bundle::CanEncode(h, &extension_bits, &total_bits));
+}
+#endif
+
+struct OldBundle : public Fields {
+  OldBundle() { Bundle::Init(this); }
+  JXL_FIELDS_NAME(OldBundle)
+
+  Status VisitFields(Visitor* JXL_RESTRICT visitor) override {
+    JXL_QUIET_RETURN_IF_ERROR(
+        visitor->U32(Val(1), Bits(2), Bits(3), Bits(4), 1, &old_small));
+    JXL_QUIET_RETURN_IF_ERROR(visitor->F16(1.125f, &old_f));
+    JXL_QUIET_RETURN_IF_ERROR(
+        visitor->U32(Bits(7), Bits(12), Bits(16), Bits(32), 0, &old_large));
+
+    JXL_QUIET_RETURN_IF_ERROR(visitor->BeginExtensions(&extensions));
+    return visitor->EndExtensions();
+  }
+
+  uint32_t old_small;
+  float old_f;
+  uint32_t old_large;
+  uint64_t extensions;
+};
+
+struct NewBundle : public Fields {
+  NewBundle() { Bundle::Init(this); }
+  JXL_FIELDS_NAME(NewBundle)
+
+  Status VisitFields(Visitor* JXL_RESTRICT visitor) override {
+    JXL_QUIET_RETURN_IF_ERROR(
+        visitor->U32(Val(1), Bits(2), Bits(3), Bits(4), 1, &old_small));
+    JXL_QUIET_RETURN_IF_ERROR(visitor->F16(1.125f, &old_f));
+    JXL_QUIET_RETURN_IF_ERROR(
+        visitor->U32(Bits(7), Bits(12), Bits(16), Bits(32), 0, &old_large));
+
+    JXL_QUIET_RETURN_IF_ERROR(visitor->BeginExtensions(&extensions));
+    if (visitor->Conditional(extensions & 1)) {
+      JXL_QUIET_RETURN_IF_ERROR(
+          visitor->U32(Val(2), Bits(2), Bits(3), Bits(4), 2, &new_small));
+      JXL_QUIET_RETURN_IF_ERROR(visitor->F16(-2.0f, &new_f));
+    }
+    if (visitor->Conditional(extensions & 2)) {
+      JXL_QUIET_RETURN_IF_ERROR(
+          visitor->U32(Bits(9), Bits(12), Bits(16), Bits(32), 0, &new_large));
+    }
+    return visitor->EndExtensions();
+  }
+
+  uint32_t old_small;
+  float old_f;
+  uint32_t old_large;
+  uint64_t extensions;
+
+  // If extensions & 1
+  uint32_t new_small = 2;
+  float new_f = -2.0f;
+  // If extensions & 2
+  uint32_t new_large = 0;
+};
+
+TEST(FieldsTest, TestNewDecoderOldData) {
+  OldBundle old_bundle;
+  old_bundle.old_large = 123;
+  old_bundle.old_f = 3.75f;
+  old_bundle.extensions = 0;
+
+  // Write to bit stream
+  const size_t kMaxOutBytes = 999;
+  BitWriter writer;
+  // Make sure values are initialized by code under test.
+  size_t extension_bits = 12345, total_bits = 12345;
+  ASSERT_TRUE(Bundle::CanEncode(old_bundle, &extension_bits, &total_bits));
+  ASSERT_LE(total_bits, kMaxOutBytes * kBitsPerByte);
+  EXPECT_EQ(0u, extension_bits);
+  AuxOut aux_out;
+  ASSERT_TRUE(Bundle::Write(old_bundle, &writer, kLayerHeader, &aux_out));
+
+  BitWriter::Allotment allotment(&writer,
+                                 kMaxOutBytes * kBitsPerByte - total_bits);
+  writer.Write(20, 0xA55A);  // sentinel
+  writer.ZeroPadToByte();
+  allotment.ReclaimAndCharge(&writer, kLayerHeader, nullptr);
+
+  ASSERT_LE(writer.GetSpan().size(), kMaxOutBytes);
+  BitReader reader(writer.GetSpan());
+  NewBundle new_bundle;
+  ASSERT_TRUE(Bundle::Read(&reader, &new_bundle));
+  EXPECT_EQ(reader.TotalBitsConsumed(),
+            aux_out.layers[kLayerHeader].total_bits);
+  EXPECT_EQ(reader.ReadBits(20), 0xA55Au);
+  EXPECT_TRUE(reader.Close());
+
+  // Old fields are the same in both
+  EXPECT_EQ(old_bundle.extensions, new_bundle.extensions);
+  EXPECT_EQ(old_bundle.old_small, new_bundle.old_small);
+  EXPECT_EQ(old_bundle.old_f, new_bundle.old_f);
+  EXPECT_EQ(old_bundle.old_large, new_bundle.old_large);
+  // New fields match their defaults
+  EXPECT_EQ(2u, new_bundle.new_small);
+  EXPECT_EQ(-2.0f, new_bundle.new_f);
+  EXPECT_EQ(0u, new_bundle.new_large);
+}
+
+TEST(FieldsTest, TestOldDecoderNewData) {
+  NewBundle new_bundle;
+  new_bundle.old_large = 123;
+  new_bundle.extensions = 3;
+  new_bundle.new_f = 999.0f;
+  new_bundle.new_large = 456;
+
+  // Write to bit stream
+  constexpr size_t kMaxOutBytes = 999;
+  BitWriter writer;
+  // Make sure values are initialized by code under test.
+  size_t extension_bits = 12345, total_bits = 12345;
+  ASSERT_TRUE(Bundle::CanEncode(new_bundle, &extension_bits, &total_bits));
+  EXPECT_NE(0u, extension_bits);
+  AuxOut aux_out;
+  ASSERT_TRUE(Bundle::Write(new_bundle, &writer, kLayerHeader, &aux_out));
+  ASSERT_LE(aux_out.layers[kLayerHeader].total_bits,
+            kMaxOutBytes * kBitsPerByte);
+
+  BitWriter::Allotment allotment(
+      &writer,
+      kMaxOutBytes * kBitsPerByte - aux_out.layers[kLayerHeader].total_bits);
+  // Ensure Read skips the additional fields
+  writer.Write(20, 0xA55A);  // sentinel
+  writer.ZeroPadToByte();
+  allotment.ReclaimAndCharge(&writer, kLayerHeader, nullptr);
+
+  BitReader reader(writer.GetSpan());
+  OldBundle old_bundle;
+  ASSERT_TRUE(Bundle::Read(&reader, &old_bundle));
+  EXPECT_EQ(reader.TotalBitsConsumed(),
+            aux_out.layers[kLayerHeader].total_bits);
+  EXPECT_EQ(reader.ReadBits(20), 0xA55Au);
+  EXPECT_TRUE(reader.Close());
+
+  // Old fields are the same in both
+  EXPECT_EQ(new_bundle.extensions, old_bundle.extensions);
+  EXPECT_EQ(new_bundle.old_small, old_bundle.old_small);
+  EXPECT_EQ(new_bundle.old_f, old_bundle.old_f);
+  EXPECT_EQ(new_bundle.old_large, old_bundle.old_large);
+  // (Can't check new fields because old decoder doesn't know about them)
+}
+
+}  // namespace
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/frame_header.cc b/third_party/jpeg-xl/lib/jxl/frame_header.cc
new file mode 100644
index 0000000000..475ce8e05e
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/frame_header.cc
@@ -0,0 +1,494 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/frame_header.h"
+
+#include <sstream>
+
+#include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/fields.h"
+
+namespace jxl {
+
+constexpr uint8_t YCbCrChromaSubsampling::kHShift[] = {0, 1, 1, 0};
+constexpr uint8_t YCbCrChromaSubsampling::kVShift[] = {0, 1, 0, 1};
+
+static Status VisitBlendMode(Visitor* JXL_RESTRICT visitor,
+                             BlendMode default_value, BlendMode* blend_mode) {
+  uint32_t encoded = static_cast<uint32_t>(*blend_mode);
+
+  JXL_QUIET_RETURN_IF_ERROR(visitor->U32(
+      Val(static_cast<uint32_t>(BlendMode::kReplace)),
+      Val(static_cast<uint32_t>(BlendMode::kAdd)),
+      Val(static_cast<uint32_t>(BlendMode::kBlend)), BitsOffset(2, 3),
+      static_cast<uint32_t>(default_value), &encoded));
+  if (encoded > 4) {
+    return JXL_FAILURE("Invalid blend_mode");
+  }
+  *blend_mode = static_cast<BlendMode>(encoded);
+  return true;
+}
+
+static Status VisitFrameType(Visitor* JXL_RESTRICT visitor,
+                             FrameType default_value, FrameType* frame_type) {
+  uint32_t encoded = static_cast<uint32_t>(*frame_type);
+
+  JXL_QUIET_RETURN_IF_ERROR(
+      visitor->U32(Val(static_cast<uint32_t>(FrameType::kRegularFrame)),
+                   Val(static_cast<uint32_t>(FrameType::kDCFrame)),
+                   Val(static_cast<uint32_t>(FrameType::kReferenceOnly)),
+                   Val(static_cast<uint32_t>(FrameType::kSkipProgressive)),
+                   static_cast<uint32_t>(default_value), &encoded));
+  *frame_type = static_cast<FrameType>(encoded);
+  return true;
+}
+
+BlendingInfo::BlendingInfo() { Bundle::Init(this); }
+
+Status BlendingInfo::VisitFields(Visitor* JXL_RESTRICT visitor) {
+  JXL_QUIET_RETURN_IF_ERROR(
+      VisitBlendMode(visitor, BlendMode::kReplace, &mode));
+  if (visitor->Conditional(nonserialized_num_extra_channels > 0 &&
+                           (mode == BlendMode::kBlend ||
+                            mode == BlendMode::kAlphaWeightedAdd))) {
+    // Up to 11 alpha channels for blending.
+    JXL_QUIET_RETURN_IF_ERROR(visitor->U32(
+        Val(0), Val(1), Val(2), BitsOffset(3, 3), 0, &alpha_channel));
+    if (visitor->IsReading() &&
+        alpha_channel >= nonserialized_num_extra_channels) {
+      return JXL_FAILURE("Invalid alpha channel for blending");
+    }
+  }
+  if (visitor->Conditional((nonserialized_num_extra_channels > 0 &&
+                            (mode == BlendMode::kBlend ||
+                             mode == BlendMode::kAlphaWeightedAdd)) ||
+                           mode == BlendMode::kMul)) {
+    JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &clamp));
+  }
+  // 'old' frame for blending. Only necessary if this is not a full frame, or
+  // blending is not kReplace.
+  if (visitor->Conditional(mode != BlendMode::kReplace ||
+                           nonserialized_is_partial_frame)) {
+    JXL_QUIET_RETURN_IF_ERROR(
+        visitor->U32(Val(0), Val(1), Val(2), Val(3), 0, &source));
+  }
+  return true;
+}
+
+std::string BlendingInfo::DebugString() const {
+  std::ostringstream os;
+  os << (mode == BlendMode::kReplace            ? "Replace"
+         : mode == BlendMode::kAdd              ? "Add"
+         : mode == BlendMode::kBlend            ? "Blend"
+         : mode == BlendMode::kAlphaWeightedAdd ? "AlphaWeightedAdd"
+                                                : "Mul");
+  if (nonserialized_num_extra_channels > 0 &&
+      (mode == BlendMode::kBlend || mode == BlendMode::kAlphaWeightedAdd)) {
+    os << ",alpha=" << alpha_channel << ",clamp=" << clamp;
+  } else if (mode == BlendMode::kMul) {
+    os << ",clamp=" << clamp;
+  }
+  if (mode != BlendMode::kReplace || nonserialized_is_partial_frame) {
+    os << ",source=" << source;
+  }
+  return os.str();
+}
+
+AnimationFrame::AnimationFrame(const CodecMetadata* metadata)
+    : nonserialized_metadata(metadata) {
+  Bundle::Init(this);
+}
+Status AnimationFrame::VisitFields(Visitor* JXL_RESTRICT visitor) {
+  if (visitor->Conditional(nonserialized_metadata != nullptr &&
+                           nonserialized_metadata->m.have_animation)) {
+    JXL_QUIET_RETURN_IF_ERROR(
+        visitor->U32(Val(0), Val(1), Bits(8), Bits(32), 0, &duration));
+  }
+
+  if (visitor->Conditional(
+          nonserialized_metadata != nullptr &&
+          nonserialized_metadata->m.animation.have_timecodes)) {
+    JXL_QUIET_RETURN_IF_ERROR(visitor->Bits(32, 0, &timecode));
+  }
+  return true;
+}
+
+YCbCrChromaSubsampling::YCbCrChromaSubsampling() { Bundle::Init(this); }
+Passes::Passes() { Bundle::Init(this); }
+Status Passes::VisitFields(Visitor* JXL_RESTRICT visitor) {
+  JXL_QUIET_RETURN_IF_ERROR(
+      visitor->U32(Val(1), Val(2), Val(3), BitsOffset(3, 4), 1, &num_passes));
+  JXL_ASSERT(num_passes <= kMaxNumPasses);  // Cannot happen when reading
+
+  if (visitor->Conditional(num_passes != 1)) {
+    JXL_QUIET_RETURN_IF_ERROR(visitor->U32(
+        Val(0), Val(1), Val(2), BitsOffset(1, 3), 0, &num_downsample));
+    JXL_ASSERT(num_downsample <= 4);  // 1,2,4,8
+    if (num_downsample > num_passes) {
+      return JXL_FAILURE("num_downsample %u > num_passes %u", num_downsample,
+                         num_passes);
+    }
+
+    for (uint32_t i = 0; i < num_passes - 1; i++) {
+      JXL_QUIET_RETURN_IF_ERROR(visitor->Bits(2, 0, &shift[i]));
+    }
+    shift[num_passes - 1] = 0;
+
+    for (uint32_t i = 0; i < num_downsample; ++i) {
+      JXL_QUIET_RETURN_IF_ERROR(
+          visitor->U32(Val(1), Val(2), Val(4), Val(8), 1, &downsample[i]));
+      if (i > 0 && downsample[i] >= downsample[i - 1]) {
+        return JXL_FAILURE("downsample sequence should be decreasing");
+      }
+    }
+    for (uint32_t i = 0; i < num_downsample; ++i) {
+      JXL_QUIET_RETURN_IF_ERROR(
+          visitor->U32(Val(0), Val(1), Val(2), Bits(3), 0, &last_pass[i]));
+      if (i > 0 && last_pass[i] <= last_pass[i - 1]) {
+        return JXL_FAILURE("last_pass sequence should be increasing");
+      }
+      if (last_pass[i] >= num_passes) {
+        return JXL_FAILURE("last_pass %u >= num_passes %u", last_pass[i],
+                           num_passes);
+      }
+    }
+  }
+
+  return true;
+}
+
+std::string Passes::DebugString() const {
+  std::ostringstream os;
+  os << "p=" << num_passes;
+  if (num_downsample) {
+    os << ",ds=";
+    for (uint32_t i = 0; i < num_downsample; ++i) {
+      os << last_pass[i] << ":" << downsample[i];
+      if (i + 1 < num_downsample) os << ";";
+    }
+  }
+  bool have_shifts = false;
+  for (uint32_t i = 0; i < num_passes; ++i) {
+    if (shift[i]) have_shifts = true;
+  }
+  if (have_shifts) {
+    os << ",shifts=";
+    for (uint32_t i = 0; i < num_passes; ++i) {
+      os << shift[i];
+      if (i + 1 < num_passes) os << ";";
+    }
+  }
+  return os.str();
+}
+
+FrameHeader::FrameHeader(const CodecMetadata* metadata)
+    : animation_frame(metadata), nonserialized_metadata(metadata) {
+  Bundle::Init(this);
+}
+
+Status ReadFrameHeader(BitReader* JXL_RESTRICT reader,
+                       FrameHeader* JXL_RESTRICT frame) {
+  return Bundle::Read(reader, frame);
+}
+
+Status FrameHeader::VisitFields(Visitor* JXL_RESTRICT visitor) {
+  if (visitor->AllDefault(*this, &all_default)) {
+    // Overwrite all serialized fields, but not any nonserialized_*.
+    visitor->SetDefault(this);
+    return true;
+  }
+
+  JXL_QUIET_RETURN_IF_ERROR(
+      VisitFrameType(visitor, FrameType::kRegularFrame, &frame_type));
+  if (visitor->IsReading() && nonserialized_is_preview &&
+      frame_type != kRegularFrame) {
+    return JXL_FAILURE("Only regular frame could be a preview");
+  }
+
+  // FrameEncoding.
+  bool is_modular = (encoding == FrameEncoding::kModular);
+  JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &is_modular));
+  encoding = (is_modular ? FrameEncoding::kModular : FrameEncoding::kVarDCT);
+
+  // Flags
+  JXL_QUIET_RETURN_IF_ERROR(visitor->U64(0, &flags));
+
+  // Color transform
+  bool xyb_encoded = nonserialized_metadata == nullptr ||
+                     nonserialized_metadata->m.xyb_encoded;
+
+  if (xyb_encoded) {
+    color_transform = ColorTransform::kXYB;
+  } else {
+    // Alternate if kYCbCr.
+    bool alternate = color_transform == ColorTransform::kYCbCr;
+    JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &alternate));
+    color_transform =
+        (alternate ? ColorTransform::kYCbCr : ColorTransform::kNone);
+  }
+
+  // Chroma subsampling for YCbCr, if no DC frame is used.
+  if (visitor->Conditional(color_transform == ColorTransform::kYCbCr &&
+                           ((flags & kUseDcFrame) == 0))) {
+    JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&chroma_subsampling));
+  }
+
+  size_t num_extra_channels =
+      nonserialized_metadata != nullptr
+          ? nonserialized_metadata->m.extra_channel_info.size()
+          : 0;
+
+  // Upsampling
+  if (visitor->Conditional((flags & kUseDcFrame) == 0)) {
+    JXL_QUIET_RETURN_IF_ERROR(
+        visitor->U32(Val(1), Val(2), Val(4), Val(8), 1, &upsampling));
+    if (nonserialized_metadata != nullptr &&
+        visitor->Conditional(num_extra_channels != 0)) {
+      const std::vector<ExtraChannelInfo>& extra_channels =
+          nonserialized_metadata->m.extra_channel_info;
+      extra_channel_upsampling.resize(extra_channels.size(), 1);
+      for (size_t i = 0; i < extra_channels.size(); ++i) {
+        uint32_t dim_shift =
+            nonserialized_metadata->m.extra_channel_info[i].dim_shift;
+        uint32_t& ec_upsampling = extra_channel_upsampling[i];
+        ec_upsampling >>= dim_shift;
+        JXL_QUIET_RETURN_IF_ERROR(
+            visitor->U32(Val(1), Val(2), Val(4), Val(8), 1, &ec_upsampling));
+        ec_upsampling <<= dim_shift;
+        if (ec_upsampling < upsampling) {
+          return JXL_FAILURE(
+              "EC upsampling (%u) < color upsampling (%u), which is invalid.",
+              ec_upsampling, upsampling);
+        }
+        if (ec_upsampling > 8) {
+          return JXL_FAILURE("EC upsampling too large (%u)", ec_upsampling);
+        }
+      }
+    } else {
+      extra_channel_upsampling.clear();
+    }
+  }
+
+  // Modular- or VarDCT-specific data.
+  if (visitor->Conditional(encoding == FrameEncoding::kModular)) {
+    JXL_QUIET_RETURN_IF_ERROR(visitor->Bits(2, 1, &group_size_shift));
+  }
+  if (visitor->Conditional(encoding == FrameEncoding::kVarDCT &&
+                           color_transform == ColorTransform::kXYB)) {
+    JXL_QUIET_RETURN_IF_ERROR(visitor->Bits(3, 3, &x_qm_scale));
+    JXL_QUIET_RETURN_IF_ERROR(visitor->Bits(3, 2, &b_qm_scale));
+  } else {
+    x_qm_scale = b_qm_scale = 2;  // noop
+  }
+
+  // Not useful for kPatchSource
+  if (visitor->Conditional(frame_type != FrameType::kReferenceOnly)) {
+    JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&passes));
+  }
+
+  if (visitor->Conditional(frame_type == FrameType::kDCFrame)) {
+    // Up to 4 pyramid levels - for up to 16384x downsampling.
+    JXL_QUIET_RETURN_IF_ERROR(
+        visitor->U32(Val(1), Val(2), Val(3), Val(4), 1, &dc_level));
+  }
+  if (frame_type != FrameType::kDCFrame) {
+    dc_level = 0;
+  }
+
+  bool is_partial_frame = false;
+  if (visitor->Conditional(frame_type != FrameType::kDCFrame)) {
+    JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &custom_size_or_origin));
+    if (visitor->Conditional(custom_size_or_origin)) {
+      const U32Enc enc(Bits(8), BitsOffset(11, 256), BitsOffset(14, 2304),
+                       BitsOffset(30, 18688));
+      // Frame offset, only if kRegularFrame or kSkipProgressive.
+      if (visitor->Conditional(frame_type == FrameType::kRegularFrame ||
+                               frame_type == FrameType::kSkipProgressive)) {
+        uint32_t ux0 = PackSigned(frame_origin.x0);
+        uint32_t uy0 = PackSigned(frame_origin.y0);
+        JXL_QUIET_RETURN_IF_ERROR(visitor->U32(enc, 0, &ux0));
+        JXL_QUIET_RETURN_IF_ERROR(visitor->U32(enc, 0, &uy0));
+        frame_origin.x0 = UnpackSigned(ux0);
+        frame_origin.y0 = UnpackSigned(uy0);
+      }
+      // Frame size
+      JXL_QUIET_RETURN_IF_ERROR(visitor->U32(enc, 0, &frame_size.xsize));
+      JXL_QUIET_RETURN_IF_ERROR(visitor->U32(enc, 0, &frame_size.ysize));
+      if (custom_size_or_origin &&
+          (frame_size.xsize == 0 || frame_size.ysize == 0)) {
+        return JXL_FAILURE(
+            "Invalid crop dimensions for frame: zero width or height");
+      }
+      int32_t image_xsize = default_xsize();
+      int32_t image_ysize = default_ysize();
+      if (frame_type == FrameType::kRegularFrame ||
+          frame_type == FrameType::kSkipProgressive) {
+        is_partial_frame |= frame_origin.x0 > 0;
+        is_partial_frame |= frame_origin.y0 > 0;
+        is_partial_frame |= (static_cast<int32_t>(frame_size.xsize) +
+                             frame_origin.x0) < image_xsize;
+        is_partial_frame |= (static_cast<int32_t>(frame_size.ysize) +
+                             frame_origin.y0) < image_ysize;
+      }
+    }
+  }
+
+  // Blending info, animation info and whether this is the last frame or not.
+  if (visitor->Conditional(frame_type == FrameType::kRegularFrame ||
+                           frame_type == FrameType::kSkipProgressive)) {
+    blending_info.nonserialized_num_extra_channels = num_extra_channels;
+    blending_info.nonserialized_is_partial_frame = is_partial_frame;
+    JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&blending_info));
+    bool replace_all = (blending_info.mode == BlendMode::kReplace);
+    extra_channel_blending_info.resize(num_extra_channels);
+    for (size_t i = 0; i < num_extra_channels; i++) {
+      auto& ec_blending_info = extra_channel_blending_info[i];
+      ec_blending_info.nonserialized_is_partial_frame = is_partial_frame;
+      ec_blending_info.nonserialized_num_extra_channels = num_extra_channels;
+      JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&ec_blending_info));
+      replace_all &= (ec_blending_info.mode == BlendMode::kReplace);
+    }
+    if (visitor->IsReading() && nonserialized_is_preview) {
+      if (!replace_all || custom_size_or_origin) {
+        return JXL_FAILURE("Preview is not compatible with blending");
+      }
+    }
+    if (visitor->Conditional(nonserialized_metadata != nullptr &&
+                             nonserialized_metadata->m.have_animation)) {
+      animation_frame.nonserialized_metadata = nonserialized_metadata;
+      JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&animation_frame));
+    }
+    JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(true, &is_last));
+  }
+  if (frame_type != FrameType::kRegularFrame) {
+    is_last = false;
+  }
+
+  // ID of that can be used to refer to this frame. 0 for a non-zero-duration
+  // frame means that it will not be referenced. Not necessary for the last
+  // frame.
+  if (visitor->Conditional(frame_type != kDCFrame && !is_last)) {
+    JXL_QUIET_RETURN_IF_ERROR(
+        visitor->U32(Val(0), Val(1), Val(2), Val(3), 0, &save_as_reference));
+  }
+
+  // If this frame is not blended on another frame post-color-transform, it may
+  // be stored for being referenced either before or after the color transform.
+  // If it is blended post-color-transform, it must be blended after. It must
+  // also be blended after if this is a kRegular frame that does not cover the
+  // full frame, as samples outside the partial region are from a
+  // post-color-transform frame.
+  if (frame_type != FrameType::kDCFrame) {
+    if (visitor->Conditional(CanBeReferenced() &&
+                             blending_info.mode == BlendMode::kReplace &&
+                             !is_partial_frame &&
+                             (frame_type == FrameType::kRegularFrame ||
+                              frame_type == FrameType::kSkipProgressive))) {
+      JXL_QUIET_RETURN_IF_ERROR(
+          visitor->Bool(false, &save_before_color_transform));
+    } else if (visitor->Conditional(frame_type == FrameType::kReferenceOnly)) {
+      JXL_QUIET_RETURN_IF_ERROR(
+          visitor->Bool(true, &save_before_color_transform));
+      if (!save_before_color_transform &&
+          (frame_size.xsize < nonserialized_metadata->xsize() ||
+           frame_size.ysize < nonserialized_metadata->ysize() ||
+           frame_origin.x0 != 0 || frame_origin.y0 != 0)) {
+        return JXL_FAILURE(
+            "non-patch reference frame with invalid crop: %" PRIuS "x%" PRIuS
+            "%+d%+d",
+            static_cast<size_t>(frame_size.xsize),
+            static_cast<size_t>(frame_size.ysize),
+            static_cast<int>(frame_origin.x0),
+            static_cast<int>(frame_origin.y0));
+      }
+    }
+  } else {
+    save_before_color_transform = true;
+  }
+
+  JXL_QUIET_RETURN_IF_ERROR(VisitNameString(visitor, &name));
+
+  loop_filter.nonserialized_is_modular = is_modular;
+  JXL_RETURN_IF_ERROR(visitor->VisitNested(&loop_filter));
+
+  JXL_QUIET_RETURN_IF_ERROR(visitor->BeginExtensions(&extensions));
+  // Extensions: in chronological order of being added to the format.
+  return visitor->EndExtensions();
+}
+
+std::string FrameHeader::DebugString() const {
+  std::ostringstream os;
+  os << (encoding == FrameEncoding::kVarDCT ? "VarDCT" : "Modular");
+  os << ",";
+  os << (frame_type == FrameType::kRegularFrame    ? "Regular"
+         : frame_type == FrameType::kDCFrame       ? "DC"
+         : frame_type == FrameType::kReferenceOnly ? "Reference"
+                                                   : "SkipProgressive");
+  if (frame_type == FrameType::kDCFrame) {
+    os << "(lv" << dc_level << ")";
+  }
+
+  if (flags) {
+    os << ",";
+    uint32_t remaining = flags;
+
+#define TEST_FLAG(name)           \
+  if (flags & Flags::k##name) {   \
+    remaining &= ~Flags::k##name; \
+    os << #name;                  \
+    if (remaining) os << "|";     \
+  }
+    TEST_FLAG(Noise);
+    TEST_FLAG(Patches);
+    TEST_FLAG(Splines);
+    TEST_FLAG(UseDcFrame);
+    TEST_FLAG(SkipAdaptiveDCSmoothing);
+#undef TEST_FLAG
+  }
+
+  os << ",";
+  os << (color_transform == ColorTransform::kXYB     ? "XYB"
+         : color_transform == ColorTransform::kYCbCr ? "YCbCr"
+                                                     : "None");
+
+  if (encoding == FrameEncoding::kModular) {
+    os << ",shift=" << group_size_shift;
+  } else if (color_transform == ColorTransform::kXYB) {
+    os << ",qm=" << x_qm_scale << ";" << b_qm_scale;
+  }
+  if (frame_type != FrameType::kReferenceOnly) {
+    os << "," << passes.DebugString();
+  }
+  if (custom_size_or_origin) {
+    os << ",xs=" << frame_size.xsize;
+    os << ",ys=" << frame_size.ysize;
+    if (frame_type == FrameType::kRegularFrame ||
+        frame_type == FrameType::kSkipProgressive) {
+      os << ",x0=" << frame_origin.x0;
+      os << ",y0=" << frame_origin.y0;
+    }
+  }
+  if (upsampling > 1) os << ",up=" << upsampling;
+  if (loop_filter.gab) os << ",Gaborish";
+  if (loop_filter.epf_iters > 0) os << ",epf=" << loop_filter.epf_iters;
+  if (animation_frame.duration > 0) os << ",dur=" << animation_frame.duration;
+  if (frame_type == FrameType::kRegularFrame ||
+      frame_type == FrameType::kSkipProgressive) {
+    os << ",";
+    os << blending_info.DebugString();
+    for (size_t i = 0; i < extra_channel_blending_info.size(); ++i) {
+      os << (i == 0 ? "[" : ";");
+      os << extra_channel_blending_info[i].DebugString();
+      if (i + 1 == extra_channel_blending_info.size()) os << "]";
+    }
+  }
+  if (save_as_reference > 0) os << ",ref=" << save_as_reference;
+  os << "," << (save_before_color_transform ? "before" : "after") << "_ct";
+  if (is_last) os << ",last";
+  return os.str();
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/frame_header.h b/third_party/jpeg-xl/lib/jxl/frame_header.h
new file mode 100644
index 0000000000..5580bcd6fe
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/frame_header.h
@@ -0,0 +1,503 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_FRAME_HEADER_H_
+#define LIB_JXL_FRAME_HEADER_H_
+
+// Frame header with backward and forward-compatible extension capability and
+// compressed integer fields.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <string>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/override.h"
+#include "lib/jxl/base/padded_bytes.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/coeff_order_fwd.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/fields.h"
+#include "lib/jxl/image_metadata.h"
+#include "lib/jxl/loop_filter.h"
+
+namespace jxl {
+
+// TODO(eustas): move to proper place?
+// Also used by extra channel names.
+static inline Status VisitNameString(Visitor* JXL_RESTRICT visitor,
+                                     std::string* name) {
+  uint32_t name_length = static_cast<uint32_t>(name->length());
+  // Allows layer name lengths up to 1071 bytes
+  JXL_QUIET_RETURN_IF_ERROR(visitor->U32(Val(0), Bits(4), BitsOffset(5, 16),
+                                         BitsOffset(10, 48), 0, &name_length));
+  if (visitor->IsReading()) {
+    name->resize(name_length);
+  }
+  for (size_t i = 0; i < name_length; i++) {
+    uint32_t c = static_cast<uint8_t>((*name)[i]);
+    JXL_QUIET_RETURN_IF_ERROR(visitor->Bits(8, 0, &c));
+    (*name)[i] = static_cast<char>(c);
+  }
+  return true;
+}
+
+enum class FrameEncoding : uint32_t {
+  kVarDCT,
+  kModular,
+};
+
+enum class ColorTransform : uint32_t {
+  kXYB,    // Values are encoded with XYB. May only be used if
+           // ImageBundle::xyb_encoded.
+  kNone,   // Values are encoded according to the attached color profile. May
+           // only be used if !ImageBundle::xyb_encoded.
+  kYCbCr,  // Values are encoded according to the attached color profile, but
+           // transformed to YCbCr. May only be used if
+           // !ImageBundle::xyb_encoded.
+};
+
+inline std::array<int, 3> JpegOrder(ColorTransform ct, bool is_gray) {
+  if (is_gray) {
+    return {{0, 0, 0}};
+  }
+  JXL_ASSERT(ct != ColorTransform::kXYB);
+  if (ct == ColorTransform::kYCbCr) {
+    return {{1, 0, 2}};
+  } else {
+    return {{0, 1, 2}};
+  }
+}
+
+struct YCbCrChromaSubsampling : public Fields {
+  YCbCrChromaSubsampling();
+  JXL_FIELDS_NAME(YCbCrChromaSubsampling)
+  size_t HShift(size_t c) const { return maxhs_ - kHShift[channel_mode_[c]]; }
+  size_t VShift(size_t c) const { return maxvs_ - kVShift[channel_mode_[c]]; }
+
+  Status VisitFields(Visitor* JXL_RESTRICT visitor) override {
+    // TODO(veluca): consider allowing 4x downsamples
+    for (size_t i = 0; i < 3; i++) {
+      JXL_QUIET_RETURN_IF_ERROR(visitor->Bits(2, 0, &channel_mode_[i]));
+    }
+    Recompute();
+    return true;
+  }
+
+  uint8_t MaxHShift() const { return maxhs_; }
+  uint8_t MaxVShift() const { return maxvs_; }
+
+  uint8_t RawHShift(size_t c) const { return kHShift[channel_mode_[c]]; }
+  uint8_t RawVShift(size_t c) const { return kVShift[channel_mode_[c]]; }
+
+  // Uses JPEG channel order (Y, Cb, Cr).
+  Status Set(const uint8_t* hsample, const uint8_t* vsample) {
+    for (size_t c = 0; c < 3; c++) {
+      size_t cjpeg = c < 2 ? c ^ 1 : c;
+      size_t i = 0;
+      for (; i < 4; i++) {
+        if (1 << kHShift[i] == hsample[cjpeg] &&
+            1 << kVShift[i] == vsample[cjpeg]) {
+          channel_mode_[c] = i;
+          break;
+        }
+      }
+      if (i == 4) {
+        return JXL_FAILURE("Invalid subsample mode");
+      }
+    }
+    Recompute();
+    return true;
+  }
+
+  bool Is444() const {
+    return HShift(0) == 0 && VShift(0) == 0 &&  // Cb
+           HShift(2) == 0 && VShift(2) == 0 &&  // Cr
+           HShift(1) == 0 && VShift(1) == 0;    // Y
+  }
+
+  bool Is420() const {
+    return HShift(0) == 1 && VShift(0) == 1 &&  // Cb
+           HShift(2) == 1 && VShift(2) == 1 &&  // Cr
+           HShift(1) == 0 && VShift(1) == 0;    // Y
+  }
+
+  bool Is422() const {
+    return HShift(0) == 1 && VShift(0) == 0 &&  // Cb
+           HShift(2) == 1 && VShift(2) == 0 &&  // Cr
+           HShift(1) == 0 && VShift(1) == 0;    // Y
+  }
+
+  bool Is440() const {
+    return HShift(0) == 0 && VShift(0) == 1 &&  // Cb
+           HShift(2) == 0 && VShift(2) == 1 &&  // Cr
+           HShift(1) == 0 && VShift(1) == 0;    // Y
+  }
+
+  std::string DebugString() const {
+    if (Is444()) return "444";
+    if (Is420()) return "420";
+    if (Is422()) return "422";
+    if (Is440()) return "440";
+    return "cs" + std::to_string(channel_mode_[0]) +
+           std::to_string(channel_mode_[1]) + std::to_string(channel_mode_[2]);
+  }
+
+ private:
+  void Recompute() {
+    maxhs_ = 0;
+    maxvs_ = 0;
+    for (size_t i = 0; i < 3; i++) {
+      maxhs_ = std::max(maxhs_, kHShift[channel_mode_[i]]);
+      maxvs_ = std::max(maxvs_, kVShift[channel_mode_[i]]);
+    }
+  }
+  static const uint8_t kHShift[4];
+  static const uint8_t kVShift[4];
+  uint32_t channel_mode_[3];
+  uint8_t maxhs_;
+  uint8_t maxvs_;
+};
+
+// Indicates how to combine the current frame with a previously-saved one. Can
+// be independently controlled for color and extra channels. Formulas are
+// indicative and treat alpha as if it is in range 0.0-1.0. In descriptions
+// below, alpha channel is the extra channel of type alpha used for blending
+// according to the blend_channel, or fully opaque if there is no alpha channel.
+// The blending specified here is used for performing blending *after* color
+// transforms - in linear sRGB if blending a XYB-encoded frame on another
+// XYB-encoded frame, in sRGB if blending a frame with kColorSpace == kSRGB, or
+// in the original colorspace otherwise. Blending in XYB or YCbCr is done by
+// using patches.
+enum class BlendMode {
+  // The new values (in the crop) replace the old ones: sample = new
+  kReplace = 0,
+  // The new values (in the crop) get added to the old ones: sample = old + new
+  kAdd = 1,
+  // The new values (in the crop) replace the old ones if alpha>0:
+  // For the alpha channel that is used as source:
+  // alpha = old + new * (1 - old)
+  // For other channels if !alpha_associated:
+  // sample = ((1 - new_alpha) * old * old_alpha + new_alpha * new) / alpha
+  // For other channels if alpha_associated:
+  // sample = (1 - new_alpha) * old + new
+  // The alpha formula applies to the alpha used for the division in the other
+  // channels formula, and applies to the alpha channel itself if its
+  // blend_channel value matches itself.
+  kBlend = 2,
+  // The new values (in the crop) are added to the old ones if alpha>0:
+  // For the alpha channel that is used as source:
+  // sample = sample = old + new * (1 - old)
+  // For other channels: sample = old + alpha * new
+  kAlphaWeightedAdd = 3,
+  // The new values (in the crop) get multiplied by the old ones:
+  // sample = old * new
+  // The range of the new value matters for multiplication purposes, and its
+  // nominal range of 0..1 is computed the same way as this is done for the
+  // alpha values in kBlend and kAlphaWeightedAdd.
+  // If using kMul as a blend mode for color channels, no color transform is
+  // performed on the current frame.
+  kMul = 4,
+};
+
+struct BlendingInfo : public Fields {
+  BlendingInfo();
+  JXL_FIELDS_NAME(BlendingInfo)
+  Status VisitFields(Visitor* JXL_RESTRICT visitor) override;
+  BlendMode mode;
+  // Which extra channel to use as alpha channel for blending, only encoded
+  // for blend modes that involve alpha and if there are more than 1 extra
+  // channels.
+  uint32_t alpha_channel;
+  // Clamp alpha or channel values to 0-1 range.
+  bool clamp;
+  // Frame ID to copy from (0-3). Only encoded if blend_mode is not kReplace.
+  uint32_t source;
+
+  std::string DebugString() const;
+
+  size_t nonserialized_num_extra_channels = 0;
+  bool nonserialized_is_partial_frame = false;
+};
+
+// Origin of the current frame. Not present for frames of type
+// kOnlyPatches.
+struct FrameOrigin {
+  int32_t x0, y0;  // can be negative.
+};
+
+// Size of the current frame.
+struct FrameSize {
+  uint32_t xsize, ysize;
+};
+
+// AnimationFrame defines duration of animation frames.
+struct AnimationFrame : public Fields {
+  explicit AnimationFrame(const CodecMetadata* metadata);
+  JXL_FIELDS_NAME(AnimationFrame)
+
+  Status VisitFields(Visitor* JXL_RESTRICT visitor) override;
+
+  // How long to wait [in ticks, see Animation{}] after rendering.
+  // May be 0 if the current frame serves as a foundation for another frame.
+  uint32_t duration;
+
+  uint32_t timecode;  // 0xHHMMSSFF
+
+  // Must be set to the one ImageMetadata acting as the full codestream header,
+  // with correct xyb_encoded, list of extra channels, etc...
+  const CodecMetadata* nonserialized_metadata = nullptr;
+};
+
+// For decoding to lower resolutions. Only used for kRegular frames.
+struct Passes : public Fields {
+  Passes();
+  JXL_FIELDS_NAME(Passes)
+
+  Status VisitFields(Visitor* JXL_RESTRICT visitor) override;
+
+  void GetDownsamplingBracket(size_t pass, int& minShift, int& maxShift) const {
+    maxShift = 2;
+    minShift = 3;
+    for (size_t i = 0;; i++) {
+      for (uint32_t j = 0; j < num_downsample; ++j) {
+        if (i == last_pass[j]) {
+          if (downsample[j] == 8) minShift = 3;
+          if (downsample[j] == 4) minShift = 2;
+          if (downsample[j] == 2) minShift = 1;
+          if (downsample[j] == 1) minShift = 0;
+        }
+      }
+      if (i == num_passes - 1) minShift = 0;
+      if (i == pass) return;
+      maxShift = minShift - 1;
+    }
+  }
+
+  uint32_t GetDownsamplingTargetForCompletedPasses(uint32_t num_p) const {
+    if (num_p >= num_passes) return 1;
+    uint32_t retval = 8;
+    for (uint32_t i = 0; i < num_downsample; ++i) {
+      if (num_p > last_pass[i]) {
+        retval = std::min(retval, downsample[i]);
+      }
+    }
+    return retval;
+  }
+
+  std::string DebugString() const;
+
+  uint32_t num_passes;      // <= kMaxNumPasses
+  uint32_t num_downsample;  // <= num_passes
+
+  // Array of num_downsample pairs. downsample=1/last_pass=num_passes-1 and
+  // downsample=8/last_pass=0 need not be specified; they are implicit.
+  uint32_t downsample[kMaxNumPasses];
+  uint32_t last_pass[kMaxNumPasses];
+  // Array of shift values for each pass. It is implicitly assumed to be 0 for
+  // the last pass.
+  uint32_t shift[kMaxNumPasses];
+};
+
+enum FrameType {
+  // A "regular" frame: might be a crop, and will be blended on a previous
+  // frame, if any, and displayed or blended in future frames.
+  kRegularFrame = 0,
+  // A DC frame: this frame is downsampled and will be *only* used as the DC of
+  // a future frame and, possibly, for previews. Cannot be cropped, blended, or
+  // referenced by patches or blending modes. Frames that *use* a DC frame
+  // cannot have non-default sizes either.
+  kDCFrame = 1,
+  // A PatchesSource frame: this frame will be only used as a source frame for
+  // taking patches. Can be cropped, but cannot have non-(0, 0) x0 and y0.
+  kReferenceOnly = 2,
+  // Same as kRegularFrame, but not used for progressive rendering. This also
+  // implies no early display of DC.
+  kSkipProgressive = 3,
+};
+
+// Image/frame := one of more of these, where the last has is_last = true.
+// Starts at a byte-aligned address "a"; the next pass starts at "a + size".
+struct FrameHeader : public Fields {
+  // Optional postprocessing steps. These flags are the source of truth;
+  // Override must set/clear them rather than change their meaning. Values
+  // chosen such that typical flags == 0 (encoded in only two bits).
+  enum Flags {
+    // Often but not always off => low bit value:
+
+    // Inject noise into decoded output.
+    kNoise = 1,
+
+    // Overlay patches.
+    kPatches = 2,
+
+    // 4, 8 = reserved for future sometimes-off
+
+    // Overlay splines.
+    kSplines = 16,
+
+    kUseDcFrame = 32,  // Implies kSkipAdaptiveDCSmoothing.
+
+    // 64 = reserved for future often-off
+
+    // Almost always on => negated:
+
+    kSkipAdaptiveDCSmoothing = 128,
+  };
+
+  explicit FrameHeader(const CodecMetadata* metadata);
+  JXL_FIELDS_NAME(FrameHeader)
+
+  Status VisitFields(Visitor* JXL_RESTRICT visitor) override;
+
+  // Sets/clears `flag` based upon `condition`.
+  void UpdateFlag(const bool condition, const uint64_t flag) {
+    if (condition) {
+      flags |= flag;
+    } else {
+      flags &= ~flag;
+    }
+  }
+
+  // Returns true if this frame is supposed to be saved for future usage by
+  // other frames.
+  bool CanBeReferenced() const {
+    // DC frames cannot be referenced. The last frame cannot be referenced. A
+    // duration 0 frame makes little sense if it is not referenced. A
+    // non-duration 0 frame may or may not be referenced.
+    return !is_last && frame_type != FrameType::kDCFrame &&
+           (animation_frame.duration == 0 || save_as_reference != 0);
+  }
+
+  mutable bool all_default;
+
+  // Always present
+  FrameEncoding encoding;
+  // Some versions of UBSAN complain in VisitFrameType if not initialized.
+  FrameType frame_type = FrameType::kRegularFrame;
+
+  uint64_t flags;
+
+  ColorTransform color_transform;
+  YCbCrChromaSubsampling chroma_subsampling;
+
+  uint32_t group_size_shift;  // only if encoding == kModular;
+
+  uint32_t x_qm_scale;  // only if VarDCT and color_transform == kXYB
+  uint32_t b_qm_scale;  // only if VarDCT and color_transform == kXYB
+
+  std::string name;
+
+  // Skipped for kReferenceOnly.
+  Passes passes;
+
+  // Skipped for kDCFrame
+  bool custom_size_or_origin;
+  FrameSize frame_size;
+
+  // upsampling factors for color and extra channels.
+  // Upsampling is always performed before applying any inverse color transform.
+  // Skipped (1) if kUseDCFrame
+  uint32_t upsampling;
+  std::vector<uint32_t> extra_channel_upsampling;
+
+  // Only for kRegular frames.
+  FrameOrigin frame_origin;
+
+  BlendingInfo blending_info;
+  std::vector<BlendingInfo> extra_channel_blending_info;
+
+  // Animation info for this frame.
+  AnimationFrame animation_frame;
+
+  // This is the last frame.
+  bool is_last;
+
+  // ID to refer to this frame with. 0-3, not present if kDCFrame.
+  // 0 has a special meaning for kRegular frames of nonzero duration: it defines
+  // a frame that will not be referenced in the future.
+  uint32_t save_as_reference;
+
+  // Whether to save this frame before or after the color transform. A frame
+  // that is saved before the color tansform can only be used for blending
+  // through patches. On the contrary, a frame that is saved after the color
+  // transform can only be used for blending through blending modes.
+  // Irrelevant for extra channel blending. Can only be true if
+  // blending_info.mode == kReplace and this is not a partial kRegularFrame; if
+  // this is a DC frame, it is always true.
+  bool save_before_color_transform;
+
+  uint32_t dc_level;  // 1-4 if kDCFrame (0 otherwise).
+
+  // Must be set to the one ImageMetadata acting as the full codestream header,
+  // with correct xyb_encoded, list of extra channels, etc...
+  const CodecMetadata* nonserialized_metadata = nullptr;
+
+  // NOTE: This is ignored by AllDefault.
+  LoopFilter loop_filter;
+
+  bool nonserialized_is_preview = false;
+
+  size_t default_xsize() const {
+    if (!nonserialized_metadata) return 0;
+    if (nonserialized_is_preview) {
+      return nonserialized_metadata->m.preview_size.xsize();
+    }
+    return nonserialized_metadata->xsize();
+  }
+
+  size_t default_ysize() const {
+    if (!nonserialized_metadata) return 0;
+    if (nonserialized_is_preview) {
+      return nonserialized_metadata->m.preview_size.ysize();
+    }
+    return nonserialized_metadata->ysize();
+  }
+
+  FrameDimensions ToFrameDimensions() const {
+    size_t xsize = default_xsize();
+    size_t ysize = default_ysize();
+
+    xsize = frame_size.xsize ? frame_size.xsize : xsize;
+    ysize = frame_size.ysize ? frame_size.ysize : ysize;
+
+    if (dc_level != 0) {
+      xsize = DivCeil(xsize, 1 << (3 * dc_level));
+      ysize = DivCeil(ysize, 1 << (3 * dc_level));
+    }
+
+    FrameDimensions frame_dim;
+    frame_dim.Set(xsize, ysize, group_size_shift,
+                  chroma_subsampling.MaxHShift(),
+                  chroma_subsampling.MaxVShift(),
+                  encoding == FrameEncoding::kModular, upsampling);
+    return frame_dim;
+  }
+
+  // True if a color transform should be applied to this frame.
+  bool needs_color_transform() const {
+    return !save_before_color_transform ||
+           frame_type == FrameType::kRegularFrame ||
+           frame_type == FrameType::kSkipProgressive;
+  }
+
+  std::string DebugString() const;
+
+  uint64_t extensions;
+};
+
+Status ReadFrameHeader(BitReader* JXL_RESTRICT reader,
+                       FrameHeader* JXL_RESTRICT frame);
+
+// Shared by enc/dec. 5F and 13 are by far the most common for d1/2/4/8, 0
+// ensures low overhead for small images.
+static constexpr U32Enc kOrderEnc =
+    U32Enc(Val(0x5F), Val(0x13), Val(0), Bits(kNumOrders));
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_FRAME_HEADER_H_
diff --git a/third_party/jpeg-xl/lib/jxl/gamma_correct_test.cc b/third_party/jpeg-xl/lib/jxl/gamma_correct_test.cc
new file mode 100644
index 0000000000..131ec4fa83
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/gamma_correct_test.cc
@@ -0,0 +1,37 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <stdlib.h>
+
+#include <algorithm>
+
+#include "lib/jxl/enc_gamma_correct.h"
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+namespace {
+
+TEST(GammaCorrectTest, TestLinearToSrgbEdgeCases) {
+  EXPECT_EQ(0, LinearToSrgb8Direct(0.0));
+  EXPECT_NEAR(0, LinearToSrgb8Direct(1E-6f), 2E-5);
+  EXPECT_EQ(0, LinearToSrgb8Direct(-1E-6f));
+  EXPECT_EQ(0, LinearToSrgb8Direct(-1E6));
+  EXPECT_NEAR(1, LinearToSrgb8Direct(1 - 1E-6f), 1E-5);
+  EXPECT_EQ(1, LinearToSrgb8Direct(1 + 1E-6f));
+  EXPECT_EQ(1, LinearToSrgb8Direct(1E6));
+}
+
+TEST(GammaCorrectTest, TestRoundTrip) {
+  // NOLINTNEXTLINE(clang-analyzer-security.FloatLoopCounter)
+  for (double linear = 0.0; linear <= 1.0; linear += 1E-7) {
+    const double srgb = LinearToSrgb8Direct(linear);
+    const double linear2 = Srgb8ToLinearDirect(srgb);
+    ASSERT_LT(std::abs(linear - linear2), 2E-13)
+        << "linear = " << linear << ", linear2 = " << linear2;
+  }
+}
+
+}  // namespace
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/gauss_blur.cc b/third_party/jpeg-xl/lib/jxl/gauss_blur.cc
new file mode 100644
index 0000000000..82384e4c64
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/gauss_blur.cc
@@ -0,0 +1,623 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/gauss_blur.h"
+
+#include <string.h>
+
+#include <algorithm>
+#include <cmath>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/gauss_blur.cc"
+#include <hwy/cache_control.h>
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/profiler.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/image_ops.h"
+#include "lib/jxl/matrix_ops.h"
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Add;
+using hwy::HWY_NAMESPACE::Broadcast;
+using hwy::HWY_NAMESPACE::GetLane;
+using hwy::HWY_NAMESPACE::Mul;
+using hwy::HWY_NAMESPACE::MulAdd;
+using hwy::HWY_NAMESPACE::NegMulSub;
+#if HWY_TARGET != HWY_SCALAR
+using hwy::HWY_NAMESPACE::ShiftLeftLanes;
+#endif
+using hwy::HWY_NAMESPACE::Vec;
+
+void FastGaussian1D(const hwy::AlignedUniquePtr<RecursiveGaussian>& rg,
+                    const float* JXL_RESTRICT in, intptr_t width,
+                    float* JXL_RESTRICT out) {
+  // Although the current output depends on the previous output, we can unroll
+  // up to 4x by precomputing up to fourth powers of the constants. Beyond that,
+  // numerical precision might become a problem. Macro because this is tested
+  // in #if alongside HWY_TARGET.
+#define JXL_GAUSS_MAX_LANES 4
+  using D = HWY_CAPPED(float, JXL_GAUSS_MAX_LANES);
+  using V = Vec<D>;
+  const D d;
+  const V mul_in_1 = Load(d, rg->mul_in + 0 * 4);
+  const V mul_in_3 = Load(d, rg->mul_in + 1 * 4);
+  const V mul_in_5 = Load(d, rg->mul_in + 2 * 4);
+  const V mul_prev_1 = Load(d, rg->mul_prev + 0 * 4);
+  const V mul_prev_3 = Load(d, rg->mul_prev + 1 * 4);
+  const V mul_prev_5 = Load(d, rg->mul_prev + 2 * 4);
+  const V mul_prev2_1 = Load(d, rg->mul_prev2 + 0 * 4);
+  const V mul_prev2_3 = Load(d, rg->mul_prev2 + 1 * 4);
+  const V mul_prev2_5 = Load(d, rg->mul_prev2 + 2 * 4);
+  V prev_1 = Zero(d);
+  V prev_3 = Zero(d);
+  V prev_5 = Zero(d);
+  V prev2_1 = Zero(d);
+  V prev2_3 = Zero(d);
+  V prev2_5 = Zero(d);
+
+  const intptr_t N = rg->radius;
+
+  intptr_t n = -N + 1;
+  // Left side with bounds checks and only write output after n >= 0.
+  const intptr_t first_aligned = RoundUpTo(N + 1, Lanes(d));
+  for (; n < std::min(first_aligned, width); ++n) {
+    const intptr_t left = n - N - 1;
+    const intptr_t right = n + N - 1;
+    const float left_val = left >= 0 ? in[left] : 0.0f;
+    const float right_val = right < width ? in[right] : 0.0f;
+    const V sum = Set(d, left_val + right_val);
+
+    // (Only processing a single lane here, no need to broadcast)
+    V out_1 = Mul(sum, mul_in_1);
+    V out_3 = Mul(sum, mul_in_3);
+    V out_5 = Mul(sum, mul_in_5);
+
+    out_1 = MulAdd(mul_prev2_1, prev2_1, out_1);
+    out_3 = MulAdd(mul_prev2_3, prev2_3, out_3);
+    out_5 = MulAdd(mul_prev2_5, prev2_5, out_5);
+    prev2_1 = prev_1;
+    prev2_3 = prev_3;
+    prev2_5 = prev_5;
+
+    out_1 = MulAdd(mul_prev_1, prev_1, out_1);
+    out_3 = MulAdd(mul_prev_3, prev_3, out_3);
+    out_5 = MulAdd(mul_prev_5, prev_5, out_5);
+    prev_1 = out_1;
+    prev_3 = out_3;
+    prev_5 = out_5;
+
+    if (n >= 0) {
+      out[n] = GetLane(Add(out_1, Add(out_3, out_5)));
+    }
+  }
+
+  // The above loop is effectively scalar but it is convenient to use the same
+  // prev/prev2 variables, so broadcast to each lane before the unrolled loop.
+#if HWY_TARGET != HWY_SCALAR && JXL_GAUSS_MAX_LANES > 1
+  prev2_1 = Broadcast<0>(prev2_1);
+  prev2_3 = Broadcast<0>(prev2_3);
+  prev2_5 = Broadcast<0>(prev2_5);
+  prev_1 = Broadcast<0>(prev_1);
+  prev_3 = Broadcast<0>(prev_3);
+  prev_5 = Broadcast<0>(prev_5);
+#endif
+
+  // Unrolled, no bounds checking needed.
+  for (; n < width - N + 1 - (JXL_GAUSS_MAX_LANES - 1); n += Lanes(d)) {
+    const V sum = Add(LoadU(d, in + n - N - 1), LoadU(d, in + n + N - 1));
+
+    // To get a vector of output(s), we multiply broadcasted vectors (of each
+    // input plus the two previous outputs) and add them all together.
+    // Incremental broadcasting and shifting is expected to be cheaper than
+    // horizontal adds or transposing 4x4 values because they run on a different
+    // port, concurrently with the FMA.
+    const V in0 = Broadcast<0>(sum);
+    V out_1 = Mul(in0, mul_in_1);
+    V out_3 = Mul(in0, mul_in_3);
+    V out_5 = Mul(in0, mul_in_5);
+
+#if HWY_TARGET != HWY_SCALAR && JXL_GAUSS_MAX_LANES >= 2
+    const V in1 = Broadcast<1>(sum);
+    out_1 = MulAdd(ShiftLeftLanes<1>(mul_in_1), in1, out_1);
+    out_3 = MulAdd(ShiftLeftLanes<1>(mul_in_3), in1, out_3);
+    out_5 = MulAdd(ShiftLeftLanes<1>(mul_in_5), in1, out_5);
+
+#if JXL_GAUSS_MAX_LANES >= 4
+    const V in2 = Broadcast<2>(sum);
+    out_1 = MulAdd(ShiftLeftLanes<2>(mul_in_1), in2, out_1);
+    out_3 = MulAdd(ShiftLeftLanes<2>(mul_in_3), in2, out_3);
+    out_5 = MulAdd(ShiftLeftLanes<2>(mul_in_5), in2, out_5);
+
+    const V in3 = Broadcast<3>(sum);
+    out_1 = MulAdd(ShiftLeftLanes<3>(mul_in_1), in3, out_1);
+    out_3 = MulAdd(ShiftLeftLanes<3>(mul_in_3), in3, out_3);
+    out_5 = MulAdd(ShiftLeftLanes<3>(mul_in_5), in3, out_5);
+#endif
+#endif
+
+    out_1 = MulAdd(mul_prev2_1, prev2_1, out_1);
+    out_3 = MulAdd(mul_prev2_3, prev2_3, out_3);
+    out_5 = MulAdd(mul_prev2_5, prev2_5, out_5);
+
+    out_1 = MulAdd(mul_prev_1, prev_1, out_1);
+    out_3 = MulAdd(mul_prev_3, prev_3, out_3);
+    out_5 = MulAdd(mul_prev_5, prev_5, out_5);
+#if HWY_TARGET == HWY_SCALAR || JXL_GAUSS_MAX_LANES == 1
+    prev2_1 = prev_1;
+    prev2_3 = prev_3;
+    prev2_5 = prev_5;
+    prev_1 = out_1;
+    prev_3 = out_3;
+    prev_5 = out_5;
+#else
+    prev2_1 = Broadcast<JXL_GAUSS_MAX_LANES - 2>(out_1);
+    prev2_3 = Broadcast<JXL_GAUSS_MAX_LANES - 2>(out_3);
+    prev2_5 = Broadcast<JXL_GAUSS_MAX_LANES - 2>(out_5);
+    prev_1 = Broadcast<JXL_GAUSS_MAX_LANES - 1>(out_1);
+    prev_3 = Broadcast<JXL_GAUSS_MAX_LANES - 1>(out_3);
+    prev_5 = Broadcast<JXL_GAUSS_MAX_LANES - 1>(out_5);
+#endif
+
+    Store(Add(out_1, Add(out_3, out_5)), d, out + n);
+  }
+
+  // Remainder handling with bounds checks
+  for (; n < width; ++n) {
+    const intptr_t left = n - N - 1;
+    const intptr_t right = n + N - 1;
+    const float left_val = left >= 0 ? in[left] : 0.0f;
+    const float right_val = right < width ? in[right] : 0.0f;
+    const V sum = Set(d, left_val + right_val);
+
+    // (Only processing a single lane here, no need to broadcast)
+    V out_1 = Mul(sum, mul_in_1);
+    V out_3 = Mul(sum, mul_in_3);
+    V out_5 = Mul(sum, mul_in_5);
+
+    out_1 = MulAdd(mul_prev2_1, prev2_1, out_1);
+    out_3 = MulAdd(mul_prev2_3, prev2_3, out_3);
+    out_5 = MulAdd(mul_prev2_5, prev2_5, out_5);
+    prev2_1 = prev_1;
+    prev2_3 = prev_3;
+    prev2_5 = prev_5;
+
+    out_1 = MulAdd(mul_prev_1, prev_1, out_1);
+    out_3 = MulAdd(mul_prev_3, prev_3, out_3);
+    out_5 = MulAdd(mul_prev_5, prev_5, out_5);
+    prev_1 = out_1;
+    prev_3 = out_3;
+    prev_5 = out_5;
+
+    out[n] = GetLane(Add(out_1, Add(out_3, out_5)));
+  }
+}
+
+// Ring buffer is for n, n-1, n-2; round up to 4 for faster modulo.
+constexpr size_t kMod = 4;
+
+// Avoids an unnecessary store during warmup.
+struct OutputNone {
+  template <class V>
+  void operator()(const V& /*unused*/, float* JXL_RESTRICT /*pos*/,
+                  ptrdiff_t /*offset*/) const {}
+};
+
+// Common case: write output vectors in all VerticalBlock except warmup.
+struct OutputStore {
+  template <class V>
+  void operator()(const V& out, float* JXL_RESTRICT pos,
+                  ptrdiff_t offset) const {
+    // Stream helps for large images but is slower for images that fit in cache.
+    Store(out, HWY_FULL(float)(), pos + offset);
+  }
+};
+
+// At top/bottom borders, we don't have two inputs to load, so avoid addition.
+// pos may even point to all zeros if the row is outside the input image.
+class SingleInput {
+ public:
+  explicit SingleInput(const float* pos) : pos_(pos) {}
+  Vec<HWY_FULL(float)> operator()(const size_t offset) const {
+    return Load(HWY_FULL(float)(), pos_ + offset);
+  }
+  const float* pos_;
+};
+
+// In the middle of the image, we need to load from a row above and below, and
+// return the sum.
+class TwoInputs {
+ public:
+  TwoInputs(const float* pos1, const float* pos2) : pos1_(pos1), pos2_(pos2) {}
+  Vec<HWY_FULL(float)> operator()(const size_t offset) const {
+    const auto in1 = Load(HWY_FULL(float)(), pos1_ + offset);
+    const auto in2 = Load(HWY_FULL(float)(), pos2_ + offset);
+    return Add(in1, in2);
+  }
+
+ private:
+  const float* pos1_;
+  const float* pos2_;
+};
+
+// Block := kVectors consecutive full vectors (one cache line except on the
+// right boundary, where we can only rely on having one vector). Unrolling to
+// the cache line size improves cache utilization.
+template <size_t kVectors, class V, class Input, class Output>
+void VerticalBlock(const V& d1_1, const V& d1_3, const V& d1_5, const V& n2_1,
+                   const V& n2_3, const V& n2_5, const Input& input,
+                   size_t& ctr, float* ring_buffer, const Output output,
+                   float* JXL_RESTRICT out_pos) {
+  const HWY_FULL(float) d;
+  constexpr size_t kVN = MaxLanes(d);
+  // More cache-friendly to process an entirely cache line at a time
+  constexpr size_t kLanes = kVectors * kVN;
+
+  float* JXL_RESTRICT y_1 = ring_buffer + 0 * kLanes * kMod;
+  float* JXL_RESTRICT y_3 = ring_buffer + 1 * kLanes * kMod;
+  float* JXL_RESTRICT y_5 = ring_buffer + 2 * kLanes * kMod;
+
+  const size_t n_0 = (++ctr) % kMod;
+  const size_t n_1 = (ctr - 1) % kMod;
+  const size_t n_2 = (ctr - 2) % kMod;
+
+  for (size_t idx_vec = 0; idx_vec < kVectors; ++idx_vec) {
+    const V sum = input(idx_vec * kVN);
+
+    const V y_n1_1 = Load(d, y_1 + kLanes * n_1 + idx_vec * kVN);
+    const V y_n1_3 = Load(d, y_3 + kLanes * n_1 + idx_vec * kVN);
+    const V y_n1_5 = Load(d, y_5 + kLanes * n_1 + idx_vec * kVN);
+    const V y_n2_1 = Load(d, y_1 + kLanes * n_2 + idx_vec * kVN);
+    const V y_n2_3 = Load(d, y_3 + kLanes * n_2 + idx_vec * kVN);
+    const V y_n2_5 = Load(d, y_5 + kLanes * n_2 + idx_vec * kVN);
+    // (35)
+    const V y1 = MulAdd(n2_1, sum, NegMulSub(d1_1, y_n1_1, y_n2_1));
+    const V y3 = MulAdd(n2_3, sum, NegMulSub(d1_3, y_n1_3, y_n2_3));
+    const V y5 = MulAdd(n2_5, sum, NegMulSub(d1_5, y_n1_5, y_n2_5));
+    Store(y1, d, y_1 + kLanes * n_0 + idx_vec * kVN);
+    Store(y3, d, y_3 + kLanes * n_0 + idx_vec * kVN);
+    Store(y5, d, y_5 + kLanes * n_0 + idx_vec * kVN);
+    output(Add(y1, Add(y3, y5)), out_pos, idx_vec * kVN);
+  }
+  // NOTE: flushing cache line out_pos hurts performance - less so with
+  // clflushopt than clflush but still a significant slowdown.
+}
+
+// Reads/writes one block (kVectors full vectors) in each row.
+template <size_t kVectors>
+void VerticalStrip(const hwy::AlignedUniquePtr<RecursiveGaussian>& rg,
+                   const ImageF& in, const size_t x, ImageF* JXL_RESTRICT out) {
+  // We're iterating vertically, so use multiple full-length vectors (each lane
+  // is one column of row n).
+  using D = HWY_FULL(float);
+  using V = Vec<D>;
+  const D d;
+  constexpr size_t kVN = MaxLanes(d);
+  // More cache-friendly to process an entirely cache line at a time
+  constexpr size_t kLanes = kVectors * kVN;
+#if HWY_TARGET == HWY_SCALAR
+  const V d1_1 = Set(d, rg->d1[0 * 4]);
+  const V d1_3 = Set(d, rg->d1[1 * 4]);
+  const V d1_5 = Set(d, rg->d1[2 * 4]);
+  const V n2_1 = Set(d, rg->n2[0 * 4]);
+  const V n2_3 = Set(d, rg->n2[1 * 4]);
+  const V n2_5 = Set(d, rg->n2[2 * 4]);
+#else
+  const V d1_1 = LoadDup128(d, rg->d1 + 0 * 4);
+  const V d1_3 = LoadDup128(d, rg->d1 + 1 * 4);
+  const V d1_5 = LoadDup128(d, rg->d1 + 2 * 4);
+  const V n2_1 = LoadDup128(d, rg->n2 + 0 * 4);
+  const V n2_3 = LoadDup128(d, rg->n2 + 1 * 4);
+  const V n2_5 = LoadDup128(d, rg->n2 + 2 * 4);
+#endif
+
+  const size_t N = rg->radius;
+  const size_t ysize = in.ysize();
+
+  size_t ctr = 0;
+  HWY_ALIGN float ring_buffer[3 * kLanes * kMod] = {0};
+  HWY_ALIGN static constexpr float zero[kLanes] = {0};
+
+  // Warmup: top is out of bounds (zero padded), bottom is usually in-bounds.
+  ssize_t n = -static_cast<ssize_t>(N) + 1;
+  for (; n < 0; ++n) {
+    // bottom is always non-negative since n is initialized in -N + 1.
+    const size_t bottom = n + N - 1;
+    VerticalBlock<kVectors>(
+        d1_1, d1_3, d1_5, n2_1, n2_3, n2_5,
+        SingleInput(bottom < ysize ? in.ConstRow(bottom) + x : zero), ctr,
+        ring_buffer, OutputNone(), nullptr);
+  }
+  JXL_DASSERT(n >= 0);
+
+  // Start producing output; top is still out of bounds.
+  for (; static_cast<size_t>(n) < std::min(N + 1, ysize); ++n) {
+    const size_t bottom = n + N - 1;
+    VerticalBlock<kVectors>(
+        d1_1, d1_3, d1_5, n2_1, n2_3, n2_5,
+        SingleInput(bottom < ysize ? in.ConstRow(bottom) + x : zero), ctr,
+        ring_buffer, OutputStore(), out->Row(n) + x);
+  }
+
+  // Interior outputs with prefetching and without bounds checks.
+  constexpr size_t kPrefetchRows = 8;
+  for (; n < static_cast<ssize_t>(ysize - N + 1 - kPrefetchRows); ++n) {
+    const size_t top = n - N - 1;
+    const size_t bottom = n + N - 1;
+    VerticalBlock<kVectors>(
+        d1_1, d1_3, d1_5, n2_1, n2_3, n2_5,
+        TwoInputs(in.ConstRow(top) + x, in.ConstRow(bottom) + x), ctr,
+        ring_buffer, OutputStore(), out->Row(n) + x);
+    hwy::Prefetch(in.ConstRow(top + kPrefetchRows) + x);
+    hwy::Prefetch(in.ConstRow(bottom + kPrefetchRows) + x);
+  }
+
+  // Bottom border without prefetching and with bounds checks.
+  for (; static_cast<size_t>(n) < ysize; ++n) {
+    const size_t top = n - N - 1;
+    const size_t bottom = n + N - 1;
+    VerticalBlock<kVectors>(
+        d1_1, d1_3, d1_5, n2_1, n2_3, n2_5,
+        TwoInputs(in.ConstRow(top) + x,
+                  bottom < ysize ? in.ConstRow(bottom) + x : zero),
+        ctr, ring_buffer, OutputStore(), out->Row(n) + x);
+  }
+}
+
+// Apply 1D vertical scan to multiple columns (one per vector lane).
+// Not yet parallelized.
+void FastGaussianVertical(const hwy::AlignedUniquePtr<RecursiveGaussian>& rg,
+                          const ImageF& in, ThreadPool* /*pool*/,
+                          ImageF* JXL_RESTRICT out) {
+  PROFILER_FUNC;
+  JXL_CHECK(SameSize(in, *out));
+
+  constexpr size_t kCacheLineLanes = 64 / sizeof(float);
+  constexpr size_t kVN = MaxLanes(HWY_FULL(float)());
+  constexpr size_t kCacheLineVectors =
+      (kVN < kCacheLineLanes) ? (kCacheLineLanes / kVN) : 4;
+  constexpr size_t kFastPace = kCacheLineVectors * kVN;
+
+  size_t x = 0;
+  for (; x + kFastPace <= in.xsize(); x += kFastPace) {
+    VerticalStrip<kCacheLineVectors>(rg, in, x, out);
+  }
+  for (; x < in.xsize(); x += kVN) {
+    VerticalStrip<1>(rg, in, x, out);
+  }
+}
+
+// TODO(veluca): consider replacing with FastGaussian.
+ImageF ConvolveXSampleAndTranspose(const ImageF& in,
+                                   const std::vector<float>& kernel,
+                                   const size_t res) {
+  JXL_ASSERT(kernel.size() % 2 == 1);
+  JXL_ASSERT(in.xsize() % res == 0);
+  const size_t offset = res / 2;
+  const size_t out_xsize = in.xsize() / res;
+  ImageF out(in.ysize(), out_xsize);
+  const int r = kernel.size() / 2;
+  HWY_FULL(float) df;
+  std::vector<float> row_tmp(in.xsize() + 2 * r + Lanes(df));
+  float* const JXL_RESTRICT rowp = &row_tmp[r];
+  std::vector<float> padded_k = kernel;
+  padded_k.resize(padded_k.size() + Lanes(df));
+  const float* const kernelp = &padded_k[r];
+  for (size_t y = 0; y < in.ysize(); ++y) {
+    ExtrapolateBorders(in.Row(y), rowp, in.xsize(), r);
+    size_t x = offset, ox = 0;
+    for (; x < static_cast<uint32_t>(r) && x < in.xsize(); x += res, ++ox) {
+      float sum = 0.0f;
+      for (int i = -r; i <= r; ++i) {
+        sum += rowp[std::max<int>(
+                   0, std::min<int>(static_cast<int>(x) + i, in.xsize()))] *
+               kernelp[i];
+      }
+      out.Row(ox)[y] = sum;
+    }
+    for (; x + r < in.xsize(); x += res, ++ox) {
+      auto sum = Zero(df);
+      for (int i = -r; i <= r; i += Lanes(df)) {
+        sum = MulAdd(LoadU(df, rowp + x + i), LoadU(df, kernelp + i), sum);
+      }
+      out.Row(ox)[y] = GetLane(SumOfLanes(df, sum));
+    }
+    for (; x < in.xsize(); x += res, ++ox) {
+      float sum = 0.0f;
+      for (int i = -r; i <= r; ++i) {
+        sum += rowp[std::max<int>(
+                   0, std::min<int>(static_cast<int>(x) + i, in.xsize()))] *
+               kernelp[i];
+      }
+      out.Row(ox)[y] = sum;
+    }
+  }
+  return out;
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+HWY_EXPORT(FastGaussian1D);
+HWY_EXPORT(ConvolveXSampleAndTranspose);
+void FastGaussian1D(const hwy::AlignedUniquePtr<RecursiveGaussian>& rg,
+                    const float* JXL_RESTRICT in, intptr_t width,
+                    float* JXL_RESTRICT out) {
+  return HWY_DYNAMIC_DISPATCH(FastGaussian1D)(rg, in, width, out);
+}
+
+HWY_EXPORT(FastGaussianVertical);  // Local function.
+
+void ExtrapolateBorders(const float* const JXL_RESTRICT row_in,
+                        float* const JXL_RESTRICT row_out, const int xsize,
+                        const int radius) {
+  const int lastcol = xsize - 1;
+  for (int x = 1; x <= radius; ++x) {
+    row_out[-x] = row_in[std::min(x, xsize - 1)];
+  }
+  memcpy(row_out, row_in, xsize * sizeof(row_out[0]));
+  for (int x = 1; x <= radius; ++x) {
+    row_out[lastcol + x] = row_in[std::max(0, lastcol - x)];
+  }
+}
+
+ImageF ConvolveXSampleAndTranspose(const ImageF& in,
+                                   const std::vector<float>& kernel,
+                                   const size_t res) {
+  return HWY_DYNAMIC_DISPATCH(ConvolveXSampleAndTranspose)(in, kernel, res);
+}
+
+Image3F ConvolveXSampleAndTranspose(const Image3F& in,
+                                    const std::vector<float>& kernel,
+                                    const size_t res) {
+  return Image3F(ConvolveXSampleAndTranspose(in.Plane(0), kernel, res),
+                 ConvolveXSampleAndTranspose(in.Plane(1), kernel, res),
+                 ConvolveXSampleAndTranspose(in.Plane(2), kernel, res));
+}
+
+ImageF ConvolveAndSample(const ImageF& in, const std::vector<float>& kernel,
+                         const size_t res) {
+  ImageF tmp = ConvolveXSampleAndTranspose(in, kernel, res);
+  return ConvolveXSampleAndTranspose(tmp, kernel, res);
+}
+
+// Implements "Recursive Implementation of the Gaussian Filter Using Truncated
+// Cosine Functions" by Charalampidis [2016].
+hwy::AlignedUniquePtr<RecursiveGaussian> CreateRecursiveGaussian(double sigma) {
+  PROFILER_FUNC;
+  auto rg = hwy::MakeUniqueAligned<RecursiveGaussian>();
+  constexpr double kPi = 3.141592653589793238;
+
+  const double radius = roundf(3.2795 * sigma + 0.2546);  // (57), "N"
+
+  // Table I, first row
+  const double pi_div_2r = kPi / (2.0 * radius);
+  const double omega[3] = {pi_div_2r, 3.0 * pi_div_2r, 5.0 * pi_div_2r};
+
+  // (37), k={1,3,5}
+  const double p_1 = +1.0 / std::tan(0.5 * omega[0]);
+  const double p_3 = -1.0 / std::tan(0.5 * omega[1]);
+  const double p_5 = +1.0 / std::tan(0.5 * omega[2]);
+
+  // (44), k={1,3,5}
+  const double r_1 = +p_1 * p_1 / std::sin(omega[0]);
+  const double r_3 = -p_3 * p_3 / std::sin(omega[1]);
+  const double r_5 = +p_5 * p_5 / std::sin(omega[2]);
+
+  // (50), k={1,3,5}
+  const double neg_half_sigma2 = -0.5 * sigma * sigma;
+  const double recip_radius = 1.0 / radius;
+  double rho[3];
+  for (size_t i = 0; i < 3; ++i) {
+    rho[i] = std::exp(neg_half_sigma2 * omega[i] * omega[i]) * recip_radius;
+  }
+
+  // second part of (52), k1,k2 = 1,3; 3,5; 5,1
+  const double D_13 = p_1 * r_3 - r_1 * p_3;
+  const double D_35 = p_3 * r_5 - r_3 * p_5;
+  const double D_51 = p_5 * r_1 - r_5 * p_1;
+
+  // (52), k=5
+  const double recip_d13 = 1.0 / D_13;
+  const double zeta_15 = D_35 * recip_d13;
+  const double zeta_35 = D_51 * recip_d13;
+
+  double A[9] = {p_1,     p_3,     p_5,  //
+                 r_1,     r_3,     r_5,  //  (56)
+                 zeta_15, zeta_35, 1};
+  JXL_CHECK(Inv3x3Matrix(A));
+  const double gamma[3] = {1, radius * radius - sigma * sigma,  // (55)
+                           zeta_15 * rho[0] + zeta_35 * rho[1] + rho[2]};
+  double beta[3];
+  Mul3x3Vector(A, gamma, beta);  // (53)
+
+  // Sanity check: correctly solved for beta (IIR filter weights are normalized)
+  const double sum = beta[0] * p_1 + beta[1] * p_3 + beta[2] * p_5;  // (39)
+  JXL_ASSERT(std::abs(sum - 1) < 1E-12);
+  (void)sum;
+
+  rg->radius = static_cast<int>(radius);
+
+  double n2[3];
+  double d1[3];
+  for (size_t i = 0; i < 3; ++i) {
+    n2[i] = -beta[i] * std::cos(omega[i] * (radius + 1.0));  // (33)
+    d1[i] = -2.0 * std::cos(omega[i]);                       // (33)
+
+    for (size_t lane = 0; lane < 4; ++lane) {
+      rg->n2[4 * i + lane] = static_cast<float>(n2[i]);
+      rg->d1[4 * i + lane] = static_cast<float>(d1[i]);
+    }
+
+    const double d_2 = d1[i] * d1[i];
+
+    // Obtained by expanding (35) for four consecutive outputs via sympy:
+    // n, d, p, pp = symbols('n d p pp')
+    // i0, i1, i2, i3 = symbols('i0 i1 i2 i3')
+    // o0, o1, o2, o3 = symbols('o0 o1 o2 o3')
+    // o0 = n*i0 - d*p - pp
+    // o1 = n*i1 - d*o0 - p
+    // o2 = n*i2 - d*o1 - o0
+    // o3 = n*i3 - d*o2 - o1
+    // Then expand(o3) and gather terms for p(prev), pp(prev2) etc.
+    rg->mul_prev[4 * i + 0] = -d1[i];
+    rg->mul_prev[4 * i + 1] = d_2 - 1.0;
+    rg->mul_prev[4 * i + 2] = -d_2 * d1[i] + 2.0 * d1[i];
+    rg->mul_prev[4 * i + 3] = d_2 * d_2 - 3.0 * d_2 + 1.0;
+    rg->mul_prev2[4 * i + 0] = -1.0;
+    rg->mul_prev2[4 * i + 1] = d1[i];
+    rg->mul_prev2[4 * i + 2] = -d_2 + 1.0;
+    rg->mul_prev2[4 * i + 3] = d_2 * d1[i] - 2.0 * d1[i];
+    rg->mul_in[4 * i + 0] = n2[i];
+    rg->mul_in[4 * i + 1] = -d1[i] * n2[i];
+    rg->mul_in[4 * i + 2] = d_2 * n2[i] - n2[i];
+    rg->mul_in[4 * i + 3] = -d_2 * d1[i] * n2[i] + 2.0 * d1[i] * n2[i];
+  }
+  return rg;
+}
+
+namespace {
+
+// Apply 1D horizontal scan to each row.
+void FastGaussianHorizontal(const hwy::AlignedUniquePtr<RecursiveGaussian>& rg,
+                            const ImageF& in, ThreadPool* pool,
+                            ImageF* JXL_RESTRICT out) {
+  PROFILER_FUNC;
+  JXL_CHECK(SameSize(in, *out));
+
+  const intptr_t xsize = in.xsize();
+  JXL_CHECK(RunOnPool(
+      pool, 0, in.ysize(), ThreadPool::NoInit,
+      [&](const uint32_t task, size_t /*thread*/) {
+        const size_t y = task;
+        const float* row_in = in.ConstRow(y);
+        float* JXL_RESTRICT row_out = out->Row(y);
+        FastGaussian1D(rg, row_in, xsize, row_out);
+      },
+      "FastGaussianHorizontal"));
+}
+
+}  // namespace
+
+void FastGaussian(const hwy::AlignedUniquePtr<RecursiveGaussian>& rg,
+                  const ImageF& in, ThreadPool* pool, ImageF* JXL_RESTRICT temp,
+                  ImageF* JXL_RESTRICT out) {
+  FastGaussianHorizontal(rg, in, pool, temp);
+  HWY_DYNAMIC_DISPATCH(FastGaussianVertical)(rg, *temp, pool, out);
+}
+
+}  // namespace jxl
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jxl/gauss_blur.h b/third_party/jpeg-xl/lib/jxl/gauss_blur.h
new file mode 100644
index 0000000000..fb4741f03a
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/gauss_blur.h
@@ -0,0 +1,94 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_GAUSS_BLUR_H_
+#define LIB_JXL_GAUSS_BLUR_H_
+
+#include <stddef.h>
+
+#include <cmath>
+#include <hwy/aligned_allocator.h>
+#include <vector>
+
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/image.h"
+
+namespace jxl {
+
+template <typename T>
+std::vector<T> GaussianKernel(int radius, T sigma) {
+  JXL_ASSERT(sigma > 0.0);
+  std::vector<T> kernel(2 * radius + 1);
+  const T scaler = -1.0 / (2 * sigma * sigma);
+  double sum = 0.0;
+  for (int i = -radius; i <= radius; ++i) {
+    const T val = std::exp(scaler * i * i);
+    kernel[i + radius] = val;
+    sum += val;
+  }
+  for (size_t i = 0; i < kernel.size(); ++i) {
+    kernel[i] /= sum;
+  }
+  return kernel;
+}
+
+// All convolution functions below apply mirroring of the input on the borders
+// in the following way:
+//
+//     input: [a0 a1 a2 ...  aN]
+//     mirrored input: [aR ... a1 | a0 a1 a2 .... aN | aN-1 ... aN-R]
+//
+// where R is the radius of the kernel (i.e. kernel size is 2*R+1).
+
+// REQUIRES: in.xsize() and in.ysize() are integer multiples of res.
+ImageF ConvolveAndSample(const ImageF& in, const std::vector<float>& kernel,
+                         const size_t res);
+
+// Private, used by test.
+void ExtrapolateBorders(const float* const JXL_RESTRICT row_in,
+                        float* const JXL_RESTRICT row_out, const int xsize,
+                        const int radius);
+
+// Only for use by CreateRecursiveGaussian and FastGaussian*.
+#pragma pack(push, 1)
+struct RecursiveGaussian {
+  // For k={1,3,5} in that order, each broadcasted 4x for LoadDup128. Used only
+  // for vertical passes.
+  float n2[3 * 4];
+  float d1[3 * 4];
+
+  // We unroll horizontal passes 4x - one output per lane. These are each lane's
+  // multiplier for the previous output (relative to the first of the four
+  // outputs). Indexing: 4 * 0..2 (for {1,3,5}) + 0..3 for the lane index.
+  float mul_prev[3 * 4];
+  // Ditto for the second to last output.
+  float mul_prev2[3 * 4];
+
+  // We multiply a vector of inputs 0..3 by a vector shifted from this array.
+  // in=0 uses all 4 (nonzero) terms; for in=3, the lower three lanes are 0.
+  float mul_in[3 * 4];
+
+  size_t radius;
+};
+#pragma pack(pop)
+
+// Precomputation for FastGaussian*; users may use the same pointer/storage in
+// subsequent calls to FastGaussian* with the same sigma.
+hwy::AlignedUniquePtr<RecursiveGaussian> CreateRecursiveGaussian(double sigma);
+
+// 1D Gaussian with zero-pad boundary handling and runtime independent of sigma.
+void FastGaussian1D(const hwy::AlignedUniquePtr<RecursiveGaussian>& rg,
+                    const float* JXL_RESTRICT in, intptr_t width,
+                    float* JXL_RESTRICT out);
+
+// 2D Gaussian with zero-pad boundary handling and runtime independent of sigma.
+void FastGaussian(const hwy::AlignedUniquePtr<RecursiveGaussian>& rg,
+                  const ImageF& in, ThreadPool* pool, ImageF* JXL_RESTRICT temp,
+                  ImageF* JXL_RESTRICT out);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_GAUSS_BLUR_H_
diff --git a/third_party/jpeg-xl/lib/jxl/gauss_blur_gbench.cc b/third_party/jpeg-xl/lib/jxl/gauss_blur_gbench.cc
new file mode 100644
index 0000000000..b1bb64abc5
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/gauss_blur_gbench.cc
@@ -0,0 +1,126 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <hwy/targets.h>
+
+#include "benchmark/benchmark.h"
+#include "lib/jxl/convolve.h"
+#include "lib/jxl/gauss_blur.h"
+#include "lib/jxl/image_ops.h"
+
+namespace jxl {
+namespace {
+
+JXL_MAYBE_UNUSED ImageF Convolve(const ImageF& in,
+                                 const std::vector<float>& kernel) {
+  return ConvolveAndSample(in, kernel, 1);
+}
+
+void BM_GaussBlur1d(benchmark::State& state) {
+  // Uncomment to disable SIMD and force and scalar implementation
+  // hwy::DisableTargets(~HWY_SCALAR);
+  // Uncomment to run AVX2
+  // hwy::DisableTargets(HWY_AVX3);
+
+  const size_t length = state.range();
+  const double sigma = 7.0;  // (from Butteraugli application)
+  ImageF in(length, 1);
+  const float expected = length;
+  FillImage(expected, &in);
+
+  ImageF temp(length, 1);
+  ImageF out(length, 1);
+  const auto rg = CreateRecursiveGaussian(sigma);
+  for (auto _ : state) {
+    FastGaussian1D(rg, in.Row(0), length, out.Row(0));
+    // Prevent optimizing out
+    JXL_ASSERT(std::abs(out.ConstRow(0)[length / 2] - expected) / expected <
+               9E-5);
+  }
+  state.SetItemsProcessed(length * state.iterations());
+}
+
+void BM_GaussBlur2d(benchmark::State& state) {
+  // See GaussBlur1d for SIMD changes.
+
+  const size_t xsize = state.range();
+  const size_t ysize = xsize;
+  const double sigma = 7.0;  // (from Butteraugli application)
+  ImageF in(xsize, ysize);
+  const float expected = xsize + ysize;
+  FillImage(expected, &in);
+
+  ImageF temp(xsize, ysize);
+  ImageF out(xsize, ysize);
+  ThreadPool* null_pool = nullptr;
+  const auto rg = CreateRecursiveGaussian(sigma);
+  for (auto _ : state) {
+    FastGaussian(rg, in, null_pool, &temp, &out);
+    // Prevent optimizing out
+    JXL_ASSERT(std::abs(out.ConstRow(ysize / 2)[xsize / 2] - expected) /
+                   expected <
+               9E-5);
+  }
+  state.SetItemsProcessed(xsize * ysize * state.iterations());
+}
+
+void BM_GaussBlurFir(benchmark::State& state) {
+  // See GaussBlur1d for SIMD changes.
+
+  const size_t xsize = state.range();
+  const size_t ysize = xsize;
+  const double sigma = 7.0;  // (from Butteraugli application)
+  ImageF in(xsize, ysize);
+  const float expected = xsize + ysize;
+  FillImage(expected, &in);
+
+  ImageF temp(xsize, ysize);
+  ImageF out(xsize, ysize);
+  const std::vector<float> kernel =
+      GaussianKernel(static_cast<int>(4 * sigma), static_cast<float>(sigma));
+  for (auto _ : state) {
+    // Prevent optimizing out
+    JXL_ASSERT(std::abs(Convolve(in, kernel).ConstRow(ysize / 2)[xsize / 2] -
+                        expected) /
+                   expected <
+               9E-5);
+  }
+  state.SetItemsProcessed(xsize * ysize * state.iterations());
+}
+
+void BM_GaussBlurSep7(benchmark::State& state) {
+  // See GaussBlur1d for SIMD changes.
+
+  const size_t xsize = state.range();
+  const size_t ysize = xsize;
+  ImageF in(xsize, ysize);
+  const float expected = xsize + ysize;
+  FillImage(expected, &in);
+
+  ImageF temp(xsize, ysize);
+  ImageF out(xsize, ysize);
+  ThreadPool* null_pool = nullptr;
+  // Gaussian with sigma 1
+  const WeightsSeparable7 weights = {{HWY_REP4(0.383103f), HWY_REP4(0.241843f),
+                                      HWY_REP4(0.060626f), HWY_REP4(0.00598f)},
+                                     {HWY_REP4(0.383103f), HWY_REP4(0.241843f),
+                                      HWY_REP4(0.060626f), HWY_REP4(0.00598f)}};
+  for (auto _ : state) {
+    Separable7(in, Rect(in), weights, null_pool, &out);
+    // Prevent optimizing out
+    JXL_ASSERT(std::abs(out.ConstRow(ysize / 2)[xsize / 2] - expected) /
+                   expected <
+               9E-5);
+  }
+  state.SetItemsProcessed(xsize * ysize * state.iterations());
+}
+
+BENCHMARK(BM_GaussBlur1d)->Range(1 << 8, 1 << 14);
+BENCHMARK(BM_GaussBlur2d)->Range(1 << 7, 1 << 10);
+BENCHMARK(BM_GaussBlurFir)->Range(1 << 7, 1 << 10);
+BENCHMARK(BM_GaussBlurSep7)->Range(1 << 7, 1 << 10);
+
+}  // namespace
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/gauss_blur_test.cc b/third_party/jpeg-xl/lib/jxl/gauss_blur_test.cc
new file mode 100644
index 0000000000..097c1aa8df
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/gauss_blur_test.cc
@@ -0,0 +1,453 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/gauss_blur.h"
+
+#include <cmath>
+#include <hwy/targets.h>
+#include <vector>
+
+#include "lib/extras/time.h"
+#include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/convolve.h"
+#include "lib/jxl/image_ops.h"
+#include "lib/jxl/image_test_utils.h"
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+
+bool NearEdge(const int64_t width, const int64_t peak) {
+  // When around 3*sigma from the edge, there is negligible truncation.
+  return peak < 10 || peak > width - 10;
+}
+
+// Follow the curve downwards by scanning right from `peak` and verifying
+// identical values at the same offset to the left.
+void VerifySymmetric(const int64_t width, const int64_t peak,
+                     const float* out) {
+  const double tolerance = NearEdge(width, peak) ? 0.015 : 6E-7;
+  for (int64_t i = 1;; ++i) {
+    // Stop if we passed either end of the array
+    if (peak - i < 0 || peak + i >= width) break;
+    EXPECT_GT(out[peak + i - 1] + tolerance, out[peak + i]);  // descending
+    EXPECT_NEAR(out[peak - i], out[peak + i], tolerance);     // symmetric
+  }
+}
+
+void TestImpulseResponse(size_t width, size_t peak) {
+  const auto rg3 = CreateRecursiveGaussian(3.0);
+  const auto rg4 = CreateRecursiveGaussian(4.0);
+  const auto rg5 = CreateRecursiveGaussian(5.0);
+
+  // Extra padding for 4x unrolling
+  auto in = hwy::AllocateAligned<float>(width + 3);
+  memset(in.get(), 0, sizeof(float) * (width + 3));
+  in[peak] = 1.0f;
+
+  auto out3 = hwy::AllocateAligned<float>(width + 3);
+  auto out4 = hwy::AllocateAligned<float>(width + 3);
+  auto out5 = hwy::AllocateAligned<float>(width + 3);
+  FastGaussian1D(rg3, in.get(), width, out3.get());
+  FastGaussian1D(rg4, out3.get(), width, out4.get());
+  FastGaussian1D(rg5, in.get(), width, out5.get());
+
+  VerifySymmetric(width, peak, out3.get());
+  VerifySymmetric(width, peak, out4.get());
+  VerifySymmetric(width, peak, out5.get());
+
+  // Wider kernel has flatter peak
+  EXPECT_LT(out5[peak] + 0.05, out3[peak]);
+
+  // Gauss3 o Gauss4 ~= Gauss5
+  const double tolerance = NearEdge(width, peak) ? 0.04 : 0.01;
+  for (size_t i = 0; i < width; ++i) {
+    EXPECT_NEAR(out4[i], out5[i], tolerance);
+  }
+}
+
+void TestImpulseResponseForWidth(size_t width) {
+  for (size_t i = 0; i < width; ++i) {
+    TestImpulseResponse(width, i);
+  }
+}
+
+TEST(GaussBlurTest, ImpulseResponse) {
+  TestImpulseResponseForWidth(10);  // tiny even
+  TestImpulseResponseForWidth(15);  // small odd
+  TestImpulseResponseForWidth(32);  // power of two
+  TestImpulseResponseForWidth(31);  // power of two - 1
+  TestImpulseResponseForWidth(33);  // power of two + 1
+}
+
+ImageF Convolve(const ImageF& in, const std::vector<float>& kernel) {
+  return ConvolveAndSample(in, kernel, 1);
+}
+
+// Higher-precision version for accuracy test.
+ImageF ConvolveAndTransposeF64(const ImageF& in,
+                               const std::vector<double>& kernel) {
+  JXL_ASSERT(kernel.size() % 2 == 1);
+  ImageF out(in.ysize(), in.xsize());
+  const int r = kernel.size() / 2;
+  std::vector<float> row_tmp(in.xsize() + 2 * r);
+  float* const JXL_RESTRICT rowp = &row_tmp[r];
+  const double* const kernelp = &kernel[r];
+  for (size_t y = 0; y < in.ysize(); ++y) {
+    ExtrapolateBorders(in.Row(y), rowp, in.xsize(), r);
+    for (size_t x = 0, ox = 0; x < in.xsize(); ++x, ++ox) {
+      double sum = 0.0;
+      for (int i = -r; i <= r; ++i) {
+        sum += rowp[std::max<int>(
+                   0, std::min<int>(static_cast<int>(x) + i, in.xsize()))] *
+               kernelp[i];
+      }
+      out.Row(ox)[y] = static_cast<float>(sum);
+    }
+  }
+  return out;
+}
+
+ImageF ConvolveF64(const ImageF& in, const std::vector<double>& kernel) {
+  ImageF tmp = ConvolveAndTransposeF64(in, kernel);
+  return ConvolveAndTransposeF64(tmp, kernel);
+}
+
+void TestDirac2D(size_t xsize, size_t ysize, double sigma) {
+  ImageF in(xsize, ysize);
+  ZeroFillImage(&in);
+  // We anyway ignore the border below, so might as well choose the middle.
+  in.Row(ysize / 2)[xsize / 2] = 1.0f;
+
+  ImageF temp(xsize, ysize);
+  ImageF out(xsize, ysize);
+  const auto rg = CreateRecursiveGaussian(sigma);
+  ThreadPool* null_pool = nullptr;
+  FastGaussian(rg, in, null_pool, &temp, &out);
+
+  const std::vector<float> kernel =
+      GaussianKernel(static_cast<int>(4 * sigma), static_cast<float>(sigma));
+  const ImageF expected = Convolve(in, kernel);
+
+  const double max_l1 = sigma < 1.5 ? 5E-3 : 6E-4;
+  const size_t border = 2 * sigma;
+
+  JXL_ASSERT_OK(VerifyRelativeError(expected, out, max_l1, 1E-8, _, border));
+}
+
+TEST(GaussBlurTest, Test2D) {
+  const std::vector<int> dimensions{6, 15, 17, 64, 50, 49};
+  for (int xsize : dimensions) {
+    for (int ysize : dimensions) {
+      for (double sigma : {1.0, 2.5, 3.6, 7.0}) {
+        TestDirac2D(static_cast<size_t>(xsize), static_cast<size_t>(ysize),
+                    sigma);
+      }
+    }
+  }
+}
+
+// Slow (44 sec). To run, remove the disabled prefix.
+TEST(GaussBlurTest, DISABLED_SlowTestDirac1D) {
+  const double sigma = 7.0;
+  const auto rg = CreateRecursiveGaussian(sigma);
+
+  // IPOL accuracy test uses 10^-15 tolerance, this is 2*10^-11.
+  const size_t radius = static_cast<size_t>(7 * sigma);
+  const std::vector<double> kernel = GaussianKernel(radius, sigma);
+
+  const size_t length = 16384;
+  ImageF inputs(length, 1);
+  ZeroFillImage(&inputs);
+
+  auto outputs = hwy::AllocateAligned<float>(length);
+
+  // One per center position
+  auto sum_abs_err = hwy::AllocateAligned<double>(length);
+  std::fill(sum_abs_err.get(), sum_abs_err.get() + length, 0.0);
+
+  for (size_t center = radius; center < length - radius; ++center) {
+    inputs.Row(0)[center - 1] = 0.0f;  // reset last peak, entire array now 0
+    inputs.Row(0)[center] = 1.0f;
+    FastGaussian1D(rg, inputs.Row(0), length, outputs.get());
+
+    const ImageF outputs_fir = ConvolveF64(inputs, kernel);
+
+    for (size_t i = 0; i < length; ++i) {
+      const float abs_err = std::abs(outputs[i] - outputs_fir.Row(0)[i]);
+      sum_abs_err[i] += static_cast<double>(abs_err);
+    }
+  }
+
+  const double max_abs_err =
+      *std::max_element(sum_abs_err.get(), sum_abs_err.get() + length);
+  printf("Max abs err: %.8e\n", max_abs_err);
+}
+
+void TestRandom(size_t xsize, size_t ysize, float min, float max, double sigma,
+                double max_l1, double max_rel) {
+  printf("%4" PRIuS " x %4" PRIuS " %4.1f %4.1f sigma %.1f\n", xsize, ysize,
+         min, max, sigma);
+  ImageF in(xsize, ysize);
+  RandomFillImage(&in, min, max, 65537 + xsize * 129 + ysize);
+  // FastGaussian/Convolve handle borders differently, so keep those pixels 0.
+  const size_t border = 4 * sigma;
+  SetBorder(border, 0.0f, &in);
+
+  ImageF temp(xsize, ysize);
+  ImageF out(xsize, ysize);
+  const auto rg = CreateRecursiveGaussian(sigma);
+  ThreadPool* null_pool = nullptr;
+  FastGaussian(rg, in, null_pool, &temp, &out);
+
+  const std::vector<float> kernel =
+      GaussianKernel(static_cast<int>(4 * sigma), static_cast<float>(sigma));
+  const ImageF expected = Convolve(in, kernel);
+
+  JXL_ASSERT_OK(VerifyRelativeError(expected, out, max_l1, max_rel, _, border));
+}
+
+void TestRandomForSizes(float min, float max, double sigma) {
+  double max_l1 = 6E-3;
+  double max_rel = 3E-3;
+  TestRandom(128, 1, min, max, sigma, max_l1, max_rel);
+  TestRandom(1, 128, min, max, sigma, max_l1, max_rel);
+  TestRandom(30, 201, min, max, sigma, max_l1 * 1.6, max_rel * 1.2);
+  TestRandom(201, 30, min, max, sigma, max_l1 * 1.6, max_rel * 1.2);
+  TestRandom(201, 201, min, max, sigma, max_l1 * 2.0, max_rel * 1.2);
+}
+
+TEST(GaussBlurTest, TestRandom) {
+  // small non-negative
+  TestRandomForSizes(0.0f, 10.0f, 3.0f);
+  TestRandomForSizes(0.0f, 10.0f, 7.0f);
+
+  // small negative
+  TestRandomForSizes(-4.0f, -1.0f, 3.0f);
+  TestRandomForSizes(-4.0f, -1.0f, 7.0f);
+
+  // mixed positive/negative
+  TestRandomForSizes(-6.0f, 6.0f, 3.0f);
+  TestRandomForSizes(-6.0f, 6.0f, 7.0f);
+}
+
+TEST(GaussBlurTest, TestSign) {
+  const size_t xsize = 500;
+  const size_t ysize = 606;
+  ImageF in(xsize, ysize);
+
+  ZeroFillImage(&in);
+  const float center[33 * 33] = {
+      -0.128445f, -0.098473f, -0.121883f, -0.093601f, 0.095665f,  -0.271332f,
+      -0.705475f, -1.324005f, -2.020741f, -1.329464f, 1.834064f,  4.787300f,
+      5.834560f,  5.272720f,  3.967960f,  3.547935f,  3.432732f,  3.383015f,
+      3.239326f,  3.290806f,  3.298954f,  3.397808f,  3.359730f,  3.533844f,
+      3.511856f,  3.436787f,  3.428310f,  3.460209f,  3.550011f,  3.590942f,
+      3.593109f,  3.560005f,  3.443165f,  0.089741f,  0.179230f,  -0.032997f,
+      -0.182610f, 0.005669f,  -0.244759f, -0.395123f, -0.514961f, -1.003529f,
+      -1.798656f, -2.377975f, 0.222191f,  3.957664f,  5.946804f,  5.543129f,
+      4.290096f,  3.621010f,  3.407257f,  3.392494f,  3.345367f,  3.391903f,
+      3.441605f,  3.429260f,  3.444969f,  3.507130f,  3.518612f,  3.443111f,
+      3.475948f,  3.536148f,  3.470333f,  3.628311f,  3.600243f,  3.292892f,
+      -0.226730f, -0.573616f, -0.762165f, -0.398739f, -0.189842f, -0.275921f,
+      -0.446739f, -0.550037f, -0.461033f, -0.724792f, -1.448349f, -1.814064f,
+      -0.491032f, 2.817703f,  5.213242f,  5.675629f,  4.864548f,  3.876324f,
+      3.535587f,  3.530312f,  3.413765f,  3.386261f,  3.404854f,  3.383472f,
+      3.420830f,  3.326496f,  3.257877f,  3.362152f,  3.489609f,  3.619587f,
+      3.555805f,  3.423164f,  3.309708f,  -0.483940f, -0.502926f, -0.592983f,
+      -0.492527f, -0.413616f, -0.482555f, -0.475506f, -0.447990f, -0.338120f,
+      -0.189072f, -0.376427f, -0.910828f, -1.878044f, -1.937927f, 1.423218f,
+      4.871609f,  5.767548f,  5.103741f,  3.983868f,  3.633003f,  3.458263f,
+      3.507309f,  3.247021f,  3.220612f,  3.326061f,  3.352814f,  3.291061f,
+      3.322739f,  3.444302f,  3.506207f,  3.556839f,  3.529575f,  3.457024f,
+      -0.408161f, -0.431343f, -0.454369f, -0.356419f, -0.380924f, -0.399452f,
+      -0.439476f, -0.412189f, -0.306816f, -0.008213f, -0.325813f, -0.537842f,
+      -0.984100f, -1.805332f, -2.028198f, 0.773205f,  4.423046f,  5.604839f,
+      5.231617f,  4.080299f,  3.603008f,  3.498741f,  3.517010f,  3.333897f,
+      3.381336f,  3.342617f,  3.369686f,  3.434155f,  3.490452f,  3.607029f,
+      3.555298f,  3.702297f,  3.618679f,  -0.503609f, -0.578564f, -0.419014f,
+      -0.239883f, 0.269836f,  0.022984f,  -0.455067f, -0.621777f, -0.304176f,
+      -0.163792f, -0.490250f, -0.466637f, -0.391792f, -0.657940f, -1.498035f,
+      -1.895836f, 0.036537f,  3.462456f,  5.586445f,  5.658791f,  4.434784f,
+      3.423435f,  3.318848f,  3.202328f,  3.532764f,  3.436687f,  3.354881f,
+      3.356941f,  3.382645f,  3.503902f,  3.512867f,  3.632366f,  3.537312f,
+      -0.274734f, -0.658829f, -0.726532f, -0.281254f, 0.053196f,  -0.064991f,
+      -0.608517f, -0.720966f, -0.070602f, -0.111320f, -0.440956f, -0.492180f,
+      -0.488762f, -0.569283f, -1.012741f, -1.582779f, -2.101479f, -1.392380f,
+      2.451153f,  5.555855f,  6.096313f,  5.230045f,  4.068172f,  3.404274f,
+      3.392586f,  3.326065f,  3.156670f,  3.284828f,  3.347012f,  3.319252f,
+      3.352310f,  3.610790f,  3.499847f,  -0.150600f, -0.314445f, -0.093575f,
+      -0.057384f, 0.053688f,  -0.189255f, -0.263515f, -0.318653f, 0.053246f,
+      0.080627f,  -0.119553f, -0.152454f, -0.305420f, -0.404869f, -0.385944f,
+      -0.689949f, -1.204914f, -1.985748f, -1.711361f, 1.260658f,  4.626896f,
+      5.888351f,  5.450989f,  4.070587f,  3.539200f,  3.383492f,  3.296318f,
+      3.267334f,  3.436028f,  3.463005f,  3.502625f,  3.522282f,  3.403763f,
+      -0.348049f, -0.302303f, -0.137016f, -0.041737f, -0.164001f, -0.358849f,
+      -0.469627f, -0.428291f, -0.375797f, -0.246346f, -0.118950f, -0.084229f,
+      -0.205681f, -0.241199f, -0.391796f, -0.323151f, -0.241211f, -0.834137f,
+      -1.684219f, -1.972137f, 0.448399f,  4.019985f,  5.648144f,  5.647846f,
+      4.295094f,  3.641884f,  3.374790f,  3.197342f,  3.425545f,  3.507481f,
+      3.478065f,  3.430889f,  3.341900f,  -1.016304f, -0.959221f, -0.909466f,
+      -0.810715f, -0.590729f, -0.594467f, -0.646721f, -0.629364f, -0.528561f,
+      -0.551819f, -0.301086f, -0.149101f, -0.060146f, -0.162220f, -0.326210f,
+      -0.156548f, -0.036293f, -0.426098f, -1.145470f, -1.628998f, -2.003052f,
+      -1.142891f, 2.885162f,  5.652863f,  5.718426f,  4.911140f,  3.234222f,
+      3.473373f,  3.577183f,  3.271603f,  3.410435f,  3.505489f,  3.434032f,
+      -0.508911f, -0.438797f, -0.437450f, -0.627426f, -0.511745f, -0.304874f,
+      -0.274246f, -0.261841f, -0.228466f, -0.342491f, -0.528206f, -0.490082f,
+      -0.516350f, -0.361694f, -0.398514f, -0.276020f, -0.210369f, -0.355938f,
+      -0.402622f, -0.538864f, -1.249573f, -2.100105f, -0.996178f, 1.886410f,
+      4.929745f,  5.630871f,  5.444199f,  4.042740f,  3.739189f,  3.691399f,
+      3.391956f,  3.469696f,  3.431232f,  0.204849f,  0.205433f,  -0.131927f,
+      -0.367908f, -0.374378f, -0.126820f, -0.186951f, -0.228565f, -0.081776f,
+      -0.143143f, -0.379230f, -0.598701f, -0.458019f, -0.295586f, -0.407730f,
+      -0.245853f, -0.043140f, 0.024242f,  -0.038998f, -0.044151f, -0.425991f,
+      -1.240753f, -1.943146f, -2.174755f, 0.523415f,  4.376751f,  5.956558f,
+      5.850082f,  4.403152f,  3.517399f,  3.560753f,  3.554836f,  3.471985f,
+      -0.508503f, -0.109783f, 0.057747f,  0.190079f,  -0.257153f, -0.591980f,
+      -0.666771f, -0.525391f, -0.293060f, -0.489731f, -0.304855f, -0.259644f,
+      -0.367825f, -0.346977f, -0.292889f, -0.215652f, -0.120705f, -0.176010f,
+      -0.422905f, -0.114647f, -0.289749f, -0.374203f, -0.606754f, -1.127949f,
+      -1.994583f, -0.588058f, 3.415840f,  5.603470f,  5.811581f,  4.959423f,
+      3.721760f,  3.710499f,  3.785461f,  -0.554588f, -0.565517f, -0.434578f,
+      -0.012482f, -0.284660f, -0.699795f, -0.957535f, -0.755135f, -0.382034f,
+      -0.321552f, -0.287571f, -0.279537f, -0.314972f, -0.256287f, -0.372818f,
+      -0.316017f, -0.287975f, -0.365639f, -0.512589f, -0.420692f, -0.436485f,
+      -0.295353f, -0.451958f, -0.755459f, -1.272358f, -2.301353f, -1.776161f,
+      1.572483f,  4.826286f,  5.741898f,  5.162853f,  4.028049f,  3.686325f,
+      -0.495590f, -0.664413f, -0.760044f, -0.152634f, -0.286480f, -0.340462f,
+      0.076477f,  0.187706f,  -0.068787f, -0.293491f, -0.361145f, -0.292515f,
+      -0.140671f, -0.190723f, -0.333302f, -0.368168f, -0.192581f, -0.154499f,
+      -0.236544f, -0.124405f, -0.208321f, -0.465607f, -0.883080f, -1.104813f,
+      -1.210567f, -1.415665f, -1.924683f, -1.634758f, 0.601017f,  4.276672f,
+      5.501350f,  5.331257f,  3.809288f,  -0.727722f, -0.533619f, -0.511524f,
+      -0.470688f, -0.610710f, -0.575130f, -0.311115f, -0.090420f, -0.297676f,
+      -0.646118f, -0.742805f, -0.485050f, -0.330910f, -0.275417f, -0.357037f,
+      -0.425598f, -0.481876f, -0.488941f, -0.393551f, -0.051105f, -0.090755f,
+      -0.328674f, -0.536369f, -0.533684f, -0.336960f, -0.689194f, -1.187195f,
+      -1.860954f, -2.290253f, -0.424774f, 3.050060f,  5.083332f,  5.291920f,
+      -0.343605f, -0.190975f, -0.303692f, -0.456512f, -0.681820f, -0.690693f,
+      -0.416729f, -0.286446f, -0.442055f, -0.709148f, -0.569160f, -0.382423f,
+      -0.402321f, -0.383362f, -0.366413f, -0.290718f, -0.110069f, -0.220280f,
+      -0.279018f, -0.255424f, -0.262081f, -0.487556f, -0.444492f, -0.250500f,
+      -0.119583f, -0.291557f, -0.537781f, -1.104073f, -1.737091f, -1.697441f,
+      -0.323456f, 2.042049f,  4.605103f,  -0.310631f, -0.279568f, -0.012695f,
+      -0.160130f, -0.358746f, -0.421101f, -0.559677f, -0.474136f, -0.416565f,
+      -0.561817f, -0.534672f, -0.519157f, -0.767197f, -0.605831f, -0.186523f,
+      0.219872f,  0.264984f,  -0.193432f, -0.363182f, -0.467472f, -0.462009f,
+      -0.571053f, -0.522476f, -0.315903f, -0.237427f, -0.147320f, -0.100201f,
+      -0.237568f, -0.763435f, -1.242043f, -2.135159f, -1.409485f, 1.236370f,
+      -0.474247f, -0.517906f, -0.410217f, -0.542244f, -0.795986f, -0.590004f,
+      -0.388863f, -0.462921f, -0.810627f, -0.778637f, -0.512486f, -0.718025f,
+      -0.710854f, -0.482513f, -0.318233f, -0.194962f, -0.220116f, -0.421673f,
+      -0.534233f, -0.403339f, -0.389332f, -0.407303f, -0.437355f, -0.469730f,
+      -0.359600f, -0.352745f, -0.466755f, -0.414585f, -0.430756f, -0.656822f,
+      -1.237038f, -2.046097f, -1.574898f, -0.593815f, -0.582165f, -0.336098f,
+      -0.372612f, -0.554386f, -0.410603f, -0.428276f, -0.647644f, -0.640720f,
+      -0.582207f, -0.414112f, -0.435547f, -0.435505f, -0.332561f, -0.248116f,
+      -0.340221f, -0.277855f, -0.352699f, -0.377319f, -0.230850f, -0.313267f,
+      -0.446270f, -0.346237f, -0.420422f, -0.530781f, -0.400341f, -0.463661f,
+      -0.209091f, -0.056705f, -0.011772f, -0.169388f, -0.736275f, -1.463017f,
+      -0.752701f, -0.668865f, -0.329765f, -0.299347f, -0.245667f, -0.286999f,
+      -0.520420f, -0.675438f, -0.255753f, 0.141357f,  -0.079639f, -0.419476f,
+      -0.374069f, -0.046253f, 0.116116f,  -0.145847f, -0.380371f, -0.563412f,
+      -0.638634f, -0.310116f, -0.260914f, -0.508404f, -0.465508f, -0.527824f,
+      -0.370979f, -0.305595f, -0.244694f, -0.254490f, 0.009968f,  -0.050201f,
+      -0.331219f, -0.614960f, -0.788208f, -0.483242f, -0.367516f, -0.186951f,
+      -0.180031f, 0.129711f,  -0.127811f, -0.384750f, -0.499542f, -0.418613f,
+      -0.121635f, 0.203197f,  -0.167290f, -0.397270f, -0.355461f, -0.218746f,
+      -0.376785f, -0.521698f, -0.721581f, -0.845741f, -0.535439f, -0.220882f,
+      -0.309067f, -0.555248f, -0.690342f, -0.664948f, -0.390102f, 0.020355f,
+      -0.130447f, -0.173252f, -0.170059f, -0.633663f, -0.956001f, -0.621696f,
+      -0.388302f, -0.342262f, -0.244370f, -0.386948f, -0.401421f, -0.172979f,
+      -0.206163f, -0.450058f, -0.525789f, -0.549274f, -0.349251f, -0.474613f,
+      -0.667976f, -0.435600f, -0.175369f, -0.196877f, -0.202976f, -0.242481f,
+      -0.258369f, -0.189133f, -0.395397f, -0.765499f, -0.944016f, -0.850967f,
+      -0.631561f, -0.152493f, -0.046432f, -0.262066f, -0.195919f, 0.048218f,
+      0.084972f,  0.039902f,  0.000618f,  -0.404430f, -0.447456f, -0.418076f,
+      -0.631935f, -0.717415f, -0.502888f, -0.530514f, -0.747826f, -0.704041f,
+      -0.674969f, -0.516853f, -0.418446f, -0.327740f, -0.308815f, -0.481636f,
+      -0.440083f, -0.481720f, -0.341053f, -0.283897f, -0.324368f, -0.352829f,
+      -0.434349f, -0.545589f, -0.533104f, -0.472755f, -0.570496f, -0.557735f,
+      -0.708176f, -0.493332f, -0.194416f, -0.186249f, -0.256710f, -0.271835f,
+      -0.304752f, -0.431267f, -0.422398f, -0.646725f, -0.680801f, -0.249031f,
+      -0.058567f, -0.213890f, -0.383949f, -0.540291f, -0.549877f, -0.225567f,
+      -0.037174f, -0.499874f, -0.641010f, -0.628044f, -0.390549f, -0.311497f,
+      -0.542313f, -0.569565f, -0.473408f, -0.331245f, -0.357197f, -0.285599f,
+      -0.200157f, -0.201866f, -0.124428f, -0.346016f, -0.392311f, -0.264496f,
+      -0.285370f, -0.436974f, -0.523483f, -0.410461f, -0.267925f, -0.055016f,
+      -0.382458f, -0.319771f, -0.049927f, 0.124329f,  0.266102f,  -0.106606f,
+      -0.773647f, -0.973053f, -0.708206f, -0.486137f, -0.319923f, -0.493900f,
+      -0.490860f, -0.324986f, -0.147346f, -0.146088f, -0.161758f, -0.084396f,
+      -0.379494f, 0.041626f,  -0.113361f, -0.277767f, 0.083366f,  0.126476f,
+      0.139057f,  0.038040f,  0.038162f,  -0.242126f, -0.411736f, -0.370049f,
+      -0.455357f, -0.039257f, 0.264442f,  -0.271492f, -0.425346f, -0.514847f,
+      -0.448650f, -0.580399f, -0.652603f, -0.774803f, -0.692524f, -0.579578f,
+      -0.465206f, -0.386265f, -0.458012f, -0.446594f, -0.284893f, -0.345448f,
+      -0.350876f, -0.440350f, -0.360378f, -0.270428f, 0.237213f,  -0.063602f,
+      -0.364529f, -0.179867f, 0.078197f,  0.117947f,  -0.093410f, -0.359119f,
+      -0.480961f, -0.540638f, -0.436287f, -0.598576f, -0.253735f, -0.060093f,
+      -0.549145f, -0.808327f, -0.698593f, -0.595764f, -0.582508f, -0.497353f,
+      -0.480892f, -0.584240f, -0.665791f, -0.690903f, -0.743446f, -0.796677f,
+      -0.782391f, -0.649010f, -0.628139f, -0.880848f, -0.829361f, -0.373272f,
+      -0.223667f, 0.174572f,  -0.348743f, -0.798901f, -0.692307f, -0.607609f,
+      -0.401455f, -0.480919f, -0.450798f, -0.435413f, -0.322338f, -0.228382f,
+      -0.450466f, -0.504440f, -0.477402f, -0.662224f, -0.583397f, -0.217445f,
+      -0.157459f, -0.079584f, -0.226168f, -0.488720f, -0.669624f, -0.666878f,
+      -0.565311f, -0.549625f, -0.364601f, -0.497627f, -0.736897f, -0.763023f,
+      -0.741020f, -0.404503f, 0.184814f,  -0.075315f, -0.281513f, -0.532906f,
+      -0.405800f, -0.313438f, -0.536652f, -0.403381f, 0.011967f,  0.103310f,
+      -0.269848f, -0.508656f, -0.445923f, -0.644859f, -0.617870f, -0.500927f,
+      -0.371559f, -0.125580f, 0.028625f,  -0.154713f, -0.442024f, -0.492764f,
+      -0.199371f, 0.236305f,  0.225925f,  0.075577f,  -0.285812f, -0.437145f,
+      -0.374260f, -0.156693f, -0.129635f, -0.243206f, -0.123058f, 0.162148f,
+      -0.313152f, -0.337982f, -0.358421f, 0.040070f,  0.038925f,  -0.333313f,
+      -0.351662f, 0.023014f,  0.091362f,  -0.282890f, -0.373253f, -0.389050f,
+      -0.532707f, -0.423347f, -0.349968f, -0.287045f, -0.202442f, -0.308430f,
+      -0.222801f, -0.106323f, -0.056358f, 0.027222f,  0.390732f,  0.033558f,
+      -0.160088f, -0.382217f, -0.535282f, -0.515900f, -0.022736f, 0.165665f,
+      -0.111408f, -0.233784f, -0.312357f, -0.541885f, -0.480022f, -0.482513f,
+      -0.246254f, 0.132244f,  0.090134f,  0.234634f,  -0.089249f, -0.460854f,
+      -0.515457f, -0.450874f, -0.311031f, -0.387680f, -0.360554f, -0.179241f,
+      -0.283817f, -0.475815f, -0.246399f, -0.388958f, -0.551140f, -0.496239f,
+      -0.559879f, -0.379761f, -0.254288f, -0.395111f, -0.613018f, -0.459427f,
+      -0.263580f, -0.268929f, 0.080826f,  0.115616f,  -0.097324f, -0.325310f,
+      -0.480450f, -0.313286f, -0.310371f, -0.517361f, -0.288288f, -0.112679f,
+      -0.173241f, -0.221664f, -0.039452f, -0.107578f, -0.089630f, -0.483768f,
+      -0.571087f, -0.497108f, -0.321533f, -0.375492f, -0.540363f, -0.406815f,
+      -0.388512f, -0.514561f, -0.540192f, -0.402412f, -0.232246f, -0.304749f,
+      -0.383724f, -0.679596f, -0.685463f, -0.694538f, -0.642937f, -0.425789f,
+      0.103271f,  -0.194862f, -0.487999f, -0.717281f, -0.681850f, -0.709286f,
+      -0.615398f, -0.554245f, -0.254681f, -0.049950f, -0.002914f, -0.095383f,
+      -0.370911f, -0.564224f, -0.242714f};
+  const size_t xtest = xsize / 2;
+  const size_t ytest = ysize / 2;
+
+  for (intptr_t dy = -16; dy <= 16; ++dy) {
+    float* row = in.Row(ytest + dy);
+    for (intptr_t dx = -16; dx <= 16; ++dx)
+      row[xtest + dx] = center[(dy + 16) * 33 + (dx + 16)];
+  }
+
+  const double sigma = 7.155933;
+
+  ImageF temp(xsize, ysize);
+  ImageF out_rg(xsize, ysize);
+  const auto rg = CreateRecursiveGaussian(sigma);
+  ThreadPool* null_pool = nullptr;
+  FastGaussian(rg, in, null_pool, &temp, &out_rg);
+
+  ImageF out_old;
+  {
+    const std::vector<float> kernel =
+        GaussianKernel(static_cast<int>(4 * sigma), static_cast<float>(sigma));
+    printf("old kernel size %" PRIuS "\n", kernel.size());
+    out_old = Convolve(in, kernel);
+  }
+
+  printf("rg %.4f old %.4f\n", out_rg.Row(ytest)[xtest],
+         out_old.Row(ytest)[xtest]);
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/gradient_test.cc b/third_party/jpeg-xl/lib/jxl/gradient_test.cc
new file mode 100644
index 0000000000..282fe89f0a
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/gradient_test.cc
@@ -0,0 +1,207 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <math.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include <algorithm>
+#include <array>
+#include <utility>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/override.h"
+#include "lib/jxl/base/padded_bytes.h"
+#include "lib/jxl/codec_in_out.h"
+#include "lib/jxl/color_encoding_internal.h"
+#include "lib/jxl/color_management.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/enc_cache.h"
+#include "lib/jxl/enc_color_management.h"
+#include "lib/jxl/enc_file.h"
+#include "lib/jxl/enc_params.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/image_bundle.h"
+#include "lib/jxl/image_ops.h"
+#include "lib/jxl/test_utils.h"
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+
+struct AuxOut;
+
+namespace {
+
+// Returns distance of point p to line p0..p1, the result is signed and is not
+// normalized.
+double PointLineDist(double x0, double y0, double x1, double y1, double x,
+                     double y) {
+  return (y1 - y0) * x - (x1 - x0) * y + x1 * y0 - y1 * x0;
+}
+
+// Generates a test image with a gradient from one color to another.
+// Angle in degrees, colors can be given in hex as 0xRRGGBB. The angle is the
+// angle in which the change direction happens.
+Image3F GenerateTestGradient(uint32_t color0, uint32_t color1, double angle,
+                             size_t xsize, size_t ysize) {
+  Image3F image(xsize, ysize);
+
+  double x0 = xsize / 2;
+  double y0 = ysize / 2;
+  double x1 = x0 + std::sin(angle / 360.0 * 2.0 * kPi);
+  double y1 = y0 + std::cos(angle / 360.0 * 2.0 * kPi);
+
+  double maxdist =
+      std::max<double>(fabs(PointLineDist(x0, y0, x1, y1, 0, 0)),
+                       fabs(PointLineDist(x0, y0, x1, y1, xsize, 0)));
+
+  for (size_t c = 0; c < 3; ++c) {
+    float c0 = ((color0 >> (8 * (2 - c))) & 255);
+    float c1 = ((color1 >> (8 * (2 - c))) & 255);
+    for (size_t y = 0; y < ysize; ++y) {
+      float* row = image.PlaneRow(c, y);
+      for (size_t x = 0; x < xsize; ++x) {
+        double dist = PointLineDist(x0, y0, x1, y1, x, y);
+        double v = ((dist / maxdist) + 1.0) / 2.0;
+        float color = c0 * (1.0 - v) + c1 * v;
+        row[x] = color;
+      }
+    }
+  }
+
+  return image;
+}
+
+// Computes the max of the horizontal and vertical second derivative for each
+// pixel, where second derivative means absolute value of difference of left
+// delta and right delta (top/bottom for vertical direction).
+// The radius over which the derivative is computed is only 1 pixel and it only
+// checks two angles (hor and ver), but this approximation works well enough.
+static ImageF Gradient2(const ImageF& image) {
+  size_t xsize = image.xsize();
+  size_t ysize = image.ysize();
+  ImageF image2(image.xsize(), image.ysize());
+  for (size_t y = 1; y + 1 < ysize; y++) {
+    const auto* JXL_RESTRICT row0 = image.Row(y - 1);
+    const auto* JXL_RESTRICT row1 = image.Row(y);
+    const auto* JXL_RESTRICT row2 = image.Row(y + 1);
+    auto* row_out = image2.Row(y);
+    for (size_t x = 1; x + 1 < xsize; x++) {
+      float ddx = (row1[x] - row1[x - 1]) - (row1[x + 1] - row1[x]);
+      float ddy = (row1[x] - row0[x]) - (row2[x] - row1[x]);
+      row_out[x] = std::max(fabsf(ddx), fabsf(ddy));
+    }
+  }
+  // Copy to the borders
+  if (ysize > 2) {
+    auto* JXL_RESTRICT row0 = image2.Row(0);
+    const auto* JXL_RESTRICT row1 = image2.Row(1);
+    const auto* JXL_RESTRICT row2 = image2.Row(ysize - 2);
+    auto* JXL_RESTRICT row3 = image2.Row(ysize - 1);
+    for (size_t x = 1; x + 1 < xsize; x++) {
+      row0[x] = row1[x];
+      row3[x] = row2[x];
+    }
+  } else {
+    const auto* row0_in = image.Row(0);
+    const auto* row1_in = image.Row(ysize - 1);
+    auto* row0_out = image2.Row(0);
+    auto* row1_out = image2.Row(ysize - 1);
+    for (size_t x = 1; x + 1 < xsize; x++) {
+      // Image too narrow, take first derivative instead
+      row0_out[x] = row1_out[x] = fabsf(row0_in[x] - row1_in[x]);
+    }
+  }
+  if (xsize > 2) {
+    for (size_t y = 0; y < ysize; y++) {
+      auto* row = image2.Row(y);
+      row[0] = row[1];
+      row[xsize - 1] = row[xsize - 2];
+    }
+  } else {
+    for (size_t y = 0; y < ysize; y++) {
+      const auto* JXL_RESTRICT row_in = image.Row(y);
+      auto* row_out = image2.Row(y);
+      // Image too narrow, take first derivative instead
+      row_out[0] = row_out[xsize - 1] = fabsf(row_in[0] - row_in[xsize - 1]);
+    }
+  }
+  return image2;
+}
+
+static Image3F Gradient2(const Image3F& image) {
+  return Image3F(Gradient2(image.Plane(0)), Gradient2(image.Plane(1)),
+                 Gradient2(image.Plane(2)));
+}
+
+/*
+Tests if roundtrip with jxl on a gradient image doesn't cause banding.
+Only tests if use_gradient is true. Set to false for debugging to see the
+distance values.
+Angle in degrees, colors can be given in hex as 0xRRGGBB.
+*/
+void TestGradient(ThreadPool* pool, uint32_t color0, uint32_t color1,
+                  size_t xsize, size_t ysize, float angle, bool fast_mode,
+                  float butteraugli_distance, bool use_gradient = true) {
+  CompressParams cparams;
+  cparams.butteraugli_distance = butteraugli_distance;
+  if (fast_mode) {
+    cparams.speed_tier = SpeedTier::kSquirrel;
+  }
+  Image3F gradient = GenerateTestGradient(color0, color1, angle, xsize, ysize);
+
+  CodecInOut io;
+  io.metadata.m.SetUintSamples(8);
+  io.metadata.m.color_encoding = ColorEncoding::SRGB();
+  io.SetFromImage(std::move(gradient), io.metadata.m.color_encoding);
+
+  CodecInOut io2;
+
+  PaddedBytes compressed;
+  AuxOut* aux_out = nullptr;
+  PassesEncoderState enc_state;
+  EXPECT_TRUE(EncodeFile(cparams, &io, &enc_state, &compressed, GetJxlCms(),
+                         aux_out, pool));
+  EXPECT_TRUE(
+      test::DecodeFile({}, Span<const uint8_t>(compressed), &io2, pool));
+  EXPECT_TRUE(
+      io2.Main().TransformTo(io2.metadata.m.color_encoding, GetJxlCms(), pool));
+
+  if (use_gradient) {
+    // Test that the gradient map worked. For that, we take a second derivative
+    // of the image with Gradient2 to measure how linear the change is in x and
+    // y direction. For a well handled gradient, we expect max values around
+    // 0.1, while if there is noticeable banding, which means the gradient map
+    // failed, the values are around 0.5-1.0 (regardless of
+    // butteraugli_distance).
+    Image3F gradient2 = Gradient2(*io2.Main().color());
+
+    std::array<float, 3> image_max;
+    Image3Max(gradient2, &image_max);
+
+    // TODO(jyrki): These values used to work with 0.2, 0.2, 0.2.
+    EXPECT_LE(image_max[0], 3.15);
+    EXPECT_LE(image_max[1], 1.72);
+    EXPECT_LE(image_max[2], 5.05);
+  }
+}
+
+static constexpr bool fast_mode = true;
+
+TEST(GradientTest, SteepGradient) {
+  test::ThreadPoolForTests pool(8);
+  // Relatively steep gradients, colors from the sky of stp.png
+  TestGradient(&pool, 0xd99d58, 0x889ab1, 512, 512, 90, fast_mode, 3.0);
+}
+
+TEST(GradientTest, SubtleGradient) {
+  test::ThreadPoolForTests pool(8);
+  // Very subtle gradient
+  TestGradient(&pool, 0xb89b7b, 0xa89b8d, 512, 512, 90, fast_mode, 4.0);
+}
+
+}  // namespace
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/headers.cc b/third_party/jpeg-xl/lib/jxl/headers.cc
new file mode 100644
index 0000000000..dc53726385
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/headers.cc
@@ -0,0 +1,194 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/headers.h"
+
+#include "lib/jxl/common.h"
+#include "lib/jxl/fields.h"
+
+namespace jxl {
+namespace {
+
+struct Rational {
+  constexpr explicit Rational(uint32_t num, uint32_t den)
+      : num(num), den(den) {}
+
+  // Returns floor(multiplicand * rational).
+  constexpr uint32_t MulTruncate(uint32_t multiplicand) const {
+    return uint64_t(multiplicand) * num / den;
+  }
+
+  uint32_t num;
+  uint32_t den;
+};
+
+Rational FixedAspectRatios(uint32_t ratio) {
+  JXL_ASSERT(0 != ratio && ratio < 8);
+  // Other candidates: 5/4, 7/5, 14/9, 16/10, 5/3, 21/9, 12/5
+  constexpr Rational kRatios[7] = {Rational(1, 1),    // square
+                                   Rational(12, 10),  //
+                                   Rational(4, 3),    // camera
+                                   Rational(3, 2),    // mobile camera
+                                   Rational(16, 9),   // camera/display
+                                   Rational(5, 4),    //
+                                   Rational(2, 1)};   //
+  return kRatios[ratio - 1];
+}
+
+uint32_t FindAspectRatio(uint32_t xsize, uint32_t ysize) {
+  for (uint32_t r = 1; r < 8; ++r) {
+    if (xsize == FixedAspectRatios(r).MulTruncate(ysize)) {
+      return r;
+    }
+  }
+  return 0;  // Must send xsize instead
+}
+
+}  // namespace
+
+size_t SizeHeader::xsize() const {
+  if (ratio_ != 0) {
+    return FixedAspectRatios(ratio_).MulTruncate(
+        static_cast<uint32_t>(ysize()));
+  }
+  return small_ ? ((xsize_div8_minus_1_ + 1) * 8) : xsize_;
+}
+
+Status SizeHeader::Set(size_t xsize64, size_t ysize64) {
+  if (xsize64 > 0xFFFFFFFFull || ysize64 > 0xFFFFFFFFull) {
+    return JXL_FAILURE("Image too large");
+  }
+  const uint32_t xsize32 = static_cast<uint32_t>(xsize64);
+  const uint32_t ysize32 = static_cast<uint32_t>(ysize64);
+  if (xsize64 == 0 || ysize64 == 0) return JXL_FAILURE("Empty image");
+  ratio_ = FindAspectRatio(xsize32, ysize32);
+  small_ = ysize64 <= 256 && (ysize64 % kBlockDim) == 0 &&
+           (ratio_ != 0 || (xsize64 <= 256 && (xsize64 % kBlockDim) == 0));
+  if (small_) {
+    ysize_div8_minus_1_ = ysize32 / 8 - 1;
+  } else {
+    ysize_ = ysize32;
+  }
+
+  if (ratio_ == 0) {
+    if (small_) {
+      xsize_div8_minus_1_ = xsize32 / 8 - 1;
+    } else {
+      xsize_ = xsize32;
+    }
+  }
+  JXL_ASSERT(xsize() == xsize64);
+  JXL_ASSERT(ysize() == ysize64);
+  return true;
+}
+
+Status PreviewHeader::Set(size_t xsize64, size_t ysize64) {
+  const uint32_t xsize32 = static_cast<uint32_t>(xsize64);
+  const uint32_t ysize32 = static_cast<uint32_t>(ysize64);
+  if (xsize64 == 0 || ysize64 == 0) return JXL_FAILURE("Empty preview");
+  div8_ = (xsize64 % kBlockDim) == 0 && (ysize64 % kBlockDim) == 0;
+  if (div8_) {
+    ysize_div8_ = ysize32 / 8;
+  } else {
+    ysize_ = ysize32;
+  }
+
+  ratio_ = FindAspectRatio(xsize32, ysize32);
+  if (ratio_ == 0) {
+    if (div8_) {
+      xsize_div8_ = xsize32 / 8;
+    } else {
+      xsize_ = xsize32;
+    }
+  }
+  JXL_ASSERT(xsize() == xsize64);
+  JXL_ASSERT(ysize() == ysize64);
+  return true;
+}
+
+size_t PreviewHeader::xsize() const {
+  if (ratio_ != 0) {
+    return FixedAspectRatios(ratio_).MulTruncate(
+        static_cast<uint32_t>(ysize()));
+  }
+  return div8_ ? (xsize_div8_ * 8) : xsize_;
+}
+
+SizeHeader::SizeHeader() { Bundle::Init(this); }
+Status SizeHeader::VisitFields(Visitor* JXL_RESTRICT visitor) {
+  JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &small_));
+
+  if (visitor->Conditional(small_)) {
+    JXL_QUIET_RETURN_IF_ERROR(visitor->Bits(5, 0, &ysize_div8_minus_1_));
+  }
+  if (visitor->Conditional(!small_)) {
+    // (Could still be small, but non-multiple of 8.)
+    JXL_QUIET_RETURN_IF_ERROR(visitor->U32(BitsOffset(9, 1), BitsOffset(13, 1),
+                                           BitsOffset(18, 1), BitsOffset(30, 1),
+                                           1, &ysize_));
+  }
+
+  JXL_QUIET_RETURN_IF_ERROR(visitor->Bits(3, 0, &ratio_));
+  if (visitor->Conditional(ratio_ == 0 && small_)) {
+    JXL_QUIET_RETURN_IF_ERROR(visitor->Bits(5, 0, &xsize_div8_minus_1_));
+  }
+  if (visitor->Conditional(ratio_ == 0 && !small_)) {
+    JXL_QUIET_RETURN_IF_ERROR(visitor->U32(BitsOffset(9, 1), BitsOffset(13, 1),
+                                           BitsOffset(18, 1), BitsOffset(30, 1),
+                                           1, &xsize_));
+  }
+
+  return true;
+}
+
+PreviewHeader::PreviewHeader() { Bundle::Init(this); }
+Status PreviewHeader::VisitFields(Visitor* JXL_RESTRICT visitor) {
+  JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &div8_));
+
+  if (visitor->Conditional(div8_)) {
+    JXL_QUIET_RETURN_IF_ERROR(visitor->U32(Val(16), Val(32), BitsOffset(5, 1),
+                                           BitsOffset(9, 33), 1, &ysize_div8_));
+  }
+  if (visitor->Conditional(!div8_)) {
+    JXL_QUIET_RETURN_IF_ERROR(visitor->U32(BitsOffset(6, 1), BitsOffset(8, 65),
+                                           BitsOffset(10, 321),
+                                           BitsOffset(12, 1345), 1, &ysize_));
+  }
+
+  JXL_QUIET_RETURN_IF_ERROR(visitor->Bits(3, 0, &ratio_));
+  if (visitor->Conditional(ratio_ == 0 && div8_)) {
+    JXL_QUIET_RETURN_IF_ERROR(visitor->U32(Val(16), Val(32), BitsOffset(5, 1),
+                                           BitsOffset(9, 33), 1, &xsize_div8_));
+  }
+  if (visitor->Conditional(ratio_ == 0 && !div8_)) {
+    JXL_QUIET_RETURN_IF_ERROR(visitor->U32(BitsOffset(6, 1), BitsOffset(8, 65),
+                                           BitsOffset(10, 321),
+                                           BitsOffset(12, 1345), 1, &xsize_));
+  }
+
+  return true;
+}
+
+AnimationHeader::AnimationHeader() { Bundle::Init(this); }
+Status AnimationHeader::VisitFields(Visitor* JXL_RESTRICT visitor) {
+  JXL_QUIET_RETURN_IF_ERROR(visitor->U32(Val(100), Val(1000), BitsOffset(10, 1),
+                                         BitsOffset(30, 1), 1, &tps_numerator));
+  JXL_QUIET_RETURN_IF_ERROR(visitor->U32(Val(1), Val(1001), BitsOffset(8, 1),
+                                         BitsOffset(10, 1), 1,
+                                         &tps_denominator));
+
+  JXL_QUIET_RETURN_IF_ERROR(
+      visitor->U32(Val(0), Bits(3), Bits(16), Bits(32), 0, &num_loops));
+
+  JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &have_timecodes));
+  return true;
+}
+
+Status ReadSizeHeader(BitReader* JXL_RESTRICT reader,
+                      SizeHeader* JXL_RESTRICT size) {
+  return Bundle::Read(reader, size);
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/headers.h b/third_party/jpeg-xl/lib/jxl/headers.h
new file mode 100644
index 0000000000..3cce84dabc
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/headers.h
@@ -0,0 +1,97 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_HEADERS_H_
+#define LIB_JXL_HEADERS_H_
+
+// Codestream headers.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/field_encodings.h"
+
+namespace jxl {
+
+// Reserved by ISO/IEC 10918-1. LF causes files opened in text mode to be
+// rejected because the marker changes to 0x0D instead. The 0xFF prefix also
+// ensures there were no 7-bit transmission limitations.
+static constexpr uint8_t kCodestreamMarker = 0x0A;
+
+// Compact representation of image dimensions (best case: 9 bits) so decoders
+// can preallocate early.
+class SizeHeader : public Fields {
+ public:
+  SizeHeader();
+  JXL_FIELDS_NAME(SizeHeader)
+
+  Status VisitFields(Visitor* JXL_RESTRICT visitor) override;
+
+  Status Set(size_t xsize, size_t ysize);
+
+  size_t xsize() const;
+  size_t ysize() const {
+    return small_ ? ((ysize_div8_minus_1_ + 1) * 8) : ysize_;
+  }
+
+ private:
+  bool small_;  // xsize and ysize <= 256 and divisible by 8.
+
+  uint32_t ysize_div8_minus_1_;
+  uint32_t ysize_;
+
+  uint32_t ratio_;
+  uint32_t xsize_div8_minus_1_;
+  uint32_t xsize_;
+};
+
+// (Similar to SizeHeader but different encoding because previews are smaller)
+class PreviewHeader : public Fields {
+ public:
+  PreviewHeader();
+  JXL_FIELDS_NAME(PreviewHeader)
+
+  Status VisitFields(Visitor* JXL_RESTRICT visitor) override;
+
+  Status Set(size_t xsize, size_t ysize);
+
+  size_t xsize() const;
+  size_t ysize() const { return div8_ ? (ysize_div8_ * 8) : ysize_; }
+
+ private:
+  bool div8_;  // xsize and ysize divisible by 8.
+
+  uint32_t ysize_div8_;
+  uint32_t ysize_;
+
+  uint32_t ratio_;
+  uint32_t xsize_div8_;
+  uint32_t xsize_;
+};
+
+struct AnimationHeader : public Fields {
+  AnimationHeader();
+  JXL_FIELDS_NAME(AnimationHeader)
+
+  Status VisitFields(Visitor* JXL_RESTRICT visitor) override;
+
+  // Ticks per second (expressed as rational number to support NTSC)
+  uint32_t tps_numerator;
+  uint32_t tps_denominator;
+
+  uint32_t num_loops;  // 0 means to repeat infinitely.
+
+  bool have_timecodes;
+};
+
+Status ReadSizeHeader(BitReader* JXL_RESTRICT reader,
+                      SizeHeader* JXL_RESTRICT size);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_HEADERS_H_
diff --git a/third_party/jpeg-xl/lib/jxl/huffman_table.cc b/third_party/jpeg-xl/lib/jxl/huffman_table.cc
new file mode 100644
index 0000000000..9ae7865af6
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/huffman_table.cc
@@ -0,0 +1,161 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/huffman_table.h"
+
+#include <cstring> /* for memcpy */
+#include <vector>
+
+#include "lib/jxl/ans_params.h"
+#include "lib/jxl/dec_huffman.h"
+
+namespace jxl {
+
+/* Returns reverse(reverse(key, len) + 1, len), where reverse(key, len) is the
+   bit-wise reversal of the len least significant bits of key. */
+static inline int GetNextKey(int key, int len) {
+  int step = 1u << (len - 1);
+  while (key & step) {
+    step >>= 1;
+  }
+  return (key & (step - 1)) + step;
+}
+
+/* Stores code in table[0], table[step], table[2*step], ..., table[end] */
+/* Assumes that end is an integer multiple of step */
+static inline void ReplicateValue(HuffmanCode* table, int step, int end,
+                                  HuffmanCode code) {
+  do {
+    end -= step;
+    table[end] = code;
+  } while (end > 0);
+}
+
+/* Returns the table width of the next 2nd level table. count is the histogram
+   of bit lengths for the remaining symbols, len is the code length of the next
+   processed symbol */
+static inline size_t NextTableBitSize(const uint16_t* const count, size_t len,
+                                      int root_bits) {
+  size_t left = 1u << (len - root_bits);
+  while (len < PREFIX_MAX_BITS) {
+    if (left <= count[len]) break;
+    left -= count[len];
+    ++len;
+    left <<= 1;
+  }
+  return len - root_bits;
+}
+
+uint32_t BuildHuffmanTable(HuffmanCode* root_table, int root_bits,
+                           const uint8_t* const code_lengths,
+                           size_t code_lengths_size, uint16_t* count) {
+  HuffmanCode code;   /* current table entry */
+  HuffmanCode* table; /* next available space in table */
+  size_t len;         /* current code length */
+  size_t symbol;      /* symbol index in original or sorted table */
+  int key;            /* reversed prefix code */
+  int step;           /* step size to replicate values in current table */
+  int low;            /* low bits for current root entry */
+  int mask;           /* mask for low bits */
+  size_t table_bits;  /* key length of current table */
+  int table_size;     /* size of current table */
+  int total_size;     /* sum of root table size and 2nd level table sizes */
+  /* offsets in sorted table for each length */
+  uint16_t offset[PREFIX_MAX_BITS + 1];
+  size_t max_length = 1;
+
+  if (code_lengths_size > 1u << PREFIX_MAX_BITS) return 0;
+
+  /* symbols sorted by code length */
+  std::vector<uint16_t> sorted_storage(code_lengths_size);
+  uint16_t* sorted = sorted_storage.data();
+
+  /* generate offsets into sorted symbol table by code length */
+  {
+    uint16_t sum = 0;
+    for (len = 1; len <= PREFIX_MAX_BITS; len++) {
+      offset[len] = sum;
+      if (count[len]) {
+        sum = static_cast<uint16_t>(sum + count[len]);
+        max_length = len;
+      }
+    }
+  }
+
+  /* sort symbols by length, by symbol order within each length */
+  for (symbol = 0; symbol < code_lengths_size; symbol++) {
+    if (code_lengths[symbol] != 0) {
+      sorted[offset[code_lengths[symbol]]++] = symbol;
+    }
+  }
+
+  table = root_table;
+  table_bits = root_bits;
+  table_size = 1u << table_bits;
+  total_size = table_size;
+
+  /* special case code with only one value */
+  if (offset[PREFIX_MAX_BITS] == 1) {
+    code.bits = 0;
+    code.value = static_cast<uint16_t>(sorted[0]);
+    for (key = 0; key < total_size; ++key) {
+      table[key] = code;
+    }
+    return total_size;
+  }
+
+  /* fill in root table */
+  /* let's reduce the table size to a smaller size if possible, and */
+  /* create the repetitions by memcpy if possible in the coming loop */
+  if (table_bits > max_length) {
+    table_bits = max_length;
+    table_size = 1u << table_bits;
+  }
+  key = 0;
+  symbol = 0;
+  code.bits = 1;
+  step = 2;
+  do {
+    for (; count[code.bits] != 0; --count[code.bits]) {
+      code.value = static_cast<uint16_t>(sorted[symbol++]);
+      ReplicateValue(&table[key], step, table_size, code);
+      key = GetNextKey(key, code.bits);
+    }
+    step <<= 1;
+  } while (++code.bits <= table_bits);
+
+  /* if root_bits != table_bits we only created one fraction of the */
+  /* table, and we need to replicate it now. */
+  while (total_size != table_size) {
+    memcpy(&table[table_size], &table[0], table_size * sizeof(table[0]));
+    table_size <<= 1;
+  }
+
+  /* fill in 2nd level tables and add pointers to root table */
+  mask = total_size - 1;
+  low = -1;
+  for (len = root_bits + 1, step = 2; len <= max_length; ++len, step <<= 1) {
+    for (; count[len] != 0; --count[len]) {
+      if ((key & mask) != low) {
+        table += table_size;
+        table_bits = NextTableBitSize(count, len, root_bits);
+        table_size = 1u << table_bits;
+        total_size += table_size;
+        low = key & mask;
+        root_table[low].bits = static_cast<uint8_t>(table_bits + root_bits);
+        root_table[low].value =
+            static_cast<uint16_t>((table - root_table) - low);
+      }
+      code.bits = static_cast<uint8_t>(len - root_bits);
+      code.value = static_cast<uint16_t>(sorted[symbol++]);
+      ReplicateValue(&table[key >> root_bits], step, table_size, code);
+      key = GetNextKey(key, len);
+    }
+  }
+
+  return total_size;
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/huffman_table.h b/third_party/jpeg-xl/lib/jxl/huffman_table.h
new file mode 100644
index 0000000000..11cdb2fc45
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/huffman_table.h
@@ -0,0 +1,28 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_HUFFMAN_TABLE_H_
+#define LIB_JXL_HUFFMAN_TABLE_H_
+
+#include <stdint.h>
+#include <stdlib.h>
+
+namespace jxl {
+
+struct HuffmanCode {
+  uint8_t bits;   /* number of bits used for this symbol */
+  uint16_t value; /* symbol value or table offset */
+};
+
+/* Builds Huffman lookup table assuming code lengths are in symbol order. */
+/* Returns 0 in case of error (invalid tree or memory error), otherwise
+   populated size of table. */
+uint32_t BuildHuffmanTable(HuffmanCode* root_table, int root_bits,
+                           const uint8_t* code_lengths,
+                           size_t code_lengths_size, uint16_t* count);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_HUFFMAN_TABLE_H_
diff --git a/third_party/jpeg-xl/lib/jxl/iaca_test.cc b/third_party/jpeg-xl/lib/jxl/iaca_test.cc
new file mode 100644
index 0000000000..e25d9316d5
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/iaca_test.cc
@@ -0,0 +1,21 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/base/iaca.h"
+
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+namespace {
+
+TEST(IacaTest, MarkersDefaultToDisabledAndDoNotCrash) {
+  BeginIACA();
+  EndIACA();
+}
+
+TEST(IacaTest, ScopeDefaultToDisabledAndDoNotCrash) { ScopeIACA iaca; }
+
+}  // namespace
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/icc_codec.cc b/third_party/jpeg-xl/lib/jxl/icc_codec.cc
new file mode 100644
index 0000000000..f367461c0f
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/icc_codec.cc
@@ -0,0 +1,389 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/icc_codec.h"
+
+#include <stdint.h>
+
+#include <map>
+#include <string>
+#include <vector>
+
+#include "lib/jxl/base/byte_order.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/dec_ans.h"
+#include "lib/jxl/fields.h"
+#include "lib/jxl/icc_codec_common.h"
+
+namespace jxl {
+namespace {
+
+// Shuffles or interleaves bytes, for example with width 2, turns "ABCDabcd"
+// into "AaBbCcDc". Transposes a matrix of ceil(size / width) columns and
+// width rows. There are size elements, size may be < width * height, if so the
+// last elements of the rightmost column are missing, the missing spots are
+// transposed along with the filled spots, and the result has the missing
+// elements at the end of the bottom row. The input is the input matrix in
+// scanline order but with missing elements skipped (which may occur in multiple
+// locations), the output is the result matrix in scanline order (with
+// no need to skip missing elements as they are past the end of the data).
+void Shuffle(uint8_t* data, size_t size, size_t width) {
+  size_t height = (size + width - 1) / width;  // amount of rows of output
+  PaddedBytes result(size);
+  // i = output index, j input index
+  size_t s = 0, j = 0;
+  for (size_t i = 0; i < size; i++) {
+    result[i] = data[j];
+    j += height;
+    if (j >= size) j = ++s;
+  }
+
+  for (size_t i = 0; i < size; i++) {
+    data[i] = result[i];
+  }
+}
+
+// TODO(eustas): should be 20, or even 18, once DecodeVarInt is improved;
+//               currently DecodeVarInt does not signal the errors, and marks
+//               11 bytes as used even if only 10 are used (and 9 is enough for
+//               63-bit values).
+constexpr const size_t kPreambleSize = 22;  // enough for reading 2 VarInts
+
+}  // namespace
+
+// Mimics the beginning of UnpredictICC for quick validity check.
+// At least kPreambleSize bytes of data should be valid at invocation time.
+Status CheckPreamble(const PaddedBytes& data, size_t enc_size,
+                     size_t output_limit) {
+  const uint8_t* enc = data.data();
+  size_t size = data.size();
+  size_t pos = 0;
+  uint64_t osize = DecodeVarInt(enc, size, &pos);
+  JXL_RETURN_IF_ERROR(CheckIs32Bit(osize));
+  if (pos >= size) return JXL_FAILURE("Out of bounds");
+  uint64_t csize = DecodeVarInt(enc, size, &pos);
+  JXL_RETURN_IF_ERROR(CheckIs32Bit(csize));
+  JXL_RETURN_IF_ERROR(CheckOutOfBounds(pos, csize, size));
+  // We expect that UnpredictICC inflates input, not the other way round.
+  if (osize + 65536 < enc_size) return JXL_FAILURE("Malformed ICC");
+  if (output_limit && osize > output_limit) {
+    return JXL_FAILURE("Decoded ICC is too large");
+  }
+  return true;
+}
+
+// Decodes the result of PredictICC back to a valid ICC profile.
+Status UnpredictICC(const uint8_t* enc, size_t size, PaddedBytes* result) {
+  if (!result->empty()) return JXL_FAILURE("result must be empty initially");
+  size_t pos = 0;
+  // TODO(lode): technically speaking we need to check that the entire varint
+  // decoding never goes out of bounds, not just the first byte. This requires
+  // a DecodeVarInt function that returns an error code. It is safe to use
+  // DecodeVarInt with out of bounds values, it silently returns, but the
+  // specification requires an error. Idem for all DecodeVarInt below.
+  if (pos >= size) return JXL_FAILURE("Out of bounds");
+  uint64_t osize = DecodeVarInt(enc, size, &pos);  // Output size
+  JXL_RETURN_IF_ERROR(CheckIs32Bit(osize));
+  if (pos >= size) return JXL_FAILURE("Out of bounds");
+  uint64_t csize = DecodeVarInt(enc, size, &pos);  // Commands size
+  // Every command is translated to at least on byte.
+  JXL_RETURN_IF_ERROR(CheckIs32Bit(csize));
+  size_t cpos = pos;  // pos in commands stream
+  JXL_RETURN_IF_ERROR(CheckOutOfBounds(pos, csize, size));
+  size_t commands_end = cpos + csize;
+  pos = commands_end;  // pos in data stream
+
+  // Header
+  PaddedBytes header = ICCInitialHeaderPrediction();
+  EncodeUint32(0, osize, &header);
+  for (size_t i = 0; i <= kICCHeaderSize; i++) {
+    if (result->size() == osize) {
+      if (cpos != commands_end) return JXL_FAILURE("Not all commands used");
+      if (pos != size) return JXL_FAILURE("Not all data used");
+      return true;  // Valid end
+    }
+    if (i == kICCHeaderSize) break;  // Done
+    ICCPredictHeader(result->data(), result->size(), header.data(), i);
+    if (pos >= size) return JXL_FAILURE("Out of bounds");
+    result->push_back(enc[pos++] + header[i]);
+  }
+  if (cpos >= commands_end) return JXL_FAILURE("Out of bounds");
+
+  // Tag list
+  uint64_t numtags = DecodeVarInt(enc, size, &cpos);
+
+  if (numtags != 0) {
+    numtags--;
+    JXL_RETURN_IF_ERROR(CheckIs32Bit(numtags));
+    AppendUint32(numtags, result);
+    uint64_t prevtagstart = kICCHeaderSize + numtags * 12;
+    uint64_t prevtagsize = 0;
+    for (;;) {
+      if (result->size() > osize) return JXL_FAILURE("Invalid result size");
+      if (cpos > commands_end) return JXL_FAILURE("Out of bounds");
+      if (cpos == commands_end) break;  // Valid end
+      uint8_t command = enc[cpos++];
+      uint8_t tagcode = command & 63;
+      Tag tag;
+      if (tagcode == 0) {
+        break;
+      } else if (tagcode == kCommandTagUnknown) {
+        JXL_RETURN_IF_ERROR(CheckOutOfBounds(pos, 4, size));
+        tag = DecodeKeyword(enc, size, pos);
+        pos += 4;
+      } else if (tagcode == kCommandTagTRC) {
+        tag = kRtrcTag;
+      } else if (tagcode == kCommandTagXYZ) {
+        tag = kRxyzTag;
+      } else {
+        if (tagcode - kCommandTagStringFirst >= kNumTagStrings) {
+          return JXL_FAILURE("Unknown tagcode");
+        }
+        tag = *kTagStrings[tagcode - kCommandTagStringFirst];
+      }
+      AppendKeyword(tag, result);
+
+      uint64_t tagstart;
+      uint64_t tagsize = prevtagsize;
+      if (tag == kRxyzTag || tag == kGxyzTag || tag == kBxyzTag ||
+          tag == kKxyzTag || tag == kWtptTag || tag == kBkptTag ||
+          tag == kLumiTag) {
+        tagsize = 20;
+      }
+
+      if (command & kFlagBitOffset) {
+        if (cpos >= commands_end) return JXL_FAILURE("Out of bounds");
+        tagstart = DecodeVarInt(enc, size, &cpos);
+      } else {
+        JXL_RETURN_IF_ERROR(CheckIs32Bit(prevtagstart));
+        tagstart = prevtagstart + prevtagsize;
+      }
+      JXL_RETURN_IF_ERROR(CheckIs32Bit(tagstart));
+      AppendUint32(tagstart, result);
+      if (command & kFlagBitSize) {
+        if (cpos >= commands_end) return JXL_FAILURE("Out of bounds");
+        tagsize = DecodeVarInt(enc, size, &cpos);
+      }
+      JXL_RETURN_IF_ERROR(CheckIs32Bit(tagsize));
+      AppendUint32(tagsize, result);
+      prevtagstart = tagstart;
+      prevtagsize = tagsize;
+
+      if (tagcode == kCommandTagTRC) {
+        AppendKeyword(kGtrcTag, result);
+        AppendUint32(tagstart, result);
+        AppendUint32(tagsize, result);
+        AppendKeyword(kBtrcTag, result);
+        AppendUint32(tagstart, result);
+        AppendUint32(tagsize, result);
+      }
+
+      if (tagcode == kCommandTagXYZ) {
+        JXL_RETURN_IF_ERROR(CheckIs32Bit(tagstart + tagsize * 2));
+        AppendKeyword(kGxyzTag, result);
+        AppendUint32(tagstart + tagsize, result);
+        AppendUint32(tagsize, result);
+        AppendKeyword(kBxyzTag, result);
+        AppendUint32(tagstart + tagsize * 2, result);
+        AppendUint32(tagsize, result);
+      }
+    }
+  }
+
+  // Main Content
+  for (;;) {
+    if (result->size() > osize) return JXL_FAILURE("Invalid result size");
+    if (cpos > commands_end) return JXL_FAILURE("Out of bounds");
+    if (cpos == commands_end) break;  // Valid end
+    uint8_t command = enc[cpos++];
+    if (command == kCommandInsert) {
+      if (cpos >= commands_end) return JXL_FAILURE("Out of bounds");
+      uint64_t num = DecodeVarInt(enc, size, &cpos);
+      JXL_RETURN_IF_ERROR(CheckOutOfBounds(pos, num, size));
+      for (size_t i = 0; i < num; i++) {
+        result->push_back(enc[pos++]);
+      }
+    } else if (command == kCommandShuffle2 || command == kCommandShuffle4) {
+      if (cpos >= commands_end) return JXL_FAILURE("Out of bounds");
+      uint64_t num = DecodeVarInt(enc, size, &cpos);
+      JXL_RETURN_IF_ERROR(CheckOutOfBounds(pos, num, size));
+      PaddedBytes shuffled(num);
+      for (size_t i = 0; i < num; i++) {
+        shuffled[i] = enc[pos + i];
+      }
+      if (command == kCommandShuffle2) {
+        Shuffle(shuffled.data(), num, 2);
+      } else if (command == kCommandShuffle4) {
+        Shuffle(shuffled.data(), num, 4);
+      }
+      for (size_t i = 0; i < num; i++) {
+        result->push_back(shuffled[i]);
+        pos++;
+      }
+    } else if (command == kCommandPredict) {
+      JXL_RETURN_IF_ERROR(CheckOutOfBounds(cpos, 2, commands_end));
+      uint8_t flags = enc[cpos++];
+
+      size_t width = (flags & 3) + 1;
+      if (width == 3) return JXL_FAILURE("Invalid width");
+
+      int order = (flags & 12) >> 2;
+      if (order == 3) return JXL_FAILURE("Invalid order");
+
+      uint64_t stride = width;
+      if (flags & 16) {
+        if (cpos >= commands_end) return JXL_FAILURE("Out of bounds");
+        stride = DecodeVarInt(enc, size, &cpos);
+        if (stride < width) {
+          return JXL_FAILURE("Invalid stride");
+        }
+      }
+      // If stride * 4 >= result->size(), return failure. The check
+      // "size == 0 || ((size - 1) >> 2) < stride" corresponds to
+      // "stride * 4 >= size", but does not suffer from integer overflow.
+      // This check is more strict than necessary but follows the specification
+      // and the encoder should ensure this is followed.
+      if (result->empty() || ((result->size() - 1u) >> 2u) < stride) {
+        return JXL_FAILURE("Invalid stride");
+      }
+
+      if (cpos >= commands_end) return JXL_FAILURE("Out of bounds");
+      uint64_t num = DecodeVarInt(enc, size, &cpos);  // in bytes
+      JXL_RETURN_IF_ERROR(CheckOutOfBounds(pos, num, size));
+
+      PaddedBytes shuffled(num);
+      for (size_t i = 0; i < num; i++) {
+        shuffled[i] = enc[pos + i];
+      }
+      if (width > 1) Shuffle(shuffled.data(), num, width);
+
+      size_t start = result->size();
+      for (size_t i = 0; i < num; i++) {
+        uint8_t predicted = LinearPredictICCValue(result->data(), start, i,
+                                                  stride, width, order);
+        result->push_back(predicted + shuffled[i]);
+      }
+      pos += num;
+    } else if (command == kCommandXYZ) {
+      AppendKeyword(kXyz_Tag, result);
+      for (int i = 0; i < 4; i++) result->push_back(0);
+      JXL_RETURN_IF_ERROR(CheckOutOfBounds(pos, 12, size));
+      for (size_t i = 0; i < 12; i++) {
+        result->push_back(enc[pos++]);
+      }
+    } else if (command >= kCommandTypeStartFirst &&
+               command < kCommandTypeStartFirst + kNumTypeStrings) {
+      AppendKeyword(*kTypeStrings[command - kCommandTypeStartFirst], result);
+      for (size_t i = 0; i < 4; i++) {
+        result->push_back(0);
+      }
+    } else {
+      return JXL_FAILURE("Unknown command");
+    }
+  }
+
+  if (pos != size) return JXL_FAILURE("Not all data used");
+  if (result->size() != osize) return JXL_FAILURE("Invalid result size");
+
+  return true;
+}
+
+Status ICCReader::Init(BitReader* reader, size_t output_limit) {
+  JXL_RETURN_IF_ERROR(CheckEOI(reader));
+  used_bits_base_ = reader->TotalBitsConsumed();
+  if (bits_to_skip_ == 0) {
+    enc_size_ = U64Coder::Read(reader);
+    if (enc_size_ > 268435456) {
+      // Avoid too large memory allocation for invalid file.
+      return JXL_FAILURE("Too large encoded profile");
+    }
+    JXL_RETURN_IF_ERROR(
+        DecodeHistograms(reader, kNumICCContexts, &code_, &context_map_));
+    ans_reader_ = ANSSymbolReader(&code_, reader);
+    i_ = 0;
+    decompressed_.resize(std::min<size_t>(i_ + 0x400, enc_size_));
+    for (; i_ < std::min<size_t>(2, enc_size_); i_++) {
+      decompressed_[i_] = ans_reader_.ReadHybridUint(
+          ICCANSContext(i_, i_ > 0 ? decompressed_[i_ - 1] : 0,
+                        i_ > 1 ? decompressed_[i_ - 2] : 0),
+          reader, context_map_);
+    }
+    if (enc_size_ > kPreambleSize) {
+      for (; i_ < kPreambleSize; i_++) {
+        decompressed_[i_] = ans_reader_.ReadHybridUint(
+            ICCANSContext(i_, decompressed_[i_ - 1], decompressed_[i_ - 2]),
+            reader, context_map_);
+      }
+      JXL_RETURN_IF_ERROR(CheckEOI(reader));
+      JXL_RETURN_IF_ERROR(
+          CheckPreamble(decompressed_, enc_size_, output_limit));
+    }
+    bits_to_skip_ = reader->TotalBitsConsumed() - used_bits_base_;
+  } else {
+    reader->SkipBits(bits_to_skip_);
+  }
+  return true;
+}
+
+Status ICCReader::Process(BitReader* reader, PaddedBytes* icc) {
+  ANSSymbolReader::Checkpoint checkpoint;
+  size_t saved_i = 0;
+  auto save = [&]() {
+    ans_reader_.Save(&checkpoint);
+    bits_to_skip_ = reader->TotalBitsConsumed() - used_bits_base_;
+    saved_i = i_;
+  };
+  save();
+  auto check_and_restore = [&]() {
+    Status status = CheckEOI(reader);
+    if (!status) {
+      // not enough bytes.
+      ans_reader_.Restore(checkpoint);
+      i_ = saved_i;
+      return status;
+    }
+    return Status(true);
+  };
+  for (; i_ < enc_size_; i_++) {
+    if (i_ % ANSSymbolReader::kMaxCheckpointInterval == 0 && i_ > 0) {
+      JXL_RETURN_IF_ERROR(check_and_restore());
+      save();
+      if ((i_ > 0) && (((i_ & 0xFFFF) == 0))) {
+        float used_bytes =
+            (reader->TotalBitsConsumed() - used_bits_base_) / 8.0f;
+        if (i_ > used_bytes * 256) return JXL_FAILURE("Corrupted stream");
+      }
+      decompressed_.resize(std::min<size_t>(i_ + 0x400, enc_size_));
+    }
+    JXL_DASSERT(i_ >= 2);
+    decompressed_[i_] = ans_reader_.ReadHybridUint(
+        ICCANSContext(i_, decompressed_[i_ - 1], decompressed_[i_ - 2]), reader,
+        context_map_);
+  }
+  JXL_RETURN_IF_ERROR(check_and_restore());
+  bits_to_skip_ = reader->TotalBitsConsumed() - used_bits_base_;
+  if (!ans_reader_.CheckANSFinalState()) {
+    return JXL_FAILURE("Corrupted ICC profile");
+  }
+
+  icc->clear();
+  return UnpredictICC(decompressed_.data(), decompressed_.size(), icc);
+}
+
+Status ICCReader::CheckEOI(BitReader* reader) {
+  if (reader->AllReadsWithinBounds()) return true;
+  return JXL_STATUS(StatusCode::kNotEnoughBytes,
+                    "Not enough bytes for reading ICC profile");
+}
+
+Status ReadICC(BitReader* JXL_RESTRICT reader, PaddedBytes* JXL_RESTRICT icc,
+               size_t output_limit) {
+  ICCReader icc_reader;
+  JXL_RETURN_IF_ERROR(icc_reader.Init(reader, output_limit));
+  JXL_RETURN_IF_ERROR(icc_reader.Process(reader, icc));
+  return true;
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/icc_codec.h b/third_party/jpeg-xl/lib/jxl/icc_codec.h
new file mode 100644
index 0000000000..a6c7477c60
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/icc_codec.h
@@ -0,0 +1,57 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ICC_CODEC_H_
+#define LIB_JXL_ICC_CODEC_H_
+
+// Compressed representation of ICC profiles.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/padded_bytes.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/dec_ans.h"
+#include "lib/jxl/dec_bit_reader.h"
+
+namespace jxl {
+
+struct ICCReader {
+  Status Init(BitReader* reader, size_t output_limit);
+  Status Process(BitReader* reader, PaddedBytes* icc);
+  void Reset() {
+    bits_to_skip_ = 0;
+    decompressed_.clear();
+  }
+
+ private:
+  Status CheckEOI(BitReader* reader);
+  size_t i_ = 0;
+  size_t bits_to_skip_ = 0;
+  size_t used_bits_base_ = 0;
+  uint64_t enc_size_ = 0;
+  std::vector<uint8_t> context_map_;
+  ANSCode code_;
+  ANSSymbolReader ans_reader_;
+  PaddedBytes decompressed_;
+};
+
+// `icc` may be empty afterwards - if so, call CreateProfile. Does not append,
+// clears any original data that was in icc.
+// If `output_limit` is not 0, then returns error if resulting profile would be
+// longer than `output_limit`
+Status ReadICC(BitReader* JXL_RESTRICT reader, PaddedBytes* JXL_RESTRICT icc,
+               size_t output_limit = 0);
+
+// Exposed only for testing
+Status PredictICC(const uint8_t* icc, size_t size, PaddedBytes* result);
+
+// Exposed only for testing
+Status UnpredictICC(const uint8_t* enc, size_t size, PaddedBytes* result);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ICC_CODEC_H_
diff --git a/third_party/jpeg-xl/lib/jxl/icc_codec_common.cc b/third_party/jpeg-xl/lib/jxl/icc_codec_common.cc
new file mode 100644
index 0000000000..212387e78f
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/icc_codec_common.cc
@@ -0,0 +1,190 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/icc_codec_common.h"
+
+#include <stdint.h>
+
+#include <map>
+#include <string>
+#include <vector>
+
+#include "lib/jxl/base/byte_order.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/fields.h"
+
+namespace jxl {
+namespace {
+static uint8_t ByteKind1(uint8_t b) {
+  if ('a' <= b && b <= 'z') return 0;
+  if ('A' <= b && b <= 'Z') return 0;
+  if ('0' <= b && b <= '9') return 1;
+  if (b == '.' || b == ',') return 1;
+  if (b == 0) return 2;
+  if (b == 1) return 3;
+  if (b < 16) return 4;
+  if (b == 255) return 6;
+  if (b > 240) return 5;
+  return 7;
+}
+
+static uint8_t ByteKind2(uint8_t b) {
+  if ('a' <= b && b <= 'z') return 0;
+  if ('A' <= b && b <= 'Z') return 0;
+  if ('0' <= b && b <= '9') return 1;
+  if (b == '.' || b == ',') return 1;
+  if (b < 16) return 2;
+  if (b > 240) return 3;
+  return 4;
+}
+
+template <typename T>
+T PredictValue(T p1, T p2, T p3, int order) {
+  if (order == 0) return p1;
+  if (order == 1) return 2 * p1 - p2;
+  if (order == 2) return 3 * p1 - 3 * p2 + p3;
+  return 0;
+}
+}  // namespace
+
+uint32_t DecodeUint32(const uint8_t* data, size_t size, size_t pos) {
+  return pos + 4 > size ? 0 : LoadBE32(data + pos);
+}
+
+void EncodeUint32(size_t pos, uint32_t value, PaddedBytes* data) {
+  if (pos + 4 > data->size()) return;
+  StoreBE32(value, data->data() + pos);
+}
+
+void AppendUint32(uint32_t value, PaddedBytes* data) {
+  data->resize(data->size() + 4);
+  EncodeUint32(data->size() - 4, value, data);
+}
+
+typedef std::array<uint8_t, 4> Tag;
+
+Tag DecodeKeyword(const uint8_t* data, size_t size, size_t pos) {
+  if (pos + 4 > size) return {{' ', ' ', ' ', ' '}};
+  return {{data[pos], data[pos + 1], data[pos + 2], data[pos + 3]}};
+}
+
+void EncodeKeyword(const Tag& keyword, uint8_t* data, size_t size, size_t pos) {
+  if (keyword.size() != 4 || pos + 3 >= size) return;
+  for (size_t i = 0; i < 4; ++i) data[pos + i] = keyword[i];
+}
+
+void AppendKeyword(const Tag& keyword, PaddedBytes* data) {
+  JXL_ASSERT(keyword.size() == 4);
+  data->append(keyword);
+}
+
+// Checks if a + b > size, taking possible integer overflow into account.
+Status CheckOutOfBounds(size_t a, size_t b, size_t size) {
+  size_t pos = a + b;
+  if (pos > size) return JXL_FAILURE("Out of bounds");
+  if (pos < a) return JXL_FAILURE("Out of bounds");  // overflow happened
+  return true;
+}
+
+Status CheckIs32Bit(uint64_t v) {
+  static constexpr const uint64_t kUpper32 = ~static_cast<uint64_t>(0xFFFFFFFF);
+  if ((v & kUpper32) != 0) return JXL_FAILURE("32-bit value expected");
+  return true;
+}
+
+PaddedBytes ICCInitialHeaderPrediction() {
+  PaddedBytes result(kICCHeaderSize);
+  for (size_t i = 0; i < kICCHeaderSize; i++) {
+    result[i] = 0;
+  }
+  result[8] = 4;
+  EncodeKeyword(kMntrTag, result.data(), result.size(), 12);
+  EncodeKeyword(kRgb_Tag, result.data(), result.size(), 16);
+  EncodeKeyword(kXyz_Tag, result.data(), result.size(), 20);
+  EncodeKeyword(kAcspTag, result.data(), result.size(), 36);
+  result[68] = 0;
+  result[69] = 0;
+  result[70] = 246;
+  result[71] = 214;
+  result[72] = 0;
+  result[73] = 1;
+  result[74] = 0;
+  result[75] = 0;
+  result[76] = 0;
+  result[77] = 0;
+  result[78] = 211;
+  result[79] = 45;
+  return result;
+}
+
+void ICCPredictHeader(const uint8_t* icc, size_t size, uint8_t* header,
+                      size_t pos) {
+  if (pos == 8 && size >= 8) {
+    header[80] = icc[4];
+    header[81] = icc[5];
+    header[82] = icc[6];
+    header[83] = icc[7];
+  }
+  if (pos == 41 && size >= 41) {
+    if (icc[40] == 'A') {
+      header[41] = 'P';
+      header[42] = 'P';
+      header[43] = 'L';
+    }
+    if (icc[40] == 'M') {
+      header[41] = 'S';
+      header[42] = 'F';
+      header[43] = 'T';
+    }
+  }
+  if (pos == 42 && size >= 42) {
+    if (icc[40] == 'S' && icc[41] == 'G') {
+      header[42] = 'I';
+      header[43] = ' ';
+    }
+    if (icc[40] == 'S' && icc[41] == 'U') {
+      header[42] = 'N';
+      header[43] = 'W';
+    }
+  }
+}
+
+// Predicts a value with linear prediction of given order (0-2), for integers
+// with width bytes and given stride in bytes between values.
+// The start position is at start + i, and the relevant modulus of i describes
+// which byte of the multi-byte integer is being handled.
+// The value start + i must be at least stride * 4.
+uint8_t LinearPredictICCValue(const uint8_t* data, size_t start, size_t i,
+                              size_t stride, size_t width, int order) {
+  size_t pos = start + i;
+  if (width == 1) {
+    uint8_t p1 = data[pos - stride];
+    uint8_t p2 = data[pos - stride * 2];
+    uint8_t p3 = data[pos - stride * 3];
+    return PredictValue(p1, p2, p3, order);
+  } else if (width == 2) {
+    size_t p = start + (i & ~1);
+    uint16_t p1 = (data[p - stride * 1] << 8) + data[p - stride * 1 + 1];
+    uint16_t p2 = (data[p - stride * 2] << 8) + data[p - stride * 2 + 1];
+    uint16_t p3 = (data[p - stride * 3] << 8) + data[p - stride * 3 + 1];
+    uint16_t pred = PredictValue(p1, p2, p3, order);
+    return (i & 1) ? (pred & 255) : ((pred >> 8) & 255);
+  } else {
+    size_t p = start + (i & ~3);
+    uint32_t p1 = DecodeUint32(data, pos, p - stride);
+    uint32_t p2 = DecodeUint32(data, pos, p - stride * 2);
+    uint32_t p3 = DecodeUint32(data, pos, p - stride * 3);
+    uint32_t pred = PredictValue(p1, p2, p3, order);
+    unsigned shiftbytes = 3 - (i & 3);
+    return (pred >> (shiftbytes * 8)) & 255;
+  }
+}
+
+size_t ICCANSContext(size_t i, size_t b1, size_t b2) {
+  if (i <= 128) return 0;
+  return 1 + ByteKind1(b1) + ByteKind2(b2) * 8;
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/icc_codec_common.h b/third_party/jpeg-xl/lib/jxl/icc_codec_common.h
new file mode 100644
index 0000000000..e91e908669
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/icc_codec_common.h
@@ -0,0 +1,106 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ICC_CODEC_COMMON_H_
+#define LIB_JXL_ICC_CODEC_COMMON_H_
+
+// Compressed representation of ICC profiles.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <array>
+
+#include "lib/jxl/base/padded_bytes.h"
+#include "lib/jxl/base/status.h"
+
+namespace jxl {
+
+static constexpr size_t kICCHeaderSize = 128;
+
+typedef std::array<uint8_t, 4> Tag;
+
+static const Tag kAcspTag = {{'a', 'c', 's', 'p'}};
+static const Tag kBkptTag = {{'b', 'k', 'p', 't'}};
+static const Tag kBtrcTag = {{'b', 'T', 'R', 'C'}};
+static const Tag kBxyzTag = {{'b', 'X', 'Y', 'Z'}};
+static const Tag kChadTag = {{'c', 'h', 'a', 'd'}};
+static const Tag kChrmTag = {{'c', 'h', 'r', 'm'}};
+static const Tag kCprtTag = {{'c', 'p', 'r', 't'}};
+static const Tag kCurvTag = {{'c', 'u', 'r', 'v'}};
+static const Tag kDescTag = {{'d', 'e', 's', 'c'}};
+static const Tag kDmddTag = {{'d', 'm', 'd', 'd'}};
+static const Tag kDmndTag = {{'d', 'm', 'n', 'd'}};
+static const Tag kGbd_Tag = {{'g', 'b', 'd', ' '}};
+static const Tag kGtrcTag = {{'g', 'T', 'R', 'C'}};
+static const Tag kGxyzTag = {{'g', 'X', 'Y', 'Z'}};
+static const Tag kKtrcTag = {{'k', 'T', 'R', 'C'}};
+static const Tag kKxyzTag = {{'k', 'X', 'Y', 'Z'}};
+static const Tag kLumiTag = {{'l', 'u', 'm', 'i'}};
+static const Tag kMab_Tag = {{'m', 'A', 'B', ' '}};
+static const Tag kMba_Tag = {{'m', 'B', 'A', ' '}};
+static const Tag kMlucTag = {{'m', 'l', 'u', 'c'}};
+static const Tag kMntrTag = {{'m', 'n', 't', 'r'}};
+static const Tag kParaTag = {{'p', 'a', 'r', 'a'}};
+static const Tag kRgb_Tag = {{'R', 'G', 'B', ' '}};
+static const Tag kRtrcTag = {{'r', 'T', 'R', 'C'}};
+static const Tag kRxyzTag = {{'r', 'X', 'Y', 'Z'}};
+static const Tag kSf32Tag = {{'s', 'f', '3', '2'}};
+static const Tag kTextTag = {{'t', 'e', 'x', 't'}};
+static const Tag kVcgtTag = {{'v', 'c', 'g', 't'}};
+static const Tag kWtptTag = {{'w', 't', 'p', 't'}};
+static const Tag kXyz_Tag = {{'X', 'Y', 'Z', ' '}};
+
+// Tag names focused on RGB and GRAY monitor profiles
+static constexpr size_t kNumTagStrings = 17;
+static constexpr const Tag* kTagStrings[kNumTagStrings] = {
+    &kCprtTag, &kWtptTag, &kBkptTag, &kRxyzTag, &kGxyzTag, &kBxyzTag,
+    &kKxyzTag, &kRtrcTag, &kGtrcTag, &kBtrcTag, &kKtrcTag, &kChadTag,
+    &kDescTag, &kChrmTag, &kDmndTag, &kDmddTag, &kLumiTag};
+
+static constexpr size_t kCommandTagUnknown = 1;
+static constexpr size_t kCommandTagTRC = 2;
+static constexpr size_t kCommandTagXYZ = 3;
+static constexpr size_t kCommandTagStringFirst = 4;
+
+// Tag types focused on RGB and GRAY monitor profiles
+static constexpr size_t kNumTypeStrings = 8;
+static constexpr const Tag* kTypeStrings[kNumTypeStrings] = {
+    &kXyz_Tag, &kDescTag, &kTextTag, &kMlucTag,
+    &kParaTag, &kCurvTag, &kSf32Tag, &kGbd_Tag};
+
+static constexpr size_t kCommandInsert = 1;
+static constexpr size_t kCommandShuffle2 = 2;
+static constexpr size_t kCommandShuffle4 = 3;
+static constexpr size_t kCommandPredict = 4;
+static constexpr size_t kCommandXYZ = 10;
+static constexpr size_t kCommandTypeStartFirst = 16;
+
+static constexpr size_t kFlagBitOffset = 64;
+static constexpr size_t kFlagBitSize = 128;
+
+static constexpr size_t kNumICCContexts = 41;
+
+uint32_t DecodeUint32(const uint8_t* data, size_t size, size_t pos);
+void EncodeUint32(size_t pos, uint32_t value, PaddedBytes* data);
+void AppendUint32(uint32_t value, PaddedBytes* data);
+Tag DecodeKeyword(const uint8_t* data, size_t size, size_t pos);
+void EncodeKeyword(const Tag& keyword, uint8_t* data, size_t size, size_t pos);
+void AppendKeyword(const Tag& keyword, PaddedBytes* data);
+
+// Checks if a + b > size, taking possible integer overflow into account.
+Status CheckOutOfBounds(size_t a, size_t b, size_t size);
+Status CheckIs32Bit(uint64_t v);
+
+PaddedBytes ICCInitialHeaderPrediction();
+void ICCPredictHeader(const uint8_t* icc, size_t size, uint8_t* header,
+                      size_t pos);
+uint8_t LinearPredictICCValue(const uint8_t* data, size_t start, size_t i,
+                              size_t stride, size_t width, int order);
+size_t ICCANSContext(size_t i, size_t b1, size_t b2);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ICC_CODEC_COMMON_H_
diff --git a/third_party/jpeg-xl/lib/jxl/icc_codec_test.cc b/third_party/jpeg-xl/lib/jxl/icc_codec_test.cc
new file mode 100644
index 0000000000..af02094e99
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/icc_codec_test.cc
@@ -0,0 +1,207 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/icc_codec.h"
+
+#include <string>
+
+#include "lib/jxl/base/span.h"
+#include "lib/jxl/enc_icc_codec.h"
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+namespace {
+
+void TestProfile(const PaddedBytes& icc) {
+  BitWriter writer;
+  ASSERT_TRUE(WriteICC(icc, &writer, 0, nullptr));
+  writer.ZeroPadToByte();
+  PaddedBytes dec;
+  BitReader reader(writer.GetSpan());
+  ASSERT_TRUE(ReadICC(&reader, &dec));
+  ASSERT_TRUE(reader.Close());
+  EXPECT_EQ(icc.size(), dec.size());
+  if (icc.size() == dec.size()) {
+    for (size_t i = 0; i < icc.size(); i++) {
+      EXPECT_EQ(icc[i], dec[i]);
+      if (icc[i] != dec[i]) break;  // One output is enough
+    }
+  }
+}
+
+void TestProfile(const std::string& icc) {
+  PaddedBytes bytes(icc.size());
+  for (size_t i = 0; i < icc.size(); i++) {
+    bytes[i] = icc[i];
+  }
+  TestProfile(bytes);
+}
+
+// Valid profile from one of the images output by the decoder.
+static const unsigned char kTestProfile[] = {
+    0x00, 0x00, 0x03, 0x80, 0x6c, 0x63, 0x6d, 0x73, 0x04, 0x30, 0x00, 0x00,
+    0x6d, 0x6e, 0x74, 0x72, 0x52, 0x47, 0x42, 0x20, 0x58, 0x59, 0x5a, 0x20,
+    0x07, 0xe3, 0x00, 0x04, 0x00, 0x1d, 0x00, 0x0f, 0x00, 0x32, 0x00, 0x2e,
+    0x61, 0x63, 0x73, 0x70, 0x41, 0x50, 0x50, 0x4c, 0x00, 0x00, 0x00, 0x01,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0xf6, 0xd6,
+    0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0xd3, 0x2d, 0x6c, 0x63, 0x6d, 0x73,
+    0x5f, 0x07, 0x0d, 0x3e, 0x4d, 0x32, 0xf2, 0x6e, 0x5d, 0x77, 0x26, 0xcc,
+    0x23, 0xb0, 0x6a, 0x15, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0d,
+    0x64, 0x65, 0x73, 0x63, 0x00, 0x00, 0x01, 0x20, 0x00, 0x00, 0x00, 0x42,
+    0x63, 0x70, 0x72, 0x74, 0x00, 0x00, 0x01, 0x64, 0x00, 0x00, 0x01, 0x00,
+    0x77, 0x74, 0x70, 0x74, 0x00, 0x00, 0x02, 0x64, 0x00, 0x00, 0x00, 0x14,
+    0x63, 0x68, 0x61, 0x64, 0x00, 0x00, 0x02, 0x78, 0x00, 0x00, 0x00, 0x2c,
+    0x72, 0x58, 0x59, 0x5a, 0x00, 0x00, 0x02, 0xa4, 0x00, 0x00, 0x00, 0x14,
+    0x62, 0x58, 0x59, 0x5a, 0x00, 0x00, 0x02, 0xb8, 0x00, 0x00, 0x00, 0x14,
+    0x67, 0x58, 0x59, 0x5a, 0x00, 0x00, 0x02, 0xcc, 0x00, 0x00, 0x00, 0x14,
+    0x72, 0x54, 0x52, 0x43, 0x00, 0x00, 0x02, 0xe0, 0x00, 0x00, 0x00, 0x20,
+    0x67, 0x54, 0x52, 0x43, 0x00, 0x00, 0x02, 0xe0, 0x00, 0x00, 0x00, 0x20,
+    0x62, 0x54, 0x52, 0x43, 0x00, 0x00, 0x02, 0xe0, 0x00, 0x00, 0x00, 0x20,
+    0x63, 0x68, 0x72, 0x6d, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x24,
+    0x64, 0x6d, 0x6e, 0x64, 0x00, 0x00, 0x03, 0x24, 0x00, 0x00, 0x00, 0x28,
+    0x64, 0x6d, 0x64, 0x64, 0x00, 0x00, 0x03, 0x4c, 0x00, 0x00, 0x00, 0x32,
+    0x6d, 0x6c, 0x75, 0x63, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01,
+    0x00, 0x00, 0x00, 0x0c, 0x65, 0x6e, 0x55, 0x53, 0x00, 0x00, 0x00, 0x26,
+    0x00, 0x00, 0x00, 0x1c, 0x00, 0x52, 0x00, 0x47, 0x00, 0x42, 0x00, 0x5f,
+    0x00, 0x44, 0x00, 0x36, 0x00, 0x35, 0x00, 0x5f, 0x00, 0x53, 0x00, 0x52,
+    0x00, 0x47, 0x00, 0x5f, 0x00, 0x52, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x5f,
+    0x00, 0x37, 0x00, 0x30, 0x00, 0x39, 0x00, 0x00, 0x6d, 0x6c, 0x75, 0x63,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0c,
+    0x65, 0x6e, 0x55, 0x53, 0x00, 0x00, 0x00, 0xe4, 0x00, 0x00, 0x00, 0x1c,
+    0x00, 0x43, 0x00, 0x6f, 0x00, 0x70, 0x00, 0x79, 0x00, 0x72, 0x00, 0x69,
+    0x00, 0x67, 0x00, 0x68, 0x00, 0x74, 0x00, 0x20, 0x00, 0x32, 0x00, 0x30,
+    0x00, 0x31, 0x00, 0x38, 0x00, 0x20, 0x00, 0x47, 0x00, 0x6f, 0x00, 0x6f,
+    0x00, 0x67, 0x00, 0x6c, 0x00, 0x65, 0x00, 0x20, 0x00, 0x4c, 0x00, 0x4c,
+    0x00, 0x43, 0x00, 0x2c, 0x00, 0x20, 0x00, 0x43, 0x00, 0x43, 0x00, 0x2d,
+    0x00, 0x42, 0x00, 0x59, 0x00, 0x2d, 0x00, 0x53, 0x00, 0x41, 0x00, 0x20,
+    0x00, 0x33, 0x00, 0x2e, 0x00, 0x30, 0x00, 0x20, 0x00, 0x55, 0x00, 0x6e,
+    0x00, 0x70, 0x00, 0x6f, 0x00, 0x72, 0x00, 0x74, 0x00, 0x65, 0x00, 0x64,
+    0x00, 0x20, 0x00, 0x6c, 0x00, 0x69, 0x00, 0x63, 0x00, 0x65, 0x00, 0x6e,
+    0x00, 0x73, 0x00, 0x65, 0x00, 0x28, 0x00, 0x68, 0x00, 0x74, 0x00, 0x74,
+    0x00, 0x70, 0x00, 0x73, 0x00, 0x3a, 0x00, 0x2f, 0x00, 0x2f, 0x00, 0x63,
+    0x00, 0x72, 0x00, 0x65, 0x00, 0x61, 0x00, 0x74, 0x00, 0x69, 0x00, 0x76,
+    0x00, 0x65, 0x00, 0x63, 0x00, 0x6f, 0x00, 0x6d, 0x00, 0x6d, 0x00, 0x6f,
+    0x00, 0x6e, 0x00, 0x73, 0x00, 0x2e, 0x00, 0x6f, 0x00, 0x72, 0x00, 0x67,
+    0x00, 0x2f, 0x00, 0x6c, 0x00, 0x69, 0x00, 0x63, 0x00, 0x65, 0x00, 0x6e,
+    0x00, 0x73, 0x00, 0x65, 0x00, 0x73, 0x00, 0x2f, 0x00, 0x62, 0x00, 0x79,
+    0x00, 0x2d, 0x00, 0x73, 0x00, 0x61, 0x00, 0x2f, 0x00, 0x33, 0x00, 0x2e,
+    0x00, 0x30, 0x00, 0x2f, 0x00, 0x6c, 0x00, 0x65, 0x00, 0x67, 0x00, 0x61,
+    0x00, 0x6c, 0x00, 0x63, 0x00, 0x6f, 0x00, 0x64, 0x00, 0x65, 0x00, 0x29,
+    0x58, 0x59, 0x5a, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf6, 0xd6,
+    0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0xd3, 0x2d, 0x73, 0x66, 0x33, 0x32,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x0c, 0x42, 0x00, 0x00, 0x05, 0xde,
+    0xff, 0xff, 0xf3, 0x25, 0x00, 0x00, 0x07, 0x93, 0x00, 0x00, 0xfd, 0x90,
+    0xff, 0xff, 0xfb, 0xa1, 0xff, 0xff, 0xfd, 0xa2, 0x00, 0x00, 0x03, 0xdc,
+    0x00, 0x00, 0xc0, 0x6e, 0x58, 0x59, 0x5a, 0x20, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x6f, 0xa0, 0x00, 0x00, 0x38, 0xf5, 0x00, 0x00, 0x03, 0x90,
+    0x58, 0x59, 0x5a, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x24, 0x9f,
+    0x00, 0x00, 0x0f, 0x84, 0x00, 0x00, 0xb6, 0xc4, 0x58, 0x59, 0x5a, 0x20,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x62, 0x97, 0x00, 0x00, 0xb7, 0x87,
+    0x00, 0x00, 0x18, 0xd9, 0x70, 0x61, 0x72, 0x61, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x38, 0xe4, 0x00, 0x00, 0xe8, 0xf0,
+    0x00, 0x00, 0x17, 0x10, 0x00, 0x00, 0x38, 0xe4, 0x00, 0x00, 0x14, 0xbc,
+    0x63, 0x68, 0x72, 0x6d, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00,
+    0x00, 0x00, 0xa3, 0xd7, 0x00, 0x00, 0x54, 0x7c, 0x00, 0x00, 0x4c, 0xcd,
+    0x00, 0x00, 0x99, 0x9a, 0x00, 0x00, 0x26, 0x67, 0x00, 0x00, 0x0f, 0x5c,
+    0x6d, 0x6c, 0x75, 0x63, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01,
+    0x00, 0x00, 0x00, 0x0c, 0x65, 0x6e, 0x55, 0x53, 0x00, 0x00, 0x00, 0x0c,
+    0x00, 0x00, 0x00, 0x1c, 0x00, 0x47, 0x00, 0x6f, 0x00, 0x6f, 0x00, 0x67,
+    0x00, 0x6c, 0x00, 0x65, 0x6d, 0x6c, 0x75, 0x63, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0c, 0x65, 0x6e, 0x55, 0x53,
+    0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x49, 0x00, 0x6d,
+    0x00, 0x61, 0x00, 0x67, 0x00, 0x65, 0x00, 0x20, 0x00, 0x63, 0x00, 0x6f,
+    0x00, 0x64, 0x00, 0x65, 0x00, 0x63, 0x00, 0x00,
+};
+
+}  // namespace
+
+TEST(IccCodecTest, Icc) {
+  // Empty string cannot be tested, encoder checks against writing it.
+  TestProfile("a");
+  TestProfile("ab");
+  TestProfile("aaaa");
+
+  {
+    // Exactly the ICC header size
+    PaddedBytes profile(128);
+    for (size_t i = 0; i < 128; i++) {
+      profile[i] = 0;
+    }
+    TestProfile(profile);
+  }
+
+  {
+    PaddedBytes profile;
+    profile.append(kTestProfile, kTestProfile + sizeof(kTestProfile));
+    TestProfile(profile);
+  }
+
+  // Test substrings of full profile
+  {
+    PaddedBytes profile;
+    for (size_t i = 0; i <= 256; i++) {
+      profile.push_back(kTestProfile[i]);
+      TestProfile(profile);
+    }
+  }
+}
+
+// kTestProfile after encoding with the ICC codec
+static const unsigned char kEncodedTestProfile[] = {
+    0x1f, 0x8b, 0x1,  0x13, 0x10, 0x0,  0x0,  0x0,  0x20, 0x4c, 0xcc, 0x3,
+    0xe7, 0xa0, 0xa5, 0xa2, 0x90, 0xa4, 0x27, 0xe8, 0x79, 0x1d, 0xe3, 0x26,
+    0x57, 0x54, 0xef, 0x0,  0xe8, 0x97, 0x2,  0xce, 0xa1, 0xd7, 0x85, 0x16,
+    0xb4, 0x29, 0x94, 0x58, 0xf2, 0x56, 0xc0, 0x76, 0xea, 0x23, 0xec, 0x7c,
+    0x73, 0x51, 0x41, 0x40, 0x23, 0x21, 0x95, 0x4,  0x75, 0x12, 0xc9, 0xcc,
+    0x16, 0xbd, 0xb6, 0x99, 0xad, 0xf8, 0x75, 0x35, 0xb6, 0x42, 0xae, 0xae,
+    0xae, 0x86, 0x56, 0xf8, 0xcc, 0x16, 0x30, 0xb3, 0x45, 0xad, 0xd,  0x40,
+    0xd6, 0xd1, 0xd6, 0x99, 0x40, 0xbe, 0xe2, 0xdc, 0x31, 0x7,  0xa6, 0xb9,
+    0x27, 0x92, 0x38, 0x0,  0x3,  0x5e, 0x2c, 0xbe, 0xe6, 0xfb, 0x19, 0xbf,
+    0xf3, 0x6d, 0xbc, 0x4d, 0x64, 0xe5, 0xba, 0x76, 0xde, 0x31, 0x65, 0x66,
+    0x14, 0xa6, 0x3a, 0xc5, 0x8f, 0xb1, 0xb4, 0xba, 0x1f, 0xb1, 0xb8, 0xd4,
+    0x75, 0xba, 0x18, 0x86, 0x95, 0x3c, 0x26, 0xf6, 0x25, 0x62, 0x53, 0xfd,
+    0x9c, 0x94, 0x76, 0xf6, 0x95, 0x2c, 0xb1, 0xfd, 0xdc, 0xc0, 0xe4, 0x3f,
+    0xb3, 0xff, 0x67, 0xde, 0xd5, 0x94, 0xcc, 0xb0, 0x83, 0x2f, 0x28, 0x93,
+    0x92, 0x3,  0xa1, 0x41, 0x64, 0x60, 0x62, 0x70, 0x80, 0x87, 0xaf, 0xe7,
+    0x60, 0x4a, 0x20, 0x23, 0xb3, 0x11, 0x7,  0x38, 0x38, 0xd4, 0xa,  0x66,
+    0xb5, 0x93, 0x41, 0x90, 0x19, 0x17, 0x18, 0x60, 0xa5, 0xb,  0x7a, 0x24,
+    0xaa, 0x20, 0x81, 0xac, 0xa9, 0xa1, 0x70, 0xa6, 0x12, 0x8a, 0x4a, 0xa3,
+    0xa0, 0xf9, 0x9a, 0x97, 0xe7, 0xa8, 0xac, 0x8,  0xa8, 0xc4, 0x2a, 0x86,
+    0xa7, 0x69, 0x1e, 0x67, 0xe6, 0xbe, 0xa4, 0xd3, 0xff, 0x91, 0x61, 0xf6,
+    0x8a, 0xe6, 0xb5, 0xb3, 0x61, 0x9f, 0x19, 0x17, 0x98, 0x27, 0x6b, 0xe9,
+    0x8,  0x98, 0xe1, 0x21, 0x4a, 0x9,  0xb5, 0xd7, 0xca, 0xfa, 0x94, 0xd0,
+    0x69, 0x1a, 0xeb, 0x52, 0x1,  0x4e, 0xf5, 0xf6, 0xdf, 0x7f, 0xe7, 0x29,
+    0x70, 0xee, 0x4,  0xda, 0x2f, 0xa4, 0xff, 0xfe, 0xbb, 0x6f, 0xa8, 0xff,
+    0xfe, 0xdb, 0xaf, 0x8,  0xf6, 0x72, 0xa1, 0x40, 0x5d, 0xf0, 0x2d, 0x8,
+    0x82, 0x5b, 0x87, 0xbd, 0x10, 0x8,  0xe9, 0x7,  0xee, 0x4b, 0x80, 0xda,
+    0x4a, 0x4,  0xc5, 0x5e, 0xa0, 0xb7, 0x1e, 0x60, 0xb0, 0x59, 0x76, 0x60,
+    0xb,  0x2e, 0x19, 0x8a, 0x2e, 0x1c, 0xe6, 0x6,  0x20, 0xb8, 0x64, 0x18,
+    0x2a, 0xcf, 0x51, 0x94, 0xd4, 0xee, 0xc3, 0xfe, 0x39, 0x74, 0xd4, 0x2b,
+    0x48, 0xc9, 0x83, 0x4c, 0x9b, 0xd0, 0x4c, 0x35, 0x10, 0xe3, 0x9,  0xf7,
+    0x72, 0xf0, 0x7a, 0xe,  0xbf, 0x7d, 0x36, 0x2e, 0x19, 0x7e, 0x3f, 0xc,
+    0xf7, 0x93, 0xe7, 0xf4, 0x1d, 0x32, 0xc6, 0xb0, 0x89, 0xad, 0xe0, 0x28,
+    0xc1, 0xa7, 0x59, 0xe3, 0x0,
+};
+
+// Tests that the decoded kEncodedTestProfile matches kTestProfile.
+TEST(IccCodecTest, EncodedIccProfile) {
+  jxl::BitReader reader(jxl::Span<const uint8_t>(kEncodedTestProfile,
+                                                 sizeof(kEncodedTestProfile)));
+  jxl::PaddedBytes dec;
+  ASSERT_TRUE(ReadICC(&reader, &dec));
+  ASSERT_TRUE(reader.Close());
+  EXPECT_EQ(sizeof(kTestProfile), dec.size());
+  if (sizeof(kTestProfile) == dec.size()) {
+    for (size_t i = 0; i < dec.size(); i++) {
+      EXPECT_EQ(kTestProfile[i], dec[i]);
+      if (kTestProfile[i] != dec[i]) break;  // One output is enough
+    }
+  }
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/image.cc b/third_party/jpeg-xl/lib/jxl/image.cc
new file mode 100644
index 0000000000..3faff6aefb
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/image.cc
@@ -0,0 +1,251 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/image.h"
+
+#include <algorithm>  // swap
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/image.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/base/profiler.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/image_ops.h"
+#include "lib/jxl/sanitizers.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+
+namespace HWY_NAMESPACE {
+size_t GetVectorSize() { return HWY_LANES(uint8_t); }
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+namespace {
+
+HWY_EXPORT(GetVectorSize);  // Local function.
+
+// Returns distance [bytes] between the start of two consecutive rows, a
+// multiple of vector/cache line size but NOT CacheAligned::kAlias - see below.
+size_t BytesPerRow(const size_t xsize, const size_t sizeof_t) {
+  const size_t vec_size = VectorSize();
+  size_t valid_bytes = xsize * sizeof_t;
+
+  // Allow unaligned accesses starting at the last valid value - this may raise
+  // msan errors unless the user calls InitializePaddingForUnalignedAccesses.
+  // Skip for the scalar case because no extra lanes will be loaded.
+  if (vec_size != 0) {
+    valid_bytes += vec_size - sizeof_t;
+  }
+
+  // Round up to vector and cache line size.
+  const size_t align = std::max(vec_size, CacheAligned::kAlignment);
+  size_t bytes_per_row = RoundUpTo(valid_bytes, align);
+
+  // During the lengthy window before writes are committed to memory, CPUs
+  // guard against read after write hazards by checking the address, but
+  // only the lower 11 bits. We avoid a false dependency between writes to
+  // consecutive rows by ensuring their sizes are not multiples of 2 KiB.
+  // Avoid2K prevents the same problem for the planes of an Image3.
+  if (bytes_per_row % CacheAligned::kAlias == 0) {
+    bytes_per_row += align;
+  }
+
+  JXL_ASSERT(bytes_per_row % align == 0);
+  return bytes_per_row;
+}
+
+}  // namespace
+
+size_t VectorSize() {
+  static size_t bytes = HWY_DYNAMIC_DISPATCH(GetVectorSize)();
+  return bytes;
+}
+
+PlaneBase::PlaneBase(const size_t xsize, const size_t ysize,
+                     const size_t sizeof_t)
+    : xsize_(static_cast<uint32_t>(xsize)),
+      ysize_(static_cast<uint32_t>(ysize)),
+      orig_xsize_(static_cast<uint32_t>(xsize)),
+      orig_ysize_(static_cast<uint32_t>(ysize)) {
+  // (Can't profile CacheAligned itself because it is used by profiler.h)
+  PROFILER_FUNC;
+
+  JXL_CHECK(xsize == xsize_);
+  JXL_CHECK(ysize == ysize_);
+
+  JXL_ASSERT(sizeof_t == 1 || sizeof_t == 2 || sizeof_t == 4 || sizeof_t == 8);
+
+  bytes_per_row_ = 0;
+  // Dimensions can be zero, e.g. for lazily-allocated images. Only allocate
+  // if nonzero, because "zero" bytes still have padding/bookkeeping overhead.
+  if (xsize != 0 && ysize != 0) {
+    bytes_per_row_ = BytesPerRow(xsize, sizeof_t);
+    bytes_ = AllocateArray(bytes_per_row_ * ysize);
+    JXL_CHECK(bytes_.get());
+    InitializePadding(sizeof_t, Padding::kRoundUp);
+  }
+}
+
+void PlaneBase::InitializePadding(const size_t sizeof_t, Padding padding) {
+#if defined(MEMORY_SANITIZER) || HWY_IDE
+  if (xsize_ == 0 || ysize_ == 0) return;
+
+  const size_t vec_size = VectorSize();
+  if (vec_size == 0) return;  // Scalar mode: no padding needed
+
+  const size_t valid_size = xsize_ * sizeof_t;
+  const size_t initialize_size = padding == Padding::kRoundUp
+                                     ? RoundUpTo(valid_size, vec_size)
+                                     : valid_size + vec_size - sizeof_t;
+  if (valid_size == initialize_size) return;
+
+  for (size_t y = 0; y < ysize_; ++y) {
+    uint8_t* JXL_RESTRICT row = static_cast<uint8_t*>(VoidRow(y));
+#if defined(__clang__) &&                                           \
+    ((!defined(__apple_build_version__) && __clang_major__ <= 6) || \
+     (defined(__apple_build_version__) &&                           \
+      __apple_build_version__ <= 10001145))
+    // There's a bug in msan in clang-6 when handling AVX2 operations. This
+    // workaround allows tests to pass on msan, although it is slower and
+    // prevents msan warnings from uninitialized images.
+    std::fill(row, msan::kSanitizerSentinelByte, initialize_size);
+#else
+    memset(row + valid_size, msan::kSanitizerSentinelByte,
+           initialize_size - valid_size);
+#endif  // clang6
+  }
+#endif  // MEMORY_SANITIZER
+}
+
+void PlaneBase::Swap(PlaneBase& other) {
+  std::swap(xsize_, other.xsize_);
+  std::swap(ysize_, other.ysize_);
+  std::swap(orig_xsize_, other.orig_xsize_);
+  std::swap(orig_ysize_, other.orig_ysize_);
+  std::swap(bytes_per_row_, other.bytes_per_row_);
+  std::swap(bytes_, other.bytes_);
+}
+
+Image3F PadImageMirror(const Image3F& in, const size_t xborder,
+                       const size_t yborder) {
+  size_t xsize = in.xsize();
+  size_t ysize = in.ysize();
+  Image3F out(xsize + 2 * xborder, ysize + 2 * yborder);
+  if (xborder > xsize || yborder > ysize) {
+    for (size_t c = 0; c < 3; c++) {
+      for (int32_t y = 0; y < static_cast<int32_t>(out.ysize()); y++) {
+        float* row_out = out.PlaneRow(c, y);
+        const float* row_in = in.PlaneRow(
+            c, Mirror(y - static_cast<int32_t>(yborder), in.ysize()));
+        for (int32_t x = 0; x < static_cast<int32_t>(out.xsize()); x++) {
+          int32_t xin = Mirror(x - static_cast<int32_t>(xborder), in.xsize());
+          row_out[x] = row_in[xin];
+        }
+      }
+    }
+    return out;
+  }
+  CopyImageTo(in, Rect(xborder, yborder, xsize, ysize), &out);
+  for (size_t c = 0; c < 3; c++) {
+    // Horizontal pad.
+    for (size_t y = 0; y < ysize; y++) {
+      for (size_t x = 0; x < xborder; x++) {
+        out.PlaneRow(c, y + yborder)[x] =
+            in.ConstPlaneRow(c, y)[xborder - x - 1];
+        out.PlaneRow(c, y + yborder)[x + xsize + xborder] =
+            in.ConstPlaneRow(c, y)[xsize - 1 - x];
+      }
+    }
+    // Vertical pad.
+    for (size_t y = 0; y < yborder; y++) {
+      memcpy(out.PlaneRow(c, y), out.ConstPlaneRow(c, 2 * yborder - 1 - y),
+             out.xsize() * sizeof(float));
+      memcpy(out.PlaneRow(c, y + ysize + yborder),
+             out.ConstPlaneRow(c, ysize + yborder - 1 - y),
+             out.xsize() * sizeof(float));
+    }
+  }
+  return out;
+}
+
+void PadImageToBlockMultipleInPlace(Image3F* JXL_RESTRICT in,
+                                    size_t block_dim) {
+  PROFILER_FUNC;
+  const size_t xsize_orig = in->xsize();
+  const size_t ysize_orig = in->ysize();
+  const size_t xsize = RoundUpTo(xsize_orig, block_dim);
+  const size_t ysize = RoundUpTo(ysize_orig, block_dim);
+  // Expands image size to the originally-allocated size.
+  in->ShrinkTo(xsize, ysize);
+  for (size_t c = 0; c < 3; c++) {
+    for (size_t y = 0; y < ysize_orig; y++) {
+      float* JXL_RESTRICT row = in->PlaneRow(c, y);
+      for (size_t x = xsize_orig; x < xsize; x++) {
+        row[x] = row[xsize_orig - 1];
+      }
+    }
+    const float* JXL_RESTRICT row_src = in->ConstPlaneRow(c, ysize_orig - 1);
+    for (size_t y = ysize_orig; y < ysize; y++) {
+      memcpy(in->PlaneRow(c, y), row_src, xsize * sizeof(float));
+    }
+  }
+}
+
+static void DownsampleImage(const ImageF& input, size_t factor,
+                            ImageF* output) {
+  JXL_ASSERT(factor != 1);
+  output->ShrinkTo(DivCeil(input.xsize(), factor),
+                   DivCeil(input.ysize(), factor));
+  size_t in_stride = input.PixelsPerRow();
+  for (size_t y = 0; y < output->ysize(); y++) {
+    float* row_out = output->Row(y);
+    const float* row_in = input.Row(factor * y);
+    for (size_t x = 0; x < output->xsize(); x++) {
+      size_t cnt = 0;
+      float sum = 0;
+      for (size_t iy = 0; iy < factor && iy + factor * y < input.ysize();
+           iy++) {
+        for (size_t ix = 0; ix < factor && ix + factor * x < input.xsize();
+             ix++) {
+          sum += row_in[iy * in_stride + x * factor + ix];
+          cnt++;
+        }
+      }
+      row_out[x] = sum / cnt;
+    }
+  }
+}
+
+void DownsampleImage(ImageF* image, size_t factor) {
+  // Allocate extra space to avoid a reallocation when padding.
+  ImageF downsampled(DivCeil(image->xsize(), factor) + kBlockDim,
+                     DivCeil(image->ysize(), factor) + kBlockDim);
+  DownsampleImage(*image, factor, &downsampled);
+  *image = std::move(downsampled);
+}
+
+void DownsampleImage(Image3F* opsin, size_t factor) {
+  JXL_ASSERT(factor != 1);
+  // Allocate extra space to avoid a reallocation when padding.
+  Image3F downsampled(DivCeil(opsin->xsize(), factor) + kBlockDim,
+                      DivCeil(opsin->ysize(), factor) + kBlockDim);
+  downsampled.ShrinkTo(downsampled.xsize() - kBlockDim,
+                       downsampled.ysize() - kBlockDim);
+  for (size_t c = 0; c < 3; c++) {
+    DownsampleImage(opsin->Plane(c), factor, &downsampled.Plane(c));
+  }
+  *opsin = std::move(downsampled);
+}
+
+}  // namespace jxl
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jxl/image.h b/third_party/jpeg-xl/lib/jxl/image.h
new file mode 100644
index 0000000000..e66534220c
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/image.h
@@ -0,0 +1,497 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_IMAGE_H_
+#define LIB_JXL_IMAGE_H_
+
+// SIMD/multicore-friendly planar image representation with row accessors.
+
+#include <inttypes.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include <algorithm>
+#include <sstream>
+#include <utility>  // std::move
+
+#include "lib/jxl/base/cache_aligned.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/common.h"
+
+namespace jxl {
+
+// Helper function to create rows that are multiples of SIMD vector size.
+size_t VectorSize();
+
+// Type-independent parts of Plane<> - reduces code duplication and facilitates
+// moving member function implementations to cc file.
+struct PlaneBase {
+  PlaneBase()
+      : xsize_(0),
+        ysize_(0),
+        orig_xsize_(0),
+        orig_ysize_(0),
+        bytes_per_row_(0),
+        bytes_(nullptr) {}
+  PlaneBase(size_t xsize, size_t ysize, size_t sizeof_t);
+
+  // Copy construction/assignment is forbidden to avoid inadvertent copies,
+  // which can be very expensive. Use CopyImageTo() instead.
+  PlaneBase(const PlaneBase& other) = delete;
+  PlaneBase& operator=(const PlaneBase& other) = delete;
+
+  // Move constructor (required for returning Image from function)
+  PlaneBase(PlaneBase&& other) noexcept = default;
+
+  // Move assignment (required for std::vector)
+  PlaneBase& operator=(PlaneBase&& other) noexcept = default;
+
+  void Swap(PlaneBase& other);
+
+  // Useful for pre-allocating image with some padding for alignment purposes
+  // and later reporting the actual valid dimensions. May also be used to
+  // un-shrink the image. Caller is responsible for ensuring xsize/ysize are <=
+  // the original dimensions.
+  void ShrinkTo(const size_t xsize, const size_t ysize) {
+    JXL_CHECK(xsize <= orig_xsize_);
+    JXL_CHECK(ysize <= orig_ysize_);
+    xsize_ = static_cast<uint32_t>(xsize);
+    ysize_ = static_cast<uint32_t>(ysize);
+    // NOTE: we can't recompute bytes_per_row for more compact storage and
+    // better locality because that would invalidate the image contents.
+  }
+
+  // How many pixels.
+  JXL_INLINE size_t xsize() const { return xsize_; }
+  JXL_INLINE size_t ysize() const { return ysize_; }
+
+  // NOTE: do not use this for copying rows - the valid xsize may be much less.
+  JXL_INLINE size_t bytes_per_row() const { return bytes_per_row_; }
+
+  // Raw access to byte contents, for interfacing with other libraries.
+  // Unsigned char instead of char to avoid surprises (sign extension).
+  JXL_INLINE uint8_t* bytes() {
+    void* p = bytes_.get();
+    return static_cast<uint8_t * JXL_RESTRICT>(JXL_ASSUME_ALIGNED(p, 64));
+  }
+  JXL_INLINE const uint8_t* bytes() const {
+    const void* p = bytes_.get();
+    return static_cast<const uint8_t * JXL_RESTRICT>(JXL_ASSUME_ALIGNED(p, 64));
+  }
+
+ protected:
+  // Returns pointer to the start of a row.
+  JXL_INLINE void* VoidRow(const size_t y) const {
+#if defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER) || \
+    defined(THREAD_SANITIZER)
+    if (y >= ysize_) {
+      JXL_ABORT("Row(%" PRIu64 ") in (%u x %u) image\n", (uint64_t)y, xsize_,
+                ysize_);
+    }
+#endif
+
+    void* row = bytes_.get() + y * bytes_per_row_;
+    return JXL_ASSUME_ALIGNED(row, 64);
+  }
+
+  enum class Padding {
+    // Allow Load(d, row + x) for x = 0; x < xsize(); x += Lanes(d). Default.
+    kRoundUp,
+    // Allow LoadU(d, row + x) for x = xsize() - 1. This requires an extra
+    // vector to be initialized. If done by default, this would suppress
+    // legitimate msan warnings. We therefore require users to explicitly call
+    // InitializePadding before using unaligned loads (e.g. convolution).
+    kUnaligned
+  };
+
+  // Initializes the minimum bytes required to suppress msan warnings from
+  // legitimate (according to Padding mode) vector loads/stores on the right
+  // border, where some lanes are uninitialized and assumed to be unused.
+  void InitializePadding(size_t sizeof_t, Padding padding);
+
+  // (Members are non-const to enable assignment during move-assignment.)
+  uint32_t xsize_;  // In valid pixels, not including any padding.
+  uint32_t ysize_;
+  uint32_t orig_xsize_;
+  uint32_t orig_ysize_;
+  size_t bytes_per_row_;  // Includes padding.
+  CacheAlignedUniquePtr bytes_;
+};
+
+// Single channel, aligned rows separated by padding. T must be POD.
+//
+// 'Single channel' (one 2D array per channel) simplifies vectorization
+// (repeating the same operation on multiple adjacent components) without the
+// complexity of a hybrid layout (8 R, 8 G, 8 B, ...). In particular, clients
+// can easily iterate over all components in a row and Image requires no
+// knowledge of the pixel format beyond the component type "T".
+//
+// 'Aligned' means each row is aligned to the L1 cache line size. This prevents
+// false sharing between two threads operating on adjacent rows.
+//
+// 'Padding' is still relevant because vectors could potentially be larger than
+// a cache line. By rounding up row sizes to the vector size, we allow
+// reading/writing ALIGNED vectors whose first lane is a valid sample. This
+// avoids needing a separate loop to handle remaining unaligned lanes.
+//
+// This image layout could also be achieved with a vector and a row accessor
+// function, but a class wrapper with support for "deleter" allows wrapping
+// existing memory allocated by clients without copying the pixels. It also
+// provides convenient accessors for xsize/ysize, which shortens function
+// argument lists. Supports move-construction so it can be stored in containers.
+template <typename ComponentType>
+class Plane : public PlaneBase {
+ public:
+  using T = ComponentType;
+  static constexpr size_t kNumPlanes = 1;
+
+  Plane() = default;
+  Plane(const size_t xsize, const size_t ysize)
+      : PlaneBase(xsize, ysize, sizeof(T)) {}
+
+  void InitializePaddingForUnalignedAccesses() {
+    InitializePadding(sizeof(T), Padding::kUnaligned);
+  }
+
+  JXL_INLINE T* Row(const size_t y) { return static_cast<T*>(VoidRow(y)); }
+
+  // Returns pointer to const (see above).
+  JXL_INLINE const T* Row(const size_t y) const {
+    return static_cast<const T*>(VoidRow(y));
+  }
+
+  // Documents that the access is const.
+  JXL_INLINE const T* ConstRow(const size_t y) const {
+    return static_cast<const T*>(VoidRow(y));
+  }
+
+  // Returns number of pixels (some of which are padding) per row. Useful for
+  // computing other rows via pointer arithmetic. WARNING: this must
+  // NOT be used to determine xsize.
+  JXL_INLINE intptr_t PixelsPerRow() const {
+    return static_cast<intptr_t>(bytes_per_row_ / sizeof(T));
+  }
+};
+
+using ImageSB = Plane<int8_t>;
+using ImageB = Plane<uint8_t>;
+using ImageS = Plane<int16_t>;  // signed integer or half-float
+using ImageU = Plane<uint16_t>;
+using ImageI = Plane<int32_t>;
+using ImageF = Plane<float>;
+using ImageD = Plane<double>;
+
+// Also works for Image3 and mixed argument types.
+template <class Image1, class Image2>
+bool SameSize(const Image1& image1, const Image2& image2) {
+  return image1.xsize() == image2.xsize() && image1.ysize() == image2.ysize();
+}
+
+template <typename T>
+class Image3;
+
+// Rectangular region in image(s). Factoring this out of Image instead of
+// shifting the pointer by x0/y0 allows this to apply to multiple images with
+// different resolutions (e.g. color transform and quantization field).
+// Can compare using SameSize(rect1, rect2).
+template <typename T>
+class RectT {
+ public:
+  // Most windows are xsize_max * ysize_max, except those on the borders where
+  // begin + size_max > end.
+  constexpr RectT(T xbegin, T ybegin, size_t xsize_max, size_t ysize_max,
+                  T xend, T yend)
+      : x0_(xbegin),
+        y0_(ybegin),
+        xsize_(ClampedSize(xbegin, xsize_max, xend)),
+        ysize_(ClampedSize(ybegin, ysize_max, yend)) {}
+
+  // Construct with origin and known size (typically from another Rect).
+  constexpr RectT(T xbegin, T ybegin, size_t xsize, size_t ysize)
+      : x0_(xbegin), y0_(ybegin), xsize_(xsize), ysize_(ysize) {}
+
+  // Construct a rect that covers a whole image/plane/ImageBundle etc.
+  template <typename ImageT>
+  explicit RectT(const ImageT& image)
+      : RectT(0, 0, image.xsize(), image.ysize()) {}
+
+  RectT() : RectT(0, 0, 0, 0) {}
+
+  RectT(const RectT&) = default;
+  RectT& operator=(const RectT&) = default;
+
+  // Construct a subrect that resides in an image/plane/ImageBundle etc.
+  template <typename ImageT>
+  RectT Crop(const ImageT& image) const {
+    return Intersection(RectT(image));
+  }
+
+  // Construct a subrect that resides in the [0, ysize) x [0, xsize) region of
+  // the current rect.
+  RectT Crop(size_t area_xsize, size_t area_ysize) const {
+    return Intersection(RectT(0, 0, area_xsize, area_ysize));
+  }
+
+  // Returns a rect that only contains `num` lines with offset `y` from `y0()`.
+  RectT Lines(size_t y, size_t num) const {
+    JXL_DASSERT(y + num <= ysize_);
+    return RectT(x0_, y0_ + y, xsize_, num);
+  }
+
+  RectT Line(size_t y) const { return Lines(y, 1); }
+
+  JXL_MUST_USE_RESULT RectT Intersection(const RectT& other) const {
+    return RectT(std::max(x0_, other.x0_), std::max(y0_, other.y0_), xsize_,
+                 ysize_, std::min(x1(), other.x1()),
+                 std::min(y1(), other.y1()));
+  }
+
+  JXL_MUST_USE_RESULT RectT Translate(int64_t x_offset,
+                                      int64_t y_offset) const {
+    return RectT(x0_ + x_offset, y0_ + y_offset, xsize_, ysize_);
+  }
+
+  template <typename V>
+  V* Row(Plane<V>* image, size_t y) const {
+    JXL_DASSERT(y + y0_ >= 0);
+    return image->Row(y + y0_) + x0_;
+  }
+
+  template <typename V>
+  const V* Row(const Plane<V>* image, size_t y) const {
+    JXL_DASSERT(y + y0_ >= 0);
+    return image->Row(y + y0_) + x0_;
+  }
+
+  template <typename V>
+  V* PlaneRow(Image3<V>* image, const size_t c, size_t y) const {
+    JXL_DASSERT(y + y0_ >= 0);
+    return image->PlaneRow(c, y + y0_) + x0_;
+  }
+
+  template <typename V>
+  const V* ConstRow(const Plane<V>& image, size_t y) const {
+    JXL_DASSERT(y + y0_ >= 0);
+    return image.ConstRow(y + y0_) + x0_;
+  }
+
+  template <typename V>
+  const V* ConstPlaneRow(const Image3<V>& image, size_t c, size_t y) const {
+    JXL_DASSERT(y + y0_ >= 0);
+    return image.ConstPlaneRow(c, y + y0_) + x0_;
+  }
+
+  bool IsInside(const RectT& other) const {
+    return x0_ >= other.x0() && x1() <= other.x1() && y0_ >= other.y0() &&
+           y1() <= other.y1();
+  }
+
+  // Returns true if this Rect fully resides in the given image. ImageT could be
+  // Plane<T> or Image3<T>; however if ImageT is Rect, results are nonsensical.
+  template <class ImageT>
+  bool IsInside(const ImageT& image) const {
+    return IsInside(RectT(image));
+  }
+
+  T x0() const { return x0_; }
+  T y0() const { return y0_; }
+  size_t xsize() const { return xsize_; }
+  size_t ysize() const { return ysize_; }
+  T x1() const { return x0_ + xsize_; }
+  T y1() const { return y0_ + ysize_; }
+
+  RectT<T> ShiftLeft(size_t shiftx, size_t shifty) const {
+    return RectT<T>(x0_ * (1 << shiftx), y0_ * (1 << shifty), xsize_ << shiftx,
+                    ysize_ << shifty);
+  }
+  RectT<T> ShiftLeft(size_t shift) const { return ShiftLeft(shift, shift); }
+
+  // Requires x0(), y0() to be multiples of 1<<shiftx, 1<<shifty.
+  RectT<T> CeilShiftRight(size_t shiftx, size_t shifty) const {
+    JXL_ASSERT(x0_ % (1 << shiftx) == 0);
+    JXL_ASSERT(y0_ % (1 << shifty) == 0);
+    return RectT<T>(x0_ / (1 << shiftx), y0_ / (1 << shifty),
+                    DivCeil(xsize_, T{1} << shiftx),
+                    DivCeil(ysize_, T{1} << shifty));
+  }
+  RectT<T> CeilShiftRight(std::pair<size_t, size_t> shift) const {
+    return CeilShiftRight(shift.first, shift.second);
+  }
+  RectT<T> CeilShiftRight(size_t shift) const {
+    return CeilShiftRight(shift, shift);
+  }
+
+  template <typename U>
+  RectT<U> As() const {
+    return RectT<U>(U(x0_), U(y0_), U(xsize_), U(ysize_));
+  }
+
+ private:
+  // Returns size_max, or whatever is left in [begin, end).
+  static constexpr size_t ClampedSize(T begin, size_t size_max, T end) {
+    return (static_cast<T>(begin + size_max) <= end)
+               ? size_max
+               : (end > begin ? end - begin : 0);
+  }
+
+  T x0_;
+  T y0_;
+
+  size_t xsize_;
+  size_t ysize_;
+};
+
+template <typename T>
+std::string Description(RectT<T> r) {
+  std::ostringstream os;
+  os << "[" << r.x0() << ".." << r.x1() << ")x"
+     << "[" << r.y0() << ".." << r.y1() << ")";
+  return os.str();
+}
+
+using Rect = RectT<size_t>;
+
+// Currently, we abuse Image to either refer to an image that owns its storage
+// or one that doesn't. In similar vein, we abuse Image* function parameters to
+// either mean "assign to me" or "fill the provided image with data".
+// Hopefully, the "assign to me" meaning will go away and most images in the
+// codebase will not be backed by own storage. When this happens we can redesign
+// Image to be a non-storage-holding view class and introduce BackedImage in
+// those places that actually need it.
+
+// NOTE: we can't use Image as a view because invariants are violated
+// (alignment and the presence of padding before/after each "row").
+
+// A bundle of 3 same-sized images. Typically constructed by moving from three
+// rvalue references to Image. To overwrite an existing Image3 using
+// single-channel producers, we also need access to Image*. Constructing
+// temporary non-owning Image pointing to one plane of an existing Image3 risks
+// dangling references, especially if the wrapper is moved. Therefore, we
+// store an array of Image (which are compact enough that size is not a concern)
+// and provide Plane+Row accessors.
+template <typename ComponentType>
+class Image3 {
+ public:
+  using T = ComponentType;
+  using PlaneT = jxl::Plane<T>;
+  static constexpr size_t kNumPlanes = 3;
+
+  Image3() : planes_{PlaneT(), PlaneT(), PlaneT()} {}
+
+  Image3(const size_t xsize, const size_t ysize)
+      : planes_{PlaneT(xsize, ysize), PlaneT(xsize, ysize),
+                PlaneT(xsize, ysize)} {}
+
+  Image3(Image3&& other) noexcept {
+    for (size_t i = 0; i < kNumPlanes; i++) {
+      planes_[i] = std::move(other.planes_[i]);
+    }
+  }
+
+  Image3(PlaneT&& plane0, PlaneT&& plane1, PlaneT&& plane2) {
+    JXL_CHECK(SameSize(plane0, plane1));
+    JXL_CHECK(SameSize(plane0, plane2));
+    planes_[0] = std::move(plane0);
+    planes_[1] = std::move(plane1);
+    planes_[2] = std::move(plane2);
+  }
+
+  // Copy construction/assignment is forbidden to avoid inadvertent copies,
+  // which can be very expensive. Use CopyImageTo instead.
+  Image3(const Image3& other) = delete;
+  Image3& operator=(const Image3& other) = delete;
+
+  Image3& operator=(Image3&& other) noexcept {
+    for (size_t i = 0; i < kNumPlanes; i++) {
+      planes_[i] = std::move(other.planes_[i]);
+    }
+    return *this;
+  }
+
+  // Returns row pointer; usage: PlaneRow(idx_plane, y)[x] = val.
+  JXL_INLINE T* PlaneRow(const size_t c, const size_t y) {
+    // Custom implementation instead of calling planes_[c].Row ensures only a
+    // single multiplication is needed for PlaneRow(0..2, y).
+    PlaneRowBoundsCheck(c, y);
+    const size_t row_offset = y * planes_[0].bytes_per_row();
+    void* row = planes_[c].bytes() + row_offset;
+    return static_cast<T * JXL_RESTRICT>(JXL_ASSUME_ALIGNED(row, 64));
+  }
+
+  // Returns const row pointer; usage: val = PlaneRow(idx_plane, y)[x].
+  JXL_INLINE const T* PlaneRow(const size_t c, const size_t y) const {
+    PlaneRowBoundsCheck(c, y);
+    const size_t row_offset = y * planes_[0].bytes_per_row();
+    const void* row = planes_[c].bytes() + row_offset;
+    return static_cast<const T * JXL_RESTRICT>(JXL_ASSUME_ALIGNED(row, 64));
+  }
+
+  // Returns const row pointer, even if called from a non-const Image3.
+  JXL_INLINE const T* ConstPlaneRow(const size_t c, const size_t y) const {
+    PlaneRowBoundsCheck(c, y);
+    return PlaneRow(c, y);
+  }
+
+  JXL_INLINE const PlaneT& Plane(size_t idx) const { return planes_[idx]; }
+
+  JXL_INLINE PlaneT& Plane(size_t idx) { return planes_[idx]; }
+
+  void Swap(Image3& other) {
+    for (size_t c = 0; c < 3; ++c) {
+      other.planes_[c].Swap(planes_[c]);
+    }
+  }
+
+  // Useful for pre-allocating image with some padding for alignment purposes
+  // and later reporting the actual valid dimensions. May also be used to
+  // un-shrink the image. Caller is responsible for ensuring xsize/ysize are <=
+  // the original dimensions.
+  void ShrinkTo(const size_t xsize, const size_t ysize) {
+    for (PlaneT& plane : planes_) {
+      plane.ShrinkTo(xsize, ysize);
+    }
+  }
+
+  // Sizes of all three images are guaranteed to be equal.
+  JXL_INLINE size_t xsize() const { return planes_[0].xsize(); }
+  JXL_INLINE size_t ysize() const { return planes_[0].ysize(); }
+  // Returns offset [bytes] from one row to the next row of the same plane.
+  // WARNING: this must NOT be used to determine xsize, nor for copying rows -
+  // the valid xsize may be much less.
+  JXL_INLINE size_t bytes_per_row() const { return planes_[0].bytes_per_row(); }
+  // Returns number of pixels (some of which are padding) per row. Useful for
+  // computing other rows via pointer arithmetic. WARNING: this must NOT be used
+  // to determine xsize.
+  JXL_INLINE intptr_t PixelsPerRow() const { return planes_[0].PixelsPerRow(); }
+
+ private:
+  void PlaneRowBoundsCheck(const size_t c, const size_t y) const {
+#if defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER) || \
+    defined(THREAD_SANITIZER)
+    if (c >= kNumPlanes || y >= ysize()) {
+      JXL_ABORT("PlaneRow(%" PRIu64 ", %" PRIu64 ") in (%" PRIu64 " x %" PRIu64
+                ") image\n",
+                static_cast<uint64_t>(c), static_cast<uint64_t>(y),
+                static_cast<uint64_t>(xsize()), static_cast<uint64_t>(ysize()));
+    }
+#endif
+  }
+
+ private:
+  PlaneT planes_[kNumPlanes];
+};
+
+using Image3B = Image3<uint8_t>;
+using Image3S = Image3<int16_t>;
+using Image3U = Image3<uint16_t>;
+using Image3I = Image3<int32_t>;
+using Image3F = Image3<float>;
+using Image3D = Image3<double>;
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_IMAGE_H_
diff --git a/third_party/jpeg-xl/lib/jxl/image_bundle.cc b/third_party/jpeg-xl/lib/jxl/image_bundle.cc
new file mode 100644
index 0000000000..7e7051b608
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/image_bundle.cc
@@ -0,0 +1,125 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/image_bundle.h"
+
+#include <limits>
+#include <utility>
+
+#include "lib/jxl/base/byte_order.h"
+#include "lib/jxl/base/padded_bytes.h"
+#include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/base/profiler.h"
+#include "lib/jxl/color_management.h"
+#include "lib/jxl/fields.h"
+
+namespace jxl {
+
+void ImageBundle::ShrinkTo(size_t xsize, size_t ysize) {
+  if (HasColor()) color_.ShrinkTo(xsize, ysize);
+  for (ImageF& ec : extra_channels_) {
+    ec.ShrinkTo(xsize, ysize);
+  }
+}
+
+// Called by all other SetFrom*.
+void ImageBundle::SetFromImage(Image3F&& color,
+                               const ColorEncoding& c_current) {
+  JXL_CHECK(color.xsize() != 0 && color.ysize() != 0);
+  JXL_CHECK(metadata_->color_encoding.IsGray() == c_current.IsGray());
+  color_ = std::move(color);
+  c_current_ = c_current;
+  VerifySizes();
+}
+
+void ImageBundle::VerifyMetadata() const {
+  JXL_CHECK(!c_current_.ICC().empty());
+  JXL_CHECK(metadata_->color_encoding.IsGray() == IsGray());
+
+  if (metadata_->HasAlpha() && alpha().xsize() == 0) {
+    JXL_ABORT("MD alpha_bits %u IB alpha %" PRIuS " x %" PRIuS "\n",
+              metadata_->GetAlphaBits(), alpha().xsize(), alpha().ysize());
+  }
+  const uint32_t alpha_bits = metadata_->GetAlphaBits();
+  JXL_CHECK(alpha_bits <= 32);
+
+  // metadata_->num_extra_channels may temporarily differ from
+  // extra_channels_.size(), e.g. after SetAlpha. They are synced by the next
+  // call to VisitFields.
+}
+
+void ImageBundle::VerifySizes() const {
+  const size_t xs = xsize();
+  const size_t ys = ysize();
+
+  if (HasExtraChannels()) {
+    JXL_CHECK(xs != 0 && ys != 0);
+    for (const ImageF& ec : extra_channels_) {
+      JXL_CHECK(ec.xsize() == xs);
+      JXL_CHECK(ec.ysize() == ys);
+    }
+  }
+}
+
+size_t ImageBundle::DetectRealBitdepth() const {
+  return metadata_->bit_depth.bits_per_sample;
+
+  // TODO(lode): let this function return lower bit depth if possible, e.g.
+  // return 8 bits in case the original image came from a 16-bit PNG that
+  // was in fact representable as 8-bit PNG. Ensure that the implementation
+  // returns 16 if e.g. two consecutive 16-bit values appeared in the original
+  // image (such as 32768 and 32769), take into account that e.g. the values
+  // 3-bit can represent is not a superset of the values 2-bit can represent,
+  // and there may be slight imprecisions in the floating point image.
+}
+
+const ImageF& ImageBundle::black() const {
+  JXL_ASSERT(HasBlack());
+  const size_t ec = metadata_->Find(ExtraChannel::kBlack) -
+                    metadata_->extra_channel_info.data();
+  JXL_ASSERT(ec < extra_channels_.size());
+  return extra_channels_[ec];
+}
+const ImageF& ImageBundle::alpha() const {
+  JXL_ASSERT(HasAlpha());
+  const size_t ec = metadata_->Find(ExtraChannel::kAlpha) -
+                    metadata_->extra_channel_info.data();
+  JXL_ASSERT(ec < extra_channels_.size());
+  return extra_channels_[ec];
+}
+ImageF* ImageBundle::alpha() {
+  JXL_ASSERT(HasAlpha());
+  const size_t ec = metadata_->Find(ExtraChannel::kAlpha) -
+                    metadata_->extra_channel_info.data();
+  JXL_ASSERT(ec < extra_channels_.size());
+  return &extra_channels_[ec];
+}
+
+void ImageBundle::SetAlpha(ImageF&& alpha) {
+  const ExtraChannelInfo* eci = metadata_->Find(ExtraChannel::kAlpha);
+  // Must call SetAlphaBits first, otherwise we don't know which channel index
+  JXL_CHECK(eci != nullptr);
+  JXL_CHECK(alpha.xsize() != 0 && alpha.ysize() != 0);
+  if (extra_channels_.size() < metadata_->extra_channel_info.size()) {
+    // TODO(jon): get rid of this case
+    extra_channels_.insert(
+        extra_channels_.begin() + (eci - metadata_->extra_channel_info.data()),
+        std::move(alpha));
+  } else {
+    extra_channels_[eci - metadata_->extra_channel_info.data()] =
+        std::move(alpha);
+  }
+  // num_extra_channels is automatically set in visitor
+  VerifySizes();
+}
+
+void ImageBundle::SetExtraChannels(std::vector<ImageF>&& extra_channels) {
+  for (const ImageF& plane : extra_channels) {
+    JXL_CHECK(plane.xsize() != 0 && plane.ysize() != 0);
+  }
+  extra_channels_ = std::move(extra_channels);
+  VerifySizes();
+}
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/image_bundle.h b/third_party/jpeg-xl/lib/jxl/image_bundle.h
new file mode 100644
index 0000000000..c7b812b59a
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/image_bundle.h
@@ -0,0 +1,254 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_IMAGE_BUNDLE_H_
+#define LIB_JXL_IMAGE_BUNDLE_H_
+
+// The main image or frame consists of a bundle of associated images.
+
+#include <jxl/cms_interface.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include <vector>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/color_encoding_internal.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/dec_xyb.h"
+#include "lib/jxl/field_encodings.h"
+#include "lib/jxl/frame_header.h"
+#include "lib/jxl/headers.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/image_metadata.h"
+#include "lib/jxl/jpeg/jpeg_data.h"
+#include "lib/jxl/opsin_params.h"
+#include "lib/jxl/quantizer.h"
+
+namespace jxl {
+
+// A bundle of color/alpha/depth/plane images.
+class ImageBundle {
+ public:
+  // Uninitialized state for use as output parameter.
+  ImageBundle() : metadata_(nullptr) {}
+  // Caller is responsible for setting metadata before calling Set*.
+  explicit ImageBundle(const ImageMetadata* metadata) : metadata_(metadata) {}
+
+  // Move-only (allows storing in std::vector).
+  ImageBundle(ImageBundle&&) = default;
+  ImageBundle& operator=(ImageBundle&&) = default;
+
+  ImageBundle Copy() const {
+    ImageBundle copy(metadata_);
+    copy.color_ = CopyImage(color_);
+    copy.c_current_ = c_current_;
+    copy.extra_channels_.reserve(extra_channels_.size());
+    for (const ImageF& plane : extra_channels_) {
+      copy.extra_channels_.emplace_back(CopyImage(plane));
+    }
+
+    copy.jpeg_data =
+        jpeg_data ? make_unique<jpeg::JPEGData>(*jpeg_data) : nullptr;
+    copy.color_transform = color_transform;
+    copy.chroma_subsampling = chroma_subsampling;
+
+    return copy;
+  }
+
+  // -- SIZE
+
+  size_t xsize() const {
+    if (IsJPEG()) return jpeg_data->width;
+    if (color_.xsize() != 0) return color_.xsize();
+    return extra_channels_.empty() ? 0 : extra_channels_[0].xsize();
+  }
+  size_t ysize() const {
+    if (IsJPEG()) return jpeg_data->height;
+    if (color_.ysize() != 0) return color_.ysize();
+    return extra_channels_.empty() ? 0 : extra_channels_[0].ysize();
+  }
+  void ShrinkTo(size_t xsize, size_t ysize);
+
+  // sizes taking orientation into account
+  size_t oriented_xsize() const {
+    if (static_cast<uint32_t>(metadata_->GetOrientation()) > 4) {
+      return ysize();
+    } else {
+      return xsize();
+    }
+  }
+  size_t oriented_ysize() const {
+    if (static_cast<uint32_t>(metadata_->GetOrientation()) > 4) {
+      return xsize();
+    } else {
+      return ysize();
+    }
+  }
+
+  // -- COLOR
+
+  // Whether color() is valid/usable. Returns true in most cases. Even images
+  // with spot colors (one example of when !planes().empty()) typically have a
+  // part that can be converted to RGB.
+  bool HasColor() const { return color_.xsize() != 0; }
+
+  // For resetting the size when switching from a reference to main frame.
+  void RemoveColor() { color_ = Image3F(); }
+
+  // Do not use if !HasColor().
+  const Image3F& color() const {
+    // If this fails, Set* was not called - perhaps because decoding failed?
+    JXL_DASSERT(HasColor());
+    return color_;
+  }
+
+  // Do not use if !HasColor().
+  Image3F* color() {
+    JXL_DASSERT(HasColor());
+    return &color_;
+  }
+
+  // If c_current.IsGray(), all planes must be identical. NOTE: c_current is
+  // independent of metadata()->color_encoding, which is the original, whereas
+  // a decoder might return pixels in a different c_current.
+  // This only sets the color channels, you must also make extra channels
+  // match the amount that is in the metadata.
+  void SetFromImage(Image3F&& color, const ColorEncoding& c_current);
+
+  // -- COLOR ENCODING
+
+  const ColorEncoding& c_current() const { return c_current_; }
+
+  // Returns whether the color image has identical planes. Once established by
+  // Set*, remains unchanged until a subsequent Set* or TransformTo.
+  bool IsGray() const { return c_current_.IsGray(); }
+
+  bool IsSRGB() const { return c_current_.IsSRGB(); }
+  bool IsLinearSRGB() const {
+    return c_current_.white_point == WhitePoint::kD65 &&
+           c_current_.primaries == Primaries::kSRGB && c_current_.tf.IsLinear();
+  }
+
+  // Set the c_current profile without doing any transformation, e.g. if the
+  // transformation was already applied.
+  void OverrideProfile(const ColorEncoding& new_c_current) {
+    c_current_ = new_c_current;
+  }
+
+  // TODO(lode): TransformTo and CopyTo are implemented in enc_image_bundle.cc,
+  // move these functions out of this header file and class, to
+  // enc_image_bundle.h.
+
+  // Transforms color to c_desired and sets c_current to c_desired. Alpha and
+  // metadata remains unchanged.
+  Status TransformTo(const ColorEncoding& c_desired, const JxlCmsInterface& cms,
+                     ThreadPool* pool = nullptr);
+  // Copies this:rect, converts to c_desired, and allocates+fills out.
+  Status CopyTo(const Rect& rect, const ColorEncoding& c_desired,
+                const JxlCmsInterface& cms, Image3F* out,
+                ThreadPool* pool = nullptr) const;
+
+  // Detect 'real' bit depth, which can be lower than nominal bit depth
+  // (this is common in PNG), returns 'real' bit depth
+  size_t DetectRealBitdepth() const;
+
+  // -- ALPHA
+
+  void SetAlpha(ImageF&& alpha);
+  bool HasAlpha() const {
+    return metadata_->Find(ExtraChannel::kAlpha) != nullptr;
+  }
+  bool AlphaIsPremultiplied() const {
+    const ExtraChannelInfo* eci = metadata_->Find(ExtraChannel::kAlpha);
+    return (eci == nullptr) ? false : eci->alpha_associated;
+  }
+  const ImageF& alpha() const;
+  ImageF* alpha();
+
+  // -- EXTRA CHANNELS
+  bool HasBlack() const {
+    return metadata_->Find(ExtraChannel::kBlack) != nullptr;
+  }
+  const ImageF& black() const;
+
+  // Extra channels of unknown interpretation (e.g. spot colors).
+  void SetExtraChannels(std::vector<ImageF>&& extra_channels);
+  void ClearExtraChannels() { extra_channels_.clear(); }
+  bool HasExtraChannels() const { return !extra_channels_.empty(); }
+  const std::vector<ImageF>& extra_channels() const { return extra_channels_; }
+  std::vector<ImageF>& extra_channels() { return extra_channels_; }
+
+  const ImageMetadata* metadata() const { return metadata_; }
+
+  void VerifyMetadata() const;
+
+  void SetDecodedBytes(size_t decoded_bytes) { decoded_bytes_ = decoded_bytes; }
+  size_t decoded_bytes() const { return decoded_bytes_; }
+
+  // -- JPEG transcoding:
+
+  // Returns true if image does or will represent quantized DCT-8 coefficients,
+  // stored in 8x8 pixel regions.
+  bool IsJPEG() const {
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
+    return jpeg_data != nullptr;
+#else   // JPEGXL_ENABLE_TRANSCODE_JPEG
+    return false;
+#endif  // JPEGXL_ENABLE_TRANSCODE_JPEG
+  }
+
+  std::unique_ptr<jpeg::JPEGData> jpeg_data;
+  // these fields are used to signal the input JPEG color space
+  // NOTE: JPEG doesn't actually provide a way to determine whether YCbCr was
+  // applied or not.
+  ColorTransform color_transform = ColorTransform::kNone;
+  YCbCrChromaSubsampling chroma_subsampling;
+
+  FrameOrigin origin{0, 0};
+
+  // Animation-related information, corresponding to the timecode and duration
+  // fields of the jxl::AnimationFrame of the jxl::FrameHeader.
+  // TODO(lode): ImageBundle is used here to carry the information from
+  // jxl::FrameHeader, consider instead passing a jxl::FrameHeader directly to
+  // EncodeFrame or having a field of that type here.
+  uint32_t duration = 0;
+  uint32_t timecode = 0;
+
+  // TODO(lode): these fields do not match the JXL frame header, it should be
+  // possible to specify up to 4 (3 if nonzero duration) slots to save this
+  // frame as reference (see save_as_reference).
+  bool use_for_next_frame = false;
+  bool blend = false;
+  BlendMode blendmode = BlendMode::kBlend;
+
+  std::string name;
+
+ private:
+  // Called after any Set* to ensure their sizes are compatible.
+  void VerifySizes() const;
+
+  // Required for TransformTo so that an ImageBundle is self-sufficient. Always
+  // points to the same thing, but cannot be const-pointer because that prevents
+  // the compiler from generating a move ctor.
+  const ImageMetadata* metadata_;
+
+  // Initialized by Set*:
+  Image3F color_;  // If empty, planes_ is not; all planes equal if IsGray().
+  ColorEncoding c_current_;  // of color_
+
+  // Initialized by SetPlanes; size = ImageMetadata.num_extra_channels
+  std::vector<ImageF> extra_channels_;
+
+  // How many bytes of the input were actually read.
+  size_t decoded_bytes_ = 0;
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_IMAGE_BUNDLE_H_
diff --git a/third_party/jpeg-xl/lib/jxl/image_bundle_test.cc b/third_party/jpeg-xl/lib/jxl/image_bundle_test.cc
new file mode 100644
index 0000000000..1a10598fe2
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/image_bundle_test.cc
@@ -0,0 +1,37 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/image_bundle.h"
+
+#include "lib/jxl/enc_aux_out.h"
+#include "lib/jxl/enc_bit_writer.h"
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+namespace {
+
+TEST(ImageBundleTest, ExtraChannelName) {
+  AuxOut aux_out;
+  BitWriter writer;
+  BitWriter::Allotment allotment(&writer, 99);
+
+  ImageMetadata metadata;
+  ExtraChannelInfo eci;
+  eci.type = ExtraChannel::kBlack;
+  eci.name = "testK";
+  metadata.extra_channel_info.push_back(std::move(eci));
+  ASSERT_TRUE(WriteImageMetadata(metadata, &writer, /*layer=*/0, &aux_out));
+  writer.ZeroPadToByte();
+  allotment.ReclaimAndCharge(&writer, /*layer=*/0, &aux_out);
+
+  BitReader reader(writer.GetSpan());
+  ImageMetadata metadata_out;
+  ASSERT_TRUE(ReadImageMetadata(&reader, &metadata_out));
+  EXPECT_TRUE(reader.Close());
+  EXPECT_EQ("testK", metadata_out.Find(ExtraChannel::kBlack)->name);
+}
+
+}  // namespace
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/image_metadata.cc b/third_party/jpeg-xl/lib/jxl/image_metadata.cc
new file mode 100644
index 0000000000..20b0d6f95a
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/image_metadata.cc
@@ -0,0 +1,472 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/image_metadata.h"
+
+#include <limits>
+#include <utility>
+
+#include "lib/jxl/alpha.h"
+#include "lib/jxl/base/byte_order.h"
+#include "lib/jxl/base/padded_bytes.h"
+#include "lib/jxl/base/profiler.h"
+#include "lib/jxl/color_management.h"
+#include "lib/jxl/fields.h"
+#include "lib/jxl/frame_header.h"
+#include "lib/jxl/quantizer.h"
+
+namespace jxl {
+BitDepth::BitDepth() { Bundle::Init(this); }
+Status BitDepth::VisitFields(Visitor* JXL_RESTRICT visitor) {
+  JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &floating_point_sample));
+  // The same fields (bits_per_sample and exponent_bits_per_sample) are read
+  // in a different way depending on floating_point_sample's value. It's still
+  // default-initialized correctly so using visitor->Conditional is not
+  // required.
+  if (!floating_point_sample) {
+    JXL_QUIET_RETURN_IF_ERROR(visitor->U32(
+        Val(8), Val(10), Val(12), BitsOffset(6, 1), 8, &bits_per_sample));
+    exponent_bits_per_sample = 0;
+  } else {
+    JXL_QUIET_RETURN_IF_ERROR(visitor->U32(
+        Val(32), Val(16), Val(24), BitsOffset(6, 1), 32, &bits_per_sample));
+    // The encoded value is exponent_bits_per_sample - 1, encoded in 3 bits
+    // so the value can be in range [1, 8].
+    const uint32_t offset = 1;
+    exponent_bits_per_sample -= offset;
+    JXL_QUIET_RETURN_IF_ERROR(
+        visitor->Bits(4, 8 - offset, &exponent_bits_per_sample));
+    exponent_bits_per_sample += offset;
+  }
+
+  // Error-checking for floating point ranges.
+  if (floating_point_sample) {
+    if (exponent_bits_per_sample < 2 || exponent_bits_per_sample > 8) {
+      return JXL_FAILURE("Invalid exponent_bits_per_sample: %u",
+                         exponent_bits_per_sample);
+    }
+    int mantissa_bits =
+        static_cast<int>(bits_per_sample) - exponent_bits_per_sample - 1;
+    if (mantissa_bits < 2 || mantissa_bits > 23) {
+      return JXL_FAILURE("Invalid bits_per_sample: %u", bits_per_sample);
+    }
+  } else {
+    if (bits_per_sample > 31) {
+      return JXL_FAILURE("Invalid bits_per_sample: %u", bits_per_sample);
+    }
+  }
+  return true;
+}
+
+std::string BitDepth::DebugString() const {
+  std::ostringstream os;
+  os << (floating_point_sample ? "F" : "U");
+  os << bits_per_sample;
+  if (floating_point_sample) os << "." << exponent_bits_per_sample;
+  return os.str();
+}
+
+CustomTransformData::CustomTransformData() { Bundle::Init(this); }
+Status CustomTransformData::VisitFields(Visitor* JXL_RESTRICT visitor) {
+  if (visitor->AllDefault(*this, &all_default)) {
+    // Overwrite all serialized fields, but not any nonserialized_*.
+    visitor->SetDefault(this);
+    return true;
+  }
+  if (visitor->Conditional(nonserialized_xyb_encoded)) {
+    JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&opsin_inverse_matrix));
+  }
+  JXL_QUIET_RETURN_IF_ERROR(visitor->Bits(3, 0, &custom_weights_mask));
+  if (visitor->Conditional((custom_weights_mask & 0x1) != 0)) {
+    // 4 5x5 kernels, but all of them can be obtained by symmetry from one,
+    // which is symmetric along its main diagonal. The top-left kernel is
+    // defined by
+    //
+    // 0  1  2  3  4
+    // 1  5  6  7  8
+    // 2  6  9 10 11
+    // 3  7 10 12 13
+    // 4  8 11 13 14
+    float constexpr kWeights2[15] = {
+        -0.01716200f, -0.03452303f, -0.04022174f, -0.02921014f, -0.00624645f,
+        0.14111091f,  0.28896755f,  0.00278718f,  -0.01610267f, 0.56661550f,
+        0.03777607f,  -0.01986694f, -0.03144731f, -0.01185068f, -0.00213539f};
+    for (size_t i = 0; i < 15; i++) {
+      JXL_QUIET_RETURN_IF_ERROR(
+          visitor->F16(kWeights2[i], &upsampling2_weights[i]));
+    }
+  }
+  if (visitor->Conditional((custom_weights_mask & 0x2) != 0)) {
+    // 16 5x5 kernels, but all of them can be obtained by symmetry from
+    // three, two of which are symmetric along their main diagonals. The top
+    // left 4 kernels are defined by
+    //
+    // 0  1  2  3  4   5  6  7  8  9
+    // 1 10 11 12 13  14 15 16 17 18
+    // 2 11 19 20 21  22 23 24 25 26
+    // 3 12 20 27 28  29 30 31 32 33
+    // 4 13 21 28 34  35 36 37 38 39
+    //
+    // 5 14 22 29 35  40 41 42 43 44
+    // 6 15 23 30 36  41 45 46 47 48
+    // 7 16 24 31 37  42 46 49 50 51
+    // 8 17 25 32 38  43 47 50 52 53
+    // 9 18 26 33 39  44 48 51 53 54
+    constexpr float kWeights4[55] = {
+        -0.02419067f, -0.03491987f, -0.03693351f, -0.03094285f, -0.00529785f,
+        -0.01663432f, -0.03556863f, -0.03888905f, -0.03516850f, -0.00989469f,
+        0.23651958f,  0.33392945f,  -0.01073543f, -0.01313181f, -0.03556694f,
+        0.13048175f,  0.40103025f,  0.03951150f,  -0.02077584f, 0.46914198f,
+        -0.00209270f, -0.01484589f, -0.04064806f, 0.18942530f,  0.56279892f,
+        0.06674400f,  -0.02335494f, -0.03551682f, -0.00754830f, -0.02267919f,
+        -0.02363578f, 0.00315804f,  -0.03399098f, -0.01359519f, -0.00091653f,
+        -0.00335467f, -0.01163294f, -0.01610294f, -0.00974088f, -0.00191622f,
+        -0.01095446f, -0.03198464f, -0.04455121f, -0.02799790f, -0.00645912f,
+        0.06390599f,  0.22963888f,  0.00630981f,  -0.01897349f, 0.67537268f,
+        0.08483369f,  -0.02534994f, -0.02205197f, -0.01667999f, -0.00384443f};
+    for (size_t i = 0; i < 55; i++) {
+      JXL_QUIET_RETURN_IF_ERROR(
+          visitor->F16(kWeights4[i], &upsampling4_weights[i]));
+    }
+  }
+  if (visitor->Conditional((custom_weights_mask & 0x4) != 0)) {
+    // 64 5x5 kernels, all of them can be obtained by symmetry from
+    // 10, 4 of which are symmetric along their main diagonals. The top
+    // left 16 kernels are defined by
+    //  0  1  2  3  4   5  6  7  8  9   a  b  c  d  e   f 10 11 12 13
+    //  1 14 15 16 17  18 19 1a 1b 1c  1d 1e 1f 20 21  22 23 24 25 26
+    //  2 15 27 28 29  2a 2b 2c 2d 2e  2f 30 31 32 33  34 35 36 37 38
+    //  3 16 28 39 3a  3b 3c 3d 3e 3f  40 41 42 43 44  45 46 47 48 49
+    //  4 17 29 3a 4a  4b 4c 4d 4e 4f  50 51 52 53 54  55 56 57 58 59
+
+    //  5 18 2a 3b 4b  5a 5b 5c 5d 5e  5f 60 61 62 63  64 65 66 67 68
+    //  6 19 2b 3c 4c  5b 69 6a 6b 6c  6d 6e 6f 70 71  72 73 74 75 76
+    //  7 1a 2c 3d 4d  5c 6a 77 78 79  7a 7b 7c 7d 7e  7f 80 81 82 83
+    //  8 1b 2d 3e 4e  5d 6b 78 84 85  86 87 88 89 8a  8b 8c 8d 8e 8f
+    //  9 1c 2e 3f 4f  5e 6c 79 85 90  91 92 93 94 95  96 97 98 99 9a
+
+    //  a 1d 2f 40 50  5f 6d 7a 86 91  9b 9c 9d 9e 9f  a0 a1 a2 a3 a4
+    //  b 1e 30 41 51  60 6e 7b 87 92  9c a5 a6 a7 a8  a9 aa ab ac ad
+    //  c 1f 31 42 52  61 6f 7c 88 93  9d a6 ae af b0  b1 b2 b3 b4 b5
+    //  d 20 32 43 53  62 70 7d 89 94  9e a7 af b6 b7  b8 b9 ba bb bc
+    //  e 21 33 44 54  63 71 7e 8a 95  9f a8 b0 b7 bd  be bf c0 c1 c2
+
+    //  f 22 34 45 55  64 72 7f 8b 96  a0 a9 b1 b8 be  c3 c4 c5 c6 c7
+    // 10 23 35 46 56  65 73 80 8c 97  a1 aa b2 b9 bf  c4 c8 c9 ca cb
+    // 11 24 36 47 57  66 74 81 8d 98  a2 ab b3 ba c0  c5 c9 cc cd ce
+    // 12 25 37 48 58  67 75 82 8e 99  a3 ac b4 bb c1  c6 ca cd cf d0
+    // 13 26 38 49 59  68 76 83 8f 9a  a4 ad b5 bc c2  c7 cb ce d0 d1
+    constexpr float kWeights8[210] = {
+        -0.02928613f, -0.03706353f, -0.03783812f, -0.03324558f, -0.00447632f,
+        -0.02519406f, -0.03752601f, -0.03901508f, -0.03663285f, -0.00646649f,
+        -0.02066407f, -0.03838633f, -0.04002101f, -0.03900035f, -0.00901973f,
+        -0.01626393f, -0.03954148f, -0.04046620f, -0.03979621f, -0.01224485f,
+        0.29895328f,  0.35757708f,  -0.02447552f, -0.01081748f, -0.04314594f,
+        0.23903219f,  0.41119301f,  -0.00573046f, -0.01450239f, -0.04246845f,
+        0.17567618f,  0.45220643f,  0.02287757f,  -0.01936783f, -0.03583255f,
+        0.11572472f,  0.47416733f,  0.06284440f,  -0.02685066f, 0.42720050f,
+        -0.02248939f, -0.01155273f, -0.04562755f, 0.28689496f,  0.49093869f,
+        -0.00007891f, -0.01545926f, -0.04562659f, 0.21238920f,  0.53980934f,
+        0.03369474f,  -0.02070211f, -0.03866988f, 0.14229550f,  0.56593398f,
+        0.08045181f,  -0.02888298f, -0.03680918f, -0.00542229f, -0.02920477f,
+        -0.02788574f, -0.02118180f, -0.03942402f, -0.00775547f, -0.02433614f,
+        -0.03193943f, -0.02030828f, -0.04044014f, -0.01074016f, -0.01930822f,
+        -0.03620399f, -0.01974125f, -0.03919545f, -0.01456093f, -0.00045072f,
+        -0.00360110f, -0.01020207f, -0.01231907f, -0.00638988f, -0.00071592f,
+        -0.00279122f, -0.00957115f, -0.01288327f, -0.00730937f, -0.00107783f,
+        -0.00210156f, -0.00890705f, -0.01317668f, -0.00813895f, -0.00153491f,
+        -0.02128481f, -0.04173044f, -0.04831487f, -0.03293190f, -0.00525260f,
+        -0.01720322f, -0.04052736f, -0.05045706f, -0.03607317f, -0.00738030f,
+        -0.01341764f, -0.03965629f, -0.05151616f, -0.03814886f, -0.01005819f,
+        0.18968273f,  0.33063684f,  -0.01300105f, -0.01372950f, -0.04017465f,
+        0.13727832f,  0.36402234f,  0.01027890f,  -0.01832107f, -0.03365072f,
+        0.08734506f,  0.38194295f,  0.04338228f,  -0.02525993f, 0.56408126f,
+        0.00458352f,  -0.01648227f, -0.04887868f, 0.24585519f,  0.62026135f,
+        0.04314807f,  -0.02213737f, -0.04158014f, 0.16637289f,  0.65027023f,
+        0.09621636f,  -0.03101388f, -0.04082742f, -0.00904519f, -0.02790922f,
+        -0.02117818f, 0.00798662f,  -0.03995711f, -0.01243427f, -0.02231705f,
+        -0.02946266f, 0.00992055f,  -0.03600283f, -0.01684920f, -0.00111684f,
+        -0.00411204f, -0.01297130f, -0.01723725f, -0.01022545f, -0.00165306f,
+        -0.00313110f, -0.01218016f, -0.01763266f, -0.01125620f, -0.00231663f,
+        -0.01374149f, -0.03797620f, -0.05142937f, -0.03117307f, -0.00581914f,
+        -0.01064003f, -0.03608089f, -0.05272168f, -0.03375670f, -0.00795586f,
+        0.09628104f,  0.27129991f,  -0.00353779f, -0.01734151f, -0.03153981f,
+        0.05686230f,  0.28500998f,  0.02230594f,  -0.02374955f, 0.68214326f,
+        0.05018048f,  -0.02320852f, -0.04383616f, 0.18459474f,  0.71517975f,
+        0.10805613f,  -0.03263677f, -0.03637639f, -0.01394373f, -0.02511203f,
+        -0.01728636f, 0.05407331f,  -0.02867568f, -0.01893131f, -0.00240854f,
+        -0.00446511f, -0.01636187f, -0.02377053f, -0.01522848f, -0.00333334f,
+        -0.00819975f, -0.02964169f, -0.04499287f, -0.02745350f, -0.00612408f,
+        0.02727416f,  0.19446600f,  0.00159832f,  -0.02232473f, 0.74982506f,
+        0.11452620f,  -0.03348048f, -0.01605681f, -0.02070339f, -0.00458223f};
+    for (size_t i = 0; i < 210; i++) {
+      JXL_QUIET_RETURN_IF_ERROR(
+          visitor->F16(kWeights8[i], &upsampling8_weights[i]));
+    }
+  }
+  return true;
+}
+
+ExtraChannelInfo::ExtraChannelInfo() { Bundle::Init(this); }
+Status ExtraChannelInfo::VisitFields(Visitor* JXL_RESTRICT visitor) {
+  if (visitor->AllDefault(*this, &all_default)) {
+    // Overwrite all serialized fields, but not any nonserialized_*.
+    visitor->SetDefault(this);
+    return true;
+  }
+
+  // General
+  JXL_QUIET_RETURN_IF_ERROR(visitor->Enum(ExtraChannel::kAlpha, &type));
+
+  JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&bit_depth));
+
+  JXL_QUIET_RETURN_IF_ERROR(
+      visitor->U32(Val(0), Val(3), Val(4), BitsOffset(3, 1), 0, &dim_shift));
+  if ((1U << dim_shift) > 8) {
+    return JXL_FAILURE("dim_shift %u too large", dim_shift);
+  }
+
+  JXL_QUIET_RETURN_IF_ERROR(VisitNameString(visitor, &name));
+
+  // Conditional
+  if (visitor->Conditional(type == ExtraChannel::kAlpha)) {
+    JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &alpha_associated));
+  }
+  if (visitor->Conditional(type == ExtraChannel::kSpotColor)) {
+    for (float& c : spot_color) {
+      JXL_QUIET_RETURN_IF_ERROR(visitor->F16(0, &c));
+    }
+  }
+  if (visitor->Conditional(type == ExtraChannel::kCFA)) {
+    JXL_QUIET_RETURN_IF_ERROR(visitor->U32(Val(1), Bits(2), BitsOffset(4, 3),
+                                           BitsOffset(8, 19), 1, &cfa_channel));
+  }
+
+  if (type == ExtraChannel::kUnknown ||
+      (int(ExtraChannel::kReserved0) <= int(type) &&
+       int(type) <= int(ExtraChannel::kReserved7))) {
+    return JXL_FAILURE("Unknown extra channel (bits %u, shift %u, name '%s')\n",
+                       bit_depth.bits_per_sample, dim_shift, name.c_str());
+  }
+  return true;
+}
+
+std::string ExtraChannelInfo::DebugString() const {
+  std::ostringstream os;
+  os << (type == ExtraChannel::kAlpha           ? "Alpha"
+         : type == ExtraChannel::kDepth         ? "Depth"
+         : type == ExtraChannel::kSpotColor     ? "Spot"
+         : type == ExtraChannel::kSelectionMask ? "Mask"
+         : type == ExtraChannel::kBlack         ? "Black"
+         : type == ExtraChannel::kCFA           ? "CFA"
+         : type == ExtraChannel::kThermal       ? "Thermal"
+                                                : "Unknown");
+  if (type == ExtraChannel::kAlpha && alpha_associated) os << "(premul)";
+  os << " " << bit_depth.DebugString();
+  os << " shift: " << dim_shift;
+  return os.str();
+}
+
+ImageMetadata::ImageMetadata() { Bundle::Init(this); }
+Status ImageMetadata::VisitFields(Visitor* JXL_RESTRICT visitor) {
+  if (visitor->AllDefault(*this, &all_default)) {
+    // Overwrite all serialized fields, but not any nonserialized_*.
+    visitor->SetDefault(this);
+    return true;
+  }
+
+  // Bundle::AllDefault does not allow usage when reading (it may abort the
+  // program when a codestream has invalid values), but when reading we
+  // overwrite the extra_fields value, so do not need to call AllDefault.
+  bool tone_mapping_default =
+      visitor->IsReading() ? false : Bundle::AllDefault(tone_mapping);
+
+  bool extra_fields = (orientation != 1 || have_preview || have_animation ||
+                       have_intrinsic_size || !tone_mapping_default);
+  JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &extra_fields));
+  if (visitor->Conditional(extra_fields)) {
+    orientation--;
+    JXL_QUIET_RETURN_IF_ERROR(visitor->Bits(3, 0, &orientation));
+    orientation++;
+    // (No need for bounds checking because we read exactly 3 bits)
+
+    JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &have_intrinsic_size));
+    if (visitor->Conditional(have_intrinsic_size)) {
+      JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&intrinsic_size));
+    }
+    JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &have_preview));
+    if (visitor->Conditional(have_preview)) {
+      JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&preview_size));
+    }
+    JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &have_animation));
+    if (visitor->Conditional(have_animation)) {
+      JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&animation));
+    }
+  } else {
+    orientation = 1;  // identity
+    have_intrinsic_size = false;
+    have_preview = false;
+    have_animation = false;
+  }
+
+  JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&bit_depth));
+  JXL_QUIET_RETURN_IF_ERROR(
+      visitor->Bool(true, &modular_16_bit_buffer_sufficient));
+
+  num_extra_channels = extra_channel_info.size();
+  JXL_QUIET_RETURN_IF_ERROR(visitor->U32(Val(0), Val(1), BitsOffset(4, 2),
+                                         BitsOffset(12, 1), 0,
+                                         &num_extra_channels));
+
+  if (visitor->Conditional(num_extra_channels != 0)) {
+    if (visitor->IsReading()) {
+      extra_channel_info.resize(num_extra_channels);
+    }
+    for (ExtraChannelInfo& eci : extra_channel_info) {
+      JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&eci));
+    }
+  }
+
+  JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(true, &xyb_encoded));
+  JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&color_encoding));
+  if (visitor->Conditional(extra_fields)) {
+    JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&tone_mapping));
+  }
+
+  // Treat as if only the fields up to extra channels exist.
+  if (visitor->IsReading() && nonserialized_only_parse_basic_info) {
+    return true;
+  }
+
+  JXL_QUIET_RETURN_IF_ERROR(visitor->BeginExtensions(&extensions));
+  // Extensions: in chronological order of being added to the format.
+  return visitor->EndExtensions();
+}
+
+OpsinInverseMatrix::OpsinInverseMatrix() { Bundle::Init(this); }
+Status OpsinInverseMatrix::VisitFields(Visitor* JXL_RESTRICT visitor) {
+  if (visitor->AllDefault(*this, &all_default)) {
+    // Overwrite all serialized fields, but not any nonserialized_*.
+    visitor->SetDefault(this);
+    return true;
+  }
+  for (int i = 0; i < 9; ++i) {
+    JXL_QUIET_RETURN_IF_ERROR(visitor->F16(
+        DefaultInverseOpsinAbsorbanceMatrix()[i], &inverse_matrix[i]));
+  }
+  for (int i = 0; i < 3; ++i) {
+    JXL_QUIET_RETURN_IF_ERROR(
+        visitor->F16(kNegOpsinAbsorbanceBiasRGB[i], &opsin_biases[i]));
+  }
+  for (int i = 0; i < 4; ++i) {
+    JXL_QUIET_RETURN_IF_ERROR(
+        visitor->F16(kDefaultQuantBias[i], &quant_biases[i]));
+  }
+  return true;
+}
+
+ToneMapping::ToneMapping() { Bundle::Init(this); }
+Status ToneMapping::VisitFields(Visitor* JXL_RESTRICT visitor) {
+  if (visitor->AllDefault(*this, &all_default)) {
+    // Overwrite all serialized fields, but not any nonserialized_*.
+    visitor->SetDefault(this);
+    return true;
+  }
+
+  JXL_QUIET_RETURN_IF_ERROR(
+      visitor->F16(kDefaultIntensityTarget, &intensity_target));
+  if (intensity_target <= 0.f) {
+    return JXL_FAILURE("invalid intensity target");
+  }
+
+  JXL_QUIET_RETURN_IF_ERROR(visitor->F16(0.0f, &min_nits));
+  if (min_nits < 0.f || min_nits > intensity_target) {
+    return JXL_FAILURE("invalid min %f vs max %f", min_nits, intensity_target);
+  }
+
+  JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &relative_to_max_display));
+
+  JXL_QUIET_RETURN_IF_ERROR(visitor->F16(0.0f, &linear_below));
+  if (linear_below < 0 || (relative_to_max_display && linear_below > 1.0f)) {
+    return JXL_FAILURE("invalid linear_below %f (%s)", linear_below,
+                       relative_to_max_display ? "relative" : "absolute");
+  }
+
+  return true;
+}
+
+Status ReadImageMetadata(BitReader* JXL_RESTRICT reader,
+                         ImageMetadata* JXL_RESTRICT metadata) {
+  return Bundle::Read(reader, metadata);
+}
+
+void ImageMetadata::SetAlphaBits(uint32_t bits, bool alpha_is_premultiplied) {
+  std::vector<ExtraChannelInfo>& eciv = extra_channel_info;
+  ExtraChannelInfo* alpha = Find(ExtraChannel::kAlpha);
+  if (bits == 0) {
+    if (alpha != nullptr) {
+      // Remove the alpha channel from the extra channel info. It's
+      // theoretically possible that there are multiple, remove all in that
+      // case. This ensure a next HasAlpha() will return false.
+      const auto is_alpha = [](const ExtraChannelInfo& eci) {
+        return eci.type == ExtraChannel::kAlpha;
+      };
+      eciv.erase(std::remove_if(eciv.begin(), eciv.end(), is_alpha),
+                 eciv.end());
+    }
+  } else {
+    if (alpha == nullptr) {
+      ExtraChannelInfo info;
+      info.type = ExtraChannel::kAlpha;
+      info.bit_depth.bits_per_sample = bits;
+      info.dim_shift = 0;
+      info.alpha_associated = alpha_is_premultiplied;
+      // Prepend rather than append: in case there already are other extra
+      // channels, prefer alpha channel to be listed first.
+      eciv.insert(eciv.begin(), info);
+    } else {
+      // Ignores potential extra alpha channels, only sets to first one.
+      alpha->bit_depth.bits_per_sample = bits;
+      alpha->bit_depth.floating_point_sample = false;
+      alpha->bit_depth.exponent_bits_per_sample = 0;
+      alpha->alpha_associated = alpha_is_premultiplied;
+    }
+  }
+  num_extra_channels = extra_channel_info.size();
+  if (bits > 12) modular_16_bit_buffer_sufficient = false;
+}
+
+std::string ImageMetadata::DebugString() const {
+  std::ostringstream os;
+  os << bit_depth.DebugString();
+  if (modular_16_bit_buffer_sufficient) {
+    os << " (modular 16)";
+  }
+  os << (xyb_encoded ? " xyb encoded" : " orig profile");
+  os << " " << Description(color_encoding);
+  if (num_extra_channels > 0) {
+    os << " extra channels:";
+    for (size_t i = 0; i < num_extra_channels; ++i) {
+      os << " (" << extra_channel_info[i].DebugString() << ")";
+      if (i + 1 < num_extra_channels) os << ",";
+    }
+  }
+  if (have_preview) {
+    os << " preview: " << preview_size.xsize() << "x" << preview_size.ysize();
+  }
+  if (orientation != 1) {
+    os << " orientation: " << orientation;
+  }
+  return os.str();
+}
+
+std::string CodecMetadata::DebugString() const {
+  std::ostringstream os;
+  os << size.xsize() << "x" << size.ysize();
+  os << " " << m.DebugString();
+  return os.str();
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/image_metadata.h b/third_party/jpeg-xl/lib/jxl/image_metadata.h
new file mode 100644
index 0000000000..ca69eb3a3d
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/image_metadata.h
@@ -0,0 +1,425 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Main codestream header bundles, the metadata that applies to all frames.
+// Enums must align with the C API definitions in codestream_header.h.
+
+#ifndef LIB_JXL_IMAGE_METADATA_H_
+#define LIB_JXL_IMAGE_METADATA_H_
+
+#include <jxl/codestream_header.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include <string>
+#include <vector>
+
+#include "lib/jxl/color_encoding_internal.h"
+#include "lib/jxl/fields.h"
+#include "lib/jxl/headers.h"
+#include "lib/jxl/jpeg/jpeg_data.h"
+#include "lib/jxl/opsin_params.h"
+
+namespace jxl {
+
+struct AuxOut;
+
+// EXIF orientation of the image. This field overrides any field present in
+// actual EXIF metadata. The value tells which transformation the decoder must
+// apply after decoding to display the image with the correct orientation.
+enum class Orientation : uint32_t {
+  // Values 1..8 match the EXIF definitions.
+  kIdentity = JXL_ORIENT_IDENTITY,
+  kFlipHorizontal = JXL_ORIENT_FLIP_HORIZONTAL,
+  kRotate180 = JXL_ORIENT_ROTATE_180,
+  kFlipVertical = JXL_ORIENT_FLIP_VERTICAL,
+  kTranspose = JXL_ORIENT_TRANSPOSE,
+  kRotate90 = JXL_ORIENT_ROTATE_90_CW,
+  kAntiTranspose = JXL_ORIENT_ANTI_TRANSPOSE,
+  kRotate270 = JXL_ORIENT_ROTATE_90_CCW,
+};
+// Don't need an EnumBits because Orientation is not read via Enum().
+
+enum class ExtraChannel : uint32_t {
+  // First two enumerators (most common) are cheaper to encode
+  kAlpha = JXL_CHANNEL_ALPHA,
+  kDepth = JXL_CHANNEL_DEPTH,
+
+  kSpotColor = JXL_CHANNEL_SPOT_COLOR,
+  kSelectionMask = JXL_CHANNEL_SELECTION_MASK,
+  kBlack = JXL_CHANNEL_BLACK,  // for CMYK
+  kCFA = JXL_CHANNEL_CFA,      // Bayer channel
+  kThermal = JXL_CHANNEL_THERMAL,
+  kReserved0 = JXL_CHANNEL_RESERVED0,
+  kReserved1 = JXL_CHANNEL_RESERVED1,
+  kReserved2 = JXL_CHANNEL_RESERVED2,
+  kReserved3 = JXL_CHANNEL_RESERVED3,
+  kReserved4 = JXL_CHANNEL_RESERVED4,
+  kReserved5 = JXL_CHANNEL_RESERVED5,
+  kReserved6 = JXL_CHANNEL_RESERVED6,
+  kReserved7 = JXL_CHANNEL_RESERVED7,
+  // disambiguated via name string, raise warning if unsupported
+  kUnknown = JXL_CHANNEL_UNKNOWN,
+  // like kUnknown but can silently be ignored
+  kOptional = JXL_CHANNEL_OPTIONAL
+};
+static inline const char* EnumName(ExtraChannel /*unused*/) {
+  return "ExtraChannel";
+}
+static inline constexpr uint64_t EnumBits(ExtraChannel /*unused*/) {
+  using EC = ExtraChannel;
+  return MakeBit(EC::kAlpha) | MakeBit(EC::kDepth) | MakeBit(EC::kSpotColor) |
+         MakeBit(EC::kSelectionMask) | MakeBit(EC::kBlack) | MakeBit(EC::kCFA) |
+         MakeBit(EC::kThermal) | MakeBit(EC::kUnknown) | MakeBit(EC::kOptional);
+}
+
+// Used in ImageMetadata and ExtraChannelInfo.
+struct BitDepth : public Fields {
+  BitDepth();
+  JXL_FIELDS_NAME(BitDepth)
+
+  Status VisitFields(Visitor* JXL_RESTRICT visitor) override;
+
+  std::string DebugString() const;
+
+  // Whether the original (uncompressed) samples are floating point or
+  // unsigned integer.
+  bool floating_point_sample;
+
+  // Bit depth of the original (uncompressed) image samples. Must be in the
+  // range [1, 32].
+  uint32_t bits_per_sample;
+
+  // Floating point exponent bits of the original (uncompressed) image samples,
+  // only used if floating_point_sample is true.
+  // If used, the samples are floating point with:
+  // - 1 sign bit
+  // - exponent_bits_per_sample exponent bits
+  // - (bits_per_sample - exponent_bits_per_sample - 1) mantissa bits
+  // If used, exponent_bits_per_sample must be in the range
+  // [2, 8] and amount of mantissa bits must be in the range [2, 23].
+  // NOTE: exponent_bits_per_sample is 8 for single precision binary32
+  // point, 5 for half precision binary16, 7 for fp24.
+  uint32_t exponent_bits_per_sample;
+};
+
+// Describes one extra channel.
+struct ExtraChannelInfo : public Fields {
+  ExtraChannelInfo();
+  JXL_FIELDS_NAME(ExtraChannelInfo)
+
+  Status VisitFields(Visitor* JXL_RESTRICT visitor) override;
+
+  std::string DebugString() const;
+
+  mutable bool all_default;
+
+  ExtraChannel type;
+  BitDepth bit_depth;
+  uint32_t dim_shift;  // downsampled by 2^dim_shift on each axis
+
+  std::string name;  // UTF-8
+
+  // Conditional:
+  bool alpha_associated;  // i.e. premultiplied
+  float spot_color[4];    // spot color in linear RGBA
+  uint32_t cfa_channel;
+};
+
+struct OpsinInverseMatrix : public Fields {
+  OpsinInverseMatrix();
+  JXL_FIELDS_NAME(OpsinInverseMatrix)
+
+  Status VisitFields(Visitor* JXL_RESTRICT visitor) override;
+
+  mutable bool all_default;
+
+  float inverse_matrix[9];
+  float opsin_biases[3];
+  float quant_biases[4];
+};
+
+// Information useful for mapping HDR images to lower dynamic range displays.
+struct ToneMapping : public Fields {
+  ToneMapping();
+  JXL_FIELDS_NAME(ToneMapping)
+
+  Status VisitFields(Visitor* JXL_RESTRICT visitor) override;
+
+  mutable bool all_default;
+
+  // Upper bound on the intensity level present in the image. For unsigned
+  // integer pixel encodings, this is the brightness of the largest
+  // representable value. The image does not necessarily contain a pixel
+  // actually this bright. An encoder is allowed to set 255 for SDR images
+  // without computing a histogram.
+  float intensity_target;  // [nits]
+
+  // Lower bound on the intensity level present in the image. This may be
+  // loose, i.e. lower than the actual darkest pixel. When tone mapping, a
+  // decoder will map [min_nits, intensity_target] to the display range.
+  float min_nits;
+
+  bool relative_to_max_display;  // see below
+  // The tone mapping will leave unchanged (linear mapping) any pixels whose
+  // brightness is strictly below this. The interpretation depends on
+  // relative_to_max_display. If true, this is a ratio [0, 1] of the maximum
+  // display brightness [nits], otherwise an absolute brightness [nits].
+  float linear_below;
+};
+
+// Contains weights to customize some trasnforms - in particular, XYB and
+// upsampling.
+struct CustomTransformData : public Fields {
+  CustomTransformData();
+  JXL_FIELDS_NAME(CustomTransformData)
+
+  Status VisitFields(Visitor* JXL_RESTRICT visitor) override;
+
+  // Must be set before calling VisitFields. Must equal xyb_encoded of
+  // ImageMetadata, should be set by ImageMetadata during VisitFields.
+  bool nonserialized_xyb_encoded = false;
+
+  mutable bool all_default;
+
+  OpsinInverseMatrix opsin_inverse_matrix;
+
+  uint32_t custom_weights_mask;
+  float upsampling2_weights[15];
+  float upsampling4_weights[55];
+  float upsampling8_weights[210];
+};
+
+// Properties of the original image bundle. This enables Encode(Decode()) to
+// re-create an equivalent image without user input.
+struct ImageMetadata : public Fields {
+  ImageMetadata();
+  JXL_FIELDS_NAME(ImageMetadata)
+
+  Status VisitFields(Visitor* JXL_RESTRICT visitor) override;
+
+  // Returns bit depth of the JPEG XL compressed alpha channel, or 0 if no alpha
+  // channel present. In the theoretical case that there are multiple alpha
+  // channels, returns the bit depht of the first.
+  uint32_t GetAlphaBits() const {
+    const ExtraChannelInfo* alpha = Find(ExtraChannel::kAlpha);
+    if (alpha == nullptr) return 0;
+    JXL_ASSERT(alpha->bit_depth.bits_per_sample != 0);
+    return alpha->bit_depth.bits_per_sample;
+  }
+
+  // Sets bit depth of alpha channel, adding extra channel if needed, or
+  // removing all alpha channels if bits is 0.
+  // Assumes integer alpha channel and not designed to support multiple
+  // alpha channels (it's possible to use those features by manipulating
+  // extra_channel_info directly).
+  //
+  // Callers must insert the actual channel image at the same index before any
+  // further modifications to extra_channel_info.
+  void SetAlphaBits(uint32_t bits, bool alpha_is_premultiplied = false);
+
+  bool HasAlpha() const { return GetAlphaBits() != 0; }
+
+  // Sets the original bit depth fields to indicate unsigned integer of the
+  // given bit depth.
+  // TODO(lode): move function to BitDepth
+  void SetUintSamples(uint32_t bits) {
+    bit_depth.bits_per_sample = bits;
+    bit_depth.exponent_bits_per_sample = 0;
+    bit_depth.floating_point_sample = false;
+    // RCT / Squeeze may add one bit each, and this is about int16_t,
+    // so uint13 should still be OK but limiting it to 12 seems safer.
+    // TODO(jon): figure out a better way to set this header field.
+    // (in particular, if modular mode is not used it doesn't matter,
+    // and if transforms are restricted, up to 15-bit could be done)
+    if (bits > 12) modular_16_bit_buffer_sufficient = false;
+  }
+  // Sets the original bit depth fields to indicate single precision floating
+  // point.
+  // TODO(lode): move function to BitDepth
+  void SetFloat32Samples() {
+    bit_depth.bits_per_sample = 32;
+    bit_depth.exponent_bits_per_sample = 8;
+    bit_depth.floating_point_sample = true;
+    modular_16_bit_buffer_sufficient = false;
+  }
+
+  void SetFloat16Samples() {
+    bit_depth.bits_per_sample = 16;
+    bit_depth.exponent_bits_per_sample = 5;
+    bit_depth.floating_point_sample = true;
+    modular_16_bit_buffer_sufficient = false;
+  }
+
+  void SetIntensityTarget(float intensity_target) {
+    tone_mapping.intensity_target = intensity_target;
+  }
+  float IntensityTarget() const {
+    JXL_ASSERT(tone_mapping.intensity_target != 0);
+    return tone_mapping.intensity_target;
+  }
+
+  // Returns first ExtraChannelInfo of the given type, or nullptr if none.
+  const ExtraChannelInfo* Find(ExtraChannel type) const {
+    for (const ExtraChannelInfo& eci : extra_channel_info) {
+      if (eci.type == type) return &eci;
+    }
+    return nullptr;
+  }
+
+  // Returns first ExtraChannelInfo of the given type, or nullptr if none.
+  ExtraChannelInfo* Find(ExtraChannel type) {
+    for (ExtraChannelInfo& eci : extra_channel_info) {
+      if (eci.type == type) return &eci;
+    }
+    return nullptr;
+  }
+
+  Orientation GetOrientation() const {
+    return static_cast<Orientation>(orientation);
+  }
+
+  bool ExtraFieldsDefault() const;
+
+  std::string DebugString() const;
+
+  mutable bool all_default;
+
+  BitDepth bit_depth;
+  bool modular_16_bit_buffer_sufficient;  // otherwise 32 is.
+
+  // Whether the colors values of the pixels of frames are encoded in the
+  // codestream using the absolute XYB color space, or the using values that
+  // follow the color space defined by the ColorEncoding or ICC profile. This
+  // determines when or whether a CMS (Color Management System) is needed to get
+  // the pixels in a desired color space. In one case, the pixels have one known
+  // color space and a CMS is needed to convert them to the original image's
+  // color space, in the other case the pixels have the color space of the
+  // original image and a CMS is required if a different display space, or a
+  // single known consistent color space for multiple decoded images, is
+  // desired. In all cases, the color space of all frames from a single image is
+  // the same, both VarDCT and modular frames.
+  //
+  // If true: then frames can be decoded to XYB (which can also be converted to
+  // linear and non-linear sRGB with the built in conversion without CMS). The
+  // attached ColorEncoding or ICC profile has no effect on the meaning of the
+  // pixel's color values, but instead indicates what the color profile of the
+  // original image was, and what color profile one should convert to when
+  // decoding to integers to prevent clipping and precision loss. To do that
+  // conversion requires a CMS.
+  //
+  // If false: then the color values of decoded frames are in the space defined
+  // by the attached ColorEncoding or ICC profile. To instead get the pixels in
+  // a chosen known color space, such as sRGB, requires a CMS, since the
+  // attached ColorEncoding or ICC profile could be any arbitrary color space.
+  // This mode is typically used for lossless images encoded as integers.
+  // Frames can also use YCbCr encoding, some frames may and some may not, but
+  // this is not a different color space but a certain encoding of the RGB
+  // values.
+  //
+  // Note: if !xyb_encoded, but the attached color profile indicates XYB (which
+  // can happen either if it's a ColorEncoding with color_space_ ==
+  // ColorSpace::kXYB, or if it's an ICC Profile that has been crafted to
+  // represent XYB), then the frames still may not use ColorEncoding kXYB, they
+  // must still use kNone (or kYCbCr, which would mean applying the YCbCr
+  // transform to the 3-channel XYB data), since with !xyb_encoded, the 3
+  // channels are stored as-is, no matter what meaning the color profile assigns
+  // to them. To use ColorEncoding::kXYB, xyb_encoded must be true.
+  //
+  // This value is defined in image metadata because this is the global
+  // codestream header. This value does not affect the image itself, so is not
+  // image metadata per se, it only affects the encoding, and what color space
+  // the decoder can receive the pixels in without needing a CMS.
+  bool xyb_encoded;
+
+  ColorEncoding color_encoding;
+
+  // These values are initialized to defaults such that the 'extra_fields'
+  // condition in VisitFields uses correctly initialized values.
+  uint32_t orientation = 1;
+  bool have_preview = false;
+  bool have_animation = false;
+  bool have_intrinsic_size = false;
+
+  // If present, the stored image has the dimensions of the first SizeHeader,
+  // but decoders are advised to resample or display per `intrinsic_size`.
+  SizeHeader intrinsic_size;  // only if have_intrinsic_size
+
+  ToneMapping tone_mapping;
+
+  // When reading: deserialized. When writing: automatically set from vector.
+  uint32_t num_extra_channels;
+  std::vector<ExtraChannelInfo> extra_channel_info;
+
+  // Only present if m.have_preview.
+  PreviewHeader preview_size;
+  // Only present if m.have_animation.
+  AnimationHeader animation;
+
+  uint64_t extensions;
+
+  // Option to stop parsing after basic info, and treat as if the later
+  // fields do not participate. Use to parse only basic image information
+  // excluding the final larger or variable sized data.
+  bool nonserialized_only_parse_basic_info = false;
+};
+
+Status ReadImageMetadata(BitReader* JXL_RESTRICT reader,
+                         ImageMetadata* JXL_RESTRICT metadata);
+
+Status WriteImageMetadata(const ImageMetadata& metadata,
+                          BitWriter* JXL_RESTRICT writer, size_t layer,
+                          AuxOut* aux_out);
+
+// All metadata applicable to the entire codestream (dimensions, extra channels,
+// ...)
+struct CodecMetadata {
+  // TODO(lode): use the preview and animation fields too, in place of the
+  // nonserialized_ ones in ImageMetadata.
+  ImageMetadata m;
+  // The size of the codestream: this is the nominal size applicable to all
+  // frames, although some frames can have a different effective size through
+  // crop, dc_level or representing a the preview.
+  SizeHeader size;
+  // Often default.
+  CustomTransformData transform_data;
+
+  size_t xsize() const { return size.xsize(); }
+  size_t ysize() const { return size.ysize(); }
+  size_t oriented_xsize(bool keep_orientation) const {
+    if (static_cast<uint32_t>(m.GetOrientation()) > 4 && !keep_orientation) {
+      return ysize();
+    } else {
+      return xsize();
+    }
+  }
+  size_t oriented_preview_xsize(bool keep_orientation) const {
+    if (static_cast<uint32_t>(m.GetOrientation()) > 4 && !keep_orientation) {
+      return m.preview_size.ysize();
+    } else {
+      return m.preview_size.xsize();
+    }
+  }
+  size_t oriented_ysize(bool keep_orientation) const {
+    if (static_cast<uint32_t>(m.GetOrientation()) > 4 && !keep_orientation) {
+      return xsize();
+    } else {
+      return ysize();
+    }
+  }
+  size_t oriented_preview_ysize(bool keep_orientation) const {
+    if (static_cast<uint32_t>(m.GetOrientation()) > 4 && !keep_orientation) {
+      return m.preview_size.xsize();
+    } else {
+      return m.preview_size.ysize();
+    }
+  }
+
+  std::string DebugString() const;
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_IMAGE_METADATA_H_
diff --git a/third_party/jpeg-xl/lib/jxl/image_ops.h b/third_party/jpeg-xl/lib/jxl/image_ops.h
new file mode 100644
index 0000000000..c025007e95
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/image_ops.h
@@ -0,0 +1,805 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_IMAGE_OPS_H_
+#define LIB_JXL_IMAGE_OPS_H_
+
+// Operations on images.
+
+#include <algorithm>
+#include <array>
+#include <limits>
+#include <vector>
+
+#include "lib/jxl/base/profiler.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/image.h"
+
+namespace jxl {
+
+template <typename T>
+void CopyImageTo(const Plane<T>& from, Plane<T>* JXL_RESTRICT to) {
+  PROFILER_ZONE("CopyImage1");
+  JXL_ASSERT(SameSize(from, *to));
+  if (from.ysize() == 0 || from.xsize() == 0) return;
+  for (size_t y = 0; y < from.ysize(); ++y) {
+    const T* JXL_RESTRICT row_from = from.ConstRow(y);
+    T* JXL_RESTRICT row_to = to->Row(y);
+    memcpy(row_to, row_from, from.xsize() * sizeof(T));
+  }
+}
+
+// DEPRECATED - prefer to preallocate result.
+template <typename T>
+Plane<T> CopyImage(const Plane<T>& from) {
+  Plane<T> to(from.xsize(), from.ysize());
+  CopyImageTo(from, &to);
+  return to;
+}
+
+// Copies `from:rect_from` to `to:rect_to`.
+template <typename T>
+void CopyImageTo(const Rect& rect_from, const Plane<T>& from,
+                 const Rect& rect_to, Plane<T>* JXL_RESTRICT to) {
+  PROFILER_ZONE("CopyImageR");
+  JXL_DASSERT(SameSize(rect_from, rect_to));
+  JXL_DASSERT(rect_from.IsInside(from));
+  JXL_DASSERT(rect_to.IsInside(*to));
+  if (rect_from.xsize() == 0) return;
+  for (size_t y = 0; y < rect_from.ysize(); ++y) {
+    const T* JXL_RESTRICT row_from = rect_from.ConstRow(from, y);
+    T* JXL_RESTRICT row_to = rect_to.Row(to, y);
+    memcpy(row_to, row_from, rect_from.xsize() * sizeof(T));
+  }
+}
+
+// DEPRECATED - Returns a copy of the "image" pixels that lie in "rect".
+template <typename T>
+Plane<T> CopyImage(const Rect& rect, const Plane<T>& image) {
+  Plane<T> copy(rect.xsize(), rect.ysize());
+  CopyImageTo(rect, image, &copy);
+  return copy;
+}
+
+// Copies `from:rect_from` to `to:rect_to`.
+template <typename T>
+void CopyImageTo(const Rect& rect_from, const Image3<T>& from,
+                 const Rect& rect_to, Image3<T>* JXL_RESTRICT to) {
+  PROFILER_ZONE("CopyImageR");
+  JXL_ASSERT(SameSize(rect_from, rect_to));
+  for (size_t c = 0; c < 3; c++) {
+    CopyImageTo(rect_from, from.Plane(c), rect_to, &to->Plane(c));
+  }
+}
+
+template <typename T, typename U>
+void ConvertPlaneAndClamp(const Rect& rect_from, const Plane<T>& from,
+                          const Rect& rect_to, Plane<U>* JXL_RESTRICT to) {
+  PROFILER_ZONE("ConvertPlane");
+  JXL_ASSERT(SameSize(rect_from, rect_to));
+  using M = decltype(T() + U());
+  for (size_t y = 0; y < rect_to.ysize(); ++y) {
+    const T* JXL_RESTRICT row_from = rect_from.ConstRow(from, y);
+    U* JXL_RESTRICT row_to = rect_to.Row(to, y);
+    for (size_t x = 0; x < rect_to.xsize(); ++x) {
+      row_to[x] =
+          std::min<M>(std::max<M>(row_from[x], std::numeric_limits<U>::min()),
+                      std::numeric_limits<U>::max());
+    }
+  }
+}
+
+// Copies `from` to `to`.
+template <typename T>
+void CopyImageTo(const T& from, T* JXL_RESTRICT to) {
+  return CopyImageTo(Rect(from), from, Rect(*to), to);
+}
+
+// Copies `from:rect_from` to `to`.
+template <typename T>
+void CopyImageTo(const Rect& rect_from, const T& from, T* JXL_RESTRICT to) {
+  return CopyImageTo(rect_from, from, Rect(*to), to);
+}
+
+// Copies `from` to `to:rect_to`.
+template <typename T>
+void CopyImageTo(const T& from, const Rect& rect_to, T* JXL_RESTRICT to) {
+  return CopyImageTo(Rect(from), from, rect_to, to);
+}
+
+// Copies `from:rect_from` to `to:rect_to`; also copies `padding` pixels of
+// border around `from:rect_from`, in all directions, whenever they are inside
+// the first image.
+template <typename T>
+void CopyImageToWithPadding(const Rect& from_rect, const T& from,
+                            size_t padding, const Rect& to_rect, T* to) {
+  size_t xextra0 = std::min(padding, from_rect.x0());
+  size_t xextra1 =
+      std::min(padding, from.xsize() - from_rect.x0() - from_rect.xsize());
+  size_t yextra0 = std::min(padding, from_rect.y0());
+  size_t yextra1 =
+      std::min(padding, from.ysize() - from_rect.y0() - from_rect.ysize());
+  JXL_DASSERT(to_rect.x0() >= xextra0);
+  JXL_DASSERT(to_rect.y0() >= yextra0);
+
+  return CopyImageTo(Rect(from_rect.x0() - xextra0, from_rect.y0() - yextra0,
+                          from_rect.xsize() + xextra0 + xextra1,
+                          from_rect.ysize() + yextra0 + yextra1),
+                     from,
+                     Rect(to_rect.x0() - xextra0, to_rect.y0() - yextra0,
+                          to_rect.xsize() + xextra0 + xextra1,
+                          to_rect.ysize() + yextra0 + yextra1),
+                     to);
+}
+
+// DEPRECATED - prefer to preallocate result.
+template <typename T>
+Image3<T> CopyImage(const Image3<T>& from) {
+  Image3<T> copy(from.xsize(), from.ysize());
+  CopyImageTo(from, &copy);
+  return copy;
+}
+
+// DEPRECATED - prefer to preallocate result.
+template <typename T>
+Image3<T> CopyImage(const Rect& rect, const Image3<T>& from) {
+  Image3<T> to(rect.xsize(), rect.ysize());
+  CopyImageTo(rect, from.Plane(0), to.Plane(0));
+  CopyImageTo(rect, from.Plane(1), to.Plane(1));
+  CopyImageTo(rect, from.Plane(2), to.Plane(2));
+  return to;
+}
+
+// Sets "thickness" pixels on each border to "value". This is faster than
+// initializing the entire image and overwriting valid/interior pixels.
+template <typename T>
+void SetBorder(const size_t thickness, const T value, Image3<T>* image) {
+  const size_t xsize = image->xsize();
+  const size_t ysize = image->ysize();
+  // Top: fill entire row
+  for (size_t c = 0; c < 3; ++c) {
+    for (size_t y = 0; y < std::min(thickness, ysize); ++y) {
+      T* JXL_RESTRICT row = image->PlaneRow(c, y);
+      std::fill(row, row + xsize, value);
+    }
+
+    // Bottom: fill entire row
+    for (size_t y = ysize - thickness; y < ysize; ++y) {
+      T* JXL_RESTRICT row = image->PlaneRow(c, y);
+      std::fill(row, row + xsize, value);
+    }
+
+    // Left/right: fill the 'columns' on either side, but only if the image is
+    // big enough that they don't already belong to the top/bottom rows.
+    if (ysize >= 2 * thickness) {
+      for (size_t y = thickness; y < ysize - thickness; ++y) {
+        T* JXL_RESTRICT row = image->PlaneRow(c, y);
+        std::fill(row, row + thickness, value);
+        std::fill(row + xsize - thickness, row + xsize, value);
+      }
+    }
+  }
+}
+
+template <class ImageIn, class ImageOut>
+void Subtract(const ImageIn& image1, const ImageIn& image2, ImageOut* out) {
+  using T = typename ImageIn::T;
+  const size_t xsize = image1.xsize();
+  const size_t ysize = image1.ysize();
+  JXL_CHECK(xsize == image2.xsize());
+  JXL_CHECK(ysize == image2.ysize());
+
+  for (size_t y = 0; y < ysize; ++y) {
+    const T* const JXL_RESTRICT row1 = image1.Row(y);
+    const T* const JXL_RESTRICT row2 = image2.Row(y);
+    T* const JXL_RESTRICT row_out = out->Row(y);
+    for (size_t x = 0; x < xsize; ++x) {
+      row_out[x] = row1[x] - row2[x];
+    }
+  }
+}
+
+// In-place.
+template <typename Tin, typename Tout>
+void SubtractFrom(const Plane<Tin>& what, Plane<Tout>* to) {
+  const size_t xsize = what.xsize();
+  const size_t ysize = what.ysize();
+  for (size_t y = 0; y < ysize; ++y) {
+    const Tin* JXL_RESTRICT row_what = what.ConstRow(y);
+    Tout* JXL_RESTRICT row_to = to->Row(y);
+    for (size_t x = 0; x < xsize; ++x) {
+      row_to[x] -= row_what[x];
+    }
+  }
+}
+
+// In-place.
+template <typename Tin, typename Tout>
+void AddTo(const Plane<Tin>& what, Plane<Tout>* to) {
+  const size_t xsize = what.xsize();
+  const size_t ysize = what.ysize();
+  for (size_t y = 0; y < ysize; ++y) {
+    const Tin* JXL_RESTRICT row_what = what.ConstRow(y);
+    Tout* JXL_RESTRICT row_to = to->Row(y);
+    for (size_t x = 0; x < xsize; ++x) {
+      row_to[x] += row_what[x];
+    }
+  }
+}
+
+template <typename Tin, typename Tout>
+void AddTo(Rect rectFrom, const Plane<Tin>& what, Rect rectTo,
+           Plane<Tout>* to) {
+  JXL_ASSERT(SameSize(rectFrom, rectTo));
+  const size_t xsize = rectTo.xsize();
+  const size_t ysize = rectTo.ysize();
+  for (size_t y = 0; y < ysize; ++y) {
+    const Tin* JXL_RESTRICT row_what = rectFrom.ConstRow(what, y);
+    Tout* JXL_RESTRICT row_to = rectTo.Row(to, y);
+    for (size_t x = 0; x < xsize; ++x) {
+      row_to[x] += row_what[x];
+    }
+  }
+}
+
+// Returns linear combination of two grayscale images.
+template <typename T>
+Plane<T> LinComb(const T lambda1, const Plane<T>& image1, const T lambda2,
+                 const Plane<T>& image2) {
+  const size_t xsize = image1.xsize();
+  const size_t ysize = image1.ysize();
+  JXL_CHECK(xsize == image2.xsize());
+  JXL_CHECK(ysize == image2.ysize());
+  Plane<T> out(xsize, ysize);
+  for (size_t y = 0; y < ysize; ++y) {
+    const T* const JXL_RESTRICT row1 = image1.Row(y);
+    const T* const JXL_RESTRICT row2 = image2.Row(y);
+    T* const JXL_RESTRICT row_out = out.Row(y);
+    for (size_t x = 0; x < xsize; ++x) {
+      row_out[x] = lambda1 * row1[x] + lambda2 * row2[x];
+    }
+  }
+  return out;
+}
+
+// Returns a pixel-by-pixel multiplication of image by lambda.
+template <typename T>
+Plane<T> ScaleImage(const T lambda, const Plane<T>& image) {
+  Plane<T> out(image.xsize(), image.ysize());
+  for (size_t y = 0; y < image.ysize(); ++y) {
+    const T* const JXL_RESTRICT row = image.Row(y);
+    T* const JXL_RESTRICT row_out = out.Row(y);
+    for (size_t x = 0; x < image.xsize(); ++x) {
+      row_out[x] = lambda * row[x];
+    }
+  }
+  return out;
+}
+
+// Multiplies image by lambda in-place
+template <typename T>
+void ScaleImage(const T lambda, Plane<T>* image) {
+  for (size_t y = 0; y < image->ysize(); ++y) {
+    T* const JXL_RESTRICT row = image->Row(y);
+    for (size_t x = 0; x < image->xsize(); ++x) {
+      row[x] = lambda * row[x];
+    }
+  }
+}
+
+template <typename T>
+Plane<T> Product(const Plane<T>& a, const Plane<T>& b) {
+  Plane<T> c(a.xsize(), a.ysize());
+  for (size_t y = 0; y < a.ysize(); ++y) {
+    const T* const JXL_RESTRICT row_a = a.Row(y);
+    const T* const JXL_RESTRICT row_b = b.Row(y);
+    T* const JXL_RESTRICT row_c = c.Row(y);
+    for (size_t x = 0; x < a.xsize(); ++x) {
+      row_c[x] = row_a[x] * row_b[x];
+    }
+  }
+  return c;
+}
+
+template <typename T>
+void FillImage(const T value, Plane<T>* image) {
+  for (size_t y = 0; y < image->ysize(); ++y) {
+    T* const JXL_RESTRICT row = image->Row(y);
+    for (size_t x = 0; x < image->xsize(); ++x) {
+      row[x] = value;
+    }
+  }
+}
+
+template <typename T>
+void ZeroFillImage(Plane<T>* image) {
+  if (image->xsize() == 0) return;
+  for (size_t y = 0; y < image->ysize(); ++y) {
+    T* const JXL_RESTRICT row = image->Row(y);
+    memset(row, 0, image->xsize() * sizeof(T));
+  }
+}
+
+// Mirrors out of bounds coordinates and returns valid coordinates unchanged.
+// We assume the radius (distance outside the image) is small compared to the
+// image size, otherwise this might not terminate.
+// The mirror is outside the last column (border pixel is also replicated).
+static inline int64_t Mirror(int64_t x, const int64_t xsize) {
+  JXL_DASSERT(xsize != 0);
+
+  // TODO(janwas): replace with branchless version
+  while (x < 0 || x >= xsize) {
+    if (x < 0) {
+      x = -x - 1;
+    } else {
+      x = 2 * xsize - 1 - x;
+    }
+  }
+  return x;
+}
+
+// Wrap modes for ensuring X/Y coordinates are in the valid range [0, size):
+
+// Mirrors (repeating the edge pixel once). Useful for convolutions.
+struct WrapMirror {
+  JXL_INLINE int64_t operator()(const int64_t coord, const int64_t size) const {
+    return Mirror(coord, size);
+  }
+};
+
+// Returns the same coordinate: required for TFNode with Border(), or useful
+// when we know "coord" is already valid (e.g. interior of an image).
+struct WrapUnchanged {
+  JXL_INLINE int64_t operator()(const int64_t coord, int64_t /*size*/) const {
+    return coord;
+  }
+};
+
+// Similar to Wrap* but for row pointers (reduces Row() multiplications).
+
+class WrapRowMirror {
+ public:
+  template <class ImageOrView>
+  WrapRowMirror(const ImageOrView& image, size_t ysize)
+      : first_row_(image.ConstRow(0)), last_row_(image.ConstRow(ysize - 1)) {}
+
+  const float* operator()(const float* const JXL_RESTRICT row,
+                          const int64_t stride) const {
+    if (row < first_row_) {
+      const int64_t num_before = first_row_ - row;
+      // Mirrored; one row before => row 0, two before = row 1, ...
+      return first_row_ + num_before - stride;
+    }
+    if (row > last_row_) {
+      const int64_t num_after = row - last_row_;
+      // Mirrored; one row after => last row, two after = last - 1, ...
+      return last_row_ - num_after + stride;
+    }
+    return row;
+  }
+
+ private:
+  const float* const JXL_RESTRICT first_row_;
+  const float* const JXL_RESTRICT last_row_;
+};
+
+struct WrapRowUnchanged {
+  JXL_INLINE const float* operator()(const float* const JXL_RESTRICT row,
+                                     int64_t /*stride*/) const {
+    return row;
+  }
+};
+
+// Sets "thickness" pixels on each border to "value". This is faster than
+// initializing the entire image and overwriting valid/interior pixels.
+template <typename T>
+void SetBorder(const size_t thickness, const T value, Plane<T>* image) {
+  const size_t xsize = image->xsize();
+  const size_t ysize = image->ysize();
+  // Top: fill entire row
+  for (size_t y = 0; y < std::min(thickness, ysize); ++y) {
+    T* const JXL_RESTRICT row = image->Row(y);
+    std::fill(row, row + xsize, value);
+  }
+
+  // Bottom: fill entire row
+  for (size_t y = ysize - thickness; y < ysize; ++y) {
+    T* const JXL_RESTRICT row = image->Row(y);
+    std::fill(row, row + xsize, value);
+  }
+
+  // Left/right: fill the 'columns' on either side, but only if the image is
+  // big enough that they don't already belong to the top/bottom rows.
+  if (ysize >= 2 * thickness) {
+    for (size_t y = thickness; y < ysize - thickness; ++y) {
+      T* const JXL_RESTRICT row = image->Row(y);
+      std::fill(row, row + thickness, value);
+      std::fill(row + xsize - thickness, row + xsize, value);
+    }
+  }
+}
+
+// Computes the minimum and maximum pixel value.
+template <typename T>
+void ImageMinMax(const Plane<T>& image, T* const JXL_RESTRICT min,
+                 T* const JXL_RESTRICT max) {
+  *min = std::numeric_limits<T>::max();
+  *max = std::numeric_limits<T>::lowest();
+  for (size_t y = 0; y < image.ysize(); ++y) {
+    const T* const JXL_RESTRICT row = image.Row(y);
+    for (size_t x = 0; x < image.xsize(); ++x) {
+      *min = std::min(*min, row[x]);
+      *max = std::max(*max, row[x]);
+    }
+  }
+}
+
+// Copies pixels, scaling their value relative to the "from" min/max by
+// "to_range". Example: U8 [0, 255] := [0.0, 1.0], to_range = 1.0 =>
+// outputs [0.0, 1.0].
+template <typename FromType, typename ToType>
+void ImageConvert(const Plane<FromType>& from, const float to_range,
+                  Plane<ToType>* const JXL_RESTRICT to) {
+  JXL_ASSERT(SameSize(from, *to));
+  FromType min_from, max_from;
+  ImageMinMax(from, &min_from, &max_from);
+  const float scale = to_range / (max_from - min_from);
+  for (size_t y = 0; y < from.ysize(); ++y) {
+    const FromType* const JXL_RESTRICT row_from = from.Row(y);
+    ToType* const JXL_RESTRICT row_to = to->Row(y);
+    for (size_t x = 0; x < from.xsize(); ++x) {
+      row_to[x] = static_cast<ToType>((row_from[x] - min_from) * scale);
+    }
+  }
+}
+
+template <typename From>
+Plane<float> ConvertToFloat(const Plane<From>& from) {
+  float factor = 1.0f / std::numeric_limits<From>::max();
+  if (std::is_same<From, double>::value || std::is_same<From, float>::value) {
+    factor = 1.0f;
+  }
+  Plane<float> to(from.xsize(), from.ysize());
+  for (size_t y = 0; y < from.ysize(); ++y) {
+    const From* const JXL_RESTRICT row_from = from.Row(y);
+    float* const JXL_RESTRICT row_to = to.Row(y);
+    for (size_t x = 0; x < from.xsize(); ++x) {
+      row_to[x] = row_from[x] * factor;
+    }
+  }
+  return to;
+}
+
+template <typename T>
+Plane<T> ImageFromPacked(const std::vector<T>& packed, const size_t xsize,
+                         const size_t ysize) {
+  Plane<T> out(xsize, ysize);
+  for (size_t y = 0; y < ysize; ++y) {
+    T* const JXL_RESTRICT row = out.Row(y);
+    const T* const JXL_RESTRICT packed_row = &packed[y * xsize];
+    memcpy(row, packed_row, xsize * sizeof(T));
+  }
+  return out;
+}
+
+// Computes independent minimum and maximum values for each plane.
+template <typename T>
+void Image3MinMax(const Image3<T>& image, const Rect& rect,
+                  std::array<T, 3>* out_min, std::array<T, 3>* out_max) {
+  for (size_t c = 0; c < 3; ++c) {
+    T min = std::numeric_limits<T>::max();
+    T max = std::numeric_limits<T>::min();
+    for (size_t y = 0; y < rect.ysize(); ++y) {
+      const T* JXL_RESTRICT row = rect.ConstPlaneRow(image, c, y);
+      for (size_t x = 0; x < rect.xsize(); ++x) {
+        min = std::min(min, row[x]);
+        max = std::max(max, row[x]);
+      }
+    }
+    (*out_min)[c] = min;
+    (*out_max)[c] = max;
+  }
+}
+
+// Computes independent minimum and maximum values for each plane.
+template <typename T>
+void Image3MinMax(const Image3<T>& image, std::array<T, 3>* out_min,
+                  std::array<T, 3>* out_max) {
+  Image3MinMax(image, Rect(image), out_min, out_max);
+}
+
+template <typename T>
+void Image3Max(const Image3<T>& image, std::array<T, 3>* out_max) {
+  for (size_t c = 0; c < 3; ++c) {
+    T max = std::numeric_limits<T>::min();
+    for (size_t y = 0; y < image.ysize(); ++y) {
+      const T* JXL_RESTRICT row = image.ConstPlaneRow(c, y);
+      for (size_t x = 0; x < image.xsize(); ++x) {
+        max = std::max(max, row[x]);
+      }
+    }
+    (*out_max)[c] = max;
+  }
+}
+
+// Computes the sum of the pixels in `rect`.
+template <typename T>
+T ImageSum(const Plane<T>& image, const Rect& rect) {
+  T result = 0;
+  for (size_t y = 0; y < rect.ysize(); ++y) {
+    const T* JXL_RESTRICT row = rect.ConstRow(image, y);
+    for (size_t x = 0; x < rect.xsize(); ++x) {
+      result += row[x];
+    }
+  }
+  return result;
+}
+
+template <typename T>
+T ImageSum(const Plane<T>& image) {
+  return ImageSum(image, Rect(image));
+}
+
+template <typename T>
+std::array<T, 3> Image3Sum(const Image3<T>& image, const Rect& rect) {
+  std::array<T, 3> out_sum = 0;
+  for (size_t c = 0; c < 3; ++c) {
+    (out_sum)[c] = ImageSum(image.Plane(c), rect);
+  }
+  return out_sum;
+}
+
+template <typename T>
+std::array<T, 3> Image3Sum(const Image3<T>& image) {
+  return Image3Sum(image, Rect(image));
+}
+
+template <typename T>
+std::vector<T> PackedFromImage(const Plane<T>& image, const Rect& rect) {
+  const size_t xsize = rect.xsize();
+  const size_t ysize = rect.ysize();
+  std::vector<T> packed(xsize * ysize);
+  for (size_t y = 0; y < rect.ysize(); ++y) {
+    memcpy(&packed[y * xsize], rect.ConstRow(image, y), xsize * sizeof(T));
+  }
+  return packed;
+}
+
+template <typename T>
+std::vector<T> PackedFromImage(const Plane<T>& image) {
+  return PackedFromImage(image, Rect(image));
+}
+
+// Computes the median pixel value.
+template <typename T>
+T ImageMedian(const Plane<T>& image, const Rect& rect) {
+  std::vector<T> pixels = PackedFromImage(image, rect);
+  return Median(&pixels);
+}
+
+template <typename T>
+T ImageMedian(const Plane<T>& image) {
+  return ImageMedian(image, Rect(image));
+}
+
+template <typename T>
+std::array<T, 3> Image3Median(const Image3<T>& image, const Rect& rect) {
+  std::array<T, 3> out_median;
+  for (size_t c = 0; c < 3; ++c) {
+    (out_median)[c] = ImageMedian(image.Plane(c), rect);
+  }
+  return out_median;
+}
+
+template <typename T>
+std::array<T, 3> Image3Median(const Image3<T>& image) {
+  return Image3Median(image, Rect(image));
+}
+
+template <typename FromType, typename ToType>
+void Image3Convert(const Image3<FromType>& from, const float to_range,
+                   Image3<ToType>* const JXL_RESTRICT to) {
+  JXL_ASSERT(SameSize(from, *to));
+  std::array<FromType, 3> min_from, max_from;
+  Image3MinMax(from, &min_from, &max_from);
+  float scales[3];
+  for (size_t c = 0; c < 3; ++c) {
+    scales[c] = to_range / (max_from[c] - min_from[c]);
+  }
+  float scale = std::min(scales[0], std::min(scales[1], scales[2]));
+  for (size_t c = 0; c < 3; ++c) {
+    for (size_t y = 0; y < from.ysize(); ++y) {
+      const FromType* JXL_RESTRICT row_from = from.ConstPlaneRow(c, y);
+      ToType* JXL_RESTRICT row_to = to->PlaneRow(c, y);
+      for (size_t x = 0; x < from.xsize(); ++x) {
+        const float to = (row_from[x] - min_from[c]) * scale;
+        row_to[x] = static_cast<ToType>(to);
+      }
+    }
+  }
+}
+
+template <typename From>
+Image3F ConvertToFloat(const Image3<From>& from) {
+  return Image3F(ConvertToFloat(from.Plane(0)), ConvertToFloat(from.Plane(1)),
+                 ConvertToFloat(from.Plane(2)));
+}
+
+template <typename Tin, typename Tout>
+void Subtract(const Image3<Tin>& image1, const Image3<Tin>& image2,
+              Image3<Tout>* out) {
+  const size_t xsize = image1.xsize();
+  const size_t ysize = image1.ysize();
+  JXL_CHECK(xsize == image2.xsize());
+  JXL_CHECK(ysize == image2.ysize());
+
+  for (size_t c = 0; c < 3; ++c) {
+    for (size_t y = 0; y < ysize; ++y) {
+      const Tin* const JXL_RESTRICT row1 = image1.ConstPlaneRow(c, y);
+      const Tin* const JXL_RESTRICT row2 = image2.ConstPlaneRow(c, y);
+      Tout* const JXL_RESTRICT row_out = out->PlaneRow(c, y);
+      for (size_t x = 0; x < xsize; ++x) {
+        row_out[x] = row1[x] - row2[x];
+      }
+    }
+  }
+}
+
+template <typename Tin, typename Tout>
+void SubtractFrom(const Image3<Tin>& what, Image3<Tout>* to) {
+  const size_t xsize = what.xsize();
+  const size_t ysize = what.ysize();
+  for (size_t c = 0; c < 3; ++c) {
+    for (size_t y = 0; y < ysize; ++y) {
+      const Tin* JXL_RESTRICT row_what = what.ConstPlaneRow(c, y);
+      Tout* JXL_RESTRICT row_to = to->PlaneRow(c, y);
+      for (size_t x = 0; x < xsize; ++x) {
+        row_to[x] -= row_what[x];
+      }
+    }
+  }
+}
+
+template <typename Tin, typename Tout>
+void AddTo(const Image3<Tin>& what, Image3<Tout>* to) {
+  const size_t xsize = what.xsize();
+  const size_t ysize = what.ysize();
+  for (size_t c = 0; c < 3; ++c) {
+    for (size_t y = 0; y < ysize; ++y) {
+      const Tin* JXL_RESTRICT row_what = what.ConstPlaneRow(c, y);
+      Tout* JXL_RESTRICT row_to = to->PlaneRow(c, y);
+      for (size_t x = 0; x < xsize; ++x) {
+        row_to[x] += row_what[x];
+      }
+    }
+  }
+}
+
+// Adds `what` of the size of `rect` to `to` in the position of `rect`.
+template <typename Tin, typename Tout>
+void AddTo(const Rect& rect, const Image3<Tin>& what, Image3<Tout>* to) {
+  const size_t xsize = what.xsize();
+  const size_t ysize = what.ysize();
+  JXL_ASSERT(xsize == rect.xsize());
+  JXL_ASSERT(ysize == rect.ysize());
+  for (size_t c = 0; c < 3; ++c) {
+    for (size_t y = 0; y < ysize; ++y) {
+      const Tin* JXL_RESTRICT row_what = what.ConstPlaneRow(c, y);
+      Tout* JXL_RESTRICT row_to = rect.PlaneRow(to, c, y);
+      for (size_t x = 0; x < xsize; ++x) {
+        row_to[x] += row_what[x];
+      }
+    }
+  }
+}
+
+template <typename T>
+Image3<T> ScaleImage(const T lambda, const Image3<T>& image) {
+  Image3<T> out(image.xsize(), image.ysize());
+  for (size_t c = 0; c < 3; ++c) {
+    for (size_t y = 0; y < image.ysize(); ++y) {
+      const T* JXL_RESTRICT row = image.ConstPlaneRow(c, y);
+      T* JXL_RESTRICT row_out = out.PlaneRow(c, y);
+      for (size_t x = 0; x < image.xsize(); ++x) {
+        row_out[x] = lambda * row[x];
+      }
+    }
+  }
+  return out;
+}
+
+// Multiplies image by lambda in-place
+template <typename T>
+void ScaleImage(const T lambda, Image3<T>* image) {
+  for (size_t c = 0; c < 3; ++c) {
+    for (size_t y = 0; y < image->ysize(); ++y) {
+      T* const JXL_RESTRICT row = image->PlaneRow(c, y);
+      for (size_t x = 0; x < image->xsize(); ++x) {
+        row[x] = lambda * row[x];
+      }
+    }
+  }
+}
+
+// Initializes all planes to the same "value".
+template <typename T>
+void FillImage(const T value, Image3<T>* image) {
+  for (size_t c = 0; c < 3; ++c) {
+    for (size_t y = 0; y < image->ysize(); ++y) {
+      T* JXL_RESTRICT row = image->PlaneRow(c, y);
+      for (size_t x = 0; x < image->xsize(); ++x) {
+        row[x] = value;
+      }
+    }
+  }
+}
+
+template <typename T>
+void FillPlane(const T value, Plane<T>* image) {
+  for (size_t y = 0; y < image->ysize(); ++y) {
+    T* JXL_RESTRICT row = image->Row(y);
+    for (size_t x = 0; x < image->xsize(); ++x) {
+      row[x] = value;
+    }
+  }
+}
+
+template <typename T>
+void FillImage(const T value, Image3<T>* image, Rect rect) {
+  for (size_t c = 0; c < 3; ++c) {
+    for (size_t y = 0; y < rect.ysize(); ++y) {
+      T* JXL_RESTRICT row = rect.PlaneRow(image, c, y);
+      for (size_t x = 0; x < rect.xsize(); ++x) {
+        row[x] = value;
+      }
+    }
+  }
+}
+
+template <typename T>
+void FillPlane(const T value, Plane<T>* image, Rect rect) {
+  for (size_t y = 0; y < rect.ysize(); ++y) {
+    T* JXL_RESTRICT row = rect.Row(image, y);
+    for (size_t x = 0; x < rect.xsize(); ++x) {
+      row[x] = value;
+    }
+  }
+}
+
+template <typename T>
+void ZeroFillImage(Image3<T>* image) {
+  for (size_t c = 0; c < 3; ++c) {
+    for (size_t y = 0; y < image->ysize(); ++y) {
+      T* JXL_RESTRICT row = image->PlaneRow(c, y);
+      if (image->xsize() != 0) memset(row, 0, image->xsize() * sizeof(T));
+    }
+  }
+}
+
+template <typename T>
+void ZeroFillPlane(Plane<T>* image, Rect rect) {
+  for (size_t y = 0; y < rect.ysize(); ++y) {
+    T* JXL_RESTRICT row = rect.Row(image, y);
+    memset(row, 0, rect.xsize() * sizeof(T));
+  }
+}
+
+// Pad an image with xborder columns on each vertical side and yboder rows
+// above and below, mirroring the image.
+Image3F PadImageMirror(const Image3F& in, size_t xborder, size_t yborder);
+
+// Same as above, but operates in-place. Assumes that the `in` image was
+// allocated large enough.
+void PadImageToBlockMultipleInPlace(Image3F* JXL_RESTRICT in,
+                                    size_t block_dim = kBlockDim);
+
+// Downsamples an image by a given factor.
+void DownsampleImage(Image3F* opsin, size_t factor);
+void DownsampleImage(ImageF* image, size_t factor);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_IMAGE_OPS_H_
diff --git a/third_party/jpeg-xl/lib/jxl/image_ops_test.cc b/third_party/jpeg-xl/lib/jxl/image_ops_test.cc
new file mode 100644
index 0000000000..44c021513d
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/image_ops_test.cc
@@ -0,0 +1,164 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/image_ops.h"
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <utility>
+
+#include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/image_test_utils.h"
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+namespace {
+
+template <typename T>
+void TestPacked(const size_t xsize, const size_t ysize) {
+  Plane<T> image1(xsize, ysize);
+  RandomFillImage(&image1);
+  const std::vector<T>& packed = PackedFromImage(image1);
+  const Plane<T>& image2 = ImageFromPacked(packed, xsize, ysize);
+  JXL_EXPECT_OK(SamePixels(image1, image2, _));
+}
+
+TEST(ImageTest, TestPacked) {
+  TestPacked<uint8_t>(1, 1);
+  TestPacked<uint8_t>(7, 1);
+  TestPacked<uint8_t>(1, 7);
+
+  TestPacked<int16_t>(1, 1);
+  TestPacked<int16_t>(7, 1);
+  TestPacked<int16_t>(1, 7);
+
+  TestPacked<uint16_t>(1, 1);
+  TestPacked<uint16_t>(7, 1);
+  TestPacked<uint16_t>(1, 7);
+
+  TestPacked<float>(1, 1);
+  TestPacked<float>(7, 1);
+  TestPacked<float>(1, 7);
+}
+
+// Ensure entire payload is readable/writable for various size/offset combos.
+TEST(ImageTest, TestAllocator) {
+  Rng rng(0);
+  const size_t k32 = 32;
+  const size_t kAlign = CacheAligned::kAlignment;
+  for (size_t size : {k32 * 1, k32 * 2, k32 * 3, k32 * 4, k32 * 5,
+                      CacheAligned::kAlias, 2 * CacheAligned::kAlias + 4}) {
+    for (size_t offset = 0; offset <= CacheAligned::kAlias; offset += kAlign) {
+      uint8_t* bytes =
+          static_cast<uint8_t*>(CacheAligned::Allocate(size, offset));
+      JXL_CHECK(reinterpret_cast<uintptr_t>(bytes) % kAlign == 0);
+      // Ensure we can write/read the last byte. Use RNG to fool the compiler
+      // into thinking the write is necessary.
+      memset(bytes, 0, size);
+      bytes[size - 1] = 1;                       // greatest element
+      uint32_t pos = rng.UniformU(0, size - 1);  // random but != greatest
+      JXL_CHECK(bytes[pos] < bytes[size - 1]);
+
+      CacheAligned::Free(bytes);
+    }
+  }
+}
+
+template <typename T>
+void TestFillImpl(Image3<T>* img, const char* layout) {
+  FillImage(T(1), img);
+  for (size_t y = 0; y < img->ysize(); ++y) {
+    for (size_t c = 0; c < 3; ++c) {
+      T* JXL_RESTRICT row = img->PlaneRow(c, y);
+      for (size_t x = 0; x < img->xsize(); ++x) {
+        if (row[x] != T(1)) {
+          printf("Not 1 at c=%" PRIuS " %" PRIuS ", %" PRIuS " (%" PRIuS
+                 " x %" PRIuS ") (%s)\n",
+                 c, x, y, img->xsize(), img->ysize(), layout);
+          abort();
+        }
+        row[x] = T(2);
+      }
+    }
+  }
+
+  // Same for ZeroFillImage and swapped c/y loop ordering.
+  ZeroFillImage(img);
+  for (size_t c = 0; c < 3; ++c) {
+    for (size_t y = 0; y < img->ysize(); ++y) {
+      T* JXL_RESTRICT row = img->PlaneRow(c, y);
+      for (size_t x = 0; x < img->xsize(); ++x) {
+        if (row[x] != T(0)) {
+          printf("Not 0 at c=%" PRIuS " %" PRIuS ", %" PRIuS " (%" PRIuS
+                 " x %" PRIuS ") (%s)\n",
+                 c, x, y, img->xsize(), img->ysize(), layout);
+          abort();
+        }
+        row[x] = T(3);
+      }
+    }
+  }
+}
+
+template <typename T>
+void TestFillT() {
+  for (uint32_t xsize : {0, 1, 15, 16, 31, 32}) {
+    for (uint32_t ysize : {0, 1, 15, 16, 31, 32}) {
+      Image3<T> image(xsize, ysize);
+      TestFillImpl(&image, "size ctor");
+
+      Image3<T> planar(Plane<T>(xsize, ysize), Plane<T>(xsize, ysize),
+                       Plane<T>(xsize, ysize));
+      TestFillImpl(&planar, "planar");
+    }
+  }
+}
+
+// Ensure y/c/x and c/y/x loops visit pixels no more than once.
+TEST(ImageTest, TestFill) {
+  TestFillT<uint8_t>();
+  TestFillT<int16_t>();
+  TestFillT<float>();
+  TestFillT<double>();
+}
+
+TEST(ImageTest, CopyImageToWithPaddingTest) {
+  Plane<uint32_t> src(100, 61);
+  for (size_t y = 0; y < src.ysize(); y++) {
+    for (size_t x = 0; x < src.xsize(); x++) {
+      src.Row(y)[x] = x * 1000 + y;
+    }
+  }
+  Rect src_rect(10, 20, 30, 40);
+  EXPECT_TRUE(src_rect.IsInside(src));
+
+  Plane<uint32_t> dst(60, 50);
+  FillImage(0u, &dst);
+  Rect dst_rect(20, 5, 30, 40);
+  EXPECT_TRUE(dst_rect.IsInside(dst));
+
+  CopyImageToWithPadding(src_rect, src, /*padding=*/2, dst_rect, &dst);
+
+  // ysize is + 3 instead of + 4 because we are at the y image boundary on the
+  // source image.
+  Rect padded_dst_rect(20 - 2, 5 - 2, 30 + 4, 40 + 3);
+  for (size_t y = 0; y < dst.ysize(); y++) {
+    for (size_t x = 0; x < dst.xsize(); x++) {
+      if (Rect(x, y, 1, 1).IsInside(padded_dst_rect)) {
+        EXPECT_EQ((x - dst_rect.x0() + src_rect.x0()) * 1000 +
+                      (y - dst_rect.y0() + src_rect.y0()),
+                  dst.Row(y)[x]);
+      } else {
+        EXPECT_EQ(0u, dst.Row(y)[x]);
+      }
+    }
+  }
+}
+
+}  // namespace
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/image_test_utils.h b/third_party/jpeg-xl/lib/jxl/image_test_utils.h
new file mode 100644
index 0000000000..e7d72285e6
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/image_test_utils.h
@@ -0,0 +1,257 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_IMAGE_TEST_UTILS_H_
+#define LIB_JXL_IMAGE_TEST_UTILS_H_
+
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS
+#endif
+
+#include <inttypes.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include <cmath>
+#include <limits>
+#include <sstream>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/base/random.h"
+#include "lib/jxl/image.h"
+
+namespace jxl {
+
+template <typename T>
+bool SamePixels(const Plane<T>& image1, const Plane<T>& image2,
+                std::stringstream& failures) {
+  const Rect rect(image1);
+  JXL_CHECK(SameSize(image1, image2));
+  size_t mismatches = 0;
+  for (size_t y = rect.y0(); y < rect.ysize(); ++y) {
+    const T* const JXL_RESTRICT row1 = image1.Row(y);
+    const T* const JXL_RESTRICT row2 = image2.Row(y);
+    for (size_t x = rect.x0(); x < rect.xsize(); ++x) {
+      if (row1[x] != row2[x]) {
+        failures << "pixel mismatch" << x << ", " << y << ": "
+                 << double(row1[x]) << " != " << double(row2[x]) << "\n";
+        if (++mismatches > 4) {
+          return false;
+        }
+      }
+    }
+  }
+  return mismatches == 0;
+}
+
+template <typename T>
+bool SamePixels(const Image3<T>& image1, const Image3<T>& image2,
+                std::stringstream& failures) {
+  JXL_CHECK(SameSize(image1, image2));
+  for (size_t c = 0; c < 3; ++c) {
+    if (!SamePixels(image1.Plane(c), image2.Plane(c), failures)) {
+      return false;
+    }
+  }
+  return true;
+}
+
+// Use for floating-point images with fairly large numbers; tolerates small
+// absolute errors and/or small relative errors.
+template <typename T>
+bool VerifyRelativeError(const Plane<T>& expected, const Plane<T>& actual,
+                         const double threshold_l1,
+                         const double threshold_relative,
+                         std::stringstream& failures, const intptr_t border = 0,
+                         const size_t c = 0) {
+  JXL_CHECK(SameSize(expected, actual));
+  const intptr_t xsize = expected.xsize();
+  const intptr_t ysize = expected.ysize();
+
+  // Max over current scanline to give a better idea whether there are
+  // systematic errors or just one outlier. Invalid if negative.
+  double max_l1 = -1;
+  double max_relative = -1;
+  bool any_bad = false;
+  for (intptr_t y = border; y < ysize - border; ++y) {
+    const T* const JXL_RESTRICT row_expected = expected.Row(y);
+    const T* const JXL_RESTRICT row_actual = actual.Row(y);
+    for (intptr_t x = border; x < xsize - border; ++x) {
+      const double l1 = std::abs(row_expected[x] - row_actual[x]);
+
+      // Cannot compute relative, only check/update L1.
+      if (std::abs(row_expected[x]) < 1E-10) {
+        if (l1 > threshold_l1) {
+          any_bad = true;
+          max_l1 = std::max(max_l1, l1);
+        }
+      } else {
+        const double relative = l1 / std::abs(double(row_expected[x]));
+        if (l1 > threshold_l1 && relative > threshold_relative) {
+          // Fails both tolerances => will exit below, update max_*.
+          any_bad = true;
+          max_l1 = std::max(max_l1, l1);
+          max_relative = std::max(max_relative, relative);
+        }
+      }
+    }
+  }
+  if (!any_bad) {
+    return true;
+  }
+  // Never had a valid relative value, don't print it.
+  if (max_relative < 0) {
+    fprintf(stderr, "c=%" PRIu64 ": max +/- %E exceeds +/- %.2E\n",
+            static_cast<uint64_t>(c), max_l1, threshold_l1);
+  } else {
+    fprintf(stderr,
+            "c=%" PRIu64 ": max +/- %E, x %E exceeds +/- %.2E, x %.2E\n",
+            static_cast<uint64_t>(c), max_l1, max_relative, threshold_l1,
+            threshold_relative);
+  }
+  // Dump the expected image and actual image if the region is small enough.
+  const intptr_t kMaxTestDumpSize = 16;
+  if (xsize <= kMaxTestDumpSize + 2 * border &&
+      ysize <= kMaxTestDumpSize + 2 * border) {
+    fprintf(stderr, "Expected image:\n");
+    for (intptr_t y = border; y < ysize - border; ++y) {
+      const T* const JXL_RESTRICT row_expected = expected.Row(y);
+      for (intptr_t x = border; x < xsize - border; ++x) {
+        fprintf(stderr, "%10lf ", static_cast<double>(row_expected[x]));
+      }
+      fprintf(stderr, "\n");
+    }
+
+    fprintf(stderr, "Actual image:\n");
+    for (intptr_t y = border; y < ysize - border; ++y) {
+      const T* const JXL_RESTRICT row_expected = expected.Row(y);
+      const T* const JXL_RESTRICT row_actual = actual.Row(y);
+      for (intptr_t x = border; x < xsize - border; ++x) {
+        const double l1 = std::abs(row_expected[x] - row_actual[x]);
+
+        bool bad = l1 > threshold_l1;
+        if (row_expected[x] > 1E-10) {
+          const double relative = l1 / std::abs(double(row_expected[x]));
+          bad &= relative > threshold_relative;
+        }
+        if (bad) {
+          fprintf(stderr, "%10lf ", static_cast<double>(row_actual[x]));
+        } else {
+          fprintf(stderr, "%10s ", "==");
+        }
+      }
+      fprintf(stderr, "\n");
+    }
+  }
+
+  // Find first failing x for further debugging.
+  for (intptr_t y = border; y < ysize - border; ++y) {
+    const T* const JXL_RESTRICT row_expected = expected.Row(y);
+    const T* const JXL_RESTRICT row_actual = actual.Row(y);
+
+    for (intptr_t x = border; x < xsize - border; ++x) {
+      const double l1 = std::abs(row_expected[x] - row_actual[x]);
+
+      bool bad = l1 > threshold_l1;
+      if (row_expected[x] > 1E-10) {
+        const double relative = l1 / std::abs(double(row_expected[x]));
+        bad &= relative > threshold_relative;
+      }
+      if (bad) {
+        failures << x << ", " << y << " (" << expected.xsize() << " x "
+                 << expected.ysize() << ") expected "
+                 << static_cast<double>(row_expected[x]) << " actual "
+                 << static_cast<double>(row_actual[x]);
+        return false;
+      }
+    }
+  }
+  return false;
+}
+
+template <typename T>
+bool VerifyRelativeError(const Image3<T>& expected, const Image3<T>& actual,
+                         const float threshold_l1,
+                         const float threshold_relative,
+                         std::stringstream& failures,
+                         const intptr_t border = 0) {
+  for (size_t c = 0; c < 3; ++c) {
+    bool ok =
+        VerifyRelativeError(expected.Plane(c), actual.Plane(c), threshold_l1,
+                            threshold_relative, failures, border, c);
+    if (!ok) {
+      return false;
+    }
+  }
+  return true;
+}
+
+template <typename T, typename U = T>
+void GenerateImage(Rng& rng, Plane<T>* image, U begin, U end) {
+  for (size_t y = 0; y < image->ysize(); ++y) {
+    T* const JXL_RESTRICT row = image->Row(y);
+    for (size_t x = 0; x < image->xsize(); ++x) {
+      if (std::is_same<T, float>::value || std::is_same<T, double>::value) {
+        row[x] = rng.UniformF(begin, end);
+      } else if (std::is_signed<T>::value) {
+        row[x] = rng.UniformI(begin, end);
+      } else {
+        row[x] = rng.UniformU(begin, end);
+      }
+    }
+  }
+}
+
+template <typename T>
+void RandomFillImage(Plane<T>* image, const T begin, const T end,
+                     const int seed = 129) {
+  Rng rng(seed);
+  GenerateImage(rng, image, begin, end);
+}
+
+template <typename T>
+typename std::enable_if<std::is_integral<T>::value>::type RandomFillImage(
+    Plane<T>* image) {
+  Rng rng(129);
+  GenerateImage(rng, image, int64_t(0),
+                int64_t(std::numeric_limits<T>::max()) + 1);
+}
+
+JXL_INLINE void RandomFillImage(Plane<float>* image) {
+  Rng rng(129);
+  GenerateImage(rng, image, 0.0f, std::numeric_limits<float>::max());
+}
+
+template <typename T, typename U>
+void GenerateImage(Rng& rng, Image3<T>* image, U begin, U end) {
+  for (size_t c = 0; c < 3; ++c) {
+    GenerateImage(rng, &image->Plane(c), begin, end);
+  }
+}
+
+template <typename T>
+typename std::enable_if<std::is_integral<T>::value>::type RandomFillImage(
+    Image3<T>* image) {
+  Rng rng(129);
+  GenerateImage(rng, image, int64_t(0),
+                int64_t(std::numeric_limits<T>::max()) + 1);
+}
+
+JXL_INLINE void RandomFillImage(Image3F* image) {
+  Rng rng(129);
+  GenerateImage(rng, image, 0.0f, std::numeric_limits<float>::max());
+}
+
+template <typename T, typename U>
+void RandomFillImage(Image3<T>* image, const U begin, const U end,
+                     const int seed = 129) {
+  Rng rng(seed);
+  GenerateImage(rng, image, begin, end);
+}
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_IMAGE_TEST_UTILS_H_
diff --git a/third_party/jpeg-xl/lib/jxl/inverse_mtf-inl.h b/third_party/jpeg-xl/lib/jxl/inverse_mtf-inl.h
new file mode 100644
index 0000000000..fcb01d7396
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/inverse_mtf-inl.h
@@ -0,0 +1,90 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// SIMDified inverse-move-to-front transform.
+
+#if defined(LIB_JXL_INVERSE_MTF_INL_H_) == defined(HWY_TARGET_TOGGLE)
+#ifdef LIB_JXL_INVERSE_MTF_INL_H_
+#undef LIB_JXL_INVERSE_MTF_INL_H_
+#else
+#define LIB_JXL_INVERSE_MTF_INL_H_
+#endif
+
+#include <hwy/highway.h>
+
+#include "lib/jxl/sanitizers.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::FirstN;
+using hwy::HWY_NAMESPACE::IfThenElse;
+using hwy::HWY_NAMESPACE::Load;
+using hwy::HWY_NAMESPACE::LoadU;
+using hwy::HWY_NAMESPACE::StoreU;
+
+inline void MoveToFront(uint8_t* v, uint8_t index) {
+  uint8_t value = v[index];
+  uint8_t i = index;
+  if (i < 4) {
+    for (; i; --i) v[i] = v[i - 1];
+  } else {
+    const HWY_CAPPED(uint8_t, 64) d;
+    int tail = i & (Lanes(d) - 1);
+    if (tail) {
+      i -= tail;
+      const auto vec = Load(d, v + i);
+      const auto prev = LoadU(d, v + i + 1);
+      StoreU(IfThenElse(FirstN(d, tail), vec, prev), d, v + i + 1);
+    }
+    while (i) {
+      i -= Lanes(d);
+      const auto vec = Load(d, v + i);
+      StoreU(vec, d, v + i + 1);
+    }
+  }
+  v[0] = value;
+}
+
+inline void InverseMoveToFrontTransform(uint8_t* v, int v_len) {
+  HWY_ALIGN uint8_t mtf[256 + 64];
+  int i;
+  for (i = 0; i < 256; ++i) {
+    mtf[i] = static_cast<uint8_t>(i);
+  }
+#if JXL_MEMORY_SANITIZER
+  const HWY_CAPPED(uint8_t, 64) d;
+  for (size_t j = 0; j < Lanes(d); ++j) {
+    mtf[256 + j] = 0;
+  }
+#endif  // JXL_MEMORY_SANITIZER
+  for (i = 0; i < v_len; ++i) {
+    uint8_t index = v[i];
+    v[i] = mtf[index];
+    if (index) MoveToFront(mtf, index);
+  }
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#endif  // LIB_JXL_INVERSE_MTF_INL_H_
+
+#if HWY_ONCE
+#ifndef INVERSE_MTF_ONCE
+#define INVERSE_MTF_ONCE
+
+namespace jxl {
+inline void InverseMoveToFrontTransform(uint8_t* v, int v_len) {
+  return HWY_STATIC_DISPATCH(InverseMoveToFrontTransform)(v, v_len);
+}
+}  // namespace jxl
+
+#endif  // INVERSE_MTF_ONCE
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jxl/jpeg/dec_jpeg_data.cc b/third_party/jpeg-xl/lib/jxl/jpeg/dec_jpeg_data.cc
new file mode 100644
index 0000000000..db49a1c215
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/jpeg/dec_jpeg_data.cc
@@ -0,0 +1,145 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/jpeg/dec_jpeg_data.h"
+
+#include <brotli/decode.h>
+
+#include "lib/jxl/base/span.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/sanitizers.h"
+
+namespace jxl {
+namespace jpeg {
+Status DecodeJPEGData(Span<const uint8_t> encoded, JPEGData* jpeg_data) {
+  Status ret = true;
+  const uint8_t* in = encoded.data();
+  size_t available_in = encoded.size();
+  {
+    BitReader br(encoded);
+    BitReaderScopedCloser br_closer(&br, &ret);
+    JXL_RETURN_IF_ERROR(Bundle::Read(&br, jpeg_data));
+    JXL_RETURN_IF_ERROR(br.JumpToByteBoundary());
+    in += br.TotalBitsConsumed() / 8;
+    available_in -= br.TotalBitsConsumed() / 8;
+  }
+  JXL_RETURN_IF_ERROR(ret);
+
+  BrotliDecoderState* brotli_dec =
+      BrotliDecoderCreateInstance(nullptr, nullptr, nullptr);
+
+  struct BrotliDecDeleter {
+    BrotliDecoderState* brotli_dec;
+    ~BrotliDecDeleter() { BrotliDecoderDestroyInstance(brotli_dec); }
+  } brotli_dec_deleter{brotli_dec};
+
+  BrotliDecoderResult result =
+      BrotliDecoderResult::BROTLI_DECODER_RESULT_SUCCESS;
+
+  auto br_read = [&](std::vector<uint8_t>& data) -> Status {
+    size_t available_out = data.size();
+    uint8_t* out = data.data();
+    while (available_out != 0) {
+      if (BrotliDecoderIsFinished(brotli_dec)) {
+        return JXL_FAILURE("Not enough decompressed output");
+      }
+      uint8_t* next_out_before = out;
+      size_t avail_out_before = available_out;
+      msan::MemoryIsInitialized(in, available_in);
+      result = BrotliDecoderDecompressStream(brotli_dec, &available_in, &in,
+                                             &available_out, &out, nullptr);
+      if (result !=
+              BrotliDecoderResult::BROTLI_DECODER_RESULT_NEEDS_MORE_OUTPUT &&
+          result != BrotliDecoderResult::BROTLI_DECODER_RESULT_SUCCESS) {
+        return JXL_FAILURE(
+            "Brotli decoding error: %s\n",
+            BrotliDecoderErrorString(BrotliDecoderGetErrorCode(brotli_dec)));
+      }
+      msan::UnpoisonMemory(next_out_before, avail_out_before - available_out);
+    }
+    return true;
+  };
+  size_t num_icc = 0;
+  for (size_t i = 0; i < jpeg_data->app_data.size(); i++) {
+    auto& marker = jpeg_data->app_data[i];
+    if (jpeg_data->app_marker_type[i] != AppMarkerType::kUnknown) {
+      // Set the size of the marker.
+      size_t size_minus_1 = marker.size() - 1;
+      marker[1] = size_minus_1 >> 8;
+      marker[2] = size_minus_1 & 0xFF;
+      if (jpeg_data->app_marker_type[i] == AppMarkerType::kICC) {
+        if (marker.size() < 17) {
+          return JXL_FAILURE("ICC markers must be at least 17 bytes");
+        }
+        marker[0] = 0xE2;
+        memcpy(&marker[3], kIccProfileTag, sizeof kIccProfileTag);
+        marker[15] = ++num_icc;
+      }
+    } else {
+      JXL_RETURN_IF_ERROR(br_read(marker));
+      if (marker[1] * 256u + marker[2] + 1u != marker.size()) {
+        return JXL_FAILURE("Incorrect marker size");
+      }
+    }
+  }
+  for (size_t i = 0; i < jpeg_data->app_data.size(); i++) {
+    auto& marker = jpeg_data->app_data[i];
+    if (jpeg_data->app_marker_type[i] == AppMarkerType::kICC) {
+      marker[16] = num_icc;
+    }
+    if (jpeg_data->app_marker_type[i] == AppMarkerType::kExif) {
+      marker[0] = 0xE1;
+      if (marker.size() < 3 + sizeof kExifTag) {
+        return JXL_FAILURE("Incorrect Exif marker size");
+      }
+      memcpy(&marker[3], kExifTag, sizeof kExifTag);
+    }
+    if (jpeg_data->app_marker_type[i] == AppMarkerType::kXMP) {
+      marker[0] = 0xE1;
+      if (marker.size() < 3 + sizeof kXMPTag) {
+        return JXL_FAILURE("Incorrect XMP marker size");
+      }
+      memcpy(&marker[3], kXMPTag, sizeof kXMPTag);
+    }
+  }
+  // TODO(eustas): actually inject ICC profile and check it fits perfectly.
+  for (size_t i = 0; i < jpeg_data->com_data.size(); i++) {
+    auto& marker = jpeg_data->com_data[i];
+    JXL_RETURN_IF_ERROR(br_read(marker));
+    if (marker[1] * 256u + marker[2] + 1u != marker.size()) {
+      return JXL_FAILURE("Incorrect marker size");
+    }
+  }
+  for (size_t i = 0; i < jpeg_data->inter_marker_data.size(); i++) {
+    JXL_RETURN_IF_ERROR(br_read(jpeg_data->inter_marker_data[i]));
+  }
+  JXL_RETURN_IF_ERROR(br_read(jpeg_data->tail_data));
+
+  // Check if there is more decompressed output.
+  size_t available_out = 1;
+  uint64_t dummy;
+  uint8_t* next_out = reinterpret_cast<uint8_t*>(&dummy);
+  result = BrotliDecoderDecompressStream(brotli_dec, &available_in, &in,
+                                         &available_out, &next_out, nullptr);
+  if (available_out == 0 ||
+      result == BrotliDecoderResult::BROTLI_DECODER_RESULT_NEEDS_MORE_OUTPUT) {
+    return JXL_FAILURE("Excess data in compressed stream");
+  }
+  if (result == BrotliDecoderResult::BROTLI_DECODER_RESULT_NEEDS_MORE_INPUT) {
+    return JXL_FAILURE("Incomplete brotli-stream");
+  }
+  if (!BrotliDecoderIsFinished(brotli_dec) ||
+      result != BrotliDecoderResult::BROTLI_DECODER_RESULT_SUCCESS) {
+    return JXL_FAILURE("Corrupted brotli-stream");
+  }
+  if (available_in != 0) {
+    return JXL_FAILURE("Unused data after brotli stream");
+  }
+
+  return true;
+}
+}  // namespace jpeg
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/jpeg/dec_jpeg_data.h b/third_party/jpeg-xl/lib/jxl/jpeg/dec_jpeg_data.h
new file mode 100644
index 0000000000..b9d50bf9f8
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/jpeg/dec_jpeg_data.h
@@ -0,0 +1,19 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_JPEG_DEC_JPEG_DATA_H_
+#define LIB_JXL_JPEG_DEC_JPEG_DATA_H_
+
+#include "lib/jxl/base/span.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/jpeg/jpeg_data.h"
+
+namespace jxl {
+namespace jpeg {
+Status DecodeJPEGData(Span<const uint8_t> encoded, JPEGData* jpeg_data);
+}
+}  // namespace jxl
+
+#endif  // LIB_JXL_JPEG_DEC_JPEG_DATA_H_
diff --git a/third_party/jpeg-xl/lib/jxl/jpeg/dec_jpeg_data_writer.cc b/third_party/jpeg-xl/lib/jxl/jpeg/dec_jpeg_data_writer.cc
new file mode 100644
index 0000000000..f9ae755789
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/jpeg/dec_jpeg_data_writer.cc
@@ -0,0 +1,1050 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/jpeg/dec_jpeg_data_writer.h"
+
+#include <stdlib.h>
+#include <string.h> /* for memset, memcpy */
+
+#include <deque>
+#include <string>
+#include <vector>
+
+#include "lib/jxl/base/bits.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/image_bundle.h"
+#include "lib/jxl/jpeg/dec_jpeg_serialization_state.h"
+#include "lib/jxl/jpeg/jpeg_data.h"
+
+namespace jxl {
+namespace jpeg {
+
+namespace {
+
+enum struct SerializationStatus {
+  NEEDS_MORE_INPUT,
+  NEEDS_MORE_OUTPUT,
+  ERROR,
+  DONE
+};
+
+const int kJpegPrecision = 8;
+
+// JpegBitWriter: buffer size
+const size_t kJpegBitWriterChunkSize = 16384;
+
+// DCTCodingState: maximum number of correction bits to buffer
+const int kJPEGMaxCorrectionBits = 1u << 16;
+
+// Returns non-zero if and only if x has a zero byte, i.e. one of
+// x & 0xff, x & 0xff00, ..., x & 0xff00000000000000 is zero.
+static JXL_INLINE uint64_t HasZeroByte(uint64_t x) {
+  return (x - 0x0101010101010101ULL) & ~x & 0x8080808080808080ULL;
+}
+
+void JpegBitWriterInit(JpegBitWriter* bw,
+                       std::deque<OutputChunk>* output_queue) {
+  bw->output = output_queue;
+  bw->chunk = OutputChunk(kJpegBitWriterChunkSize);
+  bw->pos = 0;
+  bw->put_buffer = 0;
+  bw->put_bits = 64;
+  bw->healthy = true;
+  bw->data = bw->chunk.buffer->data();
+}
+
+static JXL_NOINLINE void SwapBuffer(JpegBitWriter* bw) {
+  bw->chunk.len = bw->pos;
+  bw->output->emplace_back(std::move(bw->chunk));
+  bw->chunk = OutputChunk(kJpegBitWriterChunkSize);
+  bw->data = bw->chunk.buffer->data();
+  bw->pos = 0;
+}
+
+static JXL_INLINE void Reserve(JpegBitWriter* bw, size_t n_bytes) {
+  if (JXL_UNLIKELY((bw->pos + n_bytes) > kJpegBitWriterChunkSize)) {
+    SwapBuffer(bw);
+  }
+}
+
+/**
+ * Writes the given byte to the output, writes an extra zero if byte is 0xFF.
+ *
+ * This method is "careless" - caller must make sure that there is enough
+ * space in the output buffer. Emits up to 2 bytes to buffer.
+ */
+static JXL_INLINE void EmitByte(JpegBitWriter* bw, int byte) {
+  bw->data[bw->pos++] = byte;
+  if (byte == 0xFF) bw->data[bw->pos++] = 0;
+}
+
+static JXL_INLINE void DischargeBitBuffer(JpegBitWriter* bw) {
+  // At this point we are ready to emit the most significant 6 bytes of
+  // put_buffer_ to the output.
+  // The JPEG format requires that after every 0xff byte in the entropy
+  // coded section, there is a zero byte, therefore we first check if any of
+  // the 6 most significant bytes of put_buffer_ is 0xFF.
+  Reserve(bw, 12);
+  if (HasZeroByte(~bw->put_buffer | 0xFFFF)) {
+    // We have a 0xFF byte somewhere, examine each byte and append a zero
+    // byte if necessary.
+    EmitByte(bw, (bw->put_buffer >> 56) & 0xFF);
+    EmitByte(bw, (bw->put_buffer >> 48) & 0xFF);
+    EmitByte(bw, (bw->put_buffer >> 40) & 0xFF);
+    EmitByte(bw, (bw->put_buffer >> 32) & 0xFF);
+    EmitByte(bw, (bw->put_buffer >> 24) & 0xFF);
+    EmitByte(bw, (bw->put_buffer >> 16) & 0xFF);
+  } else {
+    // We don't have any 0xFF bytes, output all 6 bytes without checking.
+    bw->data[bw->pos] = (bw->put_buffer >> 56) & 0xFF;
+    bw->data[bw->pos + 1] = (bw->put_buffer >> 48) & 0xFF;
+    bw->data[bw->pos + 2] = (bw->put_buffer >> 40) & 0xFF;
+    bw->data[bw->pos + 3] = (bw->put_buffer >> 32) & 0xFF;
+    bw->data[bw->pos + 4] = (bw->put_buffer >> 24) & 0xFF;
+    bw->data[bw->pos + 5] = (bw->put_buffer >> 16) & 0xFF;
+    bw->pos += 6;
+  }
+  bw->put_buffer <<= 48;
+  bw->put_bits += 48;
+}
+
+static JXL_INLINE void WriteBits(JpegBitWriter* bw, int nbits, uint64_t bits) {
+  // This is an optimization; if everything goes well,
+  // then |nbits| is positive; if non-existing Huffman symbol is going to be
+  // encoded, its length should be zero; later encoder could check the
+  // "health" of JpegBitWriter.
+  if (nbits == 0) {
+    bw->healthy = false;
+    return;
+  }
+  bw->put_bits -= nbits;
+  bw->put_buffer |= (bits << bw->put_bits);
+  if (bw->put_bits <= 16) DischargeBitBuffer(bw);
+}
+
+void EmitMarker(JpegBitWriter* bw, int marker) {
+  Reserve(bw, 2);
+  JXL_DASSERT(marker != 0xFF);
+  bw->data[bw->pos++] = 0xFF;
+  bw->data[bw->pos++] = marker;
+}
+
+bool JumpToByteBoundary(JpegBitWriter* bw, const uint8_t** pad_bits,
+                        const uint8_t* pad_bits_end) {
+  size_t n_bits = bw->put_bits & 7u;
+  uint8_t pad_pattern;
+  if (*pad_bits == nullptr) {
+    pad_pattern = (1u << n_bits) - 1;
+  } else {
+    pad_pattern = 0;
+    const uint8_t* src = *pad_bits;
+    // TODO(eustas): bitwise reading looks insanely ineffective...
+    while (n_bits--) {
+      pad_pattern <<= 1;
+      if (src >= pad_bits_end) return false;
+      // TODO(eustas): DCHECK *src == {0, 1}
+      pad_pattern |= !!*(src++);
+    }
+    *pad_bits = src;
+  }
+
+  Reserve(bw, 16);
+
+  while (bw->put_bits <= 56) {
+    int c = (bw->put_buffer >> 56) & 0xFF;
+    EmitByte(bw, c);
+    bw->put_buffer <<= 8;
+    bw->put_bits += 8;
+  }
+  if (bw->put_bits < 64) {
+    int pad_mask = 0xFFu >> (64 - bw->put_bits);
+    int c = ((bw->put_buffer >> 56) & ~pad_mask) | pad_pattern;
+    EmitByte(bw, c);
+  }
+  bw->put_buffer = 0;
+  bw->put_bits = 64;
+
+  return true;
+}
+
+void JpegBitWriterFinish(JpegBitWriter* bw) {
+  if (bw->pos == 0) return;
+  bw->chunk.len = bw->pos;
+  bw->output->emplace_back(std::move(bw->chunk));
+  bw->chunk = OutputChunk(nullptr, 0);
+  bw->data = nullptr;
+  bw->pos = 0;
+}
+
+void DCTCodingStateInit(DCTCodingState* s) {
+  s->eob_run_ = 0;
+  s->cur_ac_huff_ = nullptr;
+  s->refinement_bits_.clear();
+  s->refinement_bits_.reserve(kJPEGMaxCorrectionBits);
+}
+
+enum OutputModes {
+  kModeHistogram,
+  kModeWrite,
+};
+
+template <int kOutputMode>
+static JXL_INLINE void WriteSymbol(int symbol, HuffmanCodeTable* table,
+                                   JpegBitWriter* bw) {
+  if (kOutputMode == OutputModes::kModeHistogram) {
+    ++table->depth[symbol];
+  } else {
+    WriteBits(bw, table->depth[symbol], table->code[symbol]);
+  }
+}
+
+// Emit all buffered data to the bit stream using the given Huffman code and
+// bit writer.
+template <int kOutputMode>
+static JXL_INLINE void Flush(DCTCodingState* s, JpegBitWriter* bw) {
+  if (s->eob_run_ > 0) {
+    int nbits = FloorLog2Nonzero<uint32_t>(s->eob_run_);
+    int symbol = nbits << 4u;
+    WriteSymbol<kOutputMode>(symbol, s->cur_ac_huff_, bw);
+    if (nbits > 0) {
+      WriteBits(bw, nbits, s->eob_run_ & ((1 << nbits) - 1));
+    }
+    s->eob_run_ = 0;
+  }
+  for (size_t i = 0; i < s->refinement_bits_.size(); ++i) {
+    WriteBits(bw, 1, s->refinement_bits_[i]);
+  }
+  s->refinement_bits_.clear();
+}
+
+// Buffer some more data at the end-of-band (the last non-zero or newly
+// non-zero coefficient within the [Ss, Se] spectral band).
+template <int kOutputMode>
+static JXL_INLINE void BufferEndOfBand(DCTCodingState* s,
+                                       HuffmanCodeTable* ac_huff,
+                                       const std::vector<int>* new_bits,
+                                       JpegBitWriter* bw) {
+  if (s->eob_run_ == 0) {
+    s->cur_ac_huff_ = ac_huff;
+  }
+  ++s->eob_run_;
+  if (new_bits) {
+    s->refinement_bits_.insert(s->refinement_bits_.end(), new_bits->begin(),
+                               new_bits->end());
+  }
+  if (s->eob_run_ == 0x7FFF ||
+      s->refinement_bits_.size() > kJPEGMaxCorrectionBits - kDCTBlockSize + 1) {
+    Flush<kOutputMode>(s, bw);
+  }
+}
+
+bool BuildHuffmanCodeTable(const JPEGHuffmanCode& huff,
+                           HuffmanCodeTable* table) {
+  int huff_code[kJpegHuffmanAlphabetSize];
+  // +1 for a sentinel element.
+  uint32_t huff_size[kJpegHuffmanAlphabetSize + 1];
+  int p = 0;
+  for (size_t l = 1; l <= kJpegHuffmanMaxBitLength; ++l) {
+    int i = huff.counts[l];
+    if (p + i > kJpegHuffmanAlphabetSize + 1) {
+      return false;
+    }
+    while (i--) huff_size[p++] = l;
+  }
+
+  if (p == 0) {
+    return true;
+  }
+
+  // Reuse sentinel element.
+  int last_p = p - 1;
+  huff_size[last_p] = 0;
+
+  int code = 0;
+  uint32_t si = huff_size[0];
+  p = 0;
+  while (huff_size[p]) {
+    while ((huff_size[p]) == si) {
+      huff_code[p++] = code;
+      code++;
+    }
+    code <<= 1;
+    si++;
+  }
+  for (p = 0; p < last_p; p++) {
+    int i = huff.values[p];
+    table->depth[i] = huff_size[p];
+    table->code[i] = huff_code[p];
+  }
+  return true;
+}
+
+bool EncodeSOI(SerializationState* state) {
+  state->output_queue.push_back(OutputChunk({0xFF, 0xD8}));
+  return true;
+}
+
+bool EncodeEOI(const JPEGData& jpg, SerializationState* state) {
+  state->output_queue.push_back(OutputChunk({0xFF, 0xD9}));
+  state->output_queue.emplace_back(jpg.tail_data);
+  return true;
+}
+
+bool EncodeSOF(const JPEGData& jpg, uint8_t marker, SerializationState* state) {
+  if (marker <= 0xC2) state->is_progressive = (marker == 0xC2);
+
+  const size_t n_comps = jpg.components.size();
+  const size_t marker_len = 8 + 3 * n_comps;
+  state->output_queue.emplace_back(marker_len + 2);
+  uint8_t* data = state->output_queue.back().buffer->data();
+  size_t pos = 0;
+  data[pos++] = 0xFF;
+  data[pos++] = marker;
+  data[pos++] = marker_len >> 8u;
+  data[pos++] = marker_len & 0xFFu;
+  data[pos++] = kJpegPrecision;
+  data[pos++] = jpg.height >> 8u;
+  data[pos++] = jpg.height & 0xFFu;
+  data[pos++] = jpg.width >> 8u;
+  data[pos++] = jpg.width & 0xFFu;
+  data[pos++] = n_comps;
+  for (size_t i = 0; i < n_comps; ++i) {
+    data[pos++] = jpg.components[i].id;
+    data[pos++] = ((jpg.components[i].h_samp_factor << 4u) |
+                   (jpg.components[i].v_samp_factor));
+    const size_t quant_idx = jpg.components[i].quant_idx;
+    if (quant_idx >= jpg.quant.size()) return false;
+    data[pos++] = jpg.quant[quant_idx].index;
+  }
+  return true;
+}
+
+bool EncodeSOS(const JPEGData& jpg, const JPEGScanInfo& scan_info,
+               SerializationState* state) {
+  const size_t n_scans = scan_info.num_components;
+  const size_t marker_len = 6 + 2 * n_scans;
+  state->output_queue.emplace_back(marker_len + 2);
+  uint8_t* data = state->output_queue.back().buffer->data();
+  size_t pos = 0;
+  data[pos++] = 0xFF;
+  data[pos++] = 0xDA;
+  data[pos++] = marker_len >> 8u;
+  data[pos++] = marker_len & 0xFFu;
+  data[pos++] = n_scans;
+  for (size_t i = 0; i < n_scans; ++i) {
+    const JPEGComponentScanInfo& si = scan_info.components[i];
+    if (si.comp_idx >= jpg.components.size()) return false;
+    data[pos++] = jpg.components[si.comp_idx].id;
+    data[pos++] = (si.dc_tbl_idx << 4u) + si.ac_tbl_idx;
+  }
+  data[pos++] = scan_info.Ss;
+  data[pos++] = scan_info.Se;
+  data[pos++] = ((scan_info.Ah << 4u) | (scan_info.Al));
+  return true;
+}
+
+bool EncodeDHT(const JPEGData& jpg, SerializationState* state) {
+  const std::vector<JPEGHuffmanCode>& huffman_code = jpg.huffman_code;
+
+  size_t marker_len = 2;
+  for (size_t i = state->dht_index; i < huffman_code.size(); ++i) {
+    const JPEGHuffmanCode& huff = huffman_code[i];
+    marker_len += kJpegHuffmanMaxBitLength;
+    for (size_t j = 0; j < huff.counts.size(); ++j) {
+      marker_len += huff.counts[j];
+    }
+    if (huff.is_last) break;
+  }
+  state->output_queue.emplace_back(marker_len + 2);
+  uint8_t* data = state->output_queue.back().buffer->data();
+  size_t pos = 0;
+  data[pos++] = 0xFF;
+  data[pos++] = 0xC4;
+  data[pos++] = marker_len >> 8u;
+  data[pos++] = marker_len & 0xFFu;
+  while (true) {
+    const size_t huffman_code_index = state->dht_index++;
+    if (huffman_code_index >= huffman_code.size()) {
+      return false;
+    }
+    const JPEGHuffmanCode& huff = huffman_code[huffman_code_index];
+    size_t index = huff.slot_id;
+    HuffmanCodeTable* huff_table;
+    if (index & 0x10) {
+      index -= 0x10;
+      huff_table = &state->ac_huff_table[index];
+    } else {
+      huff_table = &state->dc_huff_table[index];
+    }
+    // TODO(eustas): cache
+    // TODO(eustas): set up non-existing symbols
+    if (!BuildHuffmanCodeTable(huff, huff_table)) {
+      return false;
+    }
+    size_t total_count = 0;
+    size_t max_length = 0;
+    for (size_t i = 0; i < huff.counts.size(); ++i) {
+      if (huff.counts[i] != 0) {
+        max_length = i;
+      }
+      total_count += huff.counts[i];
+    }
+    --total_count;
+    data[pos++] = huff.slot_id;
+    for (size_t i = 1; i <= kJpegHuffmanMaxBitLength; ++i) {
+      data[pos++] = (i == max_length ? huff.counts[i] - 1 : huff.counts[i]);
+    }
+    for (size_t i = 0; i < total_count; ++i) {
+      data[pos++] = huff.values[i];
+    }
+    if (huff.is_last) break;
+  }
+  return true;
+}
+
+bool EncodeDQT(const JPEGData& jpg, SerializationState* state) {
+  int marker_len = 2;
+  for (size_t i = state->dqt_index; i < jpg.quant.size(); ++i) {
+    const JPEGQuantTable& table = jpg.quant[i];
+    marker_len += 1 + (table.precision ? 2 : 1) * kDCTBlockSize;
+    if (table.is_last) break;
+  }
+  state->output_queue.emplace_back(marker_len + 2);
+  uint8_t* data = state->output_queue.back().buffer->data();
+  size_t pos = 0;
+  data[pos++] = 0xFF;
+  data[pos++] = 0xDB;
+  data[pos++] = marker_len >> 8u;
+  data[pos++] = marker_len & 0xFFu;
+  while (true) {
+    const size_t idx = state->dqt_index++;
+    if (idx >= jpg.quant.size()) {
+      return false;  // corrupt input
+    }
+    const JPEGQuantTable& table = jpg.quant[idx];
+    data[pos++] = (table.precision << 4u) + table.index;
+    for (size_t i = 0; i < kDCTBlockSize; ++i) {
+      int val_idx = kJPEGNaturalOrder[i];
+      int val = table.values[val_idx];
+      if (table.precision) {
+        data[pos++] = val >> 8u;
+      }
+      data[pos++] = val & 0xFFu;
+    }
+    if (table.is_last) break;
+  }
+  return true;
+}
+
+bool EncodeDRI(const JPEGData& jpg, SerializationState* state) {
+  state->seen_dri_marker = true;
+  OutputChunk dri_marker = {0xFF,
+                            0xDD,
+                            0,
+                            4,
+                            static_cast<uint8_t>(jpg.restart_interval >> 8),
+                            static_cast<uint8_t>(jpg.restart_interval & 0xFF)};
+  state->output_queue.push_back(std::move(dri_marker));
+  return true;
+}
+
+bool EncodeRestart(uint8_t marker, SerializationState* state) {
+  state->output_queue.push_back(OutputChunk({0xFF, marker}));
+  return true;
+}
+
+bool EncodeAPP(const JPEGData& jpg, uint8_t marker, SerializationState* state) {
+  // TODO(eustas): check that marker corresponds to payload?
+  (void)marker;
+
+  size_t app_index = state->app_index++;
+  if (app_index >= jpg.app_data.size()) return false;
+  state->output_queue.push_back(OutputChunk({0xFF}));
+  state->output_queue.emplace_back(jpg.app_data[app_index]);
+  return true;
+}
+
+bool EncodeCOM(const JPEGData& jpg, SerializationState* state) {
+  size_t com_index = state->com_index++;
+  if (com_index >= jpg.com_data.size()) return false;
+  state->output_queue.push_back(OutputChunk({0xFF}));
+  state->output_queue.emplace_back(jpg.com_data[com_index]);
+  return true;
+}
+
+bool EncodeInterMarkerData(const JPEGData& jpg, SerializationState* state) {
+  size_t index = state->data_index++;
+  if (index >= jpg.inter_marker_data.size()) return false;
+  state->output_queue.emplace_back(jpg.inter_marker_data[index]);
+  return true;
+}
+
+template <int kOutputMode>
+bool EncodeDCTBlockSequential(const coeff_t* coeffs, HuffmanCodeTable* dc_huff,
+                              HuffmanCodeTable* ac_huff, int num_zero_runs,
+                              coeff_t* last_dc_coeff, JpegBitWriter* bw) {
+  coeff_t temp2;
+  coeff_t temp;
+  temp2 = coeffs[0];
+  temp = temp2 - *last_dc_coeff;
+  *last_dc_coeff = temp2;
+  temp2 = temp;
+  if (temp < 0) {
+    temp = -temp;
+    if (temp < 0) return false;
+    temp2--;
+  }
+  int dc_nbits = (temp == 0) ? 0 : (FloorLog2Nonzero<uint32_t>(temp) + 1);
+  WriteSymbol<kOutputMode>(dc_nbits, dc_huff, bw);
+  if (dc_nbits >= 12) return false;
+  if (dc_nbits > 0) {
+    WriteBits(bw, dc_nbits, temp2 & ((1u << dc_nbits) - 1));
+  }
+  int r = 0;
+  for (int k = 1; k < 64; ++k) {
+    if ((temp = coeffs[kJPEGNaturalOrder[k]]) == 0) {
+      r++;
+      continue;
+    }
+    if (temp < 0) {
+      temp = -temp;
+      if (temp < 0) return false;
+      temp2 = ~temp;
+    } else {
+      temp2 = temp;
+    }
+    while (r > 15) {
+      WriteSymbol<kOutputMode>(0xf0, ac_huff, bw);
+      r -= 16;
+    }
+    int ac_nbits = FloorLog2Nonzero<uint32_t>(temp) + 1;
+    if (ac_nbits >= 16) return false;
+    int symbol = (r << 4u) + ac_nbits;
+    WriteSymbol<kOutputMode>(symbol, ac_huff, bw);
+    WriteBits(bw, ac_nbits, temp2 & ((1 << ac_nbits) - 1));
+    r = 0;
+  }
+  for (int i = 0; i < num_zero_runs; ++i) {
+    WriteSymbol<kOutputMode>(0xf0, ac_huff, bw);
+    r -= 16;
+  }
+  if (r > 0) {
+    WriteSymbol<kOutputMode>(0, ac_huff, bw);
+  }
+  return true;
+}
+
+template <int kOutputMode>
+bool EncodeDCTBlockProgressive(const coeff_t* coeffs, HuffmanCodeTable* dc_huff,
+                               HuffmanCodeTable* ac_huff, int Ss, int Se,
+                               int Al, int num_zero_runs,
+                               DCTCodingState* coding_state,
+                               coeff_t* last_dc_coeff, JpegBitWriter* bw) {
+  bool eob_run_allowed = Ss > 0;
+  coeff_t temp2;
+  coeff_t temp;
+  if (Ss == 0) {
+    temp2 = coeffs[0] >> Al;
+    temp = temp2 - *last_dc_coeff;
+    *last_dc_coeff = temp2;
+    temp2 = temp;
+    if (temp < 0) {
+      temp = -temp;
+      if (temp < 0) return false;
+      temp2--;
+    }
+    int nbits = (temp == 0) ? 0 : (FloorLog2Nonzero<uint32_t>(temp) + 1);
+    WriteSymbol<kOutputMode>(nbits, dc_huff, bw);
+    if (nbits > 0) {
+      WriteBits(bw, nbits, temp2 & ((1 << nbits) - 1));
+    }
+    ++Ss;
+  }
+  if (Ss > Se) {
+    return true;
+  }
+  int r = 0;
+  for (int k = Ss; k <= Se; ++k) {
+    if ((temp = coeffs[kJPEGNaturalOrder[k]]) == 0) {
+      r++;
+      continue;
+    }
+    if (temp < 0) {
+      temp = -temp;
+      if (temp < 0) return false;
+      temp >>= Al;
+      temp2 = ~temp;
+    } else {
+      temp >>= Al;
+      temp2 = temp;
+    }
+    if (temp == 0) {
+      r++;
+      continue;
+    }
+    Flush<kOutputMode>(coding_state, bw);
+    while (r > 15) {
+      WriteSymbol<kOutputMode>(0xf0, ac_huff, bw);
+      r -= 16;
+    }
+    int nbits = FloorLog2Nonzero<uint32_t>(temp) + 1;
+    int symbol = (r << 4u) + nbits;
+    WriteSymbol<kOutputMode>(symbol, ac_huff, bw);
+    WriteBits(bw, nbits, temp2 & ((1 << nbits) - 1));
+    r = 0;
+  }
+  if (num_zero_runs > 0) {
+    Flush<kOutputMode>(coding_state, bw);
+    for (int i = 0; i < num_zero_runs; ++i) {
+      WriteSymbol<kOutputMode>(0xf0, ac_huff, bw);
+      r -= 16;
+    }
+  }
+  if (r > 0) {
+    BufferEndOfBand<kOutputMode>(coding_state, ac_huff, nullptr, bw);
+    if (!eob_run_allowed) {
+      Flush<kOutputMode>(coding_state, bw);
+    }
+  }
+  return true;
+}
+
+template <int kOutputMode>
+bool EncodeRefinementBits(const coeff_t* coeffs, HuffmanCodeTable* ac_huff,
+                          int Ss, int Se, int Al, DCTCodingState* coding_state,
+                          JpegBitWriter* bw) {
+  bool eob_run_allowed = Ss > 0;
+  if (Ss == 0) {
+    // Emit next bit of DC component.
+    WriteBits(bw, 1, (coeffs[0] >> Al) & 1);
+    ++Ss;
+  }
+  if (Ss > Se) {
+    return true;
+  }
+  int abs_values[kDCTBlockSize];
+  int eob = 0;
+  for (int k = Ss; k <= Se; k++) {
+    const coeff_t abs_val = std::abs(coeffs[kJPEGNaturalOrder[k]]);
+    abs_values[k] = abs_val >> Al;
+    if (abs_values[k] == 1) {
+      eob = k;
+    }
+  }
+  int r = 0;
+  std::vector<int> refinement_bits;
+  refinement_bits.reserve(kDCTBlockSize);
+  for (int k = Ss; k <= Se; k++) {
+    if (abs_values[k] == 0) {
+      r++;
+      continue;
+    }
+    while (r > 15 && k <= eob) {
+      Flush<kOutputMode>(coding_state, bw);
+      WriteSymbol<kOutputMode>(0xf0, ac_huff, bw);
+      r -= 16;
+      for (int bit : refinement_bits) {
+        WriteBits(bw, 1, bit);
+      }
+      refinement_bits.clear();
+    }
+    if (abs_values[k] > 1) {
+      refinement_bits.push_back(abs_values[k] & 1u);
+      continue;
+    }
+    Flush<kOutputMode>(coding_state, bw);
+    int symbol = (r << 4u) + 1;
+    int new_non_zero_bit = (coeffs[kJPEGNaturalOrder[k]] < 0) ? 0 : 1;
+    WriteSymbol<kOutputMode>(symbol, ac_huff, bw);
+    WriteBits(bw, 1, new_non_zero_bit);
+    for (int bit : refinement_bits) {
+      WriteBits(bw, 1, bit);
+    }
+    refinement_bits.clear();
+    r = 0;
+  }
+  if (r > 0 || !refinement_bits.empty()) {
+    BufferEndOfBand<kOutputMode>(coding_state, ac_huff, &refinement_bits, bw);
+    if (!eob_run_allowed) {
+      Flush<kOutputMode>(coding_state, bw);
+    }
+  }
+  return true;
+}
+
+size_t NumHistograms(const JPEGData& jpg) {
+  size_t num = 0;
+  for (const auto& si : jpg.scan_info) {
+    num += si.num_components;
+  }
+  return num;
+}
+
+size_t HistogramIndex(const JPEGData& jpg, size_t scan_index,
+                      size_t component_index) {
+  size_t idx = 0;
+  for (size_t i = 0; i < scan_index; ++i) {
+    idx += jpg.scan_info[i].num_components;
+  }
+  return idx + component_index;
+}
+
+template <int kMode, int kOutputMode>
+SerializationStatus JXL_NOINLINE DoEncodeScan(const JPEGData& jpg,
+                                              SerializationState* state) {
+  const JPEGScanInfo& scan_info = jpg.scan_info[state->scan_index];
+  EncodeScanState& ss = state->scan_state;
+
+  const int restart_interval =
+      state->seen_dri_marker ? jpg.restart_interval : 0;
+
+  const auto get_next_extra_zero_run_index = [&ss, &scan_info]() -> int {
+    if (ss.extra_zero_runs_pos < scan_info.extra_zero_runs.size()) {
+      return scan_info.extra_zero_runs[ss.extra_zero_runs_pos].block_idx;
+    } else {
+      return -1;
+    }
+  };
+
+  const auto get_next_reset_point = [&ss, &scan_info]() -> int {
+    if (ss.next_reset_point_pos < scan_info.reset_points.size()) {
+      return scan_info.reset_points[ss.next_reset_point_pos++];
+    } else {
+      return -1;
+    }
+  };
+
+  if (ss.stage == EncodeScanState::HEAD) {
+    if (!EncodeSOS(jpg, scan_info, state)) return SerializationStatus::ERROR;
+    JpegBitWriterInit(&ss.bw, &state->output_queue);
+    DCTCodingStateInit(&ss.coding_state);
+    ss.restarts_to_go = restart_interval;
+    ss.next_restart_marker = 0;
+    ss.block_scan_index = 0;
+    ss.extra_zero_runs_pos = 0;
+    ss.next_extra_zero_run_index = get_next_extra_zero_run_index();
+    ss.next_reset_point_pos = 0;
+    ss.next_reset_point = get_next_reset_point();
+    ss.mcu_y = 0;
+    memset(ss.last_dc_coeff, 0, sizeof(ss.last_dc_coeff));
+    ss.stage = EncodeScanState::BODY;
+  }
+  JpegBitWriter* bw = &ss.bw;
+  DCTCodingState* coding_state = &ss.coding_state;
+
+  JXL_DASSERT(ss.stage == EncodeScanState::BODY);
+
+  // "Non-interleaved" means color data comes in separate scans, in other words
+  // each scan can contain only one color component.
+  const bool is_interleaved = (scan_info.num_components > 1);
+  int MCUs_per_row = 0;
+  int MCU_rows = 0;
+  jpg.CalculateMcuSize(scan_info, &MCUs_per_row, &MCU_rows);
+  const bool is_progressive = state->is_progressive;
+  const int Al = is_progressive ? scan_info.Al : 0;
+  const int Ss = is_progressive ? scan_info.Ss : 0;
+  const int Se = is_progressive ? scan_info.Se : 63;
+
+  // DC-only is defined by [0..0] spectral range.
+  const bool want_ac = ((Ss != 0) || (Se != 0));
+  // TODO: support streaming decoding again.
+  const bool complete_ac = true;
+  const bool has_ac = true;
+  if (want_ac && !has_ac) return SerializationStatus::NEEDS_MORE_INPUT;
+
+  // |has_ac| implies |complete_dc| but not vice versa; for the sake of
+  // simplicity we pretend they are equal, because they are separated by just a
+  // few bytes of input.
+  const bool complete_dc = has_ac;
+  const bool complete = want_ac ? complete_ac : complete_dc;
+  // When "incomplete" |ac_dc| tracks information about current ("incomplete")
+  // band parsing progress.
+
+  // FIXME: Is this always complete?
+  // const int last_mcu_y =
+  //     complete ? MCU_rows : parsing_state.internal->ac_dc.next_mcu_y *
+  //     v_group;
+  (void)complete;
+  const int last_mcu_y = complete ? MCU_rows : 0;
+
+  for (; ss.mcu_y < last_mcu_y; ++ss.mcu_y) {
+    for (int mcu_x = 0; mcu_x < MCUs_per_row; ++mcu_x) {
+      // Possibly emit a restart marker.
+      if (restart_interval > 0 && ss.restarts_to_go == 0) {
+        Flush<kOutputMode>(coding_state, bw);
+        if (!JumpToByteBoundary(bw, &state->pad_bits, state->pad_bits_end)) {
+          return SerializationStatus::ERROR;
+        }
+        EmitMarker(bw, 0xD0 + ss.next_restart_marker);
+        ss.next_restart_marker += 1;
+        ss.next_restart_marker &= 0x7;
+        ss.restarts_to_go = restart_interval;
+        memset(ss.last_dc_coeff, 0, sizeof(ss.last_dc_coeff));
+      }
+      // Encode one MCU
+      for (size_t i = 0; i < scan_info.num_components; ++i) {
+        const JPEGComponentScanInfo& si = scan_info.components[i];
+        const JPEGComponent& c = jpg.components[si.comp_idx];
+        size_t dc_tbl_idx = (kOutputMode == OutputModes::kModeHistogram
+                                 ? HistogramIndex(jpg, state->scan_index, i)
+                                 : si.dc_tbl_idx);
+        size_t ac_tbl_idx = (kOutputMode == OutputModes::kModeHistogram
+                                 ? HistogramIndex(jpg, state->scan_index, i)
+                                 : si.ac_tbl_idx);
+        HuffmanCodeTable* dc_huff = &state->dc_huff_table[dc_tbl_idx];
+        HuffmanCodeTable* ac_huff = &state->ac_huff_table[ac_tbl_idx];
+        int n_blocks_y = is_interleaved ? c.v_samp_factor : 1;
+        int n_blocks_x = is_interleaved ? c.h_samp_factor : 1;
+        for (int iy = 0; iy < n_blocks_y; ++iy) {
+          for (int ix = 0; ix < n_blocks_x; ++ix) {
+            int block_y = ss.mcu_y * n_blocks_y + iy;
+            int block_x = mcu_x * n_blocks_x + ix;
+            int block_idx = block_y * c.width_in_blocks + block_x;
+            if (ss.block_scan_index == ss.next_reset_point) {
+              Flush<kOutputMode>(coding_state, bw);
+              ss.next_reset_point = get_next_reset_point();
+            }
+            int num_zero_runs = 0;
+            if (ss.block_scan_index == ss.next_extra_zero_run_index) {
+              num_zero_runs = scan_info.extra_zero_runs[ss.extra_zero_runs_pos]
+                                  .num_extra_zero_runs;
+              ++ss.extra_zero_runs_pos;
+              ss.next_extra_zero_run_index = get_next_extra_zero_run_index();
+            }
+            const coeff_t* coeffs = &c.coeffs[block_idx << 6];
+            bool ok;
+            if (kMode == 0) {
+              ok = EncodeDCTBlockSequential<kOutputMode>(
+                  coeffs, dc_huff, ac_huff, num_zero_runs,
+                  ss.last_dc_coeff + si.comp_idx, bw);
+            } else if (kMode == 1) {
+              ok = EncodeDCTBlockProgressive<kOutputMode>(
+                  coeffs, dc_huff, ac_huff, Ss, Se, Al, num_zero_runs,
+                  coding_state, ss.last_dc_coeff + si.comp_idx, bw);
+            } else {
+              ok = EncodeRefinementBits<kOutputMode>(coeffs, ac_huff, Ss, Se,
+                                                     Al, coding_state, bw);
+            }
+            if (!ok) return SerializationStatus::ERROR;
+            ++ss.block_scan_index;
+          }
+        }
+      }
+      --ss.restarts_to_go;
+    }
+  }
+  if (ss.mcu_y < MCU_rows) {
+    if (!bw->healthy) return SerializationStatus::ERROR;
+    return SerializationStatus::NEEDS_MORE_INPUT;
+  }
+  Flush<kOutputMode>(coding_state, bw);
+  if (!JumpToByteBoundary(bw, &state->pad_bits, state->pad_bits_end)) {
+    return SerializationStatus::ERROR;
+  }
+  JpegBitWriterFinish(bw);
+  ss.stage = EncodeScanState::HEAD;
+  state->scan_index++;
+  if (!bw->healthy) return SerializationStatus::ERROR;
+
+  return SerializationStatus::DONE;
+}
+
+template <int kOutputMode>
+static SerializationStatus JXL_INLINE EncodeScan(const JPEGData& jpg,
+                                                 SerializationState* state) {
+  const JPEGScanInfo& scan_info = jpg.scan_info[state->scan_index];
+  const bool is_progressive = state->is_progressive;
+  const int Al = is_progressive ? scan_info.Al : 0;
+  const int Ah = is_progressive ? scan_info.Ah : 0;
+  const int Ss = is_progressive ? scan_info.Ss : 0;
+  const int Se = is_progressive ? scan_info.Se : 63;
+  const bool need_sequential =
+      !is_progressive || (Ah == 0 && Al == 0 && Ss == 0 && Se == 63);
+  if (need_sequential) {
+    return DoEncodeScan<0, kOutputMode>(jpg, state);
+  } else if (Ah == 0) {
+    return DoEncodeScan<1, kOutputMode>(jpg, state);
+  } else {
+    return DoEncodeScan<2, kOutputMode>(jpg, state);
+  }
+}
+
+template <int kOutputMode>
+SerializationStatus SerializeSection(uint8_t marker, SerializationState* state,
+                                     const JPEGData& jpg) {
+  const auto to_status = [](bool result) {
+    return result ? SerializationStatus::DONE : SerializationStatus::ERROR;
+  };
+  // TODO(eustas): add and use marker enum
+  switch (marker) {
+    case 0xC0:
+    case 0xC1:
+    case 0xC2:
+    case 0xC9:
+    case 0xCA:
+      return to_status(EncodeSOF(jpg, marker, state));
+
+    case 0xC4:
+      return to_status((kOutputMode == OutputModes::kModeHistogram) ||
+                       EncodeDHT(jpg, state));
+
+    case 0xD0:
+    case 0xD1:
+    case 0xD2:
+    case 0xD3:
+    case 0xD4:
+    case 0xD5:
+    case 0xD6:
+    case 0xD7:
+      return to_status(EncodeRestart(marker, state));
+
+    case 0xD9:
+      return to_status(EncodeEOI(jpg, state));
+
+    case 0xDA:
+      return EncodeScan<kOutputMode>(jpg, state);
+
+    case 0xDB:
+      return to_status(EncodeDQT(jpg, state));
+
+    case 0xDD:
+      return to_status(EncodeDRI(jpg, state));
+
+    case 0xE0:
+    case 0xE1:
+    case 0xE2:
+    case 0xE3:
+    case 0xE4:
+    case 0xE5:
+    case 0xE6:
+    case 0xE7:
+    case 0xE8:
+    case 0xE9:
+    case 0xEA:
+    case 0xEB:
+    case 0xEC:
+    case 0xED:
+    case 0xEE:
+    case 0xEF:
+      return to_status(EncodeAPP(jpg, marker, state));
+
+    case 0xFE:
+      return to_status(EncodeCOM(jpg, state));
+
+    case 0xFF:
+      return to_status(EncodeInterMarkerData(jpg, state));
+
+    default:
+      return SerializationStatus::ERROR;
+  }
+}
+
+// TODO(veluca): add streaming support again.
+template <int kOutputMode>
+Status WriteJpegInternal(const JPEGData& jpg, const JPEGOutput& out,
+                         SerializationState* ss) {
+  const auto maybe_push_output = [&]() -> Status {
+    if (ss->stage != SerializationState::STAGE_ERROR) {
+      while (!ss->output_queue.empty()) {
+        auto& chunk = ss->output_queue.front();
+        size_t num_written = out(chunk.next, chunk.len);
+        if (num_written == 0 && chunk.len > 0) {
+          return StatusMessage(Status(StatusCode::kNotEnoughBytes),
+                               "Failed to write output");
+        }
+        chunk.len -= num_written;
+        if (chunk.len == 0) {
+          ss->output_queue.pop_front();
+        }
+      }
+    }
+    return true;
+  };
+
+  while (true) {
+    switch (ss->stage) {
+      case SerializationState::STAGE_INIT: {
+        // Valid Brunsli requires, at least, 0xD9 marker.
+        // This might happen on corrupted stream, or on unconditioned JPEGData.
+        // TODO(eustas): check D9 in the only one and is the last one.
+        if (jpg.marker_order.empty()) {
+          ss->stage = SerializationState::STAGE_ERROR;
+          break;
+        }
+        if (kOutputMode == OutputModes::kModeHistogram) {
+          size_t num_histo = NumHistograms(jpg);
+          ss->dc_huff_table.resize(num_histo);
+          ss->ac_huff_table.resize(num_histo);
+          for (size_t i = 0; i < num_histo; ++i) {
+            ss->dc_huff_table[i].InitDepths();
+            ss->ac_huff_table[i].InitDepths();
+          }
+        } else {
+          ss->dc_huff_table.resize(kMaxHuffmanTables);
+          ss->ac_huff_table.resize(kMaxHuffmanTables);
+        }
+        if (jpg.has_zero_padding_bit) {
+          ss->pad_bits = jpg.padding_bits.data();
+          ss->pad_bits_end = ss->pad_bits + jpg.padding_bits.size();
+        }
+
+        EncodeSOI(ss);
+        JXL_QUIET_RETURN_IF_ERROR(maybe_push_output());
+        ss->stage = SerializationState::STAGE_SERIALIZE_SECTION;
+        break;
+      }
+
+      case SerializationState::STAGE_SERIALIZE_SECTION: {
+        if (ss->section_index >= jpg.marker_order.size()) {
+          ss->stage = SerializationState::STAGE_DONE;
+          break;
+        }
+        uint8_t marker = jpg.marker_order[ss->section_index];
+        SerializationStatus status =
+            SerializeSection<kOutputMode>(marker, ss, jpg);
+        if (status == SerializationStatus::ERROR) {
+          JXL_WARNING("Failed to encode marker 0x%.2x", marker);
+          ss->stage = SerializationState::STAGE_ERROR;
+          break;
+        }
+        JXL_QUIET_RETURN_IF_ERROR(maybe_push_output());
+        if (status == SerializationStatus::NEEDS_MORE_INPUT) {
+          return JXL_FAILURE("Incomplete serialization data");
+        } else if (status != SerializationStatus::DONE) {
+          JXL_DASSERT(false);
+          ss->stage = SerializationState::STAGE_ERROR;
+          break;
+        }
+        ++ss->section_index;
+        break;
+      }
+
+      case SerializationState::STAGE_DONE:
+        JXL_ASSERT(ss->output_queue.empty());
+        if (ss->pad_bits != nullptr && ss->pad_bits != ss->pad_bits_end) {
+          return JXL_FAILURE("Invalid number of padding bits.");
+        }
+        return true;
+
+      case SerializationState::STAGE_ERROR:
+        return JXL_FAILURE("JPEG serialization error");
+    }
+  }
+}
+
+}  // namespace
+
+Status WriteJpeg(const JPEGData& jpg, const JPEGOutput& out) {
+  SerializationState ss;
+  return WriteJpegInternal<OutputModes::kModeWrite>(jpg, out, &ss);
+}
+
+Status ProcessJpeg(const JPEGData& jpg, SerializationState* ss) {
+  auto nullout = [](const uint8_t* buf, size_t len) { return len; };
+  return WriteJpegInternal<OutputModes::kModeHistogram>(jpg, nullout, ss);
+}
+
+}  // namespace jpeg
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/jpeg/dec_jpeg_data_writer.h b/third_party/jpeg-xl/lib/jxl/jpeg/dec_jpeg_data_writer.h
new file mode 100644
index 0000000000..9ccfb749a8
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/jpeg/dec_jpeg_data_writer.h
@@ -0,0 +1,35 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Functions for writing a JPEGData object into a jpeg byte stream.
+
+#ifndef LIB_JXL_JPEG_DEC_JPEG_DATA_WRITER_H_
+#define LIB_JXL_JPEG_DEC_JPEG_DATA_WRITER_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <functional>
+
+#include "lib/jxl/jpeg/dec_jpeg_serialization_state.h"
+#include "lib/jxl/jpeg/jpeg_data.h"
+
+namespace jxl {
+namespace jpeg {
+
+// Function type used to write len bytes into buf. Returns the number of bytes
+// written.
+using JPEGOutput = std::function<size_t(const uint8_t* buf, size_t len)>;
+
+Status WriteJpeg(const JPEGData& jpg, const JPEGOutput& out);
+
+// Same as WriteJpeg, but instead of writing to the output, collects statistics
+// about the bit-stream into `ss`.
+Status ProcessJpeg(const JPEGData& jpg, SerializationState* ss);
+
+}  // namespace jpeg
+}  // namespace jxl
+
+#endif  // LIB_JXL_JPEG_DEC_JPEG_DATA_WRITER_H_
diff --git a/third_party/jpeg-xl/lib/jxl/jpeg/dec_jpeg_output_chunk.h b/third_party/jpeg-xl/lib/jxl/jpeg/dec_jpeg_output_chunk.h
new file mode 100644
index 0000000000..e003c04952
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/jpeg/dec_jpeg_output_chunk.h
@@ -0,0 +1,72 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_JPEG_DEC_JPEG_OUTPUT_CHUNK_H_
+#define LIB_JXL_JPEG_DEC_JPEG_OUTPUT_CHUNK_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <initializer_list>
+#include <memory>
+#include <vector>
+
+namespace jxl {
+namespace jpeg {
+
+/**
+ * A chunk of output data.
+ *
+ * Data producer creates OutputChunks and adds them to the end output queue.
+ * Once control flow leaves the producer code, it is considered that chunk of
+ * data is final and can not be changed; to underline this fact |next| is a
+ * const-pointer.
+ *
+ * Data consumer removes OutputChunks from the beginning of the output queue.
+ * It is possible to consume OutputChunks partially, by updating |next| and
+ * |len|.
+ *
+ * There are 2 types of output chunks:
+ *  - owning: actual data is stored in |buffer| field; producer fills data after
+ *    the instance it created; it is legal to reduce |len| to show that not all
+ *    the capacity of |buffer| is used
+ *  - non-owning: represents the data stored (owned) somewhere else
+ */
+struct OutputChunk {
+  // Non-owning
+  template <typename Bytes>
+  explicit OutputChunk(Bytes& bytes) : len(bytes.size()) {
+    // Deal both with const qualifier and data type.
+    const void* src = bytes.data();
+    next = reinterpret_cast<const uint8_t*>(src);
+  }
+
+  // Non-owning
+  OutputChunk(const uint8_t* data, size_t size) : next(data), len(size) {}
+
+  // Owning
+  explicit OutputChunk(size_t size = 0) {
+    buffer.reset(new std::vector<uint8_t>(size));
+    next = buffer->data();
+    len = size;
+  }
+
+  // Owning
+  OutputChunk(std::initializer_list<uint8_t> bytes) {
+    buffer.reset(new std::vector<uint8_t>(bytes));
+    next = buffer->data();
+    len = bytes.size();
+  }
+
+  const uint8_t* next;
+  size_t len;
+  // TODO(veluca): consider removing the unique_ptr.
+  std::unique_ptr<std::vector<uint8_t>> buffer;
+};
+
+}  // namespace jpeg
+}  // namespace jxl
+
+#endif  // LIB_JXL_JPEG_DEC_JPEG_OUTPUT_CHUNK_H_
diff --git a/third_party/jpeg-xl/lib/jxl/jpeg/dec_jpeg_serialization_state.h b/third_party/jpeg-xl/lib/jxl/jpeg/dec_jpeg_serialization_state.h
new file mode 100644
index 0000000000..40ce450a76
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/jpeg/dec_jpeg_serialization_state.h
@@ -0,0 +1,96 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_JPEG_DEC_JPEG_SERIALIZATION_STATE_H_
+#define LIB_JXL_JPEG_DEC_JPEG_SERIALIZATION_STATE_H_
+
+#include <deque>
+#include <vector>
+
+#include "lib/jxl/jpeg/dec_jpeg_output_chunk.h"
+#include "lib/jxl/jpeg/jpeg_data.h"
+
+namespace jxl {
+namespace jpeg {
+
+struct HuffmanCodeTable {
+  int depth[256];
+  int code[256];
+  void InitDepths() { std::fill(std::begin(depth), std::end(depth), 0); }
+};
+
+// Handles the packing of bits into output bytes.
+struct JpegBitWriter {
+  bool healthy;
+  std::deque<OutputChunk>* output;
+  OutputChunk chunk;
+  uint8_t* data;
+  size_t pos;
+  uint64_t put_buffer;
+  int put_bits;
+};
+
+// Holds data that is buffered between 8x8 blocks in progressive mode.
+struct DCTCodingState {
+  // The run length of end-of-band symbols in a progressive scan.
+  int eob_run_;
+  // The huffman table to be used when flushing the state.
+  HuffmanCodeTable* cur_ac_huff_;
+  // The sequence of currently buffered refinement bits for a successive
+  // approximation scan (one where Ah > 0).
+  std::vector<int> refinement_bits_;
+};
+
+struct EncodeScanState {
+  enum Stage { HEAD, BODY };
+
+  Stage stage = HEAD;
+
+  int mcu_y;
+  JpegBitWriter bw;
+  coeff_t last_dc_coeff[kMaxComponents] = {0};
+  int restarts_to_go;
+  int next_restart_marker;
+  int block_scan_index;
+  DCTCodingState coding_state;
+  size_t extra_zero_runs_pos;
+  int next_extra_zero_run_index;
+  size_t next_reset_point_pos;
+  int next_reset_point;
+};
+
+struct SerializationState {
+  enum Stage {
+    STAGE_INIT,
+    STAGE_SERIALIZE_SECTION,
+    STAGE_DONE,
+    STAGE_ERROR,
+  };
+
+  Stage stage = STAGE_INIT;
+
+  std::deque<OutputChunk> output_queue;
+
+  size_t section_index = 0;
+  int dht_index = 0;
+  int dqt_index = 0;
+  int app_index = 0;
+  int com_index = 0;
+  int data_index = 0;
+  int scan_index = 0;
+  std::vector<HuffmanCodeTable> dc_huff_table;
+  std::vector<HuffmanCodeTable> ac_huff_table;
+  const uint8_t* pad_bits = nullptr;
+  const uint8_t* pad_bits_end = nullptr;
+  bool seen_dri_marker = false;
+  bool is_progressive = false;
+
+  EncodeScanState scan_state;
+};
+
+}  // namespace jpeg
+}  // namespace jxl
+
+#endif  // LIB_JXL_JPEG_DEC_JPEG_SERIALIZATION_STATE_H_
diff --git a/third_party/jpeg-xl/lib/jxl/jpeg/enc_jpeg_data.cc b/third_party/jpeg-xl/lib/jxl/jpeg/enc_jpeg_data.cc
new file mode 100644
index 0000000000..842612f4ab
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/jpeg/enc_jpeg_data.cc
@@ -0,0 +1,384 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/jpeg/enc_jpeg_data.h"
+
+#include <brotli/encode.h>
+#include <stdio.h>
+
+#include "lib/jxl/enc_fields.h"
+#include "lib/jxl/image_bundle.h"
+#include "lib/jxl/jpeg/enc_jpeg_data_reader.h"
+#include "lib/jxl/luminance.h"
+#include "lib/jxl/sanitizers.h"
+
+namespace jxl {
+namespace jpeg {
+
+namespace {
+
+constexpr int BITS_IN_JSAMPLE = 8;
+using ByteSpan = Span<const uint8_t>;
+
+// TODO(eustas): move to jpeg_data, to use from codec_jpg as well.
+// See if there is a canonically chunked ICC profile and mark corresponding
+// app-tags with AppMarkerType::kICC.
+Status DetectIccProfile(JPEGData& jpeg_data) {
+  JXL_DASSERT(jpeg_data.app_data.size() == jpeg_data.app_marker_type.size());
+  size_t num_icc = 0;
+  size_t num_icc_jpeg = 0;
+  for (size_t i = 0; i < jpeg_data.app_data.size(); i++) {
+    const auto& app = jpeg_data.app_data[i];
+    size_t pos = 0;
+    if (app[pos++] != 0xE2) continue;
+    // At least APPn + size; otherwise it should be intermarker-data.
+    JXL_DASSERT(app.size() >= 3);
+    size_t tag_length = (app[pos] << 8) + app[pos + 1];
+    pos += 2;
+    JXL_DASSERT(app.size() == tag_length + 1);
+    // Empty payload is 2 bytes for tag length itself + signature
+    if (tag_length < 2 + sizeof kIccProfileTag) continue;
+
+    if (memcmp(&app[pos], kIccProfileTag, sizeof kIccProfileTag) != 0) continue;
+    pos += sizeof kIccProfileTag;
+    uint8_t chunk_id = app[pos++];
+    uint8_t num_chunks = app[pos++];
+    if (chunk_id != num_icc + 1) continue;
+    if (num_icc_jpeg == 0) num_icc_jpeg = num_chunks;
+    if (num_icc_jpeg != num_chunks) continue;
+    num_icc++;
+    jpeg_data.app_marker_type[i] = AppMarkerType::kICC;
+  }
+  if (num_icc != num_icc_jpeg) {
+    return JXL_FAILURE("Invalid ICC chunks");
+  }
+  return true;
+}
+
+bool GetMarkerPayload(const uint8_t* data, size_t size, ByteSpan* payload) {
+  if (size < 3) {
+    return false;
+  }
+  size_t hi = data[1];
+  size_t lo = data[2];
+  size_t internal_size = (hi << 8u) | lo;
+  // Second byte of marker is not counted towards size.
+  if (internal_size != size - 1) {
+    return false;
+  }
+  // cut second marker byte and "length" from payload.
+  *payload = ByteSpan(data, size);
+  payload->remove_prefix(3);
+  return true;
+}
+
+Status DetectBlobs(jpeg::JPEGData& jpeg_data) {
+  JXL_DASSERT(jpeg_data.app_data.size() == jpeg_data.app_marker_type.size());
+  bool have_exif = false, have_xmp = false;
+  for (size_t i = 0; i < jpeg_data.app_data.size(); i++) {
+    auto& marker = jpeg_data.app_data[i];
+    if (marker.empty() || marker[0] != kApp1) {
+      continue;
+    }
+    ByteSpan payload;
+    if (!GetMarkerPayload(marker.data(), marker.size(), &payload)) {
+      // Something is wrong with this marker; does not care.
+      continue;
+    }
+    if (!have_exif && payload.size() >= sizeof kExifTag &&
+        !memcmp(payload.data(), kExifTag, sizeof kExifTag)) {
+      jpeg_data.app_marker_type[i] = AppMarkerType::kExif;
+      have_exif = true;
+    }
+    if (!have_xmp && payload.size() >= sizeof kXMPTag &&
+        !memcmp(payload.data(), kXMPTag, sizeof kXMPTag)) {
+      jpeg_data.app_marker_type[i] = AppMarkerType::kXMP;
+      have_xmp = true;
+    }
+  }
+  return true;
+}
+
+Status ParseChunkedMarker(const jpeg::JPEGData& src, uint8_t marker_type,
+                          const ByteSpan& tag, PaddedBytes* output,
+                          bool allow_permutations = false) {
+  output->clear();
+
+  std::vector<ByteSpan> chunks;
+  std::vector<bool> presence;
+  size_t expected_number_of_parts = 0;
+  bool is_first_chunk = true;
+  size_t ordinal = 0;
+  for (const auto& marker : src.app_data) {
+    if (marker.empty() || marker[0] != marker_type) {
+      continue;
+    }
+    ByteSpan payload;
+    if (!GetMarkerPayload(marker.data(), marker.size(), &payload)) {
+      // Something is wrong with this marker; does not care.
+      continue;
+    }
+    if ((payload.size() < tag.size()) ||
+        memcmp(payload.data(), tag.data(), tag.size()) != 0) {
+      continue;
+    }
+    payload.remove_prefix(tag.size());
+    if (payload.size() < 2) {
+      return JXL_FAILURE("Chunk is too small.");
+    }
+    uint8_t index = payload[0];
+    uint8_t total = payload[1];
+    ordinal++;
+    if (!allow_permutations) {
+      if (index != ordinal) return JXL_FAILURE("Invalid chunk order.");
+    }
+
+    payload.remove_prefix(2);
+
+    JXL_RETURN_IF_ERROR(total != 0);
+    if (is_first_chunk) {
+      is_first_chunk = false;
+      expected_number_of_parts = total;
+      // 1-based indices; 0-th element is added for convenience.
+      chunks.resize(total + 1);
+      presence.resize(total + 1);
+    } else {
+      JXL_RETURN_IF_ERROR(expected_number_of_parts == total);
+    }
+
+    if (index == 0 || index > total) {
+      return JXL_FAILURE("Invalid chunk index.");
+    }
+
+    if (presence[index]) {
+      return JXL_FAILURE("Duplicate chunk.");
+    }
+    presence[index] = true;
+    chunks[index] = payload;
+  }
+
+  for (size_t i = 0; i < expected_number_of_parts; ++i) {
+    // 0-th element is not used.
+    size_t index = i + 1;
+    if (!presence[index]) {
+      return JXL_FAILURE("Missing chunk.");
+    }
+    output->append(chunks[index]);
+  }
+
+  return true;
+}
+
+Status SetBlobsFromJpegData(const jpeg::JPEGData& jpeg_data, Blobs* blobs) {
+  for (size_t i = 0; i < jpeg_data.app_data.size(); i++) {
+    auto& marker = jpeg_data.app_data[i];
+    if (marker.empty() || marker[0] != kApp1) {
+      continue;
+    }
+    ByteSpan payload;
+    if (!GetMarkerPayload(marker.data(), marker.size(), &payload)) {
+      // Something is wrong with this marker; does not care.
+      continue;
+    }
+    if (payload.size() >= sizeof kExifTag &&
+        !memcmp(payload.data(), kExifTag, sizeof kExifTag)) {
+      if (blobs->exif.empty()) {
+        blobs->exif.resize(payload.size() - sizeof kExifTag);
+        memcpy(blobs->exif.data(), payload.data() + sizeof kExifTag,
+               payload.size() - sizeof kExifTag);
+      } else {
+        JXL_WARNING(
+            "ReJPEG: multiple Exif blobs, storing only first one in the JPEG "
+            "XL container\n");
+      }
+    }
+    if (payload.size() >= sizeof kXMPTag &&
+        !memcmp(payload.data(), kXMPTag, sizeof kXMPTag)) {
+      if (blobs->xmp.empty()) {
+        blobs->xmp.resize(payload.size() - sizeof kXMPTag);
+        memcpy(blobs->xmp.data(), payload.data() + sizeof kXMPTag,
+               payload.size() - sizeof kXMPTag);
+      } else {
+        JXL_WARNING(
+            "ReJPEG: multiple XMP blobs, storing only first one in the JPEG "
+            "XL container\n");
+      }
+    }
+  }
+  return true;
+}
+
+static inline bool IsJPG(const Span<const uint8_t> bytes) {
+  return bytes.size() >= 2 && bytes[0] == 0xFF && bytes[1] == 0xD8;
+}
+
+}  // namespace
+
+Status SetColorEncodingFromJpegData(const jpeg::JPEGData& jpg,
+                                    ColorEncoding* color_encoding) {
+  PaddedBytes icc_profile;
+  if (!ParseChunkedMarker(jpg, kApp2, ByteSpan(kIccProfileTag), &icc_profile)) {
+    JXL_WARNING("ReJPEG: corrupted ICC profile\n");
+    icc_profile.clear();
+  }
+
+  if (icc_profile.empty()) {
+    bool is_gray = (jpg.components.size() == 1);
+    *color_encoding = ColorEncoding::SRGB(is_gray);
+    return true;
+  }
+
+  return color_encoding->SetICC(std::move(icc_profile));
+}
+
+Status EncodeJPEGData(JPEGData& jpeg_data, PaddedBytes* bytes,
+                      const CompressParams& cparams) {
+  jpeg_data.app_marker_type.resize(jpeg_data.app_data.size(),
+                                   AppMarkerType::kUnknown);
+  JXL_RETURN_IF_ERROR(DetectIccProfile(jpeg_data));
+  JXL_RETURN_IF_ERROR(DetectBlobs(jpeg_data));
+  BitWriter writer;
+  JXL_RETURN_IF_ERROR(Bundle::Write(jpeg_data, &writer, 0, nullptr));
+  writer.ZeroPadToByte();
+  *bytes = std::move(writer).TakeBytes();
+  BrotliEncoderState* brotli_enc =
+      BrotliEncoderCreateInstance(nullptr, nullptr, nullptr);
+  int effort = cparams.brotli_effort;
+  if (effort < 0) effort = 11 - static_cast<int>(cparams.speed_tier);
+  BrotliEncoderSetParameter(brotli_enc, BROTLI_PARAM_QUALITY, effort);
+  size_t total_data = 0;
+  for (size_t i = 0; i < jpeg_data.app_data.size(); i++) {
+    if (jpeg_data.app_marker_type[i] != AppMarkerType::kUnknown) {
+      continue;
+    }
+    total_data += jpeg_data.app_data[i].size();
+  }
+  for (size_t i = 0; i < jpeg_data.com_data.size(); i++) {
+    total_data += jpeg_data.com_data[i].size();
+  }
+  for (size_t i = 0; i < jpeg_data.inter_marker_data.size(); i++) {
+    total_data += jpeg_data.inter_marker_data[i].size();
+  }
+  total_data += jpeg_data.tail_data.size();
+  size_t initial_size = bytes->size();
+  size_t brotli_capacity = BrotliEncoderMaxCompressedSize(total_data);
+  BrotliEncoderSetParameter(brotli_enc, BROTLI_PARAM_SIZE_HINT, total_data);
+  bytes->resize(bytes->size() + brotli_capacity);
+  size_t enc_size = 0;
+  auto br_append = [&](const std::vector<uint8_t>& data, bool last) {
+    size_t available_in = data.size();
+    const uint8_t* in = data.data();
+    uint8_t* out = &(*bytes)[initial_size + enc_size];
+    do {
+      uint8_t* out_before = out;
+      msan::MemoryIsInitialized(in, available_in);
+      JXL_CHECK(BrotliEncoderCompressStream(
+          brotli_enc, last ? BROTLI_OPERATION_FINISH : BROTLI_OPERATION_PROCESS,
+          &available_in, &in, &brotli_capacity, &out, &enc_size));
+      msan::UnpoisonMemory(out_before, out - out_before);
+    } while (BrotliEncoderHasMoreOutput(brotli_enc) || available_in > 0);
+  };
+
+  for (size_t i = 0; i < jpeg_data.app_data.size(); i++) {
+    if (jpeg_data.app_marker_type[i] != AppMarkerType::kUnknown) {
+      continue;
+    }
+    br_append(jpeg_data.app_data[i], /*last=*/false);
+  }
+  for (size_t i = 0; i < jpeg_data.com_data.size(); i++) {
+    br_append(jpeg_data.com_data[i], /*last=*/false);
+  }
+  for (size_t i = 0; i < jpeg_data.inter_marker_data.size(); i++) {
+    br_append(jpeg_data.inter_marker_data[i], /*last=*/false);
+  }
+  br_append(jpeg_data.tail_data, /*last=*/true);
+  BrotliEncoderDestroyInstance(brotli_enc);
+  bytes->resize(initial_size + enc_size);
+  return true;
+}
+
+Status DecodeImageJPG(const Span<const uint8_t> bytes, CodecInOut* io) {
+  if (!IsJPG(bytes)) return false;
+  io->frames.clear();
+  io->frames.reserve(1);
+  io->frames.emplace_back(&io->metadata.m);
+  io->Main().jpeg_data = make_unique<jpeg::JPEGData>();
+  jpeg::JPEGData* jpeg_data = io->Main().jpeg_data.get();
+  if (!jpeg::ReadJpeg(bytes.data(), bytes.size(), jpeg::JpegReadMode::kReadAll,
+                      jpeg_data)) {
+    return JXL_FAILURE("Error reading JPEG");
+  }
+  JXL_RETURN_IF_ERROR(
+      SetColorEncodingFromJpegData(*jpeg_data, &io->metadata.m.color_encoding));
+  JXL_RETURN_IF_ERROR(SetBlobsFromJpegData(*jpeg_data, &io->blobs));
+  size_t nbcomp = jpeg_data->components.size();
+  if (nbcomp != 1 && nbcomp != 3) {
+    return JXL_FAILURE("Cannot recompress JPEGs with neither 1 nor 3 channels");
+  }
+  YCbCrChromaSubsampling cs;
+  if (nbcomp == 3) {
+    uint8_t hsample[3], vsample[3];
+    for (size_t i = 0; i < nbcomp; i++) {
+      hsample[i] = jpeg_data->components[i].h_samp_factor;
+      vsample[i] = jpeg_data->components[i].v_samp_factor;
+    }
+    JXL_RETURN_IF_ERROR(cs.Set(hsample, vsample));
+  } else if (nbcomp == 1) {
+    uint8_t hsample[3], vsample[3];
+    for (size_t i = 0; i < 3; i++) {
+      hsample[i] = jpeg_data->components[0].h_samp_factor;
+      vsample[i] = jpeg_data->components[0].v_samp_factor;
+    }
+    JXL_RETURN_IF_ERROR(cs.Set(hsample, vsample));
+  }
+  bool is_rgb = false;
+  {
+    const auto& markers = jpeg_data->marker_order;
+    // If there is a JFIF marker, this is YCbCr. Otherwise...
+    if (std::find(markers.begin(), markers.end(), 0xE0) == markers.end()) {
+      // Try to find an 'Adobe' marker.
+      size_t app_markers = 0;
+      size_t i = 0;
+      for (; i < markers.size(); i++) {
+        // This is an APP marker.
+        if ((markers[i] & 0xF0) == 0xE0) {
+          JXL_CHECK(app_markers < jpeg_data->app_data.size());
+          // APP14 marker
+          if (markers[i] == 0xEE) {
+            const auto& data = jpeg_data->app_data[app_markers];
+            if (data.size() == 15 && data[3] == 'A' && data[4] == 'd' &&
+                data[5] == 'o' && data[6] == 'b' && data[7] == 'e') {
+              // 'Adobe' marker.
+              is_rgb = data[14] == 0;
+              break;
+            }
+          }
+          app_markers++;
+        }
+      }
+
+      if (i == markers.size()) {
+        // No 'Adobe' marker, guess from component IDs.
+        is_rgb = nbcomp == 3 && jpeg_data->components[0].id == 'R' &&
+                 jpeg_data->components[1].id == 'G' &&
+                 jpeg_data->components[2].id == 'B';
+      }
+    }
+  }
+
+  io->Main().chroma_subsampling = cs;
+  io->Main().color_transform =
+      (!is_rgb || nbcomp == 1) ? ColorTransform::kYCbCr : ColorTransform::kNone;
+
+  io->metadata.m.SetIntensityTarget(kDefaultIntensityTarget);
+  io->metadata.m.SetUintSamples(BITS_IN_JSAMPLE);
+  io->SetFromImage(Image3F(jpeg_data->width, jpeg_data->height),
+                   io->metadata.m.color_encoding);
+  SetIntensityTarget(&io->metadata.m);
+  return true;
+}
+
+}  // namespace jpeg
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/jpeg/enc_jpeg_data.h b/third_party/jpeg-xl/lib/jxl/jpeg/enc_jpeg_data.h
new file mode 100644
index 0000000000..806128c465
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/jpeg/enc_jpeg_data.h
@@ -0,0 +1,31 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_JPEG_ENC_JPEG_DATA_H_
+#define LIB_JXL_JPEG_ENC_JPEG_DATA_H_
+
+#include "lib/jxl/base/padded_bytes.h"
+#include "lib/jxl/codec_in_out.h"
+#include "lib/jxl/enc_params.h"
+#include "lib/jxl/jpeg/jpeg_data.h"
+
+namespace jxl {
+namespace jpeg {
+Status EncodeJPEGData(JPEGData& jpeg_data, PaddedBytes* bytes,
+                      const CompressParams& cparams);
+
+Status SetColorEncodingFromJpegData(const jpeg::JPEGData& jpg,
+                                    ColorEncoding* color_encoding);
+
+/**
+ * Decodes bytes containing JPEG codestream into a CodecInOut as coefficients
+ * only, for lossless JPEG transcoding.
+ */
+Status DecodeImageJPG(Span<const uint8_t> bytes, CodecInOut* io);
+
+}  // namespace jpeg
+}  // namespace jxl
+
+#endif  // LIB_JXL_JPEG_ENC_JPEG_DATA_H_
diff --git a/third_party/jpeg-xl/lib/jxl/jpeg/enc_jpeg_data_reader.cc b/third_party/jpeg-xl/lib/jxl/jpeg/enc_jpeg_data_reader.cc
new file mode 100644
index 0000000000..f569b73363
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/jpeg/enc_jpeg_data_reader.cc
@@ -0,0 +1,1053 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/jpeg/enc_jpeg_data_reader.h"
+
+#include <inttypes.h>
+#include <string.h>
+
+#include <algorithm>
+#include <string>
+#include <vector>
+
+#include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/jpeg/enc_jpeg_huffman_decode.h"
+#include "lib/jxl/jpeg/jpeg_data.h"
+
+namespace jxl {
+namespace jpeg {
+
+namespace {
+static const int kBrunsliMaxSampling = 15;
+
+// Macros for commonly used error conditions.
+
+#define JXL_JPEG_VERIFY_LEN(n)                                \
+  if (*pos + (n) > len) {                                     \
+    return JXL_FAILURE("Unexpected end of input: pos=%" PRIuS \
+                       " need=%d len=%" PRIuS,                \
+                       *pos, static_cast<int>(n), len);       \
+  }
+
+#define JXL_JPEG_VERIFY_INPUT(var, low, high, code)                    \
+  if ((var) < (low) || (var) > (high)) {                               \
+    return JXL_FAILURE("Invalid " #var ": %d", static_cast<int>(var)); \
+  }
+
+#define JXL_JPEG_VERIFY_MARKER_END()                             \
+  if (start_pos + marker_len != *pos) {                          \
+    return JXL_FAILURE("Invalid marker length: declared=%" PRIuS \
+                       " actual=%" PRIuS,                        \
+                       marker_len, (*pos - start_pos));          \
+  }
+
+#define JXL_JPEG_EXPECT_MARKER()                                 \
+  if (pos + 2 > len || data[pos] != 0xff) {                      \
+    return JXL_FAILURE(                                          \
+        "Marker byte (0xff) expected, found: 0x%.2x pos=%" PRIuS \
+        " len=%" PRIuS,                                          \
+        (pos < len ? data[pos] : 0), pos, len);                  \
+  }
+
+inline int ReadUint8(const uint8_t* data, size_t* pos) {
+  return data[(*pos)++];
+}
+
+inline int ReadUint16(const uint8_t* data, size_t* pos) {
+  int v = (data[*pos] << 8) + data[*pos + 1];
+  *pos += 2;
+  return v;
+}
+
+// Reads the Start of Frame (SOF) marker segment and fills in *jpg with the
+// parsed data.
+bool ProcessSOF(const uint8_t* data, const size_t len, JpegReadMode mode,
+                size_t* pos, JPEGData* jpg) {
+  if (jpg->width != 0) {
+    return JXL_FAILURE("Duplicate SOF marker.");
+  }
+  const size_t start_pos = *pos;
+  JXL_JPEG_VERIFY_LEN(8);
+  size_t marker_len = ReadUint16(data, pos);
+  int precision = ReadUint8(data, pos);
+  int height = ReadUint16(data, pos);
+  int width = ReadUint16(data, pos);
+  int num_components = ReadUint8(data, pos);
+  // 'jbrd' is hardcoded for 8bits:
+  JXL_JPEG_VERIFY_INPUT(precision, 8, 8, PRECISION);
+  JXL_JPEG_VERIFY_INPUT(height, 1, kMaxDimPixels, HEIGHT);
+  JXL_JPEG_VERIFY_INPUT(width, 1, kMaxDimPixels, WIDTH);
+  JXL_JPEG_VERIFY_INPUT(num_components, 1, kMaxComponents, NUMCOMP);
+  JXL_JPEG_VERIFY_LEN(3 * num_components);
+  jpg->height = height;
+  jpg->width = width;
+  jpg->components.resize(num_components);
+
+  // Read sampling factors and quant table index for each component.
+  std::vector<bool> ids_seen(256, false);
+  int max_h_samp_factor = 1;
+  int max_v_samp_factor = 1;
+  for (size_t i = 0; i < jpg->components.size(); ++i) {
+    const int id = ReadUint8(data, pos);
+    if (ids_seen[id]) {  // (cf. section B.2.2, syntax of Ci)
+      return JXL_FAILURE("Duplicate ID %d in SOF.", id);
+    }
+    ids_seen[id] = true;
+    jpg->components[i].id = id;
+    int factor = ReadUint8(data, pos);
+    int h_samp_factor = factor >> 4;
+    int v_samp_factor = factor & 0xf;
+    JXL_JPEG_VERIFY_INPUT(h_samp_factor, 1, kBrunsliMaxSampling, SAMP_FACTOR);
+    JXL_JPEG_VERIFY_INPUT(v_samp_factor, 1, kBrunsliMaxSampling, SAMP_FACTOR);
+    jpg->components[i].h_samp_factor = h_samp_factor;
+    jpg->components[i].v_samp_factor = v_samp_factor;
+    jpg->components[i].quant_idx = ReadUint8(data, pos);
+    max_h_samp_factor = std::max(max_h_samp_factor, h_samp_factor);
+    max_v_samp_factor = std::max(max_v_samp_factor, v_samp_factor);
+  }
+
+  // We have checked above that none of the sampling factors are 0, so the max
+  // sampling factors can not be 0.
+  int MCU_rows = DivCeil(jpg->height, max_v_samp_factor * 8);
+  int MCU_cols = DivCeil(jpg->width, max_h_samp_factor * 8);
+  // Compute the block dimensions for each component.
+  for (size_t i = 0; i < jpg->components.size(); ++i) {
+    JPEGComponent* c = &jpg->components[i];
+    if (max_h_samp_factor % c->h_samp_factor != 0 ||
+        max_v_samp_factor % c->v_samp_factor != 0) {
+      return JXL_FAILURE("Non-integral subsampling ratios.");
+    }
+    c->width_in_blocks = MCU_cols * c->h_samp_factor;
+    c->height_in_blocks = MCU_rows * c->v_samp_factor;
+    const uint64_t num_blocks =
+        static_cast<uint64_t>(c->width_in_blocks) * c->height_in_blocks;
+    if (mode == JpegReadMode::kReadAll) {
+      c->coeffs.resize(num_blocks * kDCTBlockSize);
+    }
+  }
+  JXL_JPEG_VERIFY_MARKER_END();
+  return true;
+}
+
+// Reads the Start of Scan (SOS) marker segment and fills in *scan_info with the
+// parsed data.
+bool ProcessSOS(const uint8_t* data, const size_t len, size_t* pos,
+                JPEGData* jpg) {
+  const size_t start_pos = *pos;
+  JXL_JPEG_VERIFY_LEN(3);
+  size_t marker_len = ReadUint16(data, pos);
+  size_t comps_in_scan = ReadUint8(data, pos);
+  JXL_JPEG_VERIFY_INPUT(comps_in_scan, 1, jpg->components.size(),
+                        COMPS_IN_SCAN);
+
+  JPEGScanInfo scan_info;
+  scan_info.num_components = comps_in_scan;
+  JXL_JPEG_VERIFY_LEN(2 * comps_in_scan);
+  std::vector<bool> ids_seen(256, false);
+  for (size_t i = 0; i < comps_in_scan; ++i) {
+    uint32_t id = ReadUint8(data, pos);
+    if (ids_seen[id]) {  // (cf. section B.2.3, regarding CSj)
+      return JXL_FAILURE("Duplicate ID %d in SOS.", id);
+    }
+    ids_seen[id] = true;
+    bool found_index = false;
+    for (size_t j = 0; j < jpg->components.size(); ++j) {
+      if (jpg->components[j].id == id) {
+        scan_info.components[i].comp_idx = j;
+        found_index = true;
+      }
+    }
+    if (!found_index) {
+      return JXL_FAILURE("SOS marker: Could not find component with id %d", id);
+    }
+    int c = ReadUint8(data, pos);
+    int dc_tbl_idx = c >> 4;
+    int ac_tbl_idx = c & 0xf;
+    JXL_JPEG_VERIFY_INPUT(dc_tbl_idx, 0, 3, HUFFMAN_INDEX);
+    JXL_JPEG_VERIFY_INPUT(ac_tbl_idx, 0, 3, HUFFMAN_INDEX);
+    scan_info.components[i].dc_tbl_idx = dc_tbl_idx;
+    scan_info.components[i].ac_tbl_idx = ac_tbl_idx;
+  }
+  JXL_JPEG_VERIFY_LEN(3);
+  scan_info.Ss = ReadUint8(data, pos);
+  scan_info.Se = ReadUint8(data, pos);
+  JXL_JPEG_VERIFY_INPUT(static_cast<int>(scan_info.Ss), 0, 63, START_OF_SCAN);
+  JXL_JPEG_VERIFY_INPUT(scan_info.Se, scan_info.Ss, 63, END_OF_SCAN);
+  int c = ReadUint8(data, pos);
+  scan_info.Ah = c >> 4;
+  scan_info.Al = c & 0xf;
+  if (scan_info.Ah != 0 && scan_info.Al != scan_info.Ah - 1) {
+    // section G.1.1.1.2 : Successive approximation control only improves
+    // by one bit at a time. But it's not always respected, so we just issue
+    // a warning.
+    JXL_WARNING("Invalid progressive parameters: Al=%d Ah=%d", scan_info.Al,
+                scan_info.Ah);
+  }
+  // Check that all the Huffman tables needed for this scan are defined.
+  for (size_t i = 0; i < comps_in_scan; ++i) {
+    bool found_dc_table = false;
+    bool found_ac_table = false;
+    for (size_t j = 0; j < jpg->huffman_code.size(); ++j) {
+      uint32_t slot_id = jpg->huffman_code[j].slot_id;
+      if (slot_id == scan_info.components[i].dc_tbl_idx) {
+        found_dc_table = true;
+      } else if (slot_id == scan_info.components[i].ac_tbl_idx + 16) {
+        found_ac_table = true;
+      }
+    }
+    if (scan_info.Ss == 0 && !found_dc_table) {
+      return JXL_FAILURE(
+          "SOS marker: Could not find DC Huffman table with index %d",
+          scan_info.components[i].dc_tbl_idx);
+    }
+    if (scan_info.Se > 0 && !found_ac_table) {
+      return JXL_FAILURE(
+          "SOS marker: Could not find AC Huffman table with index %d",
+          scan_info.components[i].ac_tbl_idx);
+    }
+  }
+  jpg->scan_info.push_back(scan_info);
+  JXL_JPEG_VERIFY_MARKER_END();
+  return true;
+}
+
+// Reads the Define Huffman Table (DHT) marker segment and fills in *jpg with
+// the parsed data. Builds the Huffman decoding table in either dc_huff_lut or
+// ac_huff_lut, depending on the type and solt_id of Huffman code being read.
+bool ProcessDHT(const uint8_t* data, const size_t len, JpegReadMode mode,
+                std::vector<HuffmanTableEntry>* dc_huff_lut,
+                std::vector<HuffmanTableEntry>* ac_huff_lut, size_t* pos,
+                JPEGData* jpg) {
+  const size_t start_pos = *pos;
+  JXL_JPEG_VERIFY_LEN(2);
+  size_t marker_len = ReadUint16(data, pos);
+  if (marker_len == 2) {
+    return JXL_FAILURE("DHT marker: no Huffman table found");
+  }
+  while (*pos < start_pos + marker_len) {
+    JXL_JPEG_VERIFY_LEN(1 + kJpegHuffmanMaxBitLength);
+    JPEGHuffmanCode huff;
+    huff.slot_id = ReadUint8(data, pos);
+    int huffman_index = huff.slot_id;
+    int is_ac_table = (huff.slot_id & 0x10) != 0;
+    HuffmanTableEntry* huff_lut;
+    if (is_ac_table) {
+      huffman_index -= 0x10;
+      JXL_JPEG_VERIFY_INPUT(huffman_index, 0, 3, HUFFMAN_INDEX);
+      huff_lut = &(*ac_huff_lut)[huffman_index * kJpegHuffmanLutSize];
+    } else {
+      JXL_JPEG_VERIFY_INPUT(huffman_index, 0, 3, HUFFMAN_INDEX);
+      huff_lut = &(*dc_huff_lut)[huffman_index * kJpegHuffmanLutSize];
+    }
+    huff.counts[0] = 0;
+    int total_count = 0;
+    int space = 1 << kJpegHuffmanMaxBitLength;
+    int max_depth = 1;
+    for (size_t i = 1; i <= kJpegHuffmanMaxBitLength; ++i) {
+      int count = ReadUint8(data, pos);
+      if (count != 0) {
+        max_depth = i;
+      }
+      huff.counts[i] = count;
+      total_count += count;
+      space -= count * (1 << (kJpegHuffmanMaxBitLength - i));
+    }
+    if (is_ac_table) {
+      JXL_JPEG_VERIFY_INPUT(total_count, 0, kJpegHuffmanAlphabetSize,
+                            HUFFMAN_CODE);
+    } else {
+      JXL_JPEG_VERIFY_INPUT(total_count, 0, kJpegDCAlphabetSize, HUFFMAN_CODE);
+    }
+    JXL_JPEG_VERIFY_LEN(total_count);
+    std::vector<bool> values_seen(256, false);
+    for (int i = 0; i < total_count; ++i) {
+      int value = ReadUint8(data, pos);
+      if (!is_ac_table) {
+        JXL_JPEG_VERIFY_INPUT(value, 0, kJpegDCAlphabetSize - 1, HUFFMAN_CODE);
+      }
+      if (values_seen[value]) {
+        return JXL_FAILURE("Duplicate Huffman code value %d", value);
+      }
+      values_seen[value] = true;
+      huff.values[i] = value;
+    }
+    // Add an invalid symbol that will have the all 1 code.
+    ++huff.counts[max_depth];
+    huff.values[total_count] = kJpegHuffmanAlphabetSize;
+    space -= (1 << (kJpegHuffmanMaxBitLength - max_depth));
+    if (space < 0) {
+      return JXL_FAILURE("Invalid Huffman code lengths.");
+    } else if (space > 0 && huff_lut[0].value != 0xffff) {
+      // Re-initialize the values to an invalid symbol so that we can recognize
+      // it when reading the bit stream using a Huffman code with space > 0.
+      for (int i = 0; i < kJpegHuffmanLutSize; ++i) {
+        huff_lut[i].bits = 0;
+        huff_lut[i].value = 0xffff;
+      }
+    }
+    huff.is_last = (*pos == start_pos + marker_len);
+    if (mode == JpegReadMode::kReadAll) {
+      BuildJpegHuffmanTable(&huff.counts[0], &huff.values[0], huff_lut);
+    }
+    jpg->huffman_code.push_back(huff);
+  }
+  JXL_JPEG_VERIFY_MARKER_END();
+  return true;
+}
+
+// Reads the Define Quantization Table (DQT) marker segment and fills in *jpg
+// with the parsed data.
+bool ProcessDQT(const uint8_t* data, const size_t len, size_t* pos,
+                JPEGData* jpg) {
+  const size_t start_pos = *pos;
+  JXL_JPEG_VERIFY_LEN(2);
+  size_t marker_len = ReadUint16(data, pos);
+  if (marker_len == 2) {
+    return JXL_FAILURE("DQT marker: no quantization table found");
+  }
+  while (*pos < start_pos + marker_len && jpg->quant.size() < kMaxQuantTables) {
+    JXL_JPEG_VERIFY_LEN(1);
+    int quant_table_index = ReadUint8(data, pos);
+    int quant_table_precision = quant_table_index >> 4;
+    JXL_JPEG_VERIFY_INPUT(quant_table_precision, 0, 1, QUANT_TBL_PRECISION);
+    quant_table_index &= 0xf;
+    JXL_JPEG_VERIFY_INPUT(quant_table_index, 0, 3, QUANT_TBL_INDEX);
+    JXL_JPEG_VERIFY_LEN((quant_table_precision + 1) * kDCTBlockSize);
+    JPEGQuantTable table;
+    table.index = quant_table_index;
+    table.precision = quant_table_precision;
+    for (size_t i = 0; i < kDCTBlockSize; ++i) {
+      int quant_val =
+          quant_table_precision ? ReadUint16(data, pos) : ReadUint8(data, pos);
+      JXL_JPEG_VERIFY_INPUT(quant_val, 1, 65535, QUANT_VAL);
+      table.values[kJPEGNaturalOrder[i]] = quant_val;
+    }
+    table.is_last = (*pos == start_pos + marker_len);
+    jpg->quant.push_back(table);
+  }
+  JXL_JPEG_VERIFY_MARKER_END();
+  return true;
+}
+
+// Reads the DRI marker and saves the restart interval into *jpg.
+bool ProcessDRI(const uint8_t* data, const size_t len, size_t* pos,
+                bool* found_dri, JPEGData* jpg) {
+  if (*found_dri) {
+    return JXL_FAILURE("Duplicate DRI marker.");
+  }
+  *found_dri = true;
+  const size_t start_pos = *pos;
+  JXL_JPEG_VERIFY_LEN(4);
+  size_t marker_len = ReadUint16(data, pos);
+  int restart_interval = ReadUint16(data, pos);
+  jpg->restart_interval = restart_interval;
+  JXL_JPEG_VERIFY_MARKER_END();
+  return true;
+}
+
+// Saves the APP marker segment as a string to *jpg.
+bool ProcessAPP(const uint8_t* data, const size_t len, size_t* pos,
+                JPEGData* jpg) {
+  JXL_JPEG_VERIFY_LEN(2);
+  size_t marker_len = ReadUint16(data, pos);
+  JXL_JPEG_VERIFY_INPUT(marker_len, 2, 65535, MARKER_LEN);
+  JXL_JPEG_VERIFY_LEN(marker_len - 2);
+  JXL_DASSERT(*pos >= 3);
+  // Save the marker type together with the app data.
+  const uint8_t* app_str_start = data + *pos - 3;
+  std::vector<uint8_t> app_str(app_str_start, app_str_start + marker_len + 1);
+  *pos += marker_len - 2;
+  jpg->app_data.push_back(app_str);
+  return true;
+}
+
+// Saves the COM marker segment as a string to *jpg.
+bool ProcessCOM(const uint8_t* data, const size_t len, size_t* pos,
+                JPEGData* jpg) {
+  JXL_JPEG_VERIFY_LEN(2);
+  size_t marker_len = ReadUint16(data, pos);
+  JXL_JPEG_VERIFY_INPUT(marker_len, 2, 65535, MARKER_LEN);
+  JXL_JPEG_VERIFY_LEN(marker_len - 2);
+  const uint8_t* com_str_start = data + *pos - 3;
+  std::vector<uint8_t> com_str(com_str_start, com_str_start + marker_len + 1);
+  *pos += marker_len - 2;
+  jpg->com_data.push_back(com_str);
+  return true;
+}
+
+// Helper structure to read bits from the entropy coded data segment.
+struct BitReaderState {
+  BitReaderState(const uint8_t* data, const size_t len, size_t pos)
+      : data_(data), len_(len) {
+    Reset(pos);
+  }
+
+  void Reset(size_t pos) {
+    pos_ = pos;
+    val_ = 0;
+    bits_left_ = 0;
+    next_marker_pos_ = len_ - 2;
+    FillBitWindow();
+  }
+
+  // Returns the next byte and skips the 0xff/0x00 escape sequences.
+  uint8_t GetNextByte() {
+    if (pos_ >= next_marker_pos_) {
+      ++pos_;
+      return 0;
+    }
+    uint8_t c = data_[pos_++];
+    if (c == 0xff) {
+      uint8_t escape = data_[pos_];
+      if (escape == 0) {
+        ++pos_;
+      } else {
+        // 0xff was followed by a non-zero byte, which means that we found the
+        // start of the next marker segment.
+        next_marker_pos_ = pos_ - 1;
+      }
+    }
+    return c;
+  }
+
+  void FillBitWindow() {
+    if (bits_left_ <= 16) {
+      while (bits_left_ <= 56) {
+        val_ <<= 8;
+        val_ |= (uint64_t)GetNextByte();
+        bits_left_ += 8;
+      }
+    }
+  }
+
+  int ReadBits(int nbits) {
+    FillBitWindow();
+    uint64_t val = (val_ >> (bits_left_ - nbits)) & ((1ULL << nbits) - 1);
+    bits_left_ -= nbits;
+    return val;
+  }
+
+  // Sets *pos to the next stream position where parsing should continue.
+  // Enqueue the padding bits seen (0 or 1).
+  // Returns false if there is inconsistent or invalid padding or the stream
+  // ended too early.
+  bool FinishStream(JPEGData* jpg, size_t* pos) {
+    int npadbits = bits_left_ & 7;
+    if (npadbits > 0) {
+      uint64_t padmask = (1ULL << npadbits) - 1;
+      uint64_t padbits = (val_ >> (bits_left_ - npadbits)) & padmask;
+      if (padbits != padmask) {
+        jpg->has_zero_padding_bit = true;
+      }
+      for (int i = npadbits - 1; i >= 0; --i) {
+        jpg->padding_bits.push_back((padbits >> i) & 1);
+      }
+    }
+    // Give back some bytes that we did not use.
+    int unused_bytes_left = bits_left_ >> 3;
+    while (unused_bytes_left-- > 0) {
+      --pos_;
+      // If we give back a 0 byte, we need to check if it was a 0xff/0x00 escape
+      // sequence, and if yes, we need to give back one more byte.
+      if (pos_ < next_marker_pos_ && data_[pos_] == 0 &&
+          data_[pos_ - 1] == 0xff) {
+        --pos_;
+      }
+    }
+    if (pos_ > next_marker_pos_) {
+      // Data ran out before the scan was complete.
+      return JXL_FAILURE("Unexpected end of scan.");
+    }
+    *pos = pos_;
+    return true;
+  }
+
+  const uint8_t* data_;
+  const size_t len_;
+  size_t pos_;
+  uint64_t val_;
+  int bits_left_;
+  size_t next_marker_pos_;
+};
+
+// Returns the next Huffman-coded symbol.
+int ReadSymbol(const HuffmanTableEntry* table, BitReaderState* br) {
+  int nbits;
+  br->FillBitWindow();
+  int val = (br->val_ >> (br->bits_left_ - 8)) & 0xff;
+  table += val;
+  nbits = table->bits - 8;
+  if (nbits > 0) {
+    br->bits_left_ -= 8;
+    table += table->value;
+    val = (br->val_ >> (br->bits_left_ - nbits)) & ((1 << nbits) - 1);
+    table += val;
+  }
+  br->bits_left_ -= table->bits;
+  return table->value;
+}
+
+/**
+ * Returns the DC diff or AC value for extra bits value x and prefix code s.
+ *
+ * CCITT Rec. T.81 (1992 E)
+ * Table F.1 – Difference magnitude categories for DC coding
+ *  SSSS | DIFF values
+ * ------+--------------------------
+ *     0 | 0
+ *     1 | –1, 1
+ *     2 | –3, –2, 2, 3
+ *     3 | –7..–4, 4..7
+ * ......|..........................
+ *    11 | –2047..–1024, 1024..2047
+ *
+ * CCITT Rec. T.81 (1992 E)
+ * Table F.2 – Categories assigned to coefficient values
+ * [ Same as Table F.1, but does not include SSSS equal to 0 and 11]
+ *
+ *
+ * CCITT Rec. T.81 (1992 E)
+ * F.1.2.1.1 Structure of DC code table
+ * For each category,... additional bits... appended... to uniquely identify
+ * which difference... occurred... When DIFF is positive... SSSS... bits of DIFF
+ * are appended. When DIFF is negative... SSSS... bits of (DIFF – 1) are
+ * appended... Most significant bit... is 0 for negative differences and 1 for
+ * positive differences.
+ *
+ * In other words the upper half of extra bits range represents DIFF as is.
+ * The lower half represents the negative DIFFs with an offset.
+ */
+int HuffExtend(int x, int s) {
+  JXL_DASSERT(s >= 1);
+  int half = 1 << (s - 1);
+  if (x >= half) {
+    JXL_DASSERT(x < (1 << s));
+    return x;
+  } else {
+    return x - (1 << s) + 1;
+  }
+}
+
+// Decodes one 8x8 block of DCT coefficients from the bit stream.
+bool DecodeDCTBlock(const HuffmanTableEntry* dc_huff,
+                    const HuffmanTableEntry* ac_huff, int Ss, int Se, int Al,
+                    int* eobrun, bool* reset_state, int* num_zero_runs,
+                    BitReaderState* br, JPEGData* jpg, coeff_t* last_dc_coeff,
+                    coeff_t* coeffs) {
+  // Nowadays multiplication is even faster than variable shift.
+  int Am = 1 << Al;
+  bool eobrun_allowed = Ss > 0;
+  if (Ss == 0) {
+    int s = ReadSymbol(dc_huff, br);
+    if (s >= kJpegDCAlphabetSize) {
+      return JXL_FAILURE("Invalid Huffman symbol %d  for DC coefficient.", s);
+    }
+    int diff = 0;
+    if (s > 0) {
+      int bits = br->ReadBits(s);
+      diff = HuffExtend(bits, s);
+    }
+    int coeff = diff + *last_dc_coeff;
+    const int dc_coeff = coeff * Am;
+    coeffs[0] = dc_coeff;
+    // TODO(eustas): is there a more elegant / explicit way to check this?
+    if (dc_coeff != coeffs[0]) {
+      return JXL_FAILURE("Invalid DC coefficient %d", dc_coeff);
+    }
+    *last_dc_coeff = coeff;
+    ++Ss;
+  }
+  if (Ss > Se) {
+    return true;
+  }
+  if (*eobrun > 0) {
+    --(*eobrun);
+    return true;
+  }
+  *num_zero_runs = 0;
+  for (int k = Ss; k <= Se; k++) {
+    int sr = ReadSymbol(ac_huff, br);
+    if (sr >= kJpegHuffmanAlphabetSize) {
+      return JXL_FAILURE("Invalid Huffman symbol %d for AC coefficient %d", sr,
+                         k);
+    }
+    int r = sr >> 4;
+    int s = sr & 15;
+    if (s > 0) {
+      k += r;
+      if (k > Se) {
+        return JXL_FAILURE("Out-of-band coefficient %d band was %d-%d", k, Ss,
+                           Se);
+      }
+      if (s + Al >= kJpegDCAlphabetSize) {
+        return JXL_FAILURE(
+            "Out of range AC coefficient value: s = %d Al = %d k = %d", s, Al,
+            k);
+      }
+      int bits = br->ReadBits(s);
+      int coeff = HuffExtend(bits, s);
+      coeffs[kJPEGNaturalOrder[k]] = coeff * Am;
+      *num_zero_runs = 0;
+    } else if (r == 15) {
+      k += 15;
+      ++(*num_zero_runs);
+    } else {
+      if (eobrun_allowed && k == Ss && *eobrun == 0) {
+        // We have two end-of-block runs right after each other, so we signal
+        // the jpeg encoder to force a state reset at this point.
+        *reset_state = true;
+      }
+      *eobrun = 1 << r;
+      if (r > 0) {
+        if (!eobrun_allowed) {
+          return JXL_FAILURE("End-of-block run crossing DC coeff.");
+        }
+        *eobrun += br->ReadBits(r);
+      }
+      break;
+    }
+  }
+  --(*eobrun);
+  return true;
+}
+
+bool RefineDCTBlock(const HuffmanTableEntry* ac_huff, int Ss, int Se, int Al,
+                    int* eobrun, bool* reset_state, BitReaderState* br,
+                    JPEGData* jpg, coeff_t* coeffs) {
+  // Nowadays multiplication is even faster than variable shift.
+  int Am = 1 << Al;
+  bool eobrun_allowed = Ss > 0;
+  if (Ss == 0) {
+    int s = br->ReadBits(1);
+    coeff_t dc_coeff = coeffs[0];
+    dc_coeff |= s * Am;
+    coeffs[0] = dc_coeff;
+    ++Ss;
+  }
+  if (Ss > Se) {
+    return true;
+  }
+  int p1 = Am;
+  int m1 = -Am;
+  int k = Ss;
+  int r;
+  int s;
+  bool in_zero_run = false;
+  if (*eobrun <= 0) {
+    for (; k <= Se; k++) {
+      s = ReadSymbol(ac_huff, br);
+      if (s >= kJpegHuffmanAlphabetSize) {
+        return JXL_FAILURE("Invalid Huffman symbol %d for AC coefficient %d", s,
+                           k);
+      }
+      r = s >> 4;
+      s &= 15;
+      if (s) {
+        if (s != 1) {
+          return JXL_FAILURE("Invalid Huffman symbol %d for AC coefficient %d",
+                             s, k);
+        }
+        s = br->ReadBits(1) ? p1 : m1;
+        in_zero_run = false;
+      } else {
+        if (r != 15) {
+          if (eobrun_allowed && k == Ss && *eobrun == 0) {
+            // We have two end-of-block runs right after each other, so we
+            // signal the jpeg encoder to force a state reset at this point.
+            *reset_state = true;
+          }
+          *eobrun = 1 << r;
+          if (r > 0) {
+            if (!eobrun_allowed) {
+              return JXL_FAILURE("End-of-block run crossing DC coeff.");
+            }
+            *eobrun += br->ReadBits(r);
+          }
+          break;
+        }
+        in_zero_run = true;
+      }
+      do {
+        coeff_t thiscoef = coeffs[kJPEGNaturalOrder[k]];
+        if (thiscoef != 0) {
+          if (br->ReadBits(1)) {
+            if ((thiscoef & p1) == 0) {
+              if (thiscoef >= 0) {
+                thiscoef += p1;
+              } else {
+                thiscoef += m1;
+              }
+            }
+          }
+          coeffs[kJPEGNaturalOrder[k]] = thiscoef;
+        } else {
+          if (--r < 0) {
+            break;
+          }
+        }
+        k++;
+      } while (k <= Se);
+      if (s) {
+        if (k > Se) {
+          return JXL_FAILURE("Out-of-band coefficient %d band was %d-%d", k, Ss,
+                             Se);
+        }
+        coeffs[kJPEGNaturalOrder[k]] = s;
+      }
+    }
+  }
+  if (in_zero_run) {
+    return JXL_FAILURE("Extra zero run before end-of-block.");
+  }
+  if (*eobrun > 0) {
+    for (; k <= Se; k++) {
+      coeff_t thiscoef = coeffs[kJPEGNaturalOrder[k]];
+      if (thiscoef != 0) {
+        if (br->ReadBits(1)) {
+          if ((thiscoef & p1) == 0) {
+            if (thiscoef >= 0) {
+              thiscoef += p1;
+            } else {
+              thiscoef += m1;
+            }
+          }
+        }
+        coeffs[kJPEGNaturalOrder[k]] = thiscoef;
+      }
+    }
+  }
+  --(*eobrun);
+  return true;
+}
+
+bool ProcessRestart(const uint8_t* data, const size_t len,
+                    int* next_restart_marker, BitReaderState* br,
+                    JPEGData* jpg) {
+  size_t pos = 0;
+  if (!br->FinishStream(jpg, &pos)) {
+    return JXL_FAILURE("Invalid scan");
+  }
+  int expected_marker = 0xd0 + *next_restart_marker;
+  JXL_JPEG_EXPECT_MARKER();
+  int marker = data[pos + 1];
+  if (marker != expected_marker) {
+    return JXL_FAILURE("Did not find expected restart marker %d actual %d",
+                       expected_marker, marker);
+  }
+  br->Reset(pos + 2);
+  *next_restart_marker += 1;
+  *next_restart_marker &= 0x7;
+  return true;
+}
+
+bool ProcessScan(const uint8_t* data, const size_t len,
+                 const std::vector<HuffmanTableEntry>& dc_huff_lut,
+                 const std::vector<HuffmanTableEntry>& ac_huff_lut,
+                 uint16_t scan_progression[kMaxComponents][kDCTBlockSize],
+                 bool is_progressive, size_t* pos, JPEGData* jpg) {
+  if (!ProcessSOS(data, len, pos, jpg)) {
+    return false;
+  }
+  JPEGScanInfo* scan_info = &jpg->scan_info.back();
+  bool is_interleaved = (scan_info->num_components > 1);
+  int max_h_samp_factor = 1;
+  int max_v_samp_factor = 1;
+  for (size_t i = 0; i < jpg->components.size(); ++i) {
+    max_h_samp_factor =
+        std::max(max_h_samp_factor, jpg->components[i].h_samp_factor);
+    max_v_samp_factor =
+        std::max(max_v_samp_factor, jpg->components[i].v_samp_factor);
+  }
+
+  int MCU_rows = DivCeil(jpg->height, max_v_samp_factor * 8);
+  int MCUs_per_row = DivCeil(jpg->width, max_h_samp_factor * 8);
+  if (!is_interleaved) {
+    const JPEGComponent& c = jpg->components[scan_info->components[0].comp_idx];
+    MCUs_per_row = DivCeil(jpg->width * c.h_samp_factor, 8 * max_h_samp_factor);
+    MCU_rows = DivCeil(jpg->height * c.v_samp_factor, 8 * max_v_samp_factor);
+  }
+  coeff_t last_dc_coeff[kMaxComponents] = {0};
+  BitReaderState br(data, len, *pos);
+  int restarts_to_go = jpg->restart_interval;
+  int next_restart_marker = 0;
+  int eobrun = -1;
+  int block_scan_index = 0;
+  const int Al = is_progressive ? scan_info->Al : 0;
+  const int Ah = is_progressive ? scan_info->Ah : 0;
+  const int Ss = is_progressive ? scan_info->Ss : 0;
+  const int Se = is_progressive ? scan_info->Se : 63;
+  const uint16_t scan_bitmask = Ah == 0 ? (0xffff << Al) : (1u << Al);
+  const uint16_t refinement_bitmask = (1 << Al) - 1;
+  for (size_t i = 0; i < scan_info->num_components; ++i) {
+    int comp_idx = scan_info->components[i].comp_idx;
+    for (int k = Ss; k <= Se; ++k) {
+      if (scan_progression[comp_idx][k] & scan_bitmask) {
+        return JXL_FAILURE(
+            "Overlapping scans: component=%d k=%d prev_mask: %u cur_mask %u",
+            comp_idx, k, scan_progression[i][k], scan_bitmask);
+      }
+      if (scan_progression[comp_idx][k] & refinement_bitmask) {
+        return JXL_FAILURE(
+            "Invalid scan order, a more refined scan was already done: "
+            "component=%d k=%d prev_mask=%u cur_mask=%u",
+            comp_idx, k, scan_progression[i][k], scan_bitmask);
+      }
+      scan_progression[comp_idx][k] |= scan_bitmask;
+    }
+  }
+  if (Al > 10) {
+    return JXL_FAILURE("Scan parameter Al=%d is not supported.", Al);
+  }
+  for (int mcu_y = 0; mcu_y < MCU_rows; ++mcu_y) {
+    for (int mcu_x = 0; mcu_x < MCUs_per_row; ++mcu_x) {
+      // Handle the restart intervals.
+      if (jpg->restart_interval > 0) {
+        if (restarts_to_go == 0) {
+          if (ProcessRestart(data, len, &next_restart_marker, &br, jpg)) {
+            restarts_to_go = jpg->restart_interval;
+            memset(static_cast<void*>(last_dc_coeff), 0, sizeof(last_dc_coeff));
+            if (eobrun > 0) {
+              return JXL_FAILURE("End-of-block run too long.");
+            }
+            eobrun = -1;  // fresh start
+          } else {
+            return JXL_FAILURE("Could not process restart.");
+          }
+        }
+        --restarts_to_go;
+      }
+      // Decode one MCU.
+      for (size_t i = 0; i < scan_info->num_components; ++i) {
+        JPEGComponentScanInfo* si = &scan_info->components[i];
+        JPEGComponent* c = &jpg->components[si->comp_idx];
+        const HuffmanTableEntry* dc_lut =
+            &dc_huff_lut[si->dc_tbl_idx * kJpegHuffmanLutSize];
+        const HuffmanTableEntry* ac_lut =
+            &ac_huff_lut[si->ac_tbl_idx * kJpegHuffmanLutSize];
+        int nblocks_y = is_interleaved ? c->v_samp_factor : 1;
+        int nblocks_x = is_interleaved ? c->h_samp_factor : 1;
+        for (int iy = 0; iy < nblocks_y; ++iy) {
+          for (int ix = 0; ix < nblocks_x; ++ix) {
+            int block_y = mcu_y * nblocks_y + iy;
+            int block_x = mcu_x * nblocks_x + ix;
+            int block_idx = block_y * c->width_in_blocks + block_x;
+            bool reset_state = false;
+            int num_zero_runs = 0;
+            coeff_t* coeffs = &c->coeffs[block_idx * kDCTBlockSize];
+            if (Ah == 0) {
+              if (!DecodeDCTBlock(dc_lut, ac_lut, Ss, Se, Al, &eobrun,
+                                  &reset_state, &num_zero_runs, &br, jpg,
+                                  &last_dc_coeff[si->comp_idx], coeffs)) {
+                return false;
+              }
+            } else {
+              if (!RefineDCTBlock(ac_lut, Ss, Se, Al, &eobrun, &reset_state,
+                                  &br, jpg, coeffs)) {
+                return false;
+              }
+            }
+            if (reset_state) {
+              scan_info->reset_points.emplace_back(block_scan_index);
+            }
+            if (num_zero_runs > 0) {
+              JPEGScanInfo::ExtraZeroRunInfo info;
+              info.block_idx = block_scan_index;
+              info.num_extra_zero_runs = num_zero_runs;
+              scan_info->extra_zero_runs.push_back(info);
+            }
+            ++block_scan_index;
+          }
+        }
+      }
+    }
+  }
+  if (eobrun > 0) {
+    return JXL_FAILURE("End-of-block run too long.");
+  }
+  if (!br.FinishStream(jpg, pos)) {
+    return JXL_FAILURE("Invalid scan.");
+  }
+  if (*pos > len) {
+    return JXL_FAILURE("Unexpected end of file during scan. pos=%" PRIuS
+                       " len=%" PRIuS,
+                       *pos, len);
+  }
+  return true;
+}
+
+// Changes the quant_idx field of the components to refer to the index of the
+// quant table in the jpg->quant array.
+bool FixupIndexes(JPEGData* jpg) {
+  for (size_t i = 0; i < jpg->components.size(); ++i) {
+    JPEGComponent* c = &jpg->components[i];
+    bool found_index = false;
+    for (size_t j = 0; j < jpg->quant.size(); ++j) {
+      if (jpg->quant[j].index == c->quant_idx) {
+        c->quant_idx = j;
+        found_index = true;
+        break;
+      }
+    }
+    if (!found_index) {
+      return JXL_FAILURE("Quantization table with index %u not found",
+                         c->quant_idx);
+    }
+  }
+  return true;
+}
+
+size_t FindNextMarker(const uint8_t* data, const size_t len, size_t pos) {
+  // kIsValidMarker[i] == 1 means (0xc0 + i) is a valid marker.
+  static const uint8_t kIsValidMarker[] = {
+      1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
+      1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+      1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
+  };
+  size_t num_skipped = 0;
+  while (pos + 1 < len && (data[pos] != 0xff || data[pos + 1] < 0xc0 ||
+                           !kIsValidMarker[data[pos + 1] - 0xc0])) {
+    ++pos;
+    ++num_skipped;
+  }
+  return num_skipped;
+}
+
+}  // namespace
+
+bool ReadJpeg(const uint8_t* data, const size_t len, JpegReadMode mode,
+              JPEGData* jpg) {
+  size_t pos = 0;
+  // Check SOI marker.
+  JXL_JPEG_EXPECT_MARKER();
+  int marker = data[pos + 1];
+  pos += 2;
+  if (marker != 0xd8) {
+    return JXL_FAILURE("Did not find expected SOI marker, actual=%d", marker);
+  }
+  int lut_size = kMaxHuffmanTables * kJpegHuffmanLutSize;
+  std::vector<HuffmanTableEntry> dc_huff_lut(lut_size);
+  std::vector<HuffmanTableEntry> ac_huff_lut(lut_size);
+  bool found_sof = false;
+  bool found_dri = false;
+  uint16_t scan_progression[kMaxComponents][kDCTBlockSize] = {{0}};
+
+  jpg->padding_bits.resize(0);
+  bool is_progressive = false;  // default
+  do {
+    // Read next marker.
+    size_t num_skipped = FindNextMarker(data, len, pos);
+    if (num_skipped > 0) {
+      // Add a fake marker to indicate arbitrary in-between-markers data.
+      jpg->marker_order.push_back(0xff);
+      jpg->inter_marker_data.emplace_back(data + pos, data + pos + num_skipped);
+      pos += num_skipped;
+    }
+    JXL_JPEG_EXPECT_MARKER();
+    marker = data[pos + 1];
+    pos += 2;
+    bool ok = true;
+    switch (marker) {
+      case 0xc0:
+      case 0xc1:
+      case 0xc2:
+        is_progressive = (marker == 0xc2);
+        ok = ProcessSOF(data, len, mode, &pos, jpg);
+        found_sof = true;
+        break;
+      case 0xc4:
+        ok = ProcessDHT(data, len, mode, &dc_huff_lut, &ac_huff_lut, &pos, jpg);
+        break;
+      case 0xd0:
+      case 0xd1:
+      case 0xd2:
+      case 0xd3:
+      case 0xd4:
+      case 0xd5:
+      case 0xd6:
+      case 0xd7:
+        // RST markers do not have any data.
+        break;
+      case 0xd9:
+        // Found end marker.
+        break;
+      case 0xda:
+        if (mode == JpegReadMode::kReadAll) {
+          ok = ProcessScan(data, len, dc_huff_lut, ac_huff_lut,
+                           scan_progression, is_progressive, &pos, jpg);
+        }
+        break;
+      case 0xdb:
+        ok = ProcessDQT(data, len, &pos, jpg);
+        break;
+      case 0xdd:
+        ok = ProcessDRI(data, len, &pos, &found_dri, jpg);
+        break;
+      case 0xe0:
+      case 0xe1:
+      case 0xe2:
+      case 0xe3:
+      case 0xe4:
+      case 0xe5:
+      case 0xe6:
+      case 0xe7:
+      case 0xe8:
+      case 0xe9:
+      case 0xea:
+      case 0xeb:
+      case 0xec:
+      case 0xed:
+      case 0xee:
+      case 0xef:
+        if (mode != JpegReadMode::kReadTables) {
+          ok = ProcessAPP(data, len, &pos, jpg);
+        }
+        break;
+      case 0xfe:
+        if (mode != JpegReadMode::kReadTables) {
+          ok = ProcessCOM(data, len, &pos, jpg);
+        }
+        break;
+      default:
+        return JXL_FAILURE("Unsupported marker: %d pos=%" PRIuS " len=%" PRIuS,
+                           marker, pos, len);
+    }
+    if (!ok) {
+      return false;
+    }
+    jpg->marker_order.push_back(marker);
+    if (mode == JpegReadMode::kReadHeader && found_sof) {
+      break;
+    }
+  } while (marker != 0xd9);
+
+  if (!found_sof) {
+    return JXL_FAILURE("Missing SOF marker.");
+  }
+
+  // Supplemental checks.
+  if (mode == JpegReadMode::kReadAll) {
+    if (pos < len) {
+      jpg->tail_data = std::vector<uint8_t>(data + pos, data + len);
+    }
+    if (!FixupIndexes(jpg)) {
+      return false;
+    }
+    if (jpg->huffman_code.empty()) {
+      // Section B.2.4.2: "If a table has never been defined for a particular
+      // destination, then when this destination is specified in a scan header,
+      // the results are unpredictable."
+      return JXL_FAILURE("Need at least one Huffman code table.");
+    }
+    if (jpg->huffman_code.size() >= kMaxDHTMarkers) {
+      return JXL_FAILURE("Too many Huffman tables.");
+    }
+  }
+  return true;
+}
+
+}  // namespace jpeg
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/jpeg/enc_jpeg_data_reader.h b/third_party/jpeg-xl/lib/jxl/jpeg/enc_jpeg_data_reader.h
new file mode 100644
index 0000000000..3fad820e9d
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/jpeg/enc_jpeg_data_reader.h
@@ -0,0 +1,36 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Functions for reading a jpeg byte stream into a JPEGData object.
+
+#ifndef LIB_JXL_JPEG_ENC_JPEG_DATA_READER_H_
+#define LIB_JXL_JPEG_ENC_JPEG_DATA_READER_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "lib/jxl/jpeg/jpeg_data.h"
+
+namespace jxl {
+namespace jpeg {
+
+enum class JpegReadMode {
+  kReadHeader,  // only basic headers
+  kReadTables,  // headers and tables (quant, Huffman, ...)
+  kReadAll,     // everything
+};
+
+// Parses the JPEG stream contained in data[*pos ... len) and fills in *jpg with
+// the parsed information.
+// If mode is kReadHeader, it fills in only the image dimensions in *jpg.
+// Returns false if the data is not valid JPEG, or if it contains an unsupported
+// JPEG feature.
+bool ReadJpeg(const uint8_t* data, const size_t len, JpegReadMode mode,
+              JPEGData* jpg);
+
+}  // namespace jpeg
+}  // namespace jxl
+
+#endif  // LIB_JXL_JPEG_ENC_JPEG_DATA_READER_H_
diff --git a/third_party/jpeg-xl/lib/jxl/jpeg/enc_jpeg_huffman_decode.cc b/third_party/jpeg-xl/lib/jxl/jpeg/enc_jpeg_huffman_decode.cc
new file mode 100644
index 0000000000..38282e640a
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/jpeg/enc_jpeg_huffman_decode.cc
@@ -0,0 +1,103 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/jpeg/enc_jpeg_huffman_decode.h"
+
+#include "lib/jxl/jpeg/jpeg_data.h"
+
+namespace jxl {
+namespace jpeg {
+
+// Returns the table width of the next 2nd level table, count is the histogram
+// of bit lengths for the remaining symbols, len is the code length of the next
+// processed symbol.
+static inline int NextTableBitSize(const int* count, int len) {
+  int left = 1 << (len - kJpegHuffmanRootTableBits);
+  while (len < static_cast<int>(kJpegHuffmanMaxBitLength)) {
+    left -= count[len];
+    if (left <= 0) break;
+    ++len;
+    left <<= 1;
+  }
+  return len - kJpegHuffmanRootTableBits;
+}
+
+void BuildJpegHuffmanTable(const uint32_t* count, const uint32_t* symbols,
+                           HuffmanTableEntry* lut) {
+  HuffmanTableEntry code;    // current table entry
+  HuffmanTableEntry* table;  // next available space in table
+  int len;                   // current code length
+  int idx;                   // symbol index
+  int key;                   // prefix code
+  int reps;                  // number of replicate key values in current table
+  int low;                   // low bits for current root entry
+  int table_bits;            // key length of current table
+  int table_size;            // size of current table
+
+  // Make a local copy of the input bit length histogram.
+  int tmp_count[kJpegHuffmanMaxBitLength + 1] = {0};
+  int total_count = 0;
+  for (len = 1; len <= static_cast<int>(kJpegHuffmanMaxBitLength); ++len) {
+    tmp_count[len] = count[len];
+    total_count += tmp_count[len];
+  }
+
+  table = lut;
+  table_bits = kJpegHuffmanRootTableBits;
+  table_size = 1 << table_bits;
+
+  // Special case code with only one value.
+  if (total_count == 1) {
+    code.bits = 0;
+    code.value = symbols[0];
+    for (key = 0; key < table_size; ++key) {
+      table[key] = code;
+    }
+    return;
+  }
+
+  // Fill in root table.
+  key = 0;
+  idx = 0;
+  for (len = 1; len <= kJpegHuffmanRootTableBits; ++len) {
+    for (; tmp_count[len] > 0; --tmp_count[len]) {
+      code.bits = len;
+      code.value = symbols[idx++];
+      reps = 1 << (kJpegHuffmanRootTableBits - len);
+      while (reps--) {
+        table[key++] = code;
+      }
+    }
+  }
+
+  // Fill in 2nd level tables and add pointers to root table.
+  table += table_size;
+  table_size = 0;
+  low = 0;
+  for (len = kJpegHuffmanRootTableBits + 1;
+       len <= static_cast<int>(kJpegHuffmanMaxBitLength); ++len) {
+    for (; tmp_count[len] > 0; --tmp_count[len]) {
+      // Start a new sub-table if the previous one is full.
+      if (low >= table_size) {
+        table += table_size;
+        table_bits = NextTableBitSize(tmp_count, len);
+        table_size = 1 << table_bits;
+        low = 0;
+        lut[key].bits = table_bits + kJpegHuffmanRootTableBits;
+        lut[key].value = (table - lut) - key;
+        ++key;
+      }
+      code.bits = len - kJpegHuffmanRootTableBits;
+      code.value = symbols[idx++];
+      reps = 1 << (table_bits - code.bits);
+      while (reps--) {
+        table[low++] = code;
+      }
+    }
+  }
+}
+
+}  // namespace jpeg
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/jpeg/enc_jpeg_huffman_decode.h b/third_party/jpeg-xl/lib/jxl/jpeg/enc_jpeg_huffman_decode.h
new file mode 100644
index 0000000000..b8a60e4107
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/jpeg/enc_jpeg_huffman_decode.h
@@ -0,0 +1,41 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Utility function for building a Huffman lookup table for the jpeg decoder.
+
+#ifndef LIB_JXL_JPEG_ENC_JPEG_HUFFMAN_DECODE_H_
+#define LIB_JXL_JPEG_ENC_JPEG_HUFFMAN_DECODE_H_
+
+#include <stdint.h>
+
+namespace jxl {
+namespace jpeg {
+
+constexpr int kJpegHuffmanRootTableBits = 8;
+// Maximum huffman lookup table size.
+// According to zlib/examples/enough.c, 758 entries are always enough for
+// an alphabet of 257 symbols (256 + 1 special symbol for the all 1s code) and
+// max bit length 16 if the root table has 8 bits.
+constexpr int kJpegHuffmanLutSize = 758;
+
+struct HuffmanTableEntry {
+  // Initialize the value to an invalid symbol so that we can recognize it
+  // when reading the bit stream using a Huffman code with space > 0.
+  HuffmanTableEntry() : bits(0), value(0xffff) {}
+
+  uint8_t bits;    // number of bits used for this symbol
+  uint16_t value;  // symbol value or table offset
+};
+
+// Builds jpeg-style Huffman lookup table from the given symbols.
+// The symbols are in order of increasing bit lengths. The number of symbols
+// with bit length n is given in counts[n] for each n >= 1.
+void BuildJpegHuffmanTable(const uint32_t* counts, const uint32_t* symbols,
+                           HuffmanTableEntry* lut);
+
+}  // namespace jpeg
+}  // namespace jxl
+
+#endif  // LIB_JXL_JPEG_ENC_JPEG_HUFFMAN_DECODE_H_
diff --git a/third_party/jpeg-xl/lib/jxl/jpeg/jpeg_data.cc b/third_party/jpeg-xl/lib/jxl/jpeg/jpeg_data.cc
new file mode 100644
index 0000000000..430707b9ed
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/jpeg/jpeg_data.cc
@@ -0,0 +1,451 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/jpeg/jpeg_data.h"
+
+#include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/base/status.h"
+
+namespace jxl {
+namespace jpeg {
+
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
+
+namespace {
+enum JPEGComponentType : uint32_t {
+  kGray = 0,
+  kYCbCr = 1,
+  kRGB = 2,
+  kCustom = 3,
+};
+
+struct JPEGInfo {
+  size_t num_app_markers = 0;
+  size_t num_com_markers = 0;
+  size_t num_scans = 0;
+  size_t num_intermarker = 0;
+  bool has_dri = false;
+};
+
+Status VisitMarker(uint8_t* marker, Visitor* visitor, JPEGInfo* info) {
+  uint32_t marker32 = *marker - 0xc0;
+  JXL_RETURN_IF_ERROR(visitor->Bits(6, 0x00, &marker32));
+  *marker = marker32 + 0xc0;
+  if ((*marker & 0xf0) == 0xe0) {
+    info->num_app_markers++;
+  }
+  if (*marker == 0xfe) {
+    info->num_com_markers++;
+  }
+  if (*marker == 0xda) {
+    info->num_scans++;
+  }
+  // We use a fake 0xff marker to signal intermarker data.
+  if (*marker == 0xff) {
+    info->num_intermarker++;
+  }
+  if (*marker == 0xdd) {
+    info->has_dri = true;
+  }
+  return true;
+}
+
+}  // namespace
+
+Status JPEGData::VisitFields(Visitor* visitor) {
+  bool is_gray = components.size() == 1;
+  JXL_RETURN_IF_ERROR(visitor->Bool(false, &is_gray));
+  if (visitor->IsReading()) {
+    components.resize(is_gray ? 1 : 3);
+  }
+  JPEGInfo info;
+  if (visitor->IsReading()) {
+    uint8_t marker = 0xc0;
+    do {
+      JXL_RETURN_IF_ERROR(VisitMarker(&marker, visitor, &info));
+      marker_order.push_back(marker);
+      if (marker_order.size() > 16384) {
+        return JXL_FAILURE("Too many markers: %" PRIuS "\n",
+                           marker_order.size());
+      }
+    } while (marker != 0xd9);
+  } else {
+    if (marker_order.size() > 16384) {
+      return JXL_FAILURE("Too many markers: %" PRIuS "\n", marker_order.size());
+    }
+    for (size_t i = 0; i < marker_order.size(); i++) {
+      JXL_RETURN_IF_ERROR(VisitMarker(&marker_order[i], visitor, &info));
+    }
+    if (!marker_order.empty()) {
+      // Last marker should always be EOI marker.
+      JXL_CHECK(marker_order.back() == 0xd9);
+    }
+  }
+
+  // Size of the APP and COM markers.
+  if (visitor->IsReading()) {
+    app_data.resize(info.num_app_markers);
+    app_marker_type.resize(info.num_app_markers);
+    com_data.resize(info.num_com_markers);
+    scan_info.resize(info.num_scans);
+  }
+  JXL_ASSERT(app_data.size() == info.num_app_markers);
+  JXL_ASSERT(app_marker_type.size() == info.num_app_markers);
+  JXL_ASSERT(com_data.size() == info.num_com_markers);
+  JXL_ASSERT(scan_info.size() == info.num_scans);
+  for (size_t i = 0; i < app_data.size(); i++) {
+    auto& app = app_data[i];
+    // Encodes up to 8 different values.
+    JXL_RETURN_IF_ERROR(
+        visitor->U32(Val(0), Val(1), BitsOffset(1, 2), BitsOffset(2, 4), 0,
+                     reinterpret_cast<uint32_t*>(&app_marker_type[i])));
+    if (app_marker_type[i] != AppMarkerType::kUnknown &&
+        app_marker_type[i] != AppMarkerType::kICC &&
+        app_marker_type[i] != AppMarkerType::kExif &&
+        app_marker_type[i] != AppMarkerType::kXMP) {
+      return JXL_FAILURE("Unknown app marker type %u",
+                         static_cast<uint32_t>(app_marker_type[i]));
+    }
+    uint32_t len = app.size() - 1;
+    JXL_RETURN_IF_ERROR(visitor->Bits(16, 0, &len));
+    if (visitor->IsReading()) app.resize(len + 1);
+    if (app.size() < 3) {
+      return JXL_FAILURE("Invalid marker size: %" PRIuS "\n", app.size());
+    }
+  }
+  for (auto& com : com_data) {
+    uint32_t len = com.size() - 1;
+    JXL_RETURN_IF_ERROR(visitor->Bits(16, 0, &len));
+    if (visitor->IsReading()) com.resize(len + 1);
+    if (com.size() < 3) {
+      return JXL_FAILURE("Invalid marker size: %" PRIuS "\n", com.size());
+    }
+  }
+
+  uint32_t num_quant_tables = quant.size();
+  JXL_RETURN_IF_ERROR(
+      visitor->U32(Val(1), Val(2), Val(3), Val(4), 2, &num_quant_tables));
+  if (num_quant_tables == 4) {
+    return JXL_FAILURE("Invalid number of quant tables");
+  }
+  if (visitor->IsReading()) {
+    quant.resize(num_quant_tables);
+  }
+  for (size_t i = 0; i < num_quant_tables; i++) {
+    if (quant[i].precision > 1) {
+      return JXL_FAILURE(
+          "Quant tables with more than 16 bits are not supported");
+    }
+    JXL_RETURN_IF_ERROR(visitor->Bits(1, 0, &quant[i].precision));
+    JXL_RETURN_IF_ERROR(visitor->Bits(2, i, &quant[i].index));
+    JXL_RETURN_IF_ERROR(visitor->Bool(true, &quant[i].is_last));
+  }
+
+  JPEGComponentType component_type =
+      components.size() == 1 && components[0].id == 1 ? JPEGComponentType::kGray
+      : components.size() == 3 && components[0].id == 1 &&
+              components[1].id == 2 && components[2].id == 3
+          ? JPEGComponentType::kYCbCr
+      : components.size() == 3 && components[0].id == 'R' &&
+              components[1].id == 'G' && components[2].id == 'B'
+          ? JPEGComponentType::kRGB
+          : JPEGComponentType::kCustom;
+  JXL_RETURN_IF_ERROR(
+      visitor->Bits(2, JPEGComponentType::kYCbCr,
+                    reinterpret_cast<uint32_t*>(&component_type)));
+  uint32_t num_components;
+  if (component_type == JPEGComponentType::kGray) {
+    num_components = 1;
+  } else if (component_type != JPEGComponentType::kCustom) {
+    num_components = 3;
+  } else {
+    num_components = components.size();
+    JXL_RETURN_IF_ERROR(
+        visitor->U32(Val(1), Val(2), Val(3), Val(4), 3, &num_components));
+    if (num_components != 1 && num_components != 3) {
+      return JXL_FAILURE("Invalid number of components: %u", num_components);
+    }
+  }
+  if (visitor->IsReading()) {
+    components.resize(num_components);
+  }
+  if (component_type == JPEGComponentType::kCustom) {
+    for (size_t i = 0; i < components.size(); i++) {
+      JXL_RETURN_IF_ERROR(visitor->Bits(8, 0, &components[i].id));
+    }
+  } else if (component_type == JPEGComponentType::kGray) {
+    components[0].id = 1;
+  } else if (component_type == JPEGComponentType::kRGB) {
+    components[0].id = 'R';
+    components[1].id = 'G';
+    components[2].id = 'B';
+  } else {
+    components[0].id = 1;
+    components[1].id = 2;
+    components[2].id = 3;
+  }
+  size_t used_tables = 0;
+  for (size_t i = 0; i < components.size(); i++) {
+    JXL_RETURN_IF_ERROR(visitor->Bits(2, 0, &components[i].quant_idx));
+    if (components[i].quant_idx >= quant.size()) {
+      return JXL_FAILURE("Invalid quant table for component %" PRIuS ": %u\n",
+                         i, components[i].quant_idx);
+    }
+    used_tables |= 1U << components[i].quant_idx;
+  }
+  for (size_t i = 0; i < quant.size(); i++) {
+    if (used_tables & (1 << i)) continue;
+    if (i == 0) return JXL_FAILURE("First quant table unused.");
+    // Unused quant table has to be set to copy of previous quant table
+    for (size_t j = 0; j < 64; j++) {
+      if (quant[i].values[j] != quant[i - 1].values[j]) {
+        return JXL_FAILURE("Non-trivial unused quant table");
+      }
+    }
+  }
+
+  uint32_t num_huff = huffman_code.size();
+  JXL_RETURN_IF_ERROR(visitor->U32(Val(4), BitsOffset(3, 2), BitsOffset(4, 10),
+                                   BitsOffset(6, 26), 4, &num_huff));
+  if (visitor->IsReading()) {
+    huffman_code.resize(num_huff);
+  }
+  for (JPEGHuffmanCode& hc : huffman_code) {
+    bool is_ac = hc.slot_id >> 4;
+    uint32_t id = hc.slot_id & 0xF;
+    JXL_RETURN_IF_ERROR(visitor->Bool(false, &is_ac));
+    JXL_RETURN_IF_ERROR(visitor->Bits(2, 0, &id));
+    hc.slot_id = (static_cast<uint32_t>(is_ac) << 4) | id;
+    JXL_RETURN_IF_ERROR(visitor->Bool(true, &hc.is_last));
+    size_t num_symbols = 0;
+    for (size_t i = 0; i <= 16; i++) {
+      JXL_RETURN_IF_ERROR(visitor->U32(Val(0), Val(1), BitsOffset(3, 2),
+                                       Bits(8), 0, &hc.counts[i]));
+      num_symbols += hc.counts[i];
+    }
+    if (num_symbols < 1) {
+      // Actually, at least 2 symbols are required, since one of them is EOI.
+      return JXL_FAILURE("Empty Huffman table");
+    }
+    if (num_symbols > hc.values.size()) {
+      return JXL_FAILURE("Huffman code too large (%" PRIuS ")", num_symbols);
+    }
+    // Presence flags for 4 * 64 + 1 values.
+    uint64_t value_slots[5] = {};
+    for (size_t i = 0; i < num_symbols; i++) {
+      // Goes up to 256, included. Might have the same symbol appear twice...
+      JXL_RETURN_IF_ERROR(visitor->U32(Bits(2), BitsOffset(2, 4),
+                                       BitsOffset(4, 8), BitsOffset(8, 1), 0,
+                                       &hc.values[i]));
+      value_slots[hc.values[i] >> 6] |= (uint64_t)1 << (hc.values[i] & 0x3F);
+    }
+    if (hc.values[num_symbols - 1] != kJpegHuffmanAlphabetSize) {
+      return JXL_FAILURE("Missing EOI symbol");
+    }
+    // Last element, denoting EOI, have to be 1 after the loop.
+    JXL_ASSERT(value_slots[4] == 1);
+    size_t num_values = 1;
+    for (size_t i = 0; i < 4; ++i) num_values += hwy::PopCount(value_slots[i]);
+    if (num_values != num_symbols) {
+      return JXL_FAILURE("Duplicate Huffman symbols");
+    }
+    if (!is_ac) {
+      bool only_dc = ((value_slots[0] >> kJpegDCAlphabetSize) | value_slots[1] |
+                      value_slots[2] | value_slots[3]) == 0;
+      if (!only_dc) return JXL_FAILURE("Huffman symbols out of DC range");
+    }
+  }
+
+  for (auto& scan : scan_info) {
+    JXL_RETURN_IF_ERROR(
+        visitor->U32(Val(1), Val(2), Val(3), Val(4), 1, &scan.num_components));
+    if (scan.num_components >= 4) {
+      return JXL_FAILURE("Invalid number of components in SOS marker");
+    }
+    JXL_RETURN_IF_ERROR(visitor->Bits(6, 0, &scan.Ss));
+    JXL_RETURN_IF_ERROR(visitor->Bits(6, 63, &scan.Se));
+    JXL_RETURN_IF_ERROR(visitor->Bits(4, 0, &scan.Al));
+    JXL_RETURN_IF_ERROR(visitor->Bits(4, 0, &scan.Ah));
+    for (size_t i = 0; i < scan.num_components; i++) {
+      JXL_RETURN_IF_ERROR(visitor->Bits(2, 0, &scan.components[i].comp_idx));
+      if (scan.components[i].comp_idx >= components.size()) {
+        return JXL_FAILURE("Invalid component idx in SOS marker");
+      }
+      JXL_RETURN_IF_ERROR(visitor->Bits(2, 0, &scan.components[i].ac_tbl_idx));
+      JXL_RETURN_IF_ERROR(visitor->Bits(2, 0, &scan.components[i].dc_tbl_idx));
+    }
+    // TODO(veluca): actually set and use this value.
+    JXL_RETURN_IF_ERROR(visitor->U32(Val(0), Val(1), Val(2), BitsOffset(3, 3),
+                                     kMaxNumPasses - 1,
+                                     &scan.last_needed_pass));
+  }
+
+  // From here on, this is data that is not strictly necessary to get a valid
+  // JPEG, but necessary for bit-exact JPEG reconstruction.
+  if (info.has_dri) {
+    JXL_RETURN_IF_ERROR(visitor->Bits(16, 0, &restart_interval));
+  }
+
+  for (auto& scan : scan_info) {
+    uint32_t num_reset_points = scan.reset_points.size();
+    JXL_RETURN_IF_ERROR(visitor->U32(Val(0), BitsOffset(2, 1), BitsOffset(4, 4),
+                                     BitsOffset(16, 20), 0, &num_reset_points));
+    if (visitor->IsReading()) {
+      scan.reset_points.resize(num_reset_points);
+    }
+    int last_block_idx = -1;
+    for (auto& block_idx : scan.reset_points) {
+      block_idx -= last_block_idx + 1;
+      JXL_RETURN_IF_ERROR(visitor->U32(Val(0), BitsOffset(3, 1),
+                                       BitsOffset(5, 9), BitsOffset(28, 41), 0,
+                                       &block_idx));
+      block_idx += last_block_idx + 1;
+      if (static_cast<int>(block_idx) < last_block_idx + 1) {
+        return JXL_FAILURE("Invalid block ID: %u, last block was %d", block_idx,
+                           last_block_idx);
+      }
+      // TODO(eustas): better upper boundary could be given at this point; also
+      //               it could be applied during reset_points reading.
+      if (block_idx > (1u << 30)) {
+        // At most 8K x 8K x num_channels blocks are expected. That is,
+        // typically, 1.5 * 2^27. 2^30 should be sufficient for any sane
+        // image.
+        return JXL_FAILURE("Invalid block ID: %u", block_idx);
+      }
+      last_block_idx = block_idx;
+    }
+
+    uint32_t num_extra_zero_runs = scan.extra_zero_runs.size();
+    JXL_RETURN_IF_ERROR(visitor->U32(Val(0), BitsOffset(2, 1), BitsOffset(4, 4),
+                                     BitsOffset(16, 20), 0,
+                                     &num_extra_zero_runs));
+    if (visitor->IsReading()) {
+      scan.extra_zero_runs.resize(num_extra_zero_runs);
+    }
+    last_block_idx = -1;
+    for (size_t i = 0; i < scan.extra_zero_runs.size(); ++i) {
+      uint32_t& block_idx = scan.extra_zero_runs[i].block_idx;
+      JXL_RETURN_IF_ERROR(visitor->U32(
+          Val(1), BitsOffset(2, 2), BitsOffset(4, 5), BitsOffset(8, 20), 1,
+          &scan.extra_zero_runs[i].num_extra_zero_runs));
+      block_idx -= last_block_idx + 1;
+      JXL_RETURN_IF_ERROR(visitor->U32(Val(0), BitsOffset(3, 1),
+                                       BitsOffset(5, 9), BitsOffset(28, 41), 0,
+                                       &block_idx));
+      block_idx += last_block_idx + 1;
+      if (static_cast<int>(block_idx) < last_block_idx + 1) {
+        return JXL_FAILURE("Invalid block ID: %u, last block was %d", block_idx,
+                           last_block_idx);
+      }
+      if (block_idx > (1u << 30)) {
+        // At most 8K x 8K x num_channels blocks are expected. That is,
+        // typically, 1.5 * 2^27. 2^30 should be sufficient for any sane
+        // image.
+        return JXL_FAILURE("Invalid block ID: %u", block_idx);
+      }
+      last_block_idx = block_idx;
+    }
+  }
+  std::vector<uint32_t> inter_marker_data_sizes;
+  inter_marker_data_sizes.reserve(info.num_intermarker);
+  for (size_t i = 0; i < info.num_intermarker; ++i) {
+    uint32_t len = visitor->IsReading() ? 0 : inter_marker_data[i].size();
+    JXL_RETURN_IF_ERROR(visitor->Bits(16, 0, &len));
+    if (visitor->IsReading()) inter_marker_data_sizes.emplace_back(len);
+  }
+  uint32_t tail_data_len = tail_data.size();
+  if (!visitor->IsReading() && tail_data_len > 4260096) {
+    return JXL_FAILURE("Tail data too large (max size = 4260096, size = %u).",
+                       tail_data_len);
+  }
+  JXL_RETURN_IF_ERROR(visitor->U32(Val(0), BitsOffset(8, 1),
+                                   BitsOffset(16, 257), BitsOffset(22, 65793),
+                                   0, &tail_data_len));
+
+  JXL_RETURN_IF_ERROR(visitor->Bool(false, &has_zero_padding_bit));
+  if (has_zero_padding_bit) {
+    uint32_t nbit = padding_bits.size();
+    JXL_RETURN_IF_ERROR(visitor->Bits(24, 0, &nbit));
+    if (visitor->IsReading()) {
+      padding_bits.reserve(std::min<uint32_t>(1024u, nbit));
+      for (uint32_t i = 0; i < nbit; i++) {
+        bool bbit = false;
+        JXL_RETURN_IF_ERROR(visitor->Bool(false, &bbit));
+        padding_bits.push_back(bbit);
+      }
+    } else {
+      for (uint8_t& bit : padding_bits) {
+        bool bbit = bit;
+        JXL_RETURN_IF_ERROR(visitor->Bool(false, &bbit));
+        bit = bbit;
+      }
+    }
+  }
+
+  // Apply postponed actions.
+  if (visitor->IsReading()) {
+    tail_data.resize(tail_data_len);
+    JXL_ASSERT(inter_marker_data_sizes.size() == info.num_intermarker);
+    inter_marker_data.reserve(info.num_intermarker);
+    for (size_t i = 0; i < info.num_intermarker; ++i) {
+      inter_marker_data.emplace_back(inter_marker_data_sizes[i]);
+    }
+  }
+
+  return true;
+}
+
+#endif  // JPEGXL_ENABLE_TRANSCODE_JPEG
+
+void JPEGData::CalculateMcuSize(const JPEGScanInfo& scan, int* MCUs_per_row,
+                                int* MCU_rows) const {
+  const bool is_interleaved = (scan.num_components > 1);
+  const JPEGComponent& base_component = components[scan.components[0].comp_idx];
+  // h_group / v_group act as numerators for converting number of blocks to
+  // number of MCU. In interleaved mode it is 1, so MCU is represented with
+  // max_*_samp_factor blocks. In non-interleaved mode we choose numerator to
+  // be the samping factor, consequently MCU is always represented with single
+  // block.
+  const int h_group = is_interleaved ? 1 : base_component.h_samp_factor;
+  const int v_group = is_interleaved ? 1 : base_component.v_samp_factor;
+  int max_h_samp_factor = 1;
+  int max_v_samp_factor = 1;
+  for (const auto& c : components) {
+    max_h_samp_factor = std::max(c.h_samp_factor, max_h_samp_factor);
+    max_v_samp_factor = std::max(c.v_samp_factor, max_v_samp_factor);
+  }
+  *MCUs_per_row = DivCeil(width * h_group, 8 * max_h_samp_factor);
+  *MCU_rows = DivCeil(height * v_group, 8 * max_v_samp_factor);
+}
+
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
+
+Status SetJPEGDataFromICC(const PaddedBytes& icc, jpeg::JPEGData* jpeg_data) {
+  size_t icc_pos = 0;
+  for (size_t i = 0; i < jpeg_data->app_data.size(); i++) {
+    if (jpeg_data->app_marker_type[i] != jpeg::AppMarkerType::kICC) {
+      continue;
+    }
+    size_t len = jpeg_data->app_data[i].size() - 17;
+    if (icc_pos + len > icc.size()) {
+      return JXL_FAILURE(
+          "ICC length is less than APP markers: requested %" PRIuS
+          " more bytes, "
+          "%" PRIuS " available",
+          len, icc.size() - icc_pos);
+    }
+    memcpy(&jpeg_data->app_data[i][17], icc.data() + icc_pos, len);
+    icc_pos += len;
+  }
+  if (icc_pos != icc.size() && icc_pos != 0) {
+    return JXL_FAILURE("ICC length is more than APP markers");
+  }
+  return true;
+}
+
+#endif  // JPEGXL_ENABLE_TRANSCODE_JPEG
+
+}  // namespace jpeg
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/jpeg/jpeg_data.h b/third_party/jpeg-xl/lib/jxl/jpeg/jpeg_data.h
new file mode 100644
index 0000000000..70ff4f8e05
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/jpeg/jpeg_data.h
@@ -0,0 +1,216 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Data structures that represent the non-pixel contents of a jpeg file.
+
+#ifndef LIB_JXL_JPEG_JPEG_DATA_H_
+#define LIB_JXL_JPEG_JPEG_DATA_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <array>
+#include <vector>
+
+#include "lib/jxl/common.h"  // JPEGXL_ENABLE_TRANSCODE_JPEG
+#include "lib/jxl/fields.h"
+
+namespace jxl {
+namespace jpeg {
+
+constexpr int kMaxComponents = 4;
+constexpr int kMaxQuantTables = 4;
+constexpr int kMaxHuffmanTables = 4;
+constexpr size_t kJpegHuffmanMaxBitLength = 16;
+constexpr int kJpegHuffmanAlphabetSize = 256;
+constexpr int kJpegDCAlphabetSize = 12;
+constexpr int kMaxDHTMarkers = 512;
+constexpr int kMaxDimPixels = 65535;
+constexpr uint8_t kApp1 = 0xE1;
+constexpr uint8_t kApp2 = 0xE2;
+const uint8_t kIccProfileTag[12] = "ICC_PROFILE";
+const uint8_t kExifTag[6] = "Exif\0";
+const uint8_t kXMPTag[29] = "http://ns.adobe.com/xap/1.0/";
+
+/* clang-format off */
+constexpr uint32_t kJPEGNaturalOrder[80] = {
+  0,   1,  8, 16,  9,  2,  3, 10,
+  17, 24, 32, 25, 18, 11,  4,  5,
+  12, 19, 26, 33, 40, 48, 41, 34,
+  27, 20, 13,  6,  7, 14, 21, 28,
+  35, 42, 49, 56, 57, 50, 43, 36,
+  29, 22, 15, 23, 30, 37, 44, 51,
+  58, 59, 52, 45, 38, 31, 39, 46,
+  53, 60, 61, 54, 47, 55, 62, 63,
+  // extra entries for safety in decoder
+  63, 63, 63, 63, 63, 63, 63, 63,
+  63, 63, 63, 63, 63, 63, 63, 63
+};
+
+constexpr uint32_t kJPEGZigZagOrder[64] = {
+  0,   1,  5,  6, 14, 15, 27, 28,
+  2,   4,  7, 13, 16, 26, 29, 42,
+  3,   8, 12, 17, 25, 30, 41, 43,
+  9,  11, 18, 24, 31, 40, 44, 53,
+  10, 19, 23, 32, 39, 45, 52, 54,
+  20, 22, 33, 38, 46, 51, 55, 60,
+  21, 34, 37, 47, 50, 56, 59, 61,
+  35, 36, 48, 49, 57, 58, 62, 63
+};
+/* clang-format on */
+
+// Quantization values for an 8x8 pixel block.
+struct JPEGQuantTable {
+  std::array<int32_t, kDCTBlockSize> values;
+  uint32_t precision = 0;
+  // The index of this quantization table as it was parsed from the input JPEG.
+  // Each DQT marker segment contains an 'index' field, and we save this index
+  // here. Valid values are 0 to 3.
+  uint32_t index = 0;
+  // Set to true if this table is the last one within its marker segment.
+  bool is_last = true;
+};
+
+// Huffman code and decoding lookup table used for DC and AC coefficients.
+struct JPEGHuffmanCode {
+  // Bit length histogram.
+  std::array<uint32_t, kJpegHuffmanMaxBitLength + 1> counts = {};
+  // Symbol values sorted by increasing bit lengths.
+  std::array<uint32_t, kJpegHuffmanAlphabetSize + 1> values = {};
+  // The index of the Huffman code in the current set of Huffman codes. For AC
+  // component Huffman codes, 0x10 is added to the index.
+  int slot_id = 0;
+  // Set to true if this Huffman code is the last one within its marker segment.
+  bool is_last = true;
+};
+
+// Huffman table indexes used for one component of one scan.
+struct JPEGComponentScanInfo {
+  uint32_t comp_idx;
+  uint32_t dc_tbl_idx;
+  uint32_t ac_tbl_idx;
+};
+
+// Contains information that is used in one scan.
+struct JPEGScanInfo {
+  // Parameters used for progressive scans (named the same way as in the spec):
+  //   Ss : Start of spectral band in zig-zag sequence.
+  //   Se : End of spectral band in zig-zag sequence.
+  //   Ah : Successive approximation bit position, high.
+  //   Al : Successive approximation bit position, low.
+  uint32_t Ss;
+  uint32_t Se;
+  uint32_t Ah;
+  uint32_t Al;
+  uint32_t num_components = 0;
+  std::array<JPEGComponentScanInfo, 4> components;
+  // Last codestream pass that is needed to write this scan.
+  uint32_t last_needed_pass = 0;
+
+  // Extra information required for bit-precise JPEG file reconstruction.
+
+  // Set of block indexes where the JPEG encoder has to flush the end-of-block
+  // runs and refinement bits.
+  std::vector<uint32_t> reset_points;
+  // The number of extra zero runs (Huffman symbol 0xf0) before the end of
+  // block (if nonzero), indexed by block index.
+  // All of these symbols can be omitted without changing the pixel values, but
+  // some jpeg encoders put these at the end of blocks.
+  typedef struct {
+    uint32_t block_idx;
+    uint32_t num_extra_zero_runs;
+  } ExtraZeroRunInfo;
+  std::vector<ExtraZeroRunInfo> extra_zero_runs;
+};
+
+typedef int16_t coeff_t;
+
+// Represents one component of a jpeg file.
+struct JPEGComponent {
+  JPEGComponent()
+      : id(0),
+        h_samp_factor(1),
+        v_samp_factor(1),
+        quant_idx(0),
+        width_in_blocks(0),
+        height_in_blocks(0) {}
+
+  // One-byte id of the component.
+  uint32_t id;
+  // Horizontal and vertical sampling factors.
+  // In interleaved mode, each minimal coded unit (MCU) has
+  // h_samp_factor x v_samp_factor DCT blocks from this component.
+  int h_samp_factor;
+  int v_samp_factor;
+  // The index of the quantization table used for this component.
+  uint32_t quant_idx;
+  // The dimensions of the component measured in 8x8 blocks.
+  uint32_t width_in_blocks;
+  uint32_t height_in_blocks;
+  // The DCT coefficients of this component, laid out block-by-block, divided
+  // through the quantization matrix values.
+  std::vector<coeff_t> coeffs;
+};
+
+enum class AppMarkerType : uint32_t {
+  kUnknown = 0,
+  kICC = 1,
+  kExif = 2,
+  kXMP = 3,
+};
+
+// Represents a parsed jpeg file.
+struct JPEGData : public Fields {
+  JPEGData()
+      : width(0), height(0), restart_interval(0), has_zero_padding_bit(false) {}
+
+  JXL_FIELDS_NAME(JPEGData)
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
+  // Doesn't serialize everything - skips brotli-encoded data and what is
+  // already encoded in the codestream.
+  Status VisitFields(Visitor* visitor) override;
+#else
+  Status VisitFields(Visitor* /* visitor */) override {
+    JXL_ABORT("JPEG transcoding support not enabled");
+  }
+#endif  // JPEGXL_ENABLE_TRANSCODE_JPEG
+
+  void CalculateMcuSize(const JPEGScanInfo& scan, int* MCUs_per_row,
+                        int* MCU_rows) const;
+
+  int width;
+  int height;
+  uint32_t restart_interval;
+  std::vector<std::vector<uint8_t>> app_data;
+  std::vector<AppMarkerType> app_marker_type;
+  std::vector<std::vector<uint8_t>> com_data;
+  std::vector<JPEGQuantTable> quant;
+  std::vector<JPEGHuffmanCode> huffman_code;
+  std::vector<JPEGComponent> components;
+  std::vector<JPEGScanInfo> scan_info;
+  std::vector<uint8_t> marker_order;
+  std::vector<std::vector<uint8_t>> inter_marker_data;
+  std::vector<uint8_t> tail_data;
+
+  // Extra information required for bit-precise JPEG file reconstruction.
+
+  bool has_zero_padding_bit;
+  std::vector<uint8_t> padding_bits;
+};
+
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
+// Set ICC profile in jpeg_data.
+Status SetJPEGDataFromICC(const PaddedBytes& icc, jpeg::JPEGData* jpeg_data);
+#else
+static JXL_INLINE Status SetJPEGDataFromICC(const PaddedBytes& /* icc */,
+                                            jpeg::JPEGData* /* jpeg_data */) {
+  JXL_ABORT("JPEG transcoding support not enabled");
+}
+#endif  // JPEGXL_ENABLE_TRANSCODE_JPEG
+
+}  // namespace jpeg
+}  // namespace jxl
+
+#endif  // LIB_JXL_JPEG_JPEG_DATA_H_
diff --git a/third_party/jpeg-xl/lib/jxl/jxl.syms b/third_party/jpeg-xl/lib/jxl/jxl.syms
new file mode 100644
index 0000000000..0f398d7151
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/jxl.syms
@@ -0,0 +1,5 @@
+{
+  extern "C" {
+    jpegxl_*;
+  };
+};
diff --git a/third_party/jpeg-xl/lib/jxl/jxl.version b/third_party/jpeg-xl/lib/jxl/jxl.version
new file mode 100644
index 0000000000..26b0e9e54d
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/jxl.version
@@ -0,0 +1,17 @@
+JXL_0 {
+  global:
+    Jxl*;
+
+  local:
+    # Hide all the std namespace symbols. std namespace is explicitly marked
+    # as visibility(default) and header-only functions or methods (such as those
+    # from templates) should be exposed in shared libraries as weak symbols but
+    # this is only needed when we expose those types in the shared library API
+    # in any way. We don't use C++ std types in the API and we also don't
+    # support exceptions in the library.
+    # See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=36022 for a discussion
+    # about this.
+    extern "C++" {
+      *std::*;
+    };
+};
diff --git a/third_party/jpeg-xl/lib/jxl/jxl_inspection.h b/third_party/jpeg-xl/lib/jxl/jxl_inspection.h
new file mode 100644
index 0000000000..0b70a58523
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/jxl_inspection.h
@@ -0,0 +1,22 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_JXL_INSPECTION_H_
+#define LIB_JXL_JXL_INSPECTION_H_
+
+#include <functional>
+
+#include "lib/jxl/image.h"
+
+namespace jxl {
+// Type of the inspection-callback which, if enabled, will be called on various
+// intermediate data during image processing, allowing inspection access.
+//
+// Returns false if processing can be stopped at that point, true otherwise.
+// This is only advisory - it is always OK to just continue processing.
+using InspectorImage3F = std::function<bool(const char*, const Image3F&)>;
+}  // namespace jxl
+
+#endif  // LIB_JXL_JXL_INSPECTION_H_
diff --git a/third_party/jpeg-xl/lib/jxl/jxl_osx.syms b/third_party/jpeg-xl/lib/jxl/jxl_osx.syms
new file mode 100644
index 0000000000..96bc568025
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/jxl_osx.syms
@@ -0,0 +1 @@
+_Jxl*
diff --git a/third_party/jpeg-xl/lib/jxl/jxl_test.cc b/third_party/jpeg-xl/lib/jxl/jxl_test.cc
new file mode 100644
index 0000000000..0a676802f6
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/jxl_test.cc
@@ -0,0 +1,1537 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/extras/dec/jxl.h"
+
+#include <stdint.h>
+#include <stdio.h>
+
+#include <array>
+#include <future>
+#include <string>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+#include "lib/extras/codec.h"
+#include "lib/extras/enc/encode.h"
+#include "lib/extras/packed_image.h"
+#include "lib/jxl/alpha.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/override.h"
+#include "lib/jxl/base/padded_bytes.h"
+#include "lib/jxl/codec_in_out.h"
+#include "lib/jxl/color_encoding_internal.h"
+#include "lib/jxl/color_management.h"
+#include "lib/jxl/enc_butteraugli_comparator.h"
+#include "lib/jxl/enc_butteraugli_pnorm.h"
+#include "lib/jxl/enc_cache.h"
+#include "lib/jxl/enc_color_management.h"
+#include "lib/jxl/enc_file.h"
+#include "lib/jxl/enc_params.h"
+#include "lib/jxl/fake_parallel_runner_testonly.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/image_bundle.h"
+#include "lib/jxl/image_ops.h"
+#include "lib/jxl/image_test_utils.h"
+#include "lib/jxl/jpeg/dec_jpeg_data.h"
+#include "lib/jxl/jpeg/dec_jpeg_data_writer.h"
+#include "lib/jxl/jpeg/enc_jpeg_data.h"
+#include "lib/jxl/jpeg/jpeg_data.h"
+#include "lib/jxl/modular/options.h"
+#include "lib/jxl/test_image.h"
+#include "lib/jxl/test_utils.h"
+#include "lib/jxl/testing.h"
+#include "tools/box/box.h"
+
+namespace jxl {
+
+struct AuxOut;
+
+namespace {
+using extras::JXLCompressParams;
+using extras::JXLDecompressParams;
+using extras::PackedPixelFile;
+using test::ButteraugliDistance;
+using test::ComputeDistance2;
+using test::Roundtrip;
+using test::TestImage;
+using test::ThreadPoolForTests;
+
+#define JXL_TEST_NL 0  // Disabled in code
+
+TEST(JxlTest, RoundtripSinglePixel) {
+  TestImage t;
+  t.SetDimensions(1, 1).AddFrame().ZeroFill();
+  PackedPixelFile ppf_out;
+  EXPECT_EQ(Roundtrip(t.ppf(), {}, {}, nullptr, &ppf_out), 55);
+}
+
+TEST(JxlTest, RoundtripSinglePixelWithAlpha) {
+  TestImage t;
+  t.SetDimensions(1, 1).SetChannels(4).AddFrame().ZeroFill();
+  PackedPixelFile ppf_out;
+  EXPECT_EQ(Roundtrip(t.ppf(), {}, {}, nullptr, &ppf_out), 59);
+}
+
+// Changing serialized signature causes Decode to fail.
+#ifndef JXL_CRASH_ON_ERROR
+TEST(JxlTest, RoundtripMarker) {
+  TestImage t;
+  t.SetDimensions(1, 1).AddFrame().ZeroFill();
+  for (size_t i = 0; i < 2; ++i) {
+    std::vector<uint8_t> compressed;
+    EXPECT_TRUE(extras::EncodeImageJXL({}, t.ppf(), /*jpeg_bytes=*/nullptr,
+                                       &compressed));
+    compressed[i] ^= 0xFF;
+    PackedPixelFile ppf_out;
+    EXPECT_FALSE(extras::DecodeImageJXL(compressed.data(), compressed.size(),
+                                        {}, /*decodec_bytes=*/nullptr,
+                                        &ppf_out));
+  }
+}
+#endif
+
+TEST(JxlTest, RoundtripTinyFast) {
+  ThreadPool* pool = nullptr;
+  const PaddedBytes orig = jxl::test::ReadTestData(
+      "external/wesaturate/500px/u76c0g_bliznaca_srgb8.png");
+  TestImage t;
+  t.DecodeFromBytes(orig).ClearMetadata().SetDimensions(32, 32);
+
+  JXLCompressParams cparams;
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 7);
+  cparams.distance = 4.0f;
+
+  PackedPixelFile ppf_out;
+  EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, pool, &ppf_out), 192, 10);
+}
+
+TEST(JxlTest, RoundtripSmallD1) {
+  ThreadPool* pool = nullptr;
+  const PaddedBytes orig = jxl::test::ReadTestData(
+      "external/wesaturate/500px/u76c0g_bliznaca_srgb8.png");
+  TestImage t;
+  t.DecodeFromBytes(orig).ClearMetadata();
+  size_t xsize = t.ppf().info.xsize / 8;
+  size_t ysize = t.ppf().info.ysize / 8;
+  t.SetDimensions(xsize, ysize);
+
+  {
+    PackedPixelFile ppf_out;
+    EXPECT_NEAR(Roundtrip(t.ppf(), {}, {}, pool, &ppf_out), 766, 40);
+    EXPECT_THAT(ButteraugliDistance(t.ppf(), ppf_out), IsSlightlyBelow(1.2));
+  }
+
+  // With a lower intensity target than the default, the bitrate should be
+  // smaller.
+  t.ppf().info.intensity_target = 100.0f;
+
+  {
+    PackedPixelFile ppf_out;
+    EXPECT_NEAR(Roundtrip(t.ppf(), {}, {}, pool, &ppf_out), 659, 20);
+    EXPECT_THAT(ButteraugliDistance(t.ppf(), ppf_out), IsSlightlyBelow(1.3));
+    EXPECT_EQ(ppf_out.info.intensity_target, t.ppf().info.intensity_target);
+  }
+}
+TEST(JxlTest, RoundtripResample2) {
+  ThreadPool* pool = nullptr;
+  const PaddedBytes orig = jxl::test::ReadTestData(
+      "external/wesaturate/500px/u76c0g_bliznaca_srgb8.png");
+  TestImage t;
+  t.DecodeFromBytes(orig).ClearMetadata();
+
+  JXLCompressParams cparams;
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_RESAMPLING, 2);
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 3);  // kFalcon
+
+  PackedPixelFile ppf_out;
+  EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, pool, &ppf_out), 18772, 200);
+  EXPECT_THAT(ComputeDistance2(t.ppf(), ppf_out), IsSlightlyBelow(90));
+}
+
+TEST(JxlTest, RoundtripResample2Slow) {
+  ThreadPool* pool = nullptr;
+  const PaddedBytes orig = jxl::test::ReadTestData(
+      "external/wesaturate/500px/u76c0g_bliznaca_srgb8.png");
+  TestImage t;
+  t.DecodeFromBytes(orig).ClearMetadata();
+
+  JXLCompressParams cparams;
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_RESAMPLING, 2);
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 9);  // kTortoise
+  cparams.distance = 10.0;
+
+  PackedPixelFile ppf_out;
+  EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, pool, &ppf_out), 4088, 200);
+  EXPECT_THAT(ComputeDistance2(t.ppf(), ppf_out), IsSlightlyBelow(250));
+}
+
+TEST(JxlTest, RoundtripResample2MT) {
+  ThreadPoolForTests pool(4);
+  const PaddedBytes orig = jxl::test::ReadTestData("jxl/flower/flower.png");
+  // image has to be large enough to have multiple groups after downsampling
+  TestImage t;
+  t.DecodeFromBytes(orig).ClearMetadata();
+
+  JXLCompressParams cparams;
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_RESAMPLING, 2);
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 3);  // kFalcon
+
+  PackedPixelFile ppf_out;
+  EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, &pool, &ppf_out), 228283, 1000);
+  EXPECT_THAT(ComputeDistance2(t.ppf(), ppf_out), IsSlightlyBelow(340));
+}
+
+// Roundtrip the image using a parallel runner that executes single-threaded but
+// in random order.
+TEST(JxlTest, RoundtripOutOfOrderProcessing) {
+  FakeParallelRunner fake_pool(/*order_seed=*/123, /*num_threads=*/8);
+  ThreadPool pool(&JxlFakeParallelRunner, &fake_pool);
+  const PaddedBytes orig = jxl::test::ReadTestData("jxl/flower/flower.png");
+  TestImage t;
+  t.DecodeFromBytes(orig).ClearMetadata();
+  // Image size is selected so that the block border needed is larger than the
+  // amount of pixels available on the next block.
+  t.SetDimensions(513, 515);
+
+  JXLCompressParams cparams;
+  // Force epf so we end up needing a lot of border.
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_EPF, 3);
+
+  PackedPixelFile ppf_out;
+  EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, &pool, &ppf_out), 22584, 400);
+  EXPECT_LE(ButteraugliDistance(t.ppf(), ppf_out), 1.35);
+}
+
+TEST(JxlTest, RoundtripOutOfOrderProcessingBorder) {
+  FakeParallelRunner fake_pool(/*order_seed=*/47, /*num_threads=*/8);
+  ThreadPool pool(&JxlFakeParallelRunner, &fake_pool);
+  const PaddedBytes orig = jxl::test::ReadTestData("jxl/flower/flower.png");
+  TestImage t;
+  t.DecodeFromBytes(orig).ClearMetadata();
+  // Image size is selected so that the block border needed is larger than the
+  // amount of pixels available on the next block.
+  t.SetDimensions(513, 515);
+
+  JXLCompressParams cparams;
+  // Force epf so we end up needing a lot of border.
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_EPF, 3);
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_RESAMPLING, 2);
+
+  PackedPixelFile ppf_out;
+  EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, &pool, &ppf_out), 10907, 200);
+  EXPECT_LE(ButteraugliDistance(t.ppf(), ppf_out), 2.9);
+}
+
+TEST(JxlTest, RoundtripResample4) {
+  ThreadPool* pool = nullptr;
+  const PaddedBytes orig = jxl::test::ReadTestData(
+      "external/wesaturate/500px/u76c0g_bliznaca_srgb8.png");
+  TestImage t;
+  t.DecodeFromBytes(orig).ClearMetadata();
+
+  JXLCompressParams cparams;
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_RESAMPLING, 4);
+
+  PackedPixelFile ppf_out;
+  EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, pool, &ppf_out), 5824, 100);
+  EXPECT_THAT(ButteraugliDistance(t.ppf(), ppf_out), IsSlightlyBelow(22));
+}
+
+TEST(JxlTest, RoundtripResample8) {
+  ThreadPool* pool = nullptr;
+  const PaddedBytes orig = jxl::test::ReadTestData(
+      "external/wesaturate/500px/u76c0g_bliznaca_srgb8.png");
+  TestImage t;
+  t.DecodeFromBytes(orig).ClearMetadata();
+
+  JXLCompressParams cparams;
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_RESAMPLING, 8);
+
+  PackedPixelFile ppf_out;
+  EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, pool, &ppf_out), 2036, 50);
+  EXPECT_THAT(ButteraugliDistance(t.ppf(), ppf_out), IsSlightlyBelow(50));
+}
+
+TEST(JxlTest, RoundtripUnalignedD2) {
+  ThreadPool* pool = nullptr;
+  const PaddedBytes orig = jxl::test::ReadTestData(
+      "external/wesaturate/500px/u76c0g_bliznaca_srgb8.png");
+  TestImage t;
+  t.DecodeFromBytes(orig).ClearMetadata();
+  size_t xsize = t.ppf().info.xsize / 12;
+  size_t ysize = t.ppf().info.ysize / 7;
+  t.SetDimensions(xsize, ysize);
+
+  JXLCompressParams cparams;
+  cparams.distance = 2.0;
+
+  PackedPixelFile ppf_out;
+  EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, pool, &ppf_out), 506, 30);
+  EXPECT_THAT(ButteraugliDistance(t.ppf(), ppf_out), IsSlightlyBelow(1.72));
+}
+
+TEST(JxlTest, RoundtripMultiGroup) {
+  const PaddedBytes orig = jxl::test::ReadTestData("jxl/flower/flower.png");
+  TestImage t;
+  t.DecodeFromBytes(orig).ClearMetadata().SetDimensions(600, 1024);
+
+  auto test = [&](jxl::SpeedTier speed_tier, float target_distance,
+                  size_t expected_size, float expected_distance) {
+    ThreadPoolForTests pool(4);
+    JXLCompressParams cparams;
+    int64_t effort = 10 - static_cast<int>(speed_tier);
+    cparams.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, effort);
+    cparams.distance = target_distance;
+
+    PackedPixelFile ppf_out;
+    EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, &pool, &ppf_out), expected_size,
+                700);
+    EXPECT_THAT(ComputeDistance2(t.ppf(), ppf_out),
+                IsSlightlyBelow(expected_distance));
+  };
+
+  auto run_kitten = std::async(std::launch::async, test, SpeedTier::kKitten,
+                               1.0f, 54895u, 11.7);
+  auto run_wombat = std::async(std::launch::async, test, SpeedTier::kWombat,
+                               2.0f, 33507u, 20.0);
+}
+
+TEST(JxlTest, RoundtripRGBToGrayscale) {
+  ThreadPoolForTests pool(4);
+  const PaddedBytes orig = jxl::test::ReadTestData("jxl/flower/flower.png");
+  CodecInOut io;
+  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io, &pool));
+  io.ShrinkTo(600, 1024);
+
+  CompressParams cparams;
+  cparams.butteraugli_distance = 1.0f;
+  cparams.speed_tier = SpeedTier::kFalcon;
+
+  JXLDecompressParams dparams;
+  dparams.color_space = "Gra_D65_Rel_SRG";
+
+  CodecInOut io2;
+  EXPECT_FALSE(io.Main().IsGray());
+  size_t compressed_size;
+  JXL_EXPECT_OK(
+      Roundtrip(&io, cparams, dparams, &io2, _, &compressed_size, &pool));
+  EXPECT_LE(compressed_size, 65000u);
+  EXPECT_TRUE(io2.Main().IsGray());
+
+  // Convert original to grayscale here, because TransformTo refuses to
+  // convert between grayscale and RGB.
+  ColorEncoding srgb_lin = ColorEncoding::LinearSRGB(/*is_gray=*/false);
+  ASSERT_TRUE(io.frames[0].TransformTo(srgb_lin, GetJxlCms()));
+  Image3F* color = io.Main().color();
+  for (size_t y = 0; y < color->ysize(); ++y) {
+    float* row_r = color->PlaneRow(0, y);
+    float* row_g = color->PlaneRow(1, y);
+    float* row_b = color->PlaneRow(2, y);
+    for (size_t x = 0; x < color->xsize(); ++x) {
+      float luma = 0.2126 * row_r[x] + 0.7152 * row_g[x] + 0.0722 * row_b[x];
+      row_r[x] = row_g[x] = row_b[x] = luma;
+    }
+  }
+  ColorEncoding srgb_gamma = ColorEncoding::SRGB(/*is_gray=*/false);
+  ASSERT_TRUE(io.frames[0].TransformTo(srgb_gamma, GetJxlCms()));
+  io.metadata.m.color_encoding = io2.Main().c_current();
+  io.Main().OverrideProfile(io2.Main().c_current());
+  EXPECT_THAT(
+      ButteraugliDistance(io.frames, io2.frames, cparams.ba_params, GetJxlCms(),
+                          /*distmap=*/nullptr, &pool),
+      IsSlightlyBelow(1.36));
+}
+
+TEST(JxlTest, RoundtripLargeFast) {
+  ThreadPoolForTests pool(8);
+  const PaddedBytes orig = jxl::test::ReadTestData("jxl/flower/flower.png");
+  TestImage t;
+  t.DecodeFromBytes(orig).ClearMetadata();
+
+  JXLCompressParams cparams;
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 7);  // kSquirrel
+
+  PackedPixelFile ppf_out;
+  EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, &pool, &ppf_out), 445684, 5000);
+  EXPECT_THAT(ComputeDistance2(t.ppf(), ppf_out), IsSlightlyBelow(100));
+}
+
+TEST(JxlTest, RoundtripDotsForceEpf) {
+  ThreadPoolForTests pool(8);
+  const PaddedBytes orig = jxl::test::ReadTestData(
+      "external/wesaturate/500px/cvo9xd_keong_macan_srgb8.png");
+  TestImage t;
+  t.DecodeFromBytes(orig).ClearMetadata();
+
+  JXLCompressParams cparams;
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 7);  // kSquirrel
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_EPF, 2);
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_DOTS, 1);
+
+  PackedPixelFile ppf_out;
+  EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, &pool, &ppf_out), 41472, 300);
+  EXPECT_THAT(ComputeDistance2(t.ppf(), ppf_out), IsSlightlyBelow(18));
+}
+
+// Checks for differing size/distance in two consecutive runs of distance 2,
+// which involves additional processing including adaptive reconstruction.
+// Failing this may be a sign of race conditions or invalid memory accesses.
+TEST(JxlTest, RoundtripD2Consistent) {
+  ThreadPoolForTests pool(8);
+  const PaddedBytes orig = jxl::test::ReadTestData("jxl/flower/flower.png");
+  TestImage t;
+  t.DecodeFromBytes(orig).ClearMetadata();
+
+  JXLCompressParams cparams;
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 7);  // kSquirrel
+  cparams.distance = 2.0;
+
+  // Try each xsize mod kBlockDim to verify right border handling.
+  for (size_t xsize = 48; xsize > 40; --xsize) {
+    t.SetDimensions(xsize, 15);
+
+    PackedPixelFile ppf2;
+    const size_t size2 = Roundtrip(t.ppf(), cparams, {}, &pool, &ppf2);
+
+    PackedPixelFile ppf3;
+    const size_t size3 = Roundtrip(t.ppf(), cparams, {}, &pool, &ppf3);
+
+    // Exact same compressed size.
+    EXPECT_EQ(size2, size3);
+
+    // Exact same distance.
+    const float dist2 = ComputeDistance2(t.ppf(), ppf2);
+    const float dist3 = ComputeDistance2(t.ppf(), ppf3);
+    EXPECT_EQ(dist2, dist3);
+  }
+}
+
+// Same as above, but for full image, testing multiple groups.
+TEST(JxlTest, RoundtripLargeConsistent) {
+  const PaddedBytes orig = jxl::test::ReadTestData("jxl/flower/flower.png");
+  TestImage t;
+  t.DecodeFromBytes(orig).ClearMetadata();
+
+  JXLCompressParams cparams;
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 7);  // kSquirrel
+  cparams.distance = 2.0;
+
+  auto roundtrip_and_compare = [&]() {
+    ThreadPoolForTests pool(8);
+    PackedPixelFile ppf2;
+    size_t size = Roundtrip(t.ppf(), cparams, {}, &pool, &ppf2);
+    double dist = ComputeDistance2(t.ppf(), ppf2);
+    return std::tuple<size_t, double>(size, dist);
+  };
+
+  // Try each xsize mod kBlockDim to verify right border handling.
+  auto future2 = std::async(std::launch::async, roundtrip_and_compare);
+  auto future3 = std::async(std::launch::async, roundtrip_and_compare);
+
+  const auto result2 = future2.get();
+  const auto result3 = future3.get();
+
+  // Exact same compressed size.
+  EXPECT_EQ(std::get<0>(result2), std::get<0>(result3));
+
+  // Exact same distance.
+  EXPECT_EQ(std::get<1>(result2), std::get<1>(result3));
+}
+
+TEST(JxlTest, RoundtripSmallNL) {
+  ThreadPool* pool = nullptr;
+  const PaddedBytes orig = jxl::test::ReadTestData(
+      "external/wesaturate/500px/u76c0g_bliznaca_srgb8.png");
+  TestImage t;
+  t.DecodeFromBytes(orig).ClearMetadata();
+  size_t xsize = t.ppf().info.xsize / 8;
+  size_t ysize = t.ppf().info.ysize / 8;
+  t.SetDimensions(xsize, ysize);
+
+  PackedPixelFile ppf_out;
+  EXPECT_NEAR(Roundtrip(t.ppf(), {}, {}, pool, &ppf_out), 783, 25);
+  EXPECT_THAT(ButteraugliDistance(t.ppf(), ppf_out), IsSlightlyBelow(1.1));
+}
+
+TEST(JxlTest, RoundtripNoGaborishNoAR) {
+  ThreadPool* pool = nullptr;
+  const PaddedBytes orig = jxl::test::ReadTestData(
+      "external/wesaturate/500px/u76c0g_bliznaca_srgb8.png");
+  TestImage t;
+  t.DecodeFromBytes(orig).ClearMetadata();
+
+  JXLCompressParams cparams;
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_EPF, 0);
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_GABORISH, 0);
+
+  PackedPixelFile ppf_out;
+  EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, pool, &ppf_out), 38561, 200);
+  EXPECT_THAT(ButteraugliDistance(t.ppf(), ppf_out), IsSlightlyBelow(1.8));
+}
+
+TEST(JxlTest, RoundtripSmallNoGaborish) {
+  ThreadPool* pool = nullptr;
+  const PaddedBytes orig = jxl::test::ReadTestData(
+      "external/wesaturate/500px/u76c0g_bliznaca_srgb8.png");
+  TestImage t;
+  t.DecodeFromBytes(orig).ClearMetadata();
+  size_t xsize = t.ppf().info.xsize / 8;
+  size_t ysize = t.ppf().info.ysize / 8;
+  t.SetDimensions(xsize, ysize);
+
+  JXLCompressParams cparams;
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_GABORISH, 0);
+
+  PackedPixelFile ppf_out;
+  EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, pool, &ppf_out), 811, 20);
+  EXPECT_THAT(ButteraugliDistance(t.ppf(), ppf_out), IsSlightlyBelow(1.1));
+}
+
+TEST(JxlTest, RoundtripSmallPatchesAlpha) {
+  ThreadPool* pool = nullptr;
+  TestImage t;
+  t.SetDimensions(256, 256).SetChannels(4);
+  t.SetColorEncoding("RGB_D65_SRG_Rel_Lin");
+  TestImage::Frame frame = t.AddFrame();
+  frame.ZeroFill();
+  // This pattern should be picked up by the patch detection heuristics.
+  for (size_t y = 0; y < t.ppf().info.ysize; ++y) {
+    for (size_t x = 0; x < t.ppf().info.xsize; ++x) {
+      if (x % 4 == 0 && (y / 32) % 4 == 0) {
+        frame.SetValue(y, x, 1, 127.0f / 255.0f);
+      }
+      frame.SetValue(y, x, 3, 1.0f);
+    }
+  }
+
+  JXLCompressParams cparams;
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 7);  // kSquirrel
+  cparams.distance = 0.1f;
+
+  PackedPixelFile ppf_out;
+  EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, pool, &ppf_out), 597, 100);
+  EXPECT_THAT(ButteraugliDistance(t.ppf(), ppf_out), IsSlightlyBelow(0.012f));
+}
+
+TEST(JxlTest, RoundtripSmallPatches) {
+  ThreadPool* pool = nullptr;
+  TestImage t;
+  t.SetDimensions(256, 256);
+  t.SetColorEncoding("RGB_D65_SRG_Rel_Lin");
+  TestImage::Frame frame = t.AddFrame();
+  frame.ZeroFill();
+  // This pattern should be picked up by the patch detection heuristics.
+  for (size_t y = 0; y < t.ppf().info.ysize; ++y) {
+    for (size_t x = 0; x < t.ppf().info.xsize; ++x) {
+      if (x % 4 == 0 && (y / 32) % 4 == 0) {
+        frame.SetValue(y, x, 1, 127.0f / 255.0f);
+      }
+    }
+  }
+
+  JXLCompressParams cparams;
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 7);  // kSquirrel
+  cparams.distance = 0.1f;
+
+  PackedPixelFile ppf_out;
+  EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, pool, &ppf_out), 486, 100);
+  EXPECT_THAT(ButteraugliDistance(t.ppf(), ppf_out), IsSlightlyBelow(0.012f));
+}
+
+// TODO(szabadka) Add encoder and decoder API functions that accept frame
+// buffers in arbitrary unsigned and floating point formats, and then roundtrip
+// test the lossless codepath to make sure the exact binary representations
+// are preserved.
+#if 0
+TEST(JxlTest, RoundtripImageBundleOriginalBits) {
+  // Image does not matter, only io.metadata.m and io2.metadata.m are tested.
+  Image3F image(1, 1);
+  ZeroFillImage(&image);
+  CodecInOut io;
+  io.metadata.m.color_encoding = ColorEncoding::LinearSRGB();
+  io.SetFromImage(std::move(image), ColorEncoding::LinearSRGB());
+
+  CompressParams cparams;
+
+  // Test unsigned integers from 1 to 32 bits
+  for (uint32_t bit_depth = 1; bit_depth <= 32; bit_depth++) {
+    if (bit_depth == 32) {
+      // TODO(lode): allow testing 32, however the code below ends up in
+      // enc_modular which does not support 32. We only want to test the header
+      // encoding though, so try without modular.
+      break;
+    }
+
+    io.metadata.m.SetUintSamples(bit_depth);
+    CodecInOut io2;
+    JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io2, _));
+
+    EXPECT_EQ(bit_depth, io2.metadata.m.bit_depth.bits_per_sample);
+    EXPECT_FALSE(io2.metadata.m.bit_depth.floating_point_sample);
+    EXPECT_EQ(0u, io2.metadata.m.bit_depth.exponent_bits_per_sample);
+    EXPECT_EQ(0u, io2.metadata.m.GetAlphaBits());
+  }
+
+  // Test various existing and non-existing floating point formats
+  for (uint32_t bit_depth = 8; bit_depth <= 32; bit_depth++) {
+    if (bit_depth != 32) {
+      // TODO: test other float types once they work
+      break;
+    }
+
+    uint32_t exponent_bit_depth;
+    if (bit_depth < 10) {
+      exponent_bit_depth = 2;
+    } else if (bit_depth < 12) {
+      exponent_bit_depth = 3;
+    } else if (bit_depth < 16) {
+      exponent_bit_depth = 4;
+    } else if (bit_depth < 20) {
+      exponent_bit_depth = 5;
+    } else if (bit_depth < 24) {
+      exponent_bit_depth = 6;
+    } else if (bit_depth < 28) {
+      exponent_bit_depth = 7;
+    } else {
+      exponent_bit_depth = 8;
+    }
+
+    io.metadata.m.bit_depth.bits_per_sample = bit_depth;
+    io.metadata.m.bit_depth.floating_point_sample = true;
+    io.metadata.m.bit_depth.exponent_bits_per_sample = exponent_bit_depth;
+
+    CodecInOut io2;
+    JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io2));
+
+    EXPECT_EQ(bit_depth, io2.metadata.m.bit_depth.bits_per_sample);
+    EXPECT_TRUE(io2.metadata.m.bit_depth.floating_point_sample);
+    EXPECT_EQ(exponent_bit_depth,
+              io2.metadata.m.bit_depth.exponent_bits_per_sample);
+    EXPECT_EQ(0u, io2.metadata.m.GetAlphaBits());
+  }
+}
+#endif
+
+TEST(JxlTest, RoundtripGrayscale) {
+  const PaddedBytes orig = jxl::test::ReadTestData(
+      "external/wesaturate/500px/cvo9xd_keong_macan_grayscale.png");
+  CodecInOut io;
+  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io));
+  ASSERT_NE(io.xsize(), 0u);
+  io.ShrinkTo(128, 128);
+  EXPECT_TRUE(io.Main().IsGray());
+  EXPECT_EQ(8u, io.metadata.m.bit_depth.bits_per_sample);
+  EXPECT_FALSE(io.metadata.m.bit_depth.floating_point_sample);
+  EXPECT_EQ(0u, io.metadata.m.bit_depth.exponent_bits_per_sample);
+  EXPECT_TRUE(io.metadata.m.color_encoding.tf.IsSRGB());
+
+  PassesEncoderState enc_state;
+  AuxOut* aux_out = nullptr;
+
+  {
+    CompressParams cparams;
+    cparams.butteraugli_distance = 1.0;
+
+    PaddedBytes compressed;
+    EXPECT_TRUE(EncodeFile(cparams, &io, &enc_state, &compressed, GetJxlCms(),
+                           aux_out));
+    CodecInOut io2;
+    EXPECT_TRUE(test::DecodeFile({}, Span<const uint8_t>(compressed), &io2));
+    EXPECT_TRUE(io2.Main().IsGray());
+
+    EXPECT_LE(compressed.size(), 7000u);
+    EXPECT_THAT(ButteraugliDistance(io.frames, io2.frames, cparams.ba_params,
+                                    GetJxlCms(),
+                                    /*distmap=*/nullptr),
+                IsSlightlyBelow(1.6));
+  }
+
+  // Test with larger butteraugli distance and other settings enabled so
+  // different jxl codepaths trigger.
+  {
+    CompressParams cparams;
+    cparams.butteraugli_distance = 8.0;
+
+    PaddedBytes compressed;
+    EXPECT_TRUE(EncodeFile(cparams, &io, &enc_state, &compressed, GetJxlCms(),
+                           aux_out));
+    CodecInOut io2;
+    EXPECT_TRUE(test::DecodeFile({}, Span<const uint8_t>(compressed), &io2));
+    EXPECT_TRUE(io2.Main().IsGray());
+
+    EXPECT_LE(compressed.size(), 1300u);
+    EXPECT_THAT(ButteraugliDistance(io.frames, io2.frames, cparams.ba_params,
+                                    GetJxlCms(),
+                                    /*distmap=*/nullptr),
+                IsSlightlyBelow(6.0));
+  }
+
+  {
+    CompressParams cparams;
+    cparams.butteraugli_distance = 1.0;
+
+    PaddedBytes compressed;
+    EXPECT_TRUE(EncodeFile(cparams, &io, &enc_state, &compressed, GetJxlCms(),
+                           aux_out));
+
+    CodecInOut io2;
+    JXLDecompressParams dparams;
+    dparams.color_space = "RGB_D65_SRG_Rel_SRG";
+    EXPECT_TRUE(
+        test::DecodeFile(dparams, Span<const uint8_t>(compressed), &io2));
+    EXPECT_FALSE(io2.Main().IsGray());
+
+    EXPECT_LE(compressed.size(), 7000u);
+    EXPECT_THAT(ButteraugliDistance(io.frames, io2.frames, cparams.ba_params,
+                                    GetJxlCms(),
+                                    /*distmap=*/nullptr),
+                IsSlightlyBelow(1.6));
+  }
+}
+
+TEST(JxlTest, RoundtripAlpha) {
+  const PaddedBytes orig = jxl::test::ReadTestData(
+      "external/wesaturate/500px/tmshre_riaphotographs_alpha.png");
+  CodecInOut io;
+  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io));
+
+  ASSERT_NE(io.xsize(), 0u);
+  ASSERT_TRUE(io.metadata.m.HasAlpha());
+  ASSERT_TRUE(io.Main().HasAlpha());
+  io.ShrinkTo(300, 300);
+
+  CompressParams cparams;
+  cparams.butteraugli_distance = 1.0;
+
+  EXPECT_EQ(8u, io.metadata.m.bit_depth.bits_per_sample);
+  EXPECT_FALSE(io.metadata.m.bit_depth.floating_point_sample);
+  EXPECT_EQ(0u, io.metadata.m.bit_depth.exponent_bits_per_sample);
+  EXPECT_TRUE(io.metadata.m.color_encoding.tf.IsSRGB());
+  PassesEncoderState enc_state;
+  AuxOut* aux_out = nullptr;
+  PaddedBytes compressed;
+  EXPECT_TRUE(
+      EncodeFile(cparams, &io, &enc_state, &compressed, GetJxlCms(), aux_out));
+
+  EXPECT_LE(compressed.size(), 10077u);
+
+  for (bool use_image_callback : {false, true}) {
+    for (bool unpremul_alpha : {false, true}) {
+      CodecInOut io2;
+      JXLDecompressParams dparams;
+      dparams.use_image_callback = use_image_callback;
+      dparams.unpremultiply_alpha = unpremul_alpha;
+      EXPECT_TRUE(
+          test::DecodeFile(dparams, Span<const uint8_t>(compressed), &io2));
+      EXPECT_THAT(ButteraugliDistance(io.frames, io2.frames, cparams.ba_params,
+                                      GetJxlCms(),
+                                      /*distmap=*/nullptr),
+                  IsSlightlyBelow(1.15));
+    }
+  }
+}
+
+namespace {
+// Performs "PremultiplyAlpha" for each ImageBundle (preview/frames).
+bool PremultiplyAlpha(CodecInOut& io) {
+  const auto doPremultiplyAlpha = [](ImageBundle& bundle) {
+    if (!bundle.HasAlpha()) return;
+    if (!bundle.HasColor()) return;
+    auto* color = bundle.color();
+    const auto* alpha = bundle.alpha();
+    JXL_CHECK(color->ysize() == alpha->ysize());
+    JXL_CHECK(color->xsize() == alpha->xsize());
+    for (size_t y = 0; y < color->ysize(); y++) {
+      ::jxl::PremultiplyAlpha(color->PlaneRow(0, y), color->PlaneRow(1, y),
+                              color->PlaneRow(2, y), alpha->Row(y),
+                              color->xsize());
+    }
+  };
+  ExtraChannelInfo* eci = io.metadata.m.Find(ExtraChannel::kAlpha);
+  if (eci == nullptr || eci->alpha_associated) return false;
+  if (io.metadata.m.have_preview) {
+    doPremultiplyAlpha(io.preview_frame);
+  }
+  for (ImageBundle& ib : io.frames) {
+    doPremultiplyAlpha(ib);
+  }
+  eci->alpha_associated = true;
+  return true;
+}
+
+bool UnpremultiplyAlpha(CodecInOut& io) {
+  const auto doUnpremultiplyAlpha = [](ImageBundle& bundle) {
+    if (!bundle.HasAlpha()) return;
+    if (!bundle.HasColor()) return;
+    auto* color = bundle.color();
+    const auto* alpha = bundle.alpha();
+    JXL_CHECK(color->ysize() == alpha->ysize());
+    JXL_CHECK(color->xsize() == alpha->xsize());
+    for (size_t y = 0; y < color->ysize(); y++) {
+      ::jxl::UnpremultiplyAlpha(color->PlaneRow(0, y), color->PlaneRow(1, y),
+                                color->PlaneRow(2, y), alpha->Row(y),
+                                color->xsize());
+    }
+  };
+  ExtraChannelInfo* eci = io.metadata.m.Find(ExtraChannel::kAlpha);
+  if (eci == nullptr || !eci->alpha_associated) return false;
+  if (io.metadata.m.have_preview) {
+    doUnpremultiplyAlpha(io.preview_frame);
+  }
+  for (ImageBundle& ib : io.frames) {
+    doUnpremultiplyAlpha(ib);
+  }
+  eci->alpha_associated = false;
+  return true;
+}
+}  // namespace
+
+TEST(JxlTest, RoundtripAlphaPremultiplied) {
+  const PaddedBytes orig = jxl::test::ReadTestData(
+      "external/wesaturate/500px/tmshre_riaphotographs_alpha.png");
+  CodecInOut io, io_nopremul;
+  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io));
+  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io_nopremul));
+
+  ASSERT_NE(io.xsize(), 0u);
+  ASSERT_TRUE(io.metadata.m.HasAlpha());
+  ASSERT_TRUE(io.Main().HasAlpha());
+  io.ShrinkTo(300, 300);
+  io_nopremul.ShrinkTo(300, 300);
+
+  CompressParams cparams;
+  cparams.butteraugli_distance = 1.0;
+
+  EXPECT_FALSE(io.Main().AlphaIsPremultiplied());
+  EXPECT_TRUE(PremultiplyAlpha(io));
+  EXPECT_TRUE(io.Main().AlphaIsPremultiplied());
+
+  EXPECT_FALSE(io_nopremul.Main().AlphaIsPremultiplied());
+
+  PassesEncoderState enc_state;
+  AuxOut* aux_out = nullptr;
+  PaddedBytes compressed;
+  EXPECT_TRUE(
+      EncodeFile(cparams, &io, &enc_state, &compressed, GetJxlCms(), aux_out));
+  EXPECT_LE(compressed.size(), 10000u);
+
+  for (bool use_image_callback : {false, true}) {
+    for (bool unpremul_alpha : {false, true}) {
+      for (bool use_uint8 : {false, true}) {
+        printf(
+            "Testing premultiplied alpha using %s %s requesting "
+            "%spremultiplied output.\n",
+            use_uint8 ? "uint8" : "float",
+            use_image_callback ? "image callback" : "image_buffer",
+            unpremul_alpha ? "un" : "");
+        CodecInOut io2;
+        JXLDecompressParams dparams;
+        dparams.use_image_callback = use_image_callback;
+        dparams.unpremultiply_alpha = unpremul_alpha;
+        if (use_uint8) {
+          dparams.accepted_formats = {
+              {4, JXL_TYPE_UINT8, JXL_LITTLE_ENDIAN, 0}};
+        }
+        EXPECT_TRUE(
+            test::DecodeFile(dparams, Span<const uint8_t>(compressed), &io2));
+
+        EXPECT_EQ(unpremul_alpha, !io2.Main().AlphaIsPremultiplied());
+        if (!unpremul_alpha) {
+          EXPECT_THAT(ButteraugliDistance(io.frames, io2.frames,
+                                          cparams.ba_params, GetJxlCms(),
+                                          /*distmap=*/nullptr),
+                      IsSlightlyBelow(1.2));
+          EXPECT_TRUE(UnpremultiplyAlpha(io2));
+          EXPECT_FALSE(io2.Main().AlphaIsPremultiplied());
+        }
+        EXPECT_THAT(ButteraugliDistance(io_nopremul.frames, io2.frames,
+                                        cparams.ba_params, GetJxlCms(),
+                                        /*distmap=*/nullptr),
+                    IsSlightlyBelow(1.47));
+      }
+    }
+  }
+}
+
+TEST(JxlTest, RoundtripAlphaResampling) {
+  ThreadPool* pool = nullptr;
+  const PaddedBytes orig = jxl::test::ReadTestData(
+      "external/wesaturate/500px/tmshre_riaphotographs_alpha.png");
+  TestImage t;
+  t.DecodeFromBytes(orig).ClearMetadata();
+  ASSERT_NE(t.ppf().info.xsize, 0);
+  ASSERT_TRUE(t.ppf().info.alpha_bits > 0);
+
+  JXLCompressParams cparams;
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 5);  // kHare
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_RESAMPLING, 2);
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_EXTRA_CHANNEL_RESAMPLING, 2);
+
+  PackedPixelFile ppf_out;
+  EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, pool, &ppf_out), 12803, 130);
+  EXPECT_THAT(ButteraugliDistance(t.ppf(), ppf_out), IsSlightlyBelow(5.2));
+}
+
+TEST(JxlTest, RoundtripAlphaResamplingOnlyAlpha) {
+  ThreadPool* pool = nullptr;
+  const PaddedBytes orig = jxl::test::ReadTestData(
+      "external/wesaturate/500px/tmshre_riaphotographs_alpha.png");
+  TestImage t;
+  t.DecodeFromBytes(orig).ClearMetadata();
+  ASSERT_NE(t.ppf().info.xsize, 0);
+  ASSERT_TRUE(t.ppf().info.alpha_bits > 0);
+
+  JXLCompressParams cparams;
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 3);  // kFalcon
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_EXTRA_CHANNEL_RESAMPLING, 2);
+
+  PackedPixelFile ppf_out;
+  EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, pool, &ppf_out), 33571, 400);
+  EXPECT_THAT(ButteraugliDistance(t.ppf(), ppf_out), IsSlightlyBelow(1.49));
+}
+
+TEST(JxlTest, RoundtripAlphaNonMultipleOf8) {
+  ThreadPool* pool = nullptr;
+  const PaddedBytes orig = jxl::test::ReadTestData(
+      "external/wesaturate/500px/tmshre_riaphotographs_alpha.png");
+  TestImage t;
+  t.DecodeFromBytes(orig).ClearMetadata().SetDimensions(12, 12);
+  ASSERT_NE(t.ppf().info.xsize, 0);
+  ASSERT_TRUE(t.ppf().info.alpha_bits > 0);
+  EXPECT_EQ(t.ppf().frames[0].color.format.data_type, JXL_TYPE_UINT8);
+
+  PackedPixelFile ppf_out;
+  EXPECT_NEAR(Roundtrip(t.ppf(), {}, {}, pool, &ppf_out), 107, 10);
+  EXPECT_THAT(ButteraugliDistance(t.ppf(), ppf_out), IsSlightlyBelow(0.95));
+}
+
+TEST(JxlTest, RoundtripAlpha16) {
+  ThreadPoolForTests pool(4);
+  // The image is wider than 512 pixels to ensure multiple groups are tested.
+  size_t xsize = 1200, ysize = 160;
+  TestImage t;
+  t.SetDimensions(xsize, ysize).SetChannels(4).SetAllBitDepths(16);
+  TestImage::Frame frame = t.AddFrame();
+  // Generate 16-bit pattern that uses various colors and alpha values.
+  const float mul = 1.0f / 65535;
+  for (size_t y = 0; y < ysize; y++) {
+    for (size_t x = 0; x < xsize; x++) {
+      uint16_t r = y * 65535 / ysize;
+      uint16_t g = x * 65535 / xsize;
+      uint16_t b = (y + x) * 65535 / (xsize + ysize);
+      frame.SetValue(y, x, 0, r * mul);
+      frame.SetValue(y, x, 1, g * mul);
+      frame.SetValue(y, x, 2, b * mul);
+      frame.SetValue(y, x, 3, g * mul);
+    }
+  }
+
+  ASSERT_NE(t.ppf().info.xsize, 0);
+  ASSERT_EQ(t.ppf().info.alpha_bits, 16);
+
+  JXLCompressParams cparams;
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 6);  // kWombat
+  cparams.distance = 0.5;
+
+  PackedPixelFile ppf_out;
+  // TODO(szabadka) Investigate big size difference on i686
+  // This still keeps happening (2023-04-18).
+  EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, &pool, &ppf_out), 3466, 120);
+  EXPECT_THAT(ButteraugliDistance(t.ppf(), ppf_out), IsSlightlyBelow(0.65));
+}
+
+namespace {
+JXLCompressParams CompressParamsForLossless() {
+  JXLCompressParams cparams;
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_MODULAR, 1);
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_COLOR_TRANSFORM, 1);
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_MODULAR_PREDICTOR, 6);  // Weighted
+  cparams.distance = 0;
+  return cparams;
+}
+}  // namespace
+
+TEST(JxlTest, JXL_SLOW_TEST(RoundtripLossless8)) {
+  ThreadPoolForTests pool(8);
+  const PaddedBytes orig = jxl::test::ReadTestData(
+      "external/wesaturate/500px/tmshre_riaphotographs_srgb8.png");
+  TestImage t;
+  t.DecodeFromBytes(orig).ClearMetadata();
+
+  JXLCompressParams cparams = CompressParamsForLossless();
+
+  PackedPixelFile ppf_out;
+  EXPECT_EQ(Roundtrip(t.ppf(), cparams, {}, &pool, &ppf_out), 222167);
+  EXPECT_EQ(ComputeDistance2(t.ppf(), ppf_out), 0.0);
+}
+
+TEST(JxlTest, JXL_SLOW_TEST(RoundtripLossless8ThunderGradient)) {
+  ThreadPoolForTests pool(8);
+  const PaddedBytes orig = jxl::test::ReadTestData(
+      "external/wesaturate/500px/tmshre_riaphotographs_srgb8.png");
+  TestImage t;
+  t.DecodeFromBytes(orig).ClearMetadata();
+
+  JXLCompressParams cparams = CompressParamsForLossless();
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 2);             // kThunder
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_MODULAR_PREDICTOR, 5);  // Gradient
+
+  PackedPixelFile ppf_out;
+  EXPECT_EQ(Roundtrip(t.ppf(), cparams, {}, &pool, &ppf_out), 261684);
+  EXPECT_EQ(ComputeDistance2(t.ppf(), ppf_out), 0.0);
+}
+
+TEST(JxlTest, JXL_SLOW_TEST(RoundtripLossless8LightningGradient)) {
+  ThreadPoolForTests pool(8);
+  const PaddedBytes orig = jxl::test::ReadTestData(
+      "external/wesaturate/500px/tmshre_riaphotographs_srgb8.png");
+  TestImage t;
+  t.DecodeFromBytes(orig).ClearMetadata();
+
+  JXLCompressParams cparams = CompressParamsForLossless();
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 1);  // kLightning
+
+  PackedPixelFile ppf_out;
+  // Lax comparison because different SIMD will cause different compression.
+  EXPECT_THAT(Roundtrip(t.ppf(), cparams, {}, &pool, &ppf_out),
+              IsSlightlyBelow(286848u));
+  EXPECT_EQ(ComputeDistance2(t.ppf(), ppf_out), 0.0);
+}
+
+TEST(JxlTest, JXL_SLOW_TEST(RoundtripLossless8Falcon)) {
+  ThreadPoolForTests pool(8);
+  const PaddedBytes orig = jxl::test::ReadTestData(
+      "external/wesaturate/500px/tmshre_riaphotographs_srgb8.png");
+  TestImage t;
+  t.DecodeFromBytes(orig).ClearMetadata();
+
+  JXLCompressParams cparams = CompressParamsForLossless();
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 3);  // kFalcon
+
+  PackedPixelFile ppf_out;
+  EXPECT_EQ(Roundtrip(t.ppf(), cparams, {}, &pool, &ppf_out), 230766);
+  EXPECT_EQ(ComputeDistance2(t.ppf(), ppf_out), 0.0);
+}
+
+TEST(JxlTest, RoundtripLossless8Alpha) {
+  ThreadPool* pool = nullptr;
+  const PaddedBytes orig = jxl::test::ReadTestData(
+      "external/wesaturate/500px/tmshre_riaphotographs_alpha.png");
+  TestImage t;
+  t.DecodeFromBytes(orig).ClearMetadata();
+  ASSERT_EQ(t.ppf().info.alpha_bits, 8);
+  EXPECT_EQ(t.ppf().frames[0].color.format.data_type, JXL_TYPE_UINT8);
+
+  JXLCompressParams cparams = CompressParamsForLossless();
+
+  JXLDecompressParams dparams;
+  dparams.accepted_formats.push_back(t.ppf().frames[0].color.format);
+
+  PackedPixelFile ppf_out;
+  EXPECT_EQ(Roundtrip(t.ppf(), cparams, dparams, pool, &ppf_out), 248817);
+  EXPECT_EQ(ComputeDistance2(t.ppf(), ppf_out), 0.0);
+  EXPECT_EQ(ppf_out.info.alpha_bits, 8);
+  EXPECT_TRUE(test::SameAlpha(t.ppf(), ppf_out));
+}
+
+TEST(JxlTest, RoundtripLossless16Alpha) {
+  ThreadPool* pool = nullptr;
+  size_t xsize = 1200, ysize = 160;
+  TestImage t;
+  t.SetDimensions(xsize, ysize).SetChannels(4).SetAllBitDepths(16);
+  TestImage::Frame frame = t.AddFrame();
+  // Generate 16-bit pattern that uses various colors and alpha values.
+  const float mul = 1.0f / 65535;
+  for (size_t y = 0; y < ysize; y++) {
+    for (size_t x = 0; x < xsize; x++) {
+      uint16_t r = y * 65535 / ysize;
+      uint16_t g = x * 65535 / xsize + 37;
+      uint16_t b = (y + x) * 65535 / (xsize + ysize);
+      frame.SetValue(y, x, 0, r * mul);
+      frame.SetValue(y, x, 1, g * mul);
+      frame.SetValue(y, x, 2, b * mul);
+      frame.SetValue(y, x, 3, g * mul);
+    }
+  }
+  ASSERT_EQ(t.ppf().info.bits_per_sample, 16);
+  ASSERT_EQ(t.ppf().info.alpha_bits, 16);
+
+  JXLCompressParams cparams = CompressParamsForLossless();
+
+  JXLDecompressParams dparams;
+  dparams.accepted_formats.push_back(t.ppf().frames[0].color.format);
+
+  PackedPixelFile ppf_out;
+  // TODO(szabadka) Investigate big size difference on i686
+  EXPECT_NEAR(Roundtrip(t.ppf(), cparams, dparams, pool, &ppf_out), 4849, 100);
+  EXPECT_EQ(ComputeDistance2(t.ppf(), ppf_out), 0.0);
+  EXPECT_EQ(ppf_out.info.alpha_bits, 16);
+  EXPECT_TRUE(test::SameAlpha(t.ppf(), ppf_out));
+}
+
+TEST(JxlTest, RoundtripLossless16AlphaNotMisdetectedAs8Bit) {
+  ThreadPool* pool = nullptr;
+  size_t xsize = 128, ysize = 128;
+  TestImage t;
+  t.SetDimensions(xsize, ysize).SetChannels(4).SetAllBitDepths(16);
+  TestImage::Frame frame = t.AddFrame();
+  // All 16-bit values, both color and alpha, of this image are below 64.
+  // This allows testing if a code path wrongly concludes it's an 8-bit instead
+  // of 16-bit image (or even 6-bit).
+  const float mul = 1.0f / 65535;
+  for (size_t y = 0; y < ysize; y++) {
+    for (size_t x = 0; x < xsize; x++) {
+      uint16_t r = y * 64 / ysize;
+      uint16_t g = x * 64 / xsize + 37;
+      uint16_t b = (y + x) * 64 / (xsize + ysize);
+      frame.SetValue(y, x, 0, r * mul);
+      frame.SetValue(y, x, 1, g * mul);
+      frame.SetValue(y, x, 2, b * mul);
+      frame.SetValue(y, x, 3, g * mul);
+    }
+  }
+  ASSERT_EQ(t.ppf().info.bits_per_sample, 16);
+  ASSERT_EQ(t.ppf().info.alpha_bits, 16);
+
+  JXLCompressParams cparams = CompressParamsForLossless();
+
+  JXLDecompressParams dparams;
+  dparams.accepted_formats.push_back(t.ppf().frames[0].color.format);
+
+  PackedPixelFile ppf_out;
+  EXPECT_NEAR(Roundtrip(t.ppf(), cparams, dparams, pool, &ppf_out), 543, 75);
+  EXPECT_EQ(ComputeDistance2(t.ppf(), ppf_out), 0.0);
+  EXPECT_EQ(ppf_out.info.bits_per_sample, 16);
+  EXPECT_EQ(ppf_out.info.alpha_bits, 16);
+  EXPECT_TRUE(test::SameAlpha(t.ppf(), ppf_out));
+}
+
+TEST(JxlTest, RoundtripDots) {
+  ThreadPool* pool = nullptr;
+  const PaddedBytes orig = jxl::test::ReadTestData(
+      "external/wesaturate/500px/cvo9xd_keong_macan_srgb8.png");
+  TestImage t;
+  t.DecodeFromBytes(orig).ClearMetadata();
+  ASSERT_NE(t.ppf().info.xsize, 0);
+  EXPECT_EQ(t.ppf().info.bits_per_sample, 8);
+  EXPECT_EQ(t.ppf().color_encoding.transfer_function,
+            JXL_TRANSFER_FUNCTION_SRGB);
+
+  JXLCompressParams cparams;
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 7);  // kSkirrel
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_DOTS, 1);
+  cparams.distance = 0.04;
+
+  PackedPixelFile ppf_out;
+  EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, pool, &ppf_out), 284295, 3000);
+  EXPECT_THAT(ButteraugliDistance(t.ppf(), ppf_out), IsSlightlyBelow(0.35));
+}
+
+TEST(JxlTest, RoundtripNoise) {
+  ThreadPool* pool = nullptr;
+  const PaddedBytes orig = jxl::test::ReadTestData(
+      "external/wesaturate/500px/cvo9xd_keong_macan_srgb8.png");
+  TestImage t;
+  t.DecodeFromBytes(orig).ClearMetadata();
+  ASSERT_NE(t.ppf().info.xsize, 0);
+  EXPECT_EQ(t.ppf().info.bits_per_sample, 8);
+  EXPECT_EQ(t.ppf().color_encoding.transfer_function,
+            JXL_TRANSFER_FUNCTION_SRGB);
+
+  JXLCompressParams cparams;
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 7);  // kSkirrel
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_NOISE, 1);
+
+  PackedPixelFile ppf_out;
+  EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, pool, &ppf_out), 41385, 750);
+  EXPECT_THAT(ButteraugliDistance(t.ppf(), ppf_out), IsSlightlyBelow(1.48));
+}
+
+TEST(JxlTest, RoundtripLossless8Gray) {
+  ThreadPool* pool = nullptr;
+  const PaddedBytes orig = jxl::test::ReadTestData(
+      "external/wesaturate/500px/cvo9xd_keong_macan_grayscale.png");
+  TestImage t;
+  t.SetColorEncoding("Gra_D65_Rel_SRG").DecodeFromBytes(orig).ClearMetadata();
+  EXPECT_EQ(t.ppf().color_encoding.color_space, JXL_COLOR_SPACE_GRAY);
+  EXPECT_EQ(t.ppf().info.bits_per_sample, 8);
+
+  JXLCompressParams cparams = CompressParamsForLossless();
+
+  JXLDecompressParams dparams;
+  dparams.accepted_formats.push_back(t.ppf().frames[0].color.format);
+
+  PackedPixelFile ppf_out;
+  EXPECT_EQ(Roundtrip(t.ppf(), cparams, dparams, pool, &ppf_out), 92766);
+  EXPECT_EQ(ComputeDistance2(t.ppf(), ppf_out), 0.0);
+  EXPECT_EQ(ppf_out.color_encoding.color_space, JXL_COLOR_SPACE_GRAY);
+  EXPECT_EQ(ppf_out.info.bits_per_sample, 8);
+}
+
+#if JPEGXL_ENABLE_GIF
+
+TEST(JxlTest, RoundtripAnimation) {
+  ThreadPool* pool = nullptr;
+  const PaddedBytes orig = jxl::test::ReadTestData("jxl/traffic_light.gif");
+  TestImage t;
+  t.DecodeFromBytes(orig).ClearMetadata();
+  EXPECT_EQ(4, t.ppf().frames.size());
+
+  JXLDecompressParams dparams;
+  dparams.accepted_formats.push_back(t.ppf().frames[0].color.format);
+
+  PackedPixelFile ppf_out;
+  EXPECT_THAT(Roundtrip(t.ppf(), {}, dparams, pool, &ppf_out),
+              IsSlightlyBelow(2600));
+
+  t.CoalesceGIFAnimationWithAlpha();
+  ASSERT_EQ(ppf_out.frames.size(), t.ppf().frames.size());
+  EXPECT_LE(ButteraugliDistance(t.ppf(), ppf_out),
+#if JXL_HIGH_PRECISION
+            1.55);
+#else
+            1.75);
+#endif
+}
+
+TEST(JxlTest, RoundtripLosslessAnimation) {
+  ThreadPool* pool = nullptr;
+  const PaddedBytes orig = jxl::test::ReadTestData("jxl/traffic_light.gif");
+  TestImage t;
+  t.DecodeFromBytes(orig).ClearMetadata();
+  EXPECT_EQ(4, t.ppf().frames.size());
+
+  JXLCompressParams cparams = CompressParamsForLossless();
+
+  JXLDecompressParams dparams;
+  dparams.accepted_formats.push_back(t.ppf().frames[0].color.format);
+
+  PackedPixelFile ppf_out;
+  EXPECT_EQ(Roundtrip(t.ppf(), cparams, dparams, pool, &ppf_out), 958);
+
+  t.CoalesceGIFAnimationWithAlpha();
+  ASSERT_EQ(ppf_out.frames.size(), t.ppf().frames.size());
+  EXPECT_LE(ButteraugliDistance(t.ppf(), ppf_out), 5e-4);
+}
+
+TEST(JxlTest, RoundtripAnimationPatches) {
+  ThreadPool* pool = nullptr;
+  const PaddedBytes orig = jxl::test::ReadTestData("jxl/animation_patches.gif");
+
+  TestImage t;
+  t.DecodeFromBytes(orig).ClearMetadata();
+  ASSERT_EQ(2u, t.ppf().frames.size());
+
+  JXLCompressParams cparams;
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_PATCHES, 1);
+
+  JXLDecompressParams dparams;
+  dparams.accepted_formats.push_back(t.ppf().frames[0].color.format);
+
+  PackedPixelFile ppf_out;
+  // 40k with no patches, 27k with patch frames encoded multiple times.
+  EXPECT_THAT(Roundtrip(t.ppf(), cparams, dparams, pool, &ppf_out),
+              IsSlightlyBelow(16710));
+  EXPECT_EQ(ppf_out.frames.size(), t.ppf().frames.size());
+  // >10 with broken patches
+  EXPECT_THAT(ButteraugliDistance(t.ppf(), ppf_out), IsSlightlyBelow(1.05));
+}
+
+#endif  // JPEGXL_ENABLE_GIF
+
+size_t RoundtripJpeg(const PaddedBytes& jpeg_in, ThreadPool* pool) {
+  std::vector<uint8_t> jpeg_bytes(jpeg_in.data(),
+                                  jpeg_in.data() + jpeg_in.size());
+  std::vector<uint8_t> compressed;
+  EXPECT_TRUE(extras::EncodeImageJXL({}, extras::PackedPixelFile(), &jpeg_bytes,
+                                     &compressed));
+
+  jxl::JXLDecompressParams dparams;
+  test::SetThreadParallelRunner(dparams, pool);
+  std::vector<uint8_t> out;
+  jxl::PackedPixelFile ppf;
+  EXPECT_TRUE(DecodeImageJXL(compressed.data(), compressed.size(), dparams,
+                             nullptr, &ppf, &out));
+  EXPECT_EQ(out.size(), jpeg_in.size());
+  size_t failures = 0;
+  for (size_t i = 0; i < std::min(out.size(), jpeg_in.size()); i++) {
+    if (out[i] != jpeg_in[i]) {
+      EXPECT_EQ(out[i], jpeg_in[i])
+          << "byte mismatch " << i << " " << out[i] << " != " << jpeg_in[i];
+      if (++failures > 4) {
+        return compressed.size();
+      }
+    }
+  }
+  return compressed.size();
+}
+
+void RoundtripJpegToPixels(const PaddedBytes& jpeg_in,
+                           JXLDecompressParams dparams, ThreadPool* pool,
+                           PackedPixelFile* ppf_out) {
+  std::vector<uint8_t> jpeg_bytes(jpeg_in.data(),
+                                  jpeg_in.data() + jpeg_in.size());
+  std::vector<uint8_t> compressed;
+  EXPECT_TRUE(extras::EncodeImageJXL({}, extras::PackedPixelFile(), &jpeg_bytes,
+                                     &compressed));
+
+  test::SetThreadParallelRunner(dparams, pool);
+  EXPECT_TRUE(DecodeImageJXL(compressed.data(), compressed.size(), dparams,
+                             nullptr, ppf_out, nullptr));
+}
+
+TEST(JxlTest, JXL_TRANSCODE_JPEG_TEST(RoundtripJpegRecompression444)) {
+  ThreadPoolForTests pool(8);
+  const PaddedBytes orig =
+      jxl::test::ReadTestData("jxl/flower/flower.png.im_q85_444.jpg");
+  // JPEG size is 696,659 bytes.
+  EXPECT_NEAR(RoundtripJpeg(orig, &pool), 568940u, 10);
+}
+
+#if JPEGXL_ENABLE_JPEG
+
+TEST(JxlTest, JXL_TRANSCODE_JPEG_TEST(RoundtripJpegRecompressionToPixels)) {
+  ThreadPoolForTests pool(8);
+  const PaddedBytes orig =
+      jxl::test::ReadTestData("jxl/flower/flower.png.im_q85_444.jpg");
+  TestImage t;
+  t.DecodeFromBytes(orig);
+
+  PackedPixelFile ppf_out;
+  RoundtripJpegToPixels(orig, {}, &pool, &ppf_out);
+  EXPECT_THAT(ComputeDistance2(t.ppf(), ppf_out), IsSlightlyBelow(12));
+}
+
+TEST(JxlTest, JXL_TRANSCODE_JPEG_TEST(RoundtripJpegRecompressionToPixels420)) {
+  ThreadPoolForTests pool(8);
+  const PaddedBytes orig =
+      jxl::test::ReadTestData("jxl/flower/flower.png.im_q85_420.jpg");
+  TestImage t;
+  t.DecodeFromBytes(orig);
+
+  PackedPixelFile ppf_out;
+  RoundtripJpegToPixels(orig, {}, &pool, &ppf_out);
+  EXPECT_THAT(ComputeDistance2(t.ppf(), ppf_out), IsSlightlyBelow(11));
+}
+
+TEST(JxlTest,
+     JXL_TRANSCODE_JPEG_TEST(RoundtripJpegRecompressionToPixels420EarlyFlush)) {
+  ThreadPoolForTests pool(8);
+  const PaddedBytes orig =
+      jxl::test::ReadTestData("jxl/flower/flower.png.im_q85_420.jpg");
+  TestImage t;
+  t.DecodeFromBytes(orig);
+
+  JXLDecompressParams dparams;
+  dparams.max_downsampling = 8;
+
+  PackedPixelFile ppf_out;
+  RoundtripJpegToPixels(orig, dparams, &pool, &ppf_out);
+  EXPECT_THAT(ComputeDistance2(t.ppf(), ppf_out), IsSlightlyBelow(4410));
+}
+
+TEST(JxlTest,
+     JXL_TRANSCODE_JPEG_TEST(RoundtripJpegRecompressionToPixels420Mul16)) {
+  ThreadPoolForTests pool(8);
+  const PaddedBytes orig =
+      jxl::test::ReadTestData("jxl/flower/flower_cropped.jpg");
+  TestImage t;
+  t.DecodeFromBytes(orig);
+
+  PackedPixelFile ppf_out;
+  RoundtripJpegToPixels(orig, {}, &pool, &ppf_out);
+  EXPECT_THAT(ComputeDistance2(t.ppf(), ppf_out), IsSlightlyBelow(4));
+}
+
+TEST(JxlTest,
+     JXL_TRANSCODE_JPEG_TEST(RoundtripJpegRecompressionToPixels_asymmetric)) {
+  ThreadPoolForTests pool(8);
+  const PaddedBytes orig =
+      jxl::test::ReadTestData("jxl/flower/flower.png.im_q85_asymmetric.jpg");
+  TestImage t;
+  t.DecodeFromBytes(orig);
+
+  PackedPixelFile ppf_out;
+  RoundtripJpegToPixels(orig, {}, &pool, &ppf_out);
+  EXPECT_THAT(ComputeDistance2(t.ppf(), ppf_out), IsSlightlyBelow(10));
+}
+
+#endif
+
+TEST(JxlTest, JXL_TRANSCODE_JPEG_TEST(RoundtripJpegRecompressionGray)) {
+  ThreadPoolForTests pool(8);
+  const PaddedBytes orig =
+      jxl::test::ReadTestData("jxl/flower/flower.png.im_q85_gray.jpg");
+  // JPEG size is 456,528 bytes.
+  EXPECT_NEAR(RoundtripJpeg(orig, &pool), 387496u, 200);
+}
+
+TEST(JxlTest, JXL_TRANSCODE_JPEG_TEST(RoundtripJpegRecompression420)) {
+  ThreadPoolForTests pool(8);
+  const PaddedBytes orig =
+      jxl::test::ReadTestData("jxl/flower/flower.png.im_q85_420.jpg");
+  // JPEG size is 546,797 bytes.
+  EXPECT_NEAR(RoundtripJpeg(orig, &pool), 455560u, 10);
+}
+
+TEST(JxlTest,
+     JXL_TRANSCODE_JPEG_TEST(RoundtripJpegRecompression_luma_subsample)) {
+  ThreadPoolForTests pool(8);
+  const PaddedBytes orig = jxl::test::ReadTestData(
+      "jxl/flower/flower.png.im_q85_luma_subsample.jpg");
+  // JPEG size is 400,724 bytes.
+  EXPECT_NEAR(RoundtripJpeg(orig, &pool), 325354u, 10);
+}
+
+TEST(JxlTest, JXL_TRANSCODE_JPEG_TEST(RoundtripJpegRecompression444_12)) {
+  // 444 JPEG that has an interesting sampling-factor (1x2, 1x2, 1x2).
+  ThreadPoolForTests pool(8);
+  const PaddedBytes orig =
+      jxl::test::ReadTestData("jxl/flower/flower.png.im_q85_444_1x2.jpg");
+  // JPEG size is 703,874 bytes.
+  EXPECT_NEAR(RoundtripJpeg(orig, &pool), 569679u, 10);
+}
+
+TEST(JxlTest, JXL_TRANSCODE_JPEG_TEST(RoundtripJpegRecompression422)) {
+  ThreadPoolForTests pool(8);
+  const PaddedBytes orig =
+      jxl::test::ReadTestData("jxl/flower/flower.png.im_q85_422.jpg");
+  // JPEG size is 522,057 bytes.
+  EXPECT_NEAR(RoundtripJpeg(orig, &pool), 499282u, 10);
+}
+
+TEST(JxlTest, JXL_TRANSCODE_JPEG_TEST(RoundtripJpegRecompression440)) {
+  ThreadPoolForTests pool(8);
+  const PaddedBytes orig =
+      jxl::test::ReadTestData("jxl/flower/flower.png.im_q85_440.jpg");
+  // JPEG size is 603,623 bytes.
+  EXPECT_NEAR(RoundtripJpeg(orig, &pool), 501151u, 10);
+}
+
+TEST(JxlTest, JXL_TRANSCODE_JPEG_TEST(RoundtripJpegRecompression_asymmetric)) {
+  // 2x vertical downsample of one chroma channel, 2x horizontal downsample of
+  // the other.
+  ThreadPoolForTests pool(8);
+  const PaddedBytes orig =
+      jxl::test::ReadTestData("jxl/flower/flower.png.im_q85_asymmetric.jpg");
+  // JPEG size is 604,601 bytes.
+  EXPECT_NEAR(RoundtripJpeg(orig, &pool), 500602u, 10);
+}
+
+TEST(JxlTest, JXL_TRANSCODE_JPEG_TEST(RoundtripJpegRecompression420Progr)) {
+  ThreadPoolForTests pool(8);
+  const PaddedBytes orig =
+      jxl::test::ReadTestData("jxl/flower/flower.png.im_q85_420_progr.jpg");
+  // JPEG size is 522,057 bytes.
+  EXPECT_NEAR(RoundtripJpeg(orig, &pool), 455499u, 10);
+}
+
+TEST(JxlTest, RoundtripProgressive) {
+  ThreadPoolForTests pool(4);
+  const PaddedBytes orig = jxl::test::ReadTestData("jxl/flower/flower.png");
+  TestImage t;
+  t.DecodeFromBytes(orig).ClearMetadata().SetDimensions(600, 1024);
+
+  JXLCompressParams cparams;
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_PROGRESSIVE_DC, 1);
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_PROGRESSIVE_AC, 1);
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_RESPONSIVE, 1);
+
+  PackedPixelFile ppf_out;
+  EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, &pool, &ppf_out), 61635, 750);
+  EXPECT_THAT(ButteraugliDistance(t.ppf(), ppf_out), IsSlightlyBelow(1.4));
+}
+
+TEST(JxlTest, RoundtripProgressiveLevel2Slow) {
+  ThreadPoolForTests pool(8);
+  const PaddedBytes orig = jxl::test::ReadTestData("jxl/flower/flower.png");
+  TestImage t;
+  t.DecodeFromBytes(orig).ClearMetadata().SetDimensions(600, 1024);
+
+  JXLCompressParams cparams;
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 9);  // kTortoise
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_PROGRESSIVE_DC, 2);
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_PROGRESSIVE_AC, 1);
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_RESPONSIVE, 1);
+
+  PackedPixelFile ppf_out;
+  EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, &pool, &ppf_out), 72841, 1000);
+  EXPECT_THAT(ButteraugliDistance(t.ppf(), ppf_out), IsSlightlyBelow(1.17));
+}
+
+TEST(JxlTest, RoundtripUnsignedCustomBitdepthLossless) {
+  ThreadPool* pool = nullptr;
+  for (uint32_t num_channels = 1; num_channels < 6; ++num_channels) {
+    for (JxlEndianness endianness : {JXL_LITTLE_ENDIAN, JXL_BIG_ENDIAN}) {
+      for (uint32_t bitdepth = 3; bitdepth <= 16; ++bitdepth) {
+        if (bitdepth <= 8 && endianness == JXL_BIG_ENDIAN) continue;
+        printf("Testing %u channel unsigned %u bit %s endian lossless.\n",
+               num_channels, bitdepth,
+               endianness == JXL_LITTLE_ENDIAN ? "little" : "big");
+        TestImage t;
+        t.SetDimensions(256, 256).SetChannels(num_channels);
+        t.SetAllBitDepths(bitdepth).SetEndianness(endianness);
+        TestImage::Frame frame = t.AddFrame();
+        frame.RandomFill();
+
+        JXLCompressParams cparams = CompressParamsForLossless();
+        cparams.input_bitdepth.type = JXL_BIT_DEPTH_FROM_CODESTREAM;
+
+        JXLDecompressParams dparams;
+        dparams.accepted_formats.push_back(t.ppf().frames[0].color.format);
+        dparams.output_bitdepth.type = JXL_BIT_DEPTH_FROM_CODESTREAM;
+
+        PackedPixelFile ppf_out;
+        Roundtrip(t.ppf(), cparams, dparams, pool, &ppf_out);
+
+        ASSERT_TRUE(test::SamePixels(t.ppf(), ppf_out));
+      }
+    }
+  }
+}
+
+TEST(JxlTest, LosslessPNMRoundtrip) {
+  static const char* kChannels[] = {"", "g", "ga", "rgb", "rgba"};
+  static const char* kExtension[] = {"", ".pgm", ".pam", ".ppm", ".pam"};
+  for (size_t bit_depth = 1; bit_depth <= 16; ++bit_depth) {
+    for (size_t channels = 1; channels <= 4; ++channels) {
+      if (bit_depth == 1 && (channels == 2 || channels == 4)) continue;
+      std::string extension(kExtension[channels]);
+      std::string filename = "jxl/flower/flower_small." +
+                             std::string(kChannels[channels]) + ".depth" +
+                             std::to_string(bit_depth) + extension;
+      const PaddedBytes orig = jxl::test::ReadTestData(filename);
+      test::TestImage t;
+      if (channels < 3) t.SetColorEncoding("Gra_D65_Rel_SRG");
+      t.DecodeFromBytes(orig);
+
+      JXLCompressParams cparams = CompressParamsForLossless();
+      cparams.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 1);  // kLightning
+      cparams.input_bitdepth.type = JXL_BIT_DEPTH_FROM_CODESTREAM;
+
+      JXLDecompressParams dparams;
+      dparams.accepted_formats.push_back(t.ppf().frames[0].color.format);
+      dparams.output_bitdepth.type = JXL_BIT_DEPTH_FROM_CODESTREAM;
+
+      PackedPixelFile ppf_out;
+      Roundtrip(t.ppf(), cparams, dparams, nullptr, &ppf_out);
+
+      extras::EncodedImage encoded;
+      auto encoder = extras::Encoder::FromExtension(extension);
+      ASSERT_TRUE(encoder.get());
+      ASSERT_TRUE(encoder->Encode(ppf_out, &encoded, nullptr));
+      ASSERT_EQ(encoded.bitstreams.size(), 1);
+      ASSERT_EQ(orig.size(), encoded.bitstreams[0].size());
+      EXPECT_EQ(0,
+                memcmp(orig.data(), encoded.bitstreams[0].data(), orig.size()));
+    }
+  }
+}
+
+}  // namespace
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/lehmer_code.h b/third_party/jpeg-xl/lib/jxl/lehmer_code.h
new file mode 100644
index 0000000000..dd1d21c6f7
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/lehmer_code.h
@@ -0,0 +1,102 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_LEHMER_CODE_H_
+#define LIB_JXL_LEHMER_CODE_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "lib/jxl/base/bits.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/status.h"
+
+namespace jxl {
+
+// Permutation <=> factorial base representation (Lehmer code).
+
+using LehmerT = uint32_t;
+
+template <typename T>
+constexpr T ValueOfLowest1Bit(T t) {
+  return t & -t;
+}
+
+// Computes the Lehmer (factorial basis) code of permutation, an array of n
+// unique indices in [0..n), and stores it in code[0..len). N*logN time.
+// temp must have n + 1 elements but need not be initialized.
+template <typename PermutationT>
+void ComputeLehmerCode(const PermutationT* JXL_RESTRICT permutation,
+                       uint32_t* JXL_RESTRICT temp, const size_t n,
+                       LehmerT* JXL_RESTRICT code) {
+  for (size_t idx = 0; idx < n + 1; ++idx) temp[idx] = 0;
+
+  for (size_t idx = 0; idx < n; ++idx) {
+    const PermutationT s = permutation[idx];
+
+    // Compute sum in Fenwick tree
+    uint32_t penalty = 0;
+    uint32_t i = s + 1;
+    while (i != 0) {
+      penalty += temp[i];
+      i &= i - 1;  // clear lowest bit
+    }
+    JXL_DASSERT(s >= penalty);
+    code[idx] = s - penalty;
+    i = s + 1;
+    // Add operation in Fenwick tree
+    while (i < n + 1) {
+      temp[i] += 1;
+      i += ValueOfLowest1Bit(i);
+    }
+  }
+}
+
+// Decodes the Lehmer code in code[0..n) into permutation[0..n).
+// temp must have 1 << CeilLog2(n) elements but need not be initialized.
+template <typename PermutationT>
+void DecodeLehmerCode(const LehmerT* JXL_RESTRICT code,
+                      uint32_t* JXL_RESTRICT temp, size_t n,
+                      PermutationT* JXL_RESTRICT permutation) {
+  JXL_DASSERT(n != 0);
+  const size_t log2n = CeilLog2Nonzero(n);
+  const size_t padded_n = 1ull << log2n;
+
+  for (size_t i = 0; i < padded_n; i++) {
+    const int32_t i1 = static_cast<int32_t>(i + 1);
+    temp[i] = static_cast<uint32_t>(ValueOfLowest1Bit(i1));
+  }
+
+  for (size_t i = 0; i < n; i++) {
+    JXL_DASSERT(code[i] + i < n);
+    uint32_t rank = code[i] + 1;
+
+    // Extract i-th unused element via implicit order-statistics tree.
+    size_t bit = padded_n;
+    size_t next = 0;
+    for (size_t i = 0; i <= log2n; i++) {
+      const size_t cand = next + bit;
+      JXL_DASSERT(cand >= 1);
+      bit >>= 1;
+      if (temp[cand - 1] < rank) {
+        next = cand;
+        rank -= temp[cand - 1];
+      }
+    }
+
+    permutation[i] = next;
+
+    // Mark as used
+    next += 1;
+    while (next <= padded_n) {
+      temp[next - 1] -= 1;
+      next += ValueOfLowest1Bit(next);
+    }
+  }
+}
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_LEHMER_CODE_H_
diff --git a/third_party/jpeg-xl/lib/jxl/lehmer_code_test.cc b/third_party/jpeg-xl/lib/jxl/lehmer_code_test.cc
new file mode 100644
index 0000000000..acda762545
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/lehmer_code_test.cc
@@ -0,0 +1,98 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/lehmer_code.h"
+
+#include <stdint.h>
+#include <string.h>
+
+#include <algorithm>
+#include <numeric>
+#include <vector>
+
+#include "lib/jxl/base/bits.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/random.h"
+#include "lib/jxl/test_utils.h"
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+namespace {
+
+template <typename PermutationT>
+struct WorkingSet {
+  explicit WorkingSet(size_t max_n)
+      : padded_n(1ull << CeilLog2Nonzero(max_n + 1)),
+        permutation(max_n),
+        temp(padded_n),
+        lehmer(max_n),
+        decoded(max_n) {}
+
+  size_t padded_n;
+  std::vector<PermutationT> permutation;
+  std::vector<uint32_t> temp;
+  std::vector<LehmerT> lehmer;
+  std::vector<PermutationT> decoded;
+};
+
+template <typename PermutationT>
+void Roundtrip(size_t n, WorkingSet<PermutationT>* ws) {
+  JXL_ASSERT(n != 0);
+  const size_t padded_n = 1ull << CeilLog2Nonzero(n);
+
+  Rng rng(n * 65537 + 13);
+
+  // Ensure indices fit into PermutationT
+  EXPECT_LE(n, 1ULL << (sizeof(PermutationT) * 8));
+
+  std::iota(ws->permutation.begin(), ws->permutation.begin() + n, 0);
+
+  // For various random permutations:
+  for (size_t rep = 0; rep < 3; ++rep) {
+    rng.Shuffle(ws->permutation.data(), n);
+
+    // Must decode to the same permutation
+    ComputeLehmerCode(ws->permutation.data(), ws->temp.data(), n,
+                      ws->lehmer.data());
+    memset(ws->temp.data(), 0, padded_n * 4);
+    DecodeLehmerCode(ws->lehmer.data(), ws->temp.data(), n, ws->decoded.data());
+
+    for (size_t i = 0; i < n; ++i) {
+      EXPECT_EQ(ws->permutation[i], ws->decoded[i]);
+    }
+  }
+}
+
+// Preallocates arrays and tests n = [begin, end).
+template <typename PermutationT>
+void RoundtripSizeRange(ThreadPool* pool, uint32_t begin, uint32_t end) {
+  ASSERT_NE(0u, begin);  // n = 0 not allowed.
+  std::vector<WorkingSet<PermutationT>> working_sets;
+
+  JXL_CHECK(RunOnPool(
+      pool, begin, end,
+      [&working_sets, end](const size_t num_threads) {
+        for (size_t i = 0; i < num_threads; i++) {
+          working_sets.emplace_back(end - 1);
+        }
+        return true;
+      },
+      [&working_sets](const uint32_t n, const size_t thread) {
+        Roundtrip(n, &working_sets[thread]);
+      },
+      "lehmer test"));
+}
+
+TEST(LehmerCodeTest, TestRoundtrips) {
+  test::ThreadPoolForTests pool(8);
+
+  RoundtripSizeRange<uint16_t>(&pool, 1, 1026);
+
+  // Ensures PermutationT can fit > 16 bit values.
+  RoundtripSizeRange<uint32_t>(&pool, 65536, 65540);
+}
+
+}  // namespace
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/libjxl.pc.in b/third_party/jpeg-xl/lib/jxl/libjxl.pc.in
new file mode 100644
index 0000000000..4a7af65b7c
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/libjxl.pc.in
@@ -0,0 +1,13 @@
+prefix=@CMAKE_INSTALL_PREFIX@
+exec_prefix=${prefix}
+libdir=@PKGCONFIG_TARGET_LIBS@
+includedir=@PKGCONFIG_TARGET_INCLUDES@
+
+Name: libjxl
+Description: Loads and saves JPEG XL files
+Version: @JPEGXL_LIBRARY_VERSION@
+Requires.private: @JPEGXL_LIBRARY_REQUIRES@
+Libs: -L${libdir} -ljxl
+Libs.private: -lm
+Cflags: -I${includedir}
+Cflags.private: -DJXL_STATIC_DEFINE
diff --git a/third_party/jpeg-xl/lib/jxl/loop_filter.cc b/third_party/jpeg-xl/lib/jxl/loop_filter.cc
new file mode 100644
index 0000000000..5afe87617d
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/loop_filter.cc
@@ -0,0 +1,98 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/loop_filter.h"
+
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/fields.h"
+
+namespace jxl {
+
+LoopFilter::LoopFilter() { Bundle::Init(this); }
+Status LoopFilter::VisitFields(Visitor* JXL_RESTRICT visitor) {
+  // Must come before AllDefault.
+
+  if (visitor->AllDefault(*this, &all_default)) {
+    // Overwrite all serialized fields, but not any nonserialized_*.
+    visitor->SetDefault(this);
+    return true;
+  }
+
+  JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(true, &gab));
+  if (visitor->Conditional(gab)) {
+    JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &gab_custom));
+    if (visitor->Conditional(gab_custom)) {
+      JXL_QUIET_RETURN_IF_ERROR(
+          visitor->F16(1.1 * 0.104699568f, &gab_x_weight1));
+      JXL_QUIET_RETURN_IF_ERROR(
+          visitor->F16(1.1 * 0.055680538f, &gab_x_weight2));
+      if (std::abs(1.0f + (gab_x_weight1 + gab_x_weight2) * 4) < 1e-8) {
+        return JXL_FAILURE(
+            "Gaborish x weights lead to near 0 unnormalized kernel");
+      }
+      JXL_QUIET_RETURN_IF_ERROR(
+          visitor->F16(1.1 * 0.104699568f, &gab_y_weight1));
+      JXL_QUIET_RETURN_IF_ERROR(
+          visitor->F16(1.1 * 0.055680538f, &gab_y_weight2));
+      if (std::abs(1.0f + (gab_y_weight1 + gab_y_weight2) * 4) < 1e-8) {
+        return JXL_FAILURE(
+            "Gaborish y weights lead to near 0 unnormalized kernel");
+      }
+      JXL_QUIET_RETURN_IF_ERROR(
+          visitor->F16(1.1 * 0.104699568f, &gab_b_weight1));
+      JXL_QUIET_RETURN_IF_ERROR(
+          visitor->F16(1.1 * 0.055680538f, &gab_b_weight2));
+      if (std::abs(1.0f + (gab_b_weight1 + gab_b_weight2) * 4) < 1e-8) {
+        return JXL_FAILURE(
+            "Gaborish b weights lead to near 0 unnormalized kernel");
+      }
+    }
+  }
+
+  JXL_QUIET_RETURN_IF_ERROR(visitor->Bits(2, 2, &epf_iters));
+  if (visitor->Conditional(epf_iters > 0)) {
+    if (visitor->Conditional(!nonserialized_is_modular)) {
+      JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &epf_sharp_custom));
+      if (visitor->Conditional(epf_sharp_custom)) {
+        for (size_t i = 0; i < kEpfSharpEntries; ++i) {
+          JXL_QUIET_RETURN_IF_ERROR(visitor->F16(
+              float(i) / float(kEpfSharpEntries - 1), &epf_sharp_lut[i]));
+        }
+      }
+    }
+
+    JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &epf_weight_custom));
+    if (visitor->Conditional(epf_weight_custom)) {
+      JXL_QUIET_RETURN_IF_ERROR(visitor->F16(40.0f, &epf_channel_scale[0]));
+      JXL_QUIET_RETURN_IF_ERROR(visitor->F16(5.0f, &epf_channel_scale[1]));
+      JXL_QUIET_RETURN_IF_ERROR(visitor->F16(3.5f, &epf_channel_scale[2]));
+      JXL_QUIET_RETURN_IF_ERROR(visitor->F16(0.45f, &epf_pass1_zeroflush));
+      JXL_QUIET_RETURN_IF_ERROR(visitor->F16(0.6f, &epf_pass2_zeroflush));
+    }
+
+    JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &epf_sigma_custom));
+    if (visitor->Conditional(epf_sigma_custom)) {
+      if (visitor->Conditional(!nonserialized_is_modular)) {
+        JXL_QUIET_RETURN_IF_ERROR(visitor->F16(0.46f, &epf_quant_mul));
+      }
+      JXL_QUIET_RETURN_IF_ERROR(visitor->F16(0.9f, &epf_pass0_sigma_scale));
+      JXL_QUIET_RETURN_IF_ERROR(visitor->F16(6.5f, &epf_pass2_sigma_scale));
+      JXL_QUIET_RETURN_IF_ERROR(
+          visitor->F16(0.6666666666666666f, &epf_border_sad_mul));
+    }
+    if (visitor->Conditional(nonserialized_is_modular)) {
+      JXL_QUIET_RETURN_IF_ERROR(visitor->F16(1.0f, &epf_sigma_for_modular));
+      if (epf_sigma_for_modular < 1e-8) {
+        return JXL_FAILURE("EPF: sigma for modular is too small");
+      }
+    }
+  }
+
+  JXL_QUIET_RETURN_IF_ERROR(visitor->BeginExtensions(&extensions));
+  // Extensions: in chronological order of being added to the format.
+  return visitor->EndExtensions();
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/loop_filter.h b/third_party/jpeg-xl/lib/jxl/loop_filter.h
new file mode 100644
index 0000000000..e4b418ba2b
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/loop_filter.h
@@ -0,0 +1,76 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_LOOP_FILTER_H_
+#define LIB_JXL_LOOP_FILTER_H_
+
+// Parameters for loop filter(s), stored in each frame.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/field_encodings.h"
+
+namespace jxl {
+
+struct LoopFilter : public Fields {
+  LoopFilter();
+  JXL_FIELDS_NAME(LoopFilter)
+
+  Status VisitFields(Visitor* JXL_RESTRICT visitor) override;
+
+  size_t Padding() const {
+    static const size_t padding_per_epf_iter[4] = {0, 2, 3, 6};
+    return padding_per_epf_iter[epf_iters] + (gab ? 1 : 0);
+  }
+
+  mutable bool all_default;
+
+  // --- Gaborish convolution
+  bool gab;
+
+  bool gab_custom;
+  float gab_x_weight1;
+  float gab_x_weight2;
+  float gab_y_weight1;
+  float gab_y_weight2;
+  float gab_b_weight1;
+  float gab_b_weight2;
+
+  // --- Edge-preserving filter
+
+  // Number of EPF stages to apply. 0 means EPF disabled. 1 applies only the
+  // first stage, 2 applies both stages and 3 applies the first stage twice and
+  // the second stage once.
+  uint32_t epf_iters;
+
+  bool epf_sharp_custom;
+  enum { kEpfSharpEntries = 8 };
+  float epf_sharp_lut[kEpfSharpEntries];
+
+  bool epf_weight_custom;      // Custom weight params
+  float epf_channel_scale[3];  // Relative weight of each channel
+  float epf_pass1_zeroflush;   // Minimum weight for first pass
+  float epf_pass2_zeroflush;   // Minimum weight for second pass
+
+  bool epf_sigma_custom;        // Custom sigma parameters
+  float epf_quant_mul;          // Sigma is ~ this * quant
+  float epf_pass0_sigma_scale;  // Multiplier for sigma in pass 0
+  float epf_pass2_sigma_scale;  // Multiplier for sigma in the second pass
+  float epf_border_sad_mul;     // (inverse) multiplier for sigma on borders
+
+  float epf_sigma_for_modular;
+
+  uint64_t extensions;
+
+  bool nonserialized_is_modular = false;
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_LOOP_FILTER_H_
diff --git a/third_party/jpeg-xl/lib/jxl/luminance.cc b/third_party/jpeg-xl/lib/jxl/luminance.cc
new file mode 100644
index 0000000000..10151f4267
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/luminance.cc
@@ -0,0 +1,26 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/luminance.h"
+
+#include "lib/jxl/image_metadata.h"
+
+namespace jxl {
+
+void SetIntensityTarget(ImageMetadata* m) {
+  if (m->color_encoding.tf.IsPQ()) {
+    // Peak luminance of PQ as defined by SMPTE ST 2084:2014.
+    m->SetIntensityTarget(10000);
+  } else if (m->color_encoding.tf.IsHLG()) {
+    // Nominal display peak luminance used as a reference by
+    // Rec. ITU-R BT.2100-2.
+    m->SetIntensityTarget(1000);
+  } else {
+    // SDR
+    m->SetIntensityTarget(kDefaultIntensityTarget);
+  }
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/luminance.h b/third_party/jpeg-xl/lib/jxl/luminance.h
new file mode 100644
index 0000000000..3181576823
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/luminance.h
@@ -0,0 +1,22 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_LUMINANCE_H_
+#define LIB_JXL_LUMINANCE_H_
+
+namespace jxl {
+
+// Chooses a default intensity target based on the transfer function of the
+// image, if known. For SDR images or images not known to be HDR, returns
+// kDefaultIntensityTarget, for images known to have PQ or HLG transfer function
+// returns a higher value.
+
+struct ImageMetadata;
+// TODO(eustas): rename
+void SetIntensityTarget(ImageMetadata* m);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_LUMINANCE_H_
diff --git a/third_party/jpeg-xl/lib/jxl/matrix_ops.h b/third_party/jpeg-xl/lib/jxl/matrix_ops.h
new file mode 100644
index 0000000000..1a969bd4f0
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/matrix_ops.h
@@ -0,0 +1,84 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_MATRIX_OPS_H_
+#define LIB_JXL_MATRIX_OPS_H_
+
+// 3x3 matrix operations.
+
+#include <cmath>  // abs
+#include <cstddef>
+
+#include "lib/jxl/base/status.h"
+
+namespace jxl {
+
+// Computes C = A * B, where A, B, C are 3x3 matrices.
+template <typename T>
+void Mul3x3Matrix(const T* a, const T* b, T* c) {
+  alignas(16) T temp[3];  // For transposed column
+  for (size_t x = 0; x < 3; x++) {
+    for (size_t z = 0; z < 3; z++) {
+      temp[z] = b[z * 3 + x];
+    }
+    for (size_t y = 0; y < 3; y++) {
+      double e = 0;
+      for (size_t z = 0; z < 3; z++) {
+        e += a[y * 3 + z] * temp[z];
+      }
+      c[y * 3 + x] = e;
+    }
+  }
+}
+
+// Computes C = A * B, where A is 3x3 matrix and B is vector.
+template <typename T>
+void Mul3x3Vector(const T* a, const T* b, T* c) {
+  for (size_t y = 0; y < 3; y++) {
+    double e = 0;
+    for (size_t x = 0; x < 3; x++) {
+      e += a[y * 3 + x] * b[x];
+    }
+    c[y] = e;
+  }
+}
+
+// Inverts a 3x3 matrix in place.
+template <typename T>
+Status Inv3x3Matrix(T* matrix) {
+  // Intermediate computation is done in double precision.
+  double temp[9];
+  temp[0] = static_cast<double>(matrix[4]) * matrix[8] -
+            static_cast<double>(matrix[5]) * matrix[7];
+  temp[1] = static_cast<double>(matrix[2]) * matrix[7] -
+            static_cast<double>(matrix[1]) * matrix[8];
+  temp[2] = static_cast<double>(matrix[1]) * matrix[5] -
+            static_cast<double>(matrix[2]) * matrix[4];
+  temp[3] = static_cast<double>(matrix[5]) * matrix[6] -
+            static_cast<double>(matrix[3]) * matrix[8];
+  temp[4] = static_cast<double>(matrix[0]) * matrix[8] -
+            static_cast<double>(matrix[2]) * matrix[6];
+  temp[5] = static_cast<double>(matrix[2]) * matrix[3] -
+            static_cast<double>(matrix[0]) * matrix[5];
+  temp[6] = static_cast<double>(matrix[3]) * matrix[7] -
+            static_cast<double>(matrix[4]) * matrix[6];
+  temp[7] = static_cast<double>(matrix[1]) * matrix[6] -
+            static_cast<double>(matrix[0]) * matrix[7];
+  temp[8] = static_cast<double>(matrix[0]) * matrix[4] -
+            static_cast<double>(matrix[1]) * matrix[3];
+  double det = matrix[0] * temp[0] + matrix[1] * temp[3] + matrix[2] * temp[6];
+  if (std::abs(det) < 1e-10) {
+    return JXL_FAILURE("Matrix determinant is too close to 0");
+  }
+  double idet = 1.0 / det;
+  for (size_t i = 0; i < 9; i++) {
+    matrix[i] = temp[i] * idet;
+  }
+  return true;
+}
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_MATRIX_OPS_H_
diff --git a/third_party/jpeg-xl/lib/jxl/memory_manager_internal.cc b/third_party/jpeg-xl/lib/jxl/memory_manager_internal.cc
new file mode 100644
index 0000000000..87727e75cd
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/memory_manager_internal.cc
@@ -0,0 +1,18 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/memory_manager_internal.h"
+
+#include <stdlib.h>
+
+namespace jxl {
+
+void* MemoryManagerDefaultAlloc(void* opaque, size_t size) {
+  return malloc(size);
+}
+
+void MemoryManagerDefaultFree(void* opaque, void* address) { free(address); }
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/memory_manager_internal.h b/third_party/jpeg-xl/lib/jxl/memory_manager_internal.h
new file mode 100644
index 0000000000..f8a5cd8d59
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/memory_manager_internal.h
@@ -0,0 +1,101 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_MEMORY_MANAGER_INTERNAL_H_
+#define LIB_JXL_MEMORY_MANAGER_INTERNAL_H_
+
+// Memory allocator with support for alignment + misalignment.
+
+#include <jxl/memory_manager.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>  // memcpy
+
+#include <atomic>
+#include <memory>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/status.h"
+
+namespace jxl {
+
+// Default alloc and free functions.
+void* MemoryManagerDefaultAlloc(void* opaque, size_t size);
+void MemoryManagerDefaultFree(void* opaque, void* address);
+
+// Initializes the memory manager instance with the passed one. The
+// MemoryManager passed in |memory_manager| may be NULL or contain NULL
+// functions which will be initialized with the default ones. If either alloc
+// or free are NULL, then both must be NULL, otherwise this function returns an
+// error.
+static JXL_INLINE Status MemoryManagerInit(
+    JxlMemoryManager* self, const JxlMemoryManager* memory_manager) {
+  if (memory_manager) {
+    *self = *memory_manager;
+  } else {
+    memset(self, 0, sizeof(*self));
+  }
+  if (!self->alloc != !self->free) {
+    return false;
+  }
+  if (!self->alloc) self->alloc = jxl::MemoryManagerDefaultAlloc;
+  if (!self->free) self->free = jxl::MemoryManagerDefaultFree;
+
+  return true;
+}
+
+static JXL_INLINE void* MemoryManagerAlloc(
+    const JxlMemoryManager* memory_manager, size_t size) {
+  return memory_manager->alloc(memory_manager->opaque, size);
+}
+
+static JXL_INLINE void MemoryManagerFree(const JxlMemoryManager* memory_manager,
+                                         void* address) {
+  return memory_manager->free(memory_manager->opaque, address);
+}
+
+// Helper class to be used as a deleter in a unique_ptr<T> call.
+class MemoryManagerDeleteHelper {
+ public:
+  explicit MemoryManagerDeleteHelper(const JxlMemoryManager* memory_manager)
+      : memory_manager_(memory_manager) {}
+
+  // Delete and free the passed pointer using the memory_manager.
+  template <typename T>
+  void operator()(T* address) const {
+    if (!address) {
+      return;
+    }
+    address->~T();
+    return memory_manager_->free(memory_manager_->opaque, address);
+  }
+
+ private:
+  const JxlMemoryManager* memory_manager_;
+};
+
+template <typename T>
+using MemoryManagerUniquePtr = std::unique_ptr<T, MemoryManagerDeleteHelper>;
+
+// Creates a new object T allocating it with the memory allocator into a
+// unique_ptr.
+template <typename T, typename... Args>
+JXL_INLINE MemoryManagerUniquePtr<T> MemoryManagerMakeUnique(
+    const JxlMemoryManager* memory_manager, Args&&... args) {
+  T* mem =
+      static_cast<T*>(memory_manager->alloc(memory_manager->opaque, sizeof(T)));
+  if (!mem) {
+    // Allocation error case.
+    return MemoryManagerUniquePtr<T>(nullptr,
+                                     MemoryManagerDeleteHelper(memory_manager));
+  }
+  return MemoryManagerUniquePtr<T>(new (mem) T(std::forward<Args>(args)...),
+                                   MemoryManagerDeleteHelper(memory_manager));
+}
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_MEMORY_MANAGER_INTERNAL_H_
diff --git a/third_party/jpeg-xl/lib/jxl/modular/encoding/context_predict.h b/third_party/jpeg-xl/lib/jxl/modular/encoding/context_predict.h
new file mode 100644
index 0000000000..914cd6a4e4
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/modular/encoding/context_predict.h
@@ -0,0 +1,626 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_MODULAR_ENCODING_CONTEXT_PREDICT_H_
+#define LIB_JXL_MODULAR_ENCODING_CONTEXT_PREDICT_H_
+
+#include <utility>
+#include <vector>
+
+#include "lib/jxl/fields.h"
+#include "lib/jxl/modular/modular_image.h"
+#include "lib/jxl/modular/options.h"
+
+namespace jxl {
+
+namespace weighted {
+constexpr static size_t kNumPredictors = 4;
+constexpr static int64_t kPredExtraBits = 3;
+constexpr static int64_t kPredictionRound = ((1 << kPredExtraBits) >> 1) - 1;
+constexpr static size_t kNumProperties = 1;
+
+struct Header : public Fields {
+  JXL_FIELDS_NAME(WeightedPredictorHeader)
+  // TODO(janwas): move to cc file, avoid including fields.h.
+  Header() { Bundle::Init(this); }
+
+  Status VisitFields(Visitor *JXL_RESTRICT visitor) override {
+    if (visitor->AllDefault(*this, &all_default)) {
+      // Overwrite all serialized fields, but not any nonserialized_*.
+      visitor->SetDefault(this);
+      return true;
+    }
+    auto visit_p = [visitor](pixel_type val, pixel_type *p) {
+      uint32_t up = *p;
+      JXL_QUIET_RETURN_IF_ERROR(visitor->Bits(5, val, &up));
+      *p = up;
+      return Status(true);
+    };
+    JXL_QUIET_RETURN_IF_ERROR(visit_p(16, &p1C));
+    JXL_QUIET_RETURN_IF_ERROR(visit_p(10, &p2C));
+    JXL_QUIET_RETURN_IF_ERROR(visit_p(7, &p3Ca));
+    JXL_QUIET_RETURN_IF_ERROR(visit_p(7, &p3Cb));
+    JXL_QUIET_RETURN_IF_ERROR(visit_p(7, &p3Cc));
+    JXL_QUIET_RETURN_IF_ERROR(visit_p(0, &p3Cd));
+    JXL_QUIET_RETURN_IF_ERROR(visit_p(0, &p3Ce));
+    JXL_QUIET_RETURN_IF_ERROR(visitor->Bits(4, 0xd, &w[0]));
+    JXL_QUIET_RETURN_IF_ERROR(visitor->Bits(4, 0xc, &w[1]));
+    JXL_QUIET_RETURN_IF_ERROR(visitor->Bits(4, 0xc, &w[2]));
+    JXL_QUIET_RETURN_IF_ERROR(visitor->Bits(4, 0xc, &w[3]));
+    return true;
+  }
+
+  bool all_default;
+  pixel_type p1C = 0, p2C = 0, p3Ca = 0, p3Cb = 0, p3Cc = 0, p3Cd = 0, p3Ce = 0;
+  uint32_t w[kNumPredictors] = {};
+};
+
+struct State {
+  pixel_type_w prediction[kNumPredictors] = {};
+  pixel_type_w pred = 0;  // *before* removing the added bits.
+  std::vector<uint32_t> pred_errors[kNumPredictors];
+  std::vector<int32_t> error;
+  const Header header;
+
+  // Allows to approximate division by a number from 1 to 64.
+  uint32_t divlookup[64];
+
+  constexpr static pixel_type_w AddBits(pixel_type_w x) {
+    return uint64_t(x) << kPredExtraBits;
+  }
+
+  State(Header header, size_t xsize, size_t ysize) : header(header) {
+    // Extra margin to avoid out-of-bounds writes.
+    // All have space for two rows of data.
+    for (size_t i = 0; i < 4; i++) {
+      pred_errors[i].resize((xsize + 2) * 2);
+    }
+    error.resize((xsize + 2) * 2);
+    // Initialize division lookup table.
+    for (int i = 0; i < 64; i++) {
+      divlookup[i] = (1 << 24) / (i + 1);
+    }
+  }
+
+  // Approximates 4+(maxweight<<24)/(x+1), avoiding division
+  JXL_INLINE uint32_t ErrorWeight(uint64_t x, uint32_t maxweight) const {
+    int shift = static_cast<int>(FloorLog2Nonzero(x + 1)) - 5;
+    if (shift < 0) shift = 0;
+    return 4 + ((maxweight * divlookup[x >> shift]) >> shift);
+  }
+
+  // Approximates the weighted average of the input values with the given
+  // weights, avoiding division. Weights must sum to at least 16.
+  JXL_INLINE pixel_type_w
+  WeightedAverage(const pixel_type_w *JXL_RESTRICT p,
+                  std::array<uint32_t, kNumPredictors> w) const {
+    uint32_t weight_sum = 0;
+    for (size_t i = 0; i < kNumPredictors; i++) {
+      weight_sum += w[i];
+    }
+    JXL_DASSERT(weight_sum > 15);
+    uint32_t log_weight = FloorLog2Nonzero(weight_sum);  // at least 4.
+    weight_sum = 0;
+    for (size_t i = 0; i < kNumPredictors; i++) {
+      w[i] >>= log_weight - 4;
+      weight_sum += w[i];
+    }
+    // for rounding.
+    pixel_type_w sum = (weight_sum >> 1) - 1;
+    for (size_t i = 0; i < kNumPredictors; i++) {
+      sum += p[i] * w[i];
+    }
+    return (sum * divlookup[weight_sum - 1]) >> 24;
+  }
+
+  template <bool compute_properties>
+  JXL_INLINE pixel_type_w Predict(size_t x, size_t y, size_t xsize,
+                                  pixel_type_w N, pixel_type_w W,
+                                  pixel_type_w NE, pixel_type_w NW,
+                                  pixel_type_w NN, Properties *properties,
+                                  size_t offset) {
+    size_t cur_row = y & 1 ? 0 : (xsize + 2);
+    size_t prev_row = y & 1 ? (xsize + 2) : 0;
+    size_t pos_N = prev_row + x;
+    size_t pos_NE = x < xsize - 1 ? pos_N + 1 : pos_N;
+    size_t pos_NW = x > 0 ? pos_N - 1 : pos_N;
+    std::array<uint32_t, kNumPredictors> weights;
+    for (size_t i = 0; i < kNumPredictors; i++) {
+      // pred_errors[pos_N] also contains the error of pixel W.
+      // pred_errors[pos_NW] also contains the error of pixel WW.
+      weights[i] = pred_errors[i][pos_N] + pred_errors[i][pos_NE] +
+                   pred_errors[i][pos_NW];
+      weights[i] = ErrorWeight(weights[i], header.w[i]);
+    }
+
+    N = AddBits(N);
+    W = AddBits(W);
+    NE = AddBits(NE);
+    NW = AddBits(NW);
+    NN = AddBits(NN);
+
+    pixel_type_w teW = x == 0 ? 0 : error[cur_row + x - 1];
+    pixel_type_w teN = error[pos_N];
+    pixel_type_w teNW = error[pos_NW];
+    pixel_type_w sumWN = teN + teW;
+    pixel_type_w teNE = error[pos_NE];
+
+    if (compute_properties) {
+      pixel_type_w p = teW;
+      if (std::abs(teN) > std::abs(p)) p = teN;
+      if (std::abs(teNW) > std::abs(p)) p = teNW;
+      if (std::abs(teNE) > std::abs(p)) p = teNE;
+      (*properties)[offset++] = p;
+    }
+
+    prediction[0] = W + NE - N;
+    prediction[1] = N - (((sumWN + teNE) * header.p1C) >> 5);
+    prediction[2] = W - (((sumWN + teNW) * header.p2C) >> 5);
+    prediction[3] =
+        N - ((teNW * header.p3Ca + teN * header.p3Cb + teNE * header.p3Cc +
+              (NN - N) * header.p3Cd + (NW - W) * header.p3Ce) >>
+             5);
+
+    pred = WeightedAverage(prediction, weights);
+
+    // If all three have the same sign, skip clamping.
+    if (((teN ^ teW) | (teN ^ teNW)) > 0) {
+      return (pred + kPredictionRound) >> kPredExtraBits;
+    }
+
+    // Otherwise, clamp to min/max of neighbouring pixels (just W, NE, N).
+    pixel_type_w mx = std::max(W, std::max(NE, N));
+    pixel_type_w mn = std::min(W, std::min(NE, N));
+    pred = std::max(mn, std::min(mx, pred));
+    return (pred + kPredictionRound) >> kPredExtraBits;
+  }
+
+  JXL_INLINE void UpdateErrors(pixel_type_w val, size_t x, size_t y,
+                               size_t xsize) {
+    size_t cur_row = y & 1 ? 0 : (xsize + 2);
+    size_t prev_row = y & 1 ? (xsize + 2) : 0;
+    val = AddBits(val);
+    error[cur_row + x] = pred - val;
+    for (size_t i = 0; i < kNumPredictors; i++) {
+      pixel_type_w err =
+          (std::abs(prediction[i] - val) + kPredictionRound) >> kPredExtraBits;
+      // For predicting in the next row.
+      pred_errors[i][cur_row + x] = err;
+      // Add the error on this pixel to the error on the NE pixel. This has the
+      // effect of adding the error on this pixel to the E and EE pixels.
+      pred_errors[i][prev_row + x + 1] += err;
+    }
+  }
+};
+
+// Encoder helper function to set the parameters to some presets.
+inline void PredictorMode(int i, Header *header) {
+  switch (i) {
+    case 0:
+      // ~ lossless16 predictor
+      header->w[0] = 0xd;
+      header->w[1] = 0xc;
+      header->w[2] = 0xc;
+      header->w[3] = 0xc;
+      header->p1C = 16;
+      header->p2C = 10;
+      header->p3Ca = 7;
+      header->p3Cb = 7;
+      header->p3Cc = 7;
+      header->p3Cd = 0;
+      header->p3Ce = 0;
+      break;
+    case 1:
+      // ~ default lossless8 predictor
+      header->w[0] = 0xd;
+      header->w[1] = 0xc;
+      header->w[2] = 0xc;
+      header->w[3] = 0xb;
+      header->p1C = 8;
+      header->p2C = 8;
+      header->p3Ca = 4;
+      header->p3Cb = 0;
+      header->p3Cc = 3;
+      header->p3Cd = 23;
+      header->p3Ce = 2;
+      break;
+    case 2:
+      // ~ west lossless8 predictor
+      header->w[0] = 0xd;
+      header->w[1] = 0xc;
+      header->w[2] = 0xd;
+      header->w[3] = 0xc;
+      header->p1C = 10;
+      header->p2C = 9;
+      header->p3Ca = 7;
+      header->p3Cb = 0;
+      header->p3Cc = 0;
+      header->p3Cd = 16;
+      header->p3Ce = 9;
+      break;
+    case 3:
+      // ~ north lossless8 predictor
+      header->w[0] = 0xd;
+      header->w[1] = 0xd;
+      header->w[2] = 0xc;
+      header->w[3] = 0xc;
+      header->p1C = 16;
+      header->p2C = 8;
+      header->p3Ca = 0;
+      header->p3Cb = 16;
+      header->p3Cc = 0;
+      header->p3Cd = 23;
+      header->p3Ce = 0;
+      break;
+    case 4:
+    default:
+      // something else, because why not
+      header->w[0] = 0xd;
+      header->w[1] = 0xc;
+      header->w[2] = 0xc;
+      header->w[3] = 0xc;
+      header->p1C = 10;
+      header->p2C = 10;
+      header->p3Ca = 5;
+      header->p3Cb = 5;
+      header->p3Cc = 5;
+      header->p3Cd = 12;
+      header->p3Ce = 4;
+      break;
+  }
+}
+}  // namespace weighted
+
+// Stores a node and its two children at the same time. This significantly
+// reduces the number of branches needed during decoding.
+struct FlatDecisionNode {
+  // Property + splitval of the top node.
+  int32_t property0;  // -1 if leaf.
+  union {
+    PropertyVal splitval0;
+    Predictor predictor;
+  };
+  uint32_t childID;  // childID is ctx id if leaf.
+  // Property+splitval of the two child nodes.
+  union {
+    PropertyVal splitvals[2];
+    int32_t multiplier;
+  };
+  union {
+    int32_t properties[2];
+    int64_t predictor_offset;
+  };
+};
+using FlatTree = std::vector<FlatDecisionNode>;
+
+class MATreeLookup {
+ public:
+  explicit MATreeLookup(const FlatTree &tree) : nodes_(tree) {}
+  struct LookupResult {
+    uint32_t context;
+    Predictor predictor;
+    int64_t offset;
+    int32_t multiplier;
+  };
+  JXL_INLINE LookupResult Lookup(const Properties &properties) const {
+    uint32_t pos = 0;
+    while (true) {
+      const FlatDecisionNode &node = nodes_[pos];
+      if (node.property0 < 0) {
+        return {node.childID, node.predictor, node.predictor_offset,
+                node.multiplier};
+      }
+      bool p0 = properties[node.property0] <= node.splitval0;
+      uint32_t off0 = properties[node.properties[0]] <= node.splitvals[0];
+      uint32_t off1 =
+          2 | (properties[node.properties[1]] <= node.splitvals[1] ? 1 : 0);
+      pos = node.childID + (p0 ? off1 : off0);
+    }
+  }
+
+ private:
+  const FlatTree &nodes_;
+};
+
+static constexpr size_t kExtraPropsPerChannel = 4;
+static constexpr size_t kNumNonrefProperties =
+    kNumStaticProperties + 13 + weighted::kNumProperties;
+
+constexpr size_t kWPProp = kNumNonrefProperties - weighted::kNumProperties;
+constexpr size_t kGradientProp = 9;
+
+// Clamps gradient to the min/max of n, w (and l, implicitly).
+static JXL_INLINE int32_t ClampedGradient(const int32_t n, const int32_t w,
+                                          const int32_t l) {
+  const int32_t m = std::min(n, w);
+  const int32_t M = std::max(n, w);
+  // The end result of this operation doesn't overflow or underflow if the
+  // result is between m and M, but the intermediate value may overflow, so we
+  // do the intermediate operations in uint32_t and check later if we had an
+  // overflow or underflow condition comparing m, M and l directly.
+  // grad = M + m - l = n + w - l
+  const int32_t grad =
+      static_cast<int32_t>(static_cast<uint32_t>(n) + static_cast<uint32_t>(w) -
+                           static_cast<uint32_t>(l));
+  // We use two sets of ternary operators to force the evaluation of them in
+  // any case, allowing the compiler to avoid branches and use cmovl/cmovg in
+  // x86.
+  const int32_t grad_clamp_M = (l < m) ? M : grad;
+  return (l > M) ? m : grad_clamp_M;
+}
+
+inline pixel_type_w Select(pixel_type_w a, pixel_type_w b, pixel_type_w c) {
+  pixel_type_w p = a + b - c;
+  pixel_type_w pa = std::abs(p - a);
+  pixel_type_w pb = std::abs(p - b);
+  return pa < pb ? a : b;
+}
+
+inline void PrecomputeReferences(const Channel &ch, size_t y,
+                                 const Image &image, uint32_t i,
+                                 Channel *references) {
+  ZeroFillImage(&references->plane);
+  uint32_t offset = 0;
+  size_t num_extra_props = references->w;
+  intptr_t onerow = references->plane.PixelsPerRow();
+  for (int32_t j = static_cast<int32_t>(i) - 1;
+       j >= 0 && offset < num_extra_props; j--) {
+    if (image.channel[j].w != image.channel[i].w ||
+        image.channel[j].h != image.channel[i].h) {
+      continue;
+    }
+    if (image.channel[j].hshift != image.channel[i].hshift) continue;
+    if (image.channel[j].vshift != image.channel[i].vshift) continue;
+    pixel_type *JXL_RESTRICT rp = references->Row(0) + offset;
+    const pixel_type *JXL_RESTRICT rpp = image.channel[j].Row(y);
+    const pixel_type *JXL_RESTRICT rpprev = image.channel[j].Row(y ? y - 1 : 0);
+    for (size_t x = 0; x < ch.w; x++, rp += onerow) {
+      pixel_type_w v = rpp[x];
+      rp[0] = std::abs(v);
+      rp[1] = v;
+      pixel_type_w vleft = (x ? rpp[x - 1] : 0);
+      pixel_type_w vtop = (y ? rpprev[x] : vleft);
+      pixel_type_w vtopleft = (x && y ? rpprev[x - 1] : vleft);
+      pixel_type_w vpredicted = ClampedGradient(vleft, vtop, vtopleft);
+      rp[2] = std::abs(v - vpredicted);
+      rp[3] = v - vpredicted;
+    }
+
+    offset += kExtraPropsPerChannel;
+  }
+}
+
+struct PredictionResult {
+  int context = 0;
+  pixel_type_w guess = 0;
+  Predictor predictor;
+  int32_t multiplier;
+};
+
+inline void InitPropsRow(
+    Properties *p,
+    const std::array<pixel_type, kNumStaticProperties> &static_props,
+    const int y) {
+  for (size_t i = 0; i < kNumStaticProperties; i++) {
+    (*p)[i] = static_props[i];
+  }
+  (*p)[2] = y;
+  (*p)[9] = 0;  // local gradient.
+}
+
+namespace detail {
+enum PredictorMode {
+  kUseTree = 1,
+  kUseWP = 2,
+  kForceComputeProperties = 4,
+  kAllPredictions = 8,
+  kNoEdgeCases = 16
+};
+
+JXL_INLINE pixel_type_w PredictOne(Predictor p, pixel_type_w left,
+                                   pixel_type_w top, pixel_type_w toptop,
+                                   pixel_type_w topleft, pixel_type_w topright,
+                                   pixel_type_w leftleft,
+                                   pixel_type_w toprightright,
+                                   pixel_type_w wp_pred) {
+  switch (p) {
+    case Predictor::Zero:
+      return pixel_type_w{0};
+    case Predictor::Left:
+      return left;
+    case Predictor::Top:
+      return top;
+    case Predictor::Select:
+      return Select(left, top, topleft);
+    case Predictor::Weighted:
+      return wp_pred;
+    case Predictor::Gradient:
+      return pixel_type_w{ClampedGradient(left, top, topleft)};
+    case Predictor::TopLeft:
+      return topleft;
+    case Predictor::TopRight:
+      return topright;
+    case Predictor::LeftLeft:
+      return leftleft;
+    case Predictor::Average0:
+      return (left + top) / 2;
+    case Predictor::Average1:
+      return (left + topleft) / 2;
+    case Predictor::Average2:
+      return (topleft + top) / 2;
+    case Predictor::Average3:
+      return (top + topright) / 2;
+    case Predictor::Average4:
+      return (6 * top - 2 * toptop + 7 * left + 1 * leftleft +
+              1 * toprightright + 3 * topright + 8) /
+             16;
+    default:
+      return pixel_type_w{0};
+  }
+}
+
+template <int mode>
+JXL_INLINE PredictionResult Predict(
+    Properties *p, size_t w, const pixel_type *JXL_RESTRICT pp,
+    const intptr_t onerow, const size_t x, const size_t y, Predictor predictor,
+    const MATreeLookup *lookup, const Channel *references,
+    weighted::State *wp_state, pixel_type_w *predictions) {
+  // We start in position 3 because of 2 static properties + y.
+  size_t offset = 3;
+  constexpr bool compute_properties =
+      mode & kUseTree || mode & kForceComputeProperties;
+  constexpr bool nec = mode & kNoEdgeCases;
+  pixel_type_w left = (nec || x ? pp[-1] : (y ? pp[-onerow] : 0));
+  pixel_type_w top = (nec || y ? pp[-onerow] : left);
+  pixel_type_w topleft = (nec || (x && y) ? pp[-1 - onerow] : left);
+  pixel_type_w topright = (nec || (x + 1 < w && y) ? pp[1 - onerow] : top);
+  pixel_type_w leftleft = (nec || x > 1 ? pp[-2] : left);
+  pixel_type_w toptop = (nec || y > 1 ? pp[-onerow - onerow] : top);
+  pixel_type_w toprightright =
+      (nec || (x + 2 < w && y) ? pp[2 - onerow] : topright);
+
+  if (compute_properties) {
+    // location
+    (*p)[offset++] = x;
+    // neighbors
+    (*p)[offset++] = std::abs(top);
+    (*p)[offset++] = std::abs(left);
+    (*p)[offset++] = top;
+    (*p)[offset++] = left;
+
+    // local gradient
+    (*p)[offset] = left - (*p)[offset + 1];
+    offset++;
+    // local gradient
+    (*p)[offset++] = left + top - topleft;
+
+    // FFV1 context properties
+    (*p)[offset++] = left - topleft;
+    (*p)[offset++] = topleft - top;
+    (*p)[offset++] = top - topright;
+    (*p)[offset++] = top - toptop;
+    (*p)[offset++] = left - leftleft;
+  }
+
+  pixel_type_w wp_pred = 0;
+  if (mode & kUseWP) {
+    wp_pred = wp_state->Predict<compute_properties>(
+        x, y, w, top, left, topright, topleft, toptop, p, offset);
+  }
+  if (!nec && compute_properties) {
+    offset += weighted::kNumProperties;
+    // Extra properties.
+    const pixel_type *JXL_RESTRICT rp = references->Row(x);
+    for (size_t i = 0; i < references->w; i++) {
+      (*p)[offset++] = rp[i];
+    }
+  }
+  PredictionResult result;
+  if (mode & kUseTree) {
+    MATreeLookup::LookupResult lr = lookup->Lookup(*p);
+    result.context = lr.context;
+    result.guess = lr.offset;
+    result.multiplier = lr.multiplier;
+    predictor = lr.predictor;
+  }
+  if (mode & kAllPredictions) {
+    for (size_t i = 0; i < kNumModularPredictors; i++) {
+      predictions[i] = PredictOne((Predictor)i, left, top, toptop, topleft,
+                                  topright, leftleft, toprightright, wp_pred);
+    }
+  }
+  result.guess += PredictOne(predictor, left, top, toptop, topleft, topright,
+                             leftleft, toprightright, wp_pred);
+  result.predictor = predictor;
+
+  return result;
+}
+}  // namespace detail
+
+inline PredictionResult PredictNoTreeNoWP(size_t w,
+                                          const pixel_type *JXL_RESTRICT pp,
+                                          const intptr_t onerow, const int x,
+                                          const int y, Predictor predictor) {
+  return detail::Predict</*mode=*/0>(
+      /*p=*/nullptr, w, pp, onerow, x, y, predictor, /*lookup=*/nullptr,
+      /*references=*/nullptr, /*wp_state=*/nullptr, /*predictions=*/nullptr);
+}
+
+inline PredictionResult PredictNoTreeWP(size_t w,
+                                        const pixel_type *JXL_RESTRICT pp,
+                                        const intptr_t onerow, const int x,
+                                        const int y, Predictor predictor,
+                                        weighted::State *wp_state) {
+  return detail::Predict<detail::kUseWP>(
+      /*p=*/nullptr, w, pp, onerow, x, y, predictor, /*lookup=*/nullptr,
+      /*references=*/nullptr, wp_state, /*predictions=*/nullptr);
+}
+
+inline PredictionResult PredictTreeNoWP(Properties *p, size_t w,
+                                        const pixel_type *JXL_RESTRICT pp,
+                                        const intptr_t onerow, const int x,
+                                        const int y,
+                                        const MATreeLookup &tree_lookup,
+                                        const Channel &references) {
+  return detail::Predict<detail::kUseTree>(
+      p, w, pp, onerow, x, y, Predictor::Zero, &tree_lookup, &references,
+      /*wp_state=*/nullptr, /*predictions=*/nullptr);
+}
+// Only use for y > 1, x > 1, x < w-2, and empty references
+JXL_INLINE PredictionResult
+PredictTreeNoWPNEC(Properties *p, size_t w, const pixel_type *JXL_RESTRICT pp,
+                   const intptr_t onerow, const int x, const int y,
+                   const MATreeLookup &tree_lookup, const Channel &references) {
+  return detail::Predict<detail::kUseTree | detail::kNoEdgeCases>(
+      p, w, pp, onerow, x, y, Predictor::Zero, &tree_lookup, &references,
+      /*wp_state=*/nullptr, /*predictions=*/nullptr);
+}
+
+inline PredictionResult PredictTreeWP(Properties *p, size_t w,
+                                      const pixel_type *JXL_RESTRICT pp,
+                                      const intptr_t onerow, const int x,
+                                      const int y,
+                                      const MATreeLookup &tree_lookup,
+                                      const Channel &references,
+                                      weighted::State *wp_state) {
+  return detail::Predict<detail::kUseTree | detail::kUseWP>(
+      p, w, pp, onerow, x, y, Predictor::Zero, &tree_lookup, &references,
+      wp_state, /*predictions=*/nullptr);
+}
+
+inline PredictionResult PredictLearn(Properties *p, size_t w,
+                                     const pixel_type *JXL_RESTRICT pp,
+                                     const intptr_t onerow, const int x,
+                                     const int y, Predictor predictor,
+                                     const Channel &references,
+                                     weighted::State *wp_state) {
+  return detail::Predict<detail::kForceComputeProperties | detail::kUseWP>(
+      p, w, pp, onerow, x, y, predictor, /*lookup=*/nullptr, &references,
+      wp_state, /*predictions=*/nullptr);
+}
+
+inline void PredictLearnAll(Properties *p, size_t w,
+                            const pixel_type *JXL_RESTRICT pp,
+                            const intptr_t onerow, const int x, const int y,
+                            const Channel &references,
+                            weighted::State *wp_state,
+                            pixel_type_w *predictions) {
+  detail::Predict<detail::kForceComputeProperties | detail::kUseWP |
+                  detail::kAllPredictions>(
+      p, w, pp, onerow, x, y, Predictor::Zero,
+      /*lookup=*/nullptr, &references, wp_state, predictions);
+}
+
+inline void PredictAllNoWP(size_t w, const pixel_type *JXL_RESTRICT pp,
+                           const intptr_t onerow, const int x, const int y,
+                           pixel_type_w *predictions) {
+  detail::Predict<detail::kAllPredictions>(
+      /*p=*/nullptr, w, pp, onerow, x, y, Predictor::Zero,
+      /*lookup=*/nullptr,
+      /*references=*/nullptr, /*wp_state=*/nullptr, predictions);
+}
+}  // namespace jxl
+
+#endif  // LIB_JXL_MODULAR_ENCODING_CONTEXT_PREDICT_H_
diff --git a/third_party/jpeg-xl/lib/jxl/modular/encoding/dec_ma.cc b/third_party/jpeg-xl/lib/jxl/modular/encoding/dec_ma.cc
new file mode 100644
index 0000000000..66562f7dfd
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/modular/encoding/dec_ma.cc
@@ -0,0 +1,107 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/modular/encoding/dec_ma.h"
+
+#include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/dec_ans.h"
+#include "lib/jxl/modular/encoding/ma_common.h"
+#include "lib/jxl/modular/modular_image.h"
+
+namespace jxl {
+
+namespace {
+
+Status ValidateTree(
+    const Tree &tree,
+    const std::vector<std::pair<pixel_type, pixel_type>> &prop_bounds,
+    size_t root) {
+  if (tree[root].property == -1) return true;
+  size_t p = tree[root].property;
+  int val = tree[root].splitval;
+  if (prop_bounds[p].first > val) return JXL_FAILURE("Invalid tree");
+  // Splitting at max value makes no sense: left range will be exactly same
+  // as parent, right range will be invalid (min > max).
+  if (prop_bounds[p].second <= val) return JXL_FAILURE("Invalid tree");
+  auto new_bounds = prop_bounds;
+  new_bounds[p].first = val + 1;
+  JXL_RETURN_IF_ERROR(ValidateTree(tree, new_bounds, tree[root].lchild));
+  new_bounds[p] = prop_bounds[p];
+  new_bounds[p].second = val;
+  return ValidateTree(tree, new_bounds, tree[root].rchild);
+}
+
+Status DecodeTree(BitReader *br, ANSSymbolReader *reader,
+                  const std::vector<uint8_t> &context_map, Tree *tree,
+                  size_t tree_size_limit) {
+  size_t leaf_id = 0;
+  size_t to_decode = 1;
+  tree->clear();
+  while (to_decode > 0) {
+    JXL_RETURN_IF_ERROR(br->AllReadsWithinBounds());
+    if (tree->size() > tree_size_limit) {
+      return JXL_FAILURE("Tree is too large: %" PRIuS " nodes vs %" PRIuS
+                         " max nodes",
+                         tree->size(), tree_size_limit);
+    }
+    to_decode--;
+    uint32_t prop1 = reader->ReadHybridUint(kPropertyContext, br, context_map);
+    if (prop1 > 256) return JXL_FAILURE("Invalid tree property value");
+    int property = prop1 - 1;
+    if (property == -1) {
+      size_t predictor =
+          reader->ReadHybridUint(kPredictorContext, br, context_map);
+      if (predictor >= kNumModularPredictors) {
+        return JXL_FAILURE("Invalid predictor");
+      }
+      int64_t predictor_offset =
+          UnpackSigned(reader->ReadHybridUint(kOffsetContext, br, context_map));
+      uint32_t mul_log =
+          reader->ReadHybridUint(kMultiplierLogContext, br, context_map);
+      if (mul_log >= 31) {
+        return JXL_FAILURE("Invalid multiplier logarithm");
+      }
+      uint32_t mul_bits =
+          reader->ReadHybridUint(kMultiplierBitsContext, br, context_map);
+      if (mul_bits + 1 >= 1u << (31u - mul_log)) {
+        return JXL_FAILURE("Invalid multiplier");
+      }
+      uint32_t multiplier = (mul_bits + 1U) << mul_log;
+      tree->emplace_back(-1, 0, leaf_id++, 0, static_cast<Predictor>(predictor),
+                         predictor_offset, multiplier);
+      continue;
+    }
+    int splitval =
+        UnpackSigned(reader->ReadHybridUint(kSplitValContext, br, context_map));
+    tree->emplace_back(property, splitval, tree->size() + to_decode + 1,
+                       tree->size() + to_decode + 2, Predictor::Zero, 0, 1);
+    to_decode += 2;
+  }
+  std::vector<std::pair<pixel_type, pixel_type>> prop_bounds;
+  prop_bounds.resize(256, {std::numeric_limits<pixel_type>::min(),
+                           std::numeric_limits<pixel_type>::max()});
+  return ValidateTree(*tree, prop_bounds, 0);
+}
+}  // namespace
+
+Status DecodeTree(BitReader *br, Tree *tree, size_t tree_size_limit) {
+  std::vector<uint8_t> tree_context_map;
+  ANSCode tree_code;
+  JXL_RETURN_IF_ERROR(
+      DecodeHistograms(br, kNumTreeContexts, &tree_code, &tree_context_map));
+  // TODO(eustas): investigate more infinite tree cases.
+  if (tree_code.degenerate_symbols[tree_context_map[kPropertyContext]] > 0) {
+    return JXL_FAILURE("Infinite tree");
+  }
+  ANSSymbolReader reader(&tree_code, br);
+  JXL_RETURN_IF_ERROR(DecodeTree(br, &reader, tree_context_map, tree,
+                                 std::min(tree_size_limit, kMaxTreeSize)));
+  if (!reader.CheckANSFinalState()) {
+    return JXL_FAILURE("ANS decode final state failed");
+  }
+  return true;
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/modular/encoding/dec_ma.h b/third_party/jpeg-xl/lib/jxl/modular/encoding/dec_ma.h
new file mode 100644
index 0000000000..a910c4deb1
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/modular/encoding/dec_ma.h
@@ -0,0 +1,66 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_MODULAR_ENCODING_DEC_MA_H_
+#define LIB_JXL_MODULAR_ENCODING_DEC_MA_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <vector>
+
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/modular/options.h"
+
+namespace jxl {
+
+// inner nodes
+struct PropertyDecisionNode {
+  PropertyVal splitval;
+  int16_t property;  // -1: leaf node, lchild points to leaf node
+  uint32_t lchild;
+  uint32_t rchild;
+  Predictor predictor;
+  int64_t predictor_offset;
+  uint32_t multiplier;
+
+  PropertyDecisionNode(int p, int split_val, int lchild, int rchild,
+                       Predictor predictor, int64_t predictor_offset,
+                       uint32_t multiplier)
+      : splitval(split_val),
+        property(p),
+        lchild(lchild),
+        rchild(rchild),
+        predictor(predictor),
+        predictor_offset(predictor_offset),
+        multiplier(multiplier) {}
+  PropertyDecisionNode()
+      : splitval(0),
+        property(-1),
+        lchild(0),
+        rchild(0),
+        predictor(Predictor::Zero),
+        predictor_offset(0),
+        multiplier(1) {}
+  static PropertyDecisionNode Leaf(Predictor predictor, int64_t offset = 0,
+                                   uint32_t multiplier = 1) {
+    return PropertyDecisionNode(-1, 0, 0, 0, predictor, offset, multiplier);
+  }
+  static PropertyDecisionNode Split(int p, int split_val, int lchild,
+                                    int rchild = -1) {
+    if (rchild == -1) rchild = lchild + 1;
+    return PropertyDecisionNode(p, split_val, lchild, rchild, Predictor::Zero,
+                                0, 1);
+  }
+};
+
+using Tree = std::vector<PropertyDecisionNode>;
+
+Status DecodeTree(BitReader *br, Tree *tree, size_t tree_size_limit);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_MODULAR_ENCODING_DEC_MA_H_
diff --git a/third_party/jpeg-xl/lib/jxl/modular/encoding/enc_debug_tree.cc b/third_party/jpeg-xl/lib/jxl/modular/encoding/enc_debug_tree.cc
new file mode 100644
index 0000000000..f2a1705e4b
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/modular/encoding/enc_debug_tree.cc
@@ -0,0 +1,124 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/modular/encoding/enc_debug_tree.h"
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#include "lib/jxl/base/os_macros.h"
+#include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/modular/encoding/context_predict.h"
+#include "lib/jxl/modular/encoding/dec_ma.h"
+#include "lib/jxl/modular/options.h"
+
+#if JXL_OS_IOS
+#define JXL_ENABLE_DOT 0
+#else
+#define JXL_ENABLE_DOT 1  // iOS lacks C89 system()
+#endif
+
+namespace jxl {
+
+const char *PredictorName(Predictor p) {
+  switch (p) {
+    case Predictor::Zero:
+      return "Zero";
+    case Predictor::Left:
+      return "Left";
+    case Predictor::Top:
+      return "Top";
+    case Predictor::Average0:
+      return "Avg0";
+    case Predictor::Average1:
+      return "Avg1";
+    case Predictor::Average2:
+      return "Avg2";
+    case Predictor::Average3:
+      return "Avg3";
+    case Predictor::Average4:
+      return "Avg4";
+    case Predictor::Select:
+      return "Sel";
+    case Predictor::Gradient:
+      return "Grd";
+    case Predictor::Weighted:
+      return "Wgh";
+    case Predictor::TopLeft:
+      return "TopL";
+    case Predictor::TopRight:
+      return "TopR";
+    case Predictor::LeftLeft:
+      return "LL";
+    default:
+      return "INVALID";
+  };
+}
+
+std::string PropertyName(size_t i) {
+  static_assert(kNumNonrefProperties == 16, "Update this function");
+  switch (i) {
+    case 0:
+      return "c";
+    case 1:
+      return "g";
+    case 2:
+      return "y";
+    case 3:
+      return "x";
+    case 4:
+      return "|N|";
+    case 5:
+      return "|W|";
+    case 6:
+      return "N";
+    case 7:
+      return "W";
+    case 8:
+      return "W-WW-NW+NWW";
+    case 9:
+      return "W+N-NW";
+    case 10:
+      return "W-NW";
+    case 11:
+      return "NW-N";
+    case 12:
+      return "N-NE";
+    case 13:
+      return "N-NN";
+    case 14:
+      return "W-WW";
+    case 15:
+      return "WGH";
+    default:
+      return "ch[" + ToString(15 - (int)i) + "]";
+  }
+}
+
+void PrintTree(const Tree &tree, const std::string &path) {
+  FILE *f = fopen((path + ".dot").c_str(), "w");
+  fprintf(f, "graph{\n");
+  for (size_t cur = 0; cur < tree.size(); cur++) {
+    if (tree[cur].property < 0) {
+      fprintf(f, "n%05" PRIuS " [label=\"%s%+" PRId64 " (x%u)\"];\n", cur,
+              PredictorName(tree[cur].predictor), tree[cur].predictor_offset,
+              tree[cur].multiplier);
+    } else {
+      fprintf(f, "n%05" PRIuS " [label=\"%s>%d\"];\n", cur,
+              PropertyName(tree[cur].property).c_str(), tree[cur].splitval);
+      fprintf(f, "n%05" PRIuS " -- n%05d;\n", cur, tree[cur].lchild);
+      fprintf(f, "n%05" PRIuS " -- n%05d;\n", cur, tree[cur].rchild);
+    }
+  }
+  fprintf(f, "}\n");
+  fclose(f);
+#if JXL_ENABLE_DOT
+  JXL_ASSERT(
+      system(("dot " + path + ".dot -T svg -o " + path + ".svg").c_str()) == 0);
+#endif
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/modular/encoding/enc_debug_tree.h b/third_party/jpeg-xl/lib/jxl/modular/encoding/enc_debug_tree.h
new file mode 100644
index 0000000000..78deaab1b8
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/modular/encoding/enc_debug_tree.h
@@ -0,0 +1,27 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_MODULAR_ENCODING_ENC_DEBUG_TREE_H_
+#define LIB_JXL_MODULAR_ENCODING_ENC_DEBUG_TREE_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <string>
+#include <vector>
+
+#include "lib/jxl/modular/encoding/dec_ma.h"
+#include "lib/jxl/modular/options.h"
+
+namespace jxl {
+
+const char *PredictorName(Predictor p);
+std::string PropertyName(size_t i);
+
+void PrintTree(const Tree &tree, const std::string &path);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_MODULAR_ENCODING_ENC_DEBUG_TREE_H_
diff --git a/third_party/jpeg-xl/lib/jxl/modular/encoding/enc_encoding.cc b/third_party/jpeg-xl/lib/jxl/modular/encoding/enc_encoding.cc
new file mode 100644
index 0000000000..c8c183335e
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/modular/encoding/enc_encoding.cc
@@ -0,0 +1,562 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#include <cinttypes>
+#include <limits>
+#include <numeric>
+#include <queue>
+#include <set>
+#include <unordered_map>
+#include <unordered_set>
+
+#include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/dec_ans.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/enc_ans.h"
+#include "lib/jxl/enc_aux_out.h"
+#include "lib/jxl/enc_bit_writer.h"
+#include "lib/jxl/enc_fields.h"
+#include "lib/jxl/entropy_coder.h"
+#include "lib/jxl/fields.h"
+#include "lib/jxl/image_ops.h"
+#include "lib/jxl/modular/encoding/context_predict.h"
+#include "lib/jxl/modular/encoding/enc_debug_tree.h"
+#include "lib/jxl/modular/encoding/enc_ma.h"
+#include "lib/jxl/modular/encoding/encoding.h"
+#include "lib/jxl/modular/encoding/ma_common.h"
+#include "lib/jxl/modular/options.h"
+#include "lib/jxl/modular/transform/transform.h"
+#include "lib/jxl/toc.h"
+
+namespace jxl {
+
+namespace {
+// Plot tree (if enabled) and predictor usage map.
+constexpr bool kWantDebug = false;
+constexpr bool kPrintTree = false;
+
+inline std::array<uint8_t, 3> PredictorColor(Predictor p) {
+  switch (p) {
+    case Predictor::Zero:
+      return {{0, 0, 0}};
+    case Predictor::Left:
+      return {{255, 0, 0}};
+    case Predictor::Top:
+      return {{0, 255, 0}};
+    case Predictor::Average0:
+      return {{0, 0, 255}};
+    case Predictor::Average4:
+      return {{192, 128, 128}};
+    case Predictor::Select:
+      return {{255, 255, 0}};
+    case Predictor::Gradient:
+      return {{255, 0, 255}};
+    case Predictor::Weighted:
+      return {{0, 255, 255}};
+      // TODO
+    default:
+      return {{255, 255, 255}};
+  };
+}
+
+}  // namespace
+
+void GatherTreeData(const Image &image, pixel_type chan, size_t group_id,
+                    const weighted::Header &wp_header,
+                    const ModularOptions &options, TreeSamples &tree_samples,
+                    size_t *total_pixels) {
+  const Channel &channel = image.channel[chan];
+
+  JXL_DEBUG_V(7, "Learning %" PRIuS "x%" PRIuS " channel %d", channel.w,
+              channel.h, chan);
+
+  std::array<pixel_type, kNumStaticProperties> static_props = {
+      {chan, (int)group_id}};
+  Properties properties(kNumNonrefProperties +
+                        kExtraPropsPerChannel * options.max_properties);
+  double pixel_fraction = std::min(1.0f, options.nb_repeats);
+  // a fraction of 0 is used to disable learning entirely.
+  if (pixel_fraction > 0) {
+    pixel_fraction = std::max(pixel_fraction,
+                              std::min(1.0, 1024.0 / (channel.w * channel.h)));
+  }
+  uint64_t threshold =
+      (std::numeric_limits<uint64_t>::max() >> 32) * pixel_fraction;
+  uint64_t s[2] = {static_cast<uint64_t>(0x94D049BB133111EBull),
+                   static_cast<uint64_t>(0xBF58476D1CE4E5B9ull)};
+  // Xorshift128+ adapted from xorshift128+-inl.h
+  auto use_sample = [&]() {
+    auto s1 = s[0];
+    const auto s0 = s[1];
+    const auto bits = s1 + s0;  // b, c
+    s[0] = s0;
+    s1 ^= s1 << 23;
+    s1 ^= s0 ^ (s1 >> 18) ^ (s0 >> 5);
+    s[1] = s1;
+    return (bits >> 32) <= threshold;
+  };
+
+  const intptr_t onerow = channel.plane.PixelsPerRow();
+  Channel references(properties.size() - kNumNonrefProperties, channel.w);
+  weighted::State wp_state(wp_header, channel.w, channel.h);
+  tree_samples.PrepareForSamples(pixel_fraction * channel.h * channel.w + 64);
+  for (size_t y = 0; y < channel.h; y++) {
+    const pixel_type *JXL_RESTRICT p = channel.Row(y);
+    PrecomputeReferences(channel, y, image, chan, &references);
+    InitPropsRow(&properties, static_props, y);
+    // TODO(veluca): avoid computing WP if we don't use its property or
+    // predictions.
+    for (size_t x = 0; x < channel.w; x++) {
+      pixel_type_w pred[kNumModularPredictors];
+      if (tree_samples.NumPredictors() != 1) {
+        PredictLearnAll(&properties, channel.w, p + x, onerow, x, y, references,
+                        &wp_state, pred);
+      } else {
+        pred[static_cast<int>(tree_samples.PredictorFromIndex(0))] =
+            PredictLearn(&properties, channel.w, p + x, onerow, x, y,
+                         tree_samples.PredictorFromIndex(0), references,
+                         &wp_state)
+                .guess;
+      }
+      (*total_pixels)++;
+      if (use_sample()) {
+        tree_samples.AddSample(p[x], properties, pred);
+      }
+      wp_state.UpdateErrors(p[x], x, y, channel.w);
+    }
+  }
+}
+
+Tree LearnTree(TreeSamples &&tree_samples, size_t total_pixels,
+               const ModularOptions &options,
+               const std::vector<ModularMultiplierInfo> &multiplier_info = {},
+               StaticPropRange static_prop_range = {}) {
+  for (size_t i = 0; i < kNumStaticProperties; i++) {
+    if (static_prop_range[i][1] == 0) {
+      static_prop_range[i][1] = std::numeric_limits<uint32_t>::max();
+    }
+  }
+  if (!tree_samples.HasSamples()) {
+    Tree tree;
+    tree.emplace_back();
+    tree.back().predictor = tree_samples.PredictorFromIndex(0);
+    tree.back().property = -1;
+    tree.back().predictor_offset = 0;
+    tree.back().multiplier = 1;
+    return tree;
+  }
+  float pixel_fraction = tree_samples.NumSamples() * 1.0f / total_pixels;
+  float required_cost = pixel_fraction * 0.9 + 0.1;
+  tree_samples.AllSamplesDone();
+  Tree tree;
+  ComputeBestTree(tree_samples,
+                  options.splitting_heuristics_node_threshold * required_cost,
+                  multiplier_info, static_prop_range,
+                  options.fast_decode_multiplier, &tree);
+  return tree;
+}
+
+Status EncodeModularChannelMAANS(const Image &image, pixel_type chan,
+                                 const weighted::Header &wp_header,
+                                 const Tree &global_tree, Token **tokenpp,
+                                 AuxOut *aux_out, size_t group_id,
+                                 bool skip_encoder_fast_path) {
+  const Channel &channel = image.channel[chan];
+  Token *tokenp = *tokenpp;
+  JXL_ASSERT(channel.w != 0 && channel.h != 0);
+
+  Image3F predictor_img;
+  if (kWantDebug) predictor_img = Image3F(channel.w, channel.h);
+
+  JXL_DEBUG_V(6,
+              "Encoding %" PRIuS "x%" PRIuS
+              " channel %d, "
+              "(shift=%i,%i)",
+              channel.w, channel.h, chan, channel.hshift, channel.vshift);
+
+  std::array<pixel_type, kNumStaticProperties> static_props = {
+      {chan, (int)group_id}};
+  bool use_wp, is_wp_only;
+  bool is_gradient_only;
+  size_t num_props;
+  FlatTree tree = FilterTree(global_tree, static_props, &num_props, &use_wp,
+                             &is_wp_only, &is_gradient_only);
+  Properties properties(num_props);
+  MATreeLookup tree_lookup(tree);
+  JXL_DEBUG_V(3, "Encoding using a MA tree with %" PRIuS " nodes", tree.size());
+
+  // Check if this tree is a WP-only tree with a small enough property value
+  // range.
+  // Initialized to avoid clang-tidy complaining.
+  uint16_t context_lookup[2 * kPropRangeFast] = {};
+  int8_t offsets[2 * kPropRangeFast] = {};
+  if (is_wp_only) {
+    is_wp_only = TreeToLookupTable(tree, context_lookup, offsets);
+  }
+  if (is_gradient_only) {
+    is_gradient_only = TreeToLookupTable(tree, context_lookup, offsets);
+  }
+
+  if (is_wp_only && !skip_encoder_fast_path) {
+    for (size_t c = 0; c < 3; c++) {
+      FillImage(static_cast<float>(PredictorColor(Predictor::Weighted)[c]),
+                &predictor_img.Plane(c));
+    }
+    const intptr_t onerow = channel.plane.PixelsPerRow();
+    weighted::State wp_state(wp_header, channel.w, channel.h);
+    Properties properties(1);
+    for (size_t y = 0; y < channel.h; y++) {
+      const pixel_type *JXL_RESTRICT r = channel.Row(y);
+      for (size_t x = 0; x < channel.w; x++) {
+        size_t offset = 0;
+        pixel_type_w left = (x ? r[x - 1] : y ? *(r + x - onerow) : 0);
+        pixel_type_w top = (y ? *(r + x - onerow) : left);
+        pixel_type_w topleft = (x && y ? *(r + x - 1 - onerow) : left);
+        pixel_type_w topright =
+            (x + 1 < channel.w && y ? *(r + x + 1 - onerow) : top);
+        pixel_type_w toptop = (y > 1 ? *(r + x - onerow - onerow) : top);
+        int32_t guess = wp_state.Predict</*compute_properties=*/true>(
+            x, y, channel.w, top, left, topright, topleft, toptop, &properties,
+            offset);
+        uint32_t pos =
+            kPropRangeFast + std::min(std::max(-kPropRangeFast, properties[0]),
+                                      kPropRangeFast - 1);
+        uint32_t ctx_id = context_lookup[pos];
+        int32_t residual = r[x] - guess - offsets[pos];
+        *tokenp++ = Token(ctx_id, PackSigned(residual));
+        wp_state.UpdateErrors(r[x], x, y, channel.w);
+      }
+    }
+  } else if (tree.size() == 1 && tree[0].predictor == Predictor::Gradient &&
+             tree[0].multiplier == 1 && tree[0].predictor_offset == 0 &&
+             !skip_encoder_fast_path) {
+    for (size_t c = 0; c < 3; c++) {
+      FillImage(static_cast<float>(PredictorColor(Predictor::Gradient)[c]),
+                &predictor_img.Plane(c));
+    }
+    const intptr_t onerow = channel.plane.PixelsPerRow();
+    for (size_t y = 0; y < channel.h; y++) {
+      const pixel_type *JXL_RESTRICT r = channel.Row(y);
+      for (size_t x = 0; x < channel.w; x++) {
+        pixel_type_w left = (x ? r[x - 1] : y ? *(r + x - onerow) : 0);
+        pixel_type_w top = (y ? *(r + x - onerow) : left);
+        pixel_type_w topleft = (x && y ? *(r + x - 1 - onerow) : left);
+        int32_t guess = ClampedGradient(top, left, topleft);
+        int32_t residual = r[x] - guess;
+        *tokenp++ = Token(tree[0].childID, PackSigned(residual));
+      }
+    }
+  } else if (is_gradient_only && !skip_encoder_fast_path) {
+    for (size_t c = 0; c < 3; c++) {
+      FillImage(static_cast<float>(PredictorColor(Predictor::Gradient)[c]),
+                &predictor_img.Plane(c));
+    }
+    const intptr_t onerow = channel.plane.PixelsPerRow();
+    for (size_t y = 0; y < channel.h; y++) {
+      const pixel_type *JXL_RESTRICT r = channel.Row(y);
+      for (size_t x = 0; x < channel.w; x++) {
+        pixel_type_w left = (x ? r[x - 1] : y ? *(r + x - onerow) : 0);
+        pixel_type_w top = (y ? *(r + x - onerow) : left);
+        pixel_type_w topleft = (x && y ? *(r + x - 1 - onerow) : left);
+        int32_t guess = ClampedGradient(top, left, topleft);
+        uint32_t pos =
+            kPropRangeFast +
+            std::min<pixel_type_w>(
+                std::max<pixel_type_w>(-kPropRangeFast, top + left - topleft),
+                kPropRangeFast - 1);
+        uint32_t ctx_id = context_lookup[pos];
+        int32_t residual = r[x] - guess - offsets[pos];
+        *tokenp++ = Token(ctx_id, PackSigned(residual));
+      }
+    }
+  } else if (tree.size() == 1 && tree[0].predictor == Predictor::Zero &&
+             tree[0].multiplier == 1 && tree[0].predictor_offset == 0 &&
+             !skip_encoder_fast_path) {
+    for (size_t c = 0; c < 3; c++) {
+      FillImage(static_cast<float>(PredictorColor(Predictor::Zero)[c]),
+                &predictor_img.Plane(c));
+    }
+    for (size_t y = 0; y < channel.h; y++) {
+      const pixel_type *JXL_RESTRICT p = channel.Row(y);
+      for (size_t x = 0; x < channel.w; x++) {
+        *tokenp++ = Token(tree[0].childID, PackSigned(p[x]));
+      }
+    }
+  } else if (tree.size() == 1 && tree[0].predictor != Predictor::Weighted &&
+             (tree[0].multiplier & (tree[0].multiplier - 1)) == 0 &&
+             tree[0].predictor_offset == 0 && !skip_encoder_fast_path) {
+    // multiplier is a power of 2.
+    for (size_t c = 0; c < 3; c++) {
+      FillImage(static_cast<float>(PredictorColor(tree[0].predictor)[c]),
+                &predictor_img.Plane(c));
+    }
+    uint32_t mul_shift = FloorLog2Nonzero((uint32_t)tree[0].multiplier);
+    const intptr_t onerow = channel.plane.PixelsPerRow();
+    for (size_t y = 0; y < channel.h; y++) {
+      const pixel_type *JXL_RESTRICT r = channel.Row(y);
+      for (size_t x = 0; x < channel.w; x++) {
+        PredictionResult pred = PredictNoTreeNoWP(channel.w, r + x, onerow, x,
+                                                  y, tree[0].predictor);
+        pixel_type_w residual = r[x] - pred.guess;
+        JXL_DASSERT((residual >> mul_shift) * tree[0].multiplier == residual);
+        *tokenp++ = Token(tree[0].childID, PackSigned(residual >> mul_shift));
+      }
+    }
+
+  } else if (!use_wp && !skip_encoder_fast_path) {
+    const intptr_t onerow = channel.plane.PixelsPerRow();
+    Channel references(properties.size() - kNumNonrefProperties, channel.w);
+    for (size_t y = 0; y < channel.h; y++) {
+      const pixel_type *JXL_RESTRICT p = channel.Row(y);
+      PrecomputeReferences(channel, y, image, chan, &references);
+      float *pred_img_row[3];
+      if (kWantDebug) {
+        for (size_t c = 0; c < 3; c++) {
+          pred_img_row[c] = predictor_img.PlaneRow(c, y);
+        }
+      }
+      InitPropsRow(&properties, static_props, y);
+      for (size_t x = 0; x < channel.w; x++) {
+        PredictionResult res =
+            PredictTreeNoWP(&properties, channel.w, p + x, onerow, x, y,
+                            tree_lookup, references);
+        if (kWantDebug) {
+          for (size_t i = 0; i < 3; i++) {
+            pred_img_row[i][x] = PredictorColor(res.predictor)[i];
+          }
+        }
+        pixel_type_w residual = p[x] - res.guess;
+        JXL_ASSERT(residual % res.multiplier == 0);
+        *tokenp++ = Token(res.context, PackSigned(residual / res.multiplier));
+      }
+    }
+  } else {
+    const intptr_t onerow = channel.plane.PixelsPerRow();
+    Channel references(properties.size() - kNumNonrefProperties, channel.w);
+    weighted::State wp_state(wp_header, channel.w, channel.h);
+    for (size_t y = 0; y < channel.h; y++) {
+      const pixel_type *JXL_RESTRICT p = channel.Row(y);
+      PrecomputeReferences(channel, y, image, chan, &references);
+      float *pred_img_row[3];
+      if (kWantDebug) {
+        for (size_t c = 0; c < 3; c++) {
+          pred_img_row[c] = predictor_img.PlaneRow(c, y);
+        }
+      }
+      InitPropsRow(&properties, static_props, y);
+      for (size_t x = 0; x < channel.w; x++) {
+        PredictionResult res =
+            PredictTreeWP(&properties, channel.w, p + x, onerow, x, y,
+                          tree_lookup, references, &wp_state);
+        if (kWantDebug) {
+          for (size_t i = 0; i < 3; i++) {
+            pred_img_row[i][x] = PredictorColor(res.predictor)[i];
+          }
+        }
+        pixel_type_w residual = p[x] - res.guess;
+        JXL_ASSERT(residual % res.multiplier == 0);
+        *tokenp++ = Token(res.context, PackSigned(residual / res.multiplier));
+        wp_state.UpdateErrors(p[x], x, y, channel.w);
+      }
+    }
+  }
+  if (kWantDebug && WantDebugOutput(aux_out)) {
+    aux_out->DumpImage(
+        ("pred_" + ToString(group_id) + "_" + ToString(chan)).c_str(),
+        predictor_img);
+  }
+  *tokenpp = tokenp;
+  return true;
+}
+
+Status ModularEncode(const Image &image, const ModularOptions &options,
+                     BitWriter *writer, AuxOut *aux_out, size_t layer,
+                     size_t group_id, TreeSamples *tree_samples,
+                     size_t *total_pixels, const Tree *tree,
+                     GroupHeader *header, std::vector<Token> *tokens,
+                     size_t *width) {
+  if (image.error) return JXL_FAILURE("Invalid image");
+  size_t nb_channels = image.channel.size();
+  JXL_DEBUG_V(
+      2, "Encoding %" PRIuS "-channel, %i-bit, %" PRIuS "x%" PRIuS " image.",
+      nb_channels, image.bitdepth, image.w, image.h);
+
+  if (nb_channels < 1) {
+    return true;  // is there any use for a zero-channel image?
+  }
+
+  // encode transforms
+  GroupHeader header_storage;
+  if (header == nullptr) header = &header_storage;
+  Bundle::Init(header);
+  if (options.predictor == Predictor::Weighted) {
+    weighted::PredictorMode(options.wp_mode, &header->wp_header);
+  }
+  header->transforms = image.transform;
+  // This doesn't actually work
+  if (tree != nullptr) {
+    header->use_global_tree = true;
+  }
+  if (tree_samples == nullptr && tree == nullptr) {
+    JXL_RETURN_IF_ERROR(Bundle::Write(*header, writer, layer, aux_out));
+  }
+
+  TreeSamples tree_samples_storage;
+  size_t total_pixels_storage = 0;
+  if (!total_pixels) total_pixels = &total_pixels_storage;
+  // If there's no tree, compute one (or gather data to).
+  if (tree == nullptr) {
+    bool gather_data = tree_samples != nullptr;
+    if (tree_samples == nullptr) {
+      JXL_RETURN_IF_ERROR(tree_samples_storage.SetPredictor(
+          options.predictor, options.wp_tree_mode));
+      JXL_RETURN_IF_ERROR(tree_samples_storage.SetProperties(
+          options.splitting_heuristics_properties, options.wp_tree_mode));
+      std::vector<pixel_type> pixel_samples;
+      std::vector<pixel_type> diff_samples;
+      std::vector<uint32_t> group_pixel_count;
+      std::vector<uint32_t> channel_pixel_count;
+      CollectPixelSamples(image, options, 0, group_pixel_count,
+                          channel_pixel_count, pixel_samples, diff_samples);
+      std::vector<ModularMultiplierInfo> dummy_multiplier_info;
+      StaticPropRange range;
+      tree_samples_storage.PreQuantizeProperties(
+          range, dummy_multiplier_info, group_pixel_count, channel_pixel_count,
+          pixel_samples, diff_samples, options.max_property_values);
+    }
+    for (size_t i = 0; i < nb_channels; i++) {
+      if (!image.channel[i].w || !image.channel[i].h) {
+        continue;  // skip empty channels
+      }
+      if (i >= image.nb_meta_channels &&
+          (image.channel[i].w > options.max_chan_size ||
+           image.channel[i].h > options.max_chan_size)) {
+        break;
+      }
+      GatherTreeData(image, i, group_id, header->wp_header, options,
+                     gather_data ? *tree_samples : tree_samples_storage,
+                     total_pixels);
+    }
+    if (gather_data) return true;
+  }
+
+  JXL_ASSERT((tree == nullptr) == (tokens == nullptr));
+
+  Tree tree_storage;
+  std::vector<std::vector<Token>> tokens_storage(1);
+  // Compute tree.
+  if (tree == nullptr) {
+    EntropyEncodingData code;
+    std::vector<uint8_t> context_map;
+
+    std::vector<std::vector<Token>> tree_tokens(1);
+    tree_storage =
+        LearnTree(std::move(tree_samples_storage), *total_pixels, options);
+    tree = &tree_storage;
+    tokens = &tokens_storage[0];
+
+    Tree decoded_tree;
+    TokenizeTree(*tree, &tree_tokens[0], &decoded_tree);
+    JXL_ASSERT(tree->size() == decoded_tree.size());
+    tree_storage = std::move(decoded_tree);
+
+    if (kWantDebug && kPrintTree && WantDebugOutput(aux_out)) {
+      PrintTree(*tree, aux_out->debug_prefix + "/tree_" + ToString(group_id));
+    }
+    // Write tree
+    BuildAndEncodeHistograms(HistogramParams(), kNumTreeContexts, tree_tokens,
+                             &code, &context_map, writer, kLayerModularTree,
+                             aux_out);
+    WriteTokens(tree_tokens[0], code, context_map, writer, kLayerModularTree,
+                aux_out);
+  }
+
+  size_t image_width = 0;
+  size_t total_tokens = 0;
+  for (size_t i = 0; i < nb_channels; i++) {
+    if (i >= image.nb_meta_channels &&
+        (image.channel[i].w > options.max_chan_size ||
+         image.channel[i].h > options.max_chan_size)) {
+      break;
+    }
+    if (image.channel[i].w > image_width) image_width = image.channel[i].w;
+    total_tokens += image.channel[i].w * image.channel[i].h;
+  }
+  if (options.zero_tokens) {
+    tokens->resize(tokens->size() + total_tokens, {0, 0});
+  } else {
+    // Do one big allocation for all the tokens we'll need,
+    // to avoid reallocs that might require copying.
+    size_t pos = tokens->size();
+    tokens->resize(pos + total_tokens);
+    Token *tokenp = tokens->data() + pos;
+    for (size_t i = 0; i < nb_channels; i++) {
+      if (!image.channel[i].w || !image.channel[i].h) {
+        continue;  // skip empty channels
+      }
+      if (i >= image.nb_meta_channels &&
+          (image.channel[i].w > options.max_chan_size ||
+           image.channel[i].h > options.max_chan_size)) {
+        break;
+      }
+      JXL_RETURN_IF_ERROR(EncodeModularChannelMAANS(
+          image, i, header->wp_header, *tree, &tokenp, aux_out, group_id,
+          options.skip_encoder_fast_path));
+    }
+    // Make sure we actually wrote all tokens
+    JXL_CHECK(tokenp == tokens->data() + tokens->size());
+  }
+
+  // Write data if not using a global tree/ANS stream.
+  if (!header->use_global_tree) {
+    EntropyEncodingData code;
+    std::vector<uint8_t> context_map;
+    HistogramParams histo_params;
+    histo_params.image_widths.push_back(image_width);
+    BuildAndEncodeHistograms(histo_params, (tree->size() + 1) / 2,
+                             tokens_storage, &code, &context_map, writer, layer,
+                             aux_out);
+    WriteTokens(tokens_storage[0], code, context_map, writer, layer, aux_out);
+  } else {
+    *width = image_width;
+  }
+  return true;
+}
+
+Status ModularGenericCompress(Image &image, const ModularOptions &opts,
+                              BitWriter *writer, AuxOut *aux_out, size_t layer,
+                              size_t group_id, TreeSamples *tree_samples,
+                              size_t *total_pixels, const Tree *tree,
+                              GroupHeader *header, std::vector<Token> *tokens,
+                              size_t *width) {
+  if (image.w == 0 || image.h == 0) return true;
+  ModularOptions options = opts;  // Make a copy to modify it.
+
+  if (options.predictor == static_cast<Predictor>(-1)) {
+    options.predictor = Predictor::Gradient;
+  }
+
+  size_t bits = writer ? writer->BitsWritten() : 0;
+  JXL_RETURN_IF_ERROR(ModularEncode(image, options, writer, aux_out, layer,
+                                    group_id, tree_samples, total_pixels, tree,
+                                    header, tokens, width));
+  bits = writer ? writer->BitsWritten() - bits : 0;
+  if (writer) {
+    JXL_DEBUG_V(4,
+                "Modular-encoded a %" PRIuS "x%" PRIuS
+                " bitdepth=%i nbchans=%" PRIuS " image in %" PRIuS " bytes",
+                image.w, image.h, image.bitdepth, image.channel.size(),
+                bits / 8);
+  }
+  (void)bits;
+  return true;
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/modular/encoding/enc_encoding.h b/third_party/jpeg-xl/lib/jxl/modular/encoding/enc_encoding.h
new file mode 100644
index 0000000000..04df504750
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/modular/encoding/enc_encoding.h
@@ -0,0 +1,47 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_MODULAR_ENCODING_ENC_ENCODING_H_
+#define LIB_JXL_MODULAR_ENCODING_ENC_ENCODING_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <vector>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/padded_bytes.h"
+#include "lib/jxl/base/span.h"
+#include "lib/jxl/dec_ans.h"
+#include "lib/jxl/enc_ans.h"
+#include "lib/jxl/enc_bit_writer.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/modular/encoding/context_predict.h"
+#include "lib/jxl/modular/encoding/enc_ma.h"
+#include "lib/jxl/modular/encoding/encoding.h"
+#include "lib/jxl/modular/modular_image.h"
+#include "lib/jxl/modular/options.h"
+#include "lib/jxl/modular/transform/transform.h"
+
+namespace jxl {
+
+Tree LearnTree(TreeSamples &&tree_samples, size_t total_pixels,
+               const ModularOptions &options,
+               const std::vector<ModularMultiplierInfo> &multiplier_info = {},
+               StaticPropRange static_prop_range = {});
+
+// TODO(veluca): make cleaner interfaces.
+
+Status ModularGenericCompress(
+    Image &image, const ModularOptions &opts, BitWriter *writer,
+    AuxOut *aux_out = nullptr, size_t layer = 0, size_t group_id = 0,
+    // For gathering data for producing a global tree.
+    TreeSamples *tree_samples = nullptr, size_t *total_pixels = nullptr,
+    // For encoding with global tree.
+    const Tree *tree = nullptr, GroupHeader *header = nullptr,
+    std::vector<Token> *tokens = nullptr, size_t *widths = nullptr);
+}  // namespace jxl
+
+#endif  // LIB_JXL_MODULAR_ENCODING_ENC_ENCODING_H_
diff --git a/third_party/jpeg-xl/lib/jxl/modular/encoding/enc_ma.cc b/third_party/jpeg-xl/lib/jxl/modular/encoding/enc_ma.cc
new file mode 100644
index 0000000000..d0f6b47566
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/modular/encoding/enc_ma.cc
@@ -0,0 +1,1023 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/modular/encoding/enc_ma.h"
+
+#include <algorithm>
+#include <limits>
+#include <numeric>
+#include <queue>
+#include <unordered_map>
+#include <unordered_set>
+
+#include "lib/jxl/modular/encoding/ma_common.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/modular/encoding/enc_ma.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/base/random.h"
+#include "lib/jxl/enc_ans.h"
+#include "lib/jxl/fast_math-inl.h"
+#include "lib/jxl/modular/encoding/context_predict.h"
+#include "lib/jxl/modular/options.h"
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Eq;
+using hwy::HWY_NAMESPACE::IfThenElse;
+using hwy::HWY_NAMESPACE::Lt;
+
+const HWY_FULL(float) df;
+const HWY_FULL(int32_t) di;
+size_t Padded(size_t x) { return RoundUpTo(x, Lanes(df)); }
+
+float EstimateBits(const int32_t *counts, int32_t *rounded_counts,
+                   size_t num_symbols) {
+  // Try to approximate the effect of rounding up nonzero probabilities.
+  int32_t total = std::accumulate(counts, counts + num_symbols, 0);
+  const auto min = Set(di, (total + ANS_TAB_SIZE - 1) >> ANS_LOG_TAB_SIZE);
+  const auto zero_i = Zero(di);
+  for (size_t i = 0; i < num_symbols; i += Lanes(df)) {
+    auto counts_v = LoadU(di, &counts[i]);
+    counts_v = IfThenElse(Eq(counts_v, zero_i), zero_i,
+                          IfThenElse(Lt(counts_v, min), min, counts_v));
+    StoreU(counts_v, di, &rounded_counts[i]);
+  }
+  // Compute entropy of the "rounded" probabilities.
+  const auto zero = Zero(df);
+  const size_t total_scalar =
+      std::accumulate(rounded_counts, rounded_counts + num_symbols, 0);
+  const auto inv_total = Set(df, 1.0f / total_scalar);
+  auto bits_lanes = Zero(df);
+  auto total_v = Set(di, total_scalar);
+  for (size_t i = 0; i < num_symbols; i += Lanes(df)) {
+    const auto counts_v = ConvertTo(df, LoadU(di, &counts[i]));
+    const auto round_counts_v = LoadU(di, &rounded_counts[i]);
+    const auto probs = Mul(ConvertTo(df, round_counts_v), inv_total);
+    const auto nbps = IfThenElse(Eq(round_counts_v, total_v), BitCast(di, zero),
+                                 BitCast(di, FastLog2f(df, probs)));
+    bits_lanes = Sub(bits_lanes, IfThenElse(Eq(counts_v, zero), zero,
+                                            Mul(counts_v, BitCast(df, nbps))));
+  }
+  return GetLane(SumOfLanes(df, bits_lanes));
+}
+
+void MakeSplitNode(size_t pos, int property, int splitval, Predictor lpred,
+                   int64_t loff, Predictor rpred, int64_t roff, Tree *tree) {
+  // Note that the tree splits on *strictly greater*.
+  (*tree)[pos].lchild = tree->size();
+  (*tree)[pos].rchild = tree->size() + 1;
+  (*tree)[pos].splitval = splitval;
+  (*tree)[pos].property = property;
+  tree->emplace_back();
+  tree->back().property = -1;
+  tree->back().predictor = rpred;
+  tree->back().predictor_offset = roff;
+  tree->back().multiplier = 1;
+  tree->emplace_back();
+  tree->back().property = -1;
+  tree->back().predictor = lpred;
+  tree->back().predictor_offset = loff;
+  tree->back().multiplier = 1;
+}
+
+enum class IntersectionType { kNone, kPartial, kInside };
+IntersectionType BoxIntersects(StaticPropRange needle, StaticPropRange haystack,
+                               uint32_t &partial_axis, uint32_t &partial_val) {
+  bool partial = false;
+  for (size_t i = 0; i < kNumStaticProperties; i++) {
+    if (haystack[i][0] >= needle[i][1]) {
+      return IntersectionType::kNone;
+    }
+    if (haystack[i][1] <= needle[i][0]) {
+      return IntersectionType::kNone;
+    }
+    if (haystack[i][0] <= needle[i][0] && haystack[i][1] >= needle[i][1]) {
+      continue;
+    }
+    partial = true;
+    partial_axis = i;
+    if (haystack[i][0] > needle[i][0] && haystack[i][0] < needle[i][1]) {
+      partial_val = haystack[i][0] - 1;
+    } else {
+      JXL_DASSERT(haystack[i][1] > needle[i][0] &&
+                  haystack[i][1] < needle[i][1]);
+      partial_val = haystack[i][1] - 1;
+    }
+  }
+  return partial ? IntersectionType::kPartial : IntersectionType::kInside;
+}
+
+void SplitTreeSamples(TreeSamples &tree_samples, size_t begin, size_t pos,
+                      size_t end, size_t prop) {
+  auto cmp = [&](size_t a, size_t b) {
+    return int32_t(tree_samples.Property(prop, a)) -
+           int32_t(tree_samples.Property(prop, b));
+  };
+  Rng rng(0);
+  while (end > begin + 1) {
+    {
+      size_t pivot = rng.UniformU(begin, end);
+      tree_samples.Swap(begin, pivot);
+    }
+    size_t pivot_begin = begin;
+    size_t pivot_end = pivot_begin + 1;
+    for (size_t i = begin + 1; i < end; i++) {
+      JXL_DASSERT(i >= pivot_end);
+      JXL_DASSERT(pivot_end > pivot_begin);
+      int32_t cmp_result = cmp(i, pivot_begin);
+      if (cmp_result < 0) {  // i < pivot, move pivot forward and put i before
+                             // the pivot.
+        tree_samples.ThreeShuffle(pivot_begin, pivot_end, i);
+        pivot_begin++;
+        pivot_end++;
+      } else if (cmp_result == 0) {
+        tree_samples.Swap(pivot_end, i);
+        pivot_end++;
+      }
+    }
+    JXL_DASSERT(pivot_begin >= begin);
+    JXL_DASSERT(pivot_end > pivot_begin);
+    JXL_DASSERT(pivot_end <= end);
+    for (size_t i = begin; i < pivot_begin; i++) {
+      JXL_DASSERT(cmp(i, pivot_begin) < 0);
+    }
+    for (size_t i = pivot_end; i < end; i++) {
+      JXL_DASSERT(cmp(i, pivot_begin) > 0);
+    }
+    for (size_t i = pivot_begin; i < pivot_end; i++) {
+      JXL_DASSERT(cmp(i, pivot_begin) == 0);
+    }
+    // We now have that [begin, pivot_begin) is < pivot, [pivot_begin,
+    // pivot_end) is = pivot, and [pivot_end, end) is > pivot.
+    // If pos falls in the first or the last interval, we continue in that
+    // interval; otherwise, we are done.
+    if (pivot_begin > pos) {
+      end = pivot_begin;
+    } else if (pivot_end < pos) {
+      begin = pivot_end;
+    } else {
+      break;
+    }
+  }
+}
+
+void FindBestSplit(TreeSamples &tree_samples, float threshold,
+                   const std::vector<ModularMultiplierInfo> &mul_info,
+                   StaticPropRange initial_static_prop_range,
+                   float fast_decode_multiplier, Tree *tree) {
+  struct NodeInfo {
+    size_t pos;
+    size_t begin;
+    size_t end;
+    uint64_t used_properties;
+    StaticPropRange static_prop_range;
+  };
+  std::vector<NodeInfo> nodes;
+  nodes.push_back(NodeInfo{0, 0, tree_samples.NumDistinctSamples(), 0,
+                           initial_static_prop_range});
+
+  size_t num_predictors = tree_samples.NumPredictors();
+  size_t num_properties = tree_samples.NumProperties();
+
+  // TODO(veluca): consider parallelizing the search (processing multiple nodes
+  // at a time).
+  while (!nodes.empty()) {
+    size_t pos = nodes.back().pos;
+    size_t begin = nodes.back().begin;
+    size_t end = nodes.back().end;
+    uint64_t used_properties = nodes.back().used_properties;
+    StaticPropRange static_prop_range = nodes.back().static_prop_range;
+    nodes.pop_back();
+    if (begin == end) continue;
+
+    struct SplitInfo {
+      size_t prop = 0;
+      uint32_t val = 0;
+      size_t pos = 0;
+      float lcost = std::numeric_limits<float>::max();
+      float rcost = std::numeric_limits<float>::max();
+      Predictor lpred = Predictor::Zero;
+      Predictor rpred = Predictor::Zero;
+      float Cost() { return lcost + rcost; }
+    };
+
+    SplitInfo best_split_static_constant;
+    SplitInfo best_split_static;
+    SplitInfo best_split_nonstatic;
+    SplitInfo best_split_nowp;
+
+    JXL_DASSERT(begin <= end);
+    JXL_DASSERT(end <= tree_samples.NumDistinctSamples());
+
+    // Compute the maximum token in the range.
+    size_t max_symbols = 0;
+    for (size_t pred = 0; pred < num_predictors; pred++) {
+      for (size_t i = begin; i < end; i++) {
+        uint32_t tok = tree_samples.Token(pred, i);
+        max_symbols = max_symbols > tok + 1 ? max_symbols : tok + 1;
+      }
+    }
+    max_symbols = Padded(max_symbols);
+    std::vector<int32_t> rounded_counts(max_symbols);
+    std::vector<int32_t> counts(max_symbols * num_predictors);
+    std::vector<uint32_t> tot_extra_bits(num_predictors);
+    for (size_t pred = 0; pred < num_predictors; pred++) {
+      for (size_t i = begin; i < end; i++) {
+        counts[pred * max_symbols + tree_samples.Token(pred, i)] +=
+            tree_samples.Count(i);
+        tot_extra_bits[pred] +=
+            tree_samples.NBits(pred, i) * tree_samples.Count(i);
+      }
+    }
+
+    float base_bits;
+    {
+      size_t pred = tree_samples.PredictorIndex((*tree)[pos].predictor);
+      base_bits = EstimateBits(counts.data() + pred * max_symbols,
+                               rounded_counts.data(), max_symbols) +
+                  tot_extra_bits[pred];
+    }
+
+    SplitInfo *best = &best_split_nonstatic;
+
+    SplitInfo forced_split;
+    // The multiplier ranges cut halfway through the current ranges of static
+    // properties. We do this even if the current node is not a leaf, to
+    // minimize the number of nodes in the resulting tree.
+    for (size_t i = 0; i < mul_info.size(); i++) {
+      uint32_t axis, val;
+      IntersectionType t =
+          BoxIntersects(static_prop_range, mul_info[i].range, axis, val);
+      if (t == IntersectionType::kNone) continue;
+      if (t == IntersectionType::kInside) {
+        (*tree)[pos].multiplier = mul_info[i].multiplier;
+        break;
+      }
+      if (t == IntersectionType::kPartial) {
+        forced_split.val = tree_samples.QuantizeProperty(axis, val);
+        forced_split.prop = axis;
+        forced_split.lcost = forced_split.rcost = base_bits / 2 - threshold;
+        forced_split.lpred = forced_split.rpred = (*tree)[pos].predictor;
+        best = &forced_split;
+        best->pos = begin;
+        JXL_ASSERT(best->prop == tree_samples.PropertyFromIndex(best->prop));
+        for (size_t x = begin; x < end; x++) {
+          if (tree_samples.Property(best->prop, x) <= best->val) {
+            best->pos++;
+          }
+        }
+        break;
+      }
+    }
+
+    if (best != &forced_split) {
+      std::vector<int> prop_value_used_count;
+      std::vector<int> count_increase;
+      std::vector<size_t> extra_bits_increase;
+      // For each property, compute which of its values are used, and what
+      // tokens correspond to those usages. Then, iterate through the values,
+      // and compute the entropy of each side of the split (of the form `prop >
+      // threshold`). Finally, find the split that minimizes the cost.
+      struct CostInfo {
+        float cost = std::numeric_limits<float>::max();
+        float extra_cost = 0;
+        float Cost() const { return cost + extra_cost; }
+        Predictor pred;  // will be uninitialized in some cases, but never used.
+      };
+      std::vector<CostInfo> costs_l;
+      std::vector<CostInfo> costs_r;
+
+      std::vector<int32_t> counts_above(max_symbols);
+      std::vector<int32_t> counts_below(max_symbols);
+
+      // The lower the threshold, the higher the expected noisiness of the
+      // estimate. Thus, discourage changing predictors.
+      float change_pred_penalty = 800.0f / (100.0f + threshold);
+      for (size_t prop = 0; prop < num_properties && base_bits > threshold;
+           prop++) {
+        costs_l.clear();
+        costs_r.clear();
+        size_t prop_size = tree_samples.NumPropertyValues(prop);
+        if (extra_bits_increase.size() < prop_size) {
+          count_increase.resize(prop_size * max_symbols);
+          extra_bits_increase.resize(prop_size);
+        }
+        // Clear prop_value_used_count (which cannot be cleared "on the go")
+        prop_value_used_count.clear();
+        prop_value_used_count.resize(prop_size);
+
+        size_t first_used = prop_size;
+        size_t last_used = 0;
+
+        // TODO(veluca): consider finding multiple splits along a single
+        // property at the same time, possibly with a bottom-up approach.
+        for (size_t i = begin; i < end; i++) {
+          size_t p = tree_samples.Property(prop, i);
+          prop_value_used_count[p]++;
+          last_used = std::max(last_used, p);
+          first_used = std::min(first_used, p);
+        }
+        costs_l.resize(last_used - first_used);
+        costs_r.resize(last_used - first_used);
+        // For all predictors, compute the right and left costs of each split.
+        for (size_t pred = 0; pred < num_predictors; pred++) {
+          // Compute cost and histogram increments for each property value.
+          for (size_t i = begin; i < end; i++) {
+            size_t p = tree_samples.Property(prop, i);
+            size_t cnt = tree_samples.Count(i);
+            size_t sym = tree_samples.Token(pred, i);
+            count_increase[p * max_symbols + sym] += cnt;
+            extra_bits_increase[p] += tree_samples.NBits(pred, i) * cnt;
+          }
+          memcpy(counts_above.data(), counts.data() + pred * max_symbols,
+                 max_symbols * sizeof counts_above[0]);
+          memset(counts_below.data(), 0, max_symbols * sizeof counts_below[0]);
+          size_t extra_bits_below = 0;
+          // Exclude last used: this ensures neither counts_above nor
+          // counts_below is empty.
+          for (size_t i = first_used; i < last_used; i++) {
+            if (!prop_value_used_count[i]) continue;
+            extra_bits_below += extra_bits_increase[i];
+            // The increase for this property value has been used, and will not
+            // be used again: clear it. Also below.
+            extra_bits_increase[i] = 0;
+            for (size_t sym = 0; sym < max_symbols; sym++) {
+              counts_above[sym] -= count_increase[i * max_symbols + sym];
+              counts_below[sym] += count_increase[i * max_symbols + sym];
+              count_increase[i * max_symbols + sym] = 0;
+            }
+            float rcost = EstimateBits(counts_above.data(),
+                                       rounded_counts.data(), max_symbols) +
+                          tot_extra_bits[pred] - extra_bits_below;
+            float lcost = EstimateBits(counts_below.data(),
+                                       rounded_counts.data(), max_symbols) +
+                          extra_bits_below;
+            JXL_DASSERT(extra_bits_below <= tot_extra_bits[pred]);
+            float penalty = 0;
+            // Never discourage moving away from the Weighted predictor.
+            if (tree_samples.PredictorFromIndex(pred) !=
+                    (*tree)[pos].predictor &&
+                (*tree)[pos].predictor != Predictor::Weighted) {
+              penalty = change_pred_penalty;
+            }
+            // If everything else is equal, disfavour Weighted (slower) and
+            // favour Zero (faster if it's the only predictor used in a
+            // group+channel combination)
+            if (tree_samples.PredictorFromIndex(pred) == Predictor::Weighted) {
+              penalty += 1e-8;
+            }
+            if (tree_samples.PredictorFromIndex(pred) == Predictor::Zero) {
+              penalty -= 1e-8;
+            }
+            if (rcost + penalty < costs_r[i - first_used].Cost()) {
+              costs_r[i - first_used].cost = rcost;
+              costs_r[i - first_used].extra_cost = penalty;
+              costs_r[i - first_used].pred =
+                  tree_samples.PredictorFromIndex(pred);
+            }
+            if (lcost + penalty < costs_l[i - first_used].Cost()) {
+              costs_l[i - first_used].cost = lcost;
+              costs_l[i - first_used].extra_cost = penalty;
+              costs_l[i - first_used].pred =
+                  tree_samples.PredictorFromIndex(pred);
+            }
+          }
+        }
+        // Iterate through the possible splits and find the one with minimum sum
+        // of costs of the two sides.
+        size_t split = begin;
+        for (size_t i = first_used; i < last_used; i++) {
+          if (!prop_value_used_count[i]) continue;
+          split += prop_value_used_count[i];
+          float rcost = costs_r[i - first_used].cost;
+          float lcost = costs_l[i - first_used].cost;
+          // WP was not used + we would use the WP property or predictor
+          bool adds_wp =
+              (tree_samples.PropertyFromIndex(prop) == kWPProp &&
+               (used_properties & (1LU << prop)) == 0) ||
+              ((costs_l[i - first_used].pred == Predictor::Weighted ||
+                costs_r[i - first_used].pred == Predictor::Weighted) &&
+               (*tree)[pos].predictor != Predictor::Weighted);
+          bool zero_entropy_side = rcost == 0 || lcost == 0;
+
+          SplitInfo &best =
+              prop < kNumStaticProperties
+                  ? (zero_entropy_side ? best_split_static_constant
+                                       : best_split_static)
+                  : (adds_wp ? best_split_nonstatic : best_split_nowp);
+          if (lcost + rcost < best.Cost()) {
+            best.prop = prop;
+            best.val = i;
+            best.pos = split;
+            best.lcost = lcost;
+            best.lpred = costs_l[i - first_used].pred;
+            best.rcost = rcost;
+            best.rpred = costs_r[i - first_used].pred;
+          }
+        }
+        // Clear extra_bits_increase and cost_increase for last_used.
+        extra_bits_increase[last_used] = 0;
+        for (size_t sym = 0; sym < max_symbols; sym++) {
+          count_increase[last_used * max_symbols + sym] = 0;
+        }
+      }
+
+      // Try to avoid introducing WP.
+      if (best_split_nowp.Cost() + threshold < base_bits &&
+          best_split_nowp.Cost() <= fast_decode_multiplier * best->Cost()) {
+        best = &best_split_nowp;
+      }
+      // Split along static props if possible and not significantly more
+      // expensive.
+      if (best_split_static.Cost() + threshold < base_bits &&
+          best_split_static.Cost() <= fast_decode_multiplier * best->Cost()) {
+        best = &best_split_static;
+      }
+      // Split along static props to create constant nodes if possible.
+      if (best_split_static_constant.Cost() + threshold < base_bits) {
+        best = &best_split_static_constant;
+      }
+    }
+
+    if (best->Cost() + threshold < base_bits) {
+      uint32_t p = tree_samples.PropertyFromIndex(best->prop);
+      pixel_type dequant =
+          tree_samples.UnquantizeProperty(best->prop, best->val);
+      // Split node and try to split children.
+      MakeSplitNode(pos, p, dequant, best->lpred, 0, best->rpred, 0, tree);
+      // "Sort" according to winning property
+      SplitTreeSamples(tree_samples, begin, best->pos, end, best->prop);
+      if (p >= kNumStaticProperties) {
+        used_properties |= 1 << best->prop;
+      }
+      auto new_sp_range = static_prop_range;
+      if (p < kNumStaticProperties) {
+        JXL_ASSERT(static_cast<uint32_t>(dequant + 1) <= new_sp_range[p][1]);
+        new_sp_range[p][1] = dequant + 1;
+        JXL_ASSERT(new_sp_range[p][0] < new_sp_range[p][1]);
+      }
+      nodes.push_back(NodeInfo{(*tree)[pos].rchild, begin, best->pos,
+                               used_properties, new_sp_range});
+      new_sp_range = static_prop_range;
+      if (p < kNumStaticProperties) {
+        JXL_ASSERT(new_sp_range[p][0] <= static_cast<uint32_t>(dequant + 1));
+        new_sp_range[p][0] = dequant + 1;
+        JXL_ASSERT(new_sp_range[p][0] < new_sp_range[p][1]);
+      }
+      nodes.push_back(NodeInfo{(*tree)[pos].lchild, best->pos, end,
+                               used_properties, new_sp_range});
+    }
+  }
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+HWY_EXPORT(FindBestSplit);  // Local function.
+
+void ComputeBestTree(TreeSamples &tree_samples, float threshold,
+                     const std::vector<ModularMultiplierInfo> &mul_info,
+                     StaticPropRange static_prop_range,
+                     float fast_decode_multiplier, Tree *tree) {
+  // TODO(veluca): take into account that different contexts can have different
+  // uint configs.
+  //
+  // Initialize tree.
+  tree->emplace_back();
+  tree->back().property = -1;
+  tree->back().predictor = tree_samples.PredictorFromIndex(0);
+  tree->back().predictor_offset = 0;
+  tree->back().multiplier = 1;
+  JXL_ASSERT(tree_samples.NumProperties() < 64);
+
+  JXL_ASSERT(tree_samples.NumDistinctSamples() <=
+             std::numeric_limits<uint32_t>::max());
+  HWY_DYNAMIC_DISPATCH(FindBestSplit)
+  (tree_samples, threshold, mul_info, static_prop_range, fast_decode_multiplier,
+   tree);
+}
+
+constexpr int32_t TreeSamples::kPropertyRange;
+constexpr uint32_t TreeSamples::kDedupEntryUnused;
+
+Status TreeSamples::SetPredictor(Predictor predictor,
+                                 ModularOptions::TreeMode wp_tree_mode) {
+  if (wp_tree_mode == ModularOptions::TreeMode::kWPOnly) {
+    predictors = {Predictor::Weighted};
+    residuals.resize(1);
+    return true;
+  }
+  if (wp_tree_mode == ModularOptions::TreeMode::kNoWP &&
+      predictor == Predictor::Weighted) {
+    return JXL_FAILURE("Invalid predictor settings");
+  }
+  if (predictor == Predictor::Variable) {
+    for (size_t i = 0; i < kNumModularPredictors; i++) {
+      predictors.push_back(static_cast<Predictor>(i));
+    }
+    std::swap(predictors[0], predictors[static_cast<int>(Predictor::Weighted)]);
+    std::swap(predictors[1], predictors[static_cast<int>(Predictor::Gradient)]);
+  } else if (predictor == Predictor::Best) {
+    predictors = {Predictor::Weighted, Predictor::Gradient};
+  } else {
+    predictors = {predictor};
+  }
+  if (wp_tree_mode == ModularOptions::TreeMode::kNoWP) {
+    auto wp_it =
+        std::find(predictors.begin(), predictors.end(), Predictor::Weighted);
+    if (wp_it != predictors.end()) {
+      predictors.erase(wp_it);
+    }
+  }
+  residuals.resize(predictors.size());
+  return true;
+}
+
+Status TreeSamples::SetProperties(const std::vector<uint32_t> &properties,
+                                  ModularOptions::TreeMode wp_tree_mode) {
+  props_to_use = properties;
+  if (wp_tree_mode == ModularOptions::TreeMode::kWPOnly) {
+    props_to_use = {static_cast<uint32_t>(kWPProp)};
+  }
+  if (wp_tree_mode == ModularOptions::TreeMode::kGradientOnly) {
+    props_to_use = {static_cast<uint32_t>(kGradientProp)};
+  }
+  if (wp_tree_mode == ModularOptions::TreeMode::kNoWP) {
+    auto it = std::find(props_to_use.begin(), props_to_use.end(), kWPProp);
+    if (it != props_to_use.end()) {
+      props_to_use.erase(it);
+    }
+  }
+  if (props_to_use.empty()) {
+    return JXL_FAILURE("Invalid property set configuration");
+  }
+  props.resize(props_to_use.size());
+  return true;
+}
+
+void TreeSamples::InitTable(size_t size) {
+  JXL_DASSERT((size & (size - 1)) == 0);
+  if (dedup_table_.size() == size) return;
+  dedup_table_.resize(size, kDedupEntryUnused);
+  for (size_t i = 0; i < NumDistinctSamples(); i++) {
+    if (sample_counts[i] != std::numeric_limits<uint16_t>::max()) {
+      AddToTable(i);
+    }
+  }
+}
+
+bool TreeSamples::AddToTableAndMerge(size_t a) {
+  size_t pos1 = Hash1(a);
+  size_t pos2 = Hash2(a);
+  if (dedup_table_[pos1] != kDedupEntryUnused &&
+      IsSameSample(a, dedup_table_[pos1])) {
+    JXL_DASSERT(sample_counts[a] == 1);
+    sample_counts[dedup_table_[pos1]]++;
+    // Remove from hash table samples that are saturated.
+    if (sample_counts[dedup_table_[pos1]] ==
+        std::numeric_limits<uint16_t>::max()) {
+      dedup_table_[pos1] = kDedupEntryUnused;
+    }
+    return true;
+  }
+  if (dedup_table_[pos2] != kDedupEntryUnused &&
+      IsSameSample(a, dedup_table_[pos2])) {
+    JXL_DASSERT(sample_counts[a] == 1);
+    sample_counts[dedup_table_[pos2]]++;
+    // Remove from hash table samples that are saturated.
+    if (sample_counts[dedup_table_[pos2]] ==
+        std::numeric_limits<uint16_t>::max()) {
+      dedup_table_[pos2] = kDedupEntryUnused;
+    }
+    return true;
+  }
+  AddToTable(a);
+  return false;
+}
+
+void TreeSamples::AddToTable(size_t a) {
+  size_t pos1 = Hash1(a);
+  size_t pos2 = Hash2(a);
+  if (dedup_table_[pos1] == kDedupEntryUnused) {
+    dedup_table_[pos1] = a;
+  } else if (dedup_table_[pos2] == kDedupEntryUnused) {
+    dedup_table_[pos2] = a;
+  }
+}
+
+void TreeSamples::PrepareForSamples(size_t num_samples) {
+  for (auto &res : residuals) {
+    res.reserve(res.size() + num_samples);
+  }
+  for (auto &p : props) {
+    p.reserve(p.size() + num_samples);
+  }
+  size_t total_num_samples = num_samples + sample_counts.size();
+  size_t next_pow2 = 1LLU << CeilLog2Nonzero(total_num_samples * 3 / 2);
+  InitTable(next_pow2);
+}
+
+size_t TreeSamples::Hash1(size_t a) const {
+  constexpr uint64_t constant = 0x1e35a7bd;
+  uint64_t h = constant;
+  for (const auto &r : residuals) {
+    h = h * constant + r[a].tok;
+    h = h * constant + r[a].nbits;
+  }
+  for (const auto &p : props) {
+    h = h * constant + p[a];
+  }
+  return (h >> 16) & (dedup_table_.size() - 1);
+}
+size_t TreeSamples::Hash2(size_t a) const {
+  constexpr uint64_t constant = 0x1e35a7bd1e35a7bd;
+  uint64_t h = constant;
+  for (const auto &p : props) {
+    h = h * constant ^ p[a];
+  }
+  for (const auto &r : residuals) {
+    h = h * constant ^ r[a].tok;
+    h = h * constant ^ r[a].nbits;
+  }
+  return (h >> 16) & (dedup_table_.size() - 1);
+}
+
+bool TreeSamples::IsSameSample(size_t a, size_t b) const {
+  bool ret = true;
+  for (const auto &r : residuals) {
+    if (r[a].tok != r[b].tok) {
+      ret = false;
+    }
+    if (r[a].nbits != r[b].nbits) {
+      ret = false;
+    }
+  }
+  for (const auto &p : props) {
+    if (p[a] != p[b]) {
+      ret = false;
+    }
+  }
+  return ret;
+}
+
+void TreeSamples::AddSample(pixel_type_w pixel, const Properties &properties,
+                            const pixel_type_w *predictions) {
+  for (size_t i = 0; i < predictors.size(); i++) {
+    pixel_type v = pixel - predictions[static_cast<int>(predictors[i])];
+    uint32_t tok, nbits, bits;
+    HybridUintConfig(4, 1, 2).Encode(PackSigned(v), &tok, &nbits, &bits);
+    JXL_DASSERT(tok < 256);
+    JXL_DASSERT(nbits < 256);
+    residuals[i].emplace_back(
+        ResidualToken{static_cast<uint8_t>(tok), static_cast<uint8_t>(nbits)});
+  }
+  for (size_t i = 0; i < props_to_use.size(); i++) {
+    props[i].push_back(QuantizeProperty(i, properties[props_to_use[i]]));
+  }
+  sample_counts.push_back(1);
+  num_samples++;
+  if (AddToTableAndMerge(sample_counts.size() - 1)) {
+    for (auto &r : residuals) r.pop_back();
+    for (auto &p : props) p.pop_back();
+    sample_counts.pop_back();
+  }
+}
+
+void TreeSamples::Swap(size_t a, size_t b) {
+  if (a == b) return;
+  for (auto &r : residuals) {
+    std::swap(r[a], r[b]);
+  }
+  for (auto &p : props) {
+    std::swap(p[a], p[b]);
+  }
+  std::swap(sample_counts[a], sample_counts[b]);
+}
+
+void TreeSamples::ThreeShuffle(size_t a, size_t b, size_t c) {
+  if (b == c) return Swap(a, b);
+  for (auto &r : residuals) {
+    auto tmp = r[a];
+    r[a] = r[c];
+    r[c] = r[b];
+    r[b] = tmp;
+  }
+  for (auto &p : props) {
+    auto tmp = p[a];
+    p[a] = p[c];
+    p[c] = p[b];
+    p[b] = tmp;
+  }
+  auto tmp = sample_counts[a];
+  sample_counts[a] = sample_counts[c];
+  sample_counts[c] = sample_counts[b];
+  sample_counts[b] = tmp;
+}
+
+namespace {
+std::vector<int32_t> QuantizeHistogram(const std::vector<uint32_t> &histogram,
+                                       size_t num_chunks) {
+  if (histogram.empty()) return {};
+  // TODO(veluca): selecting distinct quantiles is likely not the best
+  // way to go about this.
+  std::vector<int32_t> thresholds;
+  size_t sum = std::accumulate(histogram.begin(), histogram.end(), 0LU);
+  size_t cumsum = 0;
+  size_t threshold = 0;
+  for (size_t i = 0; i + 1 < histogram.size(); i++) {
+    cumsum += histogram[i];
+    if (cumsum > (threshold + 1) * sum / num_chunks) {
+      thresholds.push_back(i);
+      while (cumsum >= (threshold + 1) * sum / num_chunks) threshold++;
+    }
+  }
+  return thresholds;
+}
+
+std::vector<int32_t> QuantizeSamples(const std::vector<int32_t> &samples,
+                                     size_t num_chunks) {
+  if (samples.empty()) return {};
+  int min = *std::min_element(samples.begin(), samples.end());
+  constexpr int kRange = 512;
+  min = std::min(std::max(min, -kRange), kRange);
+  std::vector<uint32_t> counts(2 * kRange + 1);
+  for (int s : samples) {
+    uint32_t sample_offset = std::min(std::max(s, -kRange), kRange) - min;
+    counts[sample_offset]++;
+  }
+  std::vector<int32_t> thresholds = QuantizeHistogram(counts, num_chunks);
+  for (auto &v : thresholds) v += min;
+  return thresholds;
+}
+}  // namespace
+
+void TreeSamples::PreQuantizeProperties(
+    const StaticPropRange &range,
+    const std::vector<ModularMultiplierInfo> &multiplier_info,
+    const std::vector<uint32_t> &group_pixel_count,
+    const std::vector<uint32_t> &channel_pixel_count,
+    std::vector<pixel_type> &pixel_samples,
+    std::vector<pixel_type> &diff_samples, size_t max_property_values) {
+  // If we have forced splits because of multipliers, choose channel and group
+  // thresholds accordingly.
+  std::vector<int32_t> group_multiplier_thresholds;
+  std::vector<int32_t> channel_multiplier_thresholds;
+  for (const auto &v : multiplier_info) {
+    if (v.range[0][0] != range[0][0]) {
+      channel_multiplier_thresholds.push_back(v.range[0][0] - 1);
+    }
+    if (v.range[0][1] != range[0][1]) {
+      channel_multiplier_thresholds.push_back(v.range[0][1] - 1);
+    }
+    if (v.range[1][0] != range[1][0]) {
+      group_multiplier_thresholds.push_back(v.range[1][0] - 1);
+    }
+    if (v.range[1][1] != range[1][1]) {
+      group_multiplier_thresholds.push_back(v.range[1][1] - 1);
+    }
+  }
+  std::sort(channel_multiplier_thresholds.begin(),
+            channel_multiplier_thresholds.end());
+  channel_multiplier_thresholds.resize(
+      std::unique(channel_multiplier_thresholds.begin(),
+                  channel_multiplier_thresholds.end()) -
+      channel_multiplier_thresholds.begin());
+  std::sort(group_multiplier_thresholds.begin(),
+            group_multiplier_thresholds.end());
+  group_multiplier_thresholds.resize(
+      std::unique(group_multiplier_thresholds.begin(),
+                  group_multiplier_thresholds.end()) -
+      group_multiplier_thresholds.begin());
+
+  compact_properties.resize(props_to_use.size());
+  auto quantize_channel = [&]() {
+    if (!channel_multiplier_thresholds.empty()) {
+      return channel_multiplier_thresholds;
+    }
+    return QuantizeHistogram(channel_pixel_count, max_property_values);
+  };
+  auto quantize_group_id = [&]() {
+    if (!group_multiplier_thresholds.empty()) {
+      return group_multiplier_thresholds;
+    }
+    return QuantizeHistogram(group_pixel_count, max_property_values);
+  };
+  auto quantize_coordinate = [&]() {
+    std::vector<int32_t> quantized;
+    quantized.reserve(max_property_values - 1);
+    for (size_t i = 0; i + 1 < max_property_values; i++) {
+      quantized.push_back((i + 1) * 256 / max_property_values - 1);
+    }
+    return quantized;
+  };
+  std::vector<int32_t> abs_pixel_thr;
+  std::vector<int32_t> pixel_thr;
+  auto quantize_pixel_property = [&]() {
+    if (pixel_thr.empty()) {
+      pixel_thr = QuantizeSamples(pixel_samples, max_property_values);
+    }
+    return pixel_thr;
+  };
+  auto quantize_abs_pixel_property = [&]() {
+    if (abs_pixel_thr.empty()) {
+      quantize_pixel_property();  // Compute the non-abs thresholds.
+      for (auto &v : pixel_samples) v = std::abs(v);
+      abs_pixel_thr = QuantizeSamples(pixel_samples, max_property_values);
+    }
+    return abs_pixel_thr;
+  };
+  std::vector<int32_t> abs_diff_thr;
+  std::vector<int32_t> diff_thr;
+  auto quantize_diff_property = [&]() {
+    if (diff_thr.empty()) {
+      diff_thr = QuantizeSamples(diff_samples, max_property_values);
+    }
+    return diff_thr;
+  };
+  auto quantize_abs_diff_property = [&]() {
+    if (abs_diff_thr.empty()) {
+      quantize_diff_property();  // Compute the non-abs thresholds.
+      for (auto &v : diff_samples) v = std::abs(v);
+      abs_diff_thr = QuantizeSamples(diff_samples, max_property_values);
+    }
+    return abs_diff_thr;
+  };
+  auto quantize_wp = [&]() {
+    if (max_property_values < 32) {
+      return std::vector<int32_t>{-127, -63, -31, -15, -7, -3, -1, 0,
+                                  1,    3,   7,   15,  31, 63, 127};
+    }
+    if (max_property_values < 64) {
+      return std::vector<int32_t>{-255, -191, -127, -95, -63, -47, -31, -23,
+                                  -15,  -11,  -7,   -5,  -3,  -1,  0,   1,
+                                  3,    5,    7,    11,  15,  23,  31,  47,
+                                  63,   95,   127,  191, 255};
+    }
+    return std::vector<int32_t>{
+        -255, -223, -191, -159, -127, -111, -95, -79, -63, -55, -47,
+        -39,  -31,  -27,  -23,  -19,  -15,  -13, -11, -9,  -7,  -6,
+        -5,   -4,   -3,   -2,   -1,   0,    1,   2,   3,   4,   5,
+        6,    7,    9,    11,   13,   15,   19,  23,  27,  31,  39,
+        47,   55,   63,   79,   95,   111,  127, 159, 191, 223, 255};
+  };
+
+  property_mapping.resize(props_to_use.size());
+  for (size_t i = 0; i < props_to_use.size(); i++) {
+    if (props_to_use[i] == 0) {
+      compact_properties[i] = quantize_channel();
+    } else if (props_to_use[i] == 1) {
+      compact_properties[i] = quantize_group_id();
+    } else if (props_to_use[i] == 2 || props_to_use[i] == 3) {
+      compact_properties[i] = quantize_coordinate();
+    } else if (props_to_use[i] == 6 || props_to_use[i] == 7 ||
+               props_to_use[i] == 8 ||
+               (props_to_use[i] >= kNumNonrefProperties &&
+                (props_to_use[i] - kNumNonrefProperties) % 4 == 1)) {
+      compact_properties[i] = quantize_pixel_property();
+    } else if (props_to_use[i] == 4 || props_to_use[i] == 5 ||
+               (props_to_use[i] >= kNumNonrefProperties &&
+                (props_to_use[i] - kNumNonrefProperties) % 4 == 0)) {
+      compact_properties[i] = quantize_abs_pixel_property();
+    } else if (props_to_use[i] >= kNumNonrefProperties &&
+               (props_to_use[i] - kNumNonrefProperties) % 4 == 2) {
+      compact_properties[i] = quantize_abs_diff_property();
+    } else if (props_to_use[i] == kWPProp) {
+      compact_properties[i] = quantize_wp();
+    } else {
+      compact_properties[i] = quantize_diff_property();
+    }
+    property_mapping[i].resize(kPropertyRange * 2 + 1);
+    size_t mapped = 0;
+    for (size_t j = 0; j < property_mapping[i].size(); j++) {
+      while (mapped < compact_properties[i].size() &&
+             static_cast<int>(j) - kPropertyRange >
+                 compact_properties[i][mapped]) {
+        mapped++;
+      }
+      // property_mapping[i] of a value V is `mapped` if
+      // compact_properties[i][mapped] <= j and
+      // compact_properties[i][mapped-1] > j
+      // This is because the decision node in the tree splits on (property) > j,
+      // hence everything that is not > of a threshold should be clustered
+      // together.
+      property_mapping[i][j] = mapped;
+    }
+  }
+}
+
+void CollectPixelSamples(const Image &image, const ModularOptions &options,
+                         size_t group_id,
+                         std::vector<uint32_t> &group_pixel_count,
+                         std::vector<uint32_t> &channel_pixel_count,
+                         std::vector<pixel_type> &pixel_samples,
+                         std::vector<pixel_type> &diff_samples) {
+  if (options.nb_repeats == 0) return;
+  if (group_pixel_count.size() <= group_id) {
+    group_pixel_count.resize(group_id + 1);
+  }
+  if (channel_pixel_count.size() < image.channel.size()) {
+    channel_pixel_count.resize(image.channel.size());
+  }
+  Rng rng(group_id);
+  // Sample 10% of the final number of samples for property quantization.
+  float fraction = std::min(options.nb_repeats * 0.1, 0.99);
+  Rng::GeometricDistribution dist(fraction);
+  size_t total_pixels = 0;
+  std::vector<size_t> channel_ids;
+  for (size_t i = 0; i < image.channel.size(); i++) {
+    if (image.channel[i].w <= 1 || image.channel[i].h == 0) {
+      continue;  // skip empty or width-1 channels.
+    }
+    if (i >= image.nb_meta_channels &&
+        (image.channel[i].w > options.max_chan_size ||
+         image.channel[i].h > options.max_chan_size)) {
+      break;
+    }
+    channel_ids.push_back(i);
+    group_pixel_count[group_id] += image.channel[i].w * image.channel[i].h;
+    channel_pixel_count[i] += image.channel[i].w * image.channel[i].h;
+    total_pixels += image.channel[i].w * image.channel[i].h;
+  }
+  if (channel_ids.empty()) return;
+  pixel_samples.reserve(pixel_samples.size() + fraction * total_pixels);
+  diff_samples.reserve(diff_samples.size() + fraction * total_pixels);
+  size_t i = 0;
+  size_t y = 0;
+  size_t x = 0;
+  auto advance = [&](size_t amount) {
+    x += amount;
+    // Detect row overflow (rare).
+    while (x >= image.channel[channel_ids[i]].w) {
+      x -= image.channel[channel_ids[i]].w;
+      y++;
+      // Detect end-of-channel (even rarer).
+      if (y == image.channel[channel_ids[i]].h) {
+        i++;
+        y = 0;
+        if (i >= channel_ids.size()) {
+          return;
+        }
+      }
+    }
+  };
+  advance(rng.Geometric(dist));
+  for (; i < channel_ids.size(); advance(rng.Geometric(dist) + 1)) {
+    const pixel_type *row = image.channel[channel_ids[i]].Row(y);
+    pixel_samples.push_back(row[x]);
+    size_t xp = x == 0 ? 1 : x - 1;
+    diff_samples.push_back((int64_t)row[x] - row[xp]);
+  }
+}
+
+// TODO(veluca): very simple encoding scheme. This should be improved.
+void TokenizeTree(const Tree &tree, std::vector<Token> *tokens,
+                  Tree *decoder_tree) {
+  JXL_ASSERT(tree.size() <= kMaxTreeSize);
+  std::queue<int> q;
+  q.push(0);
+  size_t leaf_id = 0;
+  decoder_tree->clear();
+  while (!q.empty()) {
+    int cur = q.front();
+    q.pop();
+    JXL_ASSERT(tree[cur].property >= -1);
+    tokens->emplace_back(kPropertyContext, tree[cur].property + 1);
+    if (tree[cur].property == -1) {
+      tokens->emplace_back(kPredictorContext,
+                           static_cast<int>(tree[cur].predictor));
+      tokens->emplace_back(kOffsetContext,
+                           PackSigned(tree[cur].predictor_offset));
+      uint32_t mul_log = Num0BitsBelowLS1Bit_Nonzero(tree[cur].multiplier);
+      uint32_t mul_bits = (tree[cur].multiplier >> mul_log) - 1;
+      tokens->emplace_back(kMultiplierLogContext, mul_log);
+      tokens->emplace_back(kMultiplierBitsContext, mul_bits);
+      JXL_ASSERT(tree[cur].predictor < Predictor::Best);
+      decoder_tree->emplace_back(-1, 0, leaf_id++, 0, tree[cur].predictor,
+                                 tree[cur].predictor_offset,
+                                 tree[cur].multiplier);
+      continue;
+    }
+    decoder_tree->emplace_back(tree[cur].property, tree[cur].splitval,
+                               decoder_tree->size() + q.size() + 1,
+                               decoder_tree->size() + q.size() + 2,
+                               Predictor::Zero, 0, 1);
+    q.push(tree[cur].lchild);
+    q.push(tree[cur].rchild);
+    tokens->emplace_back(kSplitValContext, PackSigned(tree[cur].splitval));
+  }
+}
+
+}  // namespace jxl
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jxl/modular/encoding/enc_ma.h b/third_party/jpeg-xl/lib/jxl/modular/encoding/enc_ma.h
new file mode 100644
index 0000000000..ede37c8023
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/modular/encoding/enc_ma.h
@@ -0,0 +1,157 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_MODULAR_ENCODING_ENC_MA_H_
+#define LIB_JXL_MODULAR_ENCODING_ENC_MA_H_
+
+#include <numeric>
+
+#include "lib/jxl/enc_ans.h"
+#include "lib/jxl/entropy_coder.h"
+#include "lib/jxl/modular/encoding/dec_ma.h"
+#include "lib/jxl/modular/modular_image.h"
+#include "lib/jxl/modular/options.h"
+
+namespace jxl {
+
+// Struct to collect all the data needed to build a tree.
+struct TreeSamples {
+  bool HasSamples() const {
+    return !residuals.empty() && !residuals[0].empty();
+  }
+  size_t NumDistinctSamples() const { return sample_counts.size(); }
+  size_t NumSamples() const { return num_samples; }
+  // Set the predictor to use. Must be called before adding any samples.
+  Status SetPredictor(Predictor predictor,
+                      ModularOptions::TreeMode wp_tree_mode);
+  // Set the properties to use. Must be called before adding any samples.
+  Status SetProperties(const std::vector<uint32_t> &properties,
+                       ModularOptions::TreeMode wp_tree_mode);
+
+  size_t Token(size_t pred, size_t i) const { return residuals[pred][i].tok; }
+  size_t NBits(size_t pred, size_t i) const { return residuals[pred][i].nbits; }
+  size_t Count(size_t i) const { return sample_counts[i]; }
+  size_t PredictorIndex(Predictor predictor) const {
+    const auto predictor_elem =
+        std::find(predictors.begin(), predictors.end(), predictor);
+    JXL_DASSERT(predictor_elem != predictors.end());
+    return predictor_elem - predictors.begin();
+  }
+  size_t PropertyIndex(size_t property) const {
+    const auto property_elem =
+        std::find(props_to_use.begin(), props_to_use.end(), property);
+    JXL_DASSERT(property_elem != props_to_use.end());
+    return property_elem - props_to_use.begin();
+  }
+  size_t NumPropertyValues(size_t property_index) const {
+    return compact_properties[property_index].size() + 1;
+  }
+  // Returns the *quantized* property value.
+  size_t Property(size_t property_index, size_t i) const {
+    return props[property_index][i];
+  }
+  int UnquantizeProperty(size_t property_index, uint32_t quant) const {
+    JXL_ASSERT(quant < compact_properties[property_index].size());
+    return compact_properties[property_index][quant];
+  }
+
+  Predictor PredictorFromIndex(size_t index) const {
+    JXL_DASSERT(index < predictors.size());
+    return predictors[index];
+  }
+  size_t PropertyFromIndex(size_t index) const {
+    JXL_DASSERT(index < props_to_use.size());
+    return props_to_use[index];
+  }
+  size_t NumPredictors() const { return predictors.size(); }
+  size_t NumProperties() const { return props_to_use.size(); }
+
+  // Preallocate data for a given number of samples. MUST be called before
+  // adding any sample.
+  void PrepareForSamples(size_t num_samples);
+  // Add a sample.
+  void AddSample(pixel_type_w pixel, const Properties &properties,
+                 const pixel_type_w *predictions);
+  // Pre-cluster property values.
+  void PreQuantizeProperties(
+      const StaticPropRange &range,
+      const std::vector<ModularMultiplierInfo> &multiplier_info,
+      const std::vector<uint32_t> &group_pixel_count,
+      const std::vector<uint32_t> &channel_pixel_count,
+      std::vector<pixel_type> &pixel_samples,
+      std::vector<pixel_type> &diff_samples, size_t max_property_values);
+
+  void AllSamplesDone() { dedup_table_ = std::vector<uint32_t>(); }
+
+  uint32_t QuantizeProperty(uint32_t prop, pixel_type v) const {
+    v = std::min(std::max(v, -kPropertyRange), kPropertyRange) + kPropertyRange;
+    return property_mapping[prop][v];
+  }
+
+  // Swaps samples in position a and b. Does nothing if a == b.
+  void Swap(size_t a, size_t b);
+
+  // Cycles samples: a -> b -> c -> a. We assume a <= b <= c, so that we can
+  // just call Swap(a, b) if b==c.
+  void ThreeShuffle(size_t a, size_t b, size_t c);
+
+ private:
+  // TODO(veluca): as the total number of properties and predictors are known
+  // before adding any samples, it might be better to interleave predictors,
+  // properties and counts in a single vector to improve locality.
+  // A first attempt at doing this actually results in much slower encoding,
+  // possibly because of the more complex addressing.
+  struct ResidualToken {
+    uint8_t tok;
+    uint8_t nbits;
+  };
+  // Residual information: token and number of extra bits, per predictor.
+  std::vector<std::vector<ResidualToken>> residuals;
+  // Number of occurrences of each sample.
+  std::vector<uint16_t> sample_counts;
+  // Property values, quantized to at most 256 distinct values.
+  std::vector<std::vector<uint8_t>> props;
+  // Decompactification info for `props`.
+  std::vector<std::vector<int32_t>> compact_properties;
+  // List of properties to use.
+  std::vector<uint32_t> props_to_use;
+  // List of predictors to use.
+  std::vector<Predictor> predictors;
+  // Mapping property value -> quantized property value.
+  static constexpr int32_t kPropertyRange = 511;
+  std::vector<std::vector<uint8_t>> property_mapping;
+  // Number of samples seen.
+  size_t num_samples = 0;
+  // Table for deduplication.
+  static constexpr uint32_t kDedupEntryUnused{static_cast<uint32_t>(-1)};
+  std::vector<uint32_t> dedup_table_;
+
+  // Functions for sample deduplication.
+  bool IsSameSample(size_t a, size_t b) const;
+  size_t Hash1(size_t a) const;
+  size_t Hash2(size_t a) const;
+  void InitTable(size_t size);
+  // Returns true if `a` was already present in the table.
+  bool AddToTableAndMerge(size_t a);
+  void AddToTable(size_t a);
+};
+
+void TokenizeTree(const Tree &tree, std::vector<Token> *tokens,
+                  Tree *decoder_tree);
+
+void CollectPixelSamples(const Image &image, const ModularOptions &options,
+                         size_t group_id,
+                         std::vector<uint32_t> &group_pixel_count,
+                         std::vector<uint32_t> &channel_pixel_count,
+                         std::vector<pixel_type> &pixel_samples,
+                         std::vector<pixel_type> &diff_samples);
+
+void ComputeBestTree(TreeSamples &tree_samples, float threshold,
+                     const std::vector<ModularMultiplierInfo> &mul_info,
+                     StaticPropRange static_prop_range,
+                     float fast_decode_multiplier, Tree *tree);
+
+}  // namespace jxl
+#endif  // LIB_JXL_MODULAR_ENCODING_ENC_MA_H_
diff --git a/third_party/jpeg-xl/lib/jxl/modular/encoding/encoding.cc b/third_party/jpeg-xl/lib/jxl/modular/encoding/encoding.cc
new file mode 100644
index 0000000000..9d2c3e5cf9
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/modular/encoding/encoding.cc
@@ -0,0 +1,622 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/modular/encoding/encoding.h"
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#include <queue>
+
+#include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/base/scope_guard.h"
+#include "lib/jxl/modular/encoding/context_predict.h"
+#include "lib/jxl/modular/options.h"
+
+namespace jxl {
+
+// Removes all nodes that use a static property (i.e. channel or group ID) from
+// the tree and collapses each node on even levels with its two children to
+// produce a flatter tree. Also computes whether the resulting tree requires
+// using the weighted predictor.
+FlatTree FilterTree(const Tree &global_tree,
+                    std::array<pixel_type, kNumStaticProperties> &static_props,
+                    size_t *num_props, bool *use_wp, bool *wp_only,
+                    bool *gradient_only) {
+  *num_props = 0;
+  bool has_wp = false;
+  bool has_non_wp = false;
+  *gradient_only = true;
+  const auto mark_property = [&](int32_t p) {
+    if (p == kWPProp) {
+      has_wp = true;
+    } else if (p >= kNumStaticProperties) {
+      has_non_wp = true;
+    }
+    if (p >= kNumStaticProperties && p != kGradientProp) {
+      *gradient_only = false;
+    }
+  };
+  FlatTree output;
+  std::queue<size_t> nodes;
+  nodes.push(0);
+  // Produces a trimmed and flattened tree by doing a BFS visit of the original
+  // tree, ignoring branches that are known to be false and proceeding two
+  // levels at a time to collapse nodes in a flatter tree; if an inner parent
+  // node has a leaf as a child, the leaf is duplicated and an implicit fake
+  // node is added. This allows to reduce the number of branches when traversing
+  // the resulting flat tree.
+  while (!nodes.empty()) {
+    size_t cur = nodes.front();
+    nodes.pop();
+    // Skip nodes that we can decide now, by jumping directly to their children.
+    while (global_tree[cur].property < kNumStaticProperties &&
+           global_tree[cur].property != -1) {
+      if (static_props[global_tree[cur].property] > global_tree[cur].splitval) {
+        cur = global_tree[cur].lchild;
+      } else {
+        cur = global_tree[cur].rchild;
+      }
+    }
+    FlatDecisionNode flat;
+    if (global_tree[cur].property == -1) {
+      flat.property0 = -1;
+      flat.childID = global_tree[cur].lchild;
+      flat.predictor = global_tree[cur].predictor;
+      flat.predictor_offset = global_tree[cur].predictor_offset;
+      flat.multiplier = global_tree[cur].multiplier;
+      *gradient_only &= flat.predictor == Predictor::Gradient;
+      has_wp |= flat.predictor == Predictor::Weighted;
+      has_non_wp |= flat.predictor != Predictor::Weighted;
+      output.push_back(flat);
+      continue;
+    }
+    flat.childID = output.size() + nodes.size() + 1;
+
+    flat.property0 = global_tree[cur].property;
+    *num_props = std::max<size_t>(flat.property0 + 1, *num_props);
+    flat.splitval0 = global_tree[cur].splitval;
+
+    for (size_t i = 0; i < 2; i++) {
+      size_t cur_child =
+          i == 0 ? global_tree[cur].lchild : global_tree[cur].rchild;
+      // Skip nodes that we can decide now.
+      while (global_tree[cur_child].property < kNumStaticProperties &&
+             global_tree[cur_child].property != -1) {
+        if (static_props[global_tree[cur_child].property] >
+            global_tree[cur_child].splitval) {
+          cur_child = global_tree[cur_child].lchild;
+        } else {
+          cur_child = global_tree[cur_child].rchild;
+        }
+      }
+      // We ended up in a leaf, add a dummy decision and two copies of the leaf.
+      if (global_tree[cur_child].property == -1) {
+        flat.properties[i] = 0;
+        flat.splitvals[i] = 0;
+        nodes.push(cur_child);
+        nodes.push(cur_child);
+      } else {
+        flat.properties[i] = global_tree[cur_child].property;
+        flat.splitvals[i] = global_tree[cur_child].splitval;
+        nodes.push(global_tree[cur_child].lchild);
+        nodes.push(global_tree[cur_child].rchild);
+        *num_props = std::max<size_t>(flat.properties[i] + 1, *num_props);
+      }
+    }
+
+    for (size_t j = 0; j < 2; j++) mark_property(flat.properties[j]);
+    mark_property(flat.property0);
+    output.push_back(flat);
+  }
+  if (*num_props > kNumNonrefProperties) {
+    *num_props =
+        DivCeil(*num_props - kNumNonrefProperties, kExtraPropsPerChannel) *
+            kExtraPropsPerChannel +
+        kNumNonrefProperties;
+  } else {
+    *num_props = kNumNonrefProperties;
+  }
+  *use_wp = has_wp;
+  *wp_only = has_wp && !has_non_wp;
+
+  return output;
+}
+
+Status DecodeModularChannelMAANS(BitReader *br, ANSSymbolReader *reader,
+                                 const std::vector<uint8_t> &context_map,
+                                 const Tree &global_tree,
+                                 const weighted::Header &wp_header,
+                                 pixel_type chan, size_t group_id,
+                                 Image *image) {
+  Channel &channel = image->channel[chan];
+
+  std::array<pixel_type, kNumStaticProperties> static_props = {
+      {chan, (int)group_id}};
+  // TODO(veluca): filter the tree according to static_props.
+
+  // zero pixel channel? could happen
+  if (channel.w == 0 || channel.h == 0) return true;
+
+  bool tree_has_wp_prop_or_pred = false;
+  bool is_wp_only = false;
+  bool is_gradient_only = false;
+  size_t num_props;
+  FlatTree tree =
+      FilterTree(global_tree, static_props, &num_props,
+                 &tree_has_wp_prop_or_pred, &is_wp_only, &is_gradient_only);
+
+  // From here on, tree lookup returns a *clustered* context ID.
+  // This avoids an extra memory lookup after tree traversal.
+  for (size_t i = 0; i < tree.size(); i++) {
+    if (tree[i].property0 == -1) {
+      tree[i].childID = context_map[tree[i].childID];
+    }
+  }
+
+  JXL_DEBUG_V(3, "Decoded MA tree with %" PRIuS " nodes", tree.size());
+
+  // MAANS decode
+  const auto make_pixel = [](uint64_t v, pixel_type multiplier,
+                             pixel_type_w offset) -> pixel_type {
+    JXL_DASSERT((v & 0xFFFFFFFF) == v);
+    pixel_type_w val = UnpackSigned(v);
+    // if it overflows, it overflows, and we have a problem anyway
+    return val * multiplier + offset;
+  };
+
+  if (tree.size() == 1) {
+    // special optimized case: no meta-adaptation, so no need
+    // to compute properties.
+    Predictor predictor = tree[0].predictor;
+    int64_t offset = tree[0].predictor_offset;
+    int32_t multiplier = tree[0].multiplier;
+    size_t ctx_id = tree[0].childID;
+    if (predictor == Predictor::Zero) {
+      uint32_t value;
+      if (reader->IsSingleValueAndAdvance(ctx_id, &value,
+                                          channel.w * channel.h)) {
+        // Special-case: histogram has a single symbol, with no extra bits, and
+        // we use ANS mode.
+        JXL_DEBUG_V(8, "Fastest track.");
+        pixel_type v = make_pixel(value, multiplier, offset);
+        for (size_t y = 0; y < channel.h; y++) {
+          pixel_type *JXL_RESTRICT r = channel.Row(y);
+          std::fill(r, r + channel.w, v);
+        }
+      } else {
+        JXL_DEBUG_V(8, "Fast track.");
+        if (multiplier == 1 && offset == 0) {
+          for (size_t y = 0; y < channel.h; y++) {
+            pixel_type *JXL_RESTRICT r = channel.Row(y);
+            for (size_t x = 0; x < channel.w; x++) {
+              uint32_t v = reader->ReadHybridUintClustered(ctx_id, br);
+              r[x] = UnpackSigned(v);
+            }
+          }
+        } else {
+          for (size_t y = 0; y < channel.h; y++) {
+            pixel_type *JXL_RESTRICT r = channel.Row(y);
+            for (size_t x = 0; x < channel.w; x++) {
+              uint32_t v = reader->ReadHybridUintClustered(ctx_id, br);
+              r[x] = make_pixel(v, multiplier, offset);
+            }
+          }
+        }
+      }
+    } else if (predictor == Predictor::Gradient && offset == 0 &&
+               multiplier == 1 && reader->HuffRleOnly()) {
+      JXL_DEBUG_V(8, "Gradient RLE (fjxl) very fast track.");
+      uint32_t run = 0;
+      uint32_t v = 0;
+      pixel_type_w sv = 0;
+      for (size_t y = 0; y < channel.h; y++) {
+        pixel_type *JXL_RESTRICT r = channel.Row(y);
+        const pixel_type *JXL_RESTRICT rtop = (y ? channel.Row(y - 1) : r - 1);
+        const pixel_type *JXL_RESTRICT rtopleft =
+            (y ? channel.Row(y - 1) - 1 : r - 1);
+        pixel_type_w guess = (y ? rtop[0] : 0);
+        if (run == 0) {
+          reader->ReadHybridUintClusteredHuffRleOnly(ctx_id, br, &v, &run);
+          sv = UnpackSigned(v);
+        } else {
+          run--;
+        }
+        r[0] = sv + guess;
+        for (size_t x = 1; x < channel.w; x++) {
+          pixel_type left = r[x - 1];
+          pixel_type top = rtop[x];
+          pixel_type topleft = rtopleft[x];
+          pixel_type_w guess = ClampedGradient(top, left, topleft);
+          if (!run) {
+            reader->ReadHybridUintClusteredHuffRleOnly(ctx_id, br, &v, &run);
+            sv = UnpackSigned(v);
+          } else {
+            run--;
+          }
+          r[x] = sv + guess;
+        }
+      }
+    } else if (predictor == Predictor::Gradient && offset == 0 &&
+               multiplier == 1) {
+      JXL_DEBUG_V(8, "Gradient very fast track.");
+      const intptr_t onerow = channel.plane.PixelsPerRow();
+      for (size_t y = 0; y < channel.h; y++) {
+        pixel_type *JXL_RESTRICT r = channel.Row(y);
+        for (size_t x = 0; x < channel.w; x++) {
+          pixel_type left = (x ? r[x - 1] : y ? *(r + x - onerow) : 0);
+          pixel_type top = (y ? *(r + x - onerow) : left);
+          pixel_type topleft = (x && y ? *(r + x - 1 - onerow) : left);
+          pixel_type guess = ClampedGradient(top, left, topleft);
+          uint64_t v = reader->ReadHybridUintClustered(ctx_id, br);
+          r[x] = make_pixel(v, 1, guess);
+        }
+      }
+    } else if (predictor != Predictor::Weighted) {
+      // special optimized case: no wp
+      JXL_DEBUG_V(8, "Quite fast track.");
+      const intptr_t onerow = channel.plane.PixelsPerRow();
+      for (size_t y = 0; y < channel.h; y++) {
+        pixel_type *JXL_RESTRICT r = channel.Row(y);
+        for (size_t x = 0; x < channel.w; x++) {
+          PredictionResult pred =
+              PredictNoTreeNoWP(channel.w, r + x, onerow, x, y, predictor);
+          pixel_type_w g = pred.guess + offset;
+          uint64_t v = reader->ReadHybridUintClustered(ctx_id, br);
+          // NOTE: pred.multiplier is unset.
+          r[x] = make_pixel(v, multiplier, g);
+        }
+      }
+    } else {
+      JXL_DEBUG_V(8, "Somewhat fast track.");
+      const intptr_t onerow = channel.plane.PixelsPerRow();
+      weighted::State wp_state(wp_header, channel.w, channel.h);
+      for (size_t y = 0; y < channel.h; y++) {
+        pixel_type *JXL_RESTRICT r = channel.Row(y);
+        for (size_t x = 0; x < channel.w; x++) {
+          pixel_type_w g = PredictNoTreeWP(channel.w, r + x, onerow, x, y,
+                                           predictor, &wp_state)
+                               .guess +
+                           offset;
+          uint64_t v = reader->ReadHybridUintClustered(ctx_id, br);
+          r[x] = make_pixel(v, multiplier, g);
+          wp_state.UpdateErrors(r[x], x, y, channel.w);
+        }
+      }
+    }
+    return true;
+  }
+
+  // Check if this tree is a WP-only tree with a small enough property value
+  // range.
+  // Initialized to avoid clang-tidy complaining.
+  uint8_t context_lookup[2 * kPropRangeFast] = {};
+  int8_t multipliers[2 * kPropRangeFast] = {};
+  int8_t offsets[2 * kPropRangeFast] = {};
+  if (is_wp_only) {
+    is_wp_only = TreeToLookupTable(tree, context_lookup, offsets, multipliers);
+  }
+  if (is_gradient_only) {
+    is_gradient_only =
+        TreeToLookupTable(tree, context_lookup, offsets, multipliers);
+  }
+
+  if (is_gradient_only) {
+    JXL_DEBUG_V(8, "Gradient fast track.");
+    const intptr_t onerow = channel.plane.PixelsPerRow();
+    for (size_t y = 0; y < channel.h; y++) {
+      pixel_type *JXL_RESTRICT r = channel.Row(y);
+      for (size_t x = 0; x < channel.w; x++) {
+        pixel_type_w left = (x ? r[x - 1] : y ? *(r + x - onerow) : 0);
+        pixel_type_w top = (y ? *(r + x - onerow) : left);
+        pixel_type_w topleft = (x && y ? *(r + x - 1 - onerow) : left);
+        int32_t guess = ClampedGradient(top, left, topleft);
+        uint32_t pos =
+            kPropRangeFast +
+            std::min<pixel_type_w>(
+                std::max<pixel_type_w>(-kPropRangeFast, top + left - topleft),
+                kPropRangeFast - 1);
+        uint32_t ctx_id = context_lookup[pos];
+        uint64_t v = reader->ReadHybridUintClustered(ctx_id, br);
+        r[x] = make_pixel(v, multipliers[pos],
+                          static_cast<pixel_type_w>(offsets[pos]) + guess);
+      }
+    }
+  } else if (is_wp_only) {
+    JXL_DEBUG_V(8, "WP fast track.");
+    const intptr_t onerow = channel.plane.PixelsPerRow();
+    weighted::State wp_state(wp_header, channel.w, channel.h);
+    Properties properties(1);
+    for (size_t y = 0; y < channel.h; y++) {
+      pixel_type *JXL_RESTRICT r = channel.Row(y);
+      for (size_t x = 0; x < channel.w; x++) {
+        size_t offset = 0;
+        pixel_type_w left = (x ? r[x - 1] : y ? *(r + x - onerow) : 0);
+        pixel_type_w top = (y ? *(r + x - onerow) : left);
+        pixel_type_w topleft = (x && y ? *(r + x - 1 - onerow) : left);
+        pixel_type_w topright =
+            (x + 1 < channel.w && y ? *(r + x + 1 - onerow) : top);
+        pixel_type_w toptop = (y > 1 ? *(r + x - onerow - onerow) : top);
+        int32_t guess = wp_state.Predict</*compute_properties=*/true>(
+            x, y, channel.w, top, left, topright, topleft, toptop, &properties,
+            offset);
+        uint32_t pos =
+            kPropRangeFast + std::min(std::max(-kPropRangeFast, properties[0]),
+                                      kPropRangeFast - 1);
+        uint32_t ctx_id = context_lookup[pos];
+        uint64_t v = reader->ReadHybridUintClustered(ctx_id, br);
+        r[x] = make_pixel(v, multipliers[pos],
+                          static_cast<pixel_type_w>(offsets[pos]) + guess);
+        wp_state.UpdateErrors(r[x], x, y, channel.w);
+      }
+    }
+  } else if (!tree_has_wp_prop_or_pred) {
+    // special optimized case: the weighted predictor and its properties are not
+    // used, so no need to compute weights and properties.
+    JXL_DEBUG_V(8, "Slow track.");
+    MATreeLookup tree_lookup(tree);
+    Properties properties = Properties(num_props);
+    const intptr_t onerow = channel.plane.PixelsPerRow();
+    Channel references(properties.size() - kNumNonrefProperties, channel.w);
+    for (size_t y = 0; y < channel.h; y++) {
+      pixel_type *JXL_RESTRICT p = channel.Row(y);
+      PrecomputeReferences(channel, y, *image, chan, &references);
+      InitPropsRow(&properties, static_props, y);
+      if (y > 1 && channel.w > 8 && references.w == 0) {
+        for (size_t x = 0; x < 2; x++) {
+          PredictionResult res =
+              PredictTreeNoWP(&properties, channel.w, p + x, onerow, x, y,
+                              tree_lookup, references);
+          uint64_t v = reader->ReadHybridUintClustered(res.context, br);
+          p[x] = make_pixel(v, res.multiplier, res.guess);
+        }
+        for (size_t x = 2; x < channel.w - 2; x++) {
+          PredictionResult res =
+              PredictTreeNoWPNEC(&properties, channel.w, p + x, onerow, x, y,
+                                 tree_lookup, references);
+          uint64_t v = reader->ReadHybridUintClustered(res.context, br);
+          p[x] = make_pixel(v, res.multiplier, res.guess);
+        }
+        for (size_t x = channel.w - 2; x < channel.w; x++) {
+          PredictionResult res =
+              PredictTreeNoWP(&properties, channel.w, p + x, onerow, x, y,
+                              tree_lookup, references);
+          uint64_t v = reader->ReadHybridUintClustered(res.context, br);
+          p[x] = make_pixel(v, res.multiplier, res.guess);
+        }
+      } else {
+        for (size_t x = 0; x < channel.w; x++) {
+          PredictionResult res =
+              PredictTreeNoWP(&properties, channel.w, p + x, onerow, x, y,
+                              tree_lookup, references);
+          uint64_t v = reader->ReadHybridUintClustered(res.context, br);
+          p[x] = make_pixel(v, res.multiplier, res.guess);
+        }
+      }
+    }
+  } else {
+    JXL_DEBUG_V(8, "Slowest track.");
+    MATreeLookup tree_lookup(tree);
+    Properties properties = Properties(num_props);
+    const intptr_t onerow = channel.plane.PixelsPerRow();
+    Channel references(properties.size() - kNumNonrefProperties, channel.w);
+    weighted::State wp_state(wp_header, channel.w, channel.h);
+    for (size_t y = 0; y < channel.h; y++) {
+      pixel_type *JXL_RESTRICT p = channel.Row(y);
+      InitPropsRow(&properties, static_props, y);
+      PrecomputeReferences(channel, y, *image, chan, &references);
+      for (size_t x = 0; x < channel.w; x++) {
+        PredictionResult res =
+            PredictTreeWP(&properties, channel.w, p + x, onerow, x, y,
+                          tree_lookup, references, &wp_state);
+        uint64_t v = reader->ReadHybridUintClustered(res.context, br);
+        p[x] = make_pixel(v, res.multiplier, res.guess);
+        wp_state.UpdateErrors(p[x], x, y, channel.w);
+      }
+    }
+  }
+  return true;
+}
+
+GroupHeader::GroupHeader() { Bundle::Init(this); }
+
+Status ValidateChannelDimensions(const Image &image,
+                                 const ModularOptions &options) {
+  size_t nb_channels = image.channel.size();
+  for (bool is_dc : {true, false}) {
+    size_t group_dim = options.group_dim * (is_dc ? kBlockDim : 1);
+    size_t c = image.nb_meta_channels;
+    for (; c < nb_channels; c++) {
+      const Channel &ch = image.channel[c];
+      if (ch.w > options.group_dim || ch.h > options.group_dim) break;
+    }
+    for (; c < nb_channels; c++) {
+      const Channel &ch = image.channel[c];
+      if (ch.w == 0 || ch.h == 0) continue;  // skip empty
+      bool is_dc_channel = std::min(ch.hshift, ch.vshift) >= 3;
+      if (is_dc_channel != is_dc) continue;
+      size_t tile_dim = group_dim >> std::max(ch.hshift, ch.vshift);
+      if (tile_dim == 0) {
+        return JXL_FAILURE("Inconsistent transforms");
+      }
+    }
+  }
+  return true;
+}
+
+Status ModularDecode(BitReader *br, Image &image, GroupHeader &header,
+                     size_t group_id, ModularOptions *options,
+                     const Tree *global_tree, const ANSCode *global_code,
+                     const std::vector<uint8_t> *global_ctx_map,
+                     bool allow_truncated_group) {
+  if (image.channel.empty()) return true;
+
+  // decode transforms
+  Status status = Bundle::Read(br, &header);
+  if (!allow_truncated_group) JXL_RETURN_IF_ERROR(status);
+  if (status.IsFatalError()) return status;
+  if (!br->AllReadsWithinBounds()) {
+    // Don't do/undo transforms if header is incomplete.
+    header.transforms.clear();
+    image.transform = header.transforms;
+    for (size_t c = 0; c < image.channel.size(); c++) {
+      ZeroFillImage(&image.channel[c].plane);
+    }
+    return Status(StatusCode::kNotEnoughBytes);
+  }
+
+  JXL_DEBUG_V(3, "Image data underwent %" PRIuS " transformations: ",
+              header.transforms.size());
+  image.transform = header.transforms;
+  for (Transform &transform : image.transform) {
+    JXL_RETURN_IF_ERROR(transform.MetaApply(image));
+  }
+  if (image.error) {
+    return JXL_FAILURE("Corrupt file. Aborting.");
+  }
+  JXL_RETURN_IF_ERROR(ValidateChannelDimensions(image, *options));
+
+  size_t nb_channels = image.channel.size();
+
+  size_t num_chans = 0;
+  size_t distance_multiplier = 0;
+  for (size_t i = 0; i < nb_channels; i++) {
+    Channel &channel = image.channel[i];
+    if (!channel.w || !channel.h) {
+      continue;  // skip empty channels
+    }
+    if (i >= image.nb_meta_channels && (channel.w > options->max_chan_size ||
+                                        channel.h > options->max_chan_size)) {
+      break;
+    }
+    if (channel.w > distance_multiplier) {
+      distance_multiplier = channel.w;
+    }
+    num_chans++;
+  }
+  if (num_chans == 0) return true;
+
+  size_t next_channel = 0;
+  auto scope_guard = MakeScopeGuard([&]() {
+    // Do not do anything if truncated groups are not allowed.
+    if (!allow_truncated_group) return;
+    for (size_t c = next_channel; c < nb_channels; c++) {
+      ZeroFillImage(&image.channel[c].plane);
+    }
+  });
+
+  // Read tree.
+  Tree tree_storage;
+  std::vector<uint8_t> context_map_storage;
+  ANSCode code_storage;
+  const Tree *tree = &tree_storage;
+  const ANSCode *code = &code_storage;
+  const std::vector<uint8_t> *context_map = &context_map_storage;
+  if (!header.use_global_tree) {
+    size_t max_tree_size = 1024;
+    for (size_t i = 0; i < nb_channels; i++) {
+      Channel &channel = image.channel[i];
+      if (!channel.w || !channel.h) {
+        continue;  // skip empty channels
+      }
+      if (i >= image.nb_meta_channels && (channel.w > options->max_chan_size ||
+                                          channel.h > options->max_chan_size)) {
+        break;
+      }
+      size_t pixels = channel.w * channel.h;
+      if (pixels / channel.w != channel.h) {
+        return JXL_FAILURE("Tree size overflow");
+      }
+      max_tree_size += pixels;
+      if (max_tree_size < pixels) return JXL_FAILURE("Tree size overflow");
+    }
+    max_tree_size = std::min(static_cast<size_t>(1 << 20), max_tree_size);
+    JXL_RETURN_IF_ERROR(DecodeTree(br, &tree_storage, max_tree_size));
+    JXL_RETURN_IF_ERROR(DecodeHistograms(br, (tree_storage.size() + 1) / 2,
+                                         &code_storage, &context_map_storage));
+  } else {
+    if (!global_tree || !global_code || !global_ctx_map ||
+        global_tree->empty()) {
+      return JXL_FAILURE("No global tree available but one was requested");
+    }
+    tree = global_tree;
+    code = global_code;
+    context_map = global_ctx_map;
+  }
+
+  // Read channels
+  ANSSymbolReader reader(code, br, distance_multiplier);
+  for (; next_channel < nb_channels; next_channel++) {
+    Channel &channel = image.channel[next_channel];
+    if (!channel.w || !channel.h) {
+      continue;  // skip empty channels
+    }
+    if (next_channel >= image.nb_meta_channels &&
+        (channel.w > options->max_chan_size ||
+         channel.h > options->max_chan_size)) {
+      break;
+    }
+    JXL_RETURN_IF_ERROR(DecodeModularChannelMAANS(
+        br, &reader, *context_map, *tree, header.wp_header, next_channel,
+        group_id, &image));
+    // Truncated group.
+    if (!br->AllReadsWithinBounds()) {
+      if (!allow_truncated_group) return JXL_FAILURE("Truncated input");
+      return Status(StatusCode::kNotEnoughBytes);
+    }
+  }
+
+  // Make sure no zero-filling happens even if next_channel < nb_channels.
+  scope_guard.Disarm();
+
+  if (!reader.CheckANSFinalState()) {
+    return JXL_FAILURE("ANS decode final state failed");
+  }
+  return true;
+}
+
+Status ModularGenericDecompress(BitReader *br, Image &image,
+                                GroupHeader *header, size_t group_id,
+                                ModularOptions *options, bool undo_transforms,
+                                const Tree *tree, const ANSCode *code,
+                                const std::vector<uint8_t> *ctx_map,
+                                bool allow_truncated_group) {
+#ifdef JXL_ENABLE_ASSERT
+  std::vector<std::pair<uint32_t, uint32_t>> req_sizes(image.channel.size());
+  for (size_t c = 0; c < req_sizes.size(); c++) {
+    req_sizes[c] = {image.channel[c].w, image.channel[c].h};
+  }
+#endif
+  GroupHeader local_header;
+  if (header == nullptr) header = &local_header;
+  size_t bit_pos = br->TotalBitsConsumed();
+  auto dec_status = ModularDecode(br, image, *header, group_id, options, tree,
+                                  code, ctx_map, allow_truncated_group);
+  if (!allow_truncated_group) JXL_RETURN_IF_ERROR(dec_status);
+  if (dec_status.IsFatalError()) return dec_status;
+  if (undo_transforms) image.undo_transforms(header->wp_header);
+  if (image.error) return JXL_FAILURE("Corrupt file. Aborting.");
+  JXL_DEBUG_V(4,
+              "Modular-decoded a %" PRIuS "x%" PRIuS " nbchans=%" PRIuS
+              " image from %" PRIuS " bytes",
+              image.w, image.h, image.channel.size(),
+              (br->TotalBitsConsumed() - bit_pos) / 8);
+  JXL_DEBUG_V(5, "Modular image: %s", image.DebugString().c_str());
+  (void)bit_pos;
+#ifdef JXL_ENABLE_ASSERT
+  // Check that after applying all transforms we are back to the requested image
+  // sizes, otherwise there's a programming error with the transformations.
+  if (undo_transforms) {
+    JXL_ASSERT(image.channel.size() == req_sizes.size());
+    for (size_t c = 0; c < req_sizes.size(); c++) {
+      JXL_ASSERT(req_sizes[c].first == image.channel[c].w);
+      JXL_ASSERT(req_sizes[c].second == image.channel[c].h);
+    }
+  }
+#endif
+  return dec_status;
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/modular/encoding/encoding.h b/third_party/jpeg-xl/lib/jxl/modular/encoding/encoding.h
new file mode 100644
index 0000000000..89697bce87
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/modular/encoding/encoding.h
@@ -0,0 +1,135 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_MODULAR_ENCODING_ENCODING_H_
+#define LIB_JXL_MODULAR_ENCODING_ENCODING_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <vector>
+
+#include "lib/jxl/dec_ans.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/modular/encoding/context_predict.h"
+#include "lib/jxl/modular/encoding/dec_ma.h"
+#include "lib/jxl/modular/modular_image.h"
+#include "lib/jxl/modular/options.h"
+#include "lib/jxl/modular/transform/transform.h"
+
+namespace jxl {
+
+// Valid range of properties for using lookup tables instead of trees.
+constexpr int32_t kPropRangeFast = 512;
+
+struct GroupHeader : public Fields {
+  GroupHeader();
+
+  JXL_FIELDS_NAME(GroupHeader)
+
+  Status VisitFields(Visitor *JXL_RESTRICT visitor) override {
+    JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &use_global_tree));
+    JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&wp_header));
+    uint32_t num_transforms = static_cast<uint32_t>(transforms.size());
+    JXL_QUIET_RETURN_IF_ERROR(visitor->U32(Val(0), Val(1), BitsOffset(4, 2),
+                                           BitsOffset(8, 18), 0,
+                                           &num_transforms));
+    if (visitor->IsReading()) transforms.resize(num_transforms);
+    for (size_t i = 0; i < num_transforms; i++) {
+      JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&transforms[i]));
+    }
+    return true;
+  }
+
+  bool use_global_tree;
+  weighted::Header wp_header;
+
+  std::vector<Transform> transforms;
+};
+
+FlatTree FilterTree(const Tree &global_tree,
+                    std::array<pixel_type, kNumStaticProperties> &static_props,
+                    size_t *num_props, bool *use_wp, bool *wp_only,
+                    bool *gradient_only);
+
+template <typename T>
+bool TreeToLookupTable(const FlatTree &tree,
+                       T context_lookup[2 * kPropRangeFast],
+                       int8_t offsets[2 * kPropRangeFast],
+                       int8_t multipliers[2 * kPropRangeFast] = nullptr) {
+  struct TreeRange {
+    // Begin *excluded*, end *included*. This works best with > vs <= decision
+    // nodes.
+    int begin, end;
+    size_t pos;
+  };
+  std::vector<TreeRange> ranges;
+  ranges.push_back(TreeRange{-kPropRangeFast - 1, kPropRangeFast - 1, 0});
+  while (!ranges.empty()) {
+    TreeRange cur = ranges.back();
+    ranges.pop_back();
+    if (cur.begin < -kPropRangeFast - 1 || cur.begin >= kPropRangeFast - 1 ||
+        cur.end > kPropRangeFast - 1) {
+      // Tree is outside the allowed range, exit.
+      return false;
+    }
+    auto &node = tree[cur.pos];
+    // Leaf.
+    if (node.property0 == -1) {
+      if (node.predictor_offset < std::numeric_limits<int8_t>::min() ||
+          node.predictor_offset > std::numeric_limits<int8_t>::max()) {
+        return false;
+      }
+      if (node.multiplier < std::numeric_limits<int8_t>::min() ||
+          node.multiplier > std::numeric_limits<int8_t>::max()) {
+        return false;
+      }
+      if (multipliers == nullptr && node.multiplier != 1) {
+        return false;
+      }
+      for (int i = cur.begin + 1; i < cur.end + 1; i++) {
+        context_lookup[i + kPropRangeFast] = node.childID;
+        if (multipliers) multipliers[i + kPropRangeFast] = node.multiplier;
+        offsets[i + kPropRangeFast] = node.predictor_offset;
+      }
+      continue;
+    }
+    // > side of top node.
+    if (node.properties[0] >= kNumStaticProperties) {
+      ranges.push_back(TreeRange({node.splitvals[0], cur.end, node.childID}));
+      ranges.push_back(
+          TreeRange({node.splitval0, node.splitvals[0], node.childID + 1}));
+    } else {
+      ranges.push_back(TreeRange({node.splitval0, cur.end, node.childID}));
+    }
+    // <= side
+    if (node.properties[1] >= kNumStaticProperties) {
+      ranges.push_back(
+          TreeRange({node.splitvals[1], node.splitval0, node.childID + 2}));
+      ranges.push_back(
+          TreeRange({cur.begin, node.splitvals[1], node.childID + 3}));
+    } else {
+      ranges.push_back(
+          TreeRange({cur.begin, node.splitval0, node.childID + 2}));
+    }
+  }
+  return true;
+}
+// TODO(veluca): make cleaner interfaces.
+
+Status ValidateChannelDimensions(const Image &image,
+                                 const ModularOptions &options);
+
+Status ModularGenericDecompress(BitReader *br, Image &image,
+                                GroupHeader *header, size_t group_id,
+                                ModularOptions *options,
+                                bool undo_transforms = true,
+                                const Tree *tree = nullptr,
+                                const ANSCode *code = nullptr,
+                                const std::vector<uint8_t> *ctx_map = nullptr,
+                                bool allow_truncated_group = false);
+}  // namespace jxl
+
+#endif  // LIB_JXL_MODULAR_ENCODING_ENCODING_H_
diff --git a/third_party/jpeg-xl/lib/jxl/modular/encoding/ma_common.h b/third_party/jpeg-xl/lib/jxl/modular/encoding/ma_common.h
new file mode 100644
index 0000000000..71b7847321
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/modular/encoding/ma_common.h
@@ -0,0 +1,28 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_MODULAR_ENCODING_MA_COMMON_H_
+#define LIB_JXL_MODULAR_ENCODING_MA_COMMON_H_
+
+#include <stddef.h>
+
+namespace jxl {
+
+enum MATreeContext : size_t {
+  kSplitValContext = 0,
+  kPropertyContext = 1,
+  kPredictorContext = 2,
+  kOffsetContext = 3,
+  kMultiplierLogContext = 4,
+  kMultiplierBitsContext = 5,
+
+  kNumTreeContexts = 6,
+};
+
+static constexpr size_t kMaxTreeSize = 1 << 22;
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_MODULAR_ENCODING_MA_COMMON_H_
diff --git a/third_party/jpeg-xl/lib/jxl/modular/modular_image.cc b/third_party/jpeg-xl/lib/jxl/modular/modular_image.cc
new file mode 100644
index 0000000000..785d0c5443
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/modular/modular_image.cc
@@ -0,0 +1,77 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/modular/modular_image.h"
+
+#include <sstream>
+
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/modular/transform/transform.h"
+
+namespace jxl {
+
+void Image::undo_transforms(const weighted::Header &wp_header,
+                            jxl::ThreadPool *pool) {
+  while (!transform.empty()) {
+    Transform t = transform.back();
+    JXL_DEBUG_V(4, "Undoing transform");
+    Status result = t.Inverse(*this, wp_header, pool);
+    if (result == false) {
+      JXL_NOTIFY_ERROR("Error while undoing transform.");
+      error = true;
+      return;
+    }
+    JXL_DEBUG_V(8, "Undoing transform: done");
+    transform.pop_back();
+  }
+}
+
+Image::Image(size_t iw, size_t ih, int bitdepth, int nb_chans)
+    : w(iw), h(ih), bitdepth(bitdepth), nb_meta_channels(0), error(false) {
+  for (int i = 0; i < nb_chans; i++) channel.emplace_back(Channel(iw, ih));
+}
+
+Image::Image() : w(0), h(0), bitdepth(8), nb_meta_channels(0), error(true) {}
+
+Image &Image::operator=(Image &&other) noexcept {
+  w = other.w;
+  h = other.h;
+  bitdepth = other.bitdepth;
+  nb_meta_channels = other.nb_meta_channels;
+  error = other.error;
+  channel = std::move(other.channel);
+  transform = std::move(other.transform);
+  return *this;
+}
+
+Image Image::clone() {
+  Image c(w, h, bitdepth, 0);
+  c.nb_meta_channels = nb_meta_channels;
+  c.error = error;
+  c.transform = transform;
+  for (Channel &ch : channel) {
+    Channel a(ch.w, ch.h, ch.hshift, ch.vshift);
+    CopyImageTo(ch.plane, &a.plane);
+    c.channel.push_back(std::move(a));
+  }
+  return c;
+}
+
+std::string Image::DebugString() const {
+  std::ostringstream os;
+  os << w << "x" << h << ", depth: " << bitdepth;
+  if (!channel.empty()) {
+    os << ", channels:";
+    for (size_t i = 0; i < channel.size(); ++i) {
+      os << " " << channel[i].w << "x" << channel[i].h
+         << "(shift: " << channel[i].hshift << "," << channel[i].vshift << ")";
+      if (i < nb_meta_channels) os << "*";
+    }
+  }
+  return os.str();
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/modular/modular_image.h b/third_party/jpeg-xl/lib/jxl/modular/modular_image.h
new file mode 100644
index 0000000000..3e9b5a8a08
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/modular/modular_image.h
@@ -0,0 +1,118 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_MODULAR_MODULAR_IMAGE_H_
+#define LIB_JXL_MODULAR_MODULAR_IMAGE_H_
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/image_ops.h"
+
+namespace jxl {
+
+typedef int32_t pixel_type;  // can use int16_t if it's only for 8-bit images.
+                             // Need some wiggle room for YCoCg / Squeeze etc
+
+typedef int64_t pixel_type_w;
+
+namespace weighted {
+struct Header;
+}
+
+class Channel {
+ public:
+  jxl::Plane<pixel_type> plane;
+  size_t w, h;
+  int hshift, vshift;  // w ~= image.w >> hshift;  h ~= image.h >> vshift
+  Channel(size_t iw, size_t ih, int hsh = 0, int vsh = 0)
+      : plane(iw, ih), w(iw), h(ih), hshift(hsh), vshift(vsh) {}
+
+  Channel(const Channel& other) = delete;
+  Channel& operator=(const Channel& other) = delete;
+
+  // Move assignment
+  Channel& operator=(Channel&& other) noexcept {
+    w = other.w;
+    h = other.h;
+    hshift = other.hshift;
+    vshift = other.vshift;
+    plane = std::move(other.plane);
+    return *this;
+  }
+
+  // Move constructor
+  Channel(Channel&& other) noexcept = default;
+
+  void shrink() {
+    if (plane.xsize() == w && plane.ysize() == h) return;
+    jxl::Plane<pixel_type> resizedplane(w, h);
+    plane = std::move(resizedplane);
+  }
+  void shrink(int nw, int nh) {
+    w = nw;
+    h = nh;
+    shrink();
+  }
+
+  JXL_INLINE pixel_type* Row(const size_t y) { return plane.Row(y); }
+  JXL_INLINE const pixel_type* Row(const size_t y) const {
+    return plane.Row(y);
+  }
+};
+
+class Transform;
+
+class Image {
+ public:
+  // image data, transforms can dramatically change the number of channels and
+  // their semantics
+  std::vector<Channel> channel;
+  // transforms that have been applied (and that have to be undone)
+  std::vector<Transform> transform;
+
+  // image dimensions (channels may have different dimensions due to transforms)
+  size_t w, h;
+  int bitdepth;
+  size_t nb_meta_channels;  // first few channels might contain palette(s)
+  bool error;               // true if a fatal error occurred, false otherwise
+
+  Image(size_t iw, size_t ih, int bitdepth, int nb_chans);
+  Image();
+
+  Image(const Image& other) = delete;
+  Image& operator=(const Image& other) = delete;
+
+  Image& operator=(Image&& other) noexcept;
+  Image(Image&& other) noexcept = default;
+
+  bool empty() const {
+    for (const auto& ch : channel) {
+      if (ch.w && ch.h) return false;
+    }
+    return true;
+  }
+
+  Image clone();
+
+  void undo_transforms(const weighted::Header& wp_header,
+                       jxl::ThreadPool* pool = nullptr);
+
+  std::string DebugString() const;
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_MODULAR_MODULAR_IMAGE_H_
diff --git a/third_party/jpeg-xl/lib/jxl/modular/options.h b/third_party/jpeg-xl/lib/jxl/modular/options.h
new file mode 100644
index 0000000000..ce6596b912
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/modular/options.h
@@ -0,0 +1,117 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_MODULAR_OPTIONS_H_
+#define LIB_JXL_MODULAR_OPTIONS_H_
+
+#include <stdint.h>
+
+#include <array>
+#include <vector>
+
+namespace jxl {
+
+using PropertyVal = int32_t;
+using Properties = std::vector<PropertyVal>;
+
+enum class Predictor : uint32_t {
+  Zero = 0,
+  Left = 1,
+  Top = 2,
+  Average0 = 3,
+  Select = 4,
+  Gradient = 5,
+  Weighted = 6,
+  TopRight = 7,
+  TopLeft = 8,
+  LeftLeft = 9,
+  Average1 = 10,
+  Average2 = 11,
+  Average3 = 12,
+  Average4 = 13,
+  // The following predictors are encoder-only.
+  Best = 14,  // Best of Gradient and Weighted
+  Variable =
+      15,  // Find the best decision tree for predictors/predictor per row
+};
+
+constexpr size_t kNumModularPredictors =
+    static_cast<size_t>(Predictor::Average4) + 1;
+constexpr size_t kNumModularEncoderPredictors =
+    static_cast<size_t>(Predictor::Variable) + 1;
+
+static constexpr ssize_t kNumStaticProperties = 2;  // channel, group_id.
+
+using StaticPropRange =
+    std::array<std::array<uint32_t, 2>, kNumStaticProperties>;
+
+struct ModularMultiplierInfo {
+  StaticPropRange range;
+  uint32_t multiplier;
+};
+
+struct ModularOptions {
+  /// Used in both encode and decode:
+
+  // Stop encoding/decoding when reaching a (non-meta) channel that has a
+  // dimension bigger than max_chan_size.
+  size_t max_chan_size = 0xFFFFFF;
+
+  // Used during decoding for validation of transforms (sqeeezing) scheme.
+  size_t group_dim = 0x1FFFFFFF;
+
+  /// Encode options:
+  // Fraction of pixels to look at to learn a MA tree
+  // Number of iterations to do to learn a MA tree
+  // (if zero there is no MA context model)
+  float nb_repeats = .5f;
+
+  // Maximum number of (previous channel) properties to use in the MA trees
+  int max_properties = 0;  // no previous channels
+
+  // Alternative heuristic tweaks.
+  // Properties default to channel, group, weighted, gradient residual, W-NW,
+  // NW-N, N-NE, N-NN
+  std::vector<uint32_t> splitting_heuristics_properties = {0,  1,  15, 9,
+                                                           10, 11, 12, 13};
+  float splitting_heuristics_node_threshold = 96;
+  size_t max_property_values = 32;
+
+  // Predictor to use for each channel.
+  Predictor predictor = static_cast<Predictor>(-1);
+
+  int wp_mode = 0;
+
+  float fast_decode_multiplier = 1.01f;
+
+  // Forces the encoder to produce a tree that is compatible with the WP-only
+  // decode path (or with the no-wp path, or the gradient-only path).
+  enum class TreeMode { kGradientOnly, kWPOnly, kNoWP, kDefault };
+  TreeMode wp_tree_mode = TreeMode::kDefault;
+
+  // Skip fast paths in the encoder.
+  bool skip_encoder_fast_path = false;
+
+  // Kind of tree to use.
+  // TODO(veluca): add tree kinds for JPEG recompression with CfL enabled,
+  // general AC metadata, different DC qualities, and others.
+  enum class TreeKind {
+    kTrivialTreeNoPredictor,
+    kLearn,
+    kJpegTranscodeACMeta,
+    kFalconACMeta,
+    kACMeta,
+    kWPFixedDC,
+    kGradientFixedDC,
+  };
+  TreeKind tree_kind = TreeKind::kLearn;
+
+  // Ignore the image and just pretend all tokens are zeroes
+  bool zero_tokens = false;
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_MODULAR_OPTIONS_H_
diff --git a/third_party/jpeg-xl/lib/jxl/modular/transform/enc_palette.cc b/third_party/jpeg-xl/lib/jxl/modular/transform/enc_palette.cc
new file mode 100644
index 0000000000..bc31445bc5
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/modular/transform/enc_palette.cc
@@ -0,0 +1,606 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/modular/transform/enc_palette.h"
+
+#include <array>
+#include <map>
+#include <set>
+
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/modular/encoding/context_predict.h"
+#include "lib/jxl/modular/modular_image.h"
+#include "lib/jxl/modular/transform/enc_transform.h"
+#include "lib/jxl/modular/transform/palette.h"
+
+namespace jxl {
+
+namespace palette_internal {
+
+static constexpr bool kEncodeToHighQualityImplicitPalette = true;
+
+// Inclusive.
+static constexpr int kMinImplicitPaletteIndex = -(2 * 72 - 1);
+
+float ColorDistance(const std::vector<float> &JXL_RESTRICT a,
+                    const std::vector<pixel_type> &JXL_RESTRICT b) {
+  JXL_ASSERT(a.size() == b.size());
+  float distance = 0;
+  float ave3 = 0;
+  if (a.size() >= 3) {
+    ave3 = (a[0] + b[0] + a[1] + b[1] + a[2] + b[2]) * (1.21f / 3.0f);
+  }
+  float sum_a = 0, sum_b = 0;
+  for (size_t c = 0; c < a.size(); ++c) {
+    const float difference =
+        static_cast<float>(a[c]) - static_cast<float>(b[c]);
+    float weight = c == 0 ? 3 : c == 1 ? 5 : 2;
+    if (c < 3 && (a[c] + b[c] >= ave3)) {
+      const float add_w[3] = {
+          1.15,
+          1.15,
+          1.12,
+      };
+      weight += add_w[c];
+      if (c == 2 && ((a[2] + b[2]) < 1.22 * ave3)) {
+        weight -= 0.5;
+      }
+    }
+    distance += difference * difference * weight * weight;
+    const int sum_weight = c == 0 ? 3 : c == 1 ? 5 : 1;
+    sum_a += a[c] * sum_weight;
+    sum_b += b[c] * sum_weight;
+  }
+  distance *= 4;
+  float sum_difference = sum_a - sum_b;
+  distance += sum_difference * sum_difference;
+  return distance;
+}
+
+static int QuantizeColorToImplicitPaletteIndex(
+    const std::vector<pixel_type> &color, const int palette_size,
+    const int bit_depth, bool high_quality) {
+  int index = 0;
+  if (high_quality) {
+    int multiplier = 1;
+    for (size_t c = 0; c < color.size(); c++) {
+      int quantized = ((kLargeCube - 1) * color[c] + (1 << (bit_depth - 1))) /
+                      ((1 << bit_depth) - 1);
+      JXL_ASSERT((quantized % kLargeCube) == quantized);
+      index += quantized * multiplier;
+      multiplier *= kLargeCube;
+    }
+    return index + palette_size + kLargeCubeOffset;
+  } else {
+    int multiplier = 1;
+    for (size_t c = 0; c < color.size(); c++) {
+      int value = color[c];
+      value -= 1 << (std::max(0, bit_depth - 3));
+      value = std::max(0, value);
+      int quantized = ((kLargeCube - 1) * value + (1 << (bit_depth - 1))) /
+                      ((1 << bit_depth) - 1);
+      JXL_ASSERT((quantized % kLargeCube) == quantized);
+      if (quantized > kSmallCube - 1) {
+        quantized = kSmallCube - 1;
+      }
+      index += quantized * multiplier;
+      multiplier *= kSmallCube;
+    }
+    return index + palette_size;
+  }
+}
+
+}  // namespace palette_internal
+
+int RoundInt(int value, int div) {  // symmetric rounding around 0
+  if (value < 0) return -RoundInt(-value, div);
+  return (value + div / 2) / div;
+}
+
+struct PaletteIterationData {
+  static constexpr int kMaxDeltas = 128;
+  bool final_run = false;
+  std::vector<pixel_type> deltas[3];
+  std::vector<double> delta_distances;
+  std::vector<pixel_type> frequent_deltas[3];
+
+  // Populates `frequent_deltas` with items from `deltas` based on frequencies
+  // and color distances.
+  void FindFrequentColorDeltas(int num_pixels, int bitdepth) {
+    using pixel_type_3d = std::array<pixel_type, 3>;
+    std::map<pixel_type_3d, double> delta_frequency_map;
+    pixel_type bucket_size = 3 << std::max(0, bitdepth - 8);
+    // Store frequency weighted by delta distance from quantized value.
+    for (size_t i = 0; i < deltas[0].size(); ++i) {
+      pixel_type_3d delta = {
+          {RoundInt(deltas[0][i], bucket_size),
+           RoundInt(deltas[1][i], bucket_size),
+           RoundInt(deltas[2][i], bucket_size)}};  // a basic form of clustering
+      if (delta[0] == 0 && delta[1] == 0 && delta[2] == 0) continue;
+      delta_frequency_map[delta] += sqrt(sqrt(delta_distances[i]));
+    }
+
+    const float delta_distance_multiplier = 1.0f / num_pixels;
+
+    // Weigh frequencies by magnitude and normalize.
+    for (auto &delta_frequency : delta_frequency_map) {
+      std::vector<pixel_type> current_delta = {delta_frequency.first[0],
+                                               delta_frequency.first[1],
+                                               delta_frequency.first[2]};
+      float delta_distance =
+          sqrt(palette_internal::ColorDistance({0, 0, 0}, current_delta)) + 1;
+      delta_frequency.second *= delta_distance * delta_distance_multiplier;
+    }
+
+    // Sort by weighted frequency.
+    using pixel_type_3d_frequency = std::pair<pixel_type_3d, double>;
+    std::vector<pixel_type_3d_frequency> sorted_delta_frequency_map(
+        delta_frequency_map.begin(), delta_frequency_map.end());
+    std::sort(
+        sorted_delta_frequency_map.begin(), sorted_delta_frequency_map.end(),
+        [](const pixel_type_3d_frequency &a, const pixel_type_3d_frequency &b) {
+          return a.second > b.second;
+        });
+
+    // Store the top deltas.
+    for (auto &delta_frequency : sorted_delta_frequency_map) {
+      if (frequent_deltas[0].size() >= kMaxDeltas) break;
+      // Number obtained by optimizing on jyrki31 corpus:
+      if (delta_frequency.second < 17) break;
+      for (int c = 0; c < 3; ++c) {
+        frequent_deltas[c].push_back(delta_frequency.first[c] * bucket_size);
+      }
+    }
+  }
+};
+
+Status FwdPaletteIteration(Image &input, uint32_t begin_c, uint32_t end_c,
+                           uint32_t &nb_colors, uint32_t &nb_deltas,
+                           bool ordered, bool lossy, Predictor &predictor,
+                           const weighted::Header &wp_header,
+                           PaletteIterationData &palette_iteration_data) {
+  JXL_QUIET_RETURN_IF_ERROR(CheckEqualChannels(input, begin_c, end_c));
+  JXL_ASSERT(begin_c >= input.nb_meta_channels);
+  uint32_t nb = end_c - begin_c + 1;
+
+  size_t w = input.channel[begin_c].w;
+  size_t h = input.channel[begin_c].h;
+
+  if (!lossy && nb == 1) {
+    // Channel palette special case
+    if (nb_colors == 0) return false;
+    std::vector<pixel_type> lookup;
+    pixel_type minval, maxval;
+    compute_minmax(input.channel[begin_c], &minval, &maxval);
+    size_t lookup_table_size =
+        static_cast<int64_t>(maxval) - static_cast<int64_t>(minval) + 1;
+    if (lookup_table_size > palette_internal::kMaxPaletteLookupTableSize) {
+      // a lookup table would use too much memory, instead use a slower approach
+      // with std::set
+      std::set<pixel_type> chpalette;
+      pixel_type idx = 0;
+      for (size_t y = 0; y < h; y++) {
+        const pixel_type *p = input.channel[begin_c].Row(y);
+        for (size_t x = 0; x < w; x++) {
+          const bool new_color = chpalette.insert(p[x]).second;
+          if (new_color) {
+            idx++;
+            if (idx > (int)nb_colors) return false;
+          }
+        }
+      }
+      JXL_DEBUG_V(6, "Channel %i uses only %i colors.", begin_c, idx);
+      Channel pch(idx, 1);
+      pch.hshift = -1;
+      pch.vshift = -1;
+      nb_colors = idx;
+      idx = 0;
+      pixel_type *JXL_RESTRICT p_palette = pch.Row(0);
+      for (pixel_type p : chpalette) {
+        p_palette[idx++] = p;
+      }
+      for (size_t y = 0; y < h; y++) {
+        pixel_type *p = input.channel[begin_c].Row(y);
+        for (size_t x = 0; x < w; x++) {
+          for (idx = 0; p[x] != p_palette[idx] && idx < (int)nb_colors; idx++) {
+          }
+          JXL_DASSERT(idx < (int)nb_colors);
+          p[x] = idx;
+        }
+      }
+      predictor = Predictor::Zero;
+      input.nb_meta_channels++;
+      input.channel.insert(input.channel.begin(), std::move(pch));
+
+      return true;
+    }
+    lookup.resize(lookup_table_size, 0);
+    pixel_type idx = 0;
+    for (size_t y = 0; y < h; y++) {
+      const pixel_type *p = input.channel[begin_c].Row(y);
+      for (size_t x = 0; x < w; x++) {
+        if (lookup[p[x] - minval] == 0) {
+          lookup[p[x] - minval] = 1;
+          idx++;
+          if (idx > (int)nb_colors) return false;
+        }
+      }
+    }
+    JXL_DEBUG_V(6, "Channel %i uses only %i colors.", begin_c, idx);
+    Channel pch(idx, 1);
+    pch.hshift = -1;
+    pch.vshift = -1;
+    nb_colors = idx;
+    idx = 0;
+    pixel_type *JXL_RESTRICT p_palette = pch.Row(0);
+    for (size_t i = 0; i < lookup_table_size; i++) {
+      if (lookup[i]) {
+        p_palette[idx] = i + minval;
+        lookup[i] = idx;
+        idx++;
+      }
+    }
+    for (size_t y = 0; y < h; y++) {
+      pixel_type *p = input.channel[begin_c].Row(y);
+      for (size_t x = 0; x < w; x++) p[x] = lookup[p[x] - minval];
+    }
+    predictor = Predictor::Zero;
+    input.nb_meta_channels++;
+    input.channel.insert(input.channel.begin(), std::move(pch));
+    return true;
+  }
+
+  Image quantized_input;
+  if (lossy) {
+    quantized_input = Image(w, h, input.bitdepth, nb);
+    for (size_t c = 0; c < nb; c++) {
+      CopyImageTo(input.channel[begin_c + c].plane,
+                  &quantized_input.channel[c].plane);
+    }
+  }
+
+  JXL_DEBUG_V(
+      7, "Trying to represent channels %i-%i using at most a %i-color palette.",
+      begin_c, end_c, nb_colors);
+  nb_deltas = 0;
+  bool delta_used = false;
+  std::set<std::vector<pixel_type>>
+      candidate_palette;  // ordered lexicographically
+  std::vector<std::vector<pixel_type>> candidate_palette_imageorder;
+  std::vector<pixel_type> color(nb);
+  std::vector<float> color_with_error(nb);
+  std::vector<const pixel_type *> p_in(nb);
+
+  if (lossy) {
+    palette_iteration_data.FindFrequentColorDeltas(w * h, input.bitdepth);
+    nb_deltas = palette_iteration_data.frequent_deltas[0].size();
+
+    // Count color frequency for colors that make a cross.
+    std::map<std::vector<pixel_type>, size_t> color_freq_map;
+    for (size_t y = 1; y + 1 < h; y++) {
+      for (uint32_t c = 0; c < nb; c++) {
+        p_in[c] = input.channel[begin_c + c].Row(y);
+      }
+      for (size_t x = 1; x + 1 < w; x++) {
+        for (uint32_t c = 0; c < nb; c++) {
+          color[c] = p_in[c][x];
+        }
+        int offsets[4][2] = {{1, 0}, {-1, 0}, {0, 1}, {0, -1}};
+        bool makes_cross = true;
+        for (int i = 0; i < 4 && makes_cross; ++i) {
+          int dx = offsets[i][0];
+          int dy = offsets[i][1];
+          for (uint32_t c = 0; c < nb && makes_cross; c++) {
+            if (input.channel[begin_c + c].Row(y + dy)[x + dx] != color[c]) {
+              makes_cross = false;
+            }
+          }
+        }
+        if (makes_cross) color_freq_map[color] += 1;
+      }
+    }
+    // Add colors satisfying frequency condition to the palette.
+    constexpr float kImageFraction = 0.01f;
+    size_t color_frequency_lower_bound = 5 + input.h * input.w * kImageFraction;
+    for (const auto &color_freq : color_freq_map) {
+      if (color_freq.second > color_frequency_lower_bound) {
+        candidate_palette.insert(color_freq.first);
+        candidate_palette_imageorder.push_back(color_freq.first);
+      }
+    }
+  }
+
+  for (size_t y = 0; y < h; y++) {
+    for (uint32_t c = 0; c < nb; c++) {
+      p_in[c] = input.channel[begin_c + c].Row(y);
+    }
+    for (size_t x = 0; x < w; x++) {
+      if (lossy && candidate_palette.size() >= nb_colors) break;
+      for (uint32_t c = 0; c < nb; c++) {
+        color[c] = p_in[c][x];
+      }
+      const bool new_color = candidate_palette.insert(color).second;
+      if (new_color) {
+        candidate_palette_imageorder.push_back(color);
+      }
+      if (candidate_palette.size() > nb_colors) {
+        return false;  // too many colors
+      }
+    }
+  }
+
+  nb_colors = nb_deltas + candidate_palette.size();
+  JXL_DEBUG_V(6, "Channels %i-%i can be represented using a %i-color palette.",
+              begin_c, end_c, nb_colors);
+
+  Channel pch(nb_colors, nb);
+  pch.hshift = -1;
+  pch.vshift = -1;
+  pixel_type *JXL_RESTRICT p_palette = pch.Row(0);
+  intptr_t onerow = pch.plane.PixelsPerRow();
+  intptr_t onerow_image = input.channel[begin_c].plane.PixelsPerRow();
+  const int bit_depth = std::min(input.bitdepth, 24);
+
+  if (lossy) {
+    for (uint32_t i = 0; i < nb_deltas; i++) {
+      for (size_t c = 0; c < 3; c++) {
+        p_palette[c * onerow + i] =
+            palette_iteration_data.frequent_deltas[c][i];
+      }
+    }
+  }
+
+  int x = 0;
+  if (ordered) {
+    JXL_DEBUG_V(7, "Palette of %i colors, using lexicographic order",
+                nb_colors);
+    for (auto pcol : candidate_palette) {
+      JXL_DEBUG_V(9, "  Color %i :  ", x);
+      for (size_t i = 0; i < nb; i++) {
+        p_palette[nb_deltas + i * onerow + x] = pcol[i];
+      }
+      for (size_t i = 0; i < nb; i++) {
+        JXL_DEBUG_V(9, "%i ", pcol[i]);
+      }
+      x++;
+    }
+  } else {
+    JXL_DEBUG_V(7, "Palette of %i colors, using image order", nb_colors);
+    for (auto pcol : candidate_palette_imageorder) {
+      JXL_DEBUG_V(9, "  Color %i :  ", x);
+      for (size_t i = 0; i < nb; i++)
+        p_palette[nb_deltas + i * onerow + x] = pcol[i];
+      for (size_t i = 0; i < nb; i++) JXL_DEBUG_V(9, "%i ", pcol[i]);
+      x++;
+    }
+  }
+  std::vector<weighted::State> wp_states;
+  for (size_t c = 0; c < nb; c++) {
+    wp_states.emplace_back(wp_header, w, h);
+  }
+  std::vector<pixel_type *> p_quant(nb);
+  // Three rows of error for dithering: y to y + 2.
+  // Each row has two pixels of padding in the ends, which is
+  // beneficial for both precision and encoding speed.
+  std::vector<std::vector<float>> error_row[3];
+  if (lossy) {
+    for (int i = 0; i < 3; ++i) {
+      error_row[i].resize(nb);
+      for (size_t c = 0; c < nb; ++c) {
+        error_row[i][c].resize(w + 4);
+      }
+    }
+  }
+  for (size_t y = 0; y < h; y++) {
+    for (size_t c = 0; c < nb; c++) {
+      p_in[c] = input.channel[begin_c + c].Row(y);
+      if (lossy) p_quant[c] = quantized_input.channel[c].Row(y);
+    }
+    pixel_type *JXL_RESTRICT p = input.channel[begin_c].Row(y);
+    for (size_t x = 0; x < w; x++) {
+      int index;
+      if (!lossy) {
+        for (size_t c = 0; c < nb; c++) color[c] = p_in[c][x];
+        // Exact search.
+        for (index = 0; static_cast<uint32_t>(index) < nb_colors; index++) {
+          bool found = true;
+          for (size_t c = 0; c < nb; c++) {
+            if (color[c] != p_palette[c * onerow + index]) {
+              found = false;
+              break;
+            }
+          }
+          if (found) break;
+        }
+        if (index < static_cast<int>(nb_deltas)) {
+          delta_used = true;
+        }
+      } else {
+        int best_index = 0;
+        bool best_is_delta = false;
+        float best_distance = std::numeric_limits<float>::infinity();
+        std::vector<pixel_type> best_val(nb, 0);
+        std::vector<pixel_type> ideal_residual(nb, 0);
+        std::vector<pixel_type> quantized_val(nb);
+        std::vector<pixel_type> predictions(nb);
+        static const double kDiffusionMultiplier[] = {0.55, 0.75};
+        for (int diffusion_index = 0; diffusion_index < 2; ++diffusion_index) {
+          for (size_t c = 0; c < nb; c++) {
+            color_with_error[c] =
+                p_in[c][x] + palette_iteration_data.final_run *
+                                 kDiffusionMultiplier[diffusion_index] *
+                                 error_row[0][c][x + 2];
+            color[c] = Clamp1(lroundf(color_with_error[c]), 0l,
+                              (1l << input.bitdepth) - 1);
+          }
+
+          for (size_t c = 0; c < nb; ++c) {
+            predictions[c] = PredictNoTreeWP(w, p_quant[c] + x, onerow_image, x,
+                                             y, predictor, &wp_states[c])
+                                 .guess;
+          }
+          const auto TryIndex = [&](const int index) {
+            for (size_t c = 0; c < nb; c++) {
+              quantized_val[c] = palette_internal::GetPaletteValue(
+                  p_palette, index, /*c=*/c,
+                  /*palette_size=*/nb_colors,
+                  /*onerow=*/onerow, /*bit_depth=*/bit_depth);
+              if (index < static_cast<int>(nb_deltas)) {
+                quantized_val[c] += predictions[c];
+              }
+            }
+            const float color_distance =
+                32.0 / (1LL << std::max(0, 2 * (bit_depth - 8))) *
+                palette_internal::ColorDistance(color_with_error,
+                                                quantized_val);
+            float index_penalty = 0;
+            if (index == -1) {
+              index_penalty = -124;
+            } else if (index < 0) {
+              index_penalty = -2 * index;
+            } else if (index < static_cast<int>(nb_deltas)) {
+              index_penalty = 250;
+            } else if (index < static_cast<int>(nb_colors)) {
+              index_penalty = 150;
+            } else if (index < static_cast<int>(nb_colors) +
+                                   palette_internal::kLargeCubeOffset) {
+              index_penalty = 70;
+            } else {
+              index_penalty = 256;
+            }
+            const float distance = color_distance + index_penalty;
+            if (distance < best_distance) {
+              best_distance = distance;
+              best_index = index;
+              best_is_delta = index < static_cast<int>(nb_deltas);
+              best_val.swap(quantized_val);
+              for (size_t c = 0; c < nb; ++c) {
+                ideal_residual[c] = color_with_error[c] - predictions[c];
+              }
+            }
+          };
+          for (index = palette_internal::kMinImplicitPaletteIndex;
+               index < static_cast<int32_t>(nb_colors); index++) {
+            TryIndex(index);
+          }
+          TryIndex(palette_internal::QuantizeColorToImplicitPaletteIndex(
+              color, nb_colors, bit_depth,
+              /*high_quality=*/false));
+          if (palette_internal::kEncodeToHighQualityImplicitPalette) {
+            TryIndex(palette_internal::QuantizeColorToImplicitPaletteIndex(
+                color, nb_colors, bit_depth,
+                /*high_quality=*/true));
+          }
+        }
+        index = best_index;
+        delta_used |= best_is_delta;
+        if (!palette_iteration_data.final_run) {
+          for (size_t c = 0; c < 3; ++c) {
+            palette_iteration_data.deltas[c].push_back(ideal_residual[c]);
+          }
+          palette_iteration_data.delta_distances.push_back(best_distance);
+        }
+
+        for (size_t c = 0; c < nb; ++c) {
+          wp_states[c].UpdateErrors(best_val[c], x, y, w);
+          p_quant[c][x] = best_val[c];
+        }
+        float len_error = 0;
+        for (size_t c = 0; c < nb; ++c) {
+          float local_error = color_with_error[c] - best_val[c];
+          len_error += local_error * local_error;
+        }
+        len_error = sqrt(len_error);
+        float modulate = 1.0;
+        int len_limit = 38 << std::max(0, bit_depth - 8);
+        if (len_error > len_limit) {
+          modulate *= len_limit / len_error;
+        }
+        for (size_t c = 0; c < nb; ++c) {
+          float total_error = (color_with_error[c] - best_val[c]);
+
+          // If the neighboring pixels have some error in the opposite
+          // direction of total_error, cancel some or all of it out before
+          // spreading among them.
+          constexpr int offsets[12][2] = {{1, 2}, {0, 3}, {0, 4}, {1, 1},
+                                          {1, 3}, {2, 2}, {1, 0}, {1, 4},
+                                          {2, 1}, {2, 3}, {2, 0}, {2, 4}};
+          float total_available = 0;
+          for (int i = 0; i < 11; ++i) {
+            const int row = offsets[i][0];
+            const int col = offsets[i][1];
+            if (std::signbit(error_row[row][c][x + col]) !=
+                std::signbit(total_error)) {
+              total_available += error_row[row][c][x + col];
+            }
+          }
+          float weight =
+              std::abs(total_error) / (std::abs(total_available) + 1e-3);
+          weight = std::min(weight, 1.0f);
+          for (int i = 0; i < 11; ++i) {
+            const int row = offsets[i][0];
+            const int col = offsets[i][1];
+            if (std::signbit(error_row[row][c][x + col]) !=
+                std::signbit(total_error)) {
+              total_error += weight * error_row[row][c][x + col];
+              error_row[row][c][x + col] *= (1 - weight);
+            }
+          }
+          total_error *= modulate;
+          const float remaining_error = (1.0f / 14.) * total_error;
+          error_row[0][c][x + 3] += 2 * remaining_error;
+          error_row[0][c][x + 4] += remaining_error;
+          error_row[1][c][x + 0] += remaining_error;
+          for (int i = 0; i < 5; ++i) {
+            error_row[1][c][x + i] += remaining_error;
+            error_row[2][c][x + i] += remaining_error;
+          }
+        }
+      }
+      if (palette_iteration_data.final_run) p[x] = index;
+    }
+    if (lossy) {
+      for (size_t c = 0; c < nb; ++c) {
+        error_row[0][c].swap(error_row[1][c]);
+        error_row[1][c].swap(error_row[2][c]);
+        std::fill(error_row[2][c].begin(), error_row[2][c].end(), 0.f);
+      }
+    }
+  }
+  if (!delta_used) {
+    predictor = Predictor::Zero;
+  }
+  if (palette_iteration_data.final_run) {
+    input.nb_meta_channels++;
+    input.channel.erase(input.channel.begin() + begin_c + 1,
+                        input.channel.begin() + end_c + 1);
+    input.channel.insert(input.channel.begin(), std::move(pch));
+  }
+  nb_colors -= nb_deltas;
+  return true;
+}
+
+Status FwdPalette(Image &input, uint32_t begin_c, uint32_t end_c,
+                  uint32_t &nb_colors, uint32_t &nb_deltas, bool ordered,
+                  bool lossy, Predictor &predictor,
+                  const weighted::Header &wp_header) {
+  PaletteIterationData palette_iteration_data;
+  uint32_t nb_colors_orig = nb_colors;
+  uint32_t nb_deltas_orig = nb_deltas;
+  // preprocessing pass in case of lossy palette
+  if (lossy && input.bitdepth >= 8) {
+    JXL_RETURN_IF_ERROR(FwdPaletteIteration(
+        input, begin_c, end_c, nb_colors_orig, nb_deltas_orig, ordered, lossy,
+        predictor, wp_header, palette_iteration_data));
+  }
+  palette_iteration_data.final_run = true;
+  return FwdPaletteIteration(input, begin_c, end_c, nb_colors, nb_deltas,
+                             ordered, lossy, predictor, wp_header,
+                             palette_iteration_data);
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/modular/transform/enc_palette.h b/third_party/jpeg-xl/lib/jxl/modular/transform/enc_palette.h
new file mode 100644
index 0000000000..0f3d66825b
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/modular/transform/enc_palette.h
@@ -0,0 +1,22 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_MODULAR_TRANSFORM_ENC_PALETTE_H_
+#define LIB_JXL_MODULAR_TRANSFORM_ENC_PALETTE_H_
+
+#include "lib/jxl/fields.h"
+#include "lib/jxl/modular/encoding/context_predict.h"
+#include "lib/jxl/modular/modular_image.h"
+
+namespace jxl {
+
+Status FwdPalette(Image &input, uint32_t begin_c, uint32_t end_c,
+                  uint32_t &nb_colors, uint32_t &nb_deltas, bool ordered,
+                  bool lossy, Predictor &predictor,
+                  const weighted::Header &wp_header);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_MODULAR_TRANSFORM_ENC_PALETTE_H_
diff --git a/third_party/jpeg-xl/lib/jxl/modular/transform/enc_rct.cc b/third_party/jpeg-xl/lib/jxl/modular/transform/enc_rct.cc
new file mode 100644
index 0000000000..050563a3c2
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/modular/transform/enc_rct.cc
@@ -0,0 +1,73 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/modular/transform/enc_rct.h"
+
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/modular/modular_image.h"
+#include "lib/jxl/modular/transform/transform.h"  // CheckEqualChannels
+
+namespace jxl {
+
+Status FwdRCT(Image& input, size_t begin_c, size_t rct_type, ThreadPool* pool) {
+  JXL_RETURN_IF_ERROR(CheckEqualChannels(input, begin_c, begin_c + 2));
+  if (rct_type == 0) {  // noop
+    return false;
+  }
+  // Permutation: 0=RGB, 1=GBR, 2=BRG, 3=RBG, 4=GRB, 5=BGR
+  int permutation = rct_type / 7;
+  // 0-5 values have the low bit corresponding to Third and the high bits
+  // corresponding to Second. 6 corresponds to YCoCg.
+  //
+  // Second: 0=nop, 1=SubtractFirst, 2=SubtractAvgFirstThird
+  //
+  // Third: 0=nop, 1=SubtractFirst
+  int custom = rct_type % 7;
+  size_t m = begin_c;
+  size_t w = input.channel[m + 0].w;
+  size_t h = input.channel[m + 0].h;
+  int second = (custom % 7) >> 1;
+  int third = (custom % 7) & 1;
+  const auto do_rct = [&](const int y, const int thread) {
+    const pixel_type* in0 = input.channel[m + (permutation % 3)].Row(y);
+    const pixel_type* in1 =
+        input.channel[m + ((permutation + 1 + permutation / 3) % 3)].Row(y);
+    const pixel_type* in2 =
+        input.channel[m + ((permutation + 2 - permutation / 3) % 3)].Row(y);
+    pixel_type* out0 = input.channel[m].Row(y);
+    pixel_type* out1 = input.channel[m + 1].Row(y);
+    pixel_type* out2 = input.channel[m + 2].Row(y);
+    if (custom == 6) {
+      for (size_t x = 0; x < w; x++) {
+        pixel_type R = in0[x];
+        pixel_type G = in1[x];
+        pixel_type B = in2[x];
+        out1[x] = R - B;
+        pixel_type tmp = B + (out1[x] >> 1);
+        out2[x] = G - tmp;
+        out0[x] = tmp + (out2[x] >> 1);
+      }
+    } else {
+      for (size_t x = 0; x < w; x++) {
+        pixel_type First = in0[x];
+        pixel_type Second = in1[x];
+        pixel_type Third = in2[x];
+        if (second == 1) {
+          Second = Second - First;
+        } else if (second == 2) {
+          Second = Second - ((First + Third) >> 1);
+        }
+        if (third) Third = Third - First;
+        out0[x] = First;
+        out1[x] = Second;
+        out2[x] = Third;
+      }
+    }
+  };
+  return RunOnPool(pool, 0, h, ThreadPool::NoInit, do_rct, "FwdRCT");
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/modular/transform/enc_rct.h b/third_party/jpeg-xl/lib/jxl/modular/transform/enc_rct.h
new file mode 100644
index 0000000000..cb5a193c8d
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/modular/transform/enc_rct.h
@@ -0,0 +1,17 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_MODULAR_TRANSFORM_ENC_RCT_H_
+#define LIB_JXL_MODULAR_TRANSFORM_ENC_RCT_H_
+
+#include "lib/jxl/modular/modular_image.h"
+
+namespace jxl {
+
+Status FwdRCT(Image &input, size_t begin_c, size_t rct_type, ThreadPool *pool);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_MODULAR_TRANSFORM_ENC_RCT_H_
diff --git a/third_party/jpeg-xl/lib/jxl/modular/transform/enc_squeeze.cc b/third_party/jpeg-xl/lib/jxl/modular/transform/enc_squeeze.cc
new file mode 100644
index 0000000000..dfd90cde68
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/modular/transform/enc_squeeze.cc
@@ -0,0 +1,141 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/modular/transform/enc_squeeze.h"
+
+#include <stdlib.h>
+
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/modular/modular_image.h"
+#include "lib/jxl/modular/transform/squeeze.h"
+#include "lib/jxl/modular/transform/transform.h"
+
+namespace jxl {
+
+void FwdHSqueeze(Image &input, int c, int rc) {
+  const Channel &chin = input.channel[c];
+
+  JXL_DEBUG_V(4, "Doing horizontal squeeze of channel %i to new channel %i", c,
+              rc);
+
+  Channel chout((chin.w + 1) / 2, chin.h, chin.hshift + 1, chin.vshift);
+  Channel chout_residual(chin.w - chout.w, chout.h, chin.hshift + 1,
+                         chin.vshift);
+
+  for (size_t y = 0; y < chout.h; y++) {
+    const pixel_type *JXL_RESTRICT p_in = chin.Row(y);
+    pixel_type *JXL_RESTRICT p_out = chout.Row(y);
+    pixel_type *JXL_RESTRICT p_res = chout_residual.Row(y);
+    for (size_t x = 0; x < chout_residual.w; x++) {
+      pixel_type A = p_in[x * 2];
+      pixel_type B = p_in[x * 2 + 1];
+      pixel_type avg = (A + B + (A > B)) >> 1;
+      p_out[x] = avg;
+
+      pixel_type diff = A - B;
+
+      pixel_type next_avg = avg;
+      if (x + 1 < chout_residual.w) {
+        next_avg = (p_in[x * 2 + 2] + p_in[x * 2 + 3] +
+                    (p_in[x * 2 + 2] > p_in[x * 2 + 3])) >>
+                   1;  // which will be chout.value(y,x+1)
+      } else if (chin.w & 1)
+        next_avg = p_in[x * 2 + 2];
+      pixel_type left = (x > 0 ? p_in[x * 2 - 1] : avg);
+      pixel_type tendency = SmoothTendency(left, avg, next_avg);
+
+      p_res[x] = diff - tendency;
+    }
+    if (chin.w & 1) {
+      int x = chout.w - 1;
+      p_out[x] = p_in[x * 2];
+    }
+  }
+  input.channel[c] = std::move(chout);
+  input.channel.insert(input.channel.begin() + rc, std::move(chout_residual));
+}
+
+void FwdVSqueeze(Image &input, int c, int rc) {
+  const Channel &chin = input.channel[c];
+
+  JXL_DEBUG_V(4, "Doing vertical squeeze of channel %i to new channel %i", c,
+              rc);
+
+  Channel chout(chin.w, (chin.h + 1) / 2, chin.hshift, chin.vshift + 1);
+  Channel chout_residual(chin.w, chin.h - chout.h, chin.hshift,
+                         chin.vshift + 1);
+  intptr_t onerow_in = chin.plane.PixelsPerRow();
+  for (size_t y = 0; y < chout_residual.h; y++) {
+    const pixel_type *JXL_RESTRICT p_in = chin.Row(y * 2);
+    pixel_type *JXL_RESTRICT p_out = chout.Row(y);
+    pixel_type *JXL_RESTRICT p_res = chout_residual.Row(y);
+    for (size_t x = 0; x < chout.w; x++) {
+      pixel_type A = p_in[x];
+      pixel_type B = p_in[x + onerow_in];
+      pixel_type avg = (A + B + (A > B)) >> 1;
+      p_out[x] = avg;
+
+      pixel_type diff = A - B;
+
+      pixel_type next_avg = avg;
+      if (y + 1 < chout_residual.h) {
+        next_avg = (p_in[x + 2 * onerow_in] + p_in[x + 3 * onerow_in] +
+                    (p_in[x + 2 * onerow_in] > p_in[x + 3 * onerow_in])) >>
+                   1;  // which will be chout.value(y+1,x)
+      } else if (chin.h & 1) {
+        next_avg = p_in[x + 2 * onerow_in];
+      }
+      pixel_type top =
+          (y > 0 ? p_in[static_cast<ssize_t>(x) - onerow_in] : avg);
+      pixel_type tendency = SmoothTendency(top, avg, next_avg);
+
+      p_res[x] = diff - tendency;
+    }
+  }
+  if (chin.h & 1) {
+    size_t y = chout.h - 1;
+    const pixel_type *p_in = chin.Row(y * 2);
+    pixel_type *p_out = chout.Row(y);
+    for (size_t x = 0; x < chout.w; x++) {
+      p_out[x] = p_in[x];
+    }
+  }
+  input.channel[c] = std::move(chout);
+  input.channel.insert(input.channel.begin() + rc, std::move(chout_residual));
+}
+
+Status FwdSqueeze(Image &input, std::vector<SqueezeParams> parameters,
+                  ThreadPool *pool) {
+  if (parameters.empty()) {
+    DefaultSqueezeParameters(&parameters, input);
+  }
+  // if nothing to do, don't do squeeze
+  if (parameters.empty()) return false;
+  for (size_t i = 0; i < parameters.size(); i++) {
+    JXL_RETURN_IF_ERROR(
+        CheckMetaSqueezeParams(parameters[i], input.channel.size()));
+    bool horizontal = parameters[i].horizontal;
+    bool in_place = parameters[i].in_place;
+    uint32_t beginc = parameters[i].begin_c;
+    uint32_t endc = parameters[i].begin_c + parameters[i].num_c - 1;
+    uint32_t offset;
+    if (in_place) {
+      offset = endc + 1;
+    } else {
+      offset = input.channel.size();
+    }
+    for (uint32_t c = beginc; c <= endc; c++) {
+      if (horizontal) {
+        FwdHSqueeze(input, c, offset + c - beginc);
+      } else {
+        FwdVSqueeze(input, c, offset + c - beginc);
+      }
+    }
+  }
+  return true;
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/modular/transform/enc_squeeze.h b/third_party/jpeg-xl/lib/jxl/modular/transform/enc_squeeze.h
new file mode 100644
index 0000000000..39b001017b
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/modular/transform/enc_squeeze.h
@@ -0,0 +1,20 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_MODULAR_TRANSFORM_ENC_SQUEEZE_H_
+#define LIB_JXL_MODULAR_TRANSFORM_ENC_SQUEEZE_H_
+
+#include "lib/jxl/fields.h"
+#include "lib/jxl/modular/modular_image.h"
+#include "lib/jxl/modular/transform/transform.h"
+
+namespace jxl {
+
+Status FwdSqueeze(Image &input, std::vector<SqueezeParams> parameters,
+                  ThreadPool *pool);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_MODULAR_TRANSFORM_ENC_SQUEEZE_H_
diff --git a/third_party/jpeg-xl/lib/jxl/modular/transform/enc_transform.cc b/third_party/jpeg-xl/lib/jxl/modular/transform/enc_transform.cc
new file mode 100644
index 0000000000..bdaaf9f87e
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/modular/transform/enc_transform.cc
@@ -0,0 +1,46 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/modular/transform/enc_transform.h"
+
+#include "lib/jxl/modular/transform/enc_palette.h"
+#include "lib/jxl/modular/transform/enc_rct.h"
+#include "lib/jxl/modular/transform/enc_squeeze.h"
+
+namespace jxl {
+
+Status TransformForward(Transform &t, Image &input,
+                        const weighted::Header &wp_header, ThreadPool *pool) {
+  switch (t.id) {
+    case TransformId::kRCT:
+      return FwdRCT(input, t.begin_c, t.rct_type, pool);
+    case TransformId::kSqueeze:
+      return FwdSqueeze(input, t.squeezes, pool);
+    case TransformId::kPalette:
+      return FwdPalette(input, t.begin_c, t.begin_c + t.num_c - 1, t.nb_colors,
+                        t.nb_deltas, t.ordered_palette, t.lossy_palette,
+                        t.predictor, wp_header);
+    default:
+      return JXL_FAILURE("Unknown transformation (ID=%u)",
+                         static_cast<unsigned int>(t.id));
+  }
+}
+
+void compute_minmax(const Channel &ch, pixel_type *min, pixel_type *max) {
+  pixel_type realmin = std::numeric_limits<pixel_type>::max();
+  pixel_type realmax = std::numeric_limits<pixel_type>::min();
+  for (size_t y = 0; y < ch.h; y++) {
+    const pixel_type *JXL_RESTRICT p = ch.Row(y);
+    for (size_t x = 0; x < ch.w; x++) {
+      if (p[x] < realmin) realmin = p[x];
+      if (p[x] > realmax) realmax = p[x];
+    }
+  }
+
+  if (min) *min = realmin;
+  if (max) *max = realmax;
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/modular/transform/enc_transform.h b/third_party/jpeg-xl/lib/jxl/modular/transform/enc_transform.h
new file mode 100644
index 0000000000..07659e1b0a
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/modular/transform/enc_transform.h
@@ -0,0 +1,22 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_MODULAR_TRANSFORM_ENC_TRANSFORM_H_
+#define LIB_JXL_MODULAR_TRANSFORM_ENC_TRANSFORM_H_
+
+#include "lib/jxl/fields.h"
+#include "lib/jxl/modular/modular_image.h"
+#include "lib/jxl/modular/transform/transform.h"
+
+namespace jxl {
+
+Status TransformForward(Transform &t, Image &input,
+                        const weighted::Header &wp_header, ThreadPool *pool);
+
+void compute_minmax(const Channel &ch, pixel_type *min, pixel_type *max);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_MODULAR_TRANSFORM_ENC_TRANSFORM_H_
diff --git a/third_party/jpeg-xl/lib/jxl/modular/transform/palette.cc b/third_party/jpeg-xl/lib/jxl/modular/transform/palette.cc
new file mode 100644
index 0000000000..46129f19f0
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/modular/transform/palette.cc
@@ -0,0 +1,176 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/modular/transform/palette.h"
+
+namespace jxl {
+
+Status InvPalette(Image &input, uint32_t begin_c, uint32_t nb_colors,
+                  uint32_t nb_deltas, Predictor predictor,
+                  const weighted::Header &wp_header, ThreadPool *pool) {
+  if (input.nb_meta_channels < 1) {
+    return JXL_FAILURE("Error: Palette transform without palette.");
+  }
+  std::atomic<int> num_errors{0};
+  int nb = input.channel[0].h;
+  uint32_t c0 = begin_c + 1;
+  if (c0 >= input.channel.size()) {
+    return JXL_FAILURE("Channel is out of range.");
+  }
+  size_t w = input.channel[c0].w;
+  size_t h = input.channel[c0].h;
+  if (nb < 1) return JXL_FAILURE("Corrupted transforms");
+  for (int i = 1; i < nb; i++) {
+    input.channel.insert(
+        input.channel.begin() + c0 + 1,
+        Channel(w, h, input.channel[c0].hshift, input.channel[c0].vshift));
+  }
+  const Channel &palette = input.channel[0];
+  const pixel_type *JXL_RESTRICT p_palette = input.channel[0].Row(0);
+  intptr_t onerow = input.channel[0].plane.PixelsPerRow();
+  intptr_t onerow_image = input.channel[c0].plane.PixelsPerRow();
+  const int bit_depth = std::min(input.bitdepth, 24);
+
+  if (w == 0) {
+    // Nothing to do.
+    // Avoid touching "empty" channels with non-zero height.
+  } else if (nb_deltas == 0 && predictor == Predictor::Zero) {
+    if (nb == 1) {
+      JXL_RETURN_IF_ERROR(RunOnPool(
+          pool, 0, h, ThreadPool::NoInit,
+          [&](const uint32_t task, size_t /* thread */) {
+            const size_t y = task;
+            pixel_type *p = input.channel[c0].Row(y);
+            for (size_t x = 0; x < w; x++) {
+              const int index = Clamp1<int>(p[x], 0, (pixel_type)palette.w - 1);
+              p[x] = palette_internal::GetPaletteValue(
+                  p_palette, index, /*c=*/0,
+                  /*palette_size=*/palette.w,
+                  /*onerow=*/onerow, /*bit_depth=*/bit_depth);
+            }
+          },
+          "UndoChannelPalette"));
+    } else {
+      JXL_RETURN_IF_ERROR(RunOnPool(
+          pool, 0, h, ThreadPool::NoInit,
+          [&](const uint32_t task, size_t /* thread */) {
+            const size_t y = task;
+            std::vector<pixel_type *> p_out(nb);
+            const pixel_type *p_index = input.channel[c0].Row(y);
+            for (int c = 0; c < nb; c++)
+              p_out[c] = input.channel[c0 + c].Row(y);
+            for (size_t x = 0; x < w; x++) {
+              const int index = p_index[x];
+              for (int c = 0; c < nb; c++) {
+                p_out[c][x] = palette_internal::GetPaletteValue(
+                    p_palette, index, /*c=*/c,
+                    /*palette_size=*/palette.w,
+                    /*onerow=*/onerow, /*bit_depth=*/bit_depth);
+              }
+            }
+          },
+          "UndoPalette"));
+    }
+  } else {
+    // Parallelized per channel.
+    ImageI indices = CopyImage(input.channel[c0].plane);
+    if (predictor == Predictor::Weighted) {
+      JXL_RETURN_IF_ERROR(RunOnPool(
+          pool, 0, nb, ThreadPool::NoInit,
+          [&](const uint32_t c, size_t /* thread */) {
+            Channel &channel = input.channel[c0 + c];
+            weighted::State wp_state(wp_header, channel.w, channel.h);
+            for (size_t y = 0; y < channel.h; y++) {
+              pixel_type *JXL_RESTRICT p = channel.Row(y);
+              const pixel_type *JXL_RESTRICT idx = indices.Row(y);
+              for (size_t x = 0; x < channel.w; x++) {
+                int index = idx[x];
+                pixel_type_w val = 0;
+                const pixel_type palette_entry =
+                    palette_internal::GetPaletteValue(
+                        p_palette, index, /*c=*/c,
+                        /*palette_size=*/palette.w, /*onerow=*/onerow,
+                        /*bit_depth=*/bit_depth);
+                if (index < static_cast<int32_t>(nb_deltas)) {
+                  PredictionResult pred =
+                      PredictNoTreeWP(channel.w, p + x, onerow_image, x, y,
+                                      predictor, &wp_state);
+                  val = pred.guess + palette_entry;
+                } else {
+                  val = palette_entry;
+                }
+                p[x] = val;
+                wp_state.UpdateErrors(p[x], x, y, channel.w);
+              }
+            }
+          },
+          "UndoDeltaPaletteWP"));
+    } else {
+      JXL_RETURN_IF_ERROR(RunOnPool(
+          pool, 0, nb, ThreadPool::NoInit,
+          [&](const uint32_t c, size_t /* thread */) {
+            Channel &channel = input.channel[c0 + c];
+            for (size_t y = 0; y < channel.h; y++) {
+              pixel_type *JXL_RESTRICT p = channel.Row(y);
+              const pixel_type *JXL_RESTRICT idx = indices.Row(y);
+              for (size_t x = 0; x < channel.w; x++) {
+                int index = idx[x];
+                pixel_type_w val = 0;
+                const pixel_type palette_entry =
+                    palette_internal::GetPaletteValue(
+                        p_palette, index, /*c=*/c,
+                        /*palette_size=*/palette.w,
+                        /*onerow=*/onerow, /*bit_depth=*/bit_depth);
+                if (index < static_cast<int32_t>(nb_deltas)) {
+                  PredictionResult pred = PredictNoTreeNoWP(
+                      channel.w, p + x, onerow_image, x, y, predictor);
+                  val = pred.guess + palette_entry;
+                } else {
+                  val = palette_entry;
+                }
+                p[x] = val;
+              }
+            }
+          },
+          "UndoDeltaPaletteNoWP"));
+    }
+  }
+  if (c0 >= input.nb_meta_channels) {
+    // Palette was done on normal channels
+    input.nb_meta_channels--;
+  } else {
+    // Palette was done on metachannels
+    JXL_ASSERT(static_cast<int>(input.nb_meta_channels) >= 2 - nb);
+    input.nb_meta_channels -= 2 - nb;
+    JXL_ASSERT(begin_c + nb - 1 < input.nb_meta_channels);
+  }
+  input.channel.erase(input.channel.begin(), input.channel.begin() + 1);
+  return num_errors.load(std::memory_order_relaxed) == 0;
+}
+
+Status MetaPalette(Image &input, uint32_t begin_c, uint32_t end_c,
+                   uint32_t nb_colors, uint32_t nb_deltas, bool lossy) {
+  JXL_RETURN_IF_ERROR(CheckEqualChannels(input, begin_c, end_c));
+
+  size_t nb = end_c - begin_c + 1;
+  if (begin_c >= input.nb_meta_channels) {
+    // Palette was done on normal channels
+    input.nb_meta_channels++;
+  } else {
+    // Palette was done on metachannels
+    JXL_ASSERT(end_c < input.nb_meta_channels);
+    // we remove nb-1 metachannels and add one
+    input.nb_meta_channels += 2 - nb;
+  }
+  input.channel.erase(input.channel.begin() + begin_c + 1,
+                      input.channel.begin() + end_c + 1);
+  Channel pch(nb_colors + nb_deltas, nb);
+  pch.hshift = -1;
+  pch.vshift = -1;
+  input.channel.insert(input.channel.begin(), std::move(pch));
+  return true;
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/modular/transform/palette.h b/third_party/jpeg-xl/lib/jxl/modular/transform/palette.h
new file mode 100644
index 0000000000..cc0f67960b
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/modular/transform/palette.h
@@ -0,0 +1,129 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_MODULAR_TRANSFORM_PALETTE_H_
+#define LIB_JXL_MODULAR_TRANSFORM_PALETTE_H_
+
+#include <atomic>
+
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/modular/encoding/context_predict.h"
+#include "lib/jxl/modular/modular_image.h"
+#include "lib/jxl/modular/transform/transform.h"  // CheckEqualChannels
+
+namespace jxl {
+
+namespace palette_internal {
+
+static constexpr int kMaxPaletteLookupTableSize = 1 << 16;
+
+static constexpr int kRgbChannels = 3;
+
+// 5x5x5 color cube for the larger cube.
+static constexpr int kLargeCube = 5;
+
+// Smaller interleaved color cube to fill the holes of the larger cube.
+static constexpr int kSmallCube = 4;
+static constexpr int kSmallCubeBits = 2;
+// kSmallCube ** 3
+static constexpr int kLargeCubeOffset = kSmallCube * kSmallCube * kSmallCube;
+
+static inline pixel_type Scale(uint64_t value, uint64_t bit_depth,
+                               uint64_t denom) {
+  // return (value * ((static_cast<pixel_type_w>(1) << bit_depth) - 1)) / denom;
+  // We only call this function with kSmallCube or kLargeCube - 1 as denom,
+  // allowing us to avoid a division here.
+  JXL_ASSERT(denom == 4);
+  return (value * ((static_cast<uint64_t>(1) << bit_depth) - 1)) >> 2;
+}
+
+// The purpose of this function is solely to extend the interpretation of
+// palette indices to implicit values. If index < nb_deltas, indicating that the
+// result is a delta palette entry, it is the responsibility of the caller to
+// treat it as such.
+static JXL_MAYBE_UNUSED pixel_type
+GetPaletteValue(const pixel_type *const palette, int index, const size_t c,
+                const int palette_size, const int onerow, const int bit_depth) {
+  if (index < 0) {
+    static constexpr std::array<std::array<pixel_type, 3>, 72> kDeltaPalette = {
+        {
+            {{0, 0, 0}},       {{4, 4, 4}},       {{11, 0, 0}},
+            {{0, 0, -13}},     {{0, -12, 0}},     {{-10, -10, -10}},
+            {{-18, -18, -18}}, {{-27, -27, -27}}, {{-18, -18, 0}},
+            {{0, 0, -32}},     {{-32, 0, 0}},     {{-37, -37, -37}},
+            {{0, -32, -32}},   {{24, 24, 45}},    {{50, 50, 50}},
+            {{-45, -24, -24}}, {{-24, -45, -45}}, {{0, -24, -24}},
+            {{-34, -34, 0}},   {{-24, 0, -24}},   {{-45, -45, -24}},
+            {{64, 64, 64}},    {{-32, 0, -32}},   {{0, -32, 0}},
+            {{-32, 0, 32}},    {{-24, -45, -24}}, {{45, 24, 45}},
+            {{24, -24, -45}},  {{-45, -24, 24}},  {{80, 80, 80}},
+            {{64, 0, 0}},      {{0, 0, -64}},     {{0, -64, -64}},
+            {{-24, -24, 45}},  {{96, 96, 96}},    {{64, 64, 0}},
+            {{45, -24, -24}},  {{34, -34, 0}},    {{112, 112, 112}},
+            {{24, -45, -45}},  {{45, 45, -24}},   {{0, -32, 32}},
+            {{24, -24, 45}},   {{0, 96, 96}},     {{45, -24, 24}},
+            {{24, -45, -24}},  {{-24, -45, 24}},  {{0, -64, 0}},
+            {{96, 0, 0}},      {{128, 128, 128}}, {{64, 0, 64}},
+            {{144, 144, 144}}, {{96, 96, 0}},     {{-36, -36, 36}},
+            {{45, -24, -45}},  {{45, -45, -24}},  {{0, 0, -96}},
+            {{0, 128, 128}},   {{0, 96, 0}},      {{45, 24, -45}},
+            {{-128, 0, 0}},    {{24, -45, 24}},   {{-45, 24, -45}},
+            {{64, 0, -64}},    {{64, -64, -64}},  {{96, 0, 96}},
+            {{45, -45, 24}},   {{24, 45, -45}},   {{64, 64, -64}},
+            {{128, 128, 0}},   {{0, 0, -128}},    {{-24, 45, -45}},
+        }};
+    if (c >= kRgbChannels) {
+      return 0;
+    }
+    // Do not open the brackets, otherwise INT32_MIN negation could overflow.
+    index = -(index + 1);
+    index %= 1 + 2 * (kDeltaPalette.size() - 1);
+    static constexpr int kMultiplier[] = {-1, 1};
+    pixel_type result =
+        kDeltaPalette[((index + 1) >> 1)][c] * kMultiplier[index & 1];
+    if (bit_depth > 8) {
+      result *= static_cast<pixel_type>(1) << (bit_depth - 8);
+    }
+    return result;
+  } else if (palette_size <= index && index < palette_size + kLargeCubeOffset) {
+    if (c >= kRgbChannels) return 0;
+    index -= palette_size;
+    index >>= c * kSmallCubeBits;
+    return Scale(index % kSmallCube, bit_depth, kSmallCube) +
+           (1 << (std::max(0, bit_depth - 3)));
+  } else if (palette_size + kLargeCubeOffset <= index) {
+    if (c >= kRgbChannels) return 0;
+    index -= palette_size + kLargeCubeOffset;
+    // TODO(eustas): should we take care of ambiguity created by
+    //               index >= kLargeCube ** 3 ?
+    switch (c) {
+      case 0:
+        break;
+      case 1:
+        index /= kLargeCube;
+        break;
+      case 2:
+        index /= kLargeCube * kLargeCube;
+        break;
+    }
+    return Scale(index % kLargeCube, bit_depth, kLargeCube - 1);
+  }
+  return palette[c * onerow + static_cast<size_t>(index)];
+}
+
+}  // namespace palette_internal
+
+Status InvPalette(Image &input, uint32_t begin_c, uint32_t nb_colors,
+                  uint32_t nb_deltas, Predictor predictor,
+                  const weighted::Header &wp_header, ThreadPool *pool);
+
+Status MetaPalette(Image &input, uint32_t begin_c, uint32_t end_c,
+                   uint32_t nb_colors, uint32_t nb_deltas, bool lossy);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_MODULAR_TRANSFORM_PALETTE_H_
diff --git a/third_party/jpeg-xl/lib/jxl/modular/transform/rct.cc b/third_party/jpeg-xl/lib/jxl/modular/transform/rct.cc
new file mode 100644
index 0000000000..f3002a5ac3
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/modular/transform/rct.cc
@@ -0,0 +1,153 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/modular/transform/rct.h"
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/modular/transform/rct.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Add;
+using hwy::HWY_NAMESPACE::ShiftRight;
+using hwy::HWY_NAMESPACE::Sub;
+
+template <int transform_type>
+void InvRCTRow(const pixel_type* in0, const pixel_type* in1,
+               const pixel_type* in2, pixel_type* out0, pixel_type* out1,
+               pixel_type* out2, size_t w) {
+  static_assert(transform_type >= 0 && transform_type < 7,
+                "Invalid transform type");
+  int second = transform_type >> 1;
+  int third = transform_type & 1;
+
+  size_t x = 0;
+  const HWY_FULL(pixel_type) d;
+  const size_t N = Lanes(d);
+  for (; x + N - 1 < w; x += N) {
+    if (transform_type == 6) {
+      auto Y = Load(d, in0 + x);
+      auto Co = Load(d, in1 + x);
+      auto Cg = Load(d, in2 + x);
+      Y = Sub(Y, ShiftRight<1>(Cg));
+      auto G = Add(Cg, Y);
+      Y = Sub(Y, ShiftRight<1>(Co));
+      auto R = Add(Y, Co);
+      Store(R, d, out0 + x);
+      Store(G, d, out1 + x);
+      Store(Y, d, out2 + x);
+    } else {
+      auto First = Load(d, in0 + x);
+      auto Second = Load(d, in1 + x);
+      auto Third = Load(d, in2 + x);
+      if (third) Third = Add(Third, First);
+      if (second == 1) {
+        Second = Add(Second, First);
+      } else if (second == 2) {
+        Second = Add(Second, ShiftRight<1>(Add(First, Third)));
+      }
+      Store(First, d, out0 + x);
+      Store(Second, d, out1 + x);
+      Store(Third, d, out2 + x);
+    }
+  }
+  for (; x < w; x++) {
+    if (transform_type == 6) {
+      pixel_type Y = in0[x];
+      pixel_type Co = in1[x];
+      pixel_type Cg = in2[x];
+      pixel_type tmp = PixelAdd(Y, -(Cg >> 1));
+      pixel_type G = PixelAdd(Cg, tmp);
+      pixel_type B = PixelAdd(tmp, -(Co >> 1));
+      pixel_type R = PixelAdd(B, Co);
+      out0[x] = R;
+      out1[x] = G;
+      out2[x] = B;
+    } else {
+      pixel_type First = in0[x];
+      pixel_type Second = in1[x];
+      pixel_type Third = in2[x];
+      if (third) Third = PixelAdd(Third, First);
+      if (second == 1) {
+        Second = PixelAdd(Second, First);
+      } else if (second == 2) {
+        Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1));
+      }
+      out0[x] = First;
+      out1[x] = Second;
+      out2[x] = Third;
+    }
+  }
+}
+
+Status InvRCT(Image& input, size_t begin_c, size_t rct_type, ThreadPool* pool) {
+  JXL_RETURN_IF_ERROR(CheckEqualChannels(input, begin_c, begin_c + 2));
+  size_t m = begin_c;
+  Channel& c0 = input.channel[m + 0];
+  size_t w = c0.w;
+  size_t h = c0.h;
+  if (rct_type == 0) {  // noop
+    return true;
+  }
+  // Permutation: 0=RGB, 1=GBR, 2=BRG, 3=RBG, 4=GRB, 5=BGR
+  int permutation = rct_type / 7;
+  JXL_CHECK(permutation < 6);
+  // 0-5 values have the low bit corresponding to Third and the high bits
+  // corresponding to Second. 6 corresponds to YCoCg.
+  //
+  // Second: 0=nop, 1=SubtractFirst, 2=SubtractAvgFirstThird
+  //
+  // Third: 0=nop, 1=SubtractFirst
+  int custom = rct_type % 7;
+  // Special case: permute-only. Swap channels around.
+  if (custom == 0) {
+    Channel ch0 = std::move(input.channel[m]);
+    Channel ch1 = std::move(input.channel[m + 1]);
+    Channel ch2 = std::move(input.channel[m + 2]);
+    input.channel[m + (permutation % 3)] = std::move(ch0);
+    input.channel[m + ((permutation + 1 + permutation / 3) % 3)] =
+        std::move(ch1);
+    input.channel[m + ((permutation + 2 - permutation / 3) % 3)] =
+        std::move(ch2);
+    return true;
+  }
+  constexpr decltype(&InvRCTRow<0>) inv_rct_row[] = {
+      InvRCTRow<0>, InvRCTRow<1>, InvRCTRow<2>, InvRCTRow<3>,
+      InvRCTRow<4>, InvRCTRow<5>, InvRCTRow<6>};
+  JXL_RETURN_IF_ERROR(RunOnPool(
+      pool, 0, h, ThreadPool::NoInit,
+      [&](const uint32_t task, size_t /* thread */) {
+        const size_t y = task;
+        const pixel_type* in0 = input.channel[m].Row(y);
+        const pixel_type* in1 = input.channel[m + 1].Row(y);
+        const pixel_type* in2 = input.channel[m + 2].Row(y);
+        pixel_type* out0 = input.channel[m + (permutation % 3)].Row(y);
+        pixel_type* out1 =
+            input.channel[m + ((permutation + 1 + permutation / 3) % 3)].Row(y);
+        pixel_type* out2 =
+            input.channel[m + ((permutation + 2 - permutation / 3) % 3)].Row(y);
+        inv_rct_row[custom](in0, in1, in2, out0, out1, out2, w);
+      },
+      "InvRCT"));
+  return true;
+}
+
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+HWY_EXPORT(InvRCT);
+Status InvRCT(Image& input, size_t begin_c, size_t rct_type, ThreadPool* pool) {
+  return HWY_DYNAMIC_DISPATCH(InvRCT)(input, begin_c, rct_type, pool);
+}
+
+}  // namespace jxl
+#endif
diff --git a/third_party/jpeg-xl/lib/jxl/modular/transform/rct.h b/third_party/jpeg-xl/lib/jxl/modular/transform/rct.h
new file mode 100644
index 0000000000..aef65621d5
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/modular/transform/rct.h
@@ -0,0 +1,20 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_MODULAR_TRANSFORM_RCT_H_
+#define LIB_JXL_MODULAR_TRANSFORM_RCT_H_
+
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/modular/modular_image.h"
+#include "lib/jxl/modular/transform/transform.h"  // CheckEqualChannels
+
+namespace jxl {
+
+Status InvRCT(Image& input, size_t begin_c, size_t rct_type, ThreadPool* pool);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_MODULAR_TRANSFORM_RCT_H_
diff --git a/third_party/jpeg-xl/lib/jxl/modular/transform/squeeze.cc b/third_party/jpeg-xl/lib/jxl/modular/transform/squeeze.cc
new file mode 100644
index 0000000000..8440d9e804
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/modular/transform/squeeze.cc
@@ -0,0 +1,478 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/modular/transform/squeeze.h"
+
+#include <stdlib.h>
+
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/modular/modular_image.h"
+#include "lib/jxl/modular/transform/transform.h"
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/modular/transform/squeeze.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/simd_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Abs;
+using hwy::HWY_NAMESPACE::Add;
+using hwy::HWY_NAMESPACE::And;
+using hwy::HWY_NAMESPACE::Gt;
+using hwy::HWY_NAMESPACE::IfThenElse;
+using hwy::HWY_NAMESPACE::IfThenZeroElse;
+using hwy::HWY_NAMESPACE::Lt;
+using hwy::HWY_NAMESPACE::MulEven;
+using hwy::HWY_NAMESPACE::Ne;
+using hwy::HWY_NAMESPACE::Neg;
+using hwy::HWY_NAMESPACE::OddEven;
+using hwy::HWY_NAMESPACE::RebindToUnsigned;
+using hwy::HWY_NAMESPACE::ShiftLeft;
+using hwy::HWY_NAMESPACE::ShiftRight;
+using hwy::HWY_NAMESPACE::Sub;
+using hwy::HWY_NAMESPACE::Xor;
+
+#if HWY_TARGET != HWY_SCALAR
+
+JXL_INLINE void FastUnsqueeze(const pixel_type *JXL_RESTRICT p_residual,
+                              const pixel_type *JXL_RESTRICT p_avg,
+                              const pixel_type *JXL_RESTRICT p_navg,
+                              const pixel_type *p_pout,
+                              pixel_type *JXL_RESTRICT p_out,
+                              pixel_type *p_nout) {
+  const HWY_CAPPED(pixel_type, 8) d;
+  const RebindToUnsigned<decltype(d)> du;
+  const size_t N = Lanes(d);
+  auto onethird = Set(d, 0x55555556);
+  for (size_t x = 0; x < 8; x += N) {
+    auto avg = Load(d, p_avg + x);
+    auto next_avg = Load(d, p_navg + x);
+    auto top = Load(d, p_pout + x);
+    // Equivalent to SmoothTendency(top,avg,next_avg), but without branches
+    auto Ba = Sub(top, avg);
+    auto an = Sub(avg, next_avg);
+    auto nonmono = Xor(Ba, an);
+    auto absBa = Abs(Ba);
+    auto absan = Abs(an);
+    auto absBn = Abs(Sub(top, next_avg));
+    // Compute a3 = absBa / 3
+    auto a3e = BitCast(d, ShiftRight<32>(MulEven(absBa, onethird)));
+    auto a3oi = MulEven(Reverse(d, absBa), onethird);
+    auto a3o = BitCast(
+        d, Reverse(hwy::HWY_NAMESPACE::Repartition<pixel_type_w, decltype(d)>(),
+                   a3oi));
+    auto a3 = OddEven(a3o, a3e);
+    a3 = Add(a3, Add(absBn, Set(d, 2)));
+    auto absdiff = ShiftRight<2>(a3);
+    auto skipdiff = Ne(Ba, Zero(d));
+    skipdiff = And(skipdiff, Ne(an, Zero(d)));
+    skipdiff = And(skipdiff, Lt(nonmono, Zero(d)));
+    auto absBa2 = Add(ShiftLeft<1>(absBa), And(absdiff, Set(d, 1)));
+    absdiff = IfThenElse(Gt(absdiff, absBa2),
+                         Add(ShiftLeft<1>(absBa), Set(d, 1)), absdiff);
+    auto absan2 = ShiftLeft<1>(absan);
+    absdiff = IfThenElse(Gt(Add(absdiff, And(absdiff, Set(d, 1))), absan2),
+                         absan2, absdiff);
+    auto diff1 = IfThenElse(Lt(top, next_avg), Neg(absdiff), absdiff);
+    auto tendency = IfThenZeroElse(skipdiff, diff1);
+
+    auto diff_minus_tendency = Load(d, p_residual + x);
+    auto diff = Add(diff_minus_tendency, tendency);
+    auto out =
+        Add(avg, ShiftRight<1>(
+                     Add(diff, BitCast(d, ShiftRight<31>(BitCast(du, diff))))));
+    Store(out, d, p_out + x);
+    Store(Sub(out, diff), d, p_nout + x);
+  }
+}
+
+#endif
+
+Status InvHSqueeze(Image &input, uint32_t c, uint32_t rc, ThreadPool *pool) {
+  JXL_ASSERT(c < input.channel.size());
+  JXL_ASSERT(rc < input.channel.size());
+  Channel &chin = input.channel[c];
+  const Channel &chin_residual = input.channel[rc];
+  // These must be valid since we ran MetaApply already.
+  JXL_ASSERT(chin.w == DivCeil(chin.w + chin_residual.w, 2));
+  JXL_ASSERT(chin.h == chin_residual.h);
+
+  if (chin_residual.w == 0) {
+    // Short-circuit: output channel has same dimensions as input.
+    input.channel[c].hshift--;
+    return true;
+  }
+
+  // Note: chin.w >= chin_residual.w and at most 1 different.
+  Channel chout(chin.w + chin_residual.w, chin.h, chin.hshift - 1, chin.vshift);
+  JXL_DEBUG_V(4,
+              "Undoing horizontal squeeze of channel %i using residuals in "
+              "channel %i (going from width %" PRIuS " to %" PRIuS ")",
+              c, rc, chin.w, chout.w);
+
+  if (chin_residual.h == 0) {
+    // Short-circuit: channel with no pixels.
+    input.channel[c] = std::move(chout);
+    return true;
+  }
+  auto unsqueeze_row = [&](size_t y, size_t x0) {
+    const pixel_type *JXL_RESTRICT p_residual = chin_residual.Row(y);
+    const pixel_type *JXL_RESTRICT p_avg = chin.Row(y);
+    pixel_type *JXL_RESTRICT p_out = chout.Row(y);
+    for (size_t x = x0; x < chin_residual.w; x++) {
+      pixel_type_w diff_minus_tendency = p_residual[x];
+      pixel_type_w avg = p_avg[x];
+      pixel_type_w next_avg = (x + 1 < chin.w ? p_avg[x + 1] : avg);
+      pixel_type_w left = (x ? p_out[(x << 1) - 1] : avg);
+      pixel_type_w tendency = SmoothTendency(left, avg, next_avg);
+      pixel_type_w diff = diff_minus_tendency + tendency;
+      pixel_type_w A = avg + (diff / 2);
+      p_out[(x << 1)] = A;
+      pixel_type_w B = A - diff;
+      p_out[(x << 1) + 1] = B;
+    }
+    if (chout.w & 1) p_out[chout.w - 1] = p_avg[chin.w - 1];
+  };
+
+  // somewhat complicated trickery just to be able to SIMD this.
+  // Horizontal unsqueeze has horizontal data dependencies, so we do
+  // 8 rows at a time and treat it as a vertical unsqueeze of a
+  // transposed 8x8 block (or 9x8 for one input).
+  static constexpr const size_t kRowsPerThread = 8;
+  const auto unsqueeze_span = [&](const uint32_t task, size_t /* thread */) {
+    const size_t y0 = task * kRowsPerThread;
+    const size_t rows = std::min(kRowsPerThread, chin.h - y0);
+    size_t x = 0;
+
+#if HWY_TARGET != HWY_SCALAR
+    intptr_t onerow_in = chin.plane.PixelsPerRow();
+    intptr_t onerow_inr = chin_residual.plane.PixelsPerRow();
+    intptr_t onerow_out = chout.plane.PixelsPerRow();
+    const pixel_type *JXL_RESTRICT p_residual = chin_residual.Row(y0);
+    const pixel_type *JXL_RESTRICT p_avg = chin.Row(y0);
+    pixel_type *JXL_RESTRICT p_out = chout.Row(y0);
+    HWY_ALIGN pixel_type b_p_avg[9 * kRowsPerThread];
+    HWY_ALIGN pixel_type b_p_residual[8 * kRowsPerThread];
+    HWY_ALIGN pixel_type b_p_out_even[8 * kRowsPerThread];
+    HWY_ALIGN pixel_type b_p_out_odd[8 * kRowsPerThread];
+    HWY_ALIGN pixel_type b_p_out_evenT[8 * kRowsPerThread];
+    HWY_ALIGN pixel_type b_p_out_oddT[8 * kRowsPerThread];
+    const HWY_CAPPED(pixel_type, 8) d;
+    const size_t N = Lanes(d);
+    if (chin_residual.w > 16 && rows == kRowsPerThread) {
+      for (; x < chin_residual.w - 9; x += 8) {
+        Transpose8x8Block(p_residual + x, b_p_residual, onerow_inr);
+        Transpose8x8Block(p_avg + x, b_p_avg, onerow_in);
+        for (size_t y = 0; y < kRowsPerThread; y++) {
+          b_p_avg[8 * 8 + y] = p_avg[x + 8 + onerow_in * y];
+        }
+        for (size_t i = 0; i < 8; i++) {
+          FastUnsqueeze(
+              b_p_residual + 8 * i, b_p_avg + 8 * i, b_p_avg + 8 * (i + 1),
+              (x + i ? b_p_out_odd + 8 * ((x + i - 1) & 7) : b_p_avg + 8 * i),
+              b_p_out_even + 8 * i, b_p_out_odd + 8 * i);
+        }
+
+        Transpose8x8Block(b_p_out_even, b_p_out_evenT, 8);
+        Transpose8x8Block(b_p_out_odd, b_p_out_oddT, 8);
+        for (size_t y = 0; y < kRowsPerThread; y++) {
+          for (size_t i = 0; i < kRowsPerThread; i += N) {
+            auto even = Load(d, b_p_out_evenT + 8 * y + i);
+            auto odd = Load(d, b_p_out_oddT + 8 * y + i);
+            StoreInterleaved(d, even, odd,
+                             p_out + ((x + i) << 1) + onerow_out * y);
+          }
+        }
+      }
+    }
+#endif
+    for (size_t y = 0; y < rows; y++) {
+      unsqueeze_row(y0 + y, x);
+    }
+  };
+  JXL_RETURN_IF_ERROR(RunOnPool(pool, 0, DivCeil(chin.h, kRowsPerThread),
+                                ThreadPool::NoInit, unsqueeze_span,
+                                "InvHorizontalSqueeze"));
+  input.channel[c] = std::move(chout);
+  return true;
+}
+
+Status InvVSqueeze(Image &input, uint32_t c, uint32_t rc, ThreadPool *pool) {
+  JXL_ASSERT(c < input.channel.size());
+  JXL_ASSERT(rc < input.channel.size());
+  const Channel &chin = input.channel[c];
+  const Channel &chin_residual = input.channel[rc];
+  // These must be valid since we ran MetaApply already.
+  JXL_ASSERT(chin.h == DivCeil(chin.h + chin_residual.h, 2));
+  JXL_ASSERT(chin.w == chin_residual.w);
+
+  if (chin_residual.h == 0) {
+    // Short-circuit: output channel has same dimensions as input.
+    input.channel[c].vshift--;
+    return true;
+  }
+
+  // Note: chin.h >= chin_residual.h and at most 1 different.
+  Channel chout(chin.w, chin.h + chin_residual.h, chin.hshift, chin.vshift - 1);
+  JXL_DEBUG_V(
+      4,
+      "Undoing vertical squeeze of channel %i using residuals in channel "
+      "%i (going from height %" PRIuS " to %" PRIuS ")",
+      c, rc, chin.h, chout.h);
+
+  if (chin_residual.w == 0) {
+    // Short-circuit: channel with no pixels.
+    input.channel[c] = std::move(chout);
+    return true;
+  }
+
+  static constexpr const int kColsPerThread = 64;
+  const auto unsqueeze_slice = [&](const uint32_t task, size_t /* thread */) {
+    const size_t x0 = task * kColsPerThread;
+    const size_t x1 = std::min((size_t)(task + 1) * kColsPerThread, chin.w);
+    const size_t w = x1 - x0;
+    // We only iterate up to std::min(chin_residual.h, chin.h) which is
+    // always chin_residual.h.
+    for (size_t y = 0; y < chin_residual.h; y++) {
+      const pixel_type *JXL_RESTRICT p_residual = chin_residual.Row(y) + x0;
+      const pixel_type *JXL_RESTRICT p_avg = chin.Row(y) + x0;
+      const pixel_type *JXL_RESTRICT p_navg =
+          chin.Row(y + 1 < chin.h ? y + 1 : y) + x0;
+      pixel_type *JXL_RESTRICT p_out = chout.Row(y << 1) + x0;
+      pixel_type *JXL_RESTRICT p_nout = chout.Row((y << 1) + 1) + x0;
+      const pixel_type *p_pout = y > 0 ? chout.Row((y << 1) - 1) + x0 : p_avg;
+      size_t x = 0;
+#if HWY_TARGET != HWY_SCALAR
+      for (; x + 7 < w; x += 8) {
+        FastUnsqueeze(p_residual + x, p_avg + x, p_navg + x, p_pout + x,
+                      p_out + x, p_nout + x);
+      }
+#endif
+      for (; x < w; x++) {
+        pixel_type_w avg = p_avg[x];
+        pixel_type_w next_avg = p_navg[x];
+        pixel_type_w top = p_pout[x];
+        pixel_type_w tendency = SmoothTendency(top, avg, next_avg);
+        pixel_type_w diff_minus_tendency = p_residual[x];
+        pixel_type_w diff = diff_minus_tendency + tendency;
+        pixel_type_w out = avg + (diff / 2);
+        p_out[x] = out;
+        // If the chin_residual.h == chin.h, the output has an even number
+        // of rows so the next line is fine. Otherwise, this loop won't
+        // write to the last output row which is handled separately.
+        p_nout[x] = out - diff;
+      }
+    }
+  };
+  JXL_RETURN_IF_ERROR(RunOnPool(pool, 0, DivCeil(chin.w, kColsPerThread),
+                                ThreadPool::NoInit, unsqueeze_slice,
+                                "InvVertSqueeze"));
+
+  if (chout.h & 1) {
+    size_t y = chin.h - 1;
+    const pixel_type *p_avg = chin.Row(y);
+    pixel_type *p_out = chout.Row(y << 1);
+    for (size_t x = 0; x < chin.w; x++) {
+      p_out[x] = p_avg[x];
+    }
+  }
+  input.channel[c] = std::move(chout);
+  return true;
+}
+
+Status InvSqueeze(Image &input, std::vector<SqueezeParams> parameters,
+                  ThreadPool *pool) {
+  for (int i = parameters.size() - 1; i >= 0; i--) {
+    JXL_RETURN_IF_ERROR(
+        CheckMetaSqueezeParams(parameters[i], input.channel.size()));
+    bool horizontal = parameters[i].horizontal;
+    bool in_place = parameters[i].in_place;
+    uint32_t beginc = parameters[i].begin_c;
+    uint32_t endc = parameters[i].begin_c + parameters[i].num_c - 1;
+    uint32_t offset;
+    if (in_place) {
+      offset = endc + 1;
+    } else {
+      offset = input.channel.size() + beginc - endc - 1;
+    }
+    if (beginc < input.nb_meta_channels) {
+      // This is checked in MetaSqueeze.
+      JXL_ASSERT(input.nb_meta_channels > parameters[i].num_c);
+      input.nb_meta_channels -= parameters[i].num_c;
+    }
+
+    for (uint32_t c = beginc; c <= endc; c++) {
+      uint32_t rc = offset + c - beginc;
+      // MetaApply should imply that `rc` is within range, otherwise there's a
+      // programming bug.
+      JXL_ASSERT(rc < input.channel.size());
+      if ((input.channel[c].w < input.channel[rc].w) ||
+          (input.channel[c].h < input.channel[rc].h)) {
+        return JXL_FAILURE("Corrupted squeeze transform");
+      }
+      if (horizontal) {
+        JXL_RETURN_IF_ERROR(InvHSqueeze(input, c, rc, pool));
+      } else {
+        JXL_RETURN_IF_ERROR(InvVSqueeze(input, c, rc, pool));
+      }
+    }
+    input.channel.erase(input.channel.begin() + offset,
+                        input.channel.begin() + offset + (endc - beginc + 1));
+  }
+  return true;
+}
+
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace jxl {
+
+HWY_EXPORT(InvSqueeze);
+Status InvSqueeze(Image &input, std::vector<SqueezeParams> parameters,
+                  ThreadPool *pool) {
+  return HWY_DYNAMIC_DISPATCH(InvSqueeze)(input, parameters, pool);
+}
+
+void DefaultSqueezeParameters(std::vector<SqueezeParams> *parameters,
+                              const Image &image) {
+  int nb_channels = image.channel.size() - image.nb_meta_channels;
+
+  parameters->clear();
+  size_t w = image.channel[image.nb_meta_channels].w;
+  size_t h = image.channel[image.nb_meta_channels].h;
+  JXL_DEBUG_V(
+      7, "Default squeeze parameters for %" PRIuS "x%" PRIuS " image: ", w, h);
+
+  // do horizontal first on wide images; vertical first on tall images
+  bool wide = (w > h);
+
+  if (nb_channels > 2 && image.channel[image.nb_meta_channels + 1].w == w &&
+      image.channel[image.nb_meta_channels + 1].h == h) {
+    // assume channels 1 and 2 are chroma, and can be squeezed first for 4:2:0
+    // previews
+    JXL_DEBUG_V(7, "(4:2:0 chroma), %" PRIuS "x%" PRIuS " image", w, h);
+    SqueezeParams params;
+    // horizontal chroma squeeze
+    params.horizontal = true;
+    params.in_place = false;
+    params.begin_c = image.nb_meta_channels + 1;
+    params.num_c = 2;
+    parameters->push_back(params);
+    params.horizontal = false;
+    // vertical chroma squeeze
+    parameters->push_back(params);
+  }
+  SqueezeParams params;
+  params.begin_c = image.nb_meta_channels;
+  params.num_c = nb_channels;
+  params.in_place = true;
+
+  if (!wide) {
+    if (h > JXL_MAX_FIRST_PREVIEW_SIZE) {
+      params.horizontal = false;
+      parameters->push_back(params);
+      h = (h + 1) / 2;
+      JXL_DEBUG_V(7, "Vertical (%" PRIuS "x%" PRIuS "), ", w, h);
+    }
+  }
+  while (w > JXL_MAX_FIRST_PREVIEW_SIZE || h > JXL_MAX_FIRST_PREVIEW_SIZE) {
+    if (w > JXL_MAX_FIRST_PREVIEW_SIZE) {
+      params.horizontal = true;
+      parameters->push_back(params);
+      w = (w + 1) / 2;
+      JXL_DEBUG_V(7, "Horizontal (%" PRIuS "x%" PRIuS "), ", w, h);
+    }
+    if (h > JXL_MAX_FIRST_PREVIEW_SIZE) {
+      params.horizontal = false;
+      parameters->push_back(params);
+      h = (h + 1) / 2;
+      JXL_DEBUG_V(7, "Vertical (%" PRIuS "x%" PRIuS "), ", w, h);
+    }
+  }
+  JXL_DEBUG_V(7, "that's it");
+}
+
+Status CheckMetaSqueezeParams(const SqueezeParams &parameter,
+                              int num_channels) {
+  int c1 = parameter.begin_c;
+  int c2 = parameter.begin_c + parameter.num_c - 1;
+  if (c1 < 0 || c1 >= num_channels || c2 < 0 || c2 >= num_channels || c2 < c1) {
+    return JXL_FAILURE("Invalid channel range");
+  }
+  return true;
+}
+
+Status MetaSqueeze(Image &image, std::vector<SqueezeParams> *parameters) {
+  if (parameters->empty()) {
+    DefaultSqueezeParameters(parameters, image);
+  }
+
+  for (size_t i = 0; i < parameters->size(); i++) {
+    JXL_RETURN_IF_ERROR(
+        CheckMetaSqueezeParams((*parameters)[i], image.channel.size()));
+    bool horizontal = (*parameters)[i].horizontal;
+    bool in_place = (*parameters)[i].in_place;
+    uint32_t beginc = (*parameters)[i].begin_c;
+    uint32_t endc = (*parameters)[i].begin_c + (*parameters)[i].num_c - 1;
+
+    uint32_t offset;
+    if (beginc < image.nb_meta_channels) {
+      if (endc >= image.nb_meta_channels) {
+        return JXL_FAILURE("Invalid squeeze: mix of meta and nonmeta channels");
+      }
+      if (!in_place) {
+        return JXL_FAILURE(
+            "Invalid squeeze: meta channels require in-place residuals");
+      }
+      image.nb_meta_channels += (*parameters)[i].num_c;
+    }
+    if (in_place) {
+      offset = endc + 1;
+    } else {
+      offset = image.channel.size();
+    }
+    for (uint32_t c = beginc; c <= endc; c++) {
+      if (image.channel[c].hshift > 30 || image.channel[c].vshift > 30) {
+        return JXL_FAILURE("Too many squeezes: shift > 30");
+      }
+      size_t w = image.channel[c].w;
+      size_t h = image.channel[c].h;
+      if (w == 0 || h == 0) return JXL_FAILURE("Squeezing empty channel");
+      if (horizontal) {
+        image.channel[c].w = (w + 1) / 2;
+        if (image.channel[c].hshift >= 0) image.channel[c].hshift++;
+        w = w - (w + 1) / 2;
+      } else {
+        image.channel[c].h = (h + 1) / 2;
+        if (image.channel[c].vshift >= 0) image.channel[c].vshift++;
+        h = h - (h + 1) / 2;
+      }
+      image.channel[c].shrink();
+      Channel dummy(w, h);
+      dummy.hshift = image.channel[c].hshift;
+      dummy.vshift = image.channel[c].vshift;
+
+      image.channel.insert(image.channel.begin() + offset + (c - beginc),
+                           std::move(dummy));
+      JXL_DEBUG_V(8, "MetaSqueeze applied, current image: %s",
+                  image.DebugString().c_str());
+    }
+  }
+  return true;
+}
+
+}  // namespace jxl
+
+#endif
diff --git a/third_party/jpeg-xl/lib/jxl/modular/transform/squeeze.h b/third_party/jpeg-xl/lib/jxl/modular/transform/squeeze.h
new file mode 100644
index 0000000000..fb18710a6f
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/modular/transform/squeeze.h
@@ -0,0 +1,90 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_MODULAR_TRANSFORM_SQUEEZE_H_
+#define LIB_JXL_MODULAR_TRANSFORM_SQUEEZE_H_
+
+// Haar-like transform: halves the resolution in one direction
+// A B   -> (A+B)>>1              in one channel (average)  -> same range as
+// original channel
+//          A-B - tendency        in a new channel ('residual' needed to make
+//          the transform reversible)
+//                                        -> theoretically range could be 2.5
+//                                        times larger (2 times without the
+//                                        'tendency'), but there should be lots
+//                                        of zeroes
+// Repeated application (alternating horizontal and vertical squeezes) results
+// in downscaling
+//
+// The default coefficient ordering is low-frequency to high-frequency, as in
+// M. Antonini, M. Barlaud, P. Mathieu and I. Daubechies, "Image coding using
+// wavelet transform", IEEE Transactions on Image Processing, vol. 1, no. 2, pp.
+// 205-220, April 1992, doi: 10.1109/83.136597.
+
+#include <stdlib.h>
+
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/modular/modular_image.h"
+#include "lib/jxl/modular/transform/transform.h"
+
+#define JXL_MAX_FIRST_PREVIEW_SIZE 8
+
+namespace jxl {
+
+/*
+        int avg=(A+B)>>1;
+        int diff=(A-B);
+        int rA=(diff+(avg<<1)+(diff&1))>>1;
+        int rB=rA-diff;
+
+*/
+//         |A B|C D|E F|
+//           p   a   n             p=avg(A,B), a=avg(C,D), n=avg(E,F)
+//
+// Goal: estimate C-D (avoiding ringing artifacts)
+// (ensuring that in smooth areas, a zero residual corresponds to a smooth
+// gradient)
+
+// best estimate for C: (B + 2*a)/3
+// best estimate for D: (n + 3*a)/4
+// best estimate for C-D:  4*B - 3*n - a /12
+
+// avoid ringing by 1) only doing this if B <= a <= n  or  B >= a >= n
+// (otherwise, this is not a smooth area and we cannot really estimate C-D)
+//                  2) making sure that B <= C <= D <= n  or B >= C >= D >= n
+
+inline pixel_type_w SmoothTendency(pixel_type_w B, pixel_type_w a,
+                                   pixel_type_w n) {
+  pixel_type_w diff = 0;
+  if (B >= a && a >= n) {
+    diff = (4 * B - 3 * n - a + 6) / 12;
+    //      2C = a<<1 + diff - diff&1 <= 2B  so diff - diff&1 <= 2B - 2a
+    //      2D = a<<1 - diff - diff&1 >= 2n  so diff + diff&1 <= 2a - 2n
+    if (diff - (diff & 1) > 2 * (B - a)) diff = 2 * (B - a) + 1;
+    if (diff + (diff & 1) > 2 * (a - n)) diff = 2 * (a - n);
+  } else if (B <= a && a <= n) {
+    diff = (4 * B - 3 * n - a - 6) / 12;
+    //      2C = a<<1 + diff + diff&1 >= 2B  so diff + diff&1 >= 2B - 2a
+    //      2D = a<<1 - diff + diff&1 <= 2n  so diff - diff&1 >= 2a - 2n
+    if (diff + (diff & 1) < 2 * (B - a)) diff = 2 * (B - a) - 1;
+    if (diff - (diff & 1) < 2 * (a - n)) diff = 2 * (a - n);
+  }
+  return diff;
+}
+
+void DefaultSqueezeParameters(std::vector<SqueezeParams> *parameters,
+                              const Image &image);
+
+Status CheckMetaSqueezeParams(const SqueezeParams &parameter, int num_channels);
+
+Status MetaSqueeze(Image &image, std::vector<SqueezeParams> *parameters);
+
+Status InvSqueeze(Image &input, std::vector<SqueezeParams> parameters,
+                  ThreadPool *pool);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_MODULAR_TRANSFORM_SQUEEZE_H_
diff --git a/third_party/jpeg-xl/lib/jxl/modular/transform/transform.cc b/third_party/jpeg-xl/lib/jxl/modular/transform/transform.cc
new file mode 100644
index 0000000000..d9f2b435bf
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/modular/transform/transform.cc
@@ -0,0 +1,98 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/modular/transform/transform.h"
+
+#include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/fields.h"
+#include "lib/jxl/modular/modular_image.h"
+#include "lib/jxl/modular/transform/palette.h"
+#include "lib/jxl/modular/transform/rct.h"
+#include "lib/jxl/modular/transform/squeeze.h"
+
+namespace jxl {
+
+SqueezeParams::SqueezeParams() { Bundle::Init(this); }
+Transform::Transform(TransformId id) {
+  Bundle::Init(this);
+  this->id = id;
+}
+
+Status Transform::Inverse(Image &input, const weighted::Header &wp_header,
+                          ThreadPool *pool) {
+  JXL_DEBUG_V(6, "Input channels (%" PRIuS ", %" PRIuS " meta): ",
+              input.channel.size(), input.nb_meta_channels);
+  switch (id) {
+    case TransformId::kRCT:
+      return InvRCT(input, begin_c, rct_type, pool);
+    case TransformId::kSqueeze:
+      return InvSqueeze(input, squeezes, pool);
+    case TransformId::kPalette:
+      return InvPalette(input, begin_c, nb_colors, nb_deltas, predictor,
+                        wp_header, pool);
+    default:
+      return JXL_FAILURE("Unknown transformation (ID=%u)",
+                         static_cast<unsigned int>(id));
+  }
+}
+
+Status Transform::MetaApply(Image &input) {
+  JXL_DEBUG_V(6, "MetaApply input: %s", input.DebugString().c_str());
+  switch (id) {
+    case TransformId::kRCT:
+      JXL_DEBUG_V(2, "Transform: kRCT, rct_type=%" PRIu32, rct_type);
+      return CheckEqualChannels(input, begin_c, begin_c + 2);
+    case TransformId::kSqueeze:
+      JXL_DEBUG_V(2, "Transform: kSqueeze:");
+#if JXL_DEBUG_V_LEVEL >= 2
+      {
+        auto squeezes_copy = squeezes;
+        if (squeezes_copy.empty()) {
+          DefaultSqueezeParameters(&squeezes_copy, input);
+        }
+        for (const auto &params : squeezes_copy) {
+          JXL_DEBUG_V(
+              2,
+              "  squeeze params: horizontal=%d, in_place=%d, begin_c=%" PRIu32
+              ", num_c=%" PRIu32,
+              params.horizontal, params.in_place, params.begin_c, params.num_c);
+        }
+      }
+#endif
+      return MetaSqueeze(input, &squeezes);
+    case TransformId::kPalette:
+      JXL_DEBUG_V(2,
+                  "Transform: kPalette, begin_c=%" PRIu32 ", num_c=%" PRIu32
+                  ", nb_colors=%" PRIu32 ", nb_deltas=%" PRIu32,
+                  begin_c, num_c, nb_colors, nb_deltas);
+      return MetaPalette(input, begin_c, begin_c + num_c - 1, nb_colors,
+                         nb_deltas, lossy_palette);
+    default:
+      return JXL_FAILURE("Unknown transformation (ID=%u)",
+                         static_cast<unsigned int>(id));
+  }
+}
+
+Status CheckEqualChannels(const Image &image, uint32_t c1, uint32_t c2) {
+  if (c1 > image.channel.size() || c2 >= image.channel.size() || c2 < c1) {
+    return JXL_FAILURE("Invalid channel range: %u..%u (there are only %" PRIuS
+                       " channels)",
+                       c1, c2, image.channel.size());
+  }
+  if (c1 < image.nb_meta_channels && c2 >= image.nb_meta_channels) {
+    return JXL_FAILURE("Invalid: transforming mix of meta and nonmeta");
+  }
+  const auto &ch1 = image.channel[c1];
+  for (size_t c = c1 + 1; c <= c2; c++) {
+    const auto &ch2 = image.channel[c];
+    if (ch1.w != ch2.w || ch1.h != ch2.h || ch1.hshift != ch2.hshift ||
+        ch1.vshift != ch2.vshift) {
+      return false;
+    }
+  }
+  return true;
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/modular/transform/transform.h b/third_party/jpeg-xl/lib/jxl/modular/transform/transform.h
new file mode 100644
index 0000000000..d5d3259f7a
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/modular/transform/transform.h
@@ -0,0 +1,148 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_MODULAR_TRANSFORM_TRANSFORM_H_
+#define LIB_JXL_MODULAR_TRANSFORM_TRANSFORM_H_
+
+#include <cstdint>
+#include <string>
+#include <vector>
+
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/fields.h"
+#include "lib/jxl/modular/encoding/context_predict.h"
+#include "lib/jxl/modular/options.h"
+
+namespace jxl {
+
+enum class TransformId : uint32_t {
+  // G, R-G, B-G and variants (including YCoCg).
+  kRCT = 0,
+
+  // Color palette. Parameters are: [begin_c] [end_c] [nb_colors]
+  kPalette = 1,
+
+  // Squeezing (Haar-style)
+  kSqueeze = 2,
+
+  // Invalid for now.
+  kInvalid = 3,
+};
+
+struct SqueezeParams : public Fields {
+  JXL_FIELDS_NAME(SqueezeParams)
+  bool horizontal;
+  bool in_place;
+  uint32_t begin_c;
+  uint32_t num_c;
+  SqueezeParams();
+  Status VisitFields(Visitor *JXL_RESTRICT visitor) override {
+    JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &horizontal));
+    JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &in_place));
+    JXL_QUIET_RETURN_IF_ERROR(visitor->U32(Bits(3), BitsOffset(6, 8),
+                                           BitsOffset(10, 72),
+                                           BitsOffset(13, 1096), 0, &begin_c));
+    JXL_QUIET_RETURN_IF_ERROR(
+        visitor->U32(Val(1), Val(2), Val(3), BitsOffset(4, 4), 2, &num_c));
+    return true;
+  }
+};
+
+class Transform : public Fields {
+ public:
+  TransformId id;
+  // for Palette and RCT.
+  uint32_t begin_c;
+  // for RCT. 42 possible values starting from 0.
+  uint32_t rct_type;
+  // Only for Palette and NearLossless.
+  uint32_t num_c;
+  // Only for Palette.
+  uint32_t nb_colors;
+  uint32_t nb_deltas;
+  // for Squeeze. Default squeeze if empty.
+  std::vector<SqueezeParams> squeezes;
+  // for NearLossless, not serialized.
+  int max_delta_error;
+  // Serialized for Palette.
+  Predictor predictor;
+  // for Palette, not serialized.
+  bool ordered_palette = true;
+  bool lossy_palette = false;
+
+  explicit Transform(TransformId id);
+  // default constructor for bundles.
+  Transform() : Transform(TransformId::kInvalid) {}
+
+  Status VisitFields(Visitor *JXL_RESTRICT visitor) override {
+    JXL_QUIET_RETURN_IF_ERROR(visitor->U32(
+        Val((uint32_t)TransformId::kRCT), Val((uint32_t)TransformId::kPalette),
+        Val((uint32_t)TransformId::kSqueeze),
+        Val((uint32_t)TransformId::kInvalid), (uint32_t)TransformId::kRCT,
+        reinterpret_cast<uint32_t *>(&id)));
+    if (id == TransformId::kInvalid) {
+      return JXL_FAILURE("Invalid transform ID");
+    }
+    if (visitor->Conditional(id == TransformId::kRCT ||
+                             id == TransformId::kPalette)) {
+      JXL_QUIET_RETURN_IF_ERROR(
+          visitor->U32(Bits(3), BitsOffset(6, 8), BitsOffset(10, 72),
+                       BitsOffset(13, 1096), 0, &begin_c));
+    }
+    if (visitor->Conditional(id == TransformId::kRCT)) {
+      // 0-41, default YCoCg.
+      JXL_QUIET_RETURN_IF_ERROR(visitor->U32(Val(6), Bits(2), BitsOffset(4, 2),
+                                             BitsOffset(6, 10), 6, &rct_type));
+      if (rct_type >= 42) {
+        return JXL_FAILURE("Invalid transform RCT type");
+      }
+    }
+    if (visitor->Conditional(id == TransformId::kPalette)) {
+      JXL_QUIET_RETURN_IF_ERROR(
+          visitor->U32(Val(1), Val(3), Val(4), BitsOffset(13, 1), 3, &num_c));
+      JXL_QUIET_RETURN_IF_ERROR(visitor->U32(
+          BitsOffset(8, 0), BitsOffset(10, 256), BitsOffset(12, 1280),
+          BitsOffset(16, 5376), 256, &nb_colors));
+      JXL_QUIET_RETURN_IF_ERROR(
+          visitor->U32(Val(0), BitsOffset(8, 1), BitsOffset(10, 257),
+                       BitsOffset(16, 1281), 0, &nb_deltas));
+      JXL_QUIET_RETURN_IF_ERROR(
+          visitor->Bits(4, (uint32_t)Predictor::Zero,
+                        reinterpret_cast<uint32_t *>(&predictor)));
+      if (predictor >= Predictor::Best) {
+        return JXL_FAILURE("Invalid predictor");
+      }
+    }
+
+    if (visitor->Conditional(id == TransformId::kSqueeze)) {
+      uint32_t num_squeezes = static_cast<uint32_t>(squeezes.size());
+      JXL_QUIET_RETURN_IF_ERROR(
+          visitor->U32(Val(0), BitsOffset(4, 1), BitsOffset(6, 9),
+                       BitsOffset(8, 41), 0, &num_squeezes));
+      if (visitor->IsReading()) squeezes.resize(num_squeezes);
+      for (size_t i = 0; i < num_squeezes; i++) {
+        JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&squeezes[i]));
+      }
+    }
+    return true;
+  }
+
+  JXL_FIELDS_NAME(Transform)
+
+  Status Inverse(Image &input, const weighted::Header &wp_header,
+                 ThreadPool *pool = nullptr);
+  Status MetaApply(Image &input);
+};
+
+Status CheckEqualChannels(const Image &image, uint32_t c1, uint32_t c2);
+
+static inline pixel_type PixelAdd(pixel_type a, pixel_type b) {
+  return static_cast<pixel_type>(static_cast<uint32_t>(a) +
+                                 static_cast<uint32_t>(b));
+}
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_MODULAR_TRANSFORM_TRANSFORM_H_
diff --git a/third_party/jpeg-xl/lib/jxl/modular_test.cc b/third_party/jpeg-xl/lib/jxl/modular_test.cc
new file mode 100644
index 0000000000..293f59ff87
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/modular_test.cc
@@ -0,0 +1,541 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <stdint.h>
+#include <stdio.h>
+
+#include <array>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "lib/extras/codec.h"
+#include "lib/extras/dec/jxl.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/override.h"
+#include "lib/jxl/base/padded_bytes.h"
+#include "lib/jxl/codec_in_out.h"
+#include "lib/jxl/color_encoding_internal.h"
+#include "lib/jxl/color_management.h"
+#include "lib/jxl/enc_aux_out.h"
+#include "lib/jxl/enc_butteraugli_comparator.h"
+#include "lib/jxl/enc_butteraugli_pnorm.h"
+#include "lib/jxl/enc_cache.h"
+#include "lib/jxl/enc_color_management.h"
+#include "lib/jxl/enc_fields.h"
+#include "lib/jxl/enc_file.h"
+#include "lib/jxl/enc_params.h"
+#include "lib/jxl/enc_toc.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/image_bundle.h"
+#include "lib/jxl/image_ops.h"
+#include "lib/jxl/image_test_utils.h"
+#include "lib/jxl/modular/encoding/enc_encoding.h"
+#include "lib/jxl/modular/encoding/encoding.h"
+#include "lib/jxl/modular/encoding/ma_common.h"
+#include "lib/jxl/test_utils.h"
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+namespace {
+using test::Roundtrip;
+
+void TestLosslessGroups(size_t group_size_shift) {
+  const PaddedBytes orig = jxl::test::ReadTestData("jxl/flower/flower.png");
+  CompressParams cparams;
+  cparams.SetLossless();
+  cparams.modular_group_size_shift = group_size_shift;
+
+  CodecInOut io_out;
+
+  CodecInOut io;
+  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io));
+  io.ShrinkTo(io.xsize() / 4, io.ysize() / 4);
+
+  size_t compressed_size;
+  JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io_out, _, &compressed_size));
+  EXPECT_LE(compressed_size, 280000u);
+  JXL_EXPECT_OK(SamePixels(*io.Main().color(), *io_out.Main().color(), _));
+}
+
+TEST(ModularTest, RoundtripLosslessGroups128) { TestLosslessGroups(0); }
+
+TEST(ModularTest, JXL_TSAN_SLOW_TEST(RoundtripLosslessGroups512)) {
+  TestLosslessGroups(2);
+}
+
+TEST(ModularTest, JXL_TSAN_SLOW_TEST(RoundtripLosslessGroups1024)) {
+  TestLosslessGroups(3);
+}
+
+TEST(ModularTest, RoundtripLosslessCustomWP_PermuteRCT) {
+  const PaddedBytes orig = jxl::test::ReadTestData(
+      "external/wesaturate/500px/u76c0g_bliznaca_srgb8.png");
+  CompressParams cparams;
+  cparams.SetLossless();
+  // 9 = permute to GBR, to test the special case of permutation-only
+  cparams.colorspace = 9;
+  // slowest speed so different WP modes are tried
+  cparams.speed_tier = SpeedTier::kTortoise;
+  cparams.options.predictor = {Predictor::Weighted};
+
+  CodecInOut io_out;
+
+  CodecInOut io;
+  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io));
+  io.ShrinkTo(100, 100);
+
+  size_t compressed_size;
+  JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io_out, _, &compressed_size));
+  EXPECT_LE(compressed_size, 10150u);
+  JXL_EXPECT_OK(SamePixels(*io.Main().color(), *io_out.Main().color(), _));
+}
+
+TEST(ModularTest, RoundtripLossyDeltaPalette) {
+  const PaddedBytes orig = jxl::test::ReadTestData(
+      "external/wesaturate/500px/u76c0g_bliznaca_srgb8.png");
+  CompressParams cparams;
+  cparams.modular_mode = true;
+  cparams.color_transform = jxl::ColorTransform::kNone;
+  cparams.lossy_palette = true;
+  cparams.palette_colors = 0;
+
+  CodecInOut io_out;
+
+  CodecInOut io;
+  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io));
+  io.ShrinkTo(300, 100);
+
+  size_t compressed_size;
+  JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io_out, _, &compressed_size));
+  EXPECT_LE(compressed_size, 6800u);
+  cparams.ba_params.intensity_target = 80.0f;
+  EXPECT_THAT(ButteraugliDistance(io.frames, io_out.frames, cparams.ba_params,
+                                  GetJxlCms(),
+                                  /*distmap=*/nullptr),
+              IsSlightlyBelow(1.5));
+}
+TEST(ModularTest, RoundtripLossyDeltaPaletteWP) {
+  const PaddedBytes orig = jxl::test::ReadTestData(
+      "external/wesaturate/500px/u76c0g_bliznaca_srgb8.png");
+  CompressParams cparams;
+  cparams.SetLossless();
+  cparams.lossy_palette = true;
+  cparams.palette_colors = 0;
+  cparams.options.predictor = jxl::Predictor::Weighted;
+
+  CodecInOut io_out;
+
+  CodecInOut io;
+  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io));
+  io.ShrinkTo(300, 100);
+
+  size_t compressed_size;
+  JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io_out, _, &compressed_size));
+  EXPECT_LE(compressed_size, 7000u);
+  cparams.ba_params.intensity_target = 80.0f;
+  EXPECT_THAT(ButteraugliDistance(io.frames, io_out.frames, cparams.ba_params,
+                                  GetJxlCms(),
+                                  /*distmap=*/nullptr),
+              IsSlightlyBelow(10.1));
+}
+
+TEST(ModularTest, RoundtripLossy) {
+  const PaddedBytes orig = jxl::test::ReadTestData(
+      "external/wesaturate/500px/u76c0g_bliznaca_srgb8.png");
+  CompressParams cparams;
+  cparams.modular_mode = true;
+  cparams.butteraugli_distance = 2.f;
+
+  CodecInOut io_out;
+
+  CodecInOut io;
+  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io));
+
+  size_t compressed_size;
+  JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io_out, _, &compressed_size));
+  EXPECT_LE(compressed_size, 30000u);
+  cparams.ba_params.intensity_target = 80.0f;
+  EXPECT_THAT(ButteraugliDistance(io.frames, io_out.frames, cparams.ba_params,
+                                  GetJxlCms(),
+                                  /*distmap=*/nullptr),
+              IsSlightlyBelow(2.3));
+}
+
+TEST(ModularTest, RoundtripLossy16) {
+  const PaddedBytes orig = jxl::test::ReadTestData(
+      "external/raw.pixls/DJI-FC6310-16bit_709_v4_krita.png");
+  CompressParams cparams;
+  cparams.modular_mode = true;
+  cparams.butteraugli_distance = 2.f;
+
+  CodecInOut io_out;
+
+  CodecInOut io;
+  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io));
+  JXL_CHECK(!io.metadata.m.have_preview);
+  JXL_CHECK(io.frames.size() == 1);
+  JXL_CHECK(io.frames[0].TransformTo(ColorEncoding::SRGB(), GetJxlCms()));
+  io.metadata.m.color_encoding = ColorEncoding::SRGB();
+
+  size_t compressed_size;
+  JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io_out, _, &compressed_size));
+  EXPECT_LE(compressed_size, 300u);
+  cparams.ba_params.intensity_target = 80.0f;
+  EXPECT_THAT(ButteraugliDistance(io.frames, io_out.frames, cparams.ba_params,
+                                  GetJxlCms(),
+                                  /*distmap=*/nullptr),
+              IsSlightlyBelow(1.6));
+}
+
+TEST(ModularTest, RoundtripExtraProperties) {
+  constexpr size_t kSize = 250;
+  Image image(kSize, kSize, /*bitdepth=*/8, 3);
+  ModularOptions options;
+  options.max_properties = 4;
+  options.predictor = Predictor::Zero;
+  Rng rng(0);
+  for (size_t y = 0; y < kSize; y++) {
+    for (size_t x = 0; x < kSize; x++) {
+      image.channel[0].plane.Row(y)[x] = image.channel[2].plane.Row(y)[x] =
+          rng.UniformU(0, 9);
+    }
+  }
+  ZeroFillImage(&image.channel[1].plane);
+  BitWriter writer;
+  ASSERT_TRUE(ModularGenericCompress(image, options, &writer));
+  writer.ZeroPadToByte();
+  Image decoded(kSize, kSize, /*bitdepth=*/8, image.channel.size());
+  for (size_t i = 0; i < image.channel.size(); i++) {
+    const Channel& ch = image.channel[i];
+    decoded.channel[i] = Channel(ch.w, ch.h, ch.hshift, ch.vshift);
+  }
+  Status status = true;
+  {
+    BitReader reader(writer.GetSpan());
+    BitReaderScopedCloser closer(&reader, &status);
+    ASSERT_TRUE(ModularGenericDecompress(&reader, decoded, /*header=*/nullptr,
+                                         /*group_id=*/0, &options));
+  }
+  ASSERT_TRUE(status);
+  ASSERT_EQ(image.channel.size(), decoded.channel.size());
+  for (size_t c = 0; c < image.channel.size(); c++) {
+    for (size_t y = 0; y < image.channel[c].plane.ysize(); y++) {
+      for (size_t x = 0; x < image.channel[c].plane.xsize(); x++) {
+        EXPECT_EQ(image.channel[c].plane.Row(y)[x],
+                  decoded.channel[c].plane.Row(y)[x])
+            << "c = " << c << ", x = " << x << ",  y = " << y;
+      }
+    }
+  }
+}
+
+TEST(ModularTest, RoundtripLosslessCustomSqueeze) {
+  const PaddedBytes orig = jxl::test::ReadTestData(
+      "external/wesaturate/500px/tmshre_riaphotographs_srgb8.png");
+  CodecInOut io;
+  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io));
+
+  CompressParams cparams;
+  cparams.modular_mode = true;
+  cparams.color_transform = jxl::ColorTransform::kNone;
+  cparams.butteraugli_distance = 0.f;
+  cparams.options.predictor = {Predictor::Zero};
+  cparams.speed_tier = SpeedTier::kThunder;
+  cparams.responsive = 1;
+  // Custom squeeze params, atm just for testing
+  SqueezeParams p;
+  p.horizontal = true;
+  p.in_place = false;
+  p.begin_c = 0;
+  p.num_c = 3;
+  cparams.squeezes.push_back(p);
+  p.begin_c = 1;
+  p.in_place = true;
+  p.horizontal = false;
+  cparams.squeezes.push_back(p);
+
+  CodecInOut io2;
+  size_t compressed_size;
+  JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io2, _, &compressed_size));
+  EXPECT_LE(compressed_size, 265000u);
+  JXL_EXPECT_OK(SamePixels(*io.Main().color(), *io2.Main().color(), _));
+}
+
+struct RoundtripLosslessConfig {
+  int bitdepth;
+  int responsive;
+};
+class ModularTestParam
+    : public ::testing::TestWithParam<RoundtripLosslessConfig> {};
+
+std::vector<RoundtripLosslessConfig> GenerateLosslessTests() {
+  std::vector<RoundtripLosslessConfig> all;
+  for (int responsive = 0; responsive <= 1; responsive++) {
+    for (int bitdepth = 1; bitdepth < 32; bitdepth++) {
+      if (responsive && bitdepth > 30) continue;
+      all.push_back({bitdepth, responsive});
+    }
+  }
+  return all;
+}
+std::string LosslessTestDescription(
+    const testing::TestParamInfo<ModularTestParam::ParamType>& info) {
+  std::stringstream name;
+  name << info.param.bitdepth << "bit";
+  if (info.param.responsive) name << "Squeeze";
+  return name.str();
+}
+
+JXL_GTEST_INSTANTIATE_TEST_SUITE_P(RoundtripLossless, ModularTestParam,
+                                   testing::ValuesIn(GenerateLosslessTests()),
+                                   LosslessTestDescription);
+
+TEST_P(ModularTestParam, RoundtripLossless) {
+  RoundtripLosslessConfig config = GetParam();
+  int bitdepth = config.bitdepth;
+  int responsive = config.responsive;
+
+  ThreadPool* pool = nullptr;
+  Rng generator(123);
+  const PaddedBytes orig = jxl::test::ReadTestData(
+      "external/wesaturate/500px/u76c0g_bliznaca_srgb8.png");
+  CodecInOut io1;
+  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io1, pool));
+
+  // vary the dimensions a bit, in case of bugs related to
+  // even vs odd width or height.
+  size_t xsize = 423 + bitdepth;
+  size_t ysize = 467 + bitdepth;
+
+  CodecInOut io;
+  io.SetSize(xsize, ysize);
+  io.metadata.m.color_encoding = jxl::ColorEncoding::SRGB(false);
+  io.metadata.m.SetUintSamples(bitdepth);
+
+  double factor = ((1lu << bitdepth) - 1lu);
+  double ifactor = 1.0 / factor;
+  Image3F noise_added(xsize, ysize);
+
+  for (size_t c = 0; c < 3; c++) {
+    for (size_t y = 0; y < ysize; y++) {
+      const float* in = io1.Main().color()->PlaneRow(c, y);
+      float* out = noise_added.PlaneRow(c, y);
+      for (size_t x = 0; x < xsize; x++) {
+        // make the least significant bits random
+        float f = in[x] + generator.UniformF(0.0f, 1.f / 255.f);
+        if (f > 1.f) f = 1.f;
+        // quantize to the bitdepth we're testing
+        unsigned int u = f * factor + 0.5;
+        out[x] = u * ifactor;
+      }
+    }
+  }
+  io.SetFromImage(std::move(noise_added), jxl::ColorEncoding::SRGB(false));
+
+  CompressParams cparams;
+  cparams.modular_mode = true;
+  cparams.color_transform = jxl::ColorTransform::kNone;
+  cparams.butteraugli_distance = 0.f;
+  cparams.options.predictor = {Predictor::Zero};
+  cparams.speed_tier = SpeedTier::kThunder;
+  cparams.responsive = responsive;
+  CodecInOut io2;
+  size_t compressed_size;
+  JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io2, _, &compressed_size));
+  EXPECT_LE(compressed_size, bitdepth * xsize * ysize / 3);
+  EXPECT_LE(0, ComputeDistance2(io.Main(), io2.Main(), GetJxlCms()));
+  size_t different = 0;
+  for (size_t c = 0; c < 3; c++) {
+    for (size_t y = 0; y < ysize; y++) {
+      const float* in = io.Main().color()->PlaneRow(c, y);
+      const float* out = io2.Main().color()->PlaneRow(c, y);
+      for (size_t x = 0; x < xsize; x++) {
+        uint32_t uin = in[x] * factor + 0.5;
+        uint32_t uout = out[x] * factor + 0.5;
+        // check that the integer values are identical
+        if (uin != uout) different++;
+      }
+    }
+  }
+  EXPECT_EQ(different, 0);
+}
+
+TEST(ModularTest, RoundtripLosslessCustomFloat) {
+  CodecInOut io;
+  size_t xsize = 100, ysize = 300;
+  io.SetSize(xsize, ysize);
+  io.metadata.m.bit_depth.bits_per_sample = 18;
+  io.metadata.m.bit_depth.exponent_bits_per_sample = 6;
+  io.metadata.m.bit_depth.floating_point_sample = true;
+  io.metadata.m.modular_16_bit_buffer_sufficient = false;
+  ColorEncoding color_encoding;
+  color_encoding.tf.SetTransferFunction(TransferFunction::kLinear);
+  color_encoding.SetColorSpace(ColorSpace::kRGB);
+  Image3F testimage(xsize, ysize);
+  float factor = 1.f / (1 << 14);
+  for (size_t c = 0; c < 3; c++) {
+    for (size_t y = 0; y < ysize; y++) {
+      float* const JXL_RESTRICT row = testimage.PlaneRow(c, y);
+      for (size_t x = 0; x < xsize; x++) {
+        row[x] = factor * (x ^ y);
+      }
+    }
+  }
+  io.SetFromImage(std::move(testimage), color_encoding);
+  io.metadata.m.color_encoding = color_encoding;
+  io.metadata.m.SetIntensityTarget(255);
+
+  CompressParams cparams;
+  cparams.modular_mode = true;
+  cparams.color_transform = jxl::ColorTransform::kNone;
+  cparams.butteraugli_distance = 0.f;
+  cparams.options.predictor = {Predictor::Zero};
+  cparams.speed_tier = SpeedTier::kThunder;
+  cparams.decoding_speed_tier = 2;
+
+  CodecInOut io2;
+  size_t compressed_size;
+  JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io2, _, &compressed_size));
+  EXPECT_LE(compressed_size, 23000u);
+  JXL_EXPECT_OK(SamePixels(*io.Main().color(), *io2.Main().color(), _));
+}
+
+void WriteHeaders(BitWriter* writer, size_t xsize, size_t ysize) {
+  BitWriter::Allotment allotment(writer, 16);
+  writer->Write(8, 0xFF);
+  writer->Write(8, kCodestreamMarker);
+  allotment.ReclaimAndCharge(writer, 0, nullptr);
+  CodecMetadata metadata;
+  EXPECT_TRUE(metadata.size.Set(xsize, ysize));
+  EXPECT_TRUE(WriteSizeHeader(metadata.size, writer, 0, nullptr));
+  metadata.m.color_encoding = ColorEncoding::LinearSRGB(/*is_gray=*/true);
+  metadata.m.xyb_encoded = false;
+  metadata.m.SetUintSamples(31);
+  EXPECT_TRUE(WriteImageMetadata(metadata.m, writer, 0, nullptr));
+  metadata.transform_data.nonserialized_xyb_encoded = metadata.m.xyb_encoded;
+  EXPECT_TRUE(Bundle::Write(metadata.transform_data, writer, 0, nullptr));
+  writer->ZeroPadToByte();
+  FrameHeader frame_header(&metadata);
+  frame_header.encoding = FrameEncoding::kModular;
+  frame_header.loop_filter.gab = false;
+  frame_header.loop_filter.epf_iters = 0;
+  EXPECT_TRUE(WriteFrameHeader(frame_header, writer, nullptr));
+}
+
+// Tree with single node, zero predictor, offset is 1 and multiplier is 1,
+// entropy code is prefix tree with alphabet size 256 and all bits lengths 8.
+void WriteHistograms(BitWriter* writer) {
+  writer->Write(1, 1);  // default DC quant
+  writer->Write(1, 1);  // has_tree
+  // tree histograms
+  writer->Write(1, 0);         // LZ77 disabled
+  writer->Write(3, 1);         // simple context map
+  writer->Write(1, 1);         // prefix code
+  writer->Write(7, 0x63);      // UnintConfig(3, 2, 1)
+  writer->Write(12, 0xfef);    // alphabet_size = 256
+  writer->Write(32, 0x10003);  // all bit lengths 8
+  // tree tokens
+  writer->Write(8, 0);   // tree leaf
+  writer->Write(8, 0);   // zero predictor
+  writer->Write(8, 64);  // offset = UnpackSigned(ReverseBits(64)) = 1
+  writer->Write(16, 0);  // multiplier = 1
+  // histograms
+  writer->Write(1, 0);         // LZ77 disabled
+  writer->Write(1, 1);         // prefix code
+  writer->Write(7, 0x63);      // UnintConfig(3, 2, 1)
+  writer->Write(12, 0xfef);    // alphabet_size = 256
+  writer->Write(32, 0x10003);  // all bit lengths 8
+}
+
+TEST(ModularTest, PredictorIntegerOverflow) {
+  const size_t xsize = 1;
+  const size_t ysize = 1;
+  BitWriter writer;
+  WriteHeaders(&writer, xsize, ysize);
+  std::vector<BitWriter> group_codes(1);
+  {
+    BitWriter* bw = &group_codes[0];
+    BitWriter::Allotment allotment(bw, 1 << 20);
+    WriteHistograms(bw);
+    GroupHeader header;
+    header.use_global_tree = true;
+    EXPECT_TRUE(Bundle::Write(header, bw, 0, nullptr));
+    // After UnpackSigned this becomes (1 << 31) - 1, the largest pixel_type,
+    // and after adding the offset we get -(1 << 31).
+    bw->Write(8, 119);
+    bw->Write(28, 0xfffffff);
+    bw->ZeroPadToByte();
+    allotment.ReclaimAndCharge(bw, 0, nullptr);
+  }
+  EXPECT_TRUE(WriteGroupOffsets(group_codes, nullptr, &writer, nullptr));
+  writer.AppendByteAligned(group_codes);
+
+  PaddedBytes compressed = std::move(writer).TakeBytes();
+  extras::PackedPixelFile ppf;
+  extras::JXLDecompressParams params;
+  params.accepted_formats.push_back({1, JXL_TYPE_FLOAT, JXL_NATIVE_ENDIAN, 0});
+  EXPECT_TRUE(DecodeImageJXL(compressed.data(), compressed.size(), params,
+                             nullptr, &ppf));
+  ASSERT_EQ(1, ppf.frames.size());
+  const auto& img = ppf.frames[0].color;
+  const auto pixels = reinterpret_cast<const float*>(img.pixels());
+  EXPECT_EQ(-1.0f, pixels[0]);
+}
+
+TEST(ModularTest, UnsqueezeIntegerOverflow) {
+  // Image width is 9 so we can test both the SIMD and non-vector code paths.
+  const size_t xsize = 9;
+  const size_t ysize = 2;
+  BitWriter writer;
+  WriteHeaders(&writer, xsize, ysize);
+  std::vector<BitWriter> group_codes(1);
+  {
+    BitWriter* bw = &group_codes[0];
+    BitWriter::Allotment allotment(bw, 1 << 20);
+    WriteHistograms(bw);
+    GroupHeader header;
+    header.use_global_tree = true;
+    header.transforms.emplace_back();
+    header.transforms[0].id = TransformId::kSqueeze;
+    SqueezeParams params;
+    params.horizontal = false;
+    params.in_place = true;
+    params.begin_c = 0;
+    params.num_c = 1;
+    header.transforms[0].squeezes.emplace_back(params);
+    EXPECT_TRUE(Bundle::Write(header, bw, 0, nullptr));
+    for (size_t i = 0; i < xsize * ysize; ++i) {
+      // After UnpackSigned and adding offset, this becomes (1 << 31) - 1, both
+      // in the image and in the residual channels, and unsqueeze makes them
+      // ~(3 << 30) and (1 << 30) (in pixel_type_w) and the first wraps around
+      // to about -(1 << 30).
+      bw->Write(8, 119);
+      bw->Write(28, 0xffffffe);
+    }
+    bw->ZeroPadToByte();
+    allotment.ReclaimAndCharge(bw, 0, nullptr);
+  }
+  EXPECT_TRUE(WriteGroupOffsets(group_codes, nullptr, &writer, nullptr));
+  writer.AppendByteAligned(group_codes);
+
+  PaddedBytes compressed = std::move(writer).TakeBytes();
+  extras::PackedPixelFile ppf;
+  extras::JXLDecompressParams params;
+  params.accepted_formats.push_back({1, JXL_TYPE_FLOAT, JXL_NATIVE_ENDIAN, 0});
+  EXPECT_TRUE(DecodeImageJXL(compressed.data(), compressed.size(), params,
+                             nullptr, &ppf));
+  ASSERT_EQ(1, ppf.frames.size());
+  const auto& img = ppf.frames[0].color;
+  const auto pixels = reinterpret_cast<const float*>(img.pixels());
+  for (size_t x = 0; x < xsize; ++x) {
+    EXPECT_NEAR(-0.5f, pixels[x], 1e-10);
+    EXPECT_NEAR(0.5f, pixels[xsize + x], 1e-10);
+  }
+}
+
+}  // namespace
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/noise.h b/third_party/jpeg-xl/lib/jxl/noise.h
new file mode 100644
index 0000000000..d897ea3abe
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/noise.h
@@ -0,0 +1,60 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_NOISE_H_
+#define LIB_JXL_NOISE_H_
+
+// Noise parameters shared by encoder/decoder.
+
+#include <stddef.h>
+
+#include <algorithm>
+#include <cmath>
+#include <utility>
+
+#include "lib/jxl/base/compiler_specific.h"
+
+namespace jxl {
+
+const float kNoisePrecision = 1 << 10;
+
+struct NoiseParams {
+  // LUT index is an intensity of pixel / mean intensity of patch
+  static constexpr size_t kNumNoisePoints = 8;
+  float lut[kNumNoisePoints];
+
+  void Clear() {
+    for (float& i : lut) i = 0.f;
+  }
+  bool HasAny() const {
+    for (float i : lut) {
+      if (std::abs(i) > 1e-3f) return true;
+    }
+    return false;
+  }
+};
+
+static inline std::pair<int, float> IndexAndFrac(float x) {
+  constexpr size_t kScaleNumerator = NoiseParams::kNumNoisePoints - 2;
+  // TODO: instead of 1, this should be a proper Y range.
+  constexpr float kScale = kScaleNumerator / 1;
+  float scaled_x = std::max(0.f, x * kScale);
+  float floor_x;
+  float frac_x = std::modf(scaled_x, &floor_x);
+  if (JXL_UNLIKELY(scaled_x >= kScaleNumerator + 1)) {
+    floor_x = kScaleNumerator;
+    frac_x = 1.f;
+  }
+  return std::make_pair(static_cast<int>(floor_x), frac_x);
+}
+
+struct NoiseLevel {
+  float noise_level;
+  float intensity;
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_NOISE_H_
diff --git a/third_party/jpeg-xl/lib/jxl/opsin_image_test.cc b/third_party/jpeg-xl/lib/jxl/opsin_image_test.cc
new file mode 100644
index 0000000000..07fd824f14
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/opsin_image_test.cc
@@ -0,0 +1,123 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <stdio.h>
+
+#include <hwy/tests/test_util-inl.h>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/color_management.h"
+#include "lib/jxl/dec_xyb.h"
+#include "lib/jxl/enc_color_management.h"
+#include "lib/jxl/enc_xyb.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/matrix_ops.h"
+#include "lib/jxl/opsin_params.h"
+
+namespace jxl {
+namespace {
+
+// Convert a single linear sRGB color to xyb, using the exact image conversion
+// procedure that jpeg xl uses.
+void LinearSrgbToOpsin(float rgb_r, float rgb_g, float rgb_b,
+                       float* JXL_RESTRICT xyb_x, float* JXL_RESTRICT xyb_y,
+                       float* JXL_RESTRICT xyb_b) {
+  Image3F linear(1, 1);
+  linear.PlaneRow(0, 0)[0] = rgb_r;
+  linear.PlaneRow(1, 0)[0] = rgb_g;
+  linear.PlaneRow(2, 0)[0] = rgb_b;
+
+  ImageMetadata metadata;
+  metadata.SetFloat32Samples();
+  metadata.color_encoding = ColorEncoding::LinearSRGB();
+  ImageBundle ib(&metadata);
+  ib.SetFromImage(std::move(linear), metadata.color_encoding);
+  Image3F opsin(1, 1);
+  (void)ToXYB(ib, /*pool=*/nullptr, &opsin, GetJxlCms());
+
+  *xyb_x = opsin.PlaneRow(0, 0)[0];
+  *xyb_y = opsin.PlaneRow(1, 0)[0];
+  *xyb_b = opsin.PlaneRow(2, 0)[0];
+}
+
+// Convert a single XYB color to linear sRGB, using the exact image conversion
+// procedure that jpeg xl uses.
+void OpsinToLinearSrgb(float xyb_x, float xyb_y, float xyb_b,
+                       float* JXL_RESTRICT rgb_r, float* JXL_RESTRICT rgb_g,
+                       float* JXL_RESTRICT rgb_b) {
+  Image3F opsin(1, 1);
+  opsin.PlaneRow(0, 0)[0] = xyb_x;
+  opsin.PlaneRow(1, 0)[0] = xyb_y;
+  opsin.PlaneRow(2, 0)[0] = xyb_b;
+  Image3F linear(1, 1);
+  OpsinParams opsin_params;
+  opsin_params.Init(/*intensity_target=*/255.0f);
+  OpsinToLinear(opsin, Rect(opsin), nullptr, &linear, opsin_params);
+  *rgb_r = linear.PlaneRow(0, 0)[0];
+  *rgb_g = linear.PlaneRow(1, 0)[0];
+  *rgb_b = linear.PlaneRow(2, 0)[0];
+}
+
+void OpsinRoundtripTestRGB(float r, float g, float b) {
+  float xyb_x, xyb_y, xyb_b;
+  LinearSrgbToOpsin(r, g, b, &xyb_x, &xyb_y, &xyb_b);
+  float r2, g2, b2;
+  OpsinToLinearSrgb(xyb_x, xyb_y, xyb_b, &r2, &g2, &b2);
+  EXPECT_NEAR(r, r2, 1e-3);
+  EXPECT_NEAR(g, g2, 1e-3);
+  EXPECT_NEAR(b, b2, 1e-3);
+}
+
+TEST(OpsinImageTest, VerifyOpsinAbsorbanceInverseMatrix) {
+  float matrix[9];  // writable copy
+  for (int i = 0; i < 9; i++) {
+    matrix[i] = GetOpsinAbsorbanceInverseMatrix()[i];
+  }
+  EXPECT_TRUE(Inv3x3Matrix(matrix));
+  for (int i = 0; i < 9; i++) {
+    EXPECT_NEAR(matrix[i], kOpsinAbsorbanceMatrix[i], 1e-6);
+  }
+}
+
+TEST(OpsinImageTest, OpsinRoundtrip) {
+  OpsinRoundtripTestRGB(0, 0, 0);
+  OpsinRoundtripTestRGB(1. / 255, 1. / 255, 1. / 255);
+  OpsinRoundtripTestRGB(128. / 255, 128. / 255, 128. / 255);
+  OpsinRoundtripTestRGB(1, 1, 1);
+
+  OpsinRoundtripTestRGB(0, 0, 1. / 255);
+  OpsinRoundtripTestRGB(0, 0, 128. / 255);
+  OpsinRoundtripTestRGB(0, 0, 1);
+
+  OpsinRoundtripTestRGB(0, 1. / 255, 0);
+  OpsinRoundtripTestRGB(0, 128. / 255, 0);
+  OpsinRoundtripTestRGB(0, 1, 0);
+
+  OpsinRoundtripTestRGB(1. / 255, 0, 0);
+  OpsinRoundtripTestRGB(128. / 255, 0, 0);
+  OpsinRoundtripTestRGB(1, 0, 0);
+}
+
+TEST(OpsinImageTest, VerifyZero) {
+  // Test that black color (zero energy) is 0,0,0 in xyb.
+  float x, y, b;
+  LinearSrgbToOpsin(0, 0, 0, &x, &y, &b);
+  EXPECT_NEAR(0, x, 1e-9);
+  EXPECT_NEAR(0, y, 1e-7);
+  EXPECT_NEAR(0, b, 1e-7);
+}
+
+TEST(OpsinImageTest, VerifyGray) {
+  // Test that grayscale colors have a fixed y/b ratio and x==0.
+  for (size_t i = 1; i < 255; i++) {
+    float x, y, b;
+    LinearSrgbToOpsin(i / 255., i / 255., i / 255., &x, &y, &b);
+    EXPECT_NEAR(0, x, 1e-6);
+    EXPECT_NEAR(kYToBRatio, b / y, 3e-5);
+  }
+}
+
+}  // namespace
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/opsin_inverse_test.cc b/third_party/jpeg-xl/lib/jxl/opsin_inverse_test.cc
new file mode 100644
index 0000000000..088253c2ce
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/opsin_inverse_test.cc
@@ -0,0 +1,57 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/codec_in_out.h"
+#include "lib/jxl/color_encoding_internal.h"
+#include "lib/jxl/color_management.h"
+#include "lib/jxl/dec_xyb.h"
+#include "lib/jxl/enc_color_management.h"
+#include "lib/jxl/enc_xyb.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/image_bundle.h"
+#include "lib/jxl/image_test_utils.h"
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+namespace {
+
+TEST(OpsinInverseTest, LinearInverseInverts) {
+  Image3F linear(128, 128);
+  RandomFillImage(&linear, 0.0f, 1.0f);
+
+  CodecInOut io;
+  io.metadata.m.SetFloat32Samples();
+  io.metadata.m.color_encoding = ColorEncoding::LinearSRGB();
+  io.SetFromImage(CopyImage(linear), io.metadata.m.color_encoding);
+  ThreadPool* null_pool = nullptr;
+  Image3F opsin(io.xsize(), io.ysize());
+  (void)ToXYB(io.Main(), null_pool, &opsin, GetJxlCms());
+
+  OpsinParams opsin_params;
+  opsin_params.Init(/*intensity_target=*/255.0f);
+  OpsinToLinearInplace(&opsin, /*pool=*/nullptr, opsin_params);
+
+  JXL_ASSERT_OK(VerifyRelativeError(linear, opsin, 3E-3, 2E-4, _));
+}
+
+TEST(OpsinInverseTest, YcbCrInverts) {
+  Image3F rgb(128, 128);
+  RandomFillImage(&rgb, 0.0f, 1.0f);
+
+  ThreadPool* null_pool = nullptr;
+  Image3F ycbcr(rgb.xsize(), rgb.ysize());
+  EXPECT_TRUE(RgbToYcbcr(rgb.Plane(0), rgb.Plane(1), rgb.Plane(2),
+                         &ycbcr.Plane(1), &ycbcr.Plane(0), &ycbcr.Plane(2),
+                         null_pool));
+
+  Image3F rgb2(rgb.xsize(), rgb.ysize());
+  YcbcrToRgb(ycbcr, &rgb2, Rect(rgb));
+
+  JXL_ASSERT_OK(VerifyRelativeError(rgb, rgb2, 4E-5, 4E-7, _));
+}
+
+}  // namespace
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/opsin_params.cc b/third_party/jpeg-xl/lib/jxl/opsin_params.cc
new file mode 100644
index 0000000000..ec3db4ee76
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/opsin_params.cc
@@ -0,0 +1,44 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/opsin_params.h"
+
+#include <stdlib.h>
+
+#include "lib/jxl/matrix_ops.h"
+
+namespace jxl {
+
+#define INVERSE_OPSIN_FROM_SPEC 1
+
+const float* GetOpsinAbsorbanceInverseMatrix() {
+#if INVERSE_OPSIN_FROM_SPEC
+  return DefaultInverseOpsinAbsorbanceMatrix();
+#else   // INVERSE_OPSIN_FROM_SPEC
+  // Compute the inverse opsin matrix from the forward matrix. Less precise
+  // than taking the values from the specification, but must be used if the
+  // forward transform is changed and the spec will require updating.
+  static const float* const kInverse = [] {
+    static float inverse[9];
+    for (int i = 0; i < 9; i++) {
+      inverse[i] = kOpsinAbsorbanceMatrix[i];
+    }
+    Inv3x3Matrix(inverse);
+    return inverse;
+  }();
+  return kInverse;
+#endif  // INVERSE_OPSIN_FROM_SPEC
+}
+
+void InitSIMDInverseMatrix(const float* JXL_RESTRICT inverse,
+                           float* JXL_RESTRICT simd_inverse,
+                           float intensity_target) {
+  for (size_t i = 0; i < 9; ++i) {
+    simd_inverse[4 * i] = simd_inverse[4 * i + 1] = simd_inverse[4 * i + 2] =
+        simd_inverse[4 * i + 3] = inverse[i] * (255.0f / intensity_target);
+  }
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/opsin_params.h b/third_party/jpeg-xl/lib/jxl/opsin_params.h
new file mode 100644
index 0000000000..3a7da97d8a
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/opsin_params.h
@@ -0,0 +1,86 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_OPSIN_PARAMS_H_
+#define LIB_JXL_OPSIN_PARAMS_H_
+
+// Constants that define the XYB color space.
+
+#include <stdlib.h>
+
+#include <cmath>
+
+#include "lib/jxl/base/compiler_specific.h"
+
+namespace jxl {
+
+// Parameters for opsin absorbance.
+static const float kM02 = 0.078f;
+static const float kM00 = 0.30f;
+static const float kM01 = 1.0f - kM02 - kM00;
+
+static const float kM12 = 0.078f;
+static const float kM10 = 0.23f;
+static const float kM11 = 1.0f - kM12 - kM10;
+
+static const float kM20 = 0.24342268924547819f;
+static const float kM21 = 0.20476744424496821f;
+static const float kM22 = 1.0f - kM20 - kM21;
+
+static const float kBScale = 1.0f;
+static const float kYToBRatio = 1.0f;  // works better with 0.50017729543783418
+static const float kBToYRatio = 1.0f / kYToBRatio;
+
+static const float kB0 = 0.0037930732552754493f;
+static const float kB1 = kB0;
+static const float kB2 = kB0;
+
+// Opsin absorbance matrix is now frozen.
+static const float kOpsinAbsorbanceMatrix[9] = {
+    kM00, kM01, kM02, kM10, kM11, kM12, kM20, kM21, kM22,
+};
+
+// Must be the inverse matrix of kOpsinAbsorbanceMatrix and match the spec.
+static inline const float* DefaultInverseOpsinAbsorbanceMatrix() {
+  static float kDefaultInverseOpsinAbsorbanceMatrix[9] = {
+      11.031566901960783f,  -9.866943921568629f, -0.16462299647058826f,
+      -3.254147380392157f,  4.418770392156863f,  -0.16462299647058826f,
+      -3.6588512862745097f, 2.7129230470588235f, 1.9459282392156863f};
+  return kDefaultInverseOpsinAbsorbanceMatrix;
+}
+
+// Returns 3x3 row-major matrix inverse of kOpsinAbsorbanceMatrix.
+// opsin_image_test verifies this is actually the inverse.
+const float* GetOpsinAbsorbanceInverseMatrix();
+
+void InitSIMDInverseMatrix(const float* JXL_RESTRICT inverse,
+                           float* JXL_RESTRICT simd_inverse,
+                           float intensity_target);
+
+static const float kOpsinAbsorbanceBias[3] = {
+    kB0,
+    kB1,
+    kB2,
+};
+
+static const float kNegOpsinAbsorbanceBiasRGB[4] = {
+    -kOpsinAbsorbanceBias[0], -kOpsinAbsorbanceBias[1],
+    -kOpsinAbsorbanceBias[2], 1.0f};
+
+static const float kScaledXYBOffset[3] = {
+    0.015386134f,
+    0.0f,
+    0.27770459f,
+};
+
+static const float kScaledXYBScale[3] = {
+    22.995788804f,
+    1.183000077f,
+    1.502141333f,
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_OPSIN_PARAMS_H_
diff --git a/third_party/jpeg-xl/lib/jxl/padded_bytes_test.cc b/third_party/jpeg-xl/lib/jxl/padded_bytes_test.cc
new file mode 100644
index 0000000000..9ca7a22423
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/padded_bytes_test.cc
@@ -0,0 +1,126 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/base/padded_bytes.h"
+
+#include <numeric>  // iota
+#include <vector>
+
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+namespace {
+
+TEST(PaddedBytesTest, TestNonEmptyFirstByteZero) {
+  PaddedBytes pb(1);
+  EXPECT_EQ(0, pb[0]);
+  // Even after resizing..
+  pb.resize(20);
+  EXPECT_EQ(0, pb[0]);
+  // And reserving.
+  pb.reserve(200);
+  EXPECT_EQ(0, pb[0]);
+}
+
+TEST(PaddedBytesTest, TestEmptyFirstByteZero) {
+  PaddedBytes pb(0);
+  // After resizing - new zero is written despite there being nothing to copy.
+  pb.resize(20);
+  EXPECT_EQ(0, pb[0]);
+}
+
+TEST(PaddedBytesTest, TestFillWithoutReserve) {
+  PaddedBytes pb;
+  for (size_t i = 0; i < 170u; ++i) {
+    pb.push_back(i);
+  }
+  EXPECT_EQ(170u, pb.size());
+  EXPECT_GE(pb.capacity(), 170u);
+}
+
+TEST(PaddedBytesTest, TestFillWithExactReserve) {
+  PaddedBytes pb;
+  pb.reserve(170);
+  for (size_t i = 0; i < 170u; ++i) {
+    pb.push_back(i);
+  }
+  EXPECT_EQ(170u, pb.size());
+  EXPECT_EQ(pb.capacity(), 170u);
+}
+
+TEST(PaddedBytesTest, TestFillWithMoreReserve) {
+  PaddedBytes pb;
+  pb.reserve(171);
+  for (size_t i = 0; i < 170u; ++i) {
+    pb.push_back(i);
+  }
+  EXPECT_EQ(170u, pb.size());
+  EXPECT_GT(pb.capacity(), 170u);
+}
+
+// Can assign() a subset of the valid data.
+TEST(PaddedBytesTest, TestAssignFromWithin) {
+  PaddedBytes pb;
+  pb.reserve(256);
+  for (size_t i = 0; i < 256; ++i) {
+    pb.push_back(i);
+  }
+  pb.assign(pb.data() + 64, pb.data() + 192);
+  EXPECT_EQ(128u, pb.size());
+  for (size_t i = 0; i < 128; ++i) {
+    EXPECT_EQ(i + 64, pb[i]);
+  }
+}
+
+// Can assign() a range with both valid and previously-allocated data.
+TEST(PaddedBytesTest, TestAssignReclaim) {
+  PaddedBytes pb;
+  pb.reserve(256);
+  for (size_t i = 0; i < 256; ++i) {
+    pb.push_back(i);
+  }
+
+  const uint8_t* mem = pb.data();
+  pb.resize(200);
+  // Just shrank without reallocating
+  EXPECT_EQ(mem, pb.data());
+  EXPECT_EQ(256u, pb.capacity());
+
+  // Reclaim part of initial allocation
+  pb.assign(pb.data() + 100, pb.data() + 240);
+  EXPECT_EQ(140u, pb.size());
+
+  for (size_t i = 0; i < 140; ++i) {
+    EXPECT_EQ(i + 100, pb[i]);
+  }
+}
+
+// Can assign() smaller and larger ranges outside the current allocation.
+TEST(PaddedBytesTest, TestAssignOutside) {
+  PaddedBytes pb;
+  pb.resize(400);
+  std::iota(pb.begin(), pb.end(), 1);
+
+  std::vector<uint8_t> small(64);
+  std::iota(small.begin(), small.end(), 500);
+
+  pb.assign(small.data(), small.data() + small.size());
+  EXPECT_EQ(64u, pb.size());
+  for (size_t i = 0; i < 64; ++i) {
+    EXPECT_EQ((i + 500) & 0xFF, pb[i]);
+  }
+
+  std::vector<uint8_t> large(1000);
+  std::iota(large.begin(), large.end(), 600);
+
+  pb.assign(large.data(), large.data() + large.size());
+  EXPECT_EQ(1000u, pb.size());
+  for (size_t i = 0; i < 1000; ++i) {
+    EXPECT_EQ((i + 600) & 0xFF, pb[i]);
+  }
+}
+
+}  // namespace
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/passes_state.cc b/third_party/jpeg-xl/lib/jxl/passes_state.cc
new file mode 100644
index 0000000000..2f287ec9b6
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/passes_state.cc
@@ -0,0 +1,70 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/passes_state.h"
+
+#include "lib/jxl/chroma_from_luma.h"
+#include "lib/jxl/coeff_order.h"
+#include "lib/jxl/common.h"
+
+namespace jxl {
+
+Status InitializePassesSharedState(const FrameHeader& frame_header,
+                                   PassesSharedState* JXL_RESTRICT shared,
+                                   bool encoder) {
+  JXL_ASSERT(frame_header.nonserialized_metadata != nullptr);
+  shared->frame_header = frame_header;
+  shared->metadata = frame_header.nonserialized_metadata;
+  shared->frame_dim = frame_header.ToFrameDimensions();
+  shared->image_features.patches.SetPassesSharedState(shared);
+
+  const FrameDimensions& frame_dim = shared->frame_dim;
+
+  shared->ac_strategy =
+      AcStrategyImage(frame_dim.xsize_blocks, frame_dim.ysize_blocks);
+  shared->raw_quant_field =
+      ImageI(frame_dim.xsize_blocks, frame_dim.ysize_blocks);
+  shared->epf_sharpness =
+      ImageB(frame_dim.xsize_blocks, frame_dim.ysize_blocks);
+  shared->cmap = ColorCorrelationMap(frame_dim.xsize, frame_dim.ysize);
+
+  // In the decoder, we allocate coeff orders afterwards, when we know how many
+  // we will actually need.
+  shared->coeff_order_size = kCoeffOrderMaxSize;
+  if (encoder &&
+      shared->coeff_orders.size() <
+          frame_header.passes.num_passes * kCoeffOrderMaxSize &&
+      frame_header.encoding == FrameEncoding::kVarDCT) {
+    shared->coeff_orders.resize(frame_header.passes.num_passes *
+                                kCoeffOrderMaxSize);
+  }
+
+  shared->quant_dc = ImageB(frame_dim.xsize_blocks, frame_dim.ysize_blocks);
+
+  bool use_dc_frame = !!(frame_header.flags & FrameHeader::kUseDcFrame);
+  if (!encoder && use_dc_frame) {
+    if (frame_header.dc_level == 4) {
+      return JXL_FAILURE("Invalid DC level for kUseDcFrame: %u",
+                         frame_header.dc_level);
+    }
+    shared->dc_storage = Image3F();
+    shared->dc = &shared->dc_frames[frame_header.dc_level];
+    if (shared->dc->xsize() == 0) {
+      return JXL_FAILURE(
+          "kUseDcFrame specified for dc_level %u, but no frame was decoded "
+          "with level %u",
+          frame_header.dc_level, frame_header.dc_level + 1);
+    }
+    ZeroFillImage(&shared->quant_dc);
+  } else {
+    shared->dc_storage =
+        Image3F(frame_dim.xsize_blocks, frame_dim.ysize_blocks);
+    shared->dc = &shared->dc_storage;
+  }
+
+  return true;
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/passes_state.h b/third_party/jpeg-xl/lib/jxl/passes_state.h
new file mode 100644
index 0000000000..8d648a8feb
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/passes_state.h
@@ -0,0 +1,133 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_PASSES_STATE_H_
+#define LIB_JXL_PASSES_STATE_H_
+
+#include "lib/jxl/ac_context.h"
+#include "lib/jxl/ac_strategy.h"
+#include "lib/jxl/chroma_from_luma.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/dec_patch_dictionary.h"
+#include "lib/jxl/frame_header.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/image_bundle.h"
+#include "lib/jxl/loop_filter.h"
+#include "lib/jxl/noise.h"
+#include "lib/jxl/quant_weights.h"
+#include "lib/jxl/quantizer.h"
+#include "lib/jxl/splines.h"
+
+// Structures that hold the (en/de)coder state for a JPEG XL kVarDCT
+// (en/de)coder.
+
+namespace jxl {
+
+struct ImageFeatures {
+  NoiseParams noise_params;
+  PatchDictionary patches;
+  Splines splines;
+};
+
+// State common to both encoder and decoder.
+// NOLINTNEXTLINE(clang-analyzer-optin.performance.Padding)
+struct PassesSharedState {
+  PassesSharedState() : frame_header(nullptr) {}
+
+  // Headers and metadata.
+  const CodecMetadata* metadata;
+  FrameHeader frame_header;
+
+  FrameDimensions frame_dim;
+
+  // Control fields and parameters.
+  AcStrategyImage ac_strategy;
+
+  // Dequant matrices + quantizer.
+  DequantMatrices matrices;
+  Quantizer quantizer{&matrices};
+  ImageI raw_quant_field;
+
+  // Per-block side information for EPF detail preservation.
+  ImageB epf_sharpness;
+
+  ColorCorrelationMap cmap;
+
+  ImageFeatures image_features;
+
+  // Memory area for storing coefficient orders.
+  // `coeff_order_size` is the size used by *one* set of coefficient orders (at
+  // most kMaxCoeffOrderSize). A set of coefficient orders is present for each
+  // pass.
+  size_t coeff_order_size = 0;
+  std::vector<coeff_order_t> coeff_orders;
+
+  // Decoder-side DC and quantized DC.
+  ImageB quant_dc;
+  Image3F dc_storage;
+  const Image3F* JXL_RESTRICT dc = &dc_storage;
+
+  BlockCtxMap block_ctx_map;
+
+  Image3F dc_frames[4];
+
+  struct {
+    ImageBundle frame;
+    // ImageBundle doesn't yet have a simple way to state it is in XYB.
+    bool ib_is_in_xyb = false;
+  } reference_frames[4] = {};
+
+  // Number of pre-clustered set of histograms (with the same ctx map), per
+  // pass. Encoded as num_histograms_ - 1.
+  size_t num_histograms = 0;
+
+  bool IsGrayscale() const { return metadata->m.color_encoding.IsGray(); }
+
+  Rect GroupRect(size_t group_index) const {
+    const size_t gx = group_index % frame_dim.xsize_groups;
+    const size_t gy = group_index / frame_dim.xsize_groups;
+    const Rect rect(gx * frame_dim.group_dim, gy * frame_dim.group_dim,
+                    frame_dim.group_dim, frame_dim.group_dim, frame_dim.xsize,
+                    frame_dim.ysize);
+    return rect;
+  }
+
+  Rect PaddedGroupRect(size_t group_index) const {
+    const size_t gx = group_index % frame_dim.xsize_groups;
+    const size_t gy = group_index / frame_dim.xsize_groups;
+    const Rect rect(gx * frame_dim.group_dim, gy * frame_dim.group_dim,
+                    frame_dim.group_dim, frame_dim.group_dim,
+                    frame_dim.xsize_padded, frame_dim.ysize_padded);
+    return rect;
+  }
+
+  Rect BlockGroupRect(size_t group_index) const {
+    const size_t gx = group_index % frame_dim.xsize_groups;
+    const size_t gy = group_index / frame_dim.xsize_groups;
+    const Rect rect(gx * (frame_dim.group_dim >> 3),
+                    gy * (frame_dim.group_dim >> 3), frame_dim.group_dim >> 3,
+                    frame_dim.group_dim >> 3, frame_dim.xsize_blocks,
+                    frame_dim.ysize_blocks);
+    return rect;
+  }
+
+  Rect DCGroupRect(size_t group_index) const {
+    const size_t gx = group_index % frame_dim.xsize_dc_groups;
+    const size_t gy = group_index / frame_dim.xsize_dc_groups;
+    const Rect rect(gx * frame_dim.group_dim, gy * frame_dim.group_dim,
+                    frame_dim.group_dim, frame_dim.group_dim,
+                    frame_dim.xsize_blocks, frame_dim.ysize_blocks);
+    return rect;
+  }
+};
+
+// Initialized the state information that is shared between encoder and decoder.
+Status InitializePassesSharedState(const FrameHeader& frame_header,
+                                   PassesSharedState* JXL_RESTRICT shared,
+                                   bool encoder = false);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_PASSES_STATE_H_
diff --git a/third_party/jpeg-xl/lib/jxl/passes_test.cc b/third_party/jpeg-xl/lib/jxl/passes_test.cc
new file mode 100644
index 0000000000..97d776f941
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/passes_test.cc
@@ -0,0 +1,402 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <stddef.h>
+
+#include <future>
+#include <string>
+#include <utility>
+
+#include "lib/extras/codec.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/override.h"
+#include "lib/jxl/base/padded_bytes.h"
+#include "lib/jxl/color_encoding_internal.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/enc_aux_out.h"
+#include "lib/jxl/enc_butteraugli_comparator.h"
+#include "lib/jxl/enc_cache.h"
+#include "lib/jxl/enc_color_management.h"
+#include "lib/jxl/enc_file.h"
+#include "lib/jxl/enc_params.h"
+#include "lib/jxl/image_bundle.h"
+#include "lib/jxl/image_ops.h"
+#include "lib/jxl/test_utils.h"
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+
+using test::Roundtrip;
+using test::ThreadPoolForTests;
+
+namespace {
+
+TEST(PassesTest, RoundtripSmallPasses) {
+  const PaddedBytes orig = jxl::test::ReadTestData(
+      "external/wesaturate/500px/u76c0g_bliznaca_srgb8.png");
+  CodecInOut io;
+  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io));
+  io.ShrinkTo(io.xsize() / 8, io.ysize() / 8);
+
+  CompressParams cparams;
+  cparams.butteraugli_distance = 1.0;
+  cparams.progressive_mode = true;
+
+  CodecInOut io2;
+  JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io2, _));
+  EXPECT_THAT(
+      ButteraugliDistance(io.frames, io2.frames, cparams.ba_params, GetJxlCms(),
+                          /*distmap=*/nullptr),
+      IsSlightlyBelow(1.1));
+}
+
+TEST(PassesTest, RoundtripUnalignedPasses) {
+  const PaddedBytes orig = jxl::test::ReadTestData(
+      "external/wesaturate/500px/u76c0g_bliznaca_srgb8.png");
+  CodecInOut io;
+  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io));
+  io.ShrinkTo(io.xsize() / 12, io.ysize() / 7);
+
+  CompressParams cparams;
+  cparams.butteraugli_distance = 2.0;
+  cparams.progressive_mode = true;
+
+  CodecInOut io2;
+  JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io2, _));
+  EXPECT_THAT(
+      ButteraugliDistance(io.frames, io2.frames, cparams.ba_params, GetJxlCms(),
+                          /*distmap=*/nullptr),
+      IsSlightlyBelow(1.72));
+}
+
+TEST(PassesTest, RoundtripMultiGroupPasses) {
+  const PaddedBytes orig = jxl::test::ReadTestData("jxl/flower/flower.png");
+  CodecInOut io;
+  {
+    ThreadPoolForTests pool(4);
+    ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io, &pool));
+  }
+  io.ShrinkTo(600, 1024);  // partial X, full Y group
+
+  auto test = [&](float target_distance, float threshold) {
+    ThreadPoolForTests pool(4);
+    CompressParams cparams;
+    cparams.butteraugli_distance = target_distance;
+    cparams.progressive_mode = true;
+    CodecInOut io2;
+    JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io2, _,
+                            /* compressed_size */ nullptr, &pool));
+    EXPECT_THAT(ButteraugliDistance(io.frames, io2.frames, cparams.ba_params,
+                                    GetJxlCms(),
+                                    /*distmap=*/nullptr, &pool),
+                IsSlightlyBelow(target_distance + threshold));
+  };
+
+  auto run1 = std::async(std::launch::async, test, 1.0f, 0.5f);
+  auto run2 = std::async(std::launch::async, test, 2.0f, 0.5f);
+}
+
+TEST(PassesTest, RoundtripLargeFastPasses) {
+  ThreadPoolForTests pool(8);
+  const PaddedBytes orig = jxl::test::ReadTestData("jxl/flower/flower.png");
+  CodecInOut io;
+  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io, &pool));
+
+  CompressParams cparams;
+  cparams.speed_tier = SpeedTier::kSquirrel;
+  cparams.progressive_mode = true;
+
+  CodecInOut io2;
+  JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io2, _,
+                          /* comrpessed_size */ nullptr, &pool));
+}
+
+// Checks for differing size/distance in two consecutive runs of distance 2,
+// which involves additional processing including adaptive reconstruction.
+// Failing this may be a sign of race conditions or invalid memory accesses.
+TEST(PassesTest, RoundtripProgressiveConsistent) {
+  ThreadPoolForTests pool(8);
+  const PaddedBytes orig = jxl::test::ReadTestData("jxl/flower/flower.png");
+  CodecInOut io;
+  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io, &pool));
+
+  CompressParams cparams;
+  cparams.speed_tier = SpeedTier::kSquirrel;
+  cparams.progressive_mode = true;
+  cparams.butteraugli_distance = 2.0;
+
+  // Try each xsize mod kBlockDim to verify right border handling.
+  for (size_t xsize = 48; xsize > 40; --xsize) {
+    io.ShrinkTo(xsize, 15);
+
+    CodecInOut io2;
+    size_t size2;
+    JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io2, _, &size2, &pool));
+
+    CodecInOut io3;
+    size_t size3;
+    JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io3, _, &size3, &pool));
+
+    // Exact same compressed size.
+    EXPECT_EQ(size2, size3);
+
+    // Exact same distance.
+    const float dist2 = ButteraugliDistance(io.frames, io2.frames,
+                                            cparams.ba_params, GetJxlCms(),
+                                            /*distmap=*/nullptr, &pool);
+    const float dist3 = ButteraugliDistance(io.frames, io3.frames,
+                                            cparams.ba_params, GetJxlCms(),
+                                            /*distmap=*/nullptr, &pool);
+    EXPECT_EQ(dist2, dist3);
+  }
+}
+
+TEST(PassesTest, AllDownsampleFeasible) {
+  ThreadPoolForTests pool(8);
+  const PaddedBytes orig = jxl::test::ReadTestData(
+      "external/wesaturate/500px/u76c0g_bliznaca_srgb8.png");
+  CodecInOut io;
+  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io, &pool));
+
+  PaddedBytes compressed;
+  AuxOut aux;
+
+  CompressParams cparams;
+  cparams.speed_tier = SpeedTier::kSquirrel;
+  cparams.progressive_mode = true;
+  cparams.butteraugli_distance = 1.0;
+  PassesEncoderState enc_state;
+  ASSERT_TRUE(EncodeFile(cparams, &io, &enc_state, &compressed, GetJxlCms(),
+                         &aux, &pool));
+
+  EXPECT_LE(compressed.size(), 240000u);
+  float target_butteraugli[9] = {};
+  target_butteraugli[1] = 2.5f;
+  target_butteraugli[2] = 16.0f;
+  target_butteraugli[4] = 20.0f;
+  target_butteraugli[8] = 80.0f;
+
+  // The default progressive encoding scheme should make all these downsampling
+  // factors achievable.
+  // TODO(veluca): re-enable downsampling 16.
+  std::vector<size_t> downsamplings = {1, 2, 4, 8};  //, 16};
+
+  auto check = [&](const uint32_t task, size_t /* thread */) -> void {
+    const size_t downsampling = downsamplings[task];
+    extras::JXLDecompressParams dparams;
+    dparams.max_downsampling = downsampling;
+    CodecInOut output;
+    ASSERT_TRUE(
+        test::DecodeFile(dparams, Span<const uint8_t>(compressed), &output));
+    EXPECT_EQ(output.xsize(), io.xsize()) << "downsampling = " << downsampling;
+    EXPECT_EQ(output.ysize(), io.ysize()) << "downsampling = " << downsampling;
+    EXPECT_LE(ButteraugliDistance(io.frames, output.frames, cparams.ba_params,
+                                  GetJxlCms(),
+                                  /*distmap=*/nullptr, nullptr),
+              target_butteraugli[downsampling])
+        << "downsampling: " << downsampling;
+  };
+  EXPECT_TRUE(RunOnPool(&pool, 0, downsamplings.size(), ThreadPool::NoInit,
+                        check, "TestDownsampling"));
+}
+
+TEST(PassesTest, AllDownsampleFeasibleQProgressive) {
+  ThreadPoolForTests pool(8);
+  const PaddedBytes orig = jxl::test::ReadTestData(
+      "external/wesaturate/500px/u76c0g_bliznaca_srgb8.png");
+  CodecInOut io;
+  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io, &pool));
+
+  PaddedBytes compressed;
+  AuxOut aux;
+
+  CompressParams cparams;
+  cparams.speed_tier = SpeedTier::kSquirrel;
+  cparams.qprogressive_mode = true;
+  cparams.butteraugli_distance = 1.0;
+  PassesEncoderState enc_state;
+  ASSERT_TRUE(EncodeFile(cparams, &io, &enc_state, &compressed, GetJxlCms(),
+                         &aux, &pool));
+
+  EXPECT_LE(compressed.size(), 220000u);
+
+  float target_butteraugli[9] = {};
+  target_butteraugli[1] = 3.0f;
+  target_butteraugli[2] = 6.0f;
+  target_butteraugli[4] = 10.0f;
+  target_butteraugli[8] = 80.0f;
+
+  // The default progressive encoding scheme should make all these downsampling
+  // factors achievable.
+  std::vector<size_t> downsamplings = {1, 2, 4, 8};
+
+  auto check = [&](const uint32_t task, size_t /* thread */) -> void {
+    const size_t downsampling = downsamplings[task];
+    extras::JXLDecompressParams dparams;
+    dparams.max_downsampling = downsampling;
+    CodecInOut output;
+    ASSERT_TRUE(
+        test::DecodeFile(dparams, Span<const uint8_t>(compressed), &output));
+    EXPECT_EQ(output.xsize(), io.xsize()) << "downsampling = " << downsampling;
+    EXPECT_EQ(output.ysize(), io.ysize()) << "downsampling = " << downsampling;
+    EXPECT_LE(ButteraugliDistance(io.frames, output.frames, cparams.ba_params,
+                                  GetJxlCms(),
+                                  /*distmap=*/nullptr),
+              target_butteraugli[downsampling])
+        << "downsampling: " << downsampling;
+  };
+  EXPECT_TRUE(RunOnPool(&pool, 0, downsamplings.size(), ThreadPool::NoInit,
+                        check, "TestQProgressive"));
+}
+
+TEST(PassesTest, ProgressiveDownsample2DegradesCorrectlyGrayscale) {
+  ThreadPoolForTests pool(8);
+  const PaddedBytes orig = jxl::test::ReadTestData(
+      "external/wesaturate/500px/cvo9xd_keong_macan_grayscale.png");
+  CodecInOut io_orig;
+  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io_orig, &pool));
+  Rect rect(0, 0, io_orig.xsize(), 128);
+  // need 2 DC groups for the DC frame to actually be progressive.
+  Image3F large(4242, rect.ysize());
+  ZeroFillImage(&large);
+  CopyImageTo(rect, *io_orig.Main().color(), rect, &large);
+  CodecInOut io;
+  io.metadata = io_orig.metadata;
+  io.SetFromImage(std::move(large), io_orig.Main().c_current());
+
+  PaddedBytes compressed;
+  AuxOut aux;
+
+  CompressParams cparams;
+  cparams.speed_tier = SpeedTier::kSquirrel;
+  cparams.progressive_dc = 1;
+  cparams.responsive = true;
+  cparams.qprogressive_mode = true;
+  cparams.butteraugli_distance = 1.0;
+  PassesEncoderState enc_state;
+  ASSERT_TRUE(EncodeFile(cparams, &io, &enc_state, &compressed, GetJxlCms(),
+                         &aux, &pool));
+
+  EXPECT_LE(compressed.size(), 10000u);
+
+  extras::JXLDecompressParams dparams;
+  dparams.max_downsampling = 1;
+  CodecInOut output;
+  ASSERT_TRUE(
+      test::DecodeFile(dparams, Span<const uint8_t>(compressed), &output));
+
+  dparams.max_downsampling = 2;
+  CodecInOut output_d2;
+  ASSERT_TRUE(
+      test::DecodeFile(dparams, Span<const uint8_t>(compressed), &output_d2));
+
+  // 0 if reading all the passes, ~15 if skipping the 8x pass.
+  float butteraugli_distance_down2_full = ButteraugliDistance(
+      output.frames, output_d2.frames, cparams.ba_params, GetJxlCms(),
+      /*distmap=*/nullptr);
+
+  EXPECT_LE(butteraugli_distance_down2_full, 3.2f);
+  EXPECT_GE(butteraugli_distance_down2_full, 1.0f);
+}
+
+TEST(PassesTest, ProgressiveDownsample2DegradesCorrectly) {
+  ThreadPoolForTests pool(8);
+  const PaddedBytes orig = jxl::test::ReadTestData("jxl/flower/flower.png");
+  CodecInOut io_orig;
+  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io_orig, &pool));
+  Rect rect(0, 0, io_orig.xsize(), 128);
+  // need 2 DC groups for the DC frame to actually be progressive.
+  Image3F large(4242, rect.ysize());
+  ZeroFillImage(&large);
+  CopyImageTo(rect, *io_orig.Main().color(), rect, &large);
+  CodecInOut io;
+  io.SetFromImage(std::move(large), io_orig.Main().c_current());
+
+  PaddedBytes compressed;
+  AuxOut aux;
+
+  CompressParams cparams;
+  cparams.speed_tier = SpeedTier::kSquirrel;
+  cparams.progressive_dc = 1;
+  cparams.responsive = true;
+  cparams.qprogressive_mode = true;
+  cparams.butteraugli_distance = 1.0;
+  PassesEncoderState enc_state;
+  ASSERT_TRUE(EncodeFile(cparams, &io, &enc_state, &compressed, GetJxlCms(),
+                         &aux, &pool));
+
+  EXPECT_LE(compressed.size(), 220000u);
+
+  extras::JXLDecompressParams dparams;
+  dparams.max_downsampling = 1;
+  CodecInOut output;
+  ASSERT_TRUE(
+      test::DecodeFile(dparams, Span<const uint8_t>(compressed), &output));
+
+  dparams.max_downsampling = 2;
+  CodecInOut output_d2;
+  ASSERT_TRUE(
+      test::DecodeFile(dparams, Span<const uint8_t>(compressed), &output_d2));
+
+  // 0 if reading all the passes, ~15 if skipping the 8x pass.
+  float butteraugli_distance_down2_full = ButteraugliDistance(
+      output.frames, output_d2.frames, cparams.ba_params, GetJxlCms(),
+      /*distmap=*/nullptr);
+
+  EXPECT_LE(butteraugli_distance_down2_full, 3.0f);
+  EXPECT_GE(butteraugli_distance_down2_full, 1.0f);
+}
+
+TEST(PassesTest, NonProgressiveDCImage) {
+  ThreadPoolForTests pool(8);
+  const PaddedBytes orig = jxl::test::ReadTestData("jxl/flower/flower.png");
+  CodecInOut io;
+  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io, &pool));
+
+  PaddedBytes compressed;
+  AuxOut aux;
+
+  CompressParams cparams;
+  cparams.speed_tier = SpeedTier::kSquirrel;
+  cparams.progressive_mode = false;
+  cparams.butteraugli_distance = 2.0;
+  PassesEncoderState enc_state;
+  ASSERT_TRUE(EncodeFile(cparams, &io, &enc_state, &compressed, GetJxlCms(),
+                         &aux, &pool));
+
+  // Even in non-progressive mode, it should be possible to return a DC-only
+  // image.
+  extras::JXLDecompressParams dparams;
+  dparams.max_downsampling = 100;
+  CodecInOut output;
+  ASSERT_TRUE(test::DecodeFile(dparams, Span<const uint8_t>(compressed),
+                               &output, &pool));
+  EXPECT_EQ(output.xsize(), io.xsize());
+  EXPECT_EQ(output.ysize(), io.ysize());
+}
+
+TEST(PassesTest, RoundtripSmallNoGaborishPasses) {
+  const PaddedBytes orig = jxl::test::ReadTestData(
+      "external/wesaturate/500px/u76c0g_bliznaca_srgb8.png");
+  CodecInOut io;
+  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io));
+  io.ShrinkTo(io.xsize() / 8, io.ysize() / 8);
+
+  CompressParams cparams;
+  cparams.gaborish = Override::kOff;
+  cparams.butteraugli_distance = 1.0;
+  cparams.progressive_mode = true;
+
+  CodecInOut io2;
+  JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io2, _));
+  EXPECT_THAT(
+      ButteraugliDistance(io.frames, io2.frames, cparams.ba_params, GetJxlCms(),
+                          /*distmap=*/nullptr),
+      IsSlightlyBelow(1.2));
+}
+
+}  // namespace
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/patch_dictionary_internal.h b/third_party/jpeg-xl/lib/jxl/patch_dictionary_internal.h
new file mode 100644
index 0000000000..e4172f6db6
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/patch_dictionary_internal.h
@@ -0,0 +1,31 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_PATCH_DICTIONARY_INTERNAL_H_
+#define LIB_JXL_PATCH_DICTIONARY_INTERNAL_H_
+
+#include "lib/jxl/dec_patch_dictionary.h"
+#include "lib/jxl/passes_state.h"  // for PassesSharedState
+
+namespace jxl {
+
+// Context numbers as specified in Section C.4.5, Listing C.2:
+enum Contexts {
+  kNumRefPatchContext = 0,
+  kReferenceFrameContext = 1,
+  kPatchSizeContext = 2,
+  kPatchReferencePositionContext = 3,
+  kPatchPositionContext = 4,
+  kPatchBlendModeContext = 5,
+  kPatchOffsetContext = 6,
+  kPatchCountContext = 7,
+  kPatchAlphaChannelContext = 8,
+  kPatchClampContext = 9,
+  kNumPatchDictionaryContexts
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_PATCH_DICTIONARY_INTERNAL_H_
diff --git a/third_party/jpeg-xl/lib/jxl/patch_dictionary_test.cc b/third_party/jpeg-xl/lib/jxl/patch_dictionary_test.cc
new file mode 100644
index 0000000000..5cc6c13a9e
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/patch_dictionary_test.cc
@@ -0,0 +1,58 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/extras/codec.h"
+#include "lib/jxl/enc_butteraugli_comparator.h"
+#include "lib/jxl/enc_color_management.h"
+#include "lib/jxl/enc_params.h"
+#include "lib/jxl/image_test_utils.h"
+#include "lib/jxl/test_utils.h"
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+namespace {
+
+using ::jxl::test::Roundtrip;
+
+TEST(PatchDictionaryTest, GrayscaleModular) {
+  const PaddedBytes orig = jxl::test::ReadTestData("jxl/grayscale_patches.png");
+  CodecInOut io;
+  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io));
+
+  CompressParams cparams;
+  cparams.SetLossless();
+  cparams.patches = jxl::Override::kOn;
+
+  CodecInOut io2;
+  // Without patches: ~25k
+  size_t compressed_size;
+  JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io2, _, &compressed_size));
+  EXPECT_LE(compressed_size, 8000u);
+  JXL_ASSERT_OK(VerifyRelativeError(*io.Main().color(), *io2.Main().color(),
+                                    1e-7f, 0, _));
+}
+
+TEST(PatchDictionaryTest, GrayscaleVarDCT) {
+  const PaddedBytes orig = jxl::test::ReadTestData("jxl/grayscale_patches.png");
+  CodecInOut io;
+  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io));
+
+  CompressParams cparams;
+  cparams.patches = jxl::Override::kOn;
+
+  CodecInOut io2;
+  // Without patches: ~47k
+  size_t compressed_size;
+  JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io2, _, &compressed_size));
+  EXPECT_LE(compressed_size, 14000u);
+  // Without patches: ~1.2
+  EXPECT_LE(
+      ButteraugliDistance(io.frames, io2.frames, cparams.ba_params, GetJxlCms(),
+                          /*distmap=*/nullptr),
+      1.1);
+}
+
+}  // namespace
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/preview_test.cc b/third_party/jpeg-xl/lib/jxl/preview_test.cc
new file mode 100644
index 0000000000..9d4603ca70
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/preview_test.cc
@@ -0,0 +1,68 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <stddef.h>
+
+#include <string>
+
+#include "lib/extras/codec.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/override.h"
+#include "lib/jxl/base/padded_bytes.h"
+#include "lib/jxl/codec_in_out.h"
+#include "lib/jxl/color_encoding_internal.h"
+#include "lib/jxl/enc_butteraugli_comparator.h"
+#include "lib/jxl/enc_cache.h"
+#include "lib/jxl/enc_color_management.h"
+#include "lib/jxl/enc_file.h"
+#include "lib/jxl/enc_params.h"
+#include "lib/jxl/headers.h"
+#include "lib/jxl/image_bundle.h"
+#include "lib/jxl/test_utils.h"
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+namespace {
+using test::Roundtrip;
+
+TEST(PreviewTest, RoundtripGivenPreview) {
+  const PaddedBytes orig = jxl::test::ReadTestData(
+      "external/wesaturate/500px/u76c0g_bliznaca_srgb8.png");
+  CodecInOut io;
+  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io));
+  io.ShrinkTo(io.xsize() / 8, io.ysize() / 8);
+  // Same as main image
+  io.preview_frame = io.Main().Copy();
+  const size_t preview_xsize = 15;
+  const size_t preview_ysize = 27;
+  io.preview_frame.ShrinkTo(preview_xsize, preview_ysize);
+  io.metadata.m.have_preview = true;
+  ASSERT_TRUE(io.metadata.m.preview_size.Set(io.preview_frame.xsize(),
+                                             io.preview_frame.ysize()));
+
+  CompressParams cparams;
+  cparams.butteraugli_distance = 2.0;
+  cparams.speed_tier = SpeedTier::kSquirrel;
+
+  CodecInOut io2;
+  JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io2, _));
+  EXPECT_EQ(preview_xsize, io2.metadata.m.preview_size.xsize());
+  EXPECT_EQ(preview_ysize, io2.metadata.m.preview_size.ysize());
+  EXPECT_EQ(preview_xsize, io2.preview_frame.xsize());
+  EXPECT_EQ(preview_ysize, io2.preview_frame.ysize());
+
+  EXPECT_LE(ButteraugliDistance(io.preview_frame, io2.preview_frame,
+                                cparams.ba_params, GetJxlCms(),
+                                /*distmap=*/nullptr),
+            2.5);
+  EXPECT_LE(
+      ButteraugliDistance(io.Main(), io2.Main(), cparams.ba_params, GetJxlCms(),
+                          /*distmap=*/nullptr),
+      2.5);
+}
+
+}  // namespace
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/quant_weights.cc b/third_party/jpeg-xl/lib/jxl/quant_weights.cc
new file mode 100644
index 0000000000..5e3f3424aa
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/quant_weights.cc
@@ -0,0 +1,1239 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+#include "lib/jxl/quant_weights.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <algorithm>
+#include <cmath>
+#include <limits>
+#include <utility>
+
+#include "lib/jxl/base/bits.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/dct_scales.h"
+#include "lib/jxl/dec_modular.h"
+#include "lib/jxl/fields.h"
+#include "lib/jxl/image.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/quant_weights.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/fast_math-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Lt;
+using hwy::HWY_NAMESPACE::MulAdd;
+using hwy::HWY_NAMESPACE::Sqrt;
+
+// kQuantWeights[N * N * c + N * y + x] is the relative weight of the (x, y)
+// coefficient in component c. Higher weights correspond to finer quantization
+// intervals and more bits spent in encoding.
+
+static constexpr const float kAlmostZero = 1e-8f;
+
+void GetQuantWeightsDCT2(const QuantEncoding::DCT2Weights& dct2weights,
+                         float* weights) {
+  for (size_t c = 0; c < 3; c++) {
+    size_t start = c * 64;
+    weights[start] = 0xBAD;
+    weights[start + 1] = weights[start + 8] = dct2weights[c][0];
+    weights[start + 9] = dct2weights[c][1];
+    for (size_t y = 0; y < 2; y++) {
+      for (size_t x = 0; x < 2; x++) {
+        weights[start + y * 8 + x + 2] = dct2weights[c][2];
+        weights[start + (y + 2) * 8 + x] = dct2weights[c][2];
+      }
+    }
+    for (size_t y = 0; y < 2; y++) {
+      for (size_t x = 0; x < 2; x++) {
+        weights[start + (y + 2) * 8 + x + 2] = dct2weights[c][3];
+      }
+    }
+    for (size_t y = 0; y < 4; y++) {
+      for (size_t x = 0; x < 4; x++) {
+        weights[start + y * 8 + x + 4] = dct2weights[c][4];
+        weights[start + (y + 4) * 8 + x] = dct2weights[c][4];
+      }
+    }
+    for (size_t y = 0; y < 4; y++) {
+      for (size_t x = 0; x < 4; x++) {
+        weights[start + (y + 4) * 8 + x + 4] = dct2weights[c][5];
+      }
+    }
+  }
+}
+
+void GetQuantWeightsIdentity(const QuantEncoding::IdWeights& idweights,
+                             float* weights) {
+  for (size_t c = 0; c < 3; c++) {
+    for (int i = 0; i < 64; i++) {
+      weights[64 * c + i] = idweights[c][0];
+    }
+    weights[64 * c + 1] = idweights[c][1];
+    weights[64 * c + 8] = idweights[c][1];
+    weights[64 * c + 9] = idweights[c][2];
+  }
+}
+
+float Interpolate(float pos, float max, const float* array, size_t len) {
+  float scaled_pos = pos * (len - 1) / max;
+  size_t idx = scaled_pos;
+  JXL_DASSERT(idx + 1 < len);
+  float a = array[idx];
+  float b = array[idx + 1];
+  return a * FastPowf(b / a, scaled_pos - idx);
+}
+
+float Mult(float v) {
+  if (v > 0.0f) return 1.0f + v;
+  return 1.0f / (1.0f - v);
+}
+
+using DF4 = HWY_CAPPED(float, 4);
+
+hwy::HWY_NAMESPACE::Vec<DF4> InterpolateVec(
+    hwy::HWY_NAMESPACE::Vec<DF4> scaled_pos, const float* array) {
+  HWY_CAPPED(int32_t, 4) di;
+
+  auto idx = ConvertTo(di, scaled_pos);
+
+  auto frac = Sub(scaled_pos, ConvertTo(DF4(), idx));
+
+  // TODO(veluca): in theory, this could be done with 8 TableLookupBytes, but
+  // it's probably slower.
+  auto a = GatherIndex(DF4(), array, idx);
+  auto b = GatherIndex(DF4(), array + 1, idx);
+
+  return Mul(a, FastPowf(DF4(), Div(b, a), frac));
+}
+
+// Computes quant weights for a COLS*ROWS-sized transform, using num_bands
+// eccentricity bands and num_ebands eccentricity bands. If print_mode is 1,
+// prints the resulting matrix; if print_mode is 2, prints the matrix in a
+// format suitable for a 3d plot with gnuplot.
+Status GetQuantWeights(
+    size_t ROWS, size_t COLS,
+    const DctQuantWeightParams::DistanceBandsArray& distance_bands,
+    size_t num_bands, float* out) {
+  for (size_t c = 0; c < 3; c++) {
+    float bands[DctQuantWeightParams::kMaxDistanceBands] = {
+        distance_bands[c][0]};
+    if (bands[0] < kAlmostZero) return JXL_FAILURE("Invalid distance bands");
+    for (size_t i = 1; i < num_bands; i++) {
+      bands[i] = bands[i - 1] * Mult(distance_bands[c][i]);
+      if (bands[i] < kAlmostZero) return JXL_FAILURE("Invalid distance bands");
+    }
+    float scale = (num_bands - 1) / (kSqrt2 + 1e-6f);
+    float rcpcol = scale / (COLS - 1);
+    float rcprow = scale / (ROWS - 1);
+    JXL_ASSERT(COLS >= Lanes(DF4()));
+    HWY_ALIGN float l0123[4] = {0, 1, 2, 3};
+    for (uint32_t y = 0; y < ROWS; y++) {
+      float dy = y * rcprow;
+      float dy2 = dy * dy;
+      for (uint32_t x = 0; x < COLS; x += Lanes(DF4())) {
+        auto dx =
+            Mul(Add(Set(DF4(), x), Load(DF4(), l0123)), Set(DF4(), rcpcol));
+        auto scaled_distance = Sqrt(MulAdd(dx, dx, Set(DF4(), dy2)));
+        auto weight = num_bands == 1 ? Set(DF4(), bands[0])
+                                     : InterpolateVec(scaled_distance, bands);
+        StoreU(weight, DF4(), out + c * COLS * ROWS + y * COLS + x);
+      }
+    }
+  }
+  return true;
+}
+
+// TODO(veluca): SIMD-fy. With 256x256, this is actually slow.
+Status ComputeQuantTable(const QuantEncoding& encoding,
+                         float* JXL_RESTRICT table,
+                         float* JXL_RESTRICT inv_table, size_t table_num,
+                         DequantMatrices::QuantTable kind, size_t* pos) {
+  constexpr size_t N = kBlockDim;
+  size_t wrows = 8 * DequantMatrices::required_size_x[kind],
+         wcols = 8 * DequantMatrices::required_size_y[kind];
+  size_t num = wrows * wcols;
+
+  std::vector<float> weights(3 * num);
+
+  switch (encoding.mode) {
+    case QuantEncoding::kQuantModeLibrary: {
+      // Library and copy quant encoding should get replaced by the actual
+      // parameters by the caller.
+      JXL_ASSERT(false);
+      break;
+    }
+    case QuantEncoding::kQuantModeID: {
+      JXL_ASSERT(num == kDCTBlockSize);
+      GetQuantWeightsIdentity(encoding.idweights, weights.data());
+      break;
+    }
+    case QuantEncoding::kQuantModeDCT2: {
+      JXL_ASSERT(num == kDCTBlockSize);
+      GetQuantWeightsDCT2(encoding.dct2weights, weights.data());
+      break;
+    }
+    case QuantEncoding::kQuantModeDCT4: {
+      JXL_ASSERT(num == kDCTBlockSize);
+      float weights4x4[3 * 4 * 4];
+      // Always use 4x4 GetQuantWeights for DCT4 quantization tables.
+      JXL_RETURN_IF_ERROR(
+          GetQuantWeights(4, 4, encoding.dct_params.distance_bands,
+                          encoding.dct_params.num_distance_bands, weights4x4));
+      for (size_t c = 0; c < 3; c++) {
+        for (size_t y = 0; y < kBlockDim; y++) {
+          for (size_t x = 0; x < kBlockDim; x++) {
+            weights[c * num + y * kBlockDim + x] =
+                weights4x4[c * 16 + (y / 2) * 4 + (x / 2)];
+          }
+        }
+        weights[c * num + 1] /= encoding.dct4multipliers[c][0];
+        weights[c * num + N] /= encoding.dct4multipliers[c][0];
+        weights[c * num + N + 1] /= encoding.dct4multipliers[c][1];
+      }
+      break;
+    }
+    case QuantEncoding::kQuantModeDCT4X8: {
+      JXL_ASSERT(num == kDCTBlockSize);
+      float weights4x8[3 * 4 * 8];
+      // Always use 4x8 GetQuantWeights for DCT4X8 quantization tables.
+      JXL_RETURN_IF_ERROR(
+          GetQuantWeights(4, 8, encoding.dct_params.distance_bands,
+                          encoding.dct_params.num_distance_bands, weights4x8));
+      for (size_t c = 0; c < 3; c++) {
+        for (size_t y = 0; y < kBlockDim; y++) {
+          for (size_t x = 0; x < kBlockDim; x++) {
+            weights[c * num + y * kBlockDim + x] =
+                weights4x8[c * 32 + (y / 2) * 8 + x];
+          }
+        }
+        weights[c * num + N] /= encoding.dct4x8multipliers[c];
+      }
+      break;
+    }
+    case QuantEncoding::kQuantModeDCT: {
+      JXL_RETURN_IF_ERROR(GetQuantWeights(
+          wrows, wcols, encoding.dct_params.distance_bands,
+          encoding.dct_params.num_distance_bands, weights.data()));
+      break;
+    }
+    case QuantEncoding::kQuantModeRAW: {
+      if (!encoding.qraw.qtable || encoding.qraw.qtable->size() != 3 * num) {
+        return JXL_FAILURE("Invalid table encoding");
+      }
+      for (size_t i = 0; i < 3 * num; i++) {
+        weights[i] =
+            1.f / (encoding.qraw.qtable_den * (*encoding.qraw.qtable)[i]);
+      }
+      break;
+    }
+    case QuantEncoding::kQuantModeAFV: {
+      constexpr float kFreqs[] = {
+          0xBAD,
+          0xBAD,
+          0.8517778890324296,
+          5.37778436506804,
+          0xBAD,
+          0xBAD,
+          4.734747904497923,
+          5.449245381693219,
+          1.6598270267479331,
+          4,
+          7.275749096817861,
+          10.423227632456525,
+          2.662932286148962,
+          7.630657783650829,
+          8.962388608184032,
+          12.97166202570235,
+      };
+
+      float weights4x8[3 * 4 * 8];
+      JXL_RETURN_IF_ERROR((
+          GetQuantWeights(4, 8, encoding.dct_params.distance_bands,
+                          encoding.dct_params.num_distance_bands, weights4x8)));
+      float weights4x4[3 * 4 * 4];
+      JXL_RETURN_IF_ERROR((GetQuantWeights(
+          4, 4, encoding.dct_params_afv_4x4.distance_bands,
+          encoding.dct_params_afv_4x4.num_distance_bands, weights4x4)));
+
+      constexpr float lo = 0.8517778890324296;
+      constexpr float hi = 12.97166202570235f - lo + 1e-6f;
+      for (size_t c = 0; c < 3; c++) {
+        float bands[4];
+        bands[0] = encoding.afv_weights[c][5];
+        if (bands[0] < kAlmostZero) return JXL_FAILURE("Invalid AFV bands");
+        for (size_t i = 1; i < 4; i++) {
+          bands[i] = bands[i - 1] * Mult(encoding.afv_weights[c][i + 5]);
+          if (bands[i] < kAlmostZero) return JXL_FAILURE("Invalid AFV bands");
+        }
+        size_t start = c * 64;
+        auto set_weight = [&start, &weights](size_t x, size_t y, float val) {
+          weights[start + y * 8 + x] = val;
+        };
+        weights[start] = 1;  // Not used, but causes MSAN error otherwise.
+        // Weights for (0, 1) and (1, 0).
+        set_weight(0, 1, encoding.afv_weights[c][0]);
+        set_weight(1, 0, encoding.afv_weights[c][1]);
+        // AFV special weights for 3-pixel corner.
+        set_weight(0, 2, encoding.afv_weights[c][2]);
+        set_weight(2, 0, encoding.afv_weights[c][3]);
+        set_weight(2, 2, encoding.afv_weights[c][4]);
+
+        // All other AFV weights.
+        for (size_t y = 0; y < 4; y++) {
+          for (size_t x = 0; x < 4; x++) {
+            if (x < 2 && y < 2) continue;
+            float val = Interpolate(kFreqs[y * 4 + x] - lo, hi, bands, 4);
+            set_weight(2 * x, 2 * y, val);
+          }
+        }
+
+        // Put 4x8 weights in odd rows, except (1, 0).
+        for (size_t y = 0; y < kBlockDim / 2; y++) {
+          for (size_t x = 0; x < kBlockDim; x++) {
+            if (x == 0 && y == 0) continue;
+            weights[c * num + (2 * y + 1) * kBlockDim + x] =
+                weights4x8[c * 32 + y * 8 + x];
+          }
+        }
+        // Put 4x4 weights in even rows / odd columns, except (0, 1).
+        for (size_t y = 0; y < kBlockDim / 2; y++) {
+          for (size_t x = 0; x < kBlockDim / 2; x++) {
+            if (x == 0 && y == 0) continue;
+            weights[c * num + (2 * y) * kBlockDim + 2 * x + 1] =
+                weights4x4[c * 16 + y * 4 + x];
+          }
+        }
+      }
+      break;
+    }
+  }
+  size_t prev_pos = *pos;
+  HWY_CAPPED(float, 64) d;
+  for (size_t i = 0; i < num * 3; i += Lanes(d)) {
+    auto inv_val = LoadU(d, weights.data() + i);
+    if (JXL_UNLIKELY(!AllFalse(d, Ge(inv_val, Set(d, 1.0f / kAlmostZero))) ||
+                     !AllFalse(d, Lt(inv_val, Set(d, kAlmostZero))))) {
+      return JXL_FAILURE("Invalid quantization table");
+    }
+    auto val = Div(Set(d, 1.0f), inv_val);
+    StoreU(val, d, table + *pos + i);
+    StoreU(inv_val, d, inv_table + *pos + i);
+  }
+  (*pos) += 3 * num;
+
+  // Ensure that the lowest frequencies have a 0 inverse table.
+  // This does not affect en/decoding, but allows AC strategy selection to be
+  // slightly simpler.
+  size_t xs = DequantMatrices::required_size_x[kind];
+  size_t ys = DequantMatrices::required_size_y[kind];
+  CoefficientLayout(&ys, &xs);
+  for (size_t c = 0; c < 3; c++) {
+    for (size_t y = 0; y < ys; y++) {
+      for (size_t x = 0; x < xs; x++) {
+        inv_table[prev_pos + c * ys * xs * kDCTBlockSize + y * kBlockDim * xs +
+                  x] = 0;
+      }
+    }
+  }
+  return true;
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace jxl {
+namespace {
+
+HWY_EXPORT(ComputeQuantTable);
+
+static constexpr const float kAlmostZero = 1e-8f;
+
+Status DecodeDctParams(BitReader* br, DctQuantWeightParams* params) {
+  params->num_distance_bands =
+      br->ReadFixedBits<DctQuantWeightParams::kLog2MaxDistanceBands>() + 1;
+  for (size_t c = 0; c < 3; c++) {
+    for (size_t i = 0; i < params->num_distance_bands; i++) {
+      JXL_RETURN_IF_ERROR(F16Coder::Read(br, &params->distance_bands[c][i]));
+    }
+    if (params->distance_bands[c][0] < kAlmostZero) {
+      return JXL_FAILURE("Distance band seed is too small");
+    }
+    params->distance_bands[c][0] *= 64.0f;
+  }
+  return true;
+}
+
+Status Decode(BitReader* br, QuantEncoding* encoding, size_t required_size_x,
+              size_t required_size_y, size_t idx,
+              ModularFrameDecoder* modular_frame_decoder) {
+  size_t required_size = required_size_x * required_size_y;
+  required_size_x *= kBlockDim;
+  required_size_y *= kBlockDim;
+  int mode = br->ReadFixedBits<kLog2NumQuantModes>();
+  switch (mode) {
+    case QuantEncoding::kQuantModeLibrary: {
+      encoding->predefined = br->ReadFixedBits<kCeilLog2NumPredefinedTables>();
+      if (encoding->predefined >= kNumPredefinedTables) {
+        return JXL_FAILURE("Invalid predefined table");
+      }
+      break;
+    }
+    case QuantEncoding::kQuantModeID: {
+      if (required_size != 1) return JXL_FAILURE("Invalid mode");
+      for (size_t c = 0; c < 3; c++) {
+        for (size_t i = 0; i < 3; i++) {
+          JXL_RETURN_IF_ERROR(F16Coder::Read(br, &encoding->idweights[c][i]));
+          if (std::abs(encoding->idweights[c][i]) < kAlmostZero) {
+            return JXL_FAILURE("ID Quantizer is too small");
+          }
+          encoding->idweights[c][i] *= 64;
+        }
+      }
+      break;
+    }
+    case QuantEncoding::kQuantModeDCT2: {
+      if (required_size != 1) return JXL_FAILURE("Invalid mode");
+      for (size_t c = 0; c < 3; c++) {
+        for (size_t i = 0; i < 6; i++) {
+          JXL_RETURN_IF_ERROR(F16Coder::Read(br, &encoding->dct2weights[c][i]));
+          if (std::abs(encoding->dct2weights[c][i]) < kAlmostZero) {
+            return JXL_FAILURE("Quantizer is too small");
+          }
+          encoding->dct2weights[c][i] *= 64;
+        }
+      }
+      break;
+    }
+    case QuantEncoding::kQuantModeDCT4X8: {
+      if (required_size != 1) return JXL_FAILURE("Invalid mode");
+      for (size_t c = 0; c < 3; c++) {
+        JXL_RETURN_IF_ERROR(
+            F16Coder::Read(br, &encoding->dct4x8multipliers[c]));
+        if (std::abs(encoding->dct4x8multipliers[c]) < kAlmostZero) {
+          return JXL_FAILURE("DCT4X8 multiplier is too small");
+        }
+      }
+      JXL_RETURN_IF_ERROR(DecodeDctParams(br, &encoding->dct_params));
+      break;
+    }
+    case QuantEncoding::kQuantModeDCT4: {
+      if (required_size != 1) return JXL_FAILURE("Invalid mode");
+      for (size_t c = 0; c < 3; c++) {
+        for (size_t i = 0; i < 2; i++) {
+          JXL_RETURN_IF_ERROR(
+              F16Coder::Read(br, &encoding->dct4multipliers[c][i]));
+          if (std::abs(encoding->dct4multipliers[c][i]) < kAlmostZero) {
+            return JXL_FAILURE("DCT4 multiplier is too small");
+          }
+        }
+      }
+      JXL_RETURN_IF_ERROR(DecodeDctParams(br, &encoding->dct_params));
+      break;
+    }
+    case QuantEncoding::kQuantModeAFV: {
+      if (required_size != 1) return JXL_FAILURE("Invalid mode");
+      for (size_t c = 0; c < 3; c++) {
+        for (size_t i = 0; i < 9; i++) {
+          JXL_RETURN_IF_ERROR(F16Coder::Read(br, &encoding->afv_weights[c][i]));
+        }
+        for (size_t i = 0; i < 6; i++) {
+          encoding->afv_weights[c][i] *= 64;
+        }
+      }
+      JXL_RETURN_IF_ERROR(DecodeDctParams(br, &encoding->dct_params));
+      JXL_RETURN_IF_ERROR(DecodeDctParams(br, &encoding->dct_params_afv_4x4));
+      break;
+    }
+    case QuantEncoding::kQuantModeDCT: {
+      JXL_RETURN_IF_ERROR(DecodeDctParams(br, &encoding->dct_params));
+      break;
+    }
+    case QuantEncoding::kQuantModeRAW: {
+      // Set mode early, to avoid mem-leak.
+      encoding->mode = QuantEncoding::kQuantModeRAW;
+      JXL_RETURN_IF_ERROR(ModularFrameDecoder::DecodeQuantTable(
+          required_size_x, required_size_y, br, encoding, idx,
+          modular_frame_decoder));
+      break;
+    }
+    default:
+      return JXL_FAILURE("Invalid quantization table encoding");
+  }
+  encoding->mode = QuantEncoding::Mode(mode);
+  return true;
+}
+
+}  // namespace
+
+// These definitions are needed before C++17.
+constexpr size_t DequantMatrices::required_size_[];
+constexpr size_t DequantMatrices::required_size_x[];
+constexpr size_t DequantMatrices::required_size_y[];
+constexpr DequantMatrices::QuantTable DequantMatrices::kQuantTable[];
+
+Status DequantMatrices::Decode(BitReader* br,
+                               ModularFrameDecoder* modular_frame_decoder) {
+  size_t all_default = br->ReadBits(1);
+  size_t num_tables = all_default ? 0 : static_cast<size_t>(kNum);
+  encodings_.clear();
+  encodings_.resize(kNum, QuantEncoding::Library(0));
+  for (size_t i = 0; i < num_tables; i++) {
+    JXL_RETURN_IF_ERROR(
+        jxl::Decode(br, &encodings_[i], required_size_x[i % kNum],
+                    required_size_y[i % kNum], i, modular_frame_decoder));
+  }
+  computed_mask_ = 0;
+  return true;
+}
+
+Status DequantMatrices::DecodeDC(BitReader* br) {
+  bool all_default = br->ReadBits(1);
+  if (!br->AllReadsWithinBounds()) return JXL_FAILURE("EOS during DecodeDC");
+  if (!all_default) {
+    for (size_t c = 0; c < 3; c++) {
+      JXL_RETURN_IF_ERROR(F16Coder::Read(br, &dc_quant_[c]));
+      dc_quant_[c] *= 1.0f / 128.0f;
+      // Negative values and nearly zero are invalid values.
+      if (dc_quant_[c] < kAlmostZero) {
+        return JXL_FAILURE("Invalid dc_quant: coefficient is too small.");
+      }
+      inv_dc_quant_[c] = 1.0f / dc_quant_[c];
+    }
+  }
+  return true;
+}
+
+constexpr float V(float v) { return static_cast<float>(v); }
+
+namespace {
+struct DequantMatricesLibraryDef {
+  // DCT8
+  static constexpr QuantEncodingInternal DCT() {
+    return QuantEncodingInternal::DCT(DctQuantWeightParams({{{{
+                                                                 V(3150.0),
+                                                                 V(0.0),
+                                                                 V(-0.4),
+                                                                 V(-0.4),
+                                                                 V(-0.4),
+                                                                 V(-2.0),
+                                                             }},
+                                                             {{
+                                                                 V(560.0),
+                                                                 V(0.0),
+                                                                 V(-0.3),
+                                                                 V(-0.3),
+                                                                 V(-0.3),
+                                                                 V(-0.3),
+                                                             }},
+                                                             {{
+                                                                 V(512.0),
+                                                                 V(-2.0),
+                                                                 V(-1.0),
+                                                                 V(0.0),
+                                                                 V(-1.0),
+                                                                 V(-2.0),
+                                                             }}}},
+                                                           6));
+  }
+
+  // Identity
+  static constexpr QuantEncodingInternal IDENTITY() {
+    return QuantEncodingInternal::Identity({{{{
+                                                 V(280.0),
+                                                 V(3160.0),
+                                                 V(3160.0),
+                                             }},
+                                             {{
+                                                 V(60.0),
+                                                 V(864.0),
+                                                 V(864.0),
+                                             }},
+                                             {{
+                                                 V(18.0),
+                                                 V(200.0),
+                                                 V(200.0),
+                                             }}}});
+  }
+
+  // DCT2
+  static constexpr QuantEncodingInternal DCT2X2() {
+    return QuantEncodingInternal::DCT2({{{{
+                                             V(3840.0),
+                                             V(2560.0),
+                                             V(1280.0),
+                                             V(640.0),
+                                             V(480.0),
+                                             V(300.0),
+                                         }},
+                                         {{
+                                             V(960.0),
+                                             V(640.0),
+                                             V(320.0),
+                                             V(180.0),
+                                             V(140.0),
+                                             V(120.0),
+                                         }},
+                                         {{
+                                             V(640.0),
+                                             V(320.0),
+                                             V(128.0),
+                                             V(64.0),
+                                             V(32.0),
+                                             V(16.0),
+                                         }}}});
+  }
+
+  // DCT4 (quant_kind 3)
+  static constexpr QuantEncodingInternal DCT4X4() {
+    return QuantEncodingInternal::DCT4(DctQuantWeightParams({{{{
+                                                                  V(2200.0),
+                                                                  V(0.0),
+                                                                  V(0.0),
+                                                                  V(0.0),
+                                                              }},
+                                                              {{
+                                                                  V(392.0),
+                                                                  V(0.0),
+                                                                  V(0.0),
+                                                                  V(0.0),
+                                                              }},
+                                                              {{
+                                                                  V(112.0),
+                                                                  V(-0.25),
+                                                                  V(-0.25),
+                                                                  V(-0.5),
+                                                              }}}},
+                                                            4),
+                                       /* kMul */
+                                       {{{{
+                                             V(1.0),
+                                             V(1.0),
+                                         }},
+                                         {{
+                                             V(1.0),
+                                             V(1.0),
+                                         }},
+                                         {{
+                                             V(1.0),
+                                             V(1.0),
+                                         }}}});
+  }
+
+  // DCT16
+  static constexpr QuantEncodingInternal DCT16X16() {
+    return QuantEncodingInternal::DCT(
+        DctQuantWeightParams({{{{
+                                   V(8996.8725711814115328),
+                                   V(-1.3000777393353804),
+                                   V(-0.49424529824571225),
+                                   V(-0.439093774457103443),
+                                   V(-0.6350101832695744),
+                                   V(-0.90177264050827612),
+                                   V(-1.6162099239887414),
+                               }},
+                               {{
+                                   V(3191.48366296844234752),
+                                   V(-0.67424582104194355),
+                                   V(-0.80745813428471001),
+                                   V(-0.44925837484843441),
+                                   V(-0.35865440981033403),
+                                   V(-0.31322389111877305),
+                                   V(-0.37615025315725483),
+                               }},
+                               {{
+                                   V(1157.50408145487200256),
+                                   V(-2.0531423165804414),
+                                   V(-1.4),
+                                   V(-0.50687130033378396),
+                                   V(-0.42708730624733904),
+                                   V(-1.4856834539296244),
+                                   V(-4.9209142884401604),
+                               }}}},
+                             7));
+  }
+
+  // DCT32
+  static constexpr QuantEncodingInternal DCT32X32() {
+    return QuantEncodingInternal::DCT(
+        DctQuantWeightParams({{{{
+                                   V(15718.40830982518931456),
+                                   V(-1.025),
+                                   V(-0.98),
+                                   V(-0.9012),
+                                   V(-0.4),
+                                   V(-0.48819395464),
+                                   V(-0.421064),
+                                   V(-0.27),
+                               }},
+                               {{
+                                   V(7305.7636810695983104),
+                                   V(-0.8041958212306401),
+                                   V(-0.7633036457487539),
+                                   V(-0.55660379990111464),
+                                   V(-0.49785304658857626),
+                                   V(-0.43699592683512467),
+                                   V(-0.40180866526242109),
+                                   V(-0.27321683125358037),
+                               }},
+                               {{
+                                   V(3803.53173721215041536),
+                                   V(-3.060733579805728),
+                                   V(-2.0413270132490346),
+                                   V(-2.0235650159727417),
+                                   V(-0.5495389509954993),
+                                   V(-0.4),
+                                   V(-0.4),
+                                   V(-0.3),
+                               }}}},
+                             8));
+  }
+
+  // DCT16X8
+  static constexpr QuantEncodingInternal DCT8X16() {
+    return QuantEncodingInternal::DCT(
+        DctQuantWeightParams({{{{
+                                   V(7240.7734393502),
+                                   V(-0.7),
+                                   V(-0.7),
+                                   V(-0.2),
+                                   V(-0.2),
+                                   V(-0.2),
+                                   V(-0.5),
+                               }},
+                               {{
+                                   V(1448.15468787004),
+                                   V(-0.5),
+                                   V(-0.5),
+                                   V(-0.5),
+                                   V(-0.2),
+                                   V(-0.2),
+                                   V(-0.2),
+                               }},
+                               {{
+                                   V(506.854140754517),
+                                   V(-1.4),
+                                   V(-0.2),
+                                   V(-0.5),
+                                   V(-0.5),
+                                   V(-1.5),
+                                   V(-3.6),
+                               }}}},
+                             7));
+  }
+
+  // DCT32X8
+  static constexpr QuantEncodingInternal DCT8X32() {
+    return QuantEncodingInternal::DCT(
+        DctQuantWeightParams({{{{
+                                   V(16283.2494710648897),
+                                   V(-1.7812845336559429),
+                                   V(-1.6309059012653515),
+                                   V(-1.0382179034313539),
+                                   V(-0.85),
+                                   V(-0.7),
+                                   V(-0.9),
+                                   V(-1.2360638576849587),
+                               }},
+                               {{
+                                   V(5089.15750884921511936),
+                                   V(-0.320049391452786891),
+                                   V(-0.35362849922161446),
+                                   V(-0.30340000000000003),
+                                   V(-0.61),
+                                   V(-0.5),
+                                   V(-0.5),
+                                   V(-0.6),
+                               }},
+                               {{
+                                   V(3397.77603275308720128),
+                                   V(-0.321327362693153371),
+                                   V(-0.34507619223117997),
+                                   V(-0.70340000000000003),
+                                   V(-0.9),
+                                   V(-1.0),
+                                   V(-1.0),
+                                   V(-1.1754605576265209),
+                               }}}},
+                             8));
+  }
+
+  // DCT32X16
+  static constexpr QuantEncodingInternal DCT16X32() {
+    return QuantEncodingInternal::DCT(
+        DctQuantWeightParams({{{{
+                                   V(13844.97076442300573),
+                                   V(-0.97113799999999995),
+                                   V(-0.658),
+                                   V(-0.42026),
+                                   V(-0.22712),
+                                   V(-0.2206),
+                                   V(-0.226),
+                                   V(-0.6),
+                               }},
+                               {{
+                                   V(4798.964084220744293),
+                                   V(-0.61125308982767057),
+                                   V(-0.83770786552491361),
+                                   V(-0.79014862079498627),
+                                   V(-0.2692727459704829),
+                                   V(-0.38272769465388551),
+                                   V(-0.22924222653091453),
+                                   V(-0.20719098826199578),
+                               }},
+                               {{
+                                   V(1807.236946760964614),
+                                   V(-1.2),
+                                   V(-1.2),
+                                   V(-0.7),
+                                   V(-0.7),
+                                   V(-0.7),
+                                   V(-0.4),
+                                   V(-0.5),
+                               }}}},
+                             8));
+  }
+
+  // DCT4X8 and 8x4
+  static constexpr QuantEncodingInternal DCT4X8() {
+    return QuantEncodingInternal::DCT4X8(
+        DctQuantWeightParams({{
+                                 {{
+                                     V(2198.050556016380522),
+                                     V(-0.96269623020744692),
+                                     V(-0.76194253026666783),
+                                     V(-0.6551140670773547),
+                                 }},
+                                 {{
+                                     V(764.3655248643528689),
+                                     V(-0.92630200888366945),
+                                     V(-0.9675229603596517),
+                                     V(-0.27845290869168118),
+                                 }},
+                                 {{
+                                     V(527.107573587542228),
+                                     V(-1.4594385811273854),
+                                     V(-1.450082094097871593),
+                                     V(-1.5843722511996204),
+                                 }},
+                             }},
+                             4),
+        /* kMuls */
+        {{
+            V(1.0),
+            V(1.0),
+            V(1.0),
+        }});
+  }
+  // AFV
+  static QuantEncodingInternal AFV0() {
+    return QuantEncodingInternal::AFV(DCT4X8().dct_params, DCT4X4().dct_params,
+                                      {{{{
+                                            // 4x4/4x8 DC tendency.
+                                            V(3072.0),
+                                            V(3072.0),
+                                            // AFV corner.
+                                            V(256.0),
+                                            V(256.0),
+                                            V(256.0),
+                                            // AFV high freqs.
+                                            V(414.0),
+                                            V(0.0),
+                                            V(0.0),
+                                            V(0.0),
+                                        }},
+                                        {{
+                                            // 4x4/4x8 DC tendency.
+                                            V(1024.0),
+                                            V(1024.0),
+                                            // AFV corner.
+                                            V(50),
+                                            V(50),
+                                            V(50),
+                                            // AFV high freqs.
+                                            V(58.0),
+                                            V(0.0),
+                                            V(0.0),
+                                            V(0.0),
+                                        }},
+                                        {{
+                                            // 4x4/4x8 DC tendency.
+                                            V(384.0),
+                                            V(384.0),
+                                            // AFV corner.
+                                            V(12.0),
+                                            V(12.0),
+                                            V(12.0),
+                                            // AFV high freqs.
+                                            V(22.0),
+                                            V(-0.25),
+                                            V(-0.25),
+                                            V(-0.25),
+                                        }}}});
+  }
+
+  // DCT64
+  static QuantEncodingInternal DCT64X64() {
+    return QuantEncodingInternal::DCT(
+        DctQuantWeightParams({{{{
+                                   V(0.9 * 26629.073922049845),
+                                   V(-1.025),
+                                   V(-0.78),
+                                   V(-0.65012),
+                                   V(-0.19041574084286472),
+                                   V(-0.20819395464),
+                                   V(-0.421064),
+                                   V(-0.32733845535848671),
+                               }},
+                               {{
+                                   V(0.9 * 9311.3238710010046),
+                                   V(-0.3041958212306401),
+                                   V(-0.3633036457487539),
+                                   V(-0.35660379990111464),
+                                   V(-0.3443074455424403),
+                                   V(-0.33699592683512467),
+                                   V(-0.30180866526242109),
+                                   V(-0.27321683125358037),
+                               }},
+                               {{
+                                   V(0.9 * 4992.2486445538634),
+                                   V(-1.2),
+                                   V(-1.2),
+                                   V(-0.8),
+                                   V(-0.7),
+                                   V(-0.7),
+                                   V(-0.4),
+                                   V(-0.5),
+                               }}}},
+                             8));
+  }
+
+  // DCT64X32
+  static QuantEncodingInternal DCT32X64() {
+    return QuantEncodingInternal::DCT(
+        DctQuantWeightParams({{{{
+                                   V(0.65 * 23629.073922049845),
+                                   V(-1.025),
+                                   V(-0.78),
+                                   V(-0.65012),
+                                   V(-0.19041574084286472),
+                                   V(-0.20819395464),
+                                   V(-0.421064),
+                                   V(-0.32733845535848671),
+                               }},
+                               {{
+                                   V(0.65 * 8611.3238710010046),
+                                   V(-0.3041958212306401),
+                                   V(-0.3633036457487539),
+                                   V(-0.35660379990111464),
+                                   V(-0.3443074455424403),
+                                   V(-0.33699592683512467),
+                                   V(-0.30180866526242109),
+                                   V(-0.27321683125358037),
+                               }},
+                               {{
+                                   V(0.65 * 4492.2486445538634),
+                                   V(-1.2),
+                                   V(-1.2),
+                                   V(-0.8),
+                                   V(-0.7),
+                                   V(-0.7),
+                                   V(-0.4),
+                                   V(-0.5),
+                               }}}},
+                             8));
+  }
+  // DCT128X128
+  static QuantEncodingInternal DCT128X128() {
+    return QuantEncodingInternal::DCT(
+        DctQuantWeightParams({{{{
+                                   V(1.8 * 26629.073922049845),
+                                   V(-1.025),
+                                   V(-0.78),
+                                   V(-0.65012),
+                                   V(-0.19041574084286472),
+                                   V(-0.20819395464),
+                                   V(-0.421064),
+                                   V(-0.32733845535848671),
+                               }},
+                               {{
+                                   V(1.8 * 9311.3238710010046),
+                                   V(-0.3041958212306401),
+                                   V(-0.3633036457487539),
+                                   V(-0.35660379990111464),
+                                   V(-0.3443074455424403),
+                                   V(-0.33699592683512467),
+                                   V(-0.30180866526242109),
+                                   V(-0.27321683125358037),
+                               }},
+                               {{
+                                   V(1.8 * 4992.2486445538634),
+                                   V(-1.2),
+                                   V(-1.2),
+                                   V(-0.8),
+                                   V(-0.7),
+                                   V(-0.7),
+                                   V(-0.4),
+                                   V(-0.5),
+                               }}}},
+                             8));
+  }
+
+  // DCT128X64
+  static QuantEncodingInternal DCT64X128() {
+    return QuantEncodingInternal::DCT(
+        DctQuantWeightParams({{{{
+                                   V(1.3 * 23629.073922049845),
+                                   V(-1.025),
+                                   V(-0.78),
+                                   V(-0.65012),
+                                   V(-0.19041574084286472),
+                                   V(-0.20819395464),
+                                   V(-0.421064),
+                                   V(-0.32733845535848671),
+                               }},
+                               {{
+                                   V(1.3 * 8611.3238710010046),
+                                   V(-0.3041958212306401),
+                                   V(-0.3633036457487539),
+                                   V(-0.35660379990111464),
+                                   V(-0.3443074455424403),
+                                   V(-0.33699592683512467),
+                                   V(-0.30180866526242109),
+                                   V(-0.27321683125358037),
+                               }},
+                               {{
+                                   V(1.3 * 4492.2486445538634),
+                                   V(-1.2),
+                                   V(-1.2),
+                                   V(-0.8),
+                                   V(-0.7),
+                                   V(-0.7),
+                                   V(-0.4),
+                                   V(-0.5),
+                               }}}},
+                             8));
+  }
+  // DCT256X256
+  static QuantEncodingInternal DCT256X256() {
+    return QuantEncodingInternal::DCT(
+        DctQuantWeightParams({{{{
+                                   V(3.6 * 26629.073922049845),
+                                   V(-1.025),
+                                   V(-0.78),
+                                   V(-0.65012),
+                                   V(-0.19041574084286472),
+                                   V(-0.20819395464),
+                                   V(-0.421064),
+                                   V(-0.32733845535848671),
+                               }},
+                               {{
+                                   V(3.6 * 9311.3238710010046),
+                                   V(-0.3041958212306401),
+                                   V(-0.3633036457487539),
+                                   V(-0.35660379990111464),
+                                   V(-0.3443074455424403),
+                                   V(-0.33699592683512467),
+                                   V(-0.30180866526242109),
+                                   V(-0.27321683125358037),
+                               }},
+                               {{
+                                   V(3.6 * 4992.2486445538634),
+                                   V(-1.2),
+                                   V(-1.2),
+                                   V(-0.8),
+                                   V(-0.7),
+                                   V(-0.7),
+                                   V(-0.4),
+                                   V(-0.5),
+                               }}}},
+                             8));
+  }
+
+  // DCT256X128
+  static QuantEncodingInternal DCT128X256() {
+    return QuantEncodingInternal::DCT(
+        DctQuantWeightParams({{{{
+                                   V(2.6 * 23629.073922049845),
+                                   V(-1.025),
+                                   V(-0.78),
+                                   V(-0.65012),
+                                   V(-0.19041574084286472),
+                                   V(-0.20819395464),
+                                   V(-0.421064),
+                                   V(-0.32733845535848671),
+                               }},
+                               {{
+                                   V(2.6 * 8611.3238710010046),
+                                   V(-0.3041958212306401),
+                                   V(-0.3633036457487539),
+                                   V(-0.35660379990111464),
+                                   V(-0.3443074455424403),
+                                   V(-0.33699592683512467),
+                                   V(-0.30180866526242109),
+                                   V(-0.27321683125358037),
+                               }},
+                               {{
+                                   V(2.6 * 4492.2486445538634),
+                                   V(-1.2),
+                                   V(-1.2),
+                                   V(-0.8),
+                                   V(-0.7),
+                                   V(-0.7),
+                                   V(-0.4),
+                                   V(-0.5),
+                               }}}},
+                             8));
+  }
+};
+}  // namespace
+
+DequantMatrices::DequantLibraryInternal DequantMatrices::LibraryInit() {
+  static_assert(kNum == 17,
+                "Update this function when adding new quantization kinds.");
+  static_assert(kNumPredefinedTables == 1,
+                "Update this function when adding new quantization matrices to "
+                "the library.");
+
+  // The library and the indices need to be kept in sync manually.
+  static_assert(0 == DCT, "Update the DequantLibrary array below.");
+  static_assert(1 == IDENTITY, "Update the DequantLibrary array below.");
+  static_assert(2 == DCT2X2, "Update the DequantLibrary array below.");
+  static_assert(3 == DCT4X4, "Update the DequantLibrary array below.");
+  static_assert(4 == DCT16X16, "Update the DequantLibrary array below.");
+  static_assert(5 == DCT32X32, "Update the DequantLibrary array below.");
+  static_assert(6 == DCT8X16, "Update the DequantLibrary array below.");
+  static_assert(7 == DCT8X32, "Update the DequantLibrary array below.");
+  static_assert(8 == DCT16X32, "Update the DequantLibrary array below.");
+  static_assert(9 == DCT4X8, "Update the DequantLibrary array below.");
+  static_assert(10 == AFV0, "Update the DequantLibrary array below.");
+  static_assert(11 == DCT64X64, "Update the DequantLibrary array below.");
+  static_assert(12 == DCT32X64, "Update the DequantLibrary array below.");
+  static_assert(13 == DCT128X128, "Update the DequantLibrary array below.");
+  static_assert(14 == DCT64X128, "Update the DequantLibrary array below.");
+  static_assert(15 == DCT256X256, "Update the DequantLibrary array below.");
+  static_assert(16 == DCT128X256, "Update the DequantLibrary array below.");
+  return DequantMatrices::DequantLibraryInternal{{
+      DequantMatricesLibraryDef::DCT(),
+      DequantMatricesLibraryDef::IDENTITY(),
+      DequantMatricesLibraryDef::DCT2X2(),
+      DequantMatricesLibraryDef::DCT4X4(),
+      DequantMatricesLibraryDef::DCT16X16(),
+      DequantMatricesLibraryDef::DCT32X32(),
+      DequantMatricesLibraryDef::DCT8X16(),
+      DequantMatricesLibraryDef::DCT8X32(),
+      DequantMatricesLibraryDef::DCT16X32(),
+      DequantMatricesLibraryDef::DCT4X8(),
+      DequantMatricesLibraryDef::AFV0(),
+      DequantMatricesLibraryDef::DCT64X64(),
+      DequantMatricesLibraryDef::DCT32X64(),
+      // Same default for large transforms (128+) as for 64x* transforms.
+      DequantMatricesLibraryDef::DCT128X128(),
+      DequantMatricesLibraryDef::DCT64X128(),
+      DequantMatricesLibraryDef::DCT256X256(),
+      DequantMatricesLibraryDef::DCT128X256(),
+  }};
+}
+
+const QuantEncoding* DequantMatrices::Library() {
+  static const DequantMatrices::DequantLibraryInternal kDequantLibrary =
+      DequantMatrices::LibraryInit();
+  // Downcast the result to a const QuantEncoding* from QuantEncodingInternal*
+  // since the subclass (QuantEncoding) doesn't add any new members and users
+  // will need to upcast to QuantEncodingInternal to access the members of that
+  // class. This allows to have kDequantLibrary as a constexpr value while still
+  // allowing to create QuantEncoding::RAW() instances that use std::vector in
+  // C++11.
+  return reinterpret_cast<const QuantEncoding*>(kDequantLibrary.data());
+}
+
+DequantMatrices::DequantMatrices() {
+  encodings_.resize(size_t(QuantTable::kNum), QuantEncoding::Library(0));
+  size_t pos = 0;
+  size_t offsets[kNum * 3];
+  for (size_t i = 0; i < size_t(QuantTable::kNum); i++) {
+    size_t num = required_size_[i] * kDCTBlockSize;
+    for (size_t c = 0; c < 3; c++) {
+      offsets[3 * i + c] = pos + c * num;
+    }
+    pos += 3 * num;
+  }
+  for (size_t i = 0; i < AcStrategy::kNumValidStrategies; i++) {
+    for (size_t c = 0; c < 3; c++) {
+      table_offsets_[i * 3 + c] = offsets[kQuantTable[i] * 3 + c];
+    }
+  }
+}
+
+Status DequantMatrices::EnsureComputed(uint32_t acs_mask) {
+  const QuantEncoding* library = Library();
+
+  if (!table_storage_) {
+    table_storage_ = hwy::AllocateAligned<float>(2 * kTotalTableSize);
+    table_ = table_storage_.get();
+    inv_table_ = table_storage_.get() + kTotalTableSize;
+  }
+
+  size_t offsets[kNum * 3 + 1];
+  size_t pos = 0;
+  for (size_t i = 0; i < kNum; i++) {
+    size_t num = required_size_[i] * kDCTBlockSize;
+    for (size_t c = 0; c < 3; c++) {
+      offsets[3 * i + c] = pos + c * num;
+    }
+    pos += 3 * num;
+  }
+  offsets[kNum * 3] = pos;
+  JXL_ASSERT(pos == kTotalTableSize);
+
+  uint32_t kind_mask = 0;
+  for (size_t i = 0; i < AcStrategy::kNumValidStrategies; i++) {
+    if (acs_mask & (1u << i)) {
+      kind_mask |= 1u << kQuantTable[i];
+    }
+  }
+  uint32_t computed_kind_mask = 0;
+  for (size_t i = 0; i < AcStrategy::kNumValidStrategies; i++) {
+    if (computed_mask_ & (1u << i)) {
+      computed_kind_mask |= 1u << kQuantTable[i];
+    }
+  }
+  for (size_t table = 0; table < kNum; table++) {
+    if ((1 << table) & computed_kind_mask) continue;
+    if ((1 << table) & ~kind_mask) continue;
+    size_t pos = offsets[table * 3];
+    if (encodings_[table].mode == QuantEncoding::kQuantModeLibrary) {
+      JXL_CHECK(HWY_DYNAMIC_DISPATCH(ComputeQuantTable)(
+          library[table], table_storage_.get(),
+          table_storage_.get() + kTotalTableSize, table, QuantTable(table),
+          &pos));
+    } else {
+      JXL_RETURN_IF_ERROR(HWY_DYNAMIC_DISPATCH(ComputeQuantTable)(
+          encodings_[table], table_storage_.get(),
+          table_storage_.get() + kTotalTableSize, table, QuantTable(table),
+          &pos));
+    }
+    JXL_ASSERT(pos == offsets[table * 3 + 3]);
+  }
+  computed_mask_ |= acs_mask;
+
+  return true;
+}
+
+}  // namespace jxl
+#endif
diff --git a/third_party/jpeg-xl/lib/jxl/quant_weights.h b/third_party/jpeg-xl/lib/jxl/quant_weights.h
new file mode 100644
index 0000000000..d76fc1d1e6
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/quant_weights.h
@@ -0,0 +1,448 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_QUANT_WEIGHTS_H_
+#define LIB_JXL_QUANT_WEIGHTS_H_
+
+#include <stdint.h>
+#include <string.h>
+
+#include <array>
+#include <hwy/aligned_allocator.h>
+#include <utility>
+#include <vector>
+
+#include "lib/jxl/ac_strategy.h"
+#include "lib/jxl/base/cache_aligned.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/span.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/image.h"
+
+namespace jxl {
+
+template <typename T, size_t N>
+constexpr T ArraySum(T (&a)[N], size_t i = N - 1) {
+  static_assert(N > 0, "Trying to compute the sum of an empty array");
+  return i == 0 ? a[0] : a[i] + ArraySum(a, i - 1);
+}
+
+static constexpr size_t kMaxQuantTableSize = AcStrategy::kMaxCoeffArea;
+static constexpr size_t kNumPredefinedTables = 1;
+static constexpr size_t kCeilLog2NumPredefinedTables = 0;
+static constexpr size_t kLog2NumQuantModes = 3;
+
+struct DctQuantWeightParams {
+  static constexpr size_t kLog2MaxDistanceBands = 4;
+  static constexpr size_t kMaxDistanceBands = 1 + (1 << kLog2MaxDistanceBands);
+  typedef std::array<std::array<float, kMaxDistanceBands>, 3>
+      DistanceBandsArray;
+
+  size_t num_distance_bands = 0;
+  DistanceBandsArray distance_bands = {};
+
+  constexpr DctQuantWeightParams() : num_distance_bands(0) {}
+
+  constexpr DctQuantWeightParams(const DistanceBandsArray& dist_bands,
+                                 size_t num_dist_bands)
+      : num_distance_bands(num_dist_bands), distance_bands(dist_bands) {}
+
+  template <size_t num_dist_bands>
+  explicit DctQuantWeightParams(const float dist_bands[3][num_dist_bands]) {
+    num_distance_bands = num_dist_bands;
+    for (size_t c = 0; c < 3; c++) {
+      memcpy(distance_bands[c].data(), dist_bands[c],
+             sizeof(float) * num_dist_bands);
+    }
+  }
+};
+
+// NOLINTNEXTLINE(clang-analyzer-optin.performance.Padding)
+struct QuantEncodingInternal {
+  enum Mode {
+    kQuantModeLibrary,
+    kQuantModeID,
+    kQuantModeDCT2,
+    kQuantModeDCT4,
+    kQuantModeDCT4X8,
+    kQuantModeAFV,
+    kQuantModeDCT,
+    kQuantModeRAW,
+  };
+
+  template <Mode mode>
+  struct Tag {};
+
+  typedef std::array<std::array<float, 3>, 3> IdWeights;
+  typedef std::array<std::array<float, 6>, 3> DCT2Weights;
+  typedef std::array<std::array<float, 2>, 3> DCT4Multipliers;
+  typedef std::array<std::array<float, 9>, 3> AFVWeights;
+  typedef std::array<float, 3> DCT4x8Multipliers;
+
+  static constexpr QuantEncodingInternal Library(uint8_t predefined) {
+    return ((predefined < kNumPredefinedTables) ||
+            JXL_ABORT("Assert predefined < kNumPredefinedTables")),
+           QuantEncodingInternal(Tag<kQuantModeLibrary>(), predefined);
+  }
+  constexpr QuantEncodingInternal(Tag<kQuantModeLibrary> /* tag */,
+                                  uint8_t predefined)
+      : mode(kQuantModeLibrary), predefined(predefined) {}
+
+  // Identity
+  // xybweights is an array of {xweights, yweights, bweights}.
+  static constexpr QuantEncodingInternal Identity(const IdWeights& xybweights) {
+    return QuantEncodingInternal(Tag<kQuantModeID>(), xybweights);
+  }
+  constexpr QuantEncodingInternal(Tag<kQuantModeID> /* tag */,
+                                  const IdWeights& xybweights)
+      : mode(kQuantModeID), idweights(xybweights) {}
+
+  // DCT2
+  static constexpr QuantEncodingInternal DCT2(const DCT2Weights& xybweights) {
+    return QuantEncodingInternal(Tag<kQuantModeDCT2>(), xybweights);
+  }
+  constexpr QuantEncodingInternal(Tag<kQuantModeDCT2> /* tag */,
+                                  const DCT2Weights& xybweights)
+      : mode(kQuantModeDCT2), dct2weights(xybweights) {}
+
+  // DCT4
+  static constexpr QuantEncodingInternal DCT4(
+      const DctQuantWeightParams& params, const DCT4Multipliers& xybmul) {
+    return QuantEncodingInternal(Tag<kQuantModeDCT4>(), params, xybmul);
+  }
+  constexpr QuantEncodingInternal(Tag<kQuantModeDCT4> /* tag */,
+                                  const DctQuantWeightParams& params,
+                                  const DCT4Multipliers& xybmul)
+      : mode(kQuantModeDCT4), dct_params(params), dct4multipliers(xybmul) {}
+
+  // DCT4x8
+  static constexpr QuantEncodingInternal DCT4X8(
+      const DctQuantWeightParams& params, const DCT4x8Multipliers& xybmul) {
+    return QuantEncodingInternal(Tag<kQuantModeDCT4X8>(), params, xybmul);
+  }
+  constexpr QuantEncodingInternal(Tag<kQuantModeDCT4X8> /* tag */,
+                                  const DctQuantWeightParams& params,
+                                  const DCT4x8Multipliers& xybmul)
+      : mode(kQuantModeDCT4X8), dct_params(params), dct4x8multipliers(xybmul) {}
+
+  // DCT
+  static constexpr QuantEncodingInternal DCT(
+      const DctQuantWeightParams& params) {
+    return QuantEncodingInternal(Tag<kQuantModeDCT>(), params);
+  }
+  constexpr QuantEncodingInternal(Tag<kQuantModeDCT> /* tag */,
+                                  const DctQuantWeightParams& params)
+      : mode(kQuantModeDCT), dct_params(params) {}
+
+  // AFV
+  static constexpr QuantEncodingInternal AFV(
+      const DctQuantWeightParams& params4x8,
+      const DctQuantWeightParams& params4x4, const AFVWeights& weights) {
+    return QuantEncodingInternal(Tag<kQuantModeAFV>(), params4x8, params4x4,
+                                 weights);
+  }
+  constexpr QuantEncodingInternal(Tag<kQuantModeAFV> /* tag */,
+                                  const DctQuantWeightParams& params4x8,
+                                  const DctQuantWeightParams& params4x4,
+                                  const AFVWeights& weights)
+      : mode(kQuantModeAFV),
+        dct_params(params4x8),
+        afv_weights(weights),
+        dct_params_afv_4x4(params4x4) {}
+
+  // This constructor is not constexpr so it can't be used in any of the
+  // constexpr cases above.
+  explicit QuantEncodingInternal(Mode mode) : mode(mode) {}
+
+  Mode mode;
+
+  // Weights for DCT4+ tables.
+  DctQuantWeightParams dct_params;
+
+  union {
+    // Weights for identity.
+    IdWeights idweights;
+
+    // Weights for DCT2.
+    DCT2Weights dct2weights;
+
+    // Extra multipliers for coefficients 01/10 and 11 for DCT4 and AFV.
+    DCT4Multipliers dct4multipliers;
+
+    // Weights for AFV. {0, 1} are used directly for coefficients (0, 1) and (1,
+    // 0);  {2, 3, 4} are used directly corner DC, (1,0) - (0,1) and (0, 1) +
+    // (1, 0) - (0, 0) inside the AFV block. Values from 5 to 8 are interpolated
+    // as in GetQuantWeights for DC and are used for other coefficients.
+    AFVWeights afv_weights = {};
+
+    // Extra multipliers for coefficients 01 or 10 for DCT4X8 and DCT8X4.
+    DCT4x8Multipliers dct4x8multipliers;
+
+    // Only used in kQuantModeRAW mode.
+    struct {
+      // explicit quantization table (like in JPEG)
+      std::vector<int>* qtable = nullptr;
+      float qtable_den = 1.f / (8 * 255);
+    } qraw;
+  };
+
+  // Weights for 4x4 sub-block in AFV.
+  DctQuantWeightParams dct_params_afv_4x4;
+
+  union {
+    // Which predefined table to use. Only used if mode is kQuantModeLibrary.
+    uint8_t predefined = 0;
+
+    // Which other quant table to copy; must copy from a table that comes before
+    // the current one. Only used if mode is kQuantModeCopy.
+    uint8_t source;
+  };
+};
+
+class QuantEncoding final : public QuantEncodingInternal {
+ public:
+  QuantEncoding(const QuantEncoding& other)
+      : QuantEncodingInternal(
+            static_cast<const QuantEncodingInternal&>(other)) {
+    if (mode == kQuantModeRAW && qraw.qtable) {
+      // Need to make a copy of the passed *qtable.
+      qraw.qtable = new std::vector<int>(*other.qraw.qtable);
+    }
+  }
+  QuantEncoding(QuantEncoding&& other) noexcept
+      : QuantEncodingInternal(
+            static_cast<const QuantEncodingInternal&>(other)) {
+    // Steal the qtable from the other object if any.
+    if (mode == kQuantModeRAW) {
+      other.qraw.qtable = nullptr;
+    }
+  }
+  QuantEncoding& operator=(const QuantEncoding& other) {
+    if (mode == kQuantModeRAW && qraw.qtable) {
+      delete qraw.qtable;
+    }
+    *static_cast<QuantEncodingInternal*>(this) =
+        QuantEncodingInternal(static_cast<const QuantEncodingInternal&>(other));
+    if (mode == kQuantModeRAW && qraw.qtable) {
+      // Need to make a copy of the passed *qtable.
+      qraw.qtable = new std::vector<int>(*other.qraw.qtable);
+    }
+    return *this;
+  }
+
+  ~QuantEncoding() {
+    if (mode == kQuantModeRAW && qraw.qtable) {
+      delete qraw.qtable;
+    }
+  }
+
+  // Wrappers of the QuantEncodingInternal:: static functions that return a
+  // QuantEncoding instead. This is using the explicit and private cast from
+  // QuantEncodingInternal to QuantEncoding, which would be inlined anyway.
+  // In general, you should use this wrappers. The only reason to directly
+  // create a QuantEncodingInternal instance is if you need a constexpr version
+  // of this class. Note that RAW() is not supported in that case since it uses
+  // a std::vector.
+  static QuantEncoding Library(uint8_t predefined_arg) {
+    return QuantEncoding(QuantEncodingInternal::Library(predefined_arg));
+  }
+  static QuantEncoding Identity(const IdWeights& xybweights) {
+    return QuantEncoding(QuantEncodingInternal::Identity(xybweights));
+  }
+  static QuantEncoding DCT2(const DCT2Weights& xybweights) {
+    return QuantEncoding(QuantEncodingInternal::DCT2(xybweights));
+  }
+  static QuantEncoding DCT4(const DctQuantWeightParams& params,
+                            const DCT4Multipliers& xybmul) {
+    return QuantEncoding(QuantEncodingInternal::DCT4(params, xybmul));
+  }
+  static QuantEncoding DCT4X8(const DctQuantWeightParams& params,
+                              const DCT4x8Multipliers& xybmul) {
+    return QuantEncoding(QuantEncodingInternal::DCT4X8(params, xybmul));
+  }
+  static QuantEncoding DCT(const DctQuantWeightParams& params) {
+    return QuantEncoding(QuantEncodingInternal::DCT(params));
+  }
+  static QuantEncoding AFV(const DctQuantWeightParams& params4x8,
+                           const DctQuantWeightParams& params4x4,
+                           const AFVWeights& weights) {
+    return QuantEncoding(
+        QuantEncodingInternal::AFV(params4x8, params4x4, weights));
+  }
+
+  // RAW, note that this one is not a constexpr one.
+  static QuantEncoding RAW(const std::vector<int>& qtable, int shift = 0) {
+    QuantEncoding encoding(kQuantModeRAW);
+    encoding.qraw.qtable = new std::vector<int>();
+    *encoding.qraw.qtable = qtable;
+    encoding.qraw.qtable_den = (1 << shift) * (1.f / (8 * 255));
+    return encoding;
+  }
+
+ private:
+  explicit QuantEncoding(const QuantEncodingInternal& other)
+      : QuantEncodingInternal(other) {}
+
+  explicit QuantEncoding(QuantEncodingInternal::Mode mode_arg)
+      : QuantEncodingInternal(mode_arg) {}
+};
+
+// A constexpr QuantEncodingInternal instance is often downcasted to the
+// QuantEncoding subclass even if the instance wasn't an instance of the
+// subclass. This is safe because user will upcast to QuantEncodingInternal to
+// access any of its members.
+static_assert(sizeof(QuantEncoding) == sizeof(QuantEncodingInternal),
+              "Don't add any members to QuantEncoding");
+
+// Let's try to keep these 2**N for possible future simplicity.
+const float kInvDCQuant[3] = {
+    4096.0f,
+    512.0f,
+    256.0f,
+};
+
+const float kDCQuant[3] = {
+    1.0f / kInvDCQuant[0],
+    1.0f / kInvDCQuant[1],
+    1.0f / kInvDCQuant[2],
+};
+
+class ModularFrameEncoder;
+class ModularFrameDecoder;
+
+class DequantMatrices {
+ public:
+  enum QuantTable : size_t {
+    DCT = 0,
+    IDENTITY,
+    DCT2X2,
+    DCT4X4,
+    DCT16X16,
+    DCT32X32,
+    // DCT16X8
+    DCT8X16,
+    // DCT32X8
+    DCT8X32,
+    // DCT32X16
+    DCT16X32,
+    DCT4X8,
+    // DCT8X4
+    AFV0,
+    // AFV1
+    // AFV2
+    // AFV3
+    DCT64X64,
+    // DCT64X32,
+    DCT32X64,
+    DCT128X128,
+    // DCT128X64,
+    DCT64X128,
+    DCT256X256,
+    // DCT256X128,
+    DCT128X256,
+    kNum
+  };
+
+  static constexpr QuantTable kQuantTable[] = {
+      QuantTable::DCT,        QuantTable::IDENTITY,   QuantTable::DCT2X2,
+      QuantTable::DCT4X4,     QuantTable::DCT16X16,   QuantTable::DCT32X32,
+      QuantTable::DCT8X16,    QuantTable::DCT8X16,    QuantTable::DCT8X32,
+      QuantTable::DCT8X32,    QuantTable::DCT16X32,   QuantTable::DCT16X32,
+      QuantTable::DCT4X8,     QuantTable::DCT4X8,     QuantTable::AFV0,
+      QuantTable::AFV0,       QuantTable::AFV0,       QuantTable::AFV0,
+      QuantTable::DCT64X64,   QuantTable::DCT32X64,   QuantTable::DCT32X64,
+      QuantTable::DCT128X128, QuantTable::DCT64X128,  QuantTable::DCT64X128,
+      QuantTable::DCT256X256, QuantTable::DCT128X256, QuantTable::DCT128X256,
+  };
+  static_assert(AcStrategy::kNumValidStrategies ==
+                    sizeof(kQuantTable) / sizeof *kQuantTable,
+                "Update this array when adding or removing AC strategies.");
+
+  DequantMatrices();
+
+  static const QuantEncoding* Library();
+
+  typedef std::array<QuantEncodingInternal, kNumPredefinedTables * kNum>
+      DequantLibraryInternal;
+  // Return the array of library kNumPredefinedTables QuantEncoding entries as
+  // a constexpr array. Use Library() to obtain a pointer to the copy in the
+  // .cc file.
+  static DequantLibraryInternal LibraryInit();
+
+  // Returns aligned memory.
+  JXL_INLINE const float* Matrix(size_t quant_kind, size_t c) const {
+    JXL_DASSERT(quant_kind < AcStrategy::kNumValidStrategies);
+    JXL_DASSERT((1 << quant_kind) & computed_mask_);
+    return &table_[table_offsets_[quant_kind * 3 + c]];
+  }
+
+  JXL_INLINE const float* InvMatrix(size_t quant_kind, size_t c) const {
+    JXL_DASSERT(quant_kind < AcStrategy::kNumValidStrategies);
+    JXL_DASSERT((1 << quant_kind) & computed_mask_);
+    return &inv_table_[table_offsets_[quant_kind * 3 + c]];
+  }
+
+  // DC quants are used in modular mode for XYB multipliers.
+  JXL_INLINE float DCQuant(size_t c) const { return dc_quant_[c]; }
+  JXL_INLINE const float* DCQuants() const { return dc_quant_; }
+
+  JXL_INLINE float InvDCQuant(size_t c) const { return inv_dc_quant_[c]; }
+
+  // For encoder.
+  void SetEncodings(const std::vector<QuantEncoding>& encodings) {
+    encodings_ = encodings;
+    computed_mask_ = 0;
+  }
+
+  // For encoder.
+  void SetDCQuant(const float dc[3]) {
+    for (size_t c = 0; c < 3; c++) {
+      dc_quant_[c] = 1.0f / dc[c];
+      inv_dc_quant_[c] = dc[c];
+    }
+  }
+
+  Status Decode(BitReader* br,
+                ModularFrameDecoder* modular_frame_decoder = nullptr);
+  Status DecodeDC(BitReader* br);
+
+  const std::vector<QuantEncoding>& encodings() const { return encodings_; }
+
+  static constexpr size_t required_size_x[] = {1, 1, 1, 1, 2,  4, 1,  1, 2,
+                                               1, 1, 8, 4, 16, 8, 32, 16};
+  static_assert(kNum == sizeof(required_size_x) / sizeof(*required_size_x),
+                "Update this array when adding or removing quant tables.");
+
+  static constexpr size_t required_size_y[] = {1, 1, 1, 1, 2,  4,  2,  4, 4,
+                                               1, 1, 8, 8, 16, 16, 32, 32};
+  static_assert(kNum == sizeof(required_size_y) / sizeof(*required_size_y),
+                "Update this array when adding or removing quant tables.");
+
+  Status EnsureComputed(uint32_t acs_mask);
+
+ private:
+  static constexpr size_t required_size_[] = {
+      1, 1, 1, 1, 4, 16, 2, 4, 8, 1, 1, 64, 32, 256, 128, 1024, 512};
+  static_assert(kNum == sizeof(required_size_) / sizeof(*required_size_),
+                "Update this array when adding or removing quant tables.");
+  static constexpr size_t kTotalTableSize =
+      ArraySum(required_size_) * kDCTBlockSize * 3;
+
+  uint32_t computed_mask_ = 0;
+  // kTotalTableSize entries followed by kTotalTableSize for inv_table
+  hwy::AlignedFreeUniquePtr<float[]> table_storage_;
+  const float* table_;
+  const float* inv_table_;
+  float dc_quant_[3] = {kDCQuant[0], kDCQuant[1], kDCQuant[2]};
+  float inv_dc_quant_[3] = {kInvDCQuant[0], kInvDCQuant[1], kInvDCQuant[2]};
+  size_t table_offsets_[AcStrategy::kNumValidStrategies * 3];
+  std::vector<QuantEncoding> encodings_;
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_QUANT_WEIGHTS_H_
diff --git a/third_party/jpeg-xl/lib/jxl/quant_weights_test.cc b/third_party/jpeg-xl/lib/jxl/quant_weights_test.cc
new file mode 100644
index 0000000000..f0497948a7
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/quant_weights_test.cc
@@ -0,0 +1,240 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+#include "lib/jxl/quant_weights.h"
+
+#include <stdlib.h>
+
+#include <algorithm>
+#include <cmath>
+#include <hwy/base.h>  // HWY_ALIGN_MAX
+#include <hwy/tests/test_util-inl.h>
+#include <numeric>
+
+#include "lib/jxl/base/random.h"
+#include "lib/jxl/dct_for_test.h"
+#include "lib/jxl/dec_transforms_testonly.h"
+#include "lib/jxl/enc_modular.h"
+#include "lib/jxl/enc_quant_weights.h"
+#include "lib/jxl/enc_transforms.h"
+
+namespace jxl {
+namespace {
+
+template <typename T>
+void CheckSimilar(T a, T b) {
+  EXPECT_EQ(a, b);
+}
+// minimum exponent = -15.
+template <>
+void CheckSimilar(float a, float b) {
+  float m = std::max(std::abs(a), std::abs(b));
+  // 10 bits of precision are used in the format. Relative error should be
+  // below 2^-10.
+  EXPECT_LE(std::abs(a - b), m / 1024.0f) << "a: " << a << " b: " << b;
+}
+
+TEST(QuantWeightsTest, DC) {
+  DequantMatrices mat;
+  float dc_quant[3] = {1e+5, 1e+3, 1e+1};
+  DequantMatricesSetCustomDC(&mat, dc_quant);
+  for (size_t c = 0; c < 3; c++) {
+    CheckSimilar(mat.InvDCQuant(c), dc_quant[c]);
+  }
+}
+
+void RoundtripMatrices(const std::vector<QuantEncoding>& encodings) {
+  ASSERT_TRUE(encodings.size() == DequantMatrices::kNum);
+  DequantMatrices mat;
+  CodecMetadata metadata;
+  FrameHeader frame_header(&metadata);
+  ModularFrameEncoder encoder(frame_header, CompressParams{});
+  DequantMatricesSetCustom(&mat, encodings, &encoder);
+  const std::vector<QuantEncoding>& encodings_dec = mat.encodings();
+  for (size_t i = 0; i < encodings.size(); i++) {
+    const QuantEncoding& e = encodings[i];
+    const QuantEncoding& d = encodings_dec[i];
+    // Check values roundtripped correctly.
+    EXPECT_EQ(e.mode, d.mode);
+    EXPECT_EQ(e.predefined, d.predefined);
+    EXPECT_EQ(e.source, d.source);
+
+    EXPECT_EQ(static_cast<uint64_t>(e.dct_params.num_distance_bands),
+              static_cast<uint64_t>(d.dct_params.num_distance_bands));
+    for (size_t c = 0; c < 3; c++) {
+      for (size_t j = 0; j < DctQuantWeightParams::kMaxDistanceBands; j++) {
+        CheckSimilar(e.dct_params.distance_bands[c][j],
+                     d.dct_params.distance_bands[c][j]);
+      }
+    }
+
+    if (e.mode == QuantEncoding::kQuantModeRAW) {
+      EXPECT_FALSE(!e.qraw.qtable);
+      EXPECT_FALSE(!d.qraw.qtable);
+      EXPECT_EQ(e.qraw.qtable->size(), d.qraw.qtable->size());
+      for (size_t j = 0; j < e.qraw.qtable->size(); j++) {
+        EXPECT_EQ((*e.qraw.qtable)[j], (*d.qraw.qtable)[j]);
+      }
+      EXPECT_NEAR(e.qraw.qtable_den, d.qraw.qtable_den, 1e-7f);
+    } else {
+      // modes different than kQuantModeRAW use one of the other fields used
+      // here, which all happen to be arrays of floats.
+      for (size_t c = 0; c < 3; c++) {
+        for (size_t j = 0; j < 3; j++) {
+          CheckSimilar(e.idweights[c][j], d.idweights[c][j]);
+        }
+        for (size_t j = 0; j < 6; j++) {
+          CheckSimilar(e.dct2weights[c][j], d.dct2weights[c][j]);
+        }
+        for (size_t j = 0; j < 2; j++) {
+          CheckSimilar(e.dct4multipliers[c][j], d.dct4multipliers[c][j]);
+        }
+        CheckSimilar(e.dct4x8multipliers[c], d.dct4x8multipliers[c]);
+        for (size_t j = 0; j < 9; j++) {
+          CheckSimilar(e.afv_weights[c][j], d.afv_weights[c][j]);
+        }
+        for (size_t j = 0; j < DctQuantWeightParams::kMaxDistanceBands; j++) {
+          CheckSimilar(e.dct_params_afv_4x4.distance_bands[c][j],
+                       d.dct_params_afv_4x4.distance_bands[c][j]);
+        }
+      }
+    }
+  }
+}
+
+TEST(QuantWeightsTest, AllDefault) {
+  std::vector<QuantEncoding> encodings(DequantMatrices::kNum,
+                                       QuantEncoding::Library(0));
+  RoundtripMatrices(encodings);
+}
+
+void TestSingleQuantMatrix(DequantMatrices::QuantTable kind) {
+  std::vector<QuantEncoding> encodings(DequantMatrices::kNum,
+                                       QuantEncoding::Library(0));
+  encodings[kind] = DequantMatrices::Library()[kind];
+  RoundtripMatrices(encodings);
+}
+
+// Ensure we can reasonably represent default quant tables.
+TEST(QuantWeightsTest, DCT) { TestSingleQuantMatrix(DequantMatrices::DCT); }
+TEST(QuantWeightsTest, IDENTITY) {
+  TestSingleQuantMatrix(DequantMatrices::IDENTITY);
+}
+TEST(QuantWeightsTest, DCT2X2) {
+  TestSingleQuantMatrix(DequantMatrices::DCT2X2);
+}
+TEST(QuantWeightsTest, DCT4X4) {
+  TestSingleQuantMatrix(DequantMatrices::DCT4X4);
+}
+TEST(QuantWeightsTest, DCT16X16) {
+  TestSingleQuantMatrix(DequantMatrices::DCT16X16);
+}
+TEST(QuantWeightsTest, DCT32X32) {
+  TestSingleQuantMatrix(DequantMatrices::DCT32X32);
+}
+TEST(QuantWeightsTest, DCT8X16) {
+  TestSingleQuantMatrix(DequantMatrices::DCT8X16);
+}
+TEST(QuantWeightsTest, DCT8X32) {
+  TestSingleQuantMatrix(DequantMatrices::DCT8X32);
+}
+TEST(QuantWeightsTest, DCT16X32) {
+  TestSingleQuantMatrix(DequantMatrices::DCT16X32);
+}
+TEST(QuantWeightsTest, DCT4X8) {
+  TestSingleQuantMatrix(DequantMatrices::DCT4X8);
+}
+TEST(QuantWeightsTest, AFV0) { TestSingleQuantMatrix(DequantMatrices::AFV0); }
+TEST(QuantWeightsTest, RAW) {
+  std::vector<QuantEncoding> encodings(DequantMatrices::kNum,
+                                       QuantEncoding::Library(0));
+  std::vector<int> matrix(3 * 32 * 32);
+  Rng rng(0);
+  for (size_t i = 0; i < matrix.size(); i++) matrix[i] = rng.UniformI(1, 256);
+  encodings[DequantMatrices::kQuantTable[AcStrategy::DCT32X32]] =
+      QuantEncoding::RAW(matrix, 2);
+  RoundtripMatrices(encodings);
+}
+
+class QuantWeightsTargetTest : public hwy::TestWithParamTarget {};
+HWY_TARGET_INSTANTIATE_TEST_SUITE_P(QuantWeightsTargetTest);
+
+TEST_P(QuantWeightsTargetTest, DCTUniform) {
+  constexpr float kUniformQuant = 4;
+  float weights[3][2] = {{1.0f / kUniformQuant, 0},
+                         {1.0f / kUniformQuant, 0},
+                         {1.0f / kUniformQuant, 0}};
+  DctQuantWeightParams dct_params(weights);
+  std::vector<QuantEncoding> encodings(DequantMatrices::kNum,
+                                       QuantEncoding::DCT(dct_params));
+  DequantMatrices dequant_matrices;
+  CodecMetadata metadata;
+  FrameHeader frame_header(&metadata);
+  ModularFrameEncoder encoder(frame_header, CompressParams{});
+  DequantMatricesSetCustom(&dequant_matrices, encodings, &encoder);
+  JXL_CHECK(dequant_matrices.EnsureComputed(~0u));
+
+  const float dc_quant[3] = {1.0f / kUniformQuant, 1.0f / kUniformQuant,
+                             1.0f / kUniformQuant};
+  DequantMatricesSetCustomDC(&dequant_matrices, dc_quant);
+
+  HWY_ALIGN_MAX float scratch_space[16 * 16 * 2];
+
+  // DCT8
+  {
+    HWY_ALIGN_MAX float pixels[64];
+    std::iota(std::begin(pixels), std::end(pixels), 0);
+    HWY_ALIGN_MAX float coeffs[64];
+    const AcStrategy::Type dct = AcStrategy::DCT;
+    TransformFromPixels(dct, pixels, 8, coeffs, scratch_space);
+    HWY_ALIGN_MAX double slow_coeffs[64];
+    for (size_t i = 0; i < 64; i++) slow_coeffs[i] = pixels[i];
+    DCTSlow<8>(slow_coeffs);
+
+    for (size_t i = 0; i < 64; i++) {
+      // DCTSlow doesn't multiply/divide by 1/N, so we do it manually.
+      slow_coeffs[i] = roundf(slow_coeffs[i] / kUniformQuant) * kUniformQuant;
+      coeffs[i] = roundf(coeffs[i] / dequant_matrices.Matrix(dct, 0)[i]) *
+                  dequant_matrices.Matrix(dct, 0)[i];
+    }
+    IDCTSlow<8>(slow_coeffs);
+    TransformToPixels(dct, coeffs, pixels, 8, scratch_space);
+    for (size_t i = 0; i < 64; i++) {
+      EXPECT_NEAR(pixels[i], slow_coeffs[i], 1e-4);
+    }
+  }
+
+  // DCT16
+  {
+    HWY_ALIGN_MAX float pixels[64 * 4];
+    std::iota(std::begin(pixels), std::end(pixels), 0);
+    HWY_ALIGN_MAX float coeffs[64 * 4];
+    const AcStrategy::Type dct = AcStrategy::DCT16X16;
+    TransformFromPixels(dct, pixels, 16, coeffs, scratch_space);
+    HWY_ALIGN_MAX double slow_coeffs[64 * 4];
+    for (size_t i = 0; i < 64 * 4; i++) slow_coeffs[i] = pixels[i];
+    DCTSlow<16>(slow_coeffs);
+
+    for (size_t i = 0; i < 64 * 4; i++) {
+      slow_coeffs[i] = roundf(slow_coeffs[i] / kUniformQuant) * kUniformQuant;
+      coeffs[i] = roundf(coeffs[i] / dequant_matrices.Matrix(dct, 0)[i]) *
+                  dequant_matrices.Matrix(dct, 0)[i];
+    }
+
+    IDCTSlow<16>(slow_coeffs);
+    TransformToPixels(dct, coeffs, pixels, 16, scratch_space);
+    for (size_t i = 0; i < 64 * 4; i++) {
+      EXPECT_NEAR(pixels[i], slow_coeffs[i], 1e-4);
+    }
+  }
+
+  // Check that all matrices have the same DC quantization, i.e. that they all
+  // have the same scaling.
+  for (size_t i = 0; i < AcStrategy::kNumValidStrategies; i++) {
+    EXPECT_NEAR(dequant_matrices.Matrix(i, 0)[0], kUniformQuant, 1e-6);
+  }
+}
+
+}  // namespace
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/quantizer-inl.h b/third_party/jpeg-xl/lib/jxl/quantizer-inl.h
new file mode 100644
index 0000000000..64d273c552
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/quantizer-inl.h
@@ -0,0 +1,74 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#if defined(LIB_JXL_QUANTIZER_INL_H_) == defined(HWY_TARGET_TOGGLE)
+#ifdef LIB_JXL_QUANTIZER_INL_H_
+#undef LIB_JXL_QUANTIZER_INL_H_
+#else
+#define LIB_JXL_QUANTIZER_INL_H_
+#endif
+
+#include <stddef.h>
+
+#include <hwy/highway.h>
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+namespace {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::And;
+using hwy::HWY_NAMESPACE::AndNot;
+using hwy::HWY_NAMESPACE::ApproximateReciprocal;
+using hwy::HWY_NAMESPACE::Gt;
+using hwy::HWY_NAMESPACE::IfThenElse;
+using hwy::HWY_NAMESPACE::IfThenElseZero;
+using hwy::HWY_NAMESPACE::Lt;
+using hwy::HWY_NAMESPACE::Rebind;
+using hwy::HWY_NAMESPACE::Vec;
+using hwy::HWY_NAMESPACE::Xor;
+
+template <class DI>
+HWY_INLINE HWY_MAYBE_UNUSED Vec<Rebind<float, DI>> AdjustQuantBias(
+    DI di, const size_t c, const Vec<DI> quant_i,
+    const float* HWY_RESTRICT biases) {
+  const Rebind<float, DI> df;
+
+  const auto quant = ConvertTo(df, quant_i);
+
+  // Compare |quant|, keep sign bit for negating result.
+  const auto kSign = BitCast(df, Set(di, INT32_MIN));
+  const auto sign = And(quant, kSign);  // TODO(janwas): = abs ^ orig
+  const auto abs_quant = AndNot(kSign, quant);
+
+  // If |x| is 1, kZeroBias creates a different bias for each channel.
+  // We're implementing the following:
+  // if (quant == 0) return 0;
+  // if (quant == 1) return biases[c];
+  // if (quant == -1) return -biases[c];
+  // return quant - biases[3] / quant;
+
+  // Integer comparison is not helpful because Clang incurs bypass penalties
+  // from unnecessarily mixing integer and float.
+  const auto is_01 = Lt(abs_quant, Set(df, 1.125f));
+  const auto not_0 = Gt(abs_quant, Zero(df));
+
+  // Bitwise logic is faster than quant * biases[c].
+  const auto one_bias = IfThenElseZero(not_0, Xor(Set(df, biases[c]), sign));
+
+  // About 2E-5 worse than ReciprocalNR or division.
+  const auto bias =
+      NegMulAdd(Set(df, biases[3]), ApproximateReciprocal(quant), quant);
+
+  return IfThenElse(is_01, one_bias, bias);
+}
+
+}  // namespace
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#endif  // LIB_JXL_QUANTIZER_INL_H_
diff --git a/third_party/jpeg-xl/lib/jxl/quantizer.cc b/third_party/jpeg-xl/lib/jxl/quantizer.cc
new file mode 100644
index 0000000000..153cf19b21
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/quantizer.cc
@@ -0,0 +1,156 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/quantizer.h"
+
+#include <stdio.h>
+#include <string.h>
+
+#include <algorithm>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/field_encodings.h"
+#include "lib/jxl/fields.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/image_ops.h"
+#include "lib/jxl/quant_weights.h"
+
+namespace jxl {
+
+static const int32_t kDefaultQuant = 64;
+
+constexpr int32_t Quantizer::kQuantMax;
+
+Quantizer::Quantizer(const DequantMatrices* dequant)
+    : Quantizer(dequant, kDefaultQuant, kGlobalScaleDenom / kDefaultQuant) {}
+
+Quantizer::Quantizer(const DequantMatrices* dequant, int quant_dc,
+                     int global_scale)
+    : global_scale_(global_scale), quant_dc_(quant_dc), dequant_(dequant) {
+  JXL_ASSERT(dequant_ != nullptr);
+  RecomputeFromGlobalScale();
+  inv_quant_dc_ = inv_global_scale_ / quant_dc_;
+
+  memcpy(zero_bias_, kZeroBiasDefault, sizeof(kZeroBiasDefault));
+}
+
+void Quantizer::ComputeGlobalScaleAndQuant(float quant_dc, float quant_median,
+                                           float quant_median_absd) {
+  // Target value for the median value in the quant field.
+  const float kQuantFieldTarget = 5;
+  // We reduce the median of the quant field by the median absolute deviation:
+  // higher resolution on highly varying quant fields.
+  float scale = kGlobalScaleDenom * (quant_median - quant_median_absd) /
+                kQuantFieldTarget;
+  // Ensure that new_global_scale is positive and no more than 1<<15.
+  if (scale < 1) scale = 1;
+  if (scale > (1 << 15)) scale = 1 << 15;
+  int new_global_scale = static_cast<int>(scale);
+  // Ensure that quant_dc_ will always be at least
+  // 0.625 * kGlobalScaleDenom/kGlobalScaleNumerator = 10.
+  const int scaled_quant_dc =
+      static_cast<int>(quant_dc * kGlobalScaleNumerator * 1.6);
+  if (new_global_scale > scaled_quant_dc) {
+    new_global_scale = scaled_quant_dc;
+    if (new_global_scale <= 0) new_global_scale = 1;
+  }
+  global_scale_ = new_global_scale;
+  // Code below uses inv_global_scale_.
+  RecomputeFromGlobalScale();
+
+  float fval = quant_dc * inv_global_scale_ + 0.5f;
+  fval = std::min<float>(1 << 16, fval);
+  const int new_quant_dc = static_cast<int>(fval);
+  quant_dc_ = new_quant_dc;
+
+  // quant_dc_ was updated, recompute values.
+  RecomputeFromGlobalScale();
+}
+
+void Quantizer::SetQuantFieldRect(const ImageF& qf, const Rect& rect,
+                                  ImageI* JXL_RESTRICT raw_quant_field) const {
+  for (size_t y = 0; y < rect.ysize(); ++y) {
+    const float* JXL_RESTRICT row_qf = rect.ConstRow(qf, y);
+    int32_t* JXL_RESTRICT row_qi = rect.Row(raw_quant_field, y);
+    for (size_t x = 0; x < rect.xsize(); ++x) {
+      int val = ClampVal(row_qf[x] * inv_global_scale_ + 0.5f);
+      row_qi[x] = val;
+    }
+  }
+}
+
+void Quantizer::SetQuantField(const float quant_dc, const ImageF& qf,
+                              ImageI* JXL_RESTRICT raw_quant_field) {
+  std::vector<float> data(qf.xsize() * qf.ysize());
+  for (size_t y = 0; y < qf.ysize(); ++y) {
+    const float* JXL_RESTRICT row_qf = qf.Row(y);
+    for (size_t x = 0; x < qf.xsize(); ++x) {
+      float quant = row_qf[x];
+      data[qf.xsize() * y + x] = quant;
+    }
+  }
+  std::nth_element(data.begin(), data.begin() + data.size() / 2, data.end());
+  const float quant_median = data[data.size() / 2];
+  std::vector<float> deviations(data.size());
+  for (size_t i = 0; i < data.size(); i++) {
+    deviations[i] = fabsf(data[i] - quant_median);
+  }
+  std::nth_element(deviations.begin(),
+                   deviations.begin() + deviations.size() / 2,
+                   deviations.end());
+  const float quant_median_absd = deviations[deviations.size() / 2];
+  ComputeGlobalScaleAndQuant(quant_dc, quant_median, quant_median_absd);
+  if (raw_quant_field) {
+    JXL_CHECK(SameSize(*raw_quant_field, qf));
+    SetQuantFieldRect(qf, Rect(qf), raw_quant_field);
+  }
+}
+
+void Quantizer::SetQuant(float quant_dc, float quant_ac,
+                         ImageI* JXL_RESTRICT raw_quant_field) {
+  ComputeGlobalScaleAndQuant(quant_dc, quant_ac, 0);
+  int32_t val = ClampVal(quant_ac * inv_global_scale_ + 0.5f);
+  FillImage(val, raw_quant_field);
+}
+
+Status QuantizerParams::VisitFields(Visitor* JXL_RESTRICT visitor) {
+  JXL_QUIET_RETURN_IF_ERROR(visitor->U32(
+      BitsOffset(11, 1), BitsOffset(11, 2049), BitsOffset(12, 4097),
+      BitsOffset(16, 8193), 1, &global_scale));
+  JXL_QUIET_RETURN_IF_ERROR(visitor->U32(Val(16), BitsOffset(5, 1),
+                                         BitsOffset(8, 1), BitsOffset(16, 1), 1,
+                                         &quant_dc));
+  return true;
+}
+
+QuantizerParams Quantizer::GetParams() const {
+  QuantizerParams params;
+  params.global_scale = global_scale_;
+  params.quant_dc = quant_dc_;
+  return params;
+}
+
+Status Quantizer::Decode(BitReader* reader) {
+  QuantizerParams params;
+  JXL_RETURN_IF_ERROR(Bundle::Read(reader, &params));
+  global_scale_ = static_cast<int>(params.global_scale);
+  quant_dc_ = static_cast<int>(params.quant_dc);
+  RecomputeFromGlobalScale();
+  return true;
+}
+
+void Quantizer::DumpQuantizationMap(const ImageI& raw_quant_field) const {
+  printf("Global scale: %d (%.7f)\nDC quant: %d\n", global_scale_,
+         global_scale_ * 1.0 / kGlobalScaleDenom, quant_dc_);
+  printf("AC quantization Map:\n");
+  for (size_t y = 0; y < raw_quant_field.ysize(); ++y) {
+    for (size_t x = 0; x < raw_quant_field.xsize(); ++x) {
+      printf(" %3d", raw_quant_field.Row(y)[x]);
+    }
+    printf("\n");
+  }
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/quantizer.h b/third_party/jpeg-xl/lib/jxl/quantizer.h
new file mode 100644
index 0000000000..d78ba7b3fc
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/quantizer.h
@@ -0,0 +1,182 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_QUANTIZER_H_
+#define LIB_JXL_QUANTIZER_H_
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+#include <algorithm>
+#include <cmath>
+#include <utility>
+#include <vector>
+
+#include "lib/jxl/ac_strategy.h"
+#include "lib/jxl/base/bits.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/profiler.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/dct_util.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/fields.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/quant_weights.h"
+
+// Quantizes DC and AC coefficients, with separate quantization tables according
+// to the quant_kind (which is currently computed from the AC strategy and the
+// block index inside that strategy).
+
+namespace jxl {
+
+static constexpr int kGlobalScaleDenom = 1 << 16;
+static constexpr int kGlobalScaleNumerator = 4096;
+
+// zero-biases for quantizing channels X, Y, B
+static constexpr float kZeroBiasDefault[3] = {0.5f, 0.5f, 0.5f};
+
+// Returns adjusted version of a quantized integer, such that its value is
+// closer to the expected value of the original.
+// The residuals of AC coefficients that we quantize are not uniformly
+// distributed. Numerical experiments show that they have a distribution with
+// the "shape" of 1/(1+x^2) [up to some coefficients]. This means that the
+// expected value of a coefficient that gets quantized to x will not be x
+// itself, but (at least with reasonable approximation):
+// - 0 if x is 0
+// - x * biases[c] if x is 1 or -1
+// - x - biases[3]/x otherwise
+// This follows from computing the distribution of the quantization bias, which
+// can be approximated fairly well by <constant>/x when |x| is at least two.
+static constexpr float kBiasNumerator = 0.145f;
+
+static constexpr float kDefaultQuantBias[4] = {
+    1.0f - 0.05465007330715401f,
+    1.0f - 0.07005449891748593f,
+    1.0f - 0.049935103337343655f,
+    0.145f,
+};
+
+struct QuantizerParams;
+
+class Quantizer {
+ public:
+  explicit Quantizer(const DequantMatrices* dequant);
+  Quantizer(const DequantMatrices* dequant, int quant_dc, int global_scale);
+
+  static constexpr int32_t kQuantMax = 256;
+
+  static JXL_INLINE int32_t ClampVal(float val) {
+    return static_cast<int32_t>(
+        std::max(1.0f, std::min<float>(val, kQuantMax)));
+  }
+
+  float ScaleGlobalScale(const float scale) {
+    int new_global_scale = static_cast<int>(global_scale_ * scale + 0.5f);
+    float scale_out = new_global_scale * 1.0f / global_scale_;
+    global_scale_ = new_global_scale;
+    RecomputeFromGlobalScale();
+    return scale_out;
+  }
+
+  // Recomputes other derived fields after global_scale_ has changed.
+  void RecomputeFromGlobalScale() {
+    global_scale_float_ = global_scale_ * (1.0 / kGlobalScaleDenom);
+    inv_global_scale_ = 1.0 * kGlobalScaleDenom / global_scale_;
+    inv_quant_dc_ = inv_global_scale_ / quant_dc_;
+    for (size_t c = 0; c < 3; c++) {
+      mul_dc_[c] = GetDcStep(c);
+      inv_mul_dc_[c] = GetInvDcStep(c);
+    }
+  }
+
+  // Returns scaling factor such that Scale() * (RawDC() or RawQuantField())
+  // pixels yields the same float values returned by GetQuantField.
+  JXL_INLINE float Scale() const { return global_scale_float_; }
+
+  // Reciprocal of Scale().
+  JXL_INLINE float InvGlobalScale() const { return inv_global_scale_; }
+
+  void SetQuantFieldRect(const ImageF& qf, const Rect& rect,
+                         ImageI* JXL_RESTRICT raw_quant_field) const;
+
+  void SetQuantField(float quant_dc, const ImageF& qf,
+                     ImageI* JXL_RESTRICT raw_quant_field);
+
+  void SetQuant(float quant_dc, float quant_ac,
+                ImageI* JXL_RESTRICT raw_quant_field);
+
+  // Returns the DC quantization base value, which is currently global (not
+  // adaptive). The actual scale factor used to dequantize pixels in channel c
+  // is: inv_quant_dc() * dequant_->DCQuant(c).
+  float inv_quant_dc() const { return inv_quant_dc_; }
+
+  // Dequantize by multiplying with this times dequant_matrix.
+  float inv_quant_ac(int32_t quant) const { return inv_global_scale_ / quant; }
+
+  QuantizerParams GetParams() const;
+
+  Status Decode(BitReader* reader);
+
+  void DumpQuantizationMap(const ImageI& raw_quant_field) const;
+
+  JXL_INLINE const float* DequantMatrix(size_t quant_kind, size_t c) const {
+    return dequant_->Matrix(quant_kind, c);
+  }
+
+  JXL_INLINE const float* InvDequantMatrix(size_t quant_kind, size_t c) const {
+    return dequant_->InvMatrix(quant_kind, c);
+  }
+
+  // Calculates DC quantization step.
+  JXL_INLINE float GetDcStep(size_t c) const {
+    return inv_quant_dc_ * dequant_->DCQuant(c);
+  }
+  JXL_INLINE float GetInvDcStep(size_t c) const {
+    return dequant_->InvDCQuant(c) * (global_scale_float_ * quant_dc_);
+  }
+
+  JXL_INLINE const float* MulDC() const { return mul_dc_; }
+  JXL_INLINE const float* InvMulDC() const { return inv_mul_dc_; }
+
+  JXL_INLINE void ClearDCMul() {
+    std::fill(mul_dc_, mul_dc_ + 4, 1.f);
+    std::fill(inv_mul_dc_, inv_mul_dc_ + 4, 1.f);
+  }
+
+  void ComputeGlobalScaleAndQuant(float quant_dc, float quant_median,
+                                  float quant_median_absd);
+
+ private:
+  float mul_dc_[4];
+  float inv_mul_dc_[4];
+
+  // These are serialized:
+  int global_scale_;
+  int quant_dc_;
+
+  // These are derived from global_scale_:
+  float inv_global_scale_;
+  float global_scale_float_;  // reciprocal of inv_global_scale_
+  float inv_quant_dc_;
+
+  float zero_bias_[3];
+  const DequantMatrices* dequant_;
+};
+
+struct QuantizerParams : public Fields {
+  QuantizerParams() { Bundle::Init(this); }
+  JXL_FIELDS_NAME(QuantizerParams)
+
+  Status VisitFields(Visitor* JXL_RESTRICT visitor) override;
+
+  uint32_t global_scale;
+  uint32_t quant_dc;
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_QUANTIZER_H_
diff --git a/third_party/jpeg-xl/lib/jxl/quantizer_test.cc b/third_party/jpeg-xl/lib/jxl/quantizer_test.cc
new file mode 100644
index 0000000000..f9cf2c838e
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/quantizer_test.cc
@@ -0,0 +1,81 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/quantizer.h"
+
+#include "lib/jxl/base/span.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/enc_fields.h"
+#include "lib/jxl/image_ops.h"
+#include "lib/jxl/image_test_utils.h"
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+namespace {
+
+void TestEquivalence(int qxsize, int qysize, const Quantizer& quantizer1,
+                     const Quantizer& quantizer2) {
+  ASSERT_NEAR(quantizer1.inv_quant_dc(), quantizer2.inv_quant_dc(), 1e-7);
+}
+
+TEST(QuantizerTest, QuantizerParams) {
+  for (uint32_t i = 1; i < 10000; ++i) {
+    QuantizerParams p;
+    p.global_scale = i;
+    size_t extension_bits = 0, total_bits = 0;
+    EXPECT_TRUE(Bundle::CanEncode(p, &extension_bits, &total_bits));
+    EXPECT_EQ(0u, extension_bits);
+    EXPECT_GE(total_bits, 4u);
+  }
+}
+
+TEST(QuantizerTest, BitStreamRoundtripSameQuant) {
+  const int qxsize = 8;
+  const int qysize = 8;
+  DequantMatrices dequant;
+  Quantizer quantizer1(&dequant);
+  ImageI raw_quant_field(qxsize, qysize);
+  quantizer1.SetQuant(0.17f, 0.17f, &raw_quant_field);
+  BitWriter writer;
+  QuantizerParams params = quantizer1.GetParams();
+  EXPECT_TRUE(WriteQuantizerParams(params, &writer, 0, nullptr));
+  writer.ZeroPadToByte();
+  const size_t bits_written = writer.BitsWritten();
+  Quantizer quantizer2(&dequant);
+  BitReader reader(writer.GetSpan());
+  EXPECT_TRUE(quantizer2.Decode(&reader));
+  EXPECT_TRUE(reader.JumpToByteBoundary());
+  EXPECT_EQ(reader.TotalBitsConsumed(), bits_written);
+  EXPECT_TRUE(reader.Close());
+  TestEquivalence(qxsize, qysize, quantizer1, quantizer2);
+}
+
+TEST(QuantizerTest, BitStreamRoundtripRandomQuant) {
+  const int qxsize = 8;
+  const int qysize = 8;
+  DequantMatrices dequant;
+  Quantizer quantizer1(&dequant);
+  ImageI raw_quant_field(qxsize, qysize);
+  quantizer1.SetQuant(0.17f, 0.17f, &raw_quant_field);
+  float quant_dc = 0.17f;
+  ImageF qf(qxsize, qysize);
+  RandomFillImage(&qf, 0.0f, 1.0f);
+  quantizer1.SetQuantField(quant_dc, qf, &raw_quant_field);
+  BitWriter writer;
+  QuantizerParams params = quantizer1.GetParams();
+  EXPECT_TRUE(WriteQuantizerParams(params, &writer, 0, nullptr));
+  writer.ZeroPadToByte();
+  const size_t bits_written = writer.BitsWritten();
+  Quantizer quantizer2(&dequant);
+  BitReader reader(writer.GetSpan());
+  EXPECT_TRUE(quantizer2.Decode(&reader));
+  EXPECT_TRUE(reader.JumpToByteBoundary());
+  EXPECT_EQ(reader.TotalBitsConsumed(), bits_written);
+  EXPECT_TRUE(reader.Close());
+  TestEquivalence(qxsize, qysize, quantizer1, quantizer2);
+}
+}  // namespace
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/rational_polynomial-inl.h b/third_party/jpeg-xl/lib/jxl/rational_polynomial-inl.h
new file mode 100644
index 0000000000..176e24092c
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/rational_polynomial-inl.h
@@ -0,0 +1,98 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Fast SIMD evaluation of rational polynomials for approximating functions.
+
+#if defined(LIB_JXL_RATIONAL_POLYNOMIAL_INL_H_) == defined(HWY_TARGET_TOGGLE)
+#ifdef LIB_JXL_RATIONAL_POLYNOMIAL_INL_H_
+#undef LIB_JXL_RATIONAL_POLYNOMIAL_INL_H_
+#else
+#define LIB_JXL_RATIONAL_POLYNOMIAL_INL_H_
+#endif
+
+#include <stddef.h>
+
+#include <hwy/highway.h>
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+namespace {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Div;
+using hwy::HWY_NAMESPACE::MulAdd;
+
+// Primary template: default to actual division.
+template <typename T, class V>
+struct FastDivision {
+  HWY_INLINE V operator()(const V n, const V d) const { return n / d; }
+};
+// Partial specialization for float vectors.
+template <class V>
+struct FastDivision<float, V> {
+  // One Newton-Raphson iteration.
+  static HWY_INLINE V ReciprocalNR(const V x) {
+    const auto rcp = ApproximateReciprocal(x);
+    const auto sum = Add(rcp, rcp);
+    const auto x_rcp = Mul(x, rcp);
+    return NegMulAdd(x_rcp, rcp, sum);
+  }
+
+  V operator()(const V n, const V d) const {
+#if 1  // Faster on SKX
+    return Div(n, d);
+#else
+    return n * ReciprocalNR(d);
+#endif
+  }
+};
+
+// Approximates smooth functions via rational polynomials (i.e. dividing two
+// polynomials). Evaluates polynomials via Horner's scheme, which is faster than
+// Clenshaw recurrence for Chebyshev polynomials. LoadDup128 allows us to
+// specify constants (replicated 4x) independently of the lane count.
+template <size_t NP, size_t NQ, class D, class V, typename T>
+HWY_INLINE HWY_MAYBE_UNUSED V EvalRationalPolynomial(const D d, const V x,
+                                                     const T (&p)[NP],
+                                                     const T (&q)[NQ]) {
+  constexpr size_t kDegP = NP / 4 - 1;
+  constexpr size_t kDegQ = NQ / 4 - 1;
+  auto yp = LoadDup128(d, &p[kDegP * 4]);
+  auto yq = LoadDup128(d, &q[kDegQ * 4]);
+  // We use pointer arithmetic to refer to &p[(kDegP - n) * 4] to avoid a
+  // compiler warning that the index is out of bounds since we are already
+  // checking that it is not out of bounds with (kDegP >= n) and the access
+  // will be optimized away. Similarly with q and kDegQ.
+  HWY_FENCE;
+  if (kDegP >= 1) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 1) * 4)));
+  if (kDegQ >= 1) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 1) * 4)));
+  HWY_FENCE;
+  if (kDegP >= 2) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 2) * 4)));
+  if (kDegQ >= 2) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 2) * 4)));
+  HWY_FENCE;
+  if (kDegP >= 3) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 3) * 4)));
+  if (kDegQ >= 3) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 3) * 4)));
+  HWY_FENCE;
+  if (kDegP >= 4) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 4) * 4)));
+  if (kDegQ >= 4) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 4) * 4)));
+  HWY_FENCE;
+  if (kDegP >= 5) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 5) * 4)));
+  if (kDegQ >= 5) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 5) * 4)));
+  HWY_FENCE;
+  if (kDegP >= 6) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 6) * 4)));
+  if (kDegQ >= 6) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 6) * 4)));
+  HWY_FENCE;
+  if (kDegP >= 7) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 7) * 4)));
+  if (kDegQ >= 7) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 7) * 4)));
+
+  return FastDivision<T, V>()(yp, yq);
+}
+
+}  // namespace
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+#endif  // LIB_JXL_RATIONAL_POLYNOMIAL_INL_H_
diff --git a/third_party/jpeg-xl/lib/jxl/rational_polynomial_test.cc b/third_party/jpeg-xl/lib/jxl/rational_polynomial_test.cc
new file mode 100644
index 0000000000..13fc044a55
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/rational_polynomial_test.cc
@@ -0,0 +1,238 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <stdio.h>
+
+#include <cmath>
+#include <string>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/rational_polynomial_test.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+#include <hwy/tests/test_util-inl.h>
+
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/rational_polynomial-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+using T = float;  // required by EvalLog2
+using D = HWY_FULL(T);
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Add;
+using hwy::HWY_NAMESPACE::GetLane;
+using hwy::HWY_NAMESPACE::ShiftLeft;
+using hwy::HWY_NAMESPACE::ShiftRight;
+using hwy::HWY_NAMESPACE::Sub;
+
+// Generic: only computes polynomial
+struct EvalPoly {
+  template <size_t NP, size_t NQ>
+  T operator()(T x, const T (&p)[NP], const T (&q)[NQ]) const {
+    const HWY_FULL(T) d;
+    const auto vx = Set(d, x);
+    const auto approx = EvalRationalPolynomial(d, vx, p, q);
+    return GetLane(approx);
+  }
+};
+
+// Range reduction for log2
+struct EvalLog2 {
+  template <size_t NP, size_t NQ>
+  T operator()(T x, const T (&p)[NP], const T (&q)[NQ]) const {
+    const HWY_FULL(T) d;
+    auto vx = Set(d, x);
+
+    const HWY_FULL(int32_t) di;
+    const auto x_bits = BitCast(di, vx);
+    // Cannot handle negative numbers / NaN.
+    JXL_DASSERT(AllTrue(di, Eq(Abs(x_bits), x_bits)));
+
+    // Range reduction to [-1/3, 1/3] - 3 integer, 2 float ops
+    const auto exp_bits = Sub(x_bits, Set(di, 0x3f2aaaab));  // = 2/3
+    // Shifted exponent = log2; also used to clear mantissa.
+    const auto exp_shifted = ShiftRight<23>(exp_bits);
+    const auto mantissa = BitCast(d, Sub(x_bits, ShiftLeft<23>(exp_shifted)));
+    const auto exp_val = ConvertTo(d, exp_shifted);
+    vx = Sub(mantissa, Set(d, 1.0f));
+
+    const auto approx = Add(EvalRationalPolynomial(d, vx, p, q), exp_val);
+    return GetLane(approx);
+  }
+};
+
+// Functions to approximate:
+
+T LinearToSrgb8Direct(T val) {
+  if (val < 0.0) return 0.0;
+  if (val >= 255.0) return 255.0;
+  if (val <= 10.0 / 12.92) return val * 12.92;
+  return 255.0 * (std::pow(val / 255.0, 1.0 / 2.4) * 1.055 - 0.055);
+}
+
+T SimpleGamma(T v) {
+  static const T kGamma = 0.387494322593;
+  static const T limit = 43.01745241042018;
+  T bright = v - limit;
+  if (bright >= 0) {
+    static const T mul = 0.0383723643799;
+    v -= bright * mul;
+  }
+  static const T limit2 = 94.68634353321337;
+  T bright2 = v - limit2;
+  if (bright2 >= 0) {
+    static const T mul = 0.22885405968;
+    v -= bright2 * mul;
+  }
+  static const T offset = 0.156775786057;
+  static const T scale = 8.898059160493739;
+  T retval = scale * (offset + pow(v, kGamma));
+  return retval;
+}
+
+// Runs CaratheodoryFejer and verifies the polynomial using a lot of samples to
+// return the biggest error.
+template <size_t NP, size_t NQ, class Eval>
+T RunApproximation(T x0, T x1, const T (&p)[NP], const T (&q)[NQ],
+                   const Eval& eval, T func_to_approx(T)) {
+  float maxerr = 0;
+  T lastPrint = 0;
+  // NOLINTNEXTLINE(clang-analyzer-security.FloatLoopCounter)
+  for (T x = x0; x <= x1; x += (x1 - x0) / 10000.0) {
+    const T f = func_to_approx(x);
+    const T g = eval(x, p, q);
+    maxerr = std::max(fabsf(g - f), maxerr);
+    if (x == x0 || x - lastPrint > (x1 - x0) / 20.0) {
+      printf("x: %11.6f, f: %11.6f, g: %11.6f, e: %11.6f\n", x, f, g,
+             fabs(g - f));
+      lastPrint = x;
+    }
+  }
+  return maxerr;
+}
+
+void TestSimpleGamma() {
+  const T p[4 * (6 + 1)] = {
+      HWY_REP4(-5.0646949363741811E-05), HWY_REP4(6.7369380528439771E-05),
+      HWY_REP4(8.9376652530412794E-05),  HWY_REP4(2.1153513301520462E-06),
+      HWY_REP4(-6.9130322970386449E-08), HWY_REP4(3.9424752749293728E-10),
+      HWY_REP4(1.2360288207619576E-13)};
+
+  const T q[4 * (6 + 1)] = {
+      HWY_REP4(-6.6389733798591366E-06), HWY_REP4(1.3299859726565908E-05),
+      HWY_REP4(3.8538748358398873E-06),  HWY_REP4(-2.8707687262928236E-08),
+      HWY_REP4(-6.6897385800005434E-10), HWY_REP4(6.1428748869186003E-12),
+      HWY_REP4(-2.5475738169252870E-15)};
+
+  const T err = RunApproximation(0.77, 274.579999999999984, p, q, EvalPoly(),
+                                 SimpleGamma);
+  EXPECT_LT(err, 0.05);
+}
+
+void TestLinearToSrgb8Direct() {
+  const T p[4 * (5 + 1)] = {
+      HWY_REP4(-9.5357499040105154E-05), HWY_REP4(4.6761186249798248E-04),
+      HWY_REP4(2.5708174333943594E-04),  HWY_REP4(1.5250087770436082E-05),
+      HWY_REP4(1.1946768008931187E-07),  HWY_REP4(5.9916446295972850E-11)};
+
+  const T q[4 * (4 + 1)] = {
+      HWY_REP4(1.8932479758079768E-05), HWY_REP4(2.7312342474687321E-05),
+      HWY_REP4(4.3901204783327006E-06), HWY_REP4(1.0417787306920273E-07),
+      HWY_REP4(3.0084206762140419E-10)};
+
+  const T err =
+      RunApproximation(0.77, 255, p, q, EvalPoly(), LinearToSrgb8Direct);
+  EXPECT_LT(err, 0.05);
+}
+
+void TestExp() {
+  const T p[4 * (2 + 1)] = {HWY_REP4(9.6266879665530902E-01),
+                            HWY_REP4(4.8961265681586763E-01),
+                            HWY_REP4(8.2619259189548433E-02)};
+  const T q[4 * (2 + 1)] = {HWY_REP4(9.6259895571622622E-01),
+                            HWY_REP4(-4.7272457588933831E-01),
+                            HWY_REP4(7.4802088567547664E-02)};
+  const T err =
+      RunApproximation(-1, 1, p, q, EvalPoly(), [](T x) { return T(exp(x)); });
+  EXPECT_LT(err, 1E-4);
+}
+
+void TestNegExp() {
+  // 4,3 is the min required for monotonicity; max error in 0,10: 751 ppm
+  // no benefit for k>50.
+  const T p[4 * (4 + 1)] = {
+      HWY_REP4(5.9580258551150123E-02), HWY_REP4(-2.5073728806886408E-02),
+      HWY_REP4(4.1561830213689248E-03), HWY_REP4(-3.1815408488900372E-04),
+      HWY_REP4(9.3866690094906802E-06)};
+  const T q[4 * (3 + 1)] = {
+      HWY_REP4(5.9579108238812878E-02), HWY_REP4(3.4542074345478582E-02),
+      HWY_REP4(8.7263562483501714E-03), HWY_REP4(1.4095109143061216E-03)};
+
+  const T err =
+      RunApproximation(0, 10, p, q, EvalPoly(), [](T x) { return T(exp(-x)); });
+  EXPECT_LT(err, sizeof(T) == 8 ? 2E-5 : 3E-5);
+}
+
+void TestSin() {
+  const T p[4 * (6 + 1)] = {
+      HWY_REP4(1.5518122109203780E-05),  HWY_REP4(2.3388958643675966E+00),
+      HWY_REP4(-8.6705520940849157E-01), HWY_REP4(-1.9702294764873535E-01),
+      HWY_REP4(1.2193404314472320E-01),  HWY_REP4(-1.7373966109788839E-02),
+      HWY_REP4(7.8829435883034796E-04)};
+  const T q[4 * (5 + 1)] = {
+      HWY_REP4(2.3394371422557279E+00), HWY_REP4(-8.7028221081288615E-01),
+      HWY_REP4(2.0052872219658430E-01), HWY_REP4(-3.2460335995264836E-02),
+      HWY_REP4(3.1546157932479282E-03), HWY_REP4(-1.6692542019380155E-04)};
+
+  const T err = RunApproximation(0, Pi<T>(1) * 2, p, q, EvalPoly(),
+                                 [](T x) { return T(sin(x)); });
+  EXPECT_LT(err, sizeof(T) == 8 ? 5E-4 : 7E-4);
+}
+
+void TestLog() {
+  HWY_ALIGN const T p[4 * (2 + 1)] = {HWY_REP4(-1.8503833400518310E-06),
+                                      HWY_REP4(1.4287160470083755E+00),
+                                      HWY_REP4(7.4245873327820566E-01)};
+  HWY_ALIGN const T q[4 * (2 + 1)] = {HWY_REP4(9.9032814277590719E-01),
+                                      HWY_REP4(1.0096718572241148E+00),
+                                      HWY_REP4(1.7409343003366853E-01)};
+  const T err = RunApproximation(1E-6, 1000, p, q, EvalLog2(), std::log2);
+  printf("%E\n", err);
+}
+
+HWY_NOINLINE void TestRationalPolynomial() {
+  TestSimpleGamma();
+  TestLinearToSrgb8Direct();
+  TestExp();
+  TestNegExp();
+  TestSin();
+  TestLog();
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+class RationalPolynomialTest : public hwy::TestWithParamTarget {};
+HWY_TARGET_INSTANTIATE_TEST_SUITE_P(RationalPolynomialTest);
+
+HWY_EXPORT_AND_TEST_P(RationalPolynomialTest, TestSimpleGamma);
+HWY_EXPORT_AND_TEST_P(RationalPolynomialTest, TestLinearToSrgb8Direct);
+HWY_EXPORT_AND_TEST_P(RationalPolynomialTest, TestExp);
+HWY_EXPORT_AND_TEST_P(RationalPolynomialTest, TestNegExp);
+HWY_EXPORT_AND_TEST_P(RationalPolynomialTest, TestSin);
+HWY_EXPORT_AND_TEST_P(RationalPolynomialTest, TestLog);
+
+}  // namespace jxl
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/low_memory_render_pipeline.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/low_memory_render_pipeline.cc
new file mode 100644
index 0000000000..db60a458db
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/low_memory_render_pipeline.cc
@@ -0,0 +1,865 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/render_pipeline/low_memory_render_pipeline.h"
+
+#include <algorithm>
+#include <queue>
+#include <tuple>
+
+#include "lib/jxl/base/arch_macros.h"
+#include "lib/jxl/image_ops.h"
+
+namespace jxl {
+std::pair<size_t, size_t>
+LowMemoryRenderPipeline::ColorDimensionsToChannelDimensions(
+    std::pair<size_t, size_t> in, size_t c, size_t stage) const {
+  std::pair<size_t, size_t> ret;
+  std::pair<size_t, size_t> shift = channel_shifts_[stage][c];
+  ret.first =
+      ((in.first << base_color_shift_) + (1 << shift.first) - 1) >> shift.first;
+  ret.second = ((in.second << base_color_shift_) + (1 << shift.second) - 1) >>
+               shift.second;
+  return ret;
+}
+
+std::pair<size_t, size_t> LowMemoryRenderPipeline::BorderToStore(
+    size_t c) const {
+  auto ret = ColorDimensionsToChannelDimensions(group_border_, c, 0);
+  ret.first += padding_[0][c].first;
+  ret.second += padding_[0][c].second;
+  return ret;
+}
+
+void LowMemoryRenderPipeline::SaveBorders(size_t group_id, size_t c,
+                                          const ImageF& in) {
+  size_t gy = group_id / frame_dimensions_.xsize_groups;
+  size_t gx = group_id % frame_dimensions_.xsize_groups;
+  size_t hshift = channel_shifts_[0][c].first;
+  size_t vshift = channel_shifts_[0][c].second;
+  size_t x0 = gx * GroupInputXSize(c);
+  size_t x1 = std::min((gx + 1) * GroupInputXSize(c),
+                       DivCeil(frame_dimensions_.xsize_upsampled, 1 << hshift));
+  size_t y0 = gy * GroupInputYSize(c);
+  size_t y1 = std::min((gy + 1) * GroupInputYSize(c),
+                       DivCeil(frame_dimensions_.ysize_upsampled, 1 << vshift));
+
+  auto borders = BorderToStore(c);
+  size_t borderx_write = borders.first;
+  size_t bordery_write = borders.second;
+
+  if (gy > 0) {
+    Rect from(group_data_x_border_, group_data_y_border_, x1 - x0,
+              bordery_write);
+    Rect to(x0, (gy * 2 - 1) * bordery_write, x1 - x0, bordery_write);
+    CopyImageTo(from, in, to, &borders_horizontal_[c]);
+  }
+  if (gy + 1 < frame_dimensions_.ysize_groups) {
+    Rect from(group_data_x_border_,
+              group_data_y_border_ + y1 - y0 - bordery_write, x1 - x0,
+              bordery_write);
+    Rect to(x0, (gy * 2) * bordery_write, x1 - x0, bordery_write);
+    CopyImageTo(from, in, to, &borders_horizontal_[c]);
+  }
+  if (gx > 0) {
+    Rect from(group_data_x_border_, group_data_y_border_, borderx_write,
+              y1 - y0);
+    Rect to((gx * 2 - 1) * borderx_write, y0, borderx_write, y1 - y0);
+    CopyImageTo(from, in, to, &borders_vertical_[c]);
+  }
+  if (gx + 1 < frame_dimensions_.xsize_groups) {
+    Rect from(group_data_x_border_ + x1 - x0 - borderx_write,
+              group_data_y_border_, borderx_write, y1 - y0);
+    Rect to((gx * 2) * borderx_write, y0, borderx_write, y1 - y0);
+    CopyImageTo(from, in, to, &borders_vertical_[c]);
+  }
+}
+
+void LowMemoryRenderPipeline::LoadBorders(size_t group_id, size_t c,
+                                          const Rect& r, ImageF* out) {
+  size_t gy = group_id / frame_dimensions_.xsize_groups;
+  size_t gx = group_id % frame_dimensions_.xsize_groups;
+  size_t hshift = channel_shifts_[0][c].first;
+  size_t vshift = channel_shifts_[0][c].second;
+  // Coordinates of the group in the image.
+  size_t x0 = gx * GroupInputXSize(c);
+  size_t x1 = std::min((gx + 1) * GroupInputXSize(c),
+                       DivCeil(frame_dimensions_.xsize_upsampled, 1 << hshift));
+  size_t y0 = gy * GroupInputYSize(c);
+  size_t y1 = std::min((gy + 1) * GroupInputYSize(c),
+                       DivCeil(frame_dimensions_.ysize_upsampled, 1 << vshift));
+
+  size_t paddingx = padding_[0][c].first;
+  size_t paddingy = padding_[0][c].second;
+
+  auto borders = BorderToStore(c);
+  size_t borderx_write = borders.first;
+  size_t bordery_write = borders.second;
+
+  // Limits of the area to copy from, in image coordinates.
+  JXL_DASSERT(r.x0() == 0 || (r.x0() << base_color_shift_) >= paddingx);
+  size_t x0src = DivCeil(r.x0() << base_color_shift_, 1 << hshift);
+  if (x0src != 0) {
+    x0src -= paddingx;
+  }
+  // r may be such that r.x1 (namely x0() + xsize()) is within paddingx of the
+  // right side of the image, so we use min() here.
+  size_t x1src =
+      DivCeil((r.x0() + r.xsize()) << base_color_shift_, 1 << hshift);
+  x1src = std::min(x1src + paddingx,
+                   DivCeil(frame_dimensions_.xsize_upsampled, 1 << hshift));
+
+  // Similar computation for y.
+  JXL_DASSERT(r.y0() == 0 || (r.y0() << base_color_shift_) >= paddingy);
+  size_t y0src = DivCeil(r.y0() << base_color_shift_, 1 << vshift);
+  if (y0src != 0) {
+    y0src -= paddingy;
+  }
+  size_t y1src =
+      DivCeil((r.y0() + r.ysize()) << base_color_shift_, 1 << vshift);
+  y1src = std::min(y1src + paddingy,
+                   DivCeil(frame_dimensions_.ysize_upsampled, 1 << vshift));
+
+  // Copy other groups' borders from the border storage.
+  if (y0src < y0) {
+    JXL_DASSERT(gy > 0);
+    CopyImageTo(
+        Rect(x0src, (gy * 2 - 2) * bordery_write, x1src - x0src, bordery_write),
+        borders_horizontal_[c],
+        Rect(group_data_x_border_ + x0src - x0,
+             group_data_y_border_ - bordery_write, x1src - x0src,
+             bordery_write),
+        out);
+  }
+  if (y1src > y1) {
+    // When copying the bottom border we must not be on the bottom groups.
+    JXL_DASSERT(gy + 1 < frame_dimensions_.ysize_groups);
+    CopyImageTo(
+        Rect(x0src, (gy * 2 + 1) * bordery_write, x1src - x0src, bordery_write),
+        borders_horizontal_[c],
+        Rect(group_data_x_border_ + x0src - x0, group_data_y_border_ + y1 - y0,
+             x1src - x0src, bordery_write),
+        out);
+  }
+  if (x0src < x0) {
+    JXL_DASSERT(gx > 0);
+    CopyImageTo(
+        Rect((gx * 2 - 2) * borderx_write, y0src, borderx_write, y1src - y0src),
+        borders_vertical_[c],
+        Rect(group_data_x_border_ - borderx_write,
+             group_data_y_border_ + y0src - y0, borderx_write, y1src - y0src),
+        out);
+  }
+  if (x1src > x1) {
+    // When copying the right border we must not be on the rightmost groups.
+    JXL_DASSERT(gx + 1 < frame_dimensions_.xsize_groups);
+    CopyImageTo(
+        Rect((gx * 2 + 1) * borderx_write, y0src, borderx_write, y1src - y0src),
+        borders_vertical_[c],
+        Rect(group_data_x_border_ + x1 - x0, group_data_y_border_ + y0src - y0,
+             borderx_write, y1src - y0src),
+        out);
+  }
+}
+
+size_t LowMemoryRenderPipeline::GroupInputXSize(size_t c) const {
+  return (frame_dimensions_.group_dim << base_color_shift_) >>
+         channel_shifts_[0][c].first;
+}
+
+size_t LowMemoryRenderPipeline::GroupInputYSize(size_t c) const {
+  return (frame_dimensions_.group_dim << base_color_shift_) >>
+         channel_shifts_[0][c].second;
+}
+
+void LowMemoryRenderPipeline::EnsureBordersStorage() {
+  const auto& shifts = channel_shifts_[0];
+  if (borders_horizontal_.size() < shifts.size()) {
+    borders_horizontal_.resize(shifts.size());
+    borders_vertical_.resize(shifts.size());
+  }
+  for (size_t c = 0; c < shifts.size(); c++) {
+    auto borders = BorderToStore(c);
+    size_t borderx = borders.first;
+    size_t bordery = borders.second;
+    JXL_DASSERT(frame_dimensions_.xsize_groups > 0);
+    size_t num_xborders = (frame_dimensions_.xsize_groups - 1) * 2;
+    JXL_DASSERT(frame_dimensions_.ysize_groups > 0);
+    size_t num_yborders = (frame_dimensions_.ysize_groups - 1) * 2;
+    size_t downsampled_xsize =
+        DivCeil(frame_dimensions_.xsize_upsampled_padded, 1 << shifts[c].first);
+    size_t downsampled_ysize = DivCeil(frame_dimensions_.ysize_upsampled_padded,
+                                       1 << shifts[c].second);
+    Rect horizontal = Rect(0, 0, downsampled_xsize, bordery * num_yborders);
+    if (!SameSize(horizontal, borders_horizontal_[c])) {
+      borders_horizontal_[c] = ImageF(horizontal.xsize(), horizontal.ysize());
+    }
+    Rect vertical = Rect(0, 0, borderx * num_xborders, downsampled_ysize);
+    if (!SameSize(vertical, borders_vertical_[c])) {
+      borders_vertical_[c] = ImageF(vertical.xsize(), vertical.ysize());
+    }
+  }
+}
+
+void LowMemoryRenderPipeline::Init() {
+  group_border_ = {0, 0};
+  base_color_shift_ = CeilLog2Nonzero(frame_dimensions_.xsize_upsampled_padded /
+                                      frame_dimensions_.xsize_padded);
+
+  const auto& shifts = channel_shifts_[0];
+
+  // Ensure that each channel has enough many border pixels.
+  for (size_t c = 0; c < shifts.size(); c++) {
+    group_border_.first =
+        std::max(group_border_.first,
+                 DivCeil(padding_[0][c].first << channel_shifts_[0][c].first,
+                         1 << base_color_shift_));
+    group_border_.second =
+        std::max(group_border_.second,
+                 DivCeil(padding_[0][c].second << channel_shifts_[0][c].second,
+                         1 << base_color_shift_));
+  }
+
+  // Ensure that all channels have an integer number of border pixels in the
+  // input.
+  for (size_t c = 0; c < shifts.size(); c++) {
+    if (channel_shifts_[0][c].first >= base_color_shift_) {
+      group_border_.first =
+          RoundUpTo(group_border_.first,
+                    1 << (channel_shifts_[0][c].first - base_color_shift_));
+    }
+    if (channel_shifts_[0][c].second >= base_color_shift_) {
+      group_border_.second =
+          RoundUpTo(group_border_.second,
+                    1 << (channel_shifts_[0][c].second - base_color_shift_));
+    }
+  }
+  // Ensure that the X border on color channels is a multiple of kBlockDim or
+  // the vector size (required for EPF stages). Vectors on ARM NEON are never
+  // wider than 4 floats, so rounding to multiples of 4 is enough.
+#if JXL_ARCH_ARM
+  constexpr size_t kGroupXAlign = 4;
+#else
+  constexpr size_t kGroupXAlign = 16;
+#endif
+  group_border_.first = RoundUpTo(group_border_.first, kGroupXAlign);
+  // Allocate borders in group images that are just enough for storing the
+  // borders to be copied in, plus any rounding to ensure alignment.
+  std::pair<size_t, size_t> max_border = {0, 0};
+  for (size_t c = 0; c < shifts.size(); c++) {
+    max_border.first = std::max(BorderToStore(c).first, max_border.first);
+    max_border.second = std::max(BorderToStore(c).second, max_border.second);
+  }
+  group_data_x_border_ = RoundUpTo(max_border.first, kGroupXAlign);
+  group_data_y_border_ = max_border.second;
+
+  EnsureBordersStorage();
+  group_border_assigner_.Init(frame_dimensions_);
+
+  for (first_trailing_stage_ = stages_.size(); first_trailing_stage_ > 0;
+       first_trailing_stage_--) {
+    bool has_inout_c = false;
+    for (size_t c = 0; c < shifts.size(); c++) {
+      if (stages_[first_trailing_stage_ - 1]->GetChannelMode(c) ==
+          RenderPipelineChannelMode::kInOut) {
+        has_inout_c = true;
+      }
+    }
+    if (has_inout_c) {
+      break;
+    }
+  }
+
+  first_image_dim_stage_ = stages_.size();
+  for (size_t i = 0; i < stages_.size(); i++) {
+    std::vector<std::pair<size_t, size_t>> input_sizes(shifts.size());
+    for (size_t c = 0; c < shifts.size(); c++) {
+      input_sizes[c] =
+          std::make_pair(DivCeil(frame_dimensions_.xsize_upsampled,
+                                 1 << channel_shifts_[i][c].first),
+                         DivCeil(frame_dimensions_.ysize_upsampled,
+                                 1 << channel_shifts_[i][c].second));
+    }
+    stages_[i]->SetInputSizes(input_sizes);
+    if (stages_[i]->SwitchToImageDimensions()) {
+      // We don't allow kInOut after switching to image dimensions.
+      JXL_ASSERT(i >= first_trailing_stage_);
+      first_image_dim_stage_ = i + 1;
+      stages_[i]->GetImageDimensions(&full_image_xsize_, &full_image_ysize_,
+                                     &frame_origin_);
+      break;
+    }
+  }
+  for (size_t i = first_image_dim_stage_; i < stages_.size(); i++) {
+    if (stages_[i]->SwitchToImageDimensions()) {
+      JXL_ABORT("Cannot switch to image dimensions multiple times");
+    }
+    std::vector<std::pair<size_t, size_t>> input_sizes(shifts.size());
+    for (size_t c = 0; c < shifts.size(); c++) {
+      input_sizes[c] = {full_image_xsize_, full_image_ysize_};
+    }
+    stages_[i]->SetInputSizes(input_sizes);
+  }
+
+  anyc_.resize(stages_.size());
+  for (size_t i = 0; i < stages_.size(); i++) {
+    for (size_t c = 0; c < shifts.size(); c++) {
+      if (stages_[i]->GetChannelMode(c) !=
+          RenderPipelineChannelMode::kIgnored) {
+        anyc_[i] = c;
+      }
+    }
+  }
+
+  stage_input_for_channel_ = std::vector<std::vector<int32_t>>(
+      stages_.size(), std::vector<int32_t>(shifts.size()));
+  for (size_t c = 0; c < shifts.size(); c++) {
+    int input = -1;
+    for (size_t i = 0; i < stages_.size(); i++) {
+      stage_input_for_channel_[i][c] = input;
+      if (stages_[i]->GetChannelMode(c) == RenderPipelineChannelMode::kInOut) {
+        input = i;
+      }
+    }
+  }
+
+  image_rect_.resize(stages_.size());
+  for (size_t i = 0; i < stages_.size(); i++) {
+    size_t x1 = DivCeil(frame_dimensions_.xsize_upsampled,
+                        1 << channel_shifts_[i][anyc_[i]].first);
+    size_t y1 = DivCeil(frame_dimensions_.ysize_upsampled,
+                        1 << channel_shifts_[i][anyc_[i]].second);
+    image_rect_[i] = Rect(0, 0, x1, y1);
+  }
+
+  virtual_ypadding_for_output_.resize(stages_.size());
+  xpadding_for_output_.resize(stages_.size());
+  for (size_t c = 0; c < shifts.size(); c++) {
+    int ypad = 0;
+    int xpad = 0;
+    for (size_t i = stages_.size(); i-- > 0;) {
+      if (stages_[i]->GetChannelMode(c) !=
+          RenderPipelineChannelMode::kIgnored) {
+        virtual_ypadding_for_output_[i] =
+            std::max(ypad, virtual_ypadding_for_output_[i]);
+        xpadding_for_output_[i] = std::max(xpad, xpadding_for_output_[i]);
+      }
+      if (stages_[i]->GetChannelMode(c) == RenderPipelineChannelMode::kInOut) {
+        ypad = (DivCeil(ypad, 1 << channel_shifts_[i][c].second) +
+                stages_[i]->settings_.border_y)
+               << channel_shifts_[i][c].second;
+        xpad = DivCeil(xpad, 1 << stages_[i]->settings_.shift_x) +
+               stages_[i]->settings_.border_x;
+      }
+    }
+  }
+}
+
+void LowMemoryRenderPipeline::PrepareForThreadsInternal(size_t num,
+                                                        bool use_group_ids) {
+  const auto& shifts = channel_shifts_[0];
+
+  use_group_ids_ = use_group_ids;
+  size_t num_buffers = use_group_ids_ ? frame_dimensions_.num_groups : num;
+  for (size_t t = group_data_.size(); t < num_buffers; t++) {
+    group_data_.emplace_back();
+    group_data_[t].resize(shifts.size());
+    for (size_t c = 0; c < shifts.size(); c++) {
+      group_data_[t][c] = ImageF(GroupInputXSize(c) + group_data_x_border_ * 2,
+                                 GroupInputYSize(c) + group_data_y_border_ * 2);
+    }
+  }
+  // TODO(veluca): avoid reallocating buffers if not needed.
+  stage_data_.resize(num);
+  size_t upsampling = 1u << base_color_shift_;
+  size_t group_dim = frame_dimensions_.group_dim * upsampling;
+  size_t padding =
+      2 * group_data_x_border_ * upsampling +  // maximum size of a rect
+      2 * kRenderPipelineXOffset;              // extra padding for processing
+  size_t stage_buffer_xsize = group_dim + padding;
+  for (size_t t = 0; t < num; t++) {
+    stage_data_[t].resize(shifts.size());
+    for (size_t c = 0; c < shifts.size(); c++) {
+      stage_data_[t][c].resize(stages_.size());
+      size_t next_y_border = 0;
+      for (size_t i = stages_.size(); i-- > 0;) {
+        if (stages_[i]->GetChannelMode(c) ==
+            RenderPipelineChannelMode::kInOut) {
+          size_t stage_buffer_ysize =
+              2 * next_y_border + (1 << stages_[i]->settings_.shift_y);
+          stage_buffer_ysize = 1 << CeilLog2Nonzero(stage_buffer_ysize);
+          next_y_border = stages_[i]->settings_.border_y;
+          stage_data_[t][c][i] = ImageF(stage_buffer_xsize, stage_buffer_ysize);
+        }
+      }
+    }
+  }
+  if (first_image_dim_stage_ != stages_.size()) {
+    RectT<ssize_t> image_rect(0, 0, frame_dimensions_.xsize_upsampled,
+                              frame_dimensions_.ysize_upsampled);
+    RectT<ssize_t> full_image_rect(0, 0, full_image_xsize_, full_image_ysize_);
+    image_rect = image_rect.Translate(frame_origin_.x0, frame_origin_.y0);
+    image_rect = image_rect.Intersection(full_image_rect);
+    if (image_rect.xsize() == 0 || image_rect.ysize() == 0) {
+      image_rect = RectT<ssize_t>(0, 0, 0, 0);
+    }
+    size_t left_padding = image_rect.x0();
+    size_t middle_padding = group_dim;
+    size_t right_padding = full_image_xsize_ - image_rect.x1();
+    size_t out_of_frame_xsize =
+        padding +
+        std::max(left_padding, std::max(middle_padding, right_padding));
+    out_of_frame_data_.resize(num);
+    for (size_t t = 0; t < num; t++) {
+      out_of_frame_data_[t] = ImageF(out_of_frame_xsize, shifts.size());
+    }
+  }
+}
+
+std::vector<std::pair<ImageF*, Rect>> LowMemoryRenderPipeline::PrepareBuffers(
+    size_t group_id, size_t thread_id) {
+  std::vector<std::pair<ImageF*, Rect>> ret(channel_shifts_[0].size());
+  const size_t gx = group_id % frame_dimensions_.xsize_groups;
+  const size_t gy = group_id / frame_dimensions_.xsize_groups;
+  for (size_t c = 0; c < channel_shifts_[0].size(); c++) {
+    ret[c].first = &group_data_[use_group_ids_ ? group_id : thread_id][c];
+    ret[c].second = Rect(group_data_x_border_, group_data_y_border_,
+                         GroupInputXSize(c), GroupInputYSize(c),
+                         DivCeil(frame_dimensions_.xsize_upsampled,
+                                 1 << channel_shifts_[0][c].first) -
+                             gx * GroupInputXSize(c) + group_data_x_border_,
+                         DivCeil(frame_dimensions_.ysize_upsampled,
+                                 1 << channel_shifts_[0][c].second) -
+                             gy * GroupInputYSize(c) + group_data_y_border_);
+  }
+  return ret;
+}
+
+namespace {
+
+JXL_INLINE int GetMirroredY(int y, ssize_t group_y0, ssize_t image_ysize) {
+  if (group_y0 == 0 && (y < 0 || y + group_y0 >= image_ysize)) {
+    return Mirror(y, image_ysize);
+  }
+  if (y + group_y0 >= image_ysize) {
+    // Here we know that the one mirroring step is sufficient.
+    return 2 * image_ysize - (y + group_y0) - 1 - group_y0;
+  }
+  return y;
+}
+
+JXL_INLINE void ApplyXMirroring(float* row, ssize_t borderx, ssize_t group_x0,
+                                ssize_t group_xsize, ssize_t image_xsize) {
+  if (image_xsize <= borderx) {
+    if (group_x0 == 0) {
+      for (ssize_t ix = 0; ix < borderx; ix++) {
+        row[kRenderPipelineXOffset - ix - 1] =
+            row[kRenderPipelineXOffset + Mirror(-ix - 1, image_xsize)];
+      }
+    }
+    if (group_xsize + borderx + group_x0 >= image_xsize) {
+      for (ssize_t ix = 0; ix < borderx; ix++) {
+        row[kRenderPipelineXOffset + image_xsize + ix - group_x0] =
+            row[kRenderPipelineXOffset + Mirror(image_xsize + ix, image_xsize) -
+                group_x0];
+      }
+    }
+  } else {
+    // Here we know that the one mirroring step is sufficient.
+    if (group_x0 == 0) {
+      for (ssize_t ix = 0; ix < borderx; ix++) {
+        row[kRenderPipelineXOffset - ix - 1] = row[kRenderPipelineXOffset + ix];
+      }
+    }
+    if (group_xsize + borderx + group_x0 >= image_xsize) {
+      for (ssize_t ix = 0; ix < borderx; ix++) {
+        row[kRenderPipelineXOffset + image_xsize - group_x0 + ix] =
+            row[kRenderPipelineXOffset + image_xsize - group_x0 - ix - 1];
+      }
+    }
+  }
+}
+
+// Information about where the *output* of each stage is stored.
+class Rows {
+ public:
+  Rows(const std::vector<std::unique_ptr<RenderPipelineStage>>& stages,
+       const Rect data_max_color_channel_rect, int group_data_x_border,
+       int group_data_y_border,
+       const std::vector<std::pair<size_t, size_t>>& group_data_shift,
+       size_t base_color_shift, std::vector<std::vector<ImageF>>& thread_data,
+       std::vector<ImageF>& input_data) {
+    size_t num_stages = stages.size();
+    size_t num_channels = input_data.size();
+
+    JXL_ASSERT(thread_data.size() == num_channels);
+    JXL_ASSERT(group_data_shift.size() == num_channels);
+
+#if JXL_ENABLE_ASSERT
+    for (const auto& td : thread_data) {
+      JXL_ASSERT(td.size() == num_stages);
+    }
+#endif
+
+    rows_.resize(num_stages + 1, std::vector<RowInfo>(num_channels));
+
+    for (size_t i = 0; i < num_stages; i++) {
+      for (size_t c = 0; c < input_data.size(); c++) {
+        if (stages[i]->GetChannelMode(c) == RenderPipelineChannelMode::kInOut) {
+          rows_[i + 1][c].ymod_minus_1 = thread_data[c][i].ysize() - 1;
+          rows_[i + 1][c].base_ptr = thread_data[c][i].Row(0);
+          rows_[i + 1][c].stride = thread_data[c][i].PixelsPerRow();
+        }
+      }
+    }
+
+    for (size_t c = 0; c < input_data.size(); c++) {
+      auto channel_group_data_rect =
+          data_max_color_channel_rect.As<ssize_t>()
+              .Translate(-group_data_x_border, -group_data_y_border)
+              .ShiftLeft(base_color_shift)
+              .CeilShiftRight(group_data_shift[c])
+              .Translate(group_data_x_border - ssize_t(kRenderPipelineXOffset),
+                         group_data_y_border);
+      rows_[0][c].base_ptr = channel_group_data_rect.Row(&input_data[c], 0);
+      rows_[0][c].stride = input_data[c].PixelsPerRow();
+      rows_[0][c].ymod_minus_1 = -1;
+    }
+  }
+
+  // Stage -1 refers to the input data; all other values must be nonnegative and
+  // refer to the data for the output of that stage.
+  JXL_INLINE float* GetBuffer(int stage, int y, size_t c) const {
+    JXL_DASSERT(stage >= -1);
+    const RowInfo& info = rows_[stage + 1][c];
+    return info.base_ptr + ssize_t(info.stride) * (y & info.ymod_minus_1);
+  }
+
+ private:
+  struct RowInfo {
+    // Pointer to beginning of the first row.
+    float* base_ptr;
+    // Modulo value for the y axis minus 1 (ymod is guaranteed to be a power of
+    // 2, which allows efficient mod computation by masking).
+    int ymod_minus_1;
+    // Number of floats per row.
+    size_t stride;
+  };
+  std::vector<std::vector<RowInfo>> rows_;
+};
+
+}  // namespace
+
+void LowMemoryRenderPipeline::RenderRect(size_t thread_id,
+                                         std::vector<ImageF>& input_data,
+                                         Rect data_max_color_channel_rect,
+                                         Rect image_max_color_channel_rect) {
+  // For each stage, the rect corresponding to the image area currently being
+  // processed, in the coordinates of that stage (i.e. with the scaling factor
+  // that that stage has).
+  std::vector<Rect> group_rect;
+  group_rect.resize(stages_.size());
+  Rect image_area_rect =
+      image_max_color_channel_rect.ShiftLeft(base_color_shift_)
+          .Crop(frame_dimensions_.xsize_upsampled,
+                frame_dimensions_.ysize_upsampled);
+  for (size_t i = 0; i < stages_.size(); i++) {
+    group_rect[i] =
+        image_area_rect.CeilShiftRight(channel_shifts_[i][anyc_[i]]);
+  }
+
+  ssize_t frame_x0 =
+      first_image_dim_stage_ == stages_.size() ? 0 : frame_origin_.x0;
+  ssize_t frame_y0 =
+      first_image_dim_stage_ == stages_.size() ? 0 : frame_origin_.y0;
+  size_t full_image_xsize = first_image_dim_stage_ == stages_.size()
+                                ? frame_dimensions_.xsize_upsampled
+                                : full_image_xsize_;
+  size_t full_image_ysize = first_image_dim_stage_ == stages_.size()
+                                ? frame_dimensions_.ysize_upsampled
+                                : full_image_ysize_;
+
+  // Compute actual x-axis bounds for the current image area in the context of
+  // the full image this frame is part of. As the left boundary may be negative,
+  // we also create the x_pixels_skip value, defined as follows:
+  // - both x_pixels_skip and full_image_x0 are >= 0, and at least one is 0;
+  // - full_image_x0 - x_pixels_skip is the position of the current frame area
+  //   in the full image.
+  ssize_t full_image_x0 = frame_x0 + image_area_rect.x0();
+  ssize_t x_pixels_skip = 0;
+  if (full_image_x0 < 0) {
+    x_pixels_skip = -full_image_x0;
+    full_image_x0 = 0;
+  }
+  ssize_t full_image_x1 = frame_x0 + image_area_rect.x1();
+  full_image_x1 = std::min<ssize_t>(full_image_x1, full_image_xsize);
+
+  // If the current image area is entirely outside of the visible image, there
+  // is no point in proceeding. Note: this uses the assumption that if there is
+  // a stage with observable effects (i.e. a kInput stage), it only appears
+  // after the stage that switches to image dimensions.
+  if (full_image_x1 <= full_image_x0) return;
+
+  // Data structures to hold information about input/output rows and their
+  // buffers.
+  Rows rows(stages_, data_max_color_channel_rect, group_data_x_border_,
+            group_data_y_border_, channel_shifts_[0], base_color_shift_,
+            stage_data_[thread_id], input_data);
+
+  std::vector<RenderPipelineStage::RowInfo> input_rows(first_trailing_stage_ +
+                                                       1);
+  for (size_t i = 0; i < first_trailing_stage_; i++) {
+    input_rows[i].resize(input_data.size());
+  }
+  input_rows[first_trailing_stage_].resize(input_data.size(),
+                                           std::vector<float*>(1));
+
+  // Maximum possible shift is 3.
+  RenderPipelineStage::RowInfo output_rows(input_data.size(),
+                                           std::vector<float*>(8));
+
+  // Fills in input_rows and output_rows for a given y value (relative to the
+  // start of the group, measured in actual pixels at the appropriate vertical
+  // scaling factor) and a given stage, applying mirroring if necessary. This
+  // function is somewhat inefficient for trailing kInOut or kInput stages,
+  // where just filling the input row once ought to be sufficient.
+  auto prepare_io_rows = [&](int y, size_t i) {
+    ssize_t bordery = stages_[i]->settings_.border_y;
+    size_t shifty = stages_[i]->settings_.shift_y;
+    auto make_row = [&](size_t c, ssize_t iy) {
+      size_t mirrored_y = GetMirroredY(y + iy - bordery, group_rect[i].y0(),
+                                       image_rect_[i].ysize());
+      input_rows[i][c][iy] =
+          rows.GetBuffer(stage_input_for_channel_[i][c], mirrored_y, c);
+      ApplyXMirroring(input_rows[i][c][iy], stages_[i]->settings_.border_x,
+                      group_rect[i].x0(), group_rect[i].xsize(),
+                      image_rect_[i].xsize());
+    };
+    for (size_t c = 0; c < input_data.size(); c++) {
+      RenderPipelineChannelMode mode = stages_[i]->GetChannelMode(c);
+      if (mode == RenderPipelineChannelMode::kIgnored) {
+        continue;
+      }
+      // If we already have rows from a previous iteration, we can just shift
+      // the rows by 1 and insert the new one.
+      if (input_rows[i][c].size() == 2 * size_t(bordery) + 1) {
+        for (ssize_t iy = 0; iy < 2 * bordery; iy++) {
+          input_rows[i][c][iy] = input_rows[i][c][iy + 1];
+        }
+        make_row(c, bordery * 2);
+      } else {
+        input_rows[i][c].resize(2 * bordery + 1);
+        for (ssize_t iy = 0; iy < 2 * bordery + 1; iy++) {
+          make_row(c, iy);
+        }
+      }
+
+      // If necessary, get the output buffers.
+      if (mode == RenderPipelineChannelMode::kInOut) {
+        for (size_t iy = 0; iy < (1u << shifty); iy++) {
+          output_rows[c][iy] = rows.GetBuffer(i, y * (1 << shifty) + iy, c);
+        }
+      }
+    }
+  };
+
+  // We pretend that every stage has a vertical shift of 0, i.e. it is as tall
+  // as the final image.
+  // We call each such row a "virtual" row, because it may or may not correspond
+  // to an actual row of the current processing stage; actual processing happens
+  // when vy % (1<<vshift) == 0.
+
+  int num_extra_rows = *std::max_element(virtual_ypadding_for_output_.begin(),
+                                         virtual_ypadding_for_output_.end());
+
+  for (int vy = -num_extra_rows;
+       vy < int(image_area_rect.ysize()) + num_extra_rows; vy++) {
+    for (size_t i = 0; i < first_trailing_stage_; i++) {
+      int stage_vy = vy - num_extra_rows + virtual_ypadding_for_output_[i];
+
+      if (stage_vy % (1 << channel_shifts_[i][anyc_[i]].second) != 0) {
+        continue;
+      }
+
+      if (stage_vy < -virtual_ypadding_for_output_[i]) {
+        continue;
+      }
+
+      int y = stage_vy >> channel_shifts_[i][anyc_[i]].second;
+
+      ssize_t image_y = ssize_t(group_rect[i].y0()) + y;
+      // Do not produce rows in out-of-bounds areas.
+      if (image_y < 0 || image_y >= ssize_t(image_rect_[i].ysize())) {
+        continue;
+      }
+
+      // Get the input/output rows and potentially apply mirroring to the input.
+      prepare_io_rows(y, i);
+
+      // Produce output rows.
+      stages_[i]->ProcessRow(input_rows[i], output_rows,
+                             xpadding_for_output_[i], group_rect[i].xsize(),
+                             group_rect[i].x0(), image_y, thread_id);
+    }
+
+    // Process trailing stages, i.e. the final set of non-kInOut stages; they
+    // all have the same input buffer and no need to use any mirroring.
+
+    int y = vy - num_extra_rows;
+
+    for (size_t c = 0; c < input_data.size(); c++) {
+      // Skip pixels that are not part of the actual final image area.
+      input_rows[first_trailing_stage_][c][0] =
+          rows.GetBuffer(stage_input_for_channel_[first_trailing_stage_][c], y,
+                         c) +
+          x_pixels_skip;
+    }
+
+    // Check that we are not outside of the bounds for the current rendering
+    // rect. Not doing so might result in overwriting some rows that have been
+    // written (or will be written) by other threads.
+    if (y < 0 || y >= ssize_t(image_area_rect.ysize())) {
+      continue;
+    }
+
+    // Avoid running pipeline stages on pixels that are outside the full image
+    // area. As trailing stages have no borders, this is a free optimization
+    // (and may be necessary for correctness, as some stages assume coordinates
+    // are within bounds).
+    ssize_t full_image_y = frame_y0 + image_area_rect.y0() + y;
+    if (full_image_y < 0 || full_image_y >= ssize_t(full_image_ysize)) {
+      continue;
+    }
+
+    for (size_t i = first_trailing_stage_; i < stages_.size(); i++) {
+      // Before the first_image_dim_stage_, coordinates are relative to the
+      // current frame.
+      size_t x0 =
+          i < first_image_dim_stage_ ? full_image_x0 - frame_x0 : full_image_x0;
+      size_t y =
+          i < first_image_dim_stage_ ? full_image_y - frame_y0 : full_image_y;
+      stages_[i]->ProcessRow(input_rows[first_trailing_stage_], output_rows,
+                             /*xextra=*/0, full_image_x1 - full_image_x0, x0, y,
+                             thread_id);
+    }
+  }
+}
+
+void LowMemoryRenderPipeline::RenderPadding(size_t thread_id, Rect rect) {
+  if (rect.xsize() == 0) return;
+  size_t numc = channel_shifts_[0].size();
+  RenderPipelineStage::RowInfo input_rows(numc, std::vector<float*>(1));
+  RenderPipelineStage::RowInfo output_rows;
+
+  for (size_t c = 0; c < numc; c++) {
+    input_rows[c][0] = out_of_frame_data_[thread_id].Row(c);
+  }
+
+  for (size_t y = 0; y < rect.ysize(); y++) {
+    stages_[first_image_dim_stage_ - 1]->ProcessPaddingRow(
+        input_rows, rect.xsize(), rect.x0(), rect.y0() + y);
+    for (size_t i = first_image_dim_stage_; i < stages_.size(); i++) {
+      stages_[i]->ProcessRow(input_rows, output_rows,
+                             /*xextra=*/0, rect.xsize(), rect.x0(),
+                             rect.y0() + y, thread_id);
+    }
+  }
+}
+
+void LowMemoryRenderPipeline::ProcessBuffers(size_t group_id,
+                                             size_t thread_id) {
+  std::vector<ImageF>& input_data =
+      group_data_[use_group_ids_ ? group_id : thread_id];
+
+  // Copy the group borders to the border storage.
+  for (size_t c = 0; c < input_data.size(); c++) {
+    SaveBorders(group_id, c, input_data[c]);
+  }
+
+  size_t gy = group_id / frame_dimensions_.xsize_groups;
+  size_t gx = group_id % frame_dimensions_.xsize_groups;
+
+  if (first_image_dim_stage_ != stages_.size()) {
+    size_t group_dim = frame_dimensions_.group_dim << base_color_shift_;
+    RectT<ssize_t> group_rect(gx * group_dim, gy * group_dim, group_dim,
+                              group_dim);
+    RectT<ssize_t> image_rect(0, 0, frame_dimensions_.xsize_upsampled,
+                              frame_dimensions_.ysize_upsampled);
+    RectT<ssize_t> full_image_rect(0, 0, full_image_xsize_, full_image_ysize_);
+    group_rect = group_rect.Translate(frame_origin_.x0, frame_origin_.y0);
+    image_rect = image_rect.Translate(frame_origin_.x0, frame_origin_.y0);
+    image_rect = image_rect.Intersection(full_image_rect);
+    group_rect = group_rect.Intersection(image_rect);
+    size_t x0 = group_rect.x0();
+    size_t y0 = group_rect.y0();
+    size_t x1 = group_rect.x1();
+    size_t y1 = group_rect.y1();
+    JXL_DEBUG_V(6,
+                "Rendering padding for full image rect %s "
+                "outside group rect %s",
+                Description(full_image_rect).c_str(),
+                Description(group_rect).c_str());
+
+    if (group_id == 0 && (image_rect.xsize() == 0 || image_rect.ysize() == 0)) {
+      // If this frame does not intersect with the full image, we have to
+      // initialize the whole image area with RenderPadding.
+      RenderPadding(thread_id,
+                    Rect(0, 0, full_image_xsize_, full_image_ysize_));
+    }
+
+    // Render padding for groups that intersect with the full image. The case
+    // where no groups intersect was handled above.
+    if (group_rect.xsize() > 0 && group_rect.ysize() > 0) {
+      if (gx == 0 && gy == 0) {
+        RenderPadding(thread_id, Rect(0, 0, x0, y0));
+      }
+      if (gy == 0) {
+        RenderPadding(thread_id, Rect(x0, 0, x1 - x0, y0));
+      }
+      if (gx == 0) {
+        RenderPadding(thread_id, Rect(0, y0, x0, y1 - y0));
+      }
+      if (gx == 0 && gy + 1 == frame_dimensions_.ysize_groups) {
+        RenderPadding(thread_id, Rect(0, y1, x0, full_image_ysize_ - y1));
+      }
+      if (gy + 1 == frame_dimensions_.ysize_groups) {
+        RenderPadding(thread_id, Rect(x0, y1, x1 - x0, full_image_ysize_ - y1));
+      }
+      if (gy == 0 && gx + 1 == frame_dimensions_.xsize_groups) {
+        RenderPadding(thread_id, Rect(x1, 0, full_image_xsize_ - x1, y0));
+      }
+      if (gx + 1 == frame_dimensions_.xsize_groups) {
+        RenderPadding(thread_id, Rect(x1, y0, full_image_xsize_ - x1, y1 - y0));
+      }
+      if (gy + 1 == frame_dimensions_.ysize_groups &&
+          gx + 1 == frame_dimensions_.xsize_groups) {
+        RenderPadding(thread_id, Rect(x1, y1, full_image_xsize_ - x1,
+                                      full_image_ysize_ - y1));
+      }
+    }
+  }
+
+  Rect ready_rects[GroupBorderAssigner::kMaxToFinalize];
+  size_t num_ready_rects = 0;
+  group_border_assigner_.GroupDone(group_id, group_border_.first,
+                                   group_border_.second, ready_rects,
+                                   &num_ready_rects);
+  for (size_t i = 0; i < num_ready_rects; i++) {
+    const Rect& image_max_color_channel_rect = ready_rects[i];
+    for (size_t c = 0; c < input_data.size(); c++) {
+      LoadBorders(group_id, c, image_max_color_channel_rect, &input_data[c]);
+    }
+    Rect data_max_color_channel_rect(
+        group_data_x_border_ + image_max_color_channel_rect.x0() -
+            gx * frame_dimensions_.group_dim,
+        group_data_y_border_ + image_max_color_channel_rect.y0() -
+            gy * frame_dimensions_.group_dim,
+        image_max_color_channel_rect.xsize(),
+        image_max_color_channel_rect.ysize());
+    RenderRect(thread_id, input_data, data_max_color_channel_rect,
+               image_max_color_channel_rect);
+  }
+}
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/low_memory_render_pipeline.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/low_memory_render_pipeline.h
new file mode 100644
index 0000000000..b386f7c078
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/low_memory_render_pipeline.h
@@ -0,0 +1,111 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_RENDER_PIPELINE_LOW_MEMORY_RENDER_PIPELINE_H_
+#define LIB_JXL_RENDER_PIPELINE_LOW_MEMORY_RENDER_PIPELINE_H_
+
+#include <stdint.h>
+
+#include "lib/jxl/dec_group_border.h"
+#include "lib/jxl/render_pipeline/render_pipeline.h"
+
+namespace jxl {
+
+// A multithreaded, low-memory rendering pipeline that only allocates a minimal
+// amount of buffers.
+class LowMemoryRenderPipeline final : public RenderPipeline {
+ private:
+  std::vector<std::pair<ImageF*, Rect>> PrepareBuffers(
+      size_t group_id, size_t thread_id) override;
+
+  void PrepareForThreadsInternal(size_t num, bool use_group_ids) override;
+
+  void ProcessBuffers(size_t group_id, size_t thread_id) override;
+
+  void ClearDone(size_t i) override { group_border_assigner_.ClearDone(i); }
+
+  void Init() override;
+
+  void EnsureBordersStorage();
+  size_t GroupInputXSize(size_t c) const;
+  size_t GroupInputYSize(size_t c) const;
+  void RenderRect(size_t thread_id, std::vector<ImageF>& input_data,
+                  Rect data_max_color_channel_rect,
+                  Rect image_max_color_channel_rect);
+  void RenderPadding(size_t thread_id, Rect rect);
+
+  void SaveBorders(size_t group_id, size_t c, const ImageF& in);
+  void LoadBorders(size_t group_id, size_t c, const Rect& r, ImageF* out);
+
+  std::pair<size_t, size_t> ColorDimensionsToChannelDimensions(
+      std::pair<size_t, size_t> in, size_t c, size_t stage) const;
+
+  std::pair<size_t, size_t> BorderToStore(size_t c) const;
+
+  bool use_group_ids_;
+
+  // Storage for borders between groups. Borders of adjacent groups are stacked
+  // together, e.g. bottom border of current group is followed by top border
+  // of next group.
+  std::vector<ImageF> borders_horizontal_;
+  std::vector<ImageF> borders_vertical_;
+
+  // Manages the status of borders.
+  GroupBorderAssigner group_border_assigner_;
+
+  // Size (in color-channel-pixels) of the border around each group that might
+  // be assigned to that group.
+  std::pair<size_t, size_t> group_border_;
+  // base_color_shift_ defines the size of groups in terms of final image
+  // pixels.
+  size_t base_color_shift_;
+
+  // Buffer for decoded pixel data for a group, indexed by [thread][channel] or
+  // [group][channel] depending on `use_group_ids_`.
+  std::vector<std::vector<ImageF>> group_data_;
+
+  // Borders for storing group data.
+  size_t group_data_x_border_;
+  size_t group_data_y_border_;
+
+  // Buffers for intermediate rows for the various stages, indexed by
+  // [thread][channel][stage].
+  std::vector<std::vector<std::vector<ImageF>>> stage_data_;
+
+  // Buffers for out-of-frame data, indexed by [thread]; every row is a
+  // different channel.
+  std::vector<ImageF> out_of_frame_data_;
+
+  // For each stage, a non-kIgnored channel.
+  std::vector<int32_t> anyc_;
+
+  // Size of the image at each stage.
+  std::vector<Rect> image_rect_;
+
+  // For each stage, for each channel, keep track of the kInOut stage that
+  // produced the input to that stage (which corresponds to the buffer index
+  // containing the data). -1 if data comes from the original input.
+  std::vector<std::vector<int32_t>> stage_input_for_channel_;
+
+  // Number of (virtual) extra rows that must be processed at each stage
+  // to produce sufficient output for future stages.
+  std::vector<int> virtual_ypadding_for_output_;
+
+  // Same thing for columns, except these are real columns and not virtual ones.
+  std::vector<int> xpadding_for_output_;
+
+  // First stage that doesn't have any kInOut channel.
+  size_t first_trailing_stage_;
+
+  // Origin and size of the frame after switching to image dimensions.
+  FrameOrigin frame_origin_;
+  size_t full_image_xsize_;
+  size_t full_image_ysize_;
+  size_t first_image_dim_stage_;
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_RENDER_PIPELINE_LOW_MEMORY_RENDER_PIPELINE_H_
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline.cc
new file mode 100644
index 0000000000..68b6ef613f
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline.cc
@@ -0,0 +1,132 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/render_pipeline/render_pipeline.h"
+
+#include <algorithm>
+
+#include "lib/jxl/render_pipeline/low_memory_render_pipeline.h"
+#include "lib/jxl/render_pipeline/simple_render_pipeline.h"
+#include "lib/jxl/sanitizers.h"
+
+namespace jxl {
+
+void RenderPipeline::Builder::AddStage(
+    std::unique_ptr<RenderPipelineStage> stage) {
+  stages_.push_back(std::move(stage));
+}
+
+std::unique_ptr<RenderPipeline> RenderPipeline::Builder::Finalize(
+    FrameDimensions frame_dimensions) && {
+#if JXL_ENABLE_ASSERT
+  // Check that the last stage is not an kInOut stage for any channel, and that
+  // there is at least one stage.
+  JXL_ASSERT(!stages_.empty());
+  for (size_t c = 0; c < num_c_; c++) {
+    JXL_ASSERT(stages_.back()->GetChannelMode(c) !=
+               RenderPipelineChannelMode::kInOut);
+  }
+#endif
+
+  std::unique_ptr<RenderPipeline> res;
+  if (use_simple_implementation_) {
+    res = jxl::make_unique<SimpleRenderPipeline>();
+  } else {
+    res = jxl::make_unique<LowMemoryRenderPipeline>();
+  }
+
+  res->padding_.resize(stages_.size());
+  for (size_t i = stages_.size(); i-- > 0;) {
+    const auto& stage = stages_[i];
+    res->padding_[i].resize(num_c_);
+    if (i + 1 == stages_.size()) {
+      continue;
+    }
+    for (size_t c = 0; c < num_c_; c++) {
+      if (stage->GetChannelMode(c) == RenderPipelineChannelMode::kInOut) {
+        res->padding_[i][c].first = DivCeil(res->padding_[i + 1][c].first,
+                                            1 << stage->settings_.shift_x) +
+                                    stage->settings_.border_x;
+        res->padding_[i][c].second = DivCeil(res->padding_[i + 1][c].second,
+                                             1 << stage->settings_.shift_y) +
+                                     stage->settings_.border_y;
+      } else {
+        res->padding_[i][c] = res->padding_[i + 1][c];
+      }
+    }
+  }
+
+  res->frame_dimensions_ = frame_dimensions;
+  res->group_completed_passes_.resize(frame_dimensions.num_groups);
+  res->channel_shifts_.resize(stages_.size());
+  res->channel_shifts_[0].resize(num_c_);
+  for (size_t i = 1; i < stages_.size(); i++) {
+    auto& stage = stages_[i - 1];
+    for (size_t c = 0; c < num_c_; c++) {
+      if (stage->GetChannelMode(c) == RenderPipelineChannelMode::kInOut) {
+        res->channel_shifts_[0][c].first += stage->settings_.shift_x;
+        res->channel_shifts_[0][c].second += stage->settings_.shift_y;
+      }
+    }
+  }
+  for (size_t i = 1; i < stages_.size(); i++) {
+    auto& stage = stages_[i - 1];
+    res->channel_shifts_[i].resize(num_c_);
+    for (size_t c = 0; c < num_c_; c++) {
+      if (stage->GetChannelMode(c) == RenderPipelineChannelMode::kInOut) {
+        res->channel_shifts_[i][c].first =
+            res->channel_shifts_[i - 1][c].first - stage->settings_.shift_x;
+        res->channel_shifts_[i][c].second =
+            res->channel_shifts_[i - 1][c].second - stage->settings_.shift_y;
+      } else {
+        res->channel_shifts_[i][c].first = res->channel_shifts_[i - 1][c].first;
+        res->channel_shifts_[i][c].second =
+            res->channel_shifts_[i - 1][c].second;
+      }
+    }
+  }
+  res->stages_ = std::move(stages_);
+  res->Init();
+  return res;
+}
+
+RenderPipelineInput RenderPipeline::GetInputBuffers(size_t group_id,
+                                                    size_t thread_id) {
+  RenderPipelineInput ret;
+  JXL_DASSERT(group_id < group_completed_passes_.size());
+  ret.group_id_ = group_id;
+  ret.thread_id_ = thread_id;
+  ret.pipeline_ = this;
+  ret.buffers_ = PrepareBuffers(group_id, thread_id);
+  return ret;
+}
+
+void RenderPipeline::InputReady(
+    size_t group_id, size_t thread_id,
+    const std::vector<std::pair<ImageF*, Rect>>& buffers) {
+  JXL_DASSERT(group_id < group_completed_passes_.size());
+  group_completed_passes_[group_id]++;
+  for (size_t i = 0; i < buffers.size(); ++i) {
+    (void)i;
+    JXL_CHECK_PLANE_INITIALIZED(*buffers[i].first, buffers[i].second, i);
+  }
+
+  ProcessBuffers(group_id, thread_id);
+}
+
+Status RenderPipeline::PrepareForThreads(size_t num, bool use_group_ids) {
+  for (const auto& stage : stages_) {
+    JXL_RETURN_IF_ERROR(stage->PrepareForThreads(num));
+  }
+  PrepareForThreadsInternal(num, use_group_ids);
+  return true;
+}
+
+void RenderPipelineInput::Done() {
+  JXL_ASSERT(pipeline_);
+  pipeline_->InputReady(group_id_, thread_id_, buffers_);
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline.h
new file mode 100644
index 0000000000..bf3ad4975e
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline.h
@@ -0,0 +1,139 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_RENDER_PIPELINE_RENDER_PIPELINE_H_
+#define LIB_JXL_RENDER_PIPELINE_RENDER_PIPELINE_H_
+
+#include <stdint.h>
+
+#include "lib/jxl/image.h"
+#include "lib/jxl/render_pipeline/render_pipeline_stage.h"
+
+namespace jxl {
+
+// Interface to provide input to the rendering pipeline. When this object is
+// destroyed, all the data in the provided ImageF's Rects must have been
+// initialized.
+class RenderPipelineInput {
+ public:
+  RenderPipelineInput(const RenderPipelineInput&) = delete;
+  RenderPipelineInput(RenderPipelineInput&& other) noexcept {
+    *this = std::move(other);
+  }
+  RenderPipelineInput& operator=(RenderPipelineInput&& other) noexcept {
+    pipeline_ = other.pipeline_;
+    group_id_ = other.group_id_;
+    thread_id_ = other.thread_id_;
+    buffers_ = std::move(other.buffers_);
+    other.pipeline_ = nullptr;
+    return *this;
+  }
+
+  RenderPipelineInput() = default;
+  void Done();
+
+  const std::pair<ImageF*, Rect>& GetBuffer(size_t c) const {
+    JXL_ASSERT(c < buffers_.size());
+    return buffers_[c];
+  }
+
+ private:
+  RenderPipeline* pipeline_ = nullptr;
+  size_t group_id_;
+  size_t thread_id_;
+  std::vector<std::pair<ImageF*, Rect>> buffers_;
+  friend class RenderPipeline;
+};
+
+class RenderPipeline {
+ public:
+  class Builder {
+   public:
+    explicit Builder(size_t num_c) : num_c_(num_c) { JXL_ASSERT(num_c > 0); }
+
+    // Adds a stage to the pipeline. Must be called at least once; the last
+    // added stage cannot have kInOut channels.
+    void AddStage(std::unique_ptr<RenderPipelineStage> stage);
+
+    // Enables using the simple (i.e. non-memory-efficient) implementation of
+    // the pipeline.
+    void UseSimpleImplementation() { use_simple_implementation_ = true; }
+
+    // Finalizes setup of the pipeline. Shifts for all channels should be 0 at
+    // this point.
+    std::unique_ptr<RenderPipeline> Finalize(
+        FrameDimensions frame_dimensions) &&;
+
+   private:
+    std::vector<std::unique_ptr<RenderPipelineStage>> stages_;
+    size_t num_c_;
+    bool use_simple_implementation_ = false;
+  };
+
+  friend class Builder;
+
+  virtual ~RenderPipeline() = default;
+
+  Status IsInitialized() const {
+    for (const auto& stage : stages_) {
+      JXL_RETURN_IF_ERROR(stage->IsInitialized());
+    }
+    return true;
+  }
+
+  // Allocates storage to run with `num` threads. If `use_group_ids` is true,
+  // storage is allocated for each group, not each thread. The behaviour is
+  // undefined if calling this function multiple times with a different value
+  // for `use_group_ids`.
+  Status PrepareForThreads(size_t num, bool use_group_ids);
+
+  // Retrieves a buffer where input data should be stored by the callee. When
+  // input has been provided for all buffers, the pipeline will complete its
+  // processing. This method may be called multiple times concurrently from
+  // different threads, provided that a different `thread_id` is given.
+  RenderPipelineInput GetInputBuffers(size_t group_id, size_t thread_id);
+
+  size_t PassesWithAllInput() const {
+    return *std::min_element(group_completed_passes_.begin(),
+                             group_completed_passes_.end());
+  }
+
+  virtual void ClearDone(size_t i) {}
+
+ protected:
+  std::vector<std::unique_ptr<RenderPipelineStage>> stages_;
+  // Shifts for every channel at the input of each stage.
+  std::vector<std::vector<std::pair<size_t, size_t>>> channel_shifts_;
+
+  // Amount of (cumulative) padding required by each stage and channel, in
+  // either direction.
+  std::vector<std::vector<std::pair<size_t, size_t>>> padding_;
+
+  FrameDimensions frame_dimensions_;
+
+  std::vector<uint8_t> group_completed_passes_;
+
+  friend class RenderPipelineInput;
+
+ private:
+  void InputReady(size_t group_id, size_t thread_id,
+                  const std::vector<std::pair<ImageF*, Rect>>& buffers);
+
+  virtual std::vector<std::pair<ImageF*, Rect>> PrepareBuffers(
+      size_t group_id, size_t thread_id) = 0;
+
+  virtual void ProcessBuffers(size_t group_id, size_t thread_id) = 0;
+
+  // Note that this method may be called multiple times with different (or
+  // equal) `num`.
+  virtual void PrepareForThreadsInternal(size_t num, bool use_group_ids) = 0;
+
+  // Called once frame dimensions and stages are known.
+  virtual void Init() {}
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_RENDER_PIPELINE_RENDER_PIPELINE_H_
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline_stage.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline_stage.h
new file mode 100644
index 0000000000..d1a0074161
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline_stage.h
@@ -0,0 +1,171 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_RENDER_PIPELINE_RENDER_PIPELINE_STAGE_H_
+#define LIB_JXL_RENDER_PIPELINE_RENDER_PIPELINE_STAGE_H_
+
+#include <stdint.h>
+
+#include "lib/jxl/base/arch_macros.h"
+#include "lib/jxl/frame_header.h"
+
+namespace jxl {
+
+// The first pixel in the input to RenderPipelineStage will be located at
+// this position. Pixels before this position may be accessed as padding.
+// This should be at least the RoundUpTo(maximum padding / 2, maximum vector
+// size) times 2: this is realized when using Gaborish + EPF + upsampling +
+// chroma subsampling.
+#if JXL_ARCH_ARM
+constexpr size_t kRenderPipelineXOffset = 16;
+#else
+constexpr size_t kRenderPipelineXOffset = 32;
+#endif
+
+enum class RenderPipelineChannelMode {
+  // This channel is not modified by this stage.
+  kIgnored = 0,
+  // This channel is modified in-place.
+  kInPlace = 1,
+  // This channel is modified and written to a new buffer.
+  kInOut = 2,
+  // This channel is only read. These are the only stages that are assumed to
+  // have observable effects, i.e. calls to ProcessRow for other stages may be
+  // omitted if it can be shown they can't affect any kInput stage ProcessRow
+  // call that happens inside image boundaries.
+  kInput = 3,
+};
+
+class RenderPipeline;
+
+class RenderPipelineStage {
+ protected:
+  using Row = float*;
+  using ChannelRows = std::vector<Row>;
+
+ public:
+  using RowInfo = std::vector<ChannelRows>;
+  struct Settings {
+    // Amount of padding required in the various directions by all channels
+    // that have kInOut mode.
+    size_t border_x = 0;
+    size_t border_y = 0;
+
+    // Log2 of the number of columns/rows of output that this stage will produce
+    // for every input row for kInOut channels.
+    size_t shift_x = 0;
+    size_t shift_y = 0;
+
+    static Settings ShiftX(size_t shift, size_t border) {
+      Settings settings;
+      settings.border_x = border;
+      settings.shift_x = shift;
+      return settings;
+    }
+
+    static Settings ShiftY(size_t shift, size_t border) {
+      Settings settings;
+      settings.border_y = border;
+      settings.shift_y = shift;
+      return settings;
+    }
+
+    static Settings Symmetric(size_t shift, size_t border) {
+      Settings settings;
+      settings.border_x = settings.border_y = border;
+      settings.shift_x = settings.shift_y = shift;
+      return settings;
+    }
+
+    static Settings SymmetricBorderOnly(size_t border) {
+      return Symmetric(0, border);
+    }
+  };
+
+  virtual ~RenderPipelineStage() = default;
+
+  // Processes one row of input, producing the appropriate number of rows of
+  // output. Input/output rows can be obtained by calls to
+  // `GetInputRow`/`GetOutputRow`. `xsize+2*xextra` represents the total number
+  // of pixels to be processed in the input row, where the first pixel is at
+  // position `kRenderPipelineXOffset-xextra`. All pixels in the
+  // `[kRenderPipelineXOffset-xextra-border_x,
+  // kRenderPipelineXOffset+xsize+xextra+border_x)` range are initialized and
+  // accessible. `xpos` and `ypos` represent the position of the first
+  // (non-extra, i.e. in position kRenderPipelineXOffset) pixel in the center
+  // row of the input in the full image. `xpos` is a multiple of
+  // `GroupBorderAssigner::kPaddingXRound`. If `settings_.temp_buffer_size` is
+  // nonzero, `temp` will point to an HWY-aligned buffer of at least that number
+  // of floats; concurrent calls will have different buffers.
+  virtual void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+                          size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+                          size_t thread_id) const = 0;
+
+  // How each channel will be processed. Channels are numbered starting from
+  // color channels (always 3) and followed by all other channels.
+  virtual RenderPipelineChannelMode GetChannelMode(size_t c) const = 0;
+
+ protected:
+  explicit RenderPipelineStage(Settings settings) : settings_(settings) {}
+
+  virtual Status IsInitialized() const { return true; }
+
+  // Informs the stage about the total size of each channel. Few stages will
+  // actually need to use this information.
+  virtual void SetInputSizes(
+      const std::vector<std::pair<size_t, size_t>>& input_sizes) {}
+
+  virtual Status PrepareForThreads(size_t num_threads) { return true; }
+
+  // Returns a pointer to the input row of channel `c` with offset `y`.
+  // `y` must be in [-settings_.border_y, settings_.border_y]. `c` must be such
+  // that `GetChannelMode(c) != kIgnored`. The returned pointer points to the
+  // offset-ed row (i.e. kRenderPipelineXOffset has been applied).
+  float* GetInputRow(const RowInfo& input_rows, size_t c, int offset) const {
+    JXL_DASSERT(GetChannelMode(c) != RenderPipelineChannelMode::kIgnored);
+    JXL_DASSERT(-offset <= static_cast<int>(settings_.border_y));
+    JXL_DASSERT(offset <= static_cast<int>(settings_.border_y));
+    return input_rows[c][settings_.border_y + offset] + kRenderPipelineXOffset;
+  }
+  // Similar to `GetInputRow`, but can only be used if `GetChannelMode(c) ==
+  // kInOut`. Offset must be less than `1<<settings_.shift_y`.. The returned
+  // pointer points to the offset-ed row (i.e. kRenderPipelineXOffset has been
+  // applied).
+  float* GetOutputRow(const RowInfo& output_rows, size_t c,
+                      size_t offset) const {
+    JXL_DASSERT(GetChannelMode(c) == RenderPipelineChannelMode::kInOut);
+    JXL_DASSERT(offset <= 1ul << settings_.shift_y);
+    return output_rows[c][offset] + kRenderPipelineXOffset;
+  }
+
+  // Indicates whether, from this stage on, the pipeline will operate on an
+  // image- rather than frame-sized buffer. Only one stage in the pipeline
+  // should return true, and it should implement ProcessPaddingRow below too.
+  // It is assumed that, if there is a SwitchToImageDimensions() == true stage,
+  // all kInput stages appear after it.
+  virtual bool SwitchToImageDimensions() const { return false; }
+
+  // If SwitchToImageDimensions returns true, then this should set xsize and
+  // ysize to the image size, and frame_origin to the location of the frame
+  // within the image. Otherwise, this is not called at all.
+  virtual void GetImageDimensions(size_t* xsize, size_t* ysize,
+                                  FrameOrigin* frame_origin) const {}
+
+  // Produces the appropriate output data outside of the frame dimensions. xpos
+  // and ypos are now relative to the full image.
+  virtual void ProcessPaddingRow(const RowInfo& output_rows, size_t xsize,
+                                 size_t xpos, size_t ypos) const {}
+
+  virtual const char* GetName() const = 0;
+
+  Settings settings_;
+  friend class RenderPipeline;
+  friend class SimpleRenderPipeline;
+  friend class LowMemoryRenderPipeline;
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_RENDER_PIPELINE_RENDER_PIPELINE_STAGE_H_
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline_test.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline_test.cc
new file mode 100644
index 0000000000..f638807be9
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline_test.cc
@@ -0,0 +1,562 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/render_pipeline/render_pipeline.h"
+
+#include <stdint.h>
+#include <stdio.h>
+
+#include <algorithm>
+#include <utility>
+#include <vector>
+
+#include "lib/extras/codec.h"
+#include "lib/jxl/dec_frame.h"
+#include "lib/jxl/enc_cache.h"
+#include "lib/jxl/enc_color_management.h"
+#include "lib/jxl/enc_file.h"
+#include "lib/jxl/enc_params.h"
+#include "lib/jxl/fake_parallel_runner_testonly.h"
+#include "lib/jxl/icc_codec.h"
+#include "lib/jxl/image_test_utils.h"
+#include "lib/jxl/jpeg/enc_jpeg_data.h"
+#include "lib/jxl/render_pipeline/test_render_pipeline_stages.h"
+#include "lib/jxl/test_utils.h"
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+namespace {
+
+Status DecodeFile(const Span<const uint8_t> file, bool use_slow_pipeline,
+                  CodecInOut* io, ThreadPool* pool) {
+  Status ret = true;
+  {
+    BitReader reader(file);
+    BitReaderScopedCloser reader_closer(&reader, &ret);
+    JXL_RETURN_IF_ERROR(reader.ReadFixedBits<16>() == 0x0AFF);
+    JXL_RETURN_IF_ERROR(ReadSizeHeader(&reader, &io->metadata.size));
+    JXL_RETURN_IF_ERROR(ReadImageMetadata(&reader, &io->metadata.m));
+    io->metadata.transform_data.nonserialized_xyb_encoded =
+        io->metadata.m.xyb_encoded;
+    JXL_RETURN_IF_ERROR(Bundle::Read(&reader, &io->metadata.transform_data));
+    if (io->metadata.m.color_encoding.WantICC()) {
+      PaddedBytes icc;
+      JXL_RETURN_IF_ERROR(ReadICC(&reader, &icc));
+      JXL_RETURN_IF_ERROR(io->metadata.m.color_encoding.SetICC(std::move(icc)));
+    }
+    PassesDecoderState dec_state;
+    JXL_RETURN_IF_ERROR(
+        dec_state.output_encoding_info.SetFromMetadata(io->metadata));
+    JXL_RETURN_IF_ERROR(reader.JumpToByteBoundary());
+    io->frames.clear();
+    do {
+      io->frames.emplace_back(&io->metadata.m);
+      // Skip frames that are not displayed.
+      do {
+        size_t frame_start = reader.TotalBitsConsumed() / kBitsPerByte;
+        size_t size_left = file.size() - frame_start;
+        JXL_RETURN_IF_ERROR(
+            DecodeFrame(&dec_state, pool, file.data() + frame_start, size_left,
+                        &io->frames.back(), io->metadata, use_slow_pipeline));
+        reader.SkipBits(io->frames.back().decoded_bytes() * kBitsPerByte);
+      } while (dec_state.shared->frame_header.frame_type !=
+                   FrameType::kRegularFrame &&
+               dec_state.shared->frame_header.frame_type !=
+                   FrameType::kSkipProgressive);
+    } while (!dec_state.shared->frame_header.is_last);
+
+    if (io->frames.empty()) return JXL_FAILURE("Not enough data.");
+
+    if (reader.TotalBitsConsumed() != file.size() * kBitsPerByte) {
+      return JXL_FAILURE("Reader position not at EOF.");
+    }
+    if (!reader.AllReadsWithinBounds()) {
+      return JXL_FAILURE("Reader out of bounds read.");
+    }
+    io->CheckMetadata();
+    // reader is closed here.
+  }
+  return ret;
+}
+
+TEST(RenderPipelineTest, Build) {
+  RenderPipeline::Builder builder(/*num_c=*/1);
+  builder.AddStage(jxl::make_unique<UpsampleXSlowStage>());
+  builder.AddStage(jxl::make_unique<UpsampleYSlowStage>());
+  builder.AddStage(jxl::make_unique<Check0FinalStage>());
+  builder.UseSimpleImplementation();
+  FrameDimensions frame_dimensions;
+  frame_dimensions.Set(/*xsize=*/1024, /*ysize=*/1024, /*group_size_shift=*/0,
+                       /*max_hshift=*/0, /*max_vshift=*/0,
+                       /*modular_mode=*/false, /*upsampling=*/1);
+  std::move(builder).Finalize(frame_dimensions);
+}
+
+TEST(RenderPipelineTest, CallAllGroups) {
+  RenderPipeline::Builder builder(/*num_c=*/1);
+  builder.AddStage(jxl::make_unique<UpsampleXSlowStage>());
+  builder.AddStage(jxl::make_unique<UpsampleYSlowStage>());
+  builder.AddStage(jxl::make_unique<Check0FinalStage>());
+  builder.UseSimpleImplementation();
+  FrameDimensions frame_dimensions;
+  frame_dimensions.Set(/*xsize=*/1024, /*ysize=*/1024, /*group_size_shift=*/0,
+                       /*max_hshift=*/0, /*max_vshift=*/0,
+                       /*modular_mode=*/false, /*upsampling=*/1);
+  auto pipeline = std::move(builder).Finalize(frame_dimensions);
+  ASSERT_TRUE(pipeline->PrepareForThreads(1, /*use_group_ids=*/false));
+
+  for (size_t i = 0; i < frame_dimensions.num_groups; i++) {
+    auto input_buffers = pipeline->GetInputBuffers(i, 0);
+    FillPlane(0.0f, input_buffers.GetBuffer(0).first,
+              input_buffers.GetBuffer(0).second);
+    input_buffers.Done();
+  }
+
+  EXPECT_EQ(pipeline->PassesWithAllInput(), 1);
+}
+
+TEST(RenderPipelineTest, BuildFast) {
+  RenderPipeline::Builder builder(/*num_c=*/1);
+  builder.AddStage(jxl::make_unique<UpsampleXSlowStage>());
+  builder.AddStage(jxl::make_unique<UpsampleYSlowStage>());
+  builder.AddStage(jxl::make_unique<Check0FinalStage>());
+  FrameDimensions frame_dimensions;
+  frame_dimensions.Set(/*xsize=*/1024, /*ysize=*/1024, /*group_size_shift=*/0,
+                       /*max_hshift=*/0, /*max_vshift=*/0,
+                       /*modular_mode=*/false, /*upsampling=*/1);
+  std::move(builder).Finalize(frame_dimensions);
+}
+
+TEST(RenderPipelineTest, CallAllGroupsFast) {
+  RenderPipeline::Builder builder(/*num_c=*/1);
+  builder.AddStage(jxl::make_unique<UpsampleXSlowStage>());
+  builder.AddStage(jxl::make_unique<UpsampleYSlowStage>());
+  builder.AddStage(jxl::make_unique<Check0FinalStage>());
+  builder.UseSimpleImplementation();
+  FrameDimensions frame_dimensions;
+  frame_dimensions.Set(/*xsize=*/1024, /*ysize=*/1024, /*group_size_shift=*/0,
+                       /*max_hshift=*/0, /*max_vshift=*/0,
+                       /*modular_mode=*/false, /*upsampling=*/1);
+  auto pipeline = std::move(builder).Finalize(frame_dimensions);
+  ASSERT_TRUE(pipeline->PrepareForThreads(1, /*use_group_ids=*/false));
+
+  for (size_t i = 0; i < frame_dimensions.num_groups; i++) {
+    auto input_buffers = pipeline->GetInputBuffers(i, 0);
+    FillPlane(0.0f, input_buffers.GetBuffer(0).first,
+              input_buffers.GetBuffer(0).second);
+    input_buffers.Done();
+  }
+
+  EXPECT_EQ(pipeline->PassesWithAllInput(), 1);
+}
+
+struct RenderPipelineTestInputSettings {
+  // Input image.
+  std::string input_path;
+  size_t xsize, ysize;
+  bool jpeg_transcode = false;
+  // Encoding settings.
+  CompressParams cparams;
+  // Short name for the encoder settings.
+  std::string cparams_descr;
+
+  bool add_spot_color = false;
+
+  Splines splines;
+};
+
+class RenderPipelineTestParam
+    : public ::testing::TestWithParam<RenderPipelineTestInputSettings> {};
+
+TEST_P(RenderPipelineTestParam, PipelineTest) {
+  RenderPipelineTestInputSettings config = GetParam();
+
+  // Use a parallel runner that randomly shuffles tasks to detect possible
+  // border handling bugs.
+  FakeParallelRunner fake_pool(/*order_seed=*/123, /*num_threads=*/8);
+  ThreadPool pool(&JxlFakeParallelRunner, &fake_pool);
+  const PaddedBytes orig = jxl::test::ReadTestData(config.input_path);
+
+  CodecInOut io;
+  if (config.jpeg_transcode) {
+    ASSERT_TRUE(jpeg::DecodeImageJPG(Span<const uint8_t>(orig), &io));
+  } else {
+    ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io, &pool));
+  }
+  io.ShrinkTo(config.xsize, config.ysize);
+
+  if (config.add_spot_color) {
+    jxl::ImageF spot(config.xsize, config.ysize);
+    jxl::ZeroFillImage(&spot);
+
+    for (size_t y = 0; y < config.ysize; y++) {
+      float* JXL_RESTRICT row = spot.Row(y);
+      for (size_t x = 0; x < config.xsize; x++) {
+        row[x] = ((x ^ y) & 255) * (1.f / 255.f);
+      }
+    }
+    ExtraChannelInfo info;
+    info.bit_depth.bits_per_sample = 8;
+    info.dim_shift = 0;
+    info.type = jxl::ExtraChannel::kSpotColor;
+    info.spot_color[0] = 0.5f;
+    info.spot_color[1] = 0.2f;
+    info.spot_color[2] = 1.f;
+    info.spot_color[3] = 0.5f;
+
+    io.metadata.m.extra_channel_info.push_back(info);
+    std::vector<jxl::ImageF> ec;
+    ec.push_back(std::move(spot));
+    io.frames[0].SetExtraChannels(std::move(ec));
+  }
+
+  PaddedBytes compressed;
+
+  PassesEncoderState enc_state;
+  enc_state.shared.image_features.splines = config.splines;
+  ASSERT_TRUE(EncodeFile(config.cparams, &io, &enc_state, &compressed,
+                         GetJxlCms(), /*aux_out=*/nullptr, &pool));
+
+
+  CodecInOut io_default;
+  ASSERT_TRUE(DecodeFile(Span<const uint8_t>(compressed),
+                         /*use_slow_pipeline=*/false, &io_default, &pool));
+  CodecInOut io_slow_pipeline;
+  ASSERT_TRUE(DecodeFile(Span<const uint8_t>(compressed),
+                         /*use_slow_pipeline=*/true, &io_slow_pipeline, &pool));
+
+  ASSERT_EQ(io_default.frames.size(), io_slow_pipeline.frames.size());
+  for (size_t i = 0; i < io_default.frames.size(); i++) {
+#if JXL_HIGH_PRECISION
+    constexpr float kMaxError = 1e-5;
+#else
+    constexpr float kMaxError = 1e-4;
+#endif
+    Image3F def = std::move(*io_default.frames[i].color());
+    Image3F pip = std::move(*io_slow_pipeline.frames[i].color());
+    JXL_ASSERT_OK(VerifyRelativeError(pip, def, kMaxError, kMaxError, _));
+    for (size_t ec = 0; ec < io_default.frames[i].extra_channels().size();
+         ec++) {
+      JXL_ASSERT_OK(VerifyRelativeError(
+          io_slow_pipeline.frames[i].extra_channels()[ec],
+          io_default.frames[i].extra_channels()[ec], kMaxError, kMaxError, _));
+    }
+  }
+}
+
+Splines CreateTestSplines() {
+  const ColorCorrelationMap cmap;
+  std::vector<Spline::Point> control_points{{9, 54},  {118, 159}, {97, 3},
+                                            {10, 40}, {150, 25},  {120, 300}};
+  const Spline spline{
+      control_points,
+      /*color_dct=*/
+      {{0.03125f, 0.00625f, 0.003125f}, {1.f, 0.321875f}, {1.f, 0.24375f}},
+      /*sigma_dct=*/{0.3125f, 0.f, 0.f, 0.0625f}};
+  std::vector<Spline> spline_data = {spline};
+  std::vector<QuantizedSpline> quantized_splines;
+  std::vector<Spline::Point> starting_points;
+  for (const Spline& spline : spline_data) {
+    quantized_splines.emplace_back(spline, /*quantization_adjustment=*/0,
+                                   cmap.YtoXRatio(0), cmap.YtoBRatio(0));
+    starting_points.push_back(spline.control_points.front());
+  }
+  return Splines(/*quantization_adjustment=*/0, std::move(quantized_splines),
+                 std::move(starting_points));
+}
+
+std::vector<RenderPipelineTestInputSettings> GeneratePipelineTests() {
+  std::vector<RenderPipelineTestInputSettings> all_tests;
+
+  std::pair<size_t, size_t> sizes[] = {
+      {3, 8}, {128, 128}, {256, 256}, {258, 258}, {533, 401}, {777, 777},
+  };
+
+  for (auto size : sizes) {
+    RenderPipelineTestInputSettings settings;
+    settings.input_path = "jxl/flower/flower.png";
+    settings.xsize = size.first;
+    settings.ysize = size.second;
+
+    // Base settings.
+    settings.cparams.butteraugli_distance = 1.0;
+    settings.cparams.patches = Override::kOff;
+    settings.cparams.dots = Override::kOff;
+    settings.cparams.gaborish = Override::kOff;
+    settings.cparams.epf = 0;
+    settings.cparams.color_transform = ColorTransform::kXYB;
+
+    {
+      auto s = settings;
+      s.cparams_descr = "NoGabNoEpfNoPatches";
+      all_tests.push_back(s);
+    }
+
+    {
+      auto s = settings;
+      s.cparams.color_transform = ColorTransform::kNone;
+      s.cparams_descr = "NoGabNoEpfNoPatchesNoXYB";
+      all_tests.push_back(s);
+    }
+
+    {
+      auto s = settings;
+      s.cparams.gaborish = Override::kOn;
+      s.cparams_descr = "GabNoEpfNoPatches";
+      all_tests.push_back(s);
+    }
+
+    {
+      auto s = settings;
+      s.cparams.epf = 1;
+      s.cparams_descr = "NoGabEpf1NoPatches";
+      all_tests.push_back(s);
+    }
+
+    {
+      auto s = settings;
+      s.cparams.epf = 2;
+      s.cparams_descr = "NoGabEpf2NoPatches";
+      all_tests.push_back(s);
+    }
+
+    {
+      auto s = settings;
+      s.cparams.epf = 3;
+      s.cparams_descr = "NoGabEpf3NoPatches";
+      all_tests.push_back(s);
+    }
+
+    {
+      auto s = settings;
+      s.cparams.gaborish = Override::kOn;
+      s.cparams.epf = 3;
+      s.cparams_descr = "GabEpf3NoPatches";
+      all_tests.push_back(s);
+    }
+
+    {
+      auto s = settings;
+      s.cparams_descr = "Splines";
+      s.splines = CreateTestSplines();
+      all_tests.push_back(s);
+    }
+
+    for (size_t ups : {2, 4, 8}) {
+      {
+        auto s = settings;
+        s.cparams.resampling = ups;
+        s.cparams_descr = "Ups" + std::to_string(ups);
+        all_tests.push_back(s);
+      }
+      {
+        auto s = settings;
+        s.cparams.resampling = ups;
+        s.cparams.epf = 1;
+        s.cparams_descr = "Ups" + std::to_string(ups) + "EPF1";
+        all_tests.push_back(s);
+      }
+      {
+        auto s = settings;
+        s.cparams.resampling = ups;
+        s.cparams.gaborish = Override::kOn;
+        s.cparams.epf = 1;
+        s.cparams_descr = "Ups" + std::to_string(ups) + "GabEPF1";
+        all_tests.push_back(s);
+      }
+    }
+
+    {
+      auto s = settings;
+      s.cparams_descr = "Noise";
+      s.cparams.photon_noise_iso = 3200;
+      all_tests.push_back(s);
+    }
+
+    {
+      auto s = settings;
+      s.cparams_descr = "NoiseUps";
+      s.cparams.photon_noise_iso = 3200;
+      s.cparams.resampling = 2;
+      all_tests.push_back(s);
+    }
+
+    {
+      auto s = settings;
+      s.cparams_descr = "ModularLossless";
+      s.cparams.modular_mode = true;
+      s.cparams.butteraugli_distance = 0;
+      all_tests.push_back(s);
+    }
+
+    {
+      auto s = settings;
+      s.cparams_descr = "ProgressiveDC";
+      s.cparams.progressive_dc = 1;
+      all_tests.push_back(s);
+    }
+
+    {
+      auto s = settings;
+      s.cparams_descr = "ModularLossy";
+      s.cparams.modular_mode = true;
+      s.cparams.butteraugli_distance = 1.f;
+      all_tests.push_back(s);
+    }
+
+    {
+      auto s = settings;
+      s.input_path = "jxl/flower/flower_alpha.png";
+      s.cparams_descr = "AlphaVarDCT";
+      all_tests.push_back(s);
+    }
+
+    {
+      auto s = settings;
+      s.input_path = "jxl/flower/flower_alpha.png";
+      s.cparams_descr = "AlphaVarDCTUpsamplingEPF";
+      s.cparams.epf = 1;
+      s.cparams.ec_resampling = 2;
+      all_tests.push_back(s);
+    }
+
+    {
+      auto s = settings;
+      s.cparams.modular_mode = true;
+      s.cparams.butteraugli_distance = 0;
+      s.input_path = "jxl/flower/flower_alpha.png";
+      s.cparams_descr = "AlphaLossless";
+      all_tests.push_back(s);
+    }
+
+    {
+      auto s = settings;
+      s.input_path = "jxl/flower/flower_alpha.png";
+      s.cparams_descr = "AlphaDownsample";
+      s.cparams.ec_resampling = 2;
+      all_tests.push_back(s);
+    }
+
+    {
+      auto s = settings;
+      s.cparams_descr = "SpotColor";
+      s.add_spot_color = true;
+      all_tests.push_back(s);
+    }
+  }
+
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
+  for (const char* input : {"jxl/flower/flower.png.im_q85_444.jpg",
+                            "jxl/flower/flower.png.im_q85_420.jpg",
+                            "jxl/flower/flower.png.im_q85_422.jpg",
+                            "jxl/flower/flower.png.im_q85_440.jpg"}) {
+    RenderPipelineTestInputSettings settings;
+    settings.input_path = input;
+    settings.jpeg_transcode = true;
+    settings.xsize = 2268;
+    settings.ysize = 1512;
+    settings.cparams_descr = "Default";
+    all_tests.push_back(settings);
+  }
+
+#endif
+
+  {
+    RenderPipelineTestInputSettings settings;
+    settings.input_path = "jxl/grayscale_patches.png";
+    settings.xsize = 1011;
+    settings.ysize = 277;
+    settings.cparams_descr = "Patches";
+    all_tests.push_back(settings);
+  }
+
+  {
+    RenderPipelineTestInputSettings settings;
+    settings.input_path = "jxl/grayscale_patches.png";
+    settings.xsize = 1011;
+    settings.ysize = 277;
+    settings.cparams.photon_noise_iso = 1000;
+    settings.cparams_descr = "PatchesAndNoise";
+    all_tests.push_back(settings);
+  }
+
+  {
+    RenderPipelineTestInputSettings settings;
+    settings.input_path = "jxl/grayscale_patches.png";
+    settings.xsize = 1011;
+    settings.ysize = 277;
+    settings.cparams.resampling = 2;
+    settings.cparams_descr = "PatchesAndUps2";
+    all_tests.push_back(settings);
+  }
+
+  return all_tests;
+}
+
+std::ostream& operator<<(std::ostream& os,
+                         const RenderPipelineTestInputSettings& c) {
+  std::string filename;
+  size_t pos = c.input_path.find_last_of('/');
+  if (pos == std::string::npos) {
+    filename = c.input_path;
+  } else {
+    filename = c.input_path.substr(pos + 1);
+  }
+  std::replace_if(
+      filename.begin(), filename.end(), [](char c) { return !isalnum(c); },
+      '_');
+  os << filename << "_" << (c.jpeg_transcode ? "JPEG_" : "") << c.xsize << "x"
+     << c.ysize << "_" << c.cparams_descr;
+  return os;
+}
+
+std::string PipelineTestDescription(
+    const testing::TestParamInfo<RenderPipelineTestParam::ParamType>& info) {
+  std::stringstream name;
+  name << info.param;
+  return name.str();
+}
+
+JXL_GTEST_INSTANTIATE_TEST_SUITE_P(RenderPipelineTest, RenderPipelineTestParam,
+                                   testing::ValuesIn(GeneratePipelineTests()),
+                                   PipelineTestDescription);
+
+TEST(RenderPipelineDecodingTest, Animation) {
+  FakeParallelRunner fake_pool(/*order_seed=*/123, /*num_threads=*/8);
+  ThreadPool pool(&JxlFakeParallelRunner, &fake_pool);
+
+  PaddedBytes compressed =
+      jxl::test::ReadTestData("jxl/blending/cropped_traffic_light.jxl");
+
+  CodecInOut io_default;
+  ASSERT_TRUE(DecodeFile(Span<const uint8_t>(compressed),
+                         /*use_slow_pipeline=*/false, &io_default, &pool));
+  CodecInOut io_slow_pipeline;
+  ASSERT_TRUE(DecodeFile(Span<const uint8_t>(compressed),
+                         /*use_slow_pipeline=*/true, &io_slow_pipeline, &pool));
+
+  ASSERT_EQ(io_default.frames.size(), io_slow_pipeline.frames.size());
+  for (size_t i = 0; i < io_default.frames.size(); i++) {
+#if JXL_HIGH_PRECISION
+    constexpr float kMaxError = 1e-5;
+#else
+    constexpr float kMaxError = 1e-4;
+#endif
+
+    Image3F fast_pipeline = std::move(*io_default.frames[i].color());
+    Image3F slow_pipeline = std::move(*io_slow_pipeline.frames[i].color());
+    JXL_ASSERT_OK(VerifyRelativeError(slow_pipeline, fast_pipeline, kMaxError,
+                                      kMaxError, _))
+    for (size_t ec = 0; ec < io_default.frames[i].extra_channels().size();
+         ec++) {
+      JXL_ASSERT_OK(VerifyRelativeError(
+          io_slow_pipeline.frames[i].extra_channels()[ec],
+          io_default.frames[i].extra_channels()[ec], kMaxError, kMaxError, _));
+    }
+  }
+}
+
+}  // namespace
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/simple_render_pipeline.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/simple_render_pipeline.cc
new file mode 100644
index 0000000000..4495288860
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/simple_render_pipeline.cc
@@ -0,0 +1,266 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/render_pipeline/simple_render_pipeline.h"
+
+#include <hwy/base.h>
+
+#include "lib/jxl/image_ops.h"
+#include "lib/jxl/render_pipeline/render_pipeline_stage.h"
+#include "lib/jxl/sanitizers.h"
+
+namespace jxl {
+
+void SimpleRenderPipeline::PrepareForThreadsInternal(size_t num,
+                                                     bool use_group_ids) {
+  if (!channel_data_.empty()) {
+    return;
+  }
+  auto ch_size = [](size_t frame_size, size_t shift) {
+    return DivCeil(frame_size, 1 << shift) + kRenderPipelineXOffset * 2;
+  };
+  for (size_t c = 0; c < channel_shifts_[0].size(); c++) {
+    channel_data_.push_back(ImageF(
+        ch_size(frame_dimensions_.xsize_upsampled, channel_shifts_[0][c].first),
+        ch_size(frame_dimensions_.ysize_upsampled,
+                channel_shifts_[0][c].second)));
+    msan::PoisonImage(channel_data_.back());
+  }
+}
+
+Rect SimpleRenderPipeline::MakeChannelRect(size_t group_id, size_t channel) {
+  size_t base_color_shift =
+      CeilLog2Nonzero(frame_dimensions_.xsize_upsampled_padded /
+                      frame_dimensions_.xsize_padded);
+
+  const size_t gx = group_id % frame_dimensions_.xsize_groups;
+  const size_t gy = group_id / frame_dimensions_.xsize_groups;
+  size_t xgroupdim = (frame_dimensions_.group_dim << base_color_shift) >>
+                     channel_shifts_[0][channel].first;
+  size_t ygroupdim = (frame_dimensions_.group_dim << base_color_shift) >>
+                     channel_shifts_[0][channel].second;
+  return Rect(
+      kRenderPipelineXOffset + gx * xgroupdim,
+      kRenderPipelineXOffset + gy * ygroupdim, xgroupdim, ygroupdim,
+      kRenderPipelineXOffset + DivCeil(frame_dimensions_.xsize_upsampled,
+                                       1 << channel_shifts_[0][channel].first),
+      kRenderPipelineXOffset +
+          DivCeil(frame_dimensions_.ysize_upsampled,
+                  1 << channel_shifts_[0][channel].second));
+}
+
+std::vector<std::pair<ImageF*, Rect>> SimpleRenderPipeline::PrepareBuffers(
+    size_t group_id, size_t thread_id) {
+  std::vector<std::pair<ImageF*, Rect>> ret;
+  for (size_t c = 0; c < channel_data_.size(); c++) {
+    ret.emplace_back(&channel_data_[c], MakeChannelRect(group_id, c));
+  }
+  return ret;
+}
+
+void SimpleRenderPipeline::ProcessBuffers(size_t group_id, size_t thread_id) {
+  for (size_t c = 0; c < channel_data_.size(); c++) {
+    Rect r = MakeChannelRect(group_id, c);
+    (void)r;
+    JXL_CHECK_PLANE_INITIALIZED(channel_data_[c], r, c);
+  }
+
+  if (PassesWithAllInput() <= processed_passes_) return;
+  processed_passes_++;
+
+  for (size_t stage_id = 0; stage_id < stages_.size(); stage_id++) {
+    const auto& stage = stages_[stage_id];
+    // Prepare buffers for kInOut channels.
+    std::vector<ImageF> new_channels(channel_data_.size());
+    std::vector<ImageF*> output_channels(channel_data_.size());
+
+    std::vector<std::pair<size_t, size_t>> input_sizes(channel_data_.size());
+    for (size_t c = 0; c < channel_data_.size(); c++) {
+      input_sizes[c] =
+          std::make_pair(channel_data_[c].xsize() - kRenderPipelineXOffset * 2,
+                         channel_data_[c].ysize() - kRenderPipelineXOffset * 2);
+    }
+
+    for (size_t c = 0; c < channel_data_.size(); c++) {
+      if (stage->GetChannelMode(c) != RenderPipelineChannelMode::kInOut) {
+        continue;
+      }
+      // Ensure that the newly allocated channels are large enough to avoid
+      // problems with padding.
+      new_channels[c] =
+          ImageF(frame_dimensions_.xsize_upsampled_padded +
+                     kRenderPipelineXOffset * 2 + hwy::kMaxVectorSize * 8,
+                 frame_dimensions_.ysize_upsampled_padded +
+                     kRenderPipelineXOffset * 2);
+      new_channels[c].ShrinkTo(
+          (input_sizes[c].first << stage->settings_.shift_x) +
+              kRenderPipelineXOffset * 2,
+          (input_sizes[c].second << stage->settings_.shift_y) +
+              kRenderPipelineXOffset * 2);
+      output_channels[c] = &new_channels[c];
+    }
+
+    auto get_row = [&](size_t c, int64_t y) {
+      return channel_data_[c].Row(kRenderPipelineXOffset + y) +
+             kRenderPipelineXOffset;
+    };
+
+    // Add mirrored pixes to all kInOut channels.
+    for (size_t c = 0; c < channel_data_.size(); c++) {
+      if (stage->GetChannelMode(c) != RenderPipelineChannelMode::kInOut) {
+        continue;
+      }
+      // Horizontal mirroring.
+      for (size_t y = 0; y < input_sizes[c].second; y++) {
+        float* row = get_row(c, y);
+        for (size_t ix = 0; ix < stage->settings_.border_x; ix++) {
+          *(row - ix - 1) = row[Mirror(-ssize_t(ix) - 1, input_sizes[c].first)];
+        }
+        for (size_t ix = 0; ix < stage->settings_.border_x; ix++) {
+          *(row + ix + input_sizes[c].first) =
+              row[Mirror(ix + input_sizes[c].first, input_sizes[c].first)];
+        }
+      }
+      // Vertical mirroring.
+      for (int y = 0; y < static_cast<int>(stage->settings_.border_y); y++) {
+        memcpy(get_row(c, -y - 1) - stage->settings_.border_x,
+               get_row(c, Mirror(-ssize_t(y) - 1, input_sizes[c].second)) -
+                   stage->settings_.border_x,
+               sizeof(float) *
+                   (input_sizes[c].first + 2 * stage->settings_.border_x));
+      }
+      for (int y = 0; y < static_cast<int>(stage->settings_.border_y); y++) {
+        memcpy(
+            get_row(c, input_sizes[c].second + y) - stage->settings_.border_x,
+            get_row(c,
+                    Mirror(input_sizes[c].second + y, input_sizes[c].second)) -
+                stage->settings_.border_x,
+            sizeof(float) *
+                (input_sizes[c].first + 2 * stage->settings_.border_x));
+      }
+    }
+
+    size_t ysize = 0;
+    size_t xsize = 0;
+    for (size_t c = 0; c < channel_data_.size(); c++) {
+      if (stage->GetChannelMode(c) == RenderPipelineChannelMode::kIgnored) {
+        continue;
+      }
+      ysize = std::max(input_sizes[c].second, ysize);
+      xsize = std::max(input_sizes[c].first, xsize);
+    }
+
+    JXL_ASSERT(ysize != 0);
+    JXL_ASSERT(xsize != 0);
+
+    RenderPipelineStage::RowInfo input_rows(channel_data_.size());
+    RenderPipelineStage::RowInfo output_rows(channel_data_.size());
+
+    // Run the pipeline.
+    {
+      stage->SetInputSizes(input_sizes);
+      int border_y = stage->settings_.border_y;
+      for (size_t y = 0; y < ysize; y++) {
+        // Prepare input rows.
+        for (size_t c = 0; c < channel_data_.size(); c++) {
+          if (stage->GetChannelMode(c) == RenderPipelineChannelMode::kIgnored) {
+            continue;
+          }
+          input_rows[c].resize(2 * border_y + 1);
+          for (int iy = -border_y; iy <= border_y; iy++) {
+            input_rows[c][iy + border_y] =
+                channel_data_[c].Row(y + kRenderPipelineXOffset + iy);
+          }
+        }
+        // Prepare output rows.
+        for (size_t c = 0; c < channel_data_.size(); c++) {
+          if (!output_channels[c]) continue;
+          output_rows[c].resize(1 << stage->settings_.shift_y);
+          for (size_t iy = 0; iy < output_rows[c].size(); iy++) {
+            output_rows[c][iy] = output_channels[c]->Row(
+                (y << stage->settings_.shift_y) + iy + kRenderPipelineXOffset);
+          }
+        }
+        stage->ProcessRow(input_rows, output_rows, /*xextra=*/0, xsize,
+                          /*xpos=*/0, y, thread_id);
+      }
+    }
+
+    // Move new channels to current channels.
+    for (size_t c = 0; c < channel_data_.size(); c++) {
+      if (stage->GetChannelMode(c) != RenderPipelineChannelMode::kInOut) {
+        continue;
+      }
+      channel_data_[c] = std::move(new_channels[c]);
+    }
+    for (size_t c = 0; c < channel_data_.size(); c++) {
+      size_t next_stage = std::min(stage_id + 1, channel_shifts_.size() - 1);
+      size_t xsize = DivCeil(frame_dimensions_.xsize_upsampled,
+                             1 << channel_shifts_[next_stage][c].first);
+      size_t ysize = DivCeil(frame_dimensions_.ysize_upsampled,
+                             1 << channel_shifts_[next_stage][c].second);
+      channel_data_[c].ShrinkTo(xsize + 2 * kRenderPipelineXOffset,
+                                ysize + 2 * kRenderPipelineXOffset);
+      JXL_CHECK_PLANE_INITIALIZED(
+          channel_data_[c],
+          Rect(kRenderPipelineXOffset, kRenderPipelineXOffset, xsize, ysize),
+          c);
+    }
+
+    if (stage->SwitchToImageDimensions()) {
+      size_t image_xsize, image_ysize;
+      FrameOrigin frame_origin;
+      stage->GetImageDimensions(&image_xsize, &image_ysize, &frame_origin);
+      frame_dimensions_.Set(image_xsize, image_ysize, 0, 0, 0, false, 1);
+      std::vector<ImageF> old_channels = std::move(channel_data_);
+      channel_data_.clear();
+      channel_data_.reserve(old_channels.size());
+      for (size_t c = 0; c < old_channels.size(); c++) {
+        channel_data_.emplace_back(2 * kRenderPipelineXOffset + image_xsize,
+                                   2 * kRenderPipelineXOffset + image_ysize);
+      }
+      for (size_t y = 0; y < image_ysize; ++y) {
+        for (size_t c = 0; c < channel_data_.size(); c++) {
+          output_rows[c].resize(1);
+          output_rows[c][0] = channel_data_[c].Row(kRenderPipelineXOffset + y);
+        }
+        // TODO(sboukortt): consider doing this only on the parts of the
+        // background that won't be occluded.
+        stage->ProcessPaddingRow(output_rows, image_xsize, 0, y);
+      }
+      ssize_t x0 = frame_origin.x0;
+      ssize_t y0 = frame_origin.y0;
+      size_t x0_fg = 0;
+      size_t y0_fg = 0;
+      if (x0 < 0) {
+        xsize += x0;
+        x0_fg -= x0;
+        x0 = 0;
+      }
+      if (x0 + xsize > image_xsize) {
+        xsize = image_xsize - x0;
+      }
+      if (y0 < 0) {
+        ysize += y0;
+        y0_fg -= x0;
+        y0 = 0;
+      }
+      if (y0 + ysize > image_ysize) {
+        ysize = image_ysize - y0;
+      }
+      const Rect rect_fg_relative_to_image =
+          Rect(x0, y0, xsize, ysize)
+              .Translate(kRenderPipelineXOffset, kRenderPipelineXOffset);
+      const Rect rect_fg =
+          Rect(x0_fg, y0_fg, xsize, ysize)
+              .Translate(kRenderPipelineXOffset, kRenderPipelineXOffset);
+      for (size_t c = 0; c < channel_data_.size(); c++) {
+        CopyImageTo(rect_fg, old_channels[c], rect_fg_relative_to_image,
+                    &channel_data_[c]);
+      }
+    }
+  }
+}
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/simple_render_pipeline.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/simple_render_pipeline.h
new file mode 100644
index 0000000000..10f4505912
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/simple_render_pipeline.h
@@ -0,0 +1,37 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_RENDER_PIPELINE_SIMPLE_RENDER_PIPELINE_H_
+#define LIB_JXL_RENDER_PIPELINE_SIMPLE_RENDER_PIPELINE_H_
+
+#include <stdint.h>
+
+#include "lib/jxl/render_pipeline/render_pipeline.h"
+
+namespace jxl {
+
+// A RenderPipeline that is "obviously correct"; it may use potentially large
+// amounts of memory and be slow. It is intended to be used mostly for testing
+// purposes.
+class SimpleRenderPipeline : public RenderPipeline {
+  std::vector<std::pair<ImageF*, Rect>> PrepareBuffers(
+      size_t group_id, size_t thread_id) override;
+
+  void ProcessBuffers(size_t group_id, size_t thread_id) override;
+
+  void PrepareForThreadsInternal(size_t num, bool use_group_ids) override;
+
+  // Full frame buffers. Both X and Y dimensions are padded by
+  // kRenderPipelineXOffset.
+  std::vector<ImageF> channel_data_;
+  size_t processed_passes_ = 0;
+
+ private:
+  Rect MakeChannelRect(size_t group_id, size_t channel);
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_RENDER_PIPELINE_SIMPLE_RENDER_PIPELINE_H_
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_blending.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_blending.cc
new file mode 100644
index 0000000000..b6668c5625
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_blending.cc
@@ -0,0 +1,247 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/render_pipeline/stage_blending.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_blending.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/blending.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+class BlendingStage : public RenderPipelineStage {
+ public:
+  explicit BlendingStage(const PassesDecoderState* dec_state,
+                         const ColorEncoding& frame_color_encoding)
+      : RenderPipelineStage(RenderPipelineStage::Settings()),
+        state_(*dec_state->shared) {
+    image_xsize_ = state_.frame_header.nonserialized_metadata->xsize();
+    image_ysize_ = state_.frame_header.nonserialized_metadata->ysize();
+    extra_channel_info_ =
+        &state_.frame_header.nonserialized_metadata->m.extra_channel_info;
+    info_ = state_.frame_header.blending_info;
+    const std::vector<BlendingInfo>& ec_info =
+        state_.frame_header.extra_channel_blending_info;
+    const ImageBundle& bg = state_.reference_frames[info_.source].frame;
+    bg_ = &bg;
+    if (bg.xsize() == 0 || bg.ysize() == 0) {
+      zeroes_.resize(image_xsize_, 0.f);
+    } else if (state_.reference_frames[info_.source].ib_is_in_xyb) {
+      initialized_ = JXL_FAILURE(
+          "Trying to blend XYB reference frame %i and non-XYB frame",
+          info_.source);
+      return;
+    } else if (std::any_of(ec_info.begin(), ec_info.end(),
+                           [this](const BlendingInfo& info) {
+                             const ImageBundle& bg =
+                                 state_.reference_frames[info.source].frame;
+                             return bg.xsize() == 0 || bg.ysize() == 0;
+                           })) {
+      zeroes_.resize(image_xsize_, 0.f);
+    }
+
+    auto verify_bg_size = [&](const ImageBundle& bg) -> Status {
+      if (bg.xsize() != 0 && bg.ysize() != 0 &&
+          (bg.xsize() < image_xsize_ || bg.ysize() < image_ysize_ ||
+           bg.origin.x0 != 0 || bg.origin.y0 != 0)) {
+        return JXL_FAILURE("Trying to use a %" PRIuS "x%" PRIuS
+                           " crop as a background",
+                           bg.xsize(), bg.ysize());
+      }
+      return true;
+    };
+
+    Status ok = verify_bg_size(bg);
+    for (const auto& info : ec_info) {
+      const ImageBundle& bg = state_.reference_frames[info.source].frame;
+      if (!!ok) ok = verify_bg_size(bg);
+    }
+    if (!ok) {
+      initialized_ = ok;
+      return;
+    }
+
+    if (state_.metadata->m.xyb_encoded) {
+      if (!dec_state->output_encoding_info.color_encoding_is_original) {
+        initialized_ = JXL_FAILURE("Blending in unsupported color space");
+        return;
+      }
+    }
+
+    blending_info_.resize(ec_info.size() + 1);
+    auto make_blending = [&](const BlendingInfo& info, PatchBlending* pb) {
+      pb->alpha_channel = info.alpha_channel;
+      pb->clamp = info.clamp;
+      switch (info.mode) {
+        case BlendMode::kReplace: {
+          pb->mode = PatchBlendMode::kReplace;
+          break;
+        }
+        case BlendMode::kAdd: {
+          pb->mode = PatchBlendMode::kAdd;
+          break;
+        }
+        case BlendMode::kMul: {
+          pb->mode = PatchBlendMode::kMul;
+          break;
+        }
+        case BlendMode::kBlend: {
+          pb->mode = PatchBlendMode::kBlendAbove;
+          break;
+        }
+        case BlendMode::kAlphaWeightedAdd: {
+          pb->mode = PatchBlendMode::kAlphaWeightedAddAbove;
+          break;
+        }
+        default: {
+          JXL_ABORT("Invalid blend mode");  // should have failed to decode
+        }
+      }
+    };
+    make_blending(info_, &blending_info_[0]);
+    for (size_t i = 0; i < ec_info.size(); i++) {
+      make_blending(ec_info[i], &blending_info_[1 + i]);
+    }
+  }
+
+  Status IsInitialized() const override { return initialized_; }
+
+  void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+                  size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+                  size_t thread_id) const final {
+    PROFILER_ZONE("Blend");
+    JXL_ASSERT(initialized_);
+    const FrameOrigin& frame_origin = state_.frame_header.frame_origin;
+    ssize_t bg_xpos = frame_origin.x0 + static_cast<ssize_t>(xpos);
+    ssize_t bg_ypos = frame_origin.y0 + static_cast<ssize_t>(ypos);
+    int offset = 0;
+    if (bg_xpos + static_cast<ssize_t>(xsize) <= 0 ||
+        frame_origin.x0 >= static_cast<ssize_t>(image_xsize_) || bg_ypos < 0 ||
+        bg_ypos >= static_cast<ssize_t>(image_ysize_)) {
+      return;
+    }
+    if (bg_xpos < 0) {
+      offset -= bg_xpos;
+      xsize += bg_xpos;
+      bg_xpos = 0;
+    }
+    if (bg_xpos + xsize > image_xsize_) {
+      xsize =
+          std::max<ssize_t>(0, static_cast<ssize_t>(image_xsize_) - bg_xpos);
+    }
+    std::vector<const float*> bg_row_ptrs_(input_rows.size());
+    std::vector<float*> fg_row_ptrs_(input_rows.size());
+    size_t num_c = std::min(input_rows.size(), extra_channel_info_->size() + 3);
+    for (size_t c = 0; c < num_c; ++c) {
+      fg_row_ptrs_[c] = GetInputRow(input_rows, c, 0) + offset;
+      if (c < 3) {
+        bg_row_ptrs_[c] = bg_->xsize() != 0 && bg_->ysize() != 0
+                              ? bg_->color().ConstPlaneRow(c, bg_ypos) + bg_xpos
+                              : zeroes_.data();
+      } else {
+        const ImageBundle& ec_bg =
+            state_
+                .reference_frames[state_.frame_header
+                                      .extra_channel_blending_info[c - 3]
+                                      .source]
+                .frame;
+        bg_row_ptrs_[c] =
+            ec_bg.xsize() != 0 && ec_bg.ysize() != 0
+                ? ec_bg.extra_channels()[c - 3].ConstRow(bg_ypos) + bg_xpos
+                : zeroes_.data();
+      }
+    }
+    PerformBlending(bg_row_ptrs_.data(), fg_row_ptrs_.data(),
+                    fg_row_ptrs_.data(), 0, xsize, blending_info_[0],
+                    blending_info_.data() + 1, *extra_channel_info_);
+  }
+
+  RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+    return RenderPipelineChannelMode::kInPlace;
+  }
+
+  bool SwitchToImageDimensions() const override { return true; }
+
+  void GetImageDimensions(size_t* xsize, size_t* ysize,
+                          FrameOrigin* frame_origin) const override {
+    *xsize = image_xsize_;
+    *ysize = image_ysize_;
+    *frame_origin = state_.frame_header.frame_origin;
+  }
+
+  void ProcessPaddingRow(const RowInfo& output_rows, size_t xsize, size_t xpos,
+                         size_t ypos) const override {
+    if (bg_->xsize() == 0 || bg_->ysize() == 0) {
+      for (size_t c = 0; c < 3; ++c) {
+        memset(GetInputRow(output_rows, c, 0), 0, xsize * sizeof(float));
+      }
+    } else {
+      for (size_t c = 0; c < 3; ++c) {
+        memcpy(GetInputRow(output_rows, c, 0),
+               bg_->color().ConstPlaneRow(c, ypos) + xpos,
+               xsize * sizeof(float));
+      }
+    }
+    for (size_t ec = 0; ec < extra_channel_info_->size(); ++ec) {
+      const ImageBundle& ec_bg =
+          state_
+              .reference_frames
+                  [state_.frame_header.extra_channel_blending_info[ec].source]
+              .frame;
+      if (ec_bg.xsize() == 0 || ec_bg.ysize() == 0) {
+        memset(GetInputRow(output_rows, 3 + ec, 0), 0, xsize * sizeof(float));
+      } else {
+        memcpy(GetInputRow(output_rows, 3 + ec, 0),
+               ec_bg.extra_channels()[ec].ConstRow(ypos) + xpos,
+               xsize * sizeof(float));
+      }
+    }
+  }
+
+  const char* GetName() const override { return "Blending"; }
+
+ private:
+  const PassesSharedState& state_;
+  BlendingInfo info_;
+  const ImageBundle* bg_;
+  Status initialized_ = true;
+  size_t image_xsize_;
+  size_t image_ysize_;
+  std::vector<PatchBlending> blending_info_;
+  const std::vector<ExtraChannelInfo>* extra_channel_info_;
+  std::vector<float> zeroes_;
+};
+
+std::unique_ptr<RenderPipelineStage> GetBlendingStage(
+    const PassesDecoderState* dec_state,
+    const ColorEncoding& frame_color_encoding) {
+  return jxl::make_unique<BlendingStage>(dec_state, frame_color_encoding);
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+HWY_EXPORT(GetBlendingStage);
+
+std::unique_ptr<RenderPipelineStage> GetBlendingStage(
+    const PassesDecoderState* dec_state,
+    const ColorEncoding& frame_color_encoding) {
+  return HWY_DYNAMIC_DISPATCH(GetBlendingStage)(dec_state,
+                                                frame_color_encoding);
+}
+
+}  // namespace jxl
+#endif
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_blending.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_blending.h
new file mode 100644
index 0000000000..c8db7490cd
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_blending.h
@@ -0,0 +1,24 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_BLENDING_H_
+#define LIB_JXL_RENDER_PIPELINE_STAGE_BLENDING_H_
+
+#include <utility>
+
+#include "lib/jxl/dec_cache.h"
+#include "lib/jxl/render_pipeline/render_pipeline_stage.h"
+#include "lib/jxl/splines.h"
+
+namespace jxl {
+
+// Applies blending if applicable.
+std::unique_ptr<RenderPipelineStage> GetBlendingStage(
+    const PassesDecoderState* dec_state,
+    const ColorEncoding& frame_color_encoding);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_RENDER_PIPELINE_STAGE_BLENDING_H_
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_chroma_upsampling.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_chroma_upsampling.cc
new file mode 100644
index 0000000000..9b73ee91f1
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_chroma_upsampling.cc
@@ -0,0 +1,129 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/render_pipeline/stage_chroma_upsampling.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_chroma_upsampling.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/simd_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Mul;
+using hwy::HWY_NAMESPACE::MulAdd;
+
+class HorizontalChromaUpsamplingStage : public RenderPipelineStage {
+ public:
+  explicit HorizontalChromaUpsamplingStage(size_t channel)
+      : RenderPipelineStage(RenderPipelineStage::Settings::ShiftX(
+            /*shift=*/1, /*border=*/1)),
+        c_(channel) {}
+
+  void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+                  size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+                  size_t thread_id) const final {
+    PROFILER_ZONE("HorizontalChromaUpsampling");
+    HWY_FULL(float) df;
+    xextra = RoundUpTo(xextra, Lanes(df));
+    auto threefour = Set(df, 0.75f);
+    auto onefour = Set(df, 0.25f);
+    const float* row_in = GetInputRow(input_rows, c_, 0);
+    float* row_out = GetOutputRow(output_rows, c_, 0);
+    for (ssize_t x = -xextra; x < static_cast<ssize_t>(xsize + xextra);
+         x += Lanes(df)) {
+      auto current = Mul(LoadU(df, row_in + x), threefour);
+      auto prev = LoadU(df, row_in + x - 1);
+      auto next = LoadU(df, row_in + x + 1);
+      auto left = MulAdd(onefour, prev, current);
+      auto right = MulAdd(onefour, next, current);
+      StoreInterleaved(df, left, right, row_out + x * 2);
+    }
+  }
+
+  RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+    return c == c_ ? RenderPipelineChannelMode::kInOut
+                   : RenderPipelineChannelMode::kIgnored;
+  }
+
+  const char* GetName() const override { return "HChromaUps"; }
+
+ private:
+  size_t c_;
+};
+
+class VerticalChromaUpsamplingStage : public RenderPipelineStage {
+ public:
+  explicit VerticalChromaUpsamplingStage(size_t channel)
+      : RenderPipelineStage(RenderPipelineStage::Settings::ShiftY(
+            /*shift=*/1, /*border=*/1)),
+        c_(channel) {}
+
+  void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+                  size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+                  size_t thread_id) const final {
+    PROFILER_ZONE("VerticalChromaUpsampling");
+    HWY_FULL(float) df;
+    xextra = RoundUpTo(xextra, Lanes(df));
+    auto threefour = Set(df, 0.75f);
+    auto onefour = Set(df, 0.25f);
+    const float* row_top = GetInputRow(input_rows, c_, -1);
+    const float* row_mid = GetInputRow(input_rows, c_, 0);
+    const float* row_bot = GetInputRow(input_rows, c_, 1);
+    float* row_out0 = GetOutputRow(output_rows, c_, 0);
+    float* row_out1 = GetOutputRow(output_rows, c_, 1);
+    for (ssize_t x = -xextra; x < static_cast<ssize_t>(xsize + xextra);
+         x += Lanes(df)) {
+      auto it = LoadU(df, row_top + x);
+      auto im = LoadU(df, row_mid + x);
+      auto ib = LoadU(df, row_bot + x);
+      auto im_scaled = Mul(im, threefour);
+      Store(MulAdd(it, onefour, im_scaled), df, row_out0 + x);
+      Store(MulAdd(ib, onefour, im_scaled), df, row_out1 + x);
+    }
+  }
+
+  RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+    return c == c_ ? RenderPipelineChannelMode::kInOut
+                   : RenderPipelineChannelMode::kIgnored;
+  }
+
+  const char* GetName() const override { return "VChromaUps"; }
+
+ private:
+  size_t c_;
+};
+
+std::unique_ptr<RenderPipelineStage> GetChromaUpsamplingStage(size_t channel,
+                                                              bool horizontal) {
+  if (horizontal) {
+    return jxl::make_unique<HorizontalChromaUpsamplingStage>(channel);
+  } else {
+    return jxl::make_unique<VerticalChromaUpsamplingStage>(channel);
+  }
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+HWY_EXPORT(GetChromaUpsamplingStage);
+
+std::unique_ptr<RenderPipelineStage> GetChromaUpsamplingStage(size_t channel,
+                                                              bool horizontal) {
+  return HWY_DYNAMIC_DISPATCH(GetChromaUpsamplingStage)(channel, horizontal);
+}
+
+}  // namespace jxl
+#endif
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_chroma_upsampling.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_chroma_upsampling.h
new file mode 100644
index 0000000000..b8bfc15f5f
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_chroma_upsampling.h
@@ -0,0 +1,27 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_CHROMA_UPSAMPLING_H_
+#define LIB_JXL_RENDER_PIPELINE_STAGE_CHROMA_UPSAMPLING_H_
+#include <math.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#include <algorithm>
+#include <utility>
+#include <vector>
+
+#include "lib/jxl/loop_filter.h"
+#include "lib/jxl/render_pipeline/render_pipeline_stage.h"
+
+namespace jxl {
+
+// Applies simple upsampling, either horizontal or vertical, to the given
+// channel.
+std::unique_ptr<RenderPipelineStage> GetChromaUpsamplingStage(size_t channel,
+                                                              bool horizontal);
+}  // namespace jxl
+
+#endif  // LIB_JXL_RENDER_PIPELINE_STAGE_CHROMA_UPSAMPLING_H_
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_epf.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_epf.cc
new file mode 100644
index 0000000000..d59c497843
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_epf.cc
@@ -0,0 +1,524 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/render_pipeline/stage_epf.h"
+
+#include "lib/jxl/epf.h"
+#include "lib/jxl/sanitizers.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_epf.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+// TODO(veluca): In principle, vectors could be not capped, if we want to deal
+// with having two different sigma values in a single vector.
+using DF = HWY_CAPPED(float, 8);
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::AbsDiff;
+using hwy::HWY_NAMESPACE::Add;
+using hwy::HWY_NAMESPACE::Div;
+using hwy::HWY_NAMESPACE::Mul;
+using hwy::HWY_NAMESPACE::MulAdd;
+using hwy::HWY_NAMESPACE::Vec;
+using hwy::HWY_NAMESPACE::VFromD;
+using hwy::HWY_NAMESPACE::ZeroIfNegative;
+
+JXL_INLINE Vec<DF> Weight(Vec<DF> sad, Vec<DF> inv_sigma, Vec<DF> thres) {
+  auto v = MulAdd(sad, inv_sigma, Set(DF(), 1.0f));
+  return ZeroIfNegative(v);
+}
+
+// 5x5 plus-shaped kernel with 5 SADs per pixel (3x3 plus-shaped). So this makes
+// this filter a 7x7 filter.
+class EPF0Stage : public RenderPipelineStage {
+ public:
+  EPF0Stage(const LoopFilter& lf, const ImageF& sigma)
+      : RenderPipelineStage(RenderPipelineStage::Settings::Symmetric(
+            /*shift=*/0, /*border=*/3)),
+        lf_(lf),
+        sigma_(&sigma) {}
+
+  template <bool aligned>
+  JXL_INLINE void AddPixel(int row, float* JXL_RESTRICT rows[3][7], ssize_t x,
+                           Vec<DF> sad, Vec<DF> inv_sigma,
+                           Vec<DF>* JXL_RESTRICT X, Vec<DF>* JXL_RESTRICT Y,
+                           Vec<DF>* JXL_RESTRICT B,
+                           Vec<DF>* JXL_RESTRICT w) const {
+    auto cx = aligned ? Load(DF(), rows[0][3 + row] + x)
+                      : LoadU(DF(), rows[0][3 + row] + x);
+    auto cy = aligned ? Load(DF(), rows[1][3 + row] + x)
+                      : LoadU(DF(), rows[1][3 + row] + x);
+    auto cb = aligned ? Load(DF(), rows[2][3 + row] + x)
+                      : LoadU(DF(), rows[2][3 + row] + x);
+
+    auto weight = Weight(sad, inv_sigma, Set(DF(), lf_.epf_pass1_zeroflush));
+    *w = Add(*w, weight);
+    *X = MulAdd(weight, cx, *X);
+    *Y = MulAdd(weight, cy, *Y);
+    *B = MulAdd(weight, cb, *B);
+  }
+
+  void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+                  size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+                  size_t thread_id) const final {
+    DF df;
+
+    using V = decltype(Zero(df));
+    V t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, tA, tB;
+    V* sads[12] = {&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7, &t8, &t9, &tA, &tB};
+
+    xextra = RoundUpTo(xextra, Lanes(df));
+    const float* JXL_RESTRICT row_sigma =
+        sigma_->Row(ypos / kBlockDim + kSigmaPadding);
+
+    float sm = lf_.epf_pass0_sigma_scale * 1.65;
+    float bsm = sm * lf_.epf_border_sad_mul;
+
+    HWY_ALIGN float sad_mul_center[kBlockDim] = {bsm, sm, sm, sm,
+                                                 sm,  sm, sm, bsm};
+    HWY_ALIGN float sad_mul_border[kBlockDim] = {bsm, bsm, bsm, bsm,
+                                                 bsm, bsm, bsm, bsm};
+    float* JXL_RESTRICT rows[3][7];
+    for (size_t c = 0; c < 3; c++) {
+      for (int i = 0; i < 7; i++) {
+        rows[c][i] = GetInputRow(input_rows, c, i - 3);
+      }
+    }
+
+    const float* sad_mul =
+        (ypos % kBlockDim == 0 || ypos % kBlockDim == kBlockDim - 1)
+            ? sad_mul_border
+            : sad_mul_center;
+
+    for (ssize_t x = -xextra; x < static_cast<ssize_t>(xsize + xextra);
+         x += Lanes(df)) {
+      size_t bx = (x + xpos + kSigmaPadding * kBlockDim) / kBlockDim;
+      size_t ix = (x + xpos) % kBlockDim;
+
+      if (row_sigma[bx] < kMinSigma) {
+        for (size_t c = 0; c < 3; c++) {
+          auto px = Load(df, rows[c][3 + 0] + x);
+          StoreU(px, df, GetOutputRow(output_rows, c, 0) + x);
+        }
+        continue;
+      }
+
+      const auto sm = Load(df, sad_mul + ix);
+      const auto inv_sigma = Mul(Set(df, row_sigma[bx]), sm);
+
+      for (size_t i = 0; i < 12; i++) *sads[i] = Zero(df);
+      constexpr std::array<int, 2> sads_off[12] = {
+          {{-2, 0}}, {{-1, -1}}, {{-1, 0}}, {{-1, 1}}, {{0, -2}}, {{0, -1}},
+          {{0, 1}},  {{0, 2}},   {{1, -1}}, {{1, 0}},  {{1, 1}},  {{2, 0}},
+      };
+
+      // compute sads
+      // TODO(veluca): consider unrolling and optimizing this.
+      for (size_t c = 0; c < 3; c++) {
+        auto scale = Set(df, lf_.epf_channel_scale[c]);
+        for (size_t i = 0; i < 12; i++) {
+          auto sad = Zero(df);
+          constexpr std::array<int, 2> plus_off[] = {
+              {{0, 0}}, {{-1, 0}}, {{0, -1}}, {{1, 0}}, {{0, 1}}};
+          for (size_t j = 0; j < 5; j++) {
+            const auto r11 =
+                LoadU(df, rows[c][3 + plus_off[j][0]] + x + plus_off[j][1]);
+            const auto c11 =
+                LoadU(df, rows[c][3 + sads_off[i][0] + plus_off[j][0]] + x +
+                              sads_off[i][1] + plus_off[j][1]);
+            sad = Add(sad, AbsDiff(r11, c11));
+          }
+          *sads[i] = MulAdd(sad, scale, *sads[i]);
+        }
+      }
+      const auto x_cc = Load(df, rows[0][3 + 0] + x);
+      const auto y_cc = Load(df, rows[1][3 + 0] + x);
+      const auto b_cc = Load(df, rows[2][3 + 0] + x);
+
+      auto w = Set(df, 1);
+      auto X = x_cc;
+      auto Y = y_cc;
+      auto B = b_cc;
+
+      for (size_t i = 0; i < 12; i++) {
+        AddPixel</*aligned=*/false>(/*row=*/sads_off[i][0], rows,
+                                    x + sads_off[i][1], *sads[i], inv_sigma, &X,
+                                    &Y, &B, &w);
+      }
+#if JXL_HIGH_PRECISION
+      auto inv_w = Div(Set(df, 1.0f), w);
+#else
+      auto inv_w = ApproximateReciprocal(w);
+#endif
+      StoreU(Mul(X, inv_w), df, GetOutputRow(output_rows, 0, 0) + x);
+      StoreU(Mul(Y, inv_w), df, GetOutputRow(output_rows, 1, 0) + x);
+      StoreU(Mul(B, inv_w), df, GetOutputRow(output_rows, 2, 0) + x);
+    }
+  }
+
+  RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+    return c < 3 ? RenderPipelineChannelMode::kInOut
+                 : RenderPipelineChannelMode::kIgnored;
+  }
+
+  const char* GetName() const override { return "EPF0"; }
+
+ private:
+  LoopFilter lf_;
+  const ImageF* sigma_;
+};
+
+// 3x3 plus-shaped kernel with 5 SADs per pixel (also 3x3 plus-shaped). So this
+// makes this filter a 5x5 filter.
+class EPF1Stage : public RenderPipelineStage {
+ public:
+  EPF1Stage(const LoopFilter& lf, const ImageF& sigma)
+      : RenderPipelineStage(RenderPipelineStage::Settings::Symmetric(
+            /*shift=*/0, /*border=*/2)),
+        lf_(lf),
+        sigma_(&sigma) {}
+
+  template <bool aligned>
+  JXL_INLINE void AddPixel(int row, float* JXL_RESTRICT rows[3][5], ssize_t x,
+                           Vec<DF> sad, Vec<DF> inv_sigma,
+                           Vec<DF>* JXL_RESTRICT X, Vec<DF>* JXL_RESTRICT Y,
+                           Vec<DF>* JXL_RESTRICT B,
+                           Vec<DF>* JXL_RESTRICT w) const {
+    auto cx = aligned ? Load(DF(), rows[0][2 + row] + x)
+                      : LoadU(DF(), rows[0][2 + row] + x);
+    auto cy = aligned ? Load(DF(), rows[1][2 + row] + x)
+                      : LoadU(DF(), rows[1][2 + row] + x);
+    auto cb = aligned ? Load(DF(), rows[2][2 + row] + x)
+                      : LoadU(DF(), rows[2][2 + row] + x);
+
+    auto weight = Weight(sad, inv_sigma, Set(DF(), lf_.epf_pass1_zeroflush));
+    *w = Add(*w, weight);
+    *X = MulAdd(weight, cx, *X);
+    *Y = MulAdd(weight, cy, *Y);
+    *B = MulAdd(weight, cb, *B);
+  }
+
+  void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+                  size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+                  size_t thread_id) const final {
+    DF df;
+    xextra = RoundUpTo(xextra, Lanes(df));
+    const float* JXL_RESTRICT row_sigma =
+        sigma_->Row(ypos / kBlockDim + kSigmaPadding);
+
+    float sm = 1.65f;
+    float bsm = sm * lf_.epf_border_sad_mul;
+
+    HWY_ALIGN float sad_mul_center[kBlockDim] = {bsm, sm, sm, sm,
+                                                 sm,  sm, sm, bsm};
+    HWY_ALIGN float sad_mul_border[kBlockDim] = {bsm, bsm, bsm, bsm,
+                                                 bsm, bsm, bsm, bsm};
+
+    float* JXL_RESTRICT rows[3][5];
+    for (size_t c = 0; c < 3; c++) {
+      for (int i = 0; i < 5; i++) {
+        rows[c][i] = GetInputRow(input_rows, c, i - 2);
+      }
+    }
+
+    const float* sad_mul =
+        (ypos % kBlockDim == 0 || ypos % kBlockDim == kBlockDim - 1)
+            ? sad_mul_border
+            : sad_mul_center;
+
+    for (ssize_t x = -xextra; x < static_cast<ssize_t>(xsize + xextra);
+         x += Lanes(df)) {
+      size_t bx = (x + xpos + kSigmaPadding * kBlockDim) / kBlockDim;
+      size_t ix = (x + xpos) % kBlockDim;
+
+      if (row_sigma[bx] < kMinSigma) {
+        for (size_t c = 0; c < 3; c++) {
+          auto px = Load(df, rows[c][2 + 0] + x);
+          Store(px, df, GetOutputRow(output_rows, c, 0) + x);
+        }
+        continue;
+      }
+
+      const auto sm = Load(df, sad_mul + ix);
+      const auto inv_sigma = Mul(Set(df, row_sigma[bx]), sm);
+      auto sad0 = Zero(df);
+      auto sad1 = Zero(df);
+      auto sad2 = Zero(df);
+      auto sad3 = Zero(df);
+
+      // compute sads
+      for (size_t c = 0; c < 3; c++) {
+        // center px = 22, px above = 21
+        auto t = Undefined(df);
+
+        const auto p20 = Load(df, rows[c][2 + -2] + x);
+        const auto p21 = Load(df, rows[c][2 + -1] + x);
+        auto sad0c = AbsDiff(p20, p21);  // SAD 2, 1
+
+        const auto p11 = LoadU(df, rows[c][2 + -1] + x - 1);
+        auto sad1c = AbsDiff(p11, p21);  // SAD 1, 2
+
+        const auto p31 = LoadU(df, rows[c][2 + -1] + x + 1);
+        auto sad2c = AbsDiff(p31, p21);  // SAD 3, 2
+
+        const auto p02 = LoadU(df, rows[c][2 + 0] + x - 2);
+        const auto p12 = LoadU(df, rows[c][2 + 0] + x - 1);
+        sad1c = Add(sad1c, AbsDiff(p02, p12));  // SAD 1, 2
+        sad0c = Add(sad0c, AbsDiff(p11, p12));  // SAD 2, 1
+
+        const auto p22 = LoadU(df, rows[c][2 + 0] + x);
+        t = AbsDiff(p12, p22);
+        sad1c = Add(sad1c, t);  // SAD 1, 2
+        sad2c = Add(sad2c, t);  // SAD 3, 2
+        t = AbsDiff(p22, p21);
+        auto sad3c = t;  // SAD 2, 3
+        sad0c = Add(sad0c, t);  // SAD 2, 1
+
+        const auto p32 = LoadU(df, rows[c][2 + 0] + x + 1);
+        sad0c = Add(sad0c, AbsDiff(p31, p32));  // SAD 2, 1
+        t = AbsDiff(p22, p32);
+        sad1c = Add(sad1c, t);  // SAD 1, 2
+        sad2c = Add(sad2c, t);  // SAD 3, 2
+
+        const auto p42 = LoadU(df, rows[c][2 + 0] + x + 2);
+        sad2c = Add(sad2c, AbsDiff(p42, p32));  // SAD 3, 2
+
+        const auto p13 = LoadU(df, rows[c][2 + 1] + x - 1);
+        sad3c = Add(sad3c, AbsDiff(p13, p12));  // SAD 2, 3
+
+        const auto p23 = Load(df, rows[c][2 + 1] + x);
+        t = AbsDiff(p22, p23);
+        sad0c = Add(sad0c, t);                  // SAD 2, 1
+        sad3c = Add(sad3c, t);                  // SAD 2, 3
+        sad1c = Add(sad1c, AbsDiff(p13, p23));  // SAD 1, 2
+
+        const auto p33 = LoadU(df, rows[c][2 + 1] + x + 1);
+        sad2c = Add(sad2c, AbsDiff(p33, p23));  // SAD 3, 2
+        sad3c = Add(sad3c, AbsDiff(p33, p32));  // SAD 2, 3
+
+        const auto p24 = Load(df, rows[c][2 + 2] + x);
+        sad3c = Add(sad3c, AbsDiff(p24, p23));  // SAD 2, 3
+
+        auto scale = Set(df, lf_.epf_channel_scale[c]);
+        sad0 = MulAdd(sad0c, scale, sad0);
+        sad1 = MulAdd(sad1c, scale, sad1);
+        sad2 = MulAdd(sad2c, scale, sad2);
+        sad3 = MulAdd(sad3c, scale, sad3);
+      }
+      const auto x_cc = Load(df, rows[0][2 + 0] + x);
+      const auto y_cc = Load(df, rows[1][2 + 0] + x);
+      const auto b_cc = Load(df, rows[2][2 + 0] + x);
+
+      auto w = Set(df, 1);
+      auto X = x_cc;
+      auto Y = y_cc;
+      auto B = b_cc;
+
+      // Top row
+      AddPixel</*aligned=*/true>(/*row=*/-1, rows, x, sad0, inv_sigma, &X, &Y,
+                                 &B, &w);
+      // Center
+      AddPixel</*aligned=*/false>(/*row=*/0, rows, x - 1, sad1, inv_sigma, &X,
+                                  &Y, &B, &w);
+      AddPixel</*aligned=*/false>(/*row=*/0, rows, x + 1, sad2, inv_sigma, &X,
+                                  &Y, &B, &w);
+      // Bottom
+      AddPixel</*aligned=*/true>(/*row=*/1, rows, x, sad3, inv_sigma, &X, &Y,
+                                 &B, &w);
+#if JXL_HIGH_PRECISION
+      auto inv_w = Div(Set(df, 1.0f), w);
+#else
+      auto inv_w = ApproximateReciprocal(w);
+#endif
+      Store(Mul(X, inv_w), df, GetOutputRow(output_rows, 0, 0) + x);
+      Store(Mul(Y, inv_w), df, GetOutputRow(output_rows, 1, 0) + x);
+      Store(Mul(B, inv_w), df, GetOutputRow(output_rows, 2, 0) + x);
+    }
+  }
+
+  RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+    return c < 3 ? RenderPipelineChannelMode::kInOut
+                 : RenderPipelineChannelMode::kIgnored;
+  }
+
+  const char* GetName() const override { return "EPF1"; }
+
+ private:
+  LoopFilter lf_;
+  const ImageF* sigma_;
+};
+
+// 3x3 plus-shaped kernel with 1 SAD per pixel. So this makes this filter a 3x3
+// filter.
+class EPF2Stage : public RenderPipelineStage {
+ public:
+  EPF2Stage(const LoopFilter& lf, const ImageF& sigma)
+      : RenderPipelineStage(RenderPipelineStage::Settings::Symmetric(
+            /*shift=*/0, /*border=*/1)),
+        lf_(lf),
+        sigma_(&sigma) {}
+
+  template <bool aligned>
+  JXL_INLINE void AddPixel(int row, float* JXL_RESTRICT rows[3][3], ssize_t x,
+                           Vec<DF> rx, Vec<DF> ry, Vec<DF> rb,
+                           Vec<DF> inv_sigma, Vec<DF>* JXL_RESTRICT X,
+                           Vec<DF>* JXL_RESTRICT Y, Vec<DF>* JXL_RESTRICT B,
+                           Vec<DF>* JXL_RESTRICT w) const {
+    auto cx = aligned ? Load(DF(), rows[0][1 + row] + x)
+                      : LoadU(DF(), rows[0][1 + row] + x);
+    auto cy = aligned ? Load(DF(), rows[1][1 + row] + x)
+                      : LoadU(DF(), rows[1][1 + row] + x);
+    auto cb = aligned ? Load(DF(), rows[2][1 + row] + x)
+                      : LoadU(DF(), rows[2][1 + row] + x);
+
+    auto sad = Mul(AbsDiff(cx, rx), Set(DF(), lf_.epf_channel_scale[0]));
+    sad = MulAdd(AbsDiff(cy, ry), Set(DF(), lf_.epf_channel_scale[1]), sad);
+    sad = MulAdd(AbsDiff(cb, rb), Set(DF(), lf_.epf_channel_scale[2]), sad);
+
+    auto weight = Weight(sad, inv_sigma, Set(DF(), lf_.epf_pass2_zeroflush));
+
+    *w = Add(*w, weight);
+    *X = MulAdd(weight, cx, *X);
+    *Y = MulAdd(weight, cy, *Y);
+    *B = MulAdd(weight, cb, *B);
+  }
+
+  void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+                  size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+                  size_t thread_id) const final {
+    DF df;
+    xextra = RoundUpTo(xextra, Lanes(df));
+    const float* JXL_RESTRICT row_sigma =
+        sigma_->Row(ypos / kBlockDim + kSigmaPadding);
+
+    float sm = lf_.epf_pass2_sigma_scale * 1.65;
+    float bsm = sm * lf_.epf_border_sad_mul;
+
+    HWY_ALIGN float sad_mul_center[kBlockDim] = {bsm, sm, sm, sm,
+                                                 sm,  sm, sm, bsm};
+    HWY_ALIGN float sad_mul_border[kBlockDim] = {bsm, bsm, bsm, bsm,
+                                                 bsm, bsm, bsm, bsm};
+
+    float* JXL_RESTRICT rows[3][3];
+    for (size_t c = 0; c < 3; c++) {
+      for (int i = 0; i < 3; i++) {
+        rows[c][i] = GetInputRow(input_rows, c, i - 1);
+      }
+    }
+
+    const float* sad_mul =
+        (ypos % kBlockDim == 0 || ypos % kBlockDim == kBlockDim - 1)
+            ? sad_mul_border
+            : sad_mul_center;
+
+    for (ssize_t x = -xextra; x < static_cast<ssize_t>(xsize + xextra);
+         x += Lanes(df)) {
+      size_t bx = (x + xpos + kSigmaPadding * kBlockDim) / kBlockDim;
+      size_t ix = (x + xpos) % kBlockDim;
+
+      if (row_sigma[bx] < kMinSigma) {
+        for (size_t c = 0; c < 3; c++) {
+          auto px = Load(df, rows[c][1 + 0] + x);
+          Store(px, df, GetOutputRow(output_rows, c, 0) + x);
+        }
+        continue;
+      }
+
+      const auto sm = Load(df, sad_mul + ix);
+      const auto inv_sigma = Mul(Set(df, row_sigma[bx]), sm);
+
+      const auto x_cc = Load(df, rows[0][1 + 0] + x);
+      const auto y_cc = Load(df, rows[1][1 + 0] + x);
+      const auto b_cc = Load(df, rows[2][1 + 0] + x);
+
+      auto w = Set(df, 1);
+      auto X = x_cc;
+      auto Y = y_cc;
+      auto B = b_cc;
+
+      // Top row
+      AddPixel</*aligned=*/true>(/*row=*/-1, rows, x, x_cc, y_cc, b_cc,
+                                 inv_sigma, &X, &Y, &B, &w);
+      // Center
+      AddPixel</*aligned=*/false>(/*row=*/0, rows, x - 1, x_cc, y_cc, b_cc,
+                                  inv_sigma, &X, &Y, &B, &w);
+      AddPixel</*aligned=*/false>(/*row=*/0, rows, x + 1, x_cc, y_cc, b_cc,
+                                  inv_sigma, &X, &Y, &B, &w);
+      // Bottom
+      AddPixel</*aligned=*/true>(/*row=*/1, rows, x, x_cc, y_cc, b_cc,
+                                 inv_sigma, &X, &Y, &B, &w);
+#if JXL_HIGH_PRECISION
+      auto inv_w = Div(Set(df, 1.0f), w);
+#else
+      auto inv_w = ApproximateReciprocal(w);
+#endif
+      Store(Mul(X, inv_w), df, GetOutputRow(output_rows, 0, 0) + x);
+      Store(Mul(Y, inv_w), df, GetOutputRow(output_rows, 1, 0) + x);
+      Store(Mul(B, inv_w), df, GetOutputRow(output_rows, 2, 0) + x);
+    }
+  }
+
+  RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+    return c < 3 ? RenderPipelineChannelMode::kInOut
+                 : RenderPipelineChannelMode::kIgnored;
+  }
+
+  const char* GetName() const override { return "EPF2"; }
+
+ private:
+  LoopFilter lf_;
+  const ImageF* sigma_;
+};
+
+std::unique_ptr<RenderPipelineStage> GetEPFStage0(const LoopFilter& lf,
+                                                  const ImageF& sigma) {
+  return jxl::make_unique<EPF0Stage>(lf, sigma);
+}
+
+std::unique_ptr<RenderPipelineStage> GetEPFStage1(const LoopFilter& lf,
+                                                  const ImageF& sigma) {
+  return jxl::make_unique<EPF1Stage>(lf, sigma);
+}
+
+std::unique_ptr<RenderPipelineStage> GetEPFStage2(const LoopFilter& lf,
+                                                  const ImageF& sigma) {
+  return jxl::make_unique<EPF2Stage>(lf, sigma);
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+HWY_EXPORT(GetEPFStage0);
+HWY_EXPORT(GetEPFStage1);
+HWY_EXPORT(GetEPFStage2);
+
+std::unique_ptr<RenderPipelineStage> GetEPFStage(const LoopFilter& lf,
+                                                 const ImageF& sigma,
+                                                 size_t epf_stage) {
+  JXL_ASSERT(lf.epf_iters != 0);
+  switch (epf_stage) {
+    case 0:
+      return HWY_DYNAMIC_DISPATCH(GetEPFStage0)(lf, sigma);
+    case 1:
+      return HWY_DYNAMIC_DISPATCH(GetEPFStage1)(lf, sigma);
+    case 2:
+      return HWY_DYNAMIC_DISPATCH(GetEPFStage2)(lf, sigma);
+    default:
+      JXL_ABORT("Invalid EPF stage");
+  }
+}
+
+}  // namespace jxl
+#endif
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_epf.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_epf.h
new file mode 100644
index 0000000000..c9d0d0c785
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_epf.h
@@ -0,0 +1,31 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_EPF_H_
+#define LIB_JXL_RENDER_PIPELINE_STAGE_EPF_H_
+#include <math.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#include <algorithm>
+#include <utility>
+#include <vector>
+
+#include "lib/jxl/image.h"
+#include "lib/jxl/loop_filter.h"
+#include "lib/jxl/render_pipeline/render_pipeline_stage.h"
+
+namespace jxl {
+
+// Applies the `epf_stage`-th EPF step with the given settings and `sigma`.
+// `sigma` will be accessed with an offset of (kSigmaPadding, kSigmaPadding),
+// and should have (kSigmaBorder, kSigmaBorder) mirrored sigma values available
+// around the main image. See also filters.(h|cc)
+std::unique_ptr<RenderPipelineStage> GetEPFStage(const LoopFilter& lf,
+                                                 const ImageF& sigma,
+                                                 size_t epf_stage);
+}  // namespace jxl
+
+#endif  // LIB_JXL_RENDER_PIPELINE_STAGE_EPF_H_
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_from_linear.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_from_linear.cc
new file mode 100644
index 0000000000..c7b22c663b
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_from_linear.cc
@@ -0,0 +1,191 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/render_pipeline/stage_from_linear.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_from_linear.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/dec_tone_mapping-inl.h"
+#include "lib/jxl/sanitizers.h"
+#include "lib/jxl/transfer_functions-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+namespace {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::IfThenZeroElse;
+
+template <typename Op>
+struct PerChannelOp {
+  explicit PerChannelOp(Op op) : op(op) {}
+  template <typename D, typename T>
+  void Transform(D d, T* r, T* g, T* b) const {
+    *r = op.Transform(d, *r);
+    *g = op.Transform(d, *g);
+    *b = op.Transform(d, *b);
+  }
+
+  Op op;
+};
+template <typename Op>
+PerChannelOp<Op> MakePerChannelOp(Op&& op) {
+  return PerChannelOp<Op>(std::forward<Op>(op));
+}
+
+struct OpLinear {
+  template <typename D, typename T>
+  T Transform(D d, const T& linear) const {
+    return linear;
+  }
+};
+
+struct OpRgb {
+  template <typename D, typename T>
+  T Transform(D d, const T& linear) const {
+#if JXL_HIGH_PRECISION
+    return TF_SRGB().EncodedFromDisplay(d, linear);
+#else
+    return FastLinearToSRGB(d, linear);
+#endif
+  }
+};
+
+struct OpPq {
+  template <typename D, typename T>
+  T Transform(D d, const T& linear) const {
+    return TF_PQ().EncodedFromDisplay(d, linear);
+  }
+};
+
+struct OpHlg {
+  explicit OpHlg(const float luminances[3], const float intensity_target)
+      : hlg_ootf_(HlgOOTF::ToSceneLight(/*display_luminance=*/intensity_target,
+                                        luminances)) {}
+
+  template <typename D, typename T>
+  void Transform(D d, T* r, T* g, T* b) const {
+    hlg_ootf_.Apply(r, g, b);
+    *r = TF_HLG().EncodedFromDisplay(d, *r);
+    *g = TF_HLG().EncodedFromDisplay(d, *g);
+    *b = TF_HLG().EncodedFromDisplay(d, *b);
+  }
+  HlgOOTF hlg_ootf_;
+};
+
+struct Op709 {
+  template <typename D, typename T>
+  T Transform(D d, const T& linear) const {
+    return TF_709().EncodedFromDisplay(d, linear);
+  }
+};
+
+struct OpGamma {
+  const float inverse_gamma;
+  template <typename D, typename T>
+  T Transform(D d, const T& linear) const {
+    return IfThenZeroElse(Le(linear, Set(d, 1e-5f)),
+                          FastPowf(d, linear, Set(d, inverse_gamma)));
+  }
+};
+
+template <typename Op>
+class FromLinearStage : public RenderPipelineStage {
+ public:
+  explicit FromLinearStage(Op op)
+      : RenderPipelineStage(RenderPipelineStage::Settings()),
+        op_(std::move(op)) {}
+
+  void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+                  size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+                  size_t thread_id) const final {
+    PROFILER_ZONE("FromLinear");
+    const HWY_FULL(float) d;
+    const size_t xsize_v = RoundUpTo(xsize, Lanes(d));
+    float* JXL_RESTRICT row0 = GetInputRow(input_rows, 0, 0);
+    float* JXL_RESTRICT row1 = GetInputRow(input_rows, 1, 0);
+    float* JXL_RESTRICT row2 = GetInputRow(input_rows, 2, 0);
+    // All calculations are lane-wise, still some might require
+    // value-dependent behaviour (e.g. NearestInt). Temporary unpoison last
+    // vector tail.
+    msan::UnpoisonMemory(row0 + xsize, sizeof(float) * (xsize_v - xsize));
+    msan::UnpoisonMemory(row1 + xsize, sizeof(float) * (xsize_v - xsize));
+    msan::UnpoisonMemory(row2 + xsize, sizeof(float) * (xsize_v - xsize));
+    for (ssize_t x = -xextra; x < (ssize_t)(xsize + xextra); x += Lanes(d)) {
+      auto r = LoadU(d, row0 + x);
+      auto g = LoadU(d, row1 + x);
+      auto b = LoadU(d, row2 + x);
+      op_.Transform(d, &r, &g, &b);
+      StoreU(r, d, row0 + x);
+      StoreU(g, d, row1 + x);
+      StoreU(b, d, row2 + x);
+    }
+    msan::PoisonMemory(row0 + xsize, sizeof(float) * (xsize_v - xsize));
+    msan::PoisonMemory(row1 + xsize, sizeof(float) * (xsize_v - xsize));
+    msan::PoisonMemory(row2 + xsize, sizeof(float) * (xsize_v - xsize));
+  }
+
+  RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+    return c < 3 ? RenderPipelineChannelMode::kInPlace
+                 : RenderPipelineChannelMode::kIgnored;
+  }
+
+  const char* GetName() const override { return "FromLinear"; }
+
+ private:
+  Op op_;
+};
+
+template <typename Op>
+std::unique_ptr<FromLinearStage<Op>> MakeFromLinearStage(Op&& op) {
+  return jxl::make_unique<FromLinearStage<Op>>(std::forward<Op>(op));
+}
+
+std::unique_ptr<RenderPipelineStage> GetFromLinearStage(
+    const OutputEncodingInfo& output_encoding_info) {
+  if (output_encoding_info.color_encoding.tf.IsLinear()) {
+    return MakeFromLinearStage(MakePerChannelOp(OpLinear()));
+  } else if (output_encoding_info.color_encoding.tf.IsSRGB()) {
+    return MakeFromLinearStage(MakePerChannelOp(OpRgb()));
+  } else if (output_encoding_info.color_encoding.tf.IsPQ()) {
+    return MakeFromLinearStage(MakePerChannelOp(OpPq()));
+  } else if (output_encoding_info.color_encoding.tf.IsHLG()) {
+    return MakeFromLinearStage(
+        OpHlg(output_encoding_info.luminances,
+              output_encoding_info.desired_intensity_target));
+  } else if (output_encoding_info.color_encoding.tf.Is709()) {
+    return MakeFromLinearStage(MakePerChannelOp(Op709()));
+  } else if (output_encoding_info.color_encoding.tf.IsGamma() ||
+             output_encoding_info.color_encoding.tf.IsDCI()) {
+    return MakeFromLinearStage(
+        MakePerChannelOp(OpGamma{output_encoding_info.inverse_gamma}));
+  } else {
+    // This is a programming error.
+    JXL_ABORT("Invalid target encoding");
+  }
+}
+
+}  // namespace
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+HWY_EXPORT(GetFromLinearStage);
+
+std::unique_ptr<RenderPipelineStage> GetFromLinearStage(
+    const OutputEncodingInfo& output_encoding_info) {
+  return HWY_DYNAMIC_DISPATCH(GetFromLinearStage)(output_encoding_info);
+}
+
+}  // namespace jxl
+#endif
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_from_linear.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_from_linear.h
new file mode 100644
index 0000000000..548ab50b8c
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_from_linear.h
@@ -0,0 +1,20 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_FROM_LINEAR_H_
+#define LIB_JXL_RENDER_PIPELINE_STAGE_FROM_LINEAR_H_
+
+#include "lib/jxl/dec_xyb.h"
+#include "lib/jxl/render_pipeline/render_pipeline_stage.h"
+
+namespace jxl {
+
+// Converts the color channels from linear to the specified output encoding.
+std::unique_ptr<RenderPipelineStage> GetFromLinearStage(
+    const OutputEncodingInfo& output_encoding_info);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_RENDER_PIPELINE_STAGE_FROM_LINEAR_H_
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_gaborish.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_gaborish.cc
new file mode 100644
index 0000000000..fc90acb476
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_gaborish.cc
@@ -0,0 +1,122 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/render_pipeline/stage_gaborish.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_gaborish.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Add;
+using hwy::HWY_NAMESPACE::Mul;
+using hwy::HWY_NAMESPACE::MulAdd;
+
+class GaborishStage : public RenderPipelineStage {
+ public:
+  explicit GaborishStage(const LoopFilter& lf)
+      : RenderPipelineStage(RenderPipelineStage::Settings::Symmetric(
+            /*shift=*/0, /*border=*/1)) {
+    weights_[0] = 1;
+    weights_[1] = lf.gab_x_weight1;
+    weights_[2] = lf.gab_x_weight2;
+    weights_[3] = 1;
+    weights_[4] = lf.gab_y_weight1;
+    weights_[5] = lf.gab_y_weight2;
+    weights_[6] = 1;
+    weights_[7] = lf.gab_b_weight1;
+    weights_[8] = lf.gab_b_weight2;
+    // Normalize
+    for (size_t c = 0; c < 3; c++) {
+      const float div =
+          weights_[3 * c] + 4 * (weights_[3 * c + 1] + weights_[3 * c + 2]);
+      const float mul = 1.0f / div;
+      weights_[3 * c] *= mul;
+      weights_[3 * c + 1] *= mul;
+      weights_[3 * c + 2] *= mul;
+    }
+  }
+
+  void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+                  size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+                  size_t thread_id) const final {
+    PROFILER_ZONE("Gaborish");
+
+    const HWY_FULL(float) d;
+    for (size_t c = 0; c < 3; c++) {
+      float* JXL_RESTRICT row_t = GetInputRow(input_rows, c, -1);
+      float* JXL_RESTRICT row_m = GetInputRow(input_rows, c, 0);
+      float* JXL_RESTRICT row_b = GetInputRow(input_rows, c, 1);
+      float* JXL_RESTRICT row_out = GetOutputRow(output_rows, c, 0);
+      const auto w0 = Set(d, weights_[3 * c + 0]);
+      const auto w1 = Set(d, weights_[3 * c + 1]);
+      const auto w2 = Set(d, weights_[3 * c + 2]);
+// Group data need only be aligned to a block; for >=512 bit vectors, this may
+// result in unaligned loads.
+#if HWY_CAP_GE512
+#define LoadMaybeU LoadU
+#else
+#define LoadMaybeU Load
+#endif
+      // Since GetInputRow(input_rows, c, {-1, 0, 1}) is aligned, rounding
+      // xextra up to Lanes(d) doesn't access anything problematic.
+      for (ssize_t x = -RoundUpTo(xextra, Lanes(d));
+           x < (ssize_t)(xsize + xextra); x += Lanes(d)) {
+        const auto t = LoadMaybeU(d, row_t + x);
+        const auto tl = LoadU(d, row_t + x - 1);
+        const auto tr = LoadU(d, row_t + x + 1);
+        const auto m = LoadMaybeU(d, row_m + x);
+        const auto l = LoadU(d, row_m + x - 1);
+        const auto r = LoadU(d, row_m + x + 1);
+        const auto b = LoadMaybeU(d, row_b + x);
+        const auto bl = LoadU(d, row_b + x - 1);
+        const auto br = LoadU(d, row_b + x + 1);
+        const auto sum0 = m;
+        const auto sum1 = Add(Add(l, r), Add(t, b));
+        const auto sum2 = Add(Add(tl, tr), Add(bl, br));
+        auto pixels = MulAdd(sum2, w2, MulAdd(sum1, w1, Mul(sum0, w0)));
+        Store(pixels, d, row_out + x);
+      }
+    }
+  }
+#undef LoadMaybeU
+
+  RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+    return c < 3 ? RenderPipelineChannelMode::kInOut
+                 : RenderPipelineChannelMode::kIgnored;
+  }
+
+  const char* GetName() const override { return "Gab"; }
+
+ private:
+  float weights_[9];
+};
+
+std::unique_ptr<RenderPipelineStage> GetGaborishStage(const LoopFilter& lf) {
+  return jxl::make_unique<GaborishStage>(lf);
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+HWY_EXPORT(GetGaborishStage);
+
+std::unique_ptr<RenderPipelineStage> GetGaborishStage(const LoopFilter& lf) {
+  JXL_ASSERT(lf.gab == 1);
+  return HWY_DYNAMIC_DISPATCH(GetGaborishStage)(lf);
+}
+
+}  // namespace jxl
+#endif
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_gaborish.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_gaborish.h
new file mode 100644
index 0000000000..761800f668
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_gaborish.h
@@ -0,0 +1,25 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_GABORISH_H_
+#define LIB_JXL_RENDER_PIPELINE_STAGE_GABORISH_H_
+#include <math.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#include <algorithm>
+#include <utility>
+#include <vector>
+
+#include "lib/jxl/loop_filter.h"
+#include "lib/jxl/render_pipeline/render_pipeline_stage.h"
+
+namespace jxl {
+
+// Applies decoder-side Gaborish with the given settings. `lf.gab` must be 1.
+std::unique_ptr<RenderPipelineStage> GetGaborishStage(const LoopFilter& lf);
+}  // namespace jxl
+
+#endif  // LIB_JXL_RENDER_PIPELINE_STAGE_GABORISH_H_
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_noise.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_noise.cc
new file mode 100644
index 0000000000..187095cf61
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_noise.cc
@@ -0,0 +1,311 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/render_pipeline/stage_noise.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_noise.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/sanitizers.h"
+#include "lib/jxl/transfer_functions-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Max;
+using hwy::HWY_NAMESPACE::ShiftRight;
+using hwy::HWY_NAMESPACE::Vec;
+using hwy::HWY_NAMESPACE::ZeroIfNegative;
+
+using D = HWY_CAPPED(float, kBlockDim);
+using DI = hwy::HWY_NAMESPACE::Rebind<int32_t, D>;
+using DI8 = hwy::HWY_NAMESPACE::Repartition<uint8_t, D>;
+
+// [0, max_value]
+template <class D, class V>
+static HWY_INLINE V Clamp0ToMax(D d, const V x, const V max_value) {
+  const auto clamped = Min(x, max_value);
+  return ZeroIfNegative(clamped);
+}
+
+// x is in [0+delta, 1+delta], delta ~= 0.06
+template <class StrengthEval>
+typename StrengthEval::V NoiseStrength(const StrengthEval& eval,
+                                       const typename StrengthEval::V x) {
+  return Clamp0ToMax(D(), eval(x), Set(D(), 1.0f));
+}
+
+// TODO(veluca): SIMD-fy.
+class StrengthEvalLut {
+ public:
+  using V = Vec<D>;
+
+  explicit StrengthEvalLut(const NoiseParams& noise_params)
+#if HWY_TARGET == HWY_SCALAR
+      : noise_params_(noise_params)
+#endif
+  {
+#if HWY_TARGET != HWY_SCALAR
+    uint32_t lut[8];
+    memcpy(lut, noise_params.lut, sizeof(lut));
+    for (size_t i = 0; i < 8; i++) {
+      low16_lut[2 * i] = (lut[i] >> 0) & 0xFF;
+      low16_lut[2 * i + 1] = (lut[i] >> 8) & 0xFF;
+      high16_lut[2 * i] = (lut[i] >> 16) & 0xFF;
+      high16_lut[2 * i + 1] = (lut[i] >> 24) & 0xFF;
+    }
+#endif
+  }
+
+  V operator()(const V vx) const {
+    constexpr size_t kScale = NoiseParams::kNumNoisePoints - 2;
+    auto scaled_vx = Max(Zero(D()), Mul(vx, Set(D(), kScale)));
+    auto floor_x = Floor(scaled_vx);
+    auto frac_x = Sub(scaled_vx, floor_x);
+    floor_x = IfThenElse(Ge(scaled_vx, Set(D(), kScale + 1)), Set(D(), kScale),
+                         floor_x);
+    frac_x =
+        IfThenElse(Ge(scaled_vx, Set(D(), kScale + 1)), Set(D(), 1), frac_x);
+    auto floor_x_int = ConvertTo(DI(), floor_x);
+#if HWY_TARGET == HWY_SCALAR
+    auto low = Set(D(), noise_params_.lut[floor_x_int.raw]);
+    auto hi = Set(D(), noise_params_.lut[floor_x_int.raw + 1]);
+#else
+    // Set each lane's bytes to {0, 0, 2x+1, 2x}.
+    auto floorx_indices_low =
+        Add(Mul(floor_x_int, Set(DI(), 0x0202)), Set(DI(), 0x0100));
+    // Set each lane's bytes to {2x+1, 2x, 0, 0}.
+    auto floorx_indices_hi =
+        Add(Mul(floor_x_int, Set(DI(), 0x02020000)), Set(DI(), 0x01000000));
+    // load LUT
+    auto low16 = BitCast(DI(), LoadDup128(DI8(), low16_lut));
+    auto lowm = Set(DI(), 0xFFFF);
+    auto hi16 = BitCast(DI(), LoadDup128(DI8(), high16_lut));
+    auto him = Set(DI(), 0xFFFF0000);
+    // low = noise_params.lut[floor_x]
+    auto low =
+        BitCast(D(), Or(And(TableLookupBytes(low16, floorx_indices_low), lowm),
+                        And(TableLookupBytes(hi16, floorx_indices_hi), him)));
+    // hi = noise_params.lut[floor_x+1]
+    floorx_indices_low = Add(floorx_indices_low, Set(DI(), 0x0202));
+    floorx_indices_hi = Add(floorx_indices_hi, Set(DI(), 0x02020000));
+    auto hi =
+        BitCast(D(), Or(And(TableLookupBytes(low16, floorx_indices_low), lowm),
+                        And(TableLookupBytes(hi16, floorx_indices_hi), him)));
+#endif
+    return MulAdd(Sub(hi, low), frac_x, low);
+  }
+
+ private:
+#if HWY_TARGET != HWY_SCALAR
+  // noise_params.lut transformed into two 16-bit lookup tables.
+  HWY_ALIGN uint8_t high16_lut[16];
+  HWY_ALIGN uint8_t low16_lut[16];
+#else
+  const NoiseParams& noise_params_;
+#endif
+};
+
+template <class D>
+void AddNoiseToRGB(const D d, const Vec<D> rnd_noise_r,
+                   const Vec<D> rnd_noise_g, const Vec<D> rnd_noise_cor,
+                   const Vec<D> noise_strength_g, const Vec<D> noise_strength_r,
+                   float ytox, float ytob, float* JXL_RESTRICT out_x,
+                   float* JXL_RESTRICT out_y, float* JXL_RESTRICT out_b) {
+  const auto kRGCorr = Set(d, 0.9921875f);   // 127/128
+  const auto kRGNCorr = Set(d, 0.0078125f);  // 1/128
+
+  const auto red_noise =
+      Mul(noise_strength_r,
+          MulAdd(kRGNCorr, rnd_noise_r, Mul(kRGCorr, rnd_noise_cor)));
+  const auto green_noise =
+      Mul(noise_strength_g,
+          MulAdd(kRGNCorr, rnd_noise_g, Mul(kRGCorr, rnd_noise_cor)));
+
+  auto vx = LoadU(d, out_x);
+  auto vy = LoadU(d, out_y);
+  auto vb = LoadU(d, out_b);
+
+  const auto rg_noise = Add(red_noise, green_noise);
+  vx = Add(MulAdd(Set(d, ytox), rg_noise, Sub(red_noise, green_noise)), vx);
+  vy = Add(vy, rg_noise);
+  vb = MulAdd(Set(d, ytob), rg_noise, vb);
+
+  StoreU(vx, d, out_x);
+  StoreU(vy, d, out_y);
+  StoreU(vb, d, out_b);
+}
+
+class AddNoiseStage : public RenderPipelineStage {
+ public:
+  AddNoiseStage(const NoiseParams& noise_params,
+                const ColorCorrelationMap& cmap, size_t first_c)
+      : RenderPipelineStage(RenderPipelineStage::Settings::Symmetric(
+            /*shift=*/0, /*border=*/0)),
+        noise_params_(noise_params),
+        cmap_(cmap),
+        first_c_(first_c) {}
+
+  void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+                  size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+                  size_t thread_id) const final {
+    PROFILER_ZONE("Noise apply");
+
+    if (!noise_params_.HasAny()) return;
+    const StrengthEvalLut noise_model(noise_params_);
+    D d;
+    const auto half = Set(d, 0.5f);
+
+    // With the prior subtract-random Laplacian approximation, rnd_* ranges were
+    // about [-1.5, 1.6]; Laplacian3 about doubles this to [-3.6, 3.6], so the
+    // normalizer is half of what it was before (0.5).
+    const auto norm_const = Set(d, 0.22f);
+
+    float ytox = cmap_.YtoXRatio(0);
+    float ytob = cmap_.YtoBRatio(0);
+
+    const size_t xsize_v = RoundUpTo(xsize, Lanes(d));
+
+    float* JXL_RESTRICT row_x = GetInputRow(input_rows, 0, 0);
+    float* JXL_RESTRICT row_y = GetInputRow(input_rows, 1, 0);
+    float* JXL_RESTRICT row_b = GetInputRow(input_rows, 2, 0);
+    const float* JXL_RESTRICT row_rnd_r =
+        GetInputRow(input_rows, first_c_ + 0, 0);
+    const float* JXL_RESTRICT row_rnd_g =
+        GetInputRow(input_rows, first_c_ + 1, 0);
+    const float* JXL_RESTRICT row_rnd_c =
+        GetInputRow(input_rows, first_c_ + 2, 0);
+    // Needed by the calls to Floor() in StrengthEvalLut. Only arithmetic and
+    // shuffles are otherwise done on the data, so this is safe.
+    msan::UnpoisonMemory(row_x + xsize, (xsize_v - xsize) * sizeof(float));
+    msan::UnpoisonMemory(row_y + xsize, (xsize_v - xsize) * sizeof(float));
+    for (size_t x = 0; x < xsize_v; x += Lanes(d)) {
+      const auto vx = LoadU(d, row_x + x);
+      const auto vy = LoadU(d, row_y + x);
+      const auto in_g = Sub(vy, vx);
+      const auto in_r = Add(vy, vx);
+      const auto noise_strength_g = NoiseStrength(noise_model, Mul(in_g, half));
+      const auto noise_strength_r = NoiseStrength(noise_model, Mul(in_r, half));
+      const auto addit_rnd_noise_red = Mul(LoadU(d, row_rnd_r + x), norm_const);
+      const auto addit_rnd_noise_green =
+          Mul(LoadU(d, row_rnd_g + x), norm_const);
+      const auto addit_rnd_noise_correlated =
+          Mul(LoadU(d, row_rnd_c + x), norm_const);
+      AddNoiseToRGB(D(), addit_rnd_noise_red, addit_rnd_noise_green,
+                    addit_rnd_noise_correlated, noise_strength_g,
+                    noise_strength_r, ytox, ytob, row_x + x, row_y + x,
+                    row_b + x);
+    }
+    msan::PoisonMemory(row_x + xsize, (xsize_v - xsize) * sizeof(float));
+    msan::PoisonMemory(row_y + xsize, (xsize_v - xsize) * sizeof(float));
+    msan::PoisonMemory(row_b + xsize, (xsize_v - xsize) * sizeof(float));
+  }
+
+  RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+    return c >= first_c_ ? RenderPipelineChannelMode::kInput
+           : c < 3       ? RenderPipelineChannelMode::kInPlace
+                         : RenderPipelineChannelMode::kIgnored;
+  }
+
+  const char* GetName() const override { return "AddNoise"; }
+
+ private:
+  const NoiseParams& noise_params_;
+  const ColorCorrelationMap& cmap_;
+  size_t first_c_;
+};
+
+std::unique_ptr<RenderPipelineStage> GetAddNoiseStage(
+    const NoiseParams& noise_params, const ColorCorrelationMap& cmap,
+    size_t noise_c_start) {
+  return jxl::make_unique<AddNoiseStage>(noise_params, cmap, noise_c_start);
+}
+
+class ConvolveNoiseStage : public RenderPipelineStage {
+ public:
+  explicit ConvolveNoiseStage(size_t first_c)
+      : RenderPipelineStage(RenderPipelineStage::Settings::Symmetric(
+            /*shift=*/0, /*border=*/2)),
+        first_c_(first_c) {}
+
+  void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+                  size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+                  size_t thread_id) const final {
+    PROFILER_ZONE("Noise convolve");
+
+    const HWY_FULL(float) d;
+    for (size_t c = first_c_; c < first_c_ + 3; c++) {
+      float* JXL_RESTRICT rows[5];
+      for (size_t i = 0; i < 5; i++) {
+        rows[i] = GetInputRow(input_rows, c, i - 2);
+      }
+      float* JXL_RESTRICT row_out = GetOutputRow(output_rows, c, 0);
+      for (ssize_t x = -RoundUpTo(xextra, Lanes(d));
+           x < (ssize_t)(xsize + xextra); x += Lanes(d)) {
+        const auto p00 = LoadU(d, rows[2] + x);
+        auto others = Zero(d);
+        // TODO(eustas): sum loaded values to reduce the calculation chain
+        for (ssize_t i = -2; i <= 2; i++) {
+          others = Add(others, LoadU(d, rows[0] + x + i));
+          others = Add(others, LoadU(d, rows[1] + x + i));
+          others = Add(others, LoadU(d, rows[3] + x + i));
+          others = Add(others, LoadU(d, rows[4] + x + i));
+        }
+        others = Add(others, LoadU(d, rows[2] + x - 2));
+        others = Add(others, LoadU(d, rows[2] + x - 1));
+        others = Add(others, LoadU(d, rows[2] + x + 1));
+        others = Add(others, LoadU(d, rows[2] + x + 2));
+        // 4 * (1 - box kernel)
+        auto pixels = MulAdd(others, Set(d, 0.16), Mul(p00, Set(d, -3.84)));
+        StoreU(pixels, d, row_out + x);
+      }
+    }
+  }
+
+  RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+    return c >= first_c_ ? RenderPipelineChannelMode::kInOut
+                         : RenderPipelineChannelMode::kIgnored;
+  }
+
+  const char* GetName() const override { return "ConvNoise"; }
+
+ private:
+  size_t first_c_;
+};
+
+std::unique_ptr<RenderPipelineStage> GetConvolveNoiseStage(
+    size_t noise_c_start) {
+  return jxl::make_unique<ConvolveNoiseStage>(noise_c_start);
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+HWY_EXPORT(GetAddNoiseStage);
+HWY_EXPORT(GetConvolveNoiseStage);
+
+std::unique_ptr<RenderPipelineStage> GetAddNoiseStage(
+    const NoiseParams& noise_params, const ColorCorrelationMap& cmap,
+    size_t noise_c_start) {
+  return HWY_DYNAMIC_DISPATCH(GetAddNoiseStage)(noise_params, cmap,
+                                                noise_c_start);
+}
+
+std::unique_ptr<RenderPipelineStage> GetConvolveNoiseStage(
+    size_t noise_c_start) {
+  return HWY_DYNAMIC_DISPATCH(GetConvolveNoiseStage)(noise_c_start);
+}
+
+}  // namespace jxl
+#endif
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_noise.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_noise.h
new file mode 100644
index 0000000000..bd7797f991
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_noise.h
@@ -0,0 +1,32 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_NOISE_H_
+#define LIB_JXL_RENDER_PIPELINE_STAGE_NOISE_H_
+#include <math.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#include <algorithm>
+#include <utility>
+#include <vector>
+
+#include "lib/jxl/dec_noise.h"
+#include "lib/jxl/render_pipeline/render_pipeline_stage.h"
+
+namespace jxl {
+
+// Adds noise to color channels.
+std::unique_ptr<RenderPipelineStage> GetAddNoiseStage(
+    const NoiseParams& noise_params, const ColorCorrelationMap& cmap,
+    size_t noise_c_start);
+
+// Applies a 5x5 subtract-box-filter convolution to the noise input channels.
+std::unique_ptr<RenderPipelineStage> GetConvolveNoiseStage(
+    size_t noise_c_start);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_RENDER_PIPELINE_STAGE_NOISE_H_
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_patches.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_patches.cc
new file mode 100644
index 0000000000..527be03839
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_patches.cc
@@ -0,0 +1,48 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/render_pipeline/stage_patches.h"
+
+namespace jxl {
+namespace {
+class PatchDictionaryStage : public RenderPipelineStage {
+ public:
+  PatchDictionaryStage(const PatchDictionary* patches, size_t num_channels)
+      : RenderPipelineStage(RenderPipelineStage::Settings()),
+        patches_(*patches),
+        num_channels_(num_channels) {}
+
+  void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+                  size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+                  size_t thread_id) const final {
+    PROFILER_ZONE("RenderPatches");
+    JXL_ASSERT(xpos == 0 || xpos >= xextra);
+    size_t x0 = xpos ? xpos - xextra : 0;
+    std::vector<float*> row_ptrs(num_channels_);
+    for (size_t i = 0; i < num_channels_; i++) {
+      row_ptrs[i] = GetInputRow(input_rows, i, 0) + x0 - xpos;
+    }
+    patches_.AddOneRow(row_ptrs.data(), ypos, x0, xsize + xextra + xpos - x0);
+  }
+
+  RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+    return c < num_channels_ ? RenderPipelineChannelMode::kInPlace
+                             : RenderPipelineChannelMode::kIgnored;
+  }
+
+  const char* GetName() const override { return "Patches"; }
+
+ private:
+  const PatchDictionary& patches_;
+  const size_t num_channels_;
+};
+}  // namespace
+
+std::unique_ptr<RenderPipelineStage> GetPatchesStage(
+    const PatchDictionary* patches, size_t num_channels) {
+  return jxl::make_unique<PatchDictionaryStage>(patches, num_channels);
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_patches.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_patches.h
new file mode 100644
index 0000000000..b35abdc2eb
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_patches.h
@@ -0,0 +1,22 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_PATCHES_H_
+#define LIB_JXL_RENDER_PIPELINE_STAGE_PATCHES_H_
+
+#include <utility>
+
+#include "lib/jxl/patch_dictionary_internal.h"
+#include "lib/jxl/render_pipeline/render_pipeline_stage.h"
+
+namespace jxl {
+
+// Draws patches if applicable.
+std::unique_ptr<RenderPipelineStage> GetPatchesStage(
+    const PatchDictionary* patches, size_t num_channels);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_RENDER_PIPELINE_STAGE_PATCHES_H_
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_splines.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_splines.cc
new file mode 100644
index 0000000000..d97d97e5f2
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_splines.cc
@@ -0,0 +1,63 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/render_pipeline/stage_splines.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_splines.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+class SplineStage : public RenderPipelineStage {
+ public:
+  explicit SplineStage(const Splines* splines)
+      : RenderPipelineStage(RenderPipelineStage::Settings()),
+        splines_(*splines) {}
+
+  void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+                  size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+                  size_t thread_id) const final {
+    PROFILER_ZONE("RenderSplines");
+    float* row_x = GetInputRow(input_rows, 0, 0);
+    float* row_y = GetInputRow(input_rows, 1, 0);
+    float* row_b = GetInputRow(input_rows, 2, 0);
+    splines_.AddToRow(row_x, row_y, row_b, Rect(xpos, ypos, xsize, 1));
+  }
+
+  RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+    return c < 3 ? RenderPipelineChannelMode::kInPlace
+                 : RenderPipelineChannelMode::kIgnored;
+  }
+
+  const char* GetName() const override { return "Splines"; }
+
+ private:
+  const Splines& splines_;
+};
+
+std::unique_ptr<RenderPipelineStage> GetSplineStage(const Splines* splines) {
+  return jxl::make_unique<SplineStage>(splines);
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+HWY_EXPORT(GetSplineStage);
+
+std::unique_ptr<RenderPipelineStage> GetSplineStage(const Splines* splines) {
+  return HWY_DYNAMIC_DISPATCH(GetSplineStage)(splines);
+}
+
+}  // namespace jxl
+#endif
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_splines.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_splines.h
new file mode 100644
index 0000000000..363af393ec
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_splines.h
@@ -0,0 +1,21 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_SPLINES_H_
+#define LIB_JXL_RENDER_PIPELINE_STAGE_SPLINES_H_
+
+#include <utility>
+
+#include "lib/jxl/render_pipeline/render_pipeline_stage.h"
+#include "lib/jxl/splines.h"
+
+namespace jxl {
+
+// Draws splines if applicable.
+std::unique_ptr<RenderPipelineStage> GetSplineStage(const Splines* splines);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_RENDER_PIPELINE_STAGE_SPLINES_H_
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_spot.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_spot.cc
new file mode 100644
index 0000000000..d4f6152994
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_spot.cc
@@ -0,0 +1,52 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/render_pipeline/stage_spot.h"
+
+namespace jxl {
+class SpotColorStage : public RenderPipelineStage {
+ public:
+  explicit SpotColorStage(size_t spot_c, const float* spot_color)
+      : RenderPipelineStage(RenderPipelineStage::Settings()),
+        spot_c_(spot_c),
+        spot_color_(spot_color) {
+    JXL_ASSERT(spot_c_ >= 3);
+  }
+
+  void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+                  size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+                  size_t thread_id) const final {
+    // TODO(veluca): add SIMD.
+    PROFILER_ZONE("RenderSpotColors");
+    float scale = spot_color_[3];
+    for (size_t c = 0; c < 3; c++) {
+      float* JXL_RESTRICT p = GetInputRow(input_rows, c, 0);
+      const float* JXL_RESTRICT s = GetInputRow(input_rows, spot_c_, 0);
+      for (ssize_t x = -xextra; x < ssize_t(xsize + xextra); x++) {
+        float mix = scale * s[x];
+        p[x] = mix * spot_color_[c] + (1.0f - mix) * p[x];
+      }
+    }
+  }
+
+  RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+    return c < 3          ? RenderPipelineChannelMode::kInPlace
+           : c == spot_c_ ? RenderPipelineChannelMode::kInput
+                          : RenderPipelineChannelMode::kIgnored;
+  }
+
+  const char* GetName() const override { return "Spot"; }
+
+ private:
+  size_t spot_c_;
+  const float* spot_color_;
+};
+
+std::unique_ptr<RenderPipelineStage> GetSpotColorStage(
+    size_t spot_c, const float* spot_color) {
+  return jxl::make_unique<SpotColorStage>(spot_c, spot_color);
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_spot.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_spot.h
new file mode 100644
index 0000000000..3e79c75823
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_spot.h
@@ -0,0 +1,21 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_SPOT_H_
+#define LIB_JXL_RENDER_PIPELINE_STAGE_SPOT_H_
+
+#include <utility>
+
+#include "lib/jxl/render_pipeline/render_pipeline_stage.h"
+
+namespace jxl {
+
+// Render the spot color channels.
+std::unique_ptr<RenderPipelineStage> GetSpotColorStage(size_t spot_c,
+                                                       const float* spot_color);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_RENDER_PIPELINE_STAGE_SPOT_H_
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_to_linear.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_to_linear.cc
new file mode 100644
index 0000000000..9f5b2b73dc
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_to_linear.cc
@@ -0,0 +1,202 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/render_pipeline/stage_to_linear.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_to_linear.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/dec_tone_mapping-inl.h"
+#include "lib/jxl/sanitizers.h"
+#include "lib/jxl/transfer_functions-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+namespace {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::IfThenZeroElse;
+
+template <typename Op>
+struct PerChannelOp {
+  explicit PerChannelOp(Op op) : op(op) {}
+  template <typename D, typename T>
+  void Transform(D d, T* r, T* g, T* b) const {
+    *r = op.Transform(d, *r);
+    *g = op.Transform(d, *g);
+    *b = op.Transform(d, *b);
+  }
+
+  Op op;
+};
+template <typename Op>
+PerChannelOp<Op> MakePerChannelOp(Op&& op) {
+  return PerChannelOp<Op>(std::forward<Op>(op));
+}
+
+struct OpLinear {
+  template <typename D, typename T>
+  T Transform(D d, const T& encoded) const {
+    return encoded;
+  }
+};
+
+struct OpRgb {
+  template <typename D, typename T>
+  T Transform(D d, const T& encoded) const {
+    return TF_SRGB().DisplayFromEncoded(encoded);
+  }
+};
+
+struct OpPq {
+  template <typename D, typename T>
+  T Transform(D d, const T& encoded) const {
+    return TF_PQ().DisplayFromEncoded(d, encoded);
+  }
+};
+
+struct OpHlg {
+  explicit OpHlg(const float luminances[3], const float intensity_target)
+      : hlg_ootf_(HlgOOTF::FromSceneLight(
+            /*display_luminance=*/intensity_target, luminances)) {}
+
+  template <typename D, typename T>
+  void Transform(D d, T* r, T* g, T* b) const {
+    for (T* val : {r, g, b}) {
+      HWY_ALIGN float vals[MaxLanes(d)];
+      Store(*val, d, vals);
+      for (size_t i = 0; i < Lanes(d); ++i) {
+        vals[i] = TF_HLG().DisplayFromEncoded(vals[i]);
+      }
+      *val = Load(d, vals);
+    }
+    hlg_ootf_.Apply(r, g, b);
+  }
+  HlgOOTF hlg_ootf_;
+};
+
+struct Op709 {
+  template <typename D, typename T>
+  T Transform(D d, const T& encoded) const {
+    return TF_709().DisplayFromEncoded(d, encoded);
+  }
+};
+
+struct OpGamma {
+  const float gamma;
+  template <typename D, typename T>
+  T Transform(D d, const T& encoded) const {
+    return IfThenZeroElse(Le(encoded, Set(d, 1e-5f)),
+                          FastPowf(d, encoded, Set(d, gamma)));
+  }
+};
+
+struct OpInvalid {
+  template <typename D, typename T>
+  void Transform(D d, T* r, T* g, T* b) const {}
+};
+
+template <typename Op>
+class ToLinearStage : public RenderPipelineStage {
+ public:
+  explicit ToLinearStage(Op op)
+      : RenderPipelineStage(RenderPipelineStage::Settings()),
+        op_(std::move(op)) {}
+
+  explicit ToLinearStage()
+      : RenderPipelineStage(RenderPipelineStage::Settings()), valid_(false) {}
+
+  void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+                  size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+                  size_t thread_id) const final {
+    PROFILER_ZONE("ToLinear");
+
+    const HWY_FULL(float) d;
+    const size_t xsize_v = RoundUpTo(xsize, Lanes(d));
+    float* JXL_RESTRICT row0 = GetInputRow(input_rows, 0, 0);
+    float* JXL_RESTRICT row1 = GetInputRow(input_rows, 1, 0);
+    float* JXL_RESTRICT row2 = GetInputRow(input_rows, 2, 0);
+    // All calculations are lane-wise, still some might require
+    // value-dependent behaviour (e.g. NearestInt). Temporary unpoison last
+    // vector tail.
+    msan::UnpoisonMemory(row0 + xsize, sizeof(float) * (xsize_v - xsize));
+    msan::UnpoisonMemory(row1 + xsize, sizeof(float) * (xsize_v - xsize));
+    msan::UnpoisonMemory(row2 + xsize, sizeof(float) * (xsize_v - xsize));
+    for (ssize_t x = -xextra; x < (ssize_t)(xsize + xextra); x += Lanes(d)) {
+      auto r = LoadU(d, row0 + x);
+      auto g = LoadU(d, row1 + x);
+      auto b = LoadU(d, row2 + x);
+      op_.Transform(d, &r, &g, &b);
+      StoreU(r, d, row0 + x);
+      StoreU(g, d, row1 + x);
+      StoreU(b, d, row2 + x);
+    }
+    msan::PoisonMemory(row0 + xsize, sizeof(float) * (xsize_v - xsize));
+    msan::PoisonMemory(row1 + xsize, sizeof(float) * (xsize_v - xsize));
+    msan::PoisonMemory(row2 + xsize, sizeof(float) * (xsize_v - xsize));
+  }
+
+  RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+    return c < 3 ? RenderPipelineChannelMode::kInPlace
+                 : RenderPipelineChannelMode::kIgnored;
+  }
+
+  const char* GetName() const override { return "ToLinear"; }
+
+ private:
+  Status IsInitialized() const override { return valid_; }
+
+  Op op_;
+  bool valid_ = true;
+};
+
+template <typename Op>
+std::unique_ptr<ToLinearStage<Op>> MakeToLinearStage(Op&& op) {
+  return jxl::make_unique<ToLinearStage<Op>>(std::forward<Op>(op));
+}
+
+std::unique_ptr<RenderPipelineStage> GetToLinearStage(
+    const OutputEncodingInfo& output_encoding_info) {
+  if (output_encoding_info.color_encoding.tf.IsLinear()) {
+    return MakeToLinearStage(MakePerChannelOp(OpLinear()));
+  } else if (output_encoding_info.color_encoding.tf.IsSRGB()) {
+    return MakeToLinearStage(MakePerChannelOp(OpRgb()));
+  } else if (output_encoding_info.color_encoding.tf.IsPQ()) {
+    return MakeToLinearStage(MakePerChannelOp(OpPq()));
+  } else if (output_encoding_info.color_encoding.tf.IsHLG()) {
+    return MakeToLinearStage(OpHlg(output_encoding_info.luminances,
+                                   output_encoding_info.orig_intensity_target));
+  } else if (output_encoding_info.color_encoding.tf.Is709()) {
+    return MakeToLinearStage(MakePerChannelOp(Op709()));
+  } else if (output_encoding_info.color_encoding.tf.IsGamma() ||
+             output_encoding_info.color_encoding.tf.IsDCI()) {
+    return MakeToLinearStage(
+        MakePerChannelOp(OpGamma{1.f / output_encoding_info.inverse_gamma}));
+  } else {
+    return jxl::make_unique<ToLinearStage<OpInvalid>>();
+  }
+}
+
+}  // namespace
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+HWY_EXPORT(GetToLinearStage);
+
+std::unique_ptr<RenderPipelineStage> GetToLinearStage(
+    const OutputEncodingInfo& output_encoding_info) {
+  return HWY_DYNAMIC_DISPATCH(GetToLinearStage)(output_encoding_info);
+}
+
+}  // namespace jxl
+#endif
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_to_linear.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_to_linear.h
new file mode 100644
index 0000000000..ccee7b09f0
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_to_linear.h
@@ -0,0 +1,21 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_TO_LINEAR_H_
+#define LIB_JXL_RENDER_PIPELINE_STAGE_TO_LINEAR_H_
+
+#include "lib/jxl/dec_xyb.h"
+#include "lib/jxl/render_pipeline/render_pipeline_stage.h"
+
+namespace jxl {
+
+// Converts the color channels from `output_encoding_info.color_encoding` to
+// linear.
+std::unique_ptr<RenderPipelineStage> GetToLinearStage(
+    const OutputEncodingInfo& output_encoding_info);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_RENDER_PIPELINE_STAGE_TO_LINEAR_H_
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_tone_mapping.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_tone_mapping.cc
new file mode 100644
index 0000000000..7609534a5b
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_tone_mapping.cc
@@ -0,0 +1,151 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/render_pipeline/stage_tone_mapping.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_tone_mapping.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/dec_tone_mapping-inl.h"
+#include "lib/jxl/dec_xyb-inl.h"
+#include "lib/jxl/sanitizers.h"
+#include "lib/jxl/transfer_functions-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+class ToneMappingStage : public RenderPipelineStage {
+ public:
+  explicit ToneMappingStage(OutputEncodingInfo output_encoding_info)
+      : RenderPipelineStage(RenderPipelineStage::Settings()),
+        output_encoding_info_(std::move(output_encoding_info)) {
+    if (output_encoding_info_.desired_intensity_target ==
+        output_encoding_info_.orig_intensity_target) {
+      // No tone mapping requested.
+      return;
+    }
+    if (output_encoding_info_.orig_color_encoding.tf.IsPQ() &&
+        output_encoding_info_.desired_intensity_target <
+            output_encoding_info_.orig_intensity_target) {
+      tone_mapper_ = jxl::make_unique<ToneMapper>(
+          /*source_range=*/std::pair<float, float>(
+              0, output_encoding_info_.orig_intensity_target),
+          /*target_range=*/
+          std::pair<float, float>(
+              0, output_encoding_info_.desired_intensity_target),
+          output_encoding_info_.luminances);
+    } else if (output_encoding_info_.orig_color_encoding.tf.IsHLG() &&
+               !output_encoding_info_.color_encoding.tf.IsHLG()) {
+      hlg_ootf_ = jxl::make_unique<HlgOOTF>(
+          /*source_luminance=*/output_encoding_info_.orig_intensity_target,
+          /*target_luminance=*/output_encoding_info_.desired_intensity_target,
+          output_encoding_info_.luminances);
+    }
+
+    if (output_encoding_info_.color_encoding.tf.IsPQ() &&
+        (tone_mapper_ || hlg_ootf_)) {
+      to_intensity_target_ =
+          10000.f / output_encoding_info_.orig_intensity_target;
+      from_desired_intensity_target_ =
+          output_encoding_info_.desired_intensity_target / 10000.f;
+    }
+  }
+
+  bool IsNeeded() const { return tone_mapper_ || hlg_ootf_; }
+
+  void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+                  size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+                  size_t thread_id) const final {
+    PROFILER_ZONE("ToneMapping");
+
+    if (!(tone_mapper_ || hlg_ootf_)) return;
+
+    const HWY_FULL(float) d;
+    const size_t xsize_v = RoundUpTo(xsize, Lanes(d));
+    float* JXL_RESTRICT row0 = GetInputRow(input_rows, 0, 0);
+    float* JXL_RESTRICT row1 = GetInputRow(input_rows, 1, 0);
+    float* JXL_RESTRICT row2 = GetInputRow(input_rows, 2, 0);
+    // All calculations are lane-wise, still some might require
+    // value-dependent behaviour (e.g. NearestInt). Temporary unpoison last
+    // vector tail.
+    msan::UnpoisonMemory(row0 + xsize, sizeof(float) * (xsize_v - xsize));
+    msan::UnpoisonMemory(row1 + xsize, sizeof(float) * (xsize_v - xsize));
+    msan::UnpoisonMemory(row2 + xsize, sizeof(float) * (xsize_v - xsize));
+    for (ssize_t x = -xextra; x < (ssize_t)(xsize + xextra); x += Lanes(d)) {
+      auto r = LoadU(d, row0 + x);
+      auto g = LoadU(d, row1 + x);
+      auto b = LoadU(d, row2 + x);
+      if (tone_mapper_ || hlg_ootf_) {
+        r = Mul(r, Set(d, to_intensity_target_));
+        g = Mul(g, Set(d, to_intensity_target_));
+        b = Mul(b, Set(d, to_intensity_target_));
+        if (tone_mapper_) {
+          tone_mapper_->ToneMap(&r, &g, &b);
+        } else {
+          JXL_ASSERT(hlg_ootf_);
+          hlg_ootf_->Apply(&r, &g, &b);
+        }
+        if (tone_mapper_ || hlg_ootf_->WarrantsGamutMapping()) {
+          GamutMap(&r, &g, &b, output_encoding_info_.luminances);
+        }
+        r = Mul(r, Set(d, from_desired_intensity_target_));
+        g = Mul(g, Set(d, from_desired_intensity_target_));
+        b = Mul(b, Set(d, from_desired_intensity_target_));
+      }
+      StoreU(r, d, row0 + x);
+      StoreU(g, d, row1 + x);
+      StoreU(b, d, row2 + x);
+    }
+    msan::PoisonMemory(row0 + xsize, sizeof(float) * (xsize_v - xsize));
+    msan::PoisonMemory(row1 + xsize, sizeof(float) * (xsize_v - xsize));
+    msan::PoisonMemory(row2 + xsize, sizeof(float) * (xsize_v - xsize));
+  }
+
+  RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+    return c < 3 ? RenderPipelineChannelMode::kInPlace
+                 : RenderPipelineChannelMode::kIgnored;
+  }
+
+  const char* GetName() const override { return "ToneMapping"; }
+
+ private:
+  using ToneMapper = Rec2408ToneMapper<HWY_FULL(float)>;
+  OutputEncodingInfo output_encoding_info_;
+  std::unique_ptr<ToneMapper> tone_mapper_;
+  std::unique_ptr<HlgOOTF> hlg_ootf_;
+  // When the target colorspace is PQ, 1 represents 10000 nits instead of
+  // orig_intensity_target. This temporarily changes this if the tone mappers
+  // require it.
+  float to_intensity_target_ = 1.f;
+  float from_desired_intensity_target_ = 1.f;
+};
+
+std::unique_ptr<RenderPipelineStage> GetToneMappingStage(
+    const OutputEncodingInfo& output_encoding_info) {
+  auto stage = jxl::make_unique<ToneMappingStage>(output_encoding_info);
+  if (!stage->IsNeeded()) return nullptr;
+  return stage;
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+HWY_EXPORT(GetToneMappingStage);
+
+std::unique_ptr<RenderPipelineStage> GetToneMappingStage(
+    const OutputEncodingInfo& output_encoding_info) {
+  return HWY_DYNAMIC_DISPATCH(GetToneMappingStage)(output_encoding_info);
+}
+
+}  // namespace jxl
+#endif
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_tone_mapping.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_tone_mapping.h
new file mode 100644
index 0000000000..99824f8511
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_tone_mapping.h
@@ -0,0 +1,37 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_TONE_MAPPING_H_
+#define LIB_JXL_RENDER_PIPELINE_STAGE_TONE_MAPPING_H_
+#include <math.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#include <algorithm>
+#include <utility>
+#include <vector>
+
+#include "lib/jxl/dec_xyb.h"
+#include "lib/jxl/render_pipeline/render_pipeline_stage.h"
+
+namespace jxl {
+
+// Tone maps the image if appropriate. It must be in linear space and
+// `output_encoding_info.luminances` must contain the luminance for the
+// primaries of that space. It must also be encoded such that (1, 1, 1)
+// represents `output_encoding_info.orig_intensity_target` nits, unless
+// `output_encoding_info.color_encoding.tf.IsPQ()`, in which case (1, 1, 1) must
+// represent 10000 nits. This corresponds to what XYBStage outputs. After this
+// stage, (1, 1, 1) will represent
+// `output_encoding_info.desired_intensity_target` nits, except in the PQ
+// special case in which it remains 10000.
+//
+// If no tone mapping is necessary, this will return nullptr.
+std::unique_ptr<RenderPipelineStage> GetToneMappingStage(
+    const OutputEncodingInfo& output_encoding_info);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_RENDER_PIPELINE_STAGE_TONE_MAPPING_H_
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_upsampling.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_upsampling.cc
new file mode 100644
index 0000000000..a75e259865
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_upsampling.cc
@@ -0,0 +1,187 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/render_pipeline/stage_upsampling.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_upsampling.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/sanitizers.h"
+#include "lib/jxl/simd_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Clamp;
+using hwy::HWY_NAMESPACE::Max;
+using hwy::HWY_NAMESPACE::Min;
+using hwy::HWY_NAMESPACE::MulAdd;
+
+class UpsamplingStage : public RenderPipelineStage {
+ public:
+  explicit UpsamplingStage(const CustomTransformData& ups_factors, size_t c,
+                           size_t shift)
+      : RenderPipelineStage(RenderPipelineStage::Settings::Symmetric(
+            /*shift=*/shift, /*border=*/2)),
+        c_(c) {
+    const float* weights = shift == 1   ? ups_factors.upsampling2_weights
+                           : shift == 2 ? ups_factors.upsampling4_weights
+                                        : ups_factors.upsampling8_weights;
+    size_t N = 1 << (shift - 1);
+    for (size_t i = 0; i < 5 * N; i++) {
+      for (size_t j = 0; j < 5 * N; j++) {
+        size_t y = std::min(i, j);
+        size_t x = std::max(i, j);
+        kernel_[j / 5][i / 5][j % 5][i % 5] =
+            weights[5 * N * y - y * (y - 1) / 2 + x - y];
+      }
+    }
+  }
+
+  void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+                  size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+                  size_t thread_id) const final {
+    PROFILER_ZONE("Upsampling");
+    static HWY_FULL(float) df;
+    size_t shift = settings_.shift_x;
+    size_t N = 1 << shift;
+    const size_t xsize_v = RoundUpTo(xsize, Lanes(df));
+    for (ssize_t iy = -2; iy <= 2; iy++) {
+      msan::UnpoisonMemory(GetInputRow(input_rows, c_, iy) + xsize + 2,
+                           sizeof(float) * (xsize_v - xsize));
+    }
+    JXL_ASSERT(xextra == 0);
+    ssize_t x0 = 0;
+    ssize_t x1 = xsize;
+    if (N == 2) {
+      ProcessRowImpl<2>(input_rows, output_rows, x0, x1);
+    }
+    if (N == 4) {
+      ProcessRowImpl<4>(input_rows, output_rows, x0, x1);
+    }
+    if (N == 8) {
+      ProcessRowImpl<8>(input_rows, output_rows, x0, x1);
+    }
+    for (size_t oy = 0; oy < N; oy++) {
+      float* dst_row = GetOutputRow(output_rows, c_, oy);
+      msan::PoisonMemory(dst_row + xsize * N,
+                         sizeof(float) * (xsize_v - xsize) * N);
+    }
+  }
+
+  RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+    return c == c_ ? RenderPipelineChannelMode::kInOut
+                   : RenderPipelineChannelMode::kIgnored;
+  }
+
+  const char* GetName() const override { return "Upsample"; }
+
+ private:
+  template <size_t N>
+  JXL_INLINE float Kernel(size_t x, size_t y, ssize_t ix, ssize_t iy) const {
+    ix += 2;
+    iy += 2;
+    if (N == 2) {
+      return kernel_[0][0][y % 2 ? 4 - iy : iy][x % 2 ? 4 - ix : ix];
+    }
+    if (N == 4) {
+      return kernel_[y % 4 < 2 ? y % 2 : 1 - y % 2]
+                    [x % 4 < 2 ? x % 2 : 1 - x % 2][y % 4 < 2 ? iy : 4 - iy]
+                    [x % 4 < 2 ? ix : 4 - ix];
+    }
+    if (N == 8) {
+      return kernel_[y % 8 < 4 ? y % 4 : 3 - y % 4]
+                    [x % 8 < 4 ? x % 4 : 3 - x % 4][y % 8 < 4 ? iy : 4 - iy]
+                    [x % 8 < 4 ? ix : 4 - ix];
+    }
+    JXL_ABORT("Invalid upsample");
+  }
+
+  template <ssize_t N>
+  void ProcessRowImpl(const RowInfo& input_rows, const RowInfo& output_rows,
+                      ssize_t x0, ssize_t x1) const {
+    static HWY_FULL(float) df;
+    using V = hwy::HWY_NAMESPACE::Vec<HWY_FULL(float)>;
+    V ups0, ups1, ups2, ups3, ups4, ups5, ups6, ups7;
+    (void)ups2, (void)ups3, (void)ups4, (void)ups5, (void)ups6, (void)ups7;
+    V* ups[N];
+    if (N >= 2) {
+      ups[0] = &ups0;
+      ups[1] = &ups1;
+    }
+    if (N >= 4) {
+      ups[2] = &ups2;
+      ups[3] = &ups3;
+    }
+    if (N == 8) {
+      ups[4] = &ups4;
+      ups[5] = &ups5;
+      ups[6] = &ups6;
+      ups[7] = &ups7;
+    }
+    for (size_t oy = 0; oy < N; oy++) {
+      float* dst_row = GetOutputRow(output_rows, c_, oy);
+      for (ssize_t x = x0; x < x1; x += Lanes(df)) {
+        for (size_t ox = 0; ox < N; ox++) {
+          auto result = Zero(df);
+          auto min = LoadU(df, GetInputRow(input_rows, c_, 0) + x);
+          auto max = min;
+          for (ssize_t iy = -2; iy <= 2; iy++) {
+            for (ssize_t ix = -2; ix <= 2; ix++) {
+              auto v = LoadU(df, GetInputRow(input_rows, c_, iy) + x + ix);
+              result = MulAdd(Set(df, Kernel<N>(ox, oy, ix, iy)), v, result);
+              min = Min(v, min);
+              max = Max(v, max);
+            }
+          }
+          // Avoid overshooting.
+          *ups[ox] = Clamp(result, min, max);
+        }
+        if (N == 2) {
+          StoreInterleaved(df, ups0, ups1, dst_row + x * N);
+        }
+        if (N == 4) {
+          StoreInterleaved(df, ups0, ups1, ups2, ups3, dst_row + x * N);
+        }
+        if (N == 8) {
+          StoreInterleaved(df, ups0, ups1, ups2, ups3, ups4, ups5, ups6, ups7,
+                           dst_row + x * N);
+        }
+      }
+    }
+  }
+
+  size_t c_;
+  float kernel_[4][4][5][5];
+};
+
+std::unique_ptr<RenderPipelineStage> GetUpsamplingStage(
+    const CustomTransformData& ups_factors, size_t c, size_t shift) {
+  return jxl::make_unique<UpsamplingStage>(ups_factors, c, shift);
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+HWY_EXPORT(GetUpsamplingStage);
+
+std::unique_ptr<RenderPipelineStage> GetUpsamplingStage(
+    const CustomTransformData& ups_factors, size_t c, size_t shift) {
+  JXL_ASSERT(shift != 0);
+  JXL_ASSERT(shift <= 3);
+  return HWY_DYNAMIC_DISPATCH(GetUpsamplingStage)(ups_factors, c, shift);
+}
+
+}  // namespace jxl
+#endif
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_upsampling.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_upsampling.h
new file mode 100644
index 0000000000..7d5defd23c
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_upsampling.h
@@ -0,0 +1,26 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_UPSAMPLING_H_
+#define LIB_JXL_RENDER_PIPELINE_STAGE_UPSAMPLING_H_
+#include <math.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#include <algorithm>
+#include <utility>
+#include <vector>
+
+#include "lib/jxl/image_metadata.h"
+#include "lib/jxl/render_pipeline/render_pipeline_stage.h"
+
+namespace jxl {
+
+// Upsamples the given channel by the given factor.
+std::unique_ptr<RenderPipelineStage> GetUpsamplingStage(
+    const CustomTransformData& ups_factors, size_t c, size_t shift);
+}  // namespace jxl
+
+#endif  // LIB_JXL_RENDER_PIPELINE_STAGE_UPSAMPLING_H_
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_write.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_write.cc
new file mode 100644
index 0000000000..902fc33b7e
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_write.cc
@@ -0,0 +1,601 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/render_pipeline/stage_write.h"
+
+#include "lib/jxl/alpha.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/dec_cache.h"
+#include "lib/jxl/image_bundle.h"
+#include "lib/jxl/sanitizers.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_write.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Clamp;
+using hwy::HWY_NAMESPACE::Div;
+using hwy::HWY_NAMESPACE::Max;
+using hwy::HWY_NAMESPACE::Mul;
+using hwy::HWY_NAMESPACE::NearestInt;
+using hwy::HWY_NAMESPACE::Or;
+using hwy::HWY_NAMESPACE::Rebind;
+using hwy::HWY_NAMESPACE::ShiftLeftSame;
+using hwy::HWY_NAMESPACE::ShiftRightSame;
+
+class WriteToOutputStage : public RenderPipelineStage {
+ public:
+  WriteToOutputStage(const ImageOutput& main_output, size_t width,
+                     size_t height, bool has_alpha, bool unpremul_alpha,
+                     size_t alpha_c, Orientation undo_orientation,
+                     const std::vector<ImageOutput>& extra_output)
+      : RenderPipelineStage(RenderPipelineStage::Settings()),
+        width_(width),
+        height_(height),
+        main_(main_output),
+        num_color_(main_.num_channels_ < 3 ? 1 : 3),
+        want_alpha_(main_.num_channels_ == 2 || main_.num_channels_ == 4),
+        has_alpha_(has_alpha),
+        unpremul_alpha_(unpremul_alpha),
+        alpha_c_(alpha_c),
+        flip_x_(ShouldFlipX(undo_orientation)),
+        flip_y_(ShouldFlipY(undo_orientation)),
+        transpose_(ShouldTranspose(undo_orientation)),
+        opaque_alpha_(kMaxPixelsPerCall, 1.0f) {
+    for (size_t ec = 0; ec < extra_output.size(); ++ec) {
+      if (extra_output[ec].callback.IsPresent() || extra_output[ec].buffer) {
+        Output extra(extra_output[ec]);
+        extra.channel_index_ = 3 + ec;
+        extra_channels_.push_back(extra);
+      }
+    }
+  }
+
+  WriteToOutputStage(const WriteToOutputStage&) = delete;
+  WriteToOutputStage& operator=(const WriteToOutputStage&) = delete;
+  WriteToOutputStage(WriteToOutputStage&&) = delete;
+  WriteToOutputStage& operator=(WriteToOutputStage&&) = delete;
+
+  ~WriteToOutputStage() override {
+    if (main_.run_opaque_) {
+      main_.pixel_callback_.destroy(main_.run_opaque_);
+    }
+    for (auto& extra : extra_channels_) {
+      if (extra.run_opaque_) {
+        extra.pixel_callback_.destroy(extra.run_opaque_);
+      }
+    }
+  }
+
+  void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+                  size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+                  size_t thread_id) const final {
+    JXL_DASSERT(xextra == 0);
+    JXL_DASSERT(main_.run_opaque_ || main_.buffer_);
+    if (ypos >= height_) return;
+    if (xpos >= width_) return;
+    if (flip_y_) {
+      ypos = height_ - 1u - ypos;
+    }
+    size_t limit = std::min(xsize, width_ - xpos);
+    for (size_t x0 = 0; x0 < limit; x0 += kMaxPixelsPerCall) {
+      size_t xstart = xpos + x0;
+      size_t len = std::min<size_t>(kMaxPixelsPerCall, limit - x0);
+
+      const float* line_buffers[4];
+      for (size_t c = 0; c < num_color_; c++) {
+        line_buffers[c] = GetInputRow(input_rows, c, 0) + x0;
+      }
+      if (has_alpha_) {
+        line_buffers[num_color_] = GetInputRow(input_rows, alpha_c_, 0) + x0;
+      } else {
+        // opaque_alpha_ is a way to set all values to 1.0f.
+        line_buffers[num_color_] = opaque_alpha_.data();
+      }
+      if (has_alpha_ && want_alpha_ && unpremul_alpha_) {
+        UnpremulAlpha(thread_id, len, line_buffers);
+      }
+      OutputBuffers(main_, thread_id, ypos, xstart, len, line_buffers);
+      for (const auto& extra : extra_channels_) {
+        line_buffers[0] = GetInputRow(input_rows, extra.channel_index_, 0) + x0;
+        OutputBuffers(extra, thread_id, ypos, xstart, len, line_buffers);
+      }
+    }
+  }
+
+  RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+    if (c < num_color_ || (has_alpha_ && c == alpha_c_)) {
+      return RenderPipelineChannelMode::kInput;
+    }
+    for (const auto& extra : extra_channels_) {
+      if (c == extra.channel_index_) {
+        return RenderPipelineChannelMode::kInput;
+      }
+    }
+    return RenderPipelineChannelMode::kIgnored;
+  }
+
+  const char* GetName() const override { return "WritePixelCB"; }
+
+ private:
+  struct Output {
+    Output(const ImageOutput& image_out)
+        : pixel_callback_(image_out.callback),
+          buffer_(image_out.buffer),
+          buffer_size_(image_out.buffer_size),
+          stride_(image_out.stride),
+          num_channels_(image_out.format.num_channels),
+          swap_endianness_(SwapEndianness(image_out.format.endianness)),
+          data_type_(image_out.format.data_type),
+          bits_per_sample_(image_out.bits_per_sample) {}
+
+    Status PrepareForThreads(size_t num_threads) {
+      if (pixel_callback_.IsPresent()) {
+        run_opaque_ =
+            pixel_callback_.Init(num_threads, /*num_pixels=*/kMaxPixelsPerCall);
+        JXL_RETURN_IF_ERROR(run_opaque_ != nullptr);
+      } else {
+        JXL_RETURN_IF_ERROR(buffer_ != nullptr);
+      }
+      return true;
+    }
+
+    PixelCallback pixel_callback_;
+    void* run_opaque_ = nullptr;
+    void* buffer_ = nullptr;
+    size_t buffer_size_;
+    size_t stride_;
+    size_t num_channels_;
+    bool swap_endianness_;
+    JxlDataType data_type_;
+    size_t bits_per_sample_;
+    size_t channel_index_;  // used for extra_channels
+  };
+
+  Status PrepareForThreads(size_t num_threads) override {
+    JXL_RETURN_IF_ERROR(main_.PrepareForThreads(num_threads));
+    for (auto& extra : extra_channels_) {
+      JXL_RETURN_IF_ERROR(extra.PrepareForThreads(num_threads));
+    }
+    temp_out_.resize(num_threads);
+    for (CacheAlignedUniquePtr& temp : temp_out_) {
+      temp = AllocateArray(sizeof(float) * kMaxPixelsPerCall *
+                           main_.num_channels_);
+    }
+    if ((has_alpha_ && want_alpha_ && unpremul_alpha_) || flip_x_) {
+      temp_in_.resize(num_threads * main_.num_channels_);
+      for (CacheAlignedUniquePtr& temp : temp_in_) {
+        temp = AllocateArray(sizeof(float) * kMaxPixelsPerCall);
+      }
+    }
+    return true;
+  }
+  static bool ShouldFlipX(Orientation undo_orientation) {
+    return (undo_orientation == Orientation::kFlipHorizontal ||
+            undo_orientation == Orientation::kRotate180 ||
+            undo_orientation == Orientation::kRotate270 ||
+            undo_orientation == Orientation::kAntiTranspose);
+  }
+  static bool ShouldFlipY(Orientation undo_orientation) {
+    return (undo_orientation == Orientation::kFlipVertical ||
+            undo_orientation == Orientation::kRotate180 ||
+            undo_orientation == Orientation::kRotate90 ||
+            undo_orientation == Orientation::kAntiTranspose);
+  }
+  static bool ShouldTranspose(Orientation undo_orientation) {
+    return (undo_orientation == Orientation::kTranspose ||
+            undo_orientation == Orientation::kRotate90 ||
+            undo_orientation == Orientation::kRotate270 ||
+            undo_orientation == Orientation::kAntiTranspose);
+  }
+
+  void UnpremulAlpha(size_t thread_id, size_t len,
+                     const float** line_buffers) const {
+    const HWY_FULL(float) d;
+    auto one = Set(d, 1.0f);
+    float* temp_in[4];
+    for (size_t c = 0; c < main_.num_channels_; ++c) {
+      size_t tix = thread_id * main_.num_channels_ + c;
+      temp_in[c] = reinterpret_cast<float*>(temp_in_[tix].get());
+      memcpy(temp_in[c], line_buffers[c], sizeof(float) * len);
+    }
+    auto small_alpha = Set(d, kSmallAlpha);
+    for (size_t ix = 0; ix < len; ix += Lanes(d)) {
+      auto alpha = LoadU(d, temp_in[num_color_] + ix);
+      auto mul = Div(one, Max(small_alpha, alpha));
+      for (size_t c = 0; c < num_color_; ++c) {
+        auto val = LoadU(d, temp_in[c] + ix);
+        StoreU(Mul(val, mul), d, temp_in[c] + ix);
+      }
+    }
+    for (size_t c = 0; c < main_.num_channels_; ++c) {
+      line_buffers[c] = temp_in[c];
+    }
+  }
+
+  void OutputBuffers(const Output& out, size_t thread_id, size_t ypos,
+                     size_t xstart, size_t len, const float* input[4]) const {
+    if (flip_x_) {
+      FlipX(out, thread_id, len, &xstart, input);
+    }
+    if (out.data_type_ == JXL_TYPE_UINT8) {
+      uint8_t* JXL_RESTRICT temp =
+          reinterpret_cast<uint8_t*>(temp_out_[thread_id].get());
+      StoreUnsignedRow(out, input, len, temp);
+      WriteToOutput(out, thread_id, ypos, xstart, len, temp);
+    } else if (out.data_type_ == JXL_TYPE_UINT16 ||
+               out.data_type_ == JXL_TYPE_FLOAT16) {
+      uint16_t* JXL_RESTRICT temp =
+          reinterpret_cast<uint16_t*>(temp_out_[thread_id].get());
+      if (out.data_type_ == JXL_TYPE_UINT16) {
+        StoreUnsignedRow(out, input, len, temp);
+      } else {
+        StoreFloat16Row(out, input, len, temp);
+      }
+      if (out.swap_endianness_) {
+        const HWY_FULL(uint16_t) du;
+        size_t output_len = len * out.num_channels_;
+        for (size_t j = 0; j < output_len; j += Lanes(du)) {
+          auto v = LoadU(du, temp + j);
+          auto vswap = Or(ShiftRightSame(v, 8), ShiftLeftSame(v, 8));
+          StoreU(vswap, du, temp + j);
+        }
+      }
+      WriteToOutput(out, thread_id, ypos, xstart, len, temp);
+    } else if (out.data_type_ == JXL_TYPE_FLOAT) {
+      float* JXL_RESTRICT temp =
+          reinterpret_cast<float*>(temp_out_[thread_id].get());
+      StoreFloatRow(out, input, len, temp);
+      if (out.swap_endianness_) {
+        size_t output_len = len * out.num_channels_;
+        for (size_t j = 0; j < output_len; ++j) {
+          temp[j] = BSwapFloat(temp[j]);
+        }
+      }
+      WriteToOutput(out, thread_id, ypos, xstart, len, temp);
+    }
+  }
+
+  void FlipX(const Output& out, size_t thread_id, size_t len, size_t* xstart,
+             const float** line_buffers) const {
+    float* temp_in[4];
+    for (size_t c = 0; c < out.num_channels_; ++c) {
+      size_t tix = thread_id * main_.num_channels_ + c;
+      temp_in[c] = reinterpret_cast<float*>(temp_in_[tix].get());
+      if (temp_in[c] != line_buffers[c]) {
+        memcpy(temp_in[c], line_buffers[c], sizeof(float) * len);
+      }
+    }
+    size_t last = (len - 1u);
+    size_t num = (len / 2);
+    for (size_t i = 0; i < num; ++i) {
+      for (size_t c = 0; c < out.num_channels_; ++c) {
+        std::swap(temp_in[c][i], temp_in[c][last - i]);
+      }
+    }
+    for (size_t c = 0; c < out.num_channels_; ++c) {
+      line_buffers[c] = temp_in[c];
+    }
+    *xstart = width_ - *xstart - len;
+  }
+
+  template <typename T>
+  void StoreUnsignedRow(const Output& out, const float* input[4], size_t len,
+                        T* output) const {
+    const HWY_FULL(float) d;
+    auto zero = Zero(d);
+    auto one = Set(d, 1.0f);
+    auto mul = Set(d, (1u << (out.bits_per_sample_)) - 1);
+    const Rebind<T, decltype(d)> du;
+    const size_t padding = RoundUpTo(len, Lanes(d)) - len;
+    for (size_t c = 0; c < out.num_channels_; ++c) {
+      msan::UnpoisonMemory(input[c] + len, sizeof(input[c][0]) * padding);
+    }
+    if (out.num_channels_ == 1) {
+      for (size_t i = 0; i < len; i += Lanes(d)) {
+        auto v0 = Mul(Clamp(zero, LoadU(d, &input[0][i]), one), mul);
+        StoreU(DemoteTo(du, NearestInt(v0)), du, &output[i]);
+      }
+    } else if (out.num_channels_ == 2) {
+      for (size_t i = 0; i < len; i += Lanes(d)) {
+        auto v0 = Mul(Clamp(zero, LoadU(d, &input[0][i]), one), mul);
+        auto v1 = Mul(Clamp(zero, LoadU(d, &input[1][i]), one), mul);
+        StoreInterleaved2(DemoteTo(du, NearestInt(v0)),
+                          DemoteTo(du, NearestInt(v1)), du, &output[2 * i]);
+      }
+    } else if (out.num_channels_ == 3) {
+      for (size_t i = 0; i < len; i += Lanes(d)) {
+        auto v0 = Mul(Clamp(zero, LoadU(d, &input[0][i]), one), mul);
+        auto v1 = Mul(Clamp(zero, LoadU(d, &input[1][i]), one), mul);
+        auto v2 = Mul(Clamp(zero, LoadU(d, &input[2][i]), one), mul);
+        StoreInterleaved3(DemoteTo(du, NearestInt(v0)),
+                          DemoteTo(du, NearestInt(v1)),
+                          DemoteTo(du, NearestInt(v2)), du, &output[3 * i]);
+      }
+    } else if (out.num_channels_ == 4) {
+      for (size_t i = 0; i < len; i += Lanes(d)) {
+        auto v0 = Mul(Clamp(zero, LoadU(d, &input[0][i]), one), mul);
+        auto v1 = Mul(Clamp(zero, LoadU(d, &input[1][i]), one), mul);
+        auto v2 = Mul(Clamp(zero, LoadU(d, &input[2][i]), one), mul);
+        auto v3 = Mul(Clamp(zero, LoadU(d, &input[3][i]), one), mul);
+        StoreInterleaved4(DemoteTo(du, NearestInt(v0)),
+                          DemoteTo(du, NearestInt(v1)),
+                          DemoteTo(du, NearestInt(v2)),
+                          DemoteTo(du, NearestInt(v3)), du, &output[4 * i]);
+      }
+    }
+    msan::PoisonMemory(output + out.num_channels_ * len,
+                       sizeof(output[0]) * out.num_channels_ * padding);
+  }
+
+  void StoreFloat16Row(const Output& out, const float* input[4], size_t len,
+                       uint16_t* output) const {
+    const HWY_FULL(float) d;
+    const Rebind<uint16_t, decltype(d)> du;
+    const Rebind<hwy::float16_t, decltype(d)> df16;
+    const size_t padding = RoundUpTo(len, Lanes(d)) - len;
+    for (size_t c = 0; c < out.num_channels_; ++c) {
+      msan::UnpoisonMemory(input[c] + len, sizeof(input[c][0]) * padding);
+    }
+    if (out.num_channels_ == 1) {
+      for (size_t i = 0; i < len; i += Lanes(d)) {
+        auto v0 = LoadU(d, &input[0][i]);
+        StoreU(BitCast(du, DemoteTo(df16, v0)), du, &output[i]);
+      }
+    } else if (out.num_channels_ == 2) {
+      for (size_t i = 0; i < len; i += Lanes(d)) {
+        auto v0 = LoadU(d, &input[0][i]);
+        auto v1 = LoadU(d, &input[1][i]);
+        StoreInterleaved2(BitCast(du, DemoteTo(df16, v0)),
+                          BitCast(du, DemoteTo(df16, v1)), du, &output[2 * i]);
+      }
+    } else if (out.num_channels_ == 3) {
+      for (size_t i = 0; i < len; i += Lanes(d)) {
+        auto v0 = LoadU(d, &input[0][i]);
+        auto v1 = LoadU(d, &input[1][i]);
+        auto v2 = LoadU(d, &input[2][i]);
+        StoreInterleaved3(BitCast(du, DemoteTo(df16, v0)),
+                          BitCast(du, DemoteTo(df16, v1)),
+                          BitCast(du, DemoteTo(df16, v2)), du, &output[3 * i]);
+      }
+    } else if (out.num_channels_ == 4) {
+      for (size_t i = 0; i < len; i += Lanes(d)) {
+        auto v0 = LoadU(d, &input[0][i]);
+        auto v1 = LoadU(d, &input[1][i]);
+        auto v2 = LoadU(d, &input[2][i]);
+        auto v3 = LoadU(d, &input[3][i]);
+        StoreInterleaved4(BitCast(du, DemoteTo(df16, v0)),
+                          BitCast(du, DemoteTo(df16, v1)),
+                          BitCast(du, DemoteTo(df16, v2)),
+                          BitCast(du, DemoteTo(df16, v3)), du, &output[4 * i]);
+      }
+    }
+    msan::PoisonMemory(output + out.num_channels_ * len,
+                       sizeof(output[0]) * out.num_channels_ * padding);
+  }
+
+  void StoreFloatRow(const Output& out, const float* input[4], size_t len,
+                     float* output) const {
+    const HWY_FULL(float) d;
+    if (out.num_channels_ == 1) {
+      memcpy(output, input[0], len * sizeof(output[0]));
+    } else if (out.num_channels_ == 2) {
+      for (size_t i = 0; i < len; i += Lanes(d)) {
+        StoreInterleaved2(LoadU(d, &input[0][i]), LoadU(d, &input[1][i]), d,
+                          &output[2 * i]);
+      }
+    } else if (out.num_channels_ == 3) {
+      for (size_t i = 0; i < len; i += Lanes(d)) {
+        StoreInterleaved3(LoadU(d, &input[0][i]), LoadU(d, &input[1][i]),
+                          LoadU(d, &input[2][i]), d, &output[3 * i]);
+      }
+    } else {
+      for (size_t i = 0; i < len; i += Lanes(d)) {
+        StoreInterleaved4(LoadU(d, &input[0][i]), LoadU(d, &input[1][i]),
+                          LoadU(d, &input[2][i]), LoadU(d, &input[3][i]), d,
+                          &output[4 * i]);
+      }
+    }
+  }
+
+  template <typename T>
+  void WriteToOutput(const Output& out, size_t thread_id, size_t ypos,
+                     size_t xstart, size_t len, T* output) const {
+    if (transpose_) {
+      // TODO(szabadka) Buffer 8x8 chunks and transpose with SIMD.
+      if (out.run_opaque_) {
+        for (size_t i = 0, j = 0; i < len; ++i, j += out.num_channels_) {
+          out.pixel_callback_.run(out.run_opaque_, thread_id, ypos, xstart + i,
+                                  1, output + j);
+        }
+      } else {
+        const size_t pixel_stride = out.num_channels_ * sizeof(T);
+        const size_t offset = xstart * out.stride_ + ypos * pixel_stride;
+        for (size_t i = 0, j = 0; i < len; ++i, j += out.num_channels_) {
+          const size_t ix = offset + i * out.stride_;
+          JXL_DASSERT(ix + pixel_stride <= out.buffer_size_);
+          memcpy(reinterpret_cast<uint8_t*>(out.buffer_) + ix, output + j,
+                 pixel_stride);
+        }
+      }
+    } else {
+      if (out.run_opaque_) {
+        out.pixel_callback_.run(out.run_opaque_, thread_id, xstart, ypos, len,
+                                output);
+      } else {
+        const size_t pixel_stride = out.num_channels_ * sizeof(T);
+        const size_t offset = ypos * out.stride_ + xstart * pixel_stride;
+        JXL_DASSERT(offset + len * pixel_stride <= out.buffer_size_);
+        memcpy(reinterpret_cast<uint8_t*>(out.buffer_) + offset, output,
+               len * pixel_stride);
+      }
+    }
+  }
+
+  static constexpr size_t kMaxPixelsPerCall = 1024;
+  size_t width_;
+  size_t height_;
+  Output main_;  // color + alpha
+  size_t num_color_;
+  bool want_alpha_;
+  bool has_alpha_;
+  bool unpremul_alpha_;
+  size_t alpha_c_;
+  bool flip_x_;
+  bool flip_y_;
+  bool transpose_;
+  std::vector<Output> extra_channels_;
+  std::vector<float> opaque_alpha_;
+  std::vector<CacheAlignedUniquePtr> temp_in_;
+  std::vector<CacheAlignedUniquePtr> temp_out_;
+};
+
+constexpr size_t WriteToOutputStage::kMaxPixelsPerCall;
+
+std::unique_ptr<RenderPipelineStage> GetWriteToOutputStage(
+    const ImageOutput& main_output, size_t width, size_t height, bool has_alpha,
+    bool unpremul_alpha, size_t alpha_c, Orientation undo_orientation,
+    std::vector<ImageOutput>& extra_output) {
+  return jxl::make_unique<WriteToOutputStage>(
+      main_output, width, height, has_alpha, unpremul_alpha, alpha_c,
+      undo_orientation, extra_output);
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace jxl {
+
+HWY_EXPORT(GetWriteToOutputStage);
+
+namespace {
+class WriteToImageBundleStage : public RenderPipelineStage {
+ public:
+  explicit WriteToImageBundleStage(ImageBundle* image_bundle,
+                                   ColorEncoding color_encoding)
+      : RenderPipelineStage(RenderPipelineStage::Settings()),
+        image_bundle_(image_bundle),
+        color_encoding_(std::move(color_encoding)) {}
+
+  void SetInputSizes(
+      const std::vector<std::pair<size_t, size_t>>& input_sizes) override {
+#if JXL_ENABLE_ASSERT
+    JXL_ASSERT(input_sizes.size() >= 3);
+    for (size_t c = 1; c < input_sizes.size(); c++) {
+      JXL_ASSERT(input_sizes[c].first == input_sizes[0].first);
+      JXL_ASSERT(input_sizes[c].second == input_sizes[0].second);
+    }
+#endif
+    // TODO(eustas): what should we do in the case of "want only ECs"?
+    image_bundle_->SetFromImage(
+        Image3F(input_sizes[0].first, input_sizes[0].second), color_encoding_);
+    // TODO(veluca): consider not reallocating ECs if not needed.
+    image_bundle_->extra_channels().clear();
+    for (size_t c = 3; c < input_sizes.size(); c++) {
+      image_bundle_->extra_channels().emplace_back(input_sizes[c].first,
+                                                   input_sizes[c].second);
+    }
+  }
+
+  void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+                  size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+                  size_t thread_id) const final {
+    for (size_t c = 0; c < 3; c++) {
+      memcpy(image_bundle_->color()->PlaneRow(c, ypos) + xpos - xextra,
+             GetInputRow(input_rows, c, 0) - xextra,
+             sizeof(float) * (xsize + 2 * xextra));
+    }
+    for (size_t ec = 0; ec < image_bundle_->extra_channels().size(); ec++) {
+      JXL_ASSERT(image_bundle_->extra_channels()[ec].xsize() >=
+                 xpos + xsize + xextra);
+      memcpy(image_bundle_->extra_channels()[ec].Row(ypos) + xpos - xextra,
+             GetInputRow(input_rows, 3 + ec, 0) - xextra,
+             sizeof(float) * (xsize + 2 * xextra));
+    }
+  }
+
+  RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+    return RenderPipelineChannelMode::kInput;
+  }
+
+  const char* GetName() const override { return "WriteIB"; }
+
+ private:
+  ImageBundle* image_bundle_;
+  ColorEncoding color_encoding_;
+};
+
+class WriteToImage3FStage : public RenderPipelineStage {
+ public:
+  explicit WriteToImage3FStage(Image3F* image)
+      : RenderPipelineStage(RenderPipelineStage::Settings()), image_(image) {}
+
+  void SetInputSizes(
+      const std::vector<std::pair<size_t, size_t>>& input_sizes) override {
+#if JXL_ENABLE_ASSERT
+    JXL_ASSERT(input_sizes.size() >= 3);
+    for (size_t c = 1; c < 3; ++c) {
+      JXL_ASSERT(input_sizes[c].first == input_sizes[0].first);
+      JXL_ASSERT(input_sizes[c].second == input_sizes[0].second);
+    }
+#endif
+    *image_ = Image3F(input_sizes[0].first, input_sizes[0].second);
+  }
+
+  void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+                  size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+                  size_t thread_id) const final {
+    for (size_t c = 0; c < 3; c++) {
+      memcpy(image_->PlaneRow(c, ypos) + xpos - xextra,
+             GetInputRow(input_rows, c, 0) - xextra,
+             sizeof(float) * (xsize + 2 * xextra));
+    }
+  }
+
+  RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+    return c < 3 ? RenderPipelineChannelMode::kInput
+                 : RenderPipelineChannelMode::kIgnored;
+  }
+
+  const char* GetName() const override { return "WriteI3F"; }
+
+ private:
+  Image3F* image_;
+};
+
+}  // namespace
+
+std::unique_ptr<RenderPipelineStage> GetWriteToImageBundleStage(
+    ImageBundle* image_bundle, ColorEncoding color_encoding) {
+  return jxl::make_unique<WriteToImageBundleStage>(image_bundle,
+                                                   std::move(color_encoding));
+}
+
+std::unique_ptr<RenderPipelineStage> GetWriteToImage3FStage(Image3F* image) {
+  return jxl::make_unique<WriteToImage3FStage>(image);
+}
+
+std::unique_ptr<RenderPipelineStage> GetWriteToOutputStage(
+    const ImageOutput& main_output, size_t width, size_t height, bool has_alpha,
+    bool unpremul_alpha, size_t alpha_c, Orientation undo_orientation,
+    std::vector<ImageOutput>& extra_output) {
+  return HWY_DYNAMIC_DISPATCH(GetWriteToOutputStage)(
+      main_output, width, height, has_alpha, unpremul_alpha, alpha_c,
+      undo_orientation, extra_output);
+}
+
+}  // namespace jxl
+
+#endif
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_write.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_write.h
new file mode 100644
index 0000000000..c5f844ebe8
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_write.h
@@ -0,0 +1,31 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_WRITE_H_
+#define LIB_JXL_RENDER_PIPELINE_STAGE_WRITE_H_
+
+#include <functional>
+
+#include "lib/jxl/dec_cache.h"
+#include "lib/jxl/image_bundle.h"
+#include "lib/jxl/render_pipeline/render_pipeline_stage.h"
+
+namespace jxl {
+
+std::unique_ptr<RenderPipelineStage> GetWriteToImageBundleStage(
+    ImageBundle* image_bundle, ColorEncoding color_encoding);
+
+// Gets a stage to write color channels to an Image3F.
+std::unique_ptr<RenderPipelineStage> GetWriteToImage3FStage(Image3F* image);
+
+// Gets a stage to write to a pixel callback or image buffer.
+std::unique_ptr<RenderPipelineStage> GetWriteToOutputStage(
+    const ImageOutput& main_output, size_t width, size_t height, bool has_alpha,
+    bool unpremul_alpha, size_t alpha_c, Orientation undo_orientation,
+    std::vector<ImageOutput>& extra_output);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_RENDER_PIPELINE_STAGE_WRITE_H_
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_xyb.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_xyb.cc
new file mode 100644
index 0000000000..15cfc75b18
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_xyb.cc
@@ -0,0 +1,176 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/render_pipeline/stage_xyb.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_xyb.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/dec_xyb-inl.h"
+#include "lib/jxl/opsin_params.h"
+#include "lib/jxl/sanitizers.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+class XYBStage : public RenderPipelineStage {
+ public:
+  explicit XYBStage(const OutputEncodingInfo& output_encoding_info)
+      : RenderPipelineStage(RenderPipelineStage::Settings()),
+        opsin_params_(output_encoding_info.opsin_params),
+        output_is_xyb_(output_encoding_info.color_encoding.GetColorSpace() ==
+                       ColorSpace::kXYB) {}
+
+  void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+                  size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+                  size_t thread_id) const final {
+    PROFILER_ZONE("UndoXYB");
+
+    const HWY_FULL(float) d;
+    JXL_ASSERT(xextra == 0);
+    const size_t xsize_v = RoundUpTo(xsize, Lanes(d));
+    float* JXL_RESTRICT row0 = GetInputRow(input_rows, 0, 0);
+    float* JXL_RESTRICT row1 = GetInputRow(input_rows, 1, 0);
+    float* JXL_RESTRICT row2 = GetInputRow(input_rows, 2, 0);
+    // All calculations are lane-wise, still some might require
+    // value-dependent behaviour (e.g. NearestInt). Temporary unpoison last
+    // vector tail.
+    msan::UnpoisonMemory(row0 + xsize, sizeof(float) * (xsize_v - xsize));
+    msan::UnpoisonMemory(row1 + xsize, sizeof(float) * (xsize_v - xsize));
+    msan::UnpoisonMemory(row2 + xsize, sizeof(float) * (xsize_v - xsize));
+    // TODO(eustas): when using frame origin, addresses might be unaligned;
+    //               making them aligned will void performance penalty.
+    if (output_is_xyb_) {
+      const auto scale_x = Set(d, kScaledXYBScale[0]);
+      const auto scale_y = Set(d, kScaledXYBScale[1]);
+      const auto scale_bmy = Set(d, kScaledXYBScale[2]);
+      const auto offset_x = Set(d, kScaledXYBOffset[0]);
+      const auto offset_y = Set(d, kScaledXYBOffset[1]);
+      const auto offset_bmy = Set(d, kScaledXYBOffset[2]);
+      for (ssize_t x = -xextra; x < (ssize_t)(xsize + xextra); x += Lanes(d)) {
+        const auto in_x = LoadU(d, row0 + x);
+        const auto in_y = LoadU(d, row1 + x);
+        const auto in_b = LoadU(d, row2 + x);
+        auto out_x = Mul(Add(in_x, offset_x), scale_x);
+        auto out_y = Mul(Add(in_y, offset_y), scale_y);
+        auto out_b = Mul(Add(Sub(in_b, in_y), offset_bmy), scale_bmy);
+        StoreU(out_x, d, row0 + x);
+        StoreU(out_y, d, row1 + x);
+        StoreU(out_b, d, row2 + x);
+      }
+    } else {
+      for (ssize_t x = -xextra; x < (ssize_t)(xsize + xextra); x += Lanes(d)) {
+        const auto in_opsin_x = LoadU(d, row0 + x);
+        const auto in_opsin_y = LoadU(d, row1 + x);
+        const auto in_opsin_b = LoadU(d, row2 + x);
+        auto r = Undefined(d);
+        auto g = Undefined(d);
+        auto b = Undefined(d);
+        XybToRgb(d, in_opsin_x, in_opsin_y, in_opsin_b, opsin_params_, &r, &g,
+                 &b);
+        StoreU(r, d, row0 + x);
+        StoreU(g, d, row1 + x);
+        StoreU(b, d, row2 + x);
+      }
+    }
+    msan::PoisonMemory(row0 + xsize, sizeof(float) * (xsize_v - xsize));
+    msan::PoisonMemory(row1 + xsize, sizeof(float) * (xsize_v - xsize));
+    msan::PoisonMemory(row2 + xsize, sizeof(float) * (xsize_v - xsize));
+  }
+
+  RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+    return c < 3 ? RenderPipelineChannelMode::kInPlace
+                 : RenderPipelineChannelMode::kIgnored;
+  }
+
+  const char* GetName() const override { return "XYB"; }
+
+ private:
+  const OpsinParams opsin_params_;
+  const bool output_is_xyb_;
+};
+
+std::unique_ptr<RenderPipelineStage> GetXYBStage(
+    const OutputEncodingInfo& output_encoding_info) {
+  return jxl::make_unique<XYBStage>(output_encoding_info);
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+HWY_EXPORT(GetXYBStage);
+
+std::unique_ptr<RenderPipelineStage> GetXYBStage(
+    const OutputEncodingInfo& output_encoding_info) {
+  return HWY_DYNAMIC_DISPATCH(GetXYBStage)(output_encoding_info);
+}
+
+namespace {
+class FastXYBStage : public RenderPipelineStage {
+ public:
+  FastXYBStage(uint8_t* rgb, size_t stride, size_t width, size_t height,
+               bool rgba, bool has_alpha, size_t alpha_c)
+      : RenderPipelineStage(RenderPipelineStage::Settings()),
+        rgb_(rgb),
+        stride_(stride),
+        width_(width),
+        height_(height),
+        rgba_(rgba),
+        has_alpha_(has_alpha),
+        alpha_c_(alpha_c) {}
+
+  void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+                  size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+                  size_t thread_id) const final {
+    if (ypos >= height_) return;
+    JXL_ASSERT(xextra == 0);
+    const float* xyba[4] = {
+        GetInputRow(input_rows, 0, 0), GetInputRow(input_rows, 1, 0),
+        GetInputRow(input_rows, 2, 0),
+        has_alpha_ ? GetInputRow(input_rows, alpha_c_, 0) : nullptr};
+    uint8_t* out_buf = rgb_ + stride_ * ypos + (rgba_ ? 4 : 3) * xpos;
+    FastXYBTosRGB8(xyba, out_buf, rgba_,
+                   xsize + xpos <= width_ ? xsize : width_ - xpos);
+  }
+
+  RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+    return c < 3 || (has_alpha_ && c == alpha_c_)
+               ? RenderPipelineChannelMode::kInput
+               : RenderPipelineChannelMode::kIgnored;
+  }
+
+  const char* GetName() const override { return "FastXYB"; }
+
+ private:
+  uint8_t* rgb_;
+  size_t stride_;
+  size_t width_;
+  size_t height_;
+  bool rgba_;
+  bool has_alpha_;
+  size_t alpha_c_;
+  std::vector<float> opaque_alpha_;
+};
+
+}  // namespace
+
+std::unique_ptr<RenderPipelineStage> GetFastXYBTosRGB8Stage(
+    uint8_t* rgb, size_t stride, size_t width, size_t height, bool rgba,
+    bool has_alpha, size_t alpha_c) {
+  JXL_ASSERT(HasFastXYBTosRGB8());
+  return make_unique<FastXYBStage>(rgb, stride, width, height, rgba, has_alpha,
+                                   alpha_c);
+}
+
+}  // namespace jxl
+#endif
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_xyb.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_xyb.h
new file mode 100644
index 0000000000..7b06345c36
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_xyb.h
@@ -0,0 +1,26 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_XYB_H_
+#define LIB_JXL_RENDER_PIPELINE_STAGE_XYB_H_
+#include <stdint.h>
+
+#include "lib/jxl/dec_xyb.h"
+#include "lib/jxl/render_pipeline/render_pipeline_stage.h"
+
+namespace jxl {
+
+// Converts the color channels from XYB to linear with appropriate primaries.
+std::unique_ptr<RenderPipelineStage> GetXYBStage(
+    const OutputEncodingInfo& output_encoding_info);
+
+// Gets a stage to convert with fixed point arithmetic from XYB to sRGB8 and
+// write to a uint8 buffer.
+std::unique_ptr<RenderPipelineStage> GetFastXYBTosRGB8Stage(
+    uint8_t* rgb, size_t stride, size_t width, size_t height, bool rgba,
+    bool has_alpha, size_t alpha_c);
+}  // namespace jxl
+
+#endif  // LIB_JXL_RENDER_PIPELINE_STAGE_XYB_H_
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_ycbcr.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_ycbcr.cc
new file mode 100644
index 0000000000..5cba4a7d41
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_ycbcr.cc
@@ -0,0 +1,85 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/render_pipeline/stage_ycbcr.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_ycbcr.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Add;
+using hwy::HWY_NAMESPACE::MulAdd;
+
+class kYCbCrStage : public RenderPipelineStage {
+ public:
+  kYCbCrStage() : RenderPipelineStage(RenderPipelineStage::Settings()) {}
+
+  void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+                  size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+                  size_t thread_id) const final {
+    PROFILER_ZONE("UndoYCbCr");
+
+    const HWY_FULL(float) df;
+
+    // Full-range BT.601 as defined by JFIF Clause 7:
+    // https://www.itu.int/rec/T-REC-T.871-201105-I/en
+    const auto c128 = Set(df, 128.0f / 255);
+    const auto crcr = Set(df, 1.402f);
+    const auto cgcb = Set(df, -0.114f * 1.772f / 0.587f);
+    const auto cgcr = Set(df, -0.299f * 1.402f / 0.587f);
+    const auto cbcb = Set(df, 1.772f);
+
+    float* JXL_RESTRICT row0 = GetInputRow(input_rows, 0, 0);
+    float* JXL_RESTRICT row1 = GetInputRow(input_rows, 1, 0);
+    float* JXL_RESTRICT row2 = GetInputRow(input_rows, 2, 0);
+    // TODO(eustas): when using frame origin, addresses might be unaligned;
+    //               making them aligned will void performance penalty.
+    for (size_t x = 0; x < xsize; x += Lanes(df)) {
+      const auto y_vec = Add(LoadU(df, row1 + x), c128);
+      const auto cb_vec = LoadU(df, row0 + x);
+      const auto cr_vec = LoadU(df, row2 + x);
+      const auto r_vec = MulAdd(crcr, cr_vec, y_vec);
+      const auto g_vec = MulAdd(cgcr, cr_vec, MulAdd(cgcb, cb_vec, y_vec));
+      const auto b_vec = MulAdd(cbcb, cb_vec, y_vec);
+      StoreU(r_vec, df, row0 + x);
+      StoreU(g_vec, df, row1 + x);
+      StoreU(b_vec, df, row2 + x);
+    }
+  }
+
+  RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+    return c < 3 ? RenderPipelineChannelMode::kInPlace
+                 : RenderPipelineChannelMode::kIgnored;
+  }
+
+  const char* GetName() const override { return "YCbCr"; }
+};
+
+std::unique_ptr<RenderPipelineStage> GetYCbCrStage() {
+  return jxl::make_unique<kYCbCrStage>();
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+HWY_EXPORT(GetYCbCrStage);
+
+std::unique_ptr<RenderPipelineStage> GetYCbCrStage() {
+  return HWY_DYNAMIC_DISPATCH(GetYCbCrStage)();
+}
+
+}  // namespace jxl
+#endif
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_ycbcr.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_ycbcr.h
new file mode 100644
index 0000000000..9320c9723f
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_ycbcr.h
@@ -0,0 +1,25 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_YCBCR_H_
+#define LIB_JXL_RENDER_PIPELINE_STAGE_YCBCR_H_
+#include <math.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#include <algorithm>
+#include <utility>
+#include <vector>
+
+#include "lib/jxl/dec_xyb.h"
+#include "lib/jxl/render_pipeline/render_pipeline_stage.h"
+
+namespace jxl {
+
+// Converts the color channels from YCbCr to RGB.
+std::unique_ptr<RenderPipelineStage> GetYCbCrStage();
+}  // namespace jxl
+
+#endif  // LIB_JXL_RENDER_PIPELINE_STAGE_YCBCR_H_
diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/test_render_pipeline_stages.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/test_render_pipeline_stages.h
new file mode 100644
index 0000000000..789a52f8b2
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/test_render_pipeline_stages.h
@@ -0,0 +1,101 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <math.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#include <algorithm>
+#include <utility>
+#include <vector>
+
+#include "lib/jxl/render_pipeline/render_pipeline_stage.h"
+
+namespace jxl {
+
+class UpsampleXSlowStage : public RenderPipelineStage {
+ public:
+  UpsampleXSlowStage()
+      : RenderPipelineStage(RenderPipelineStage::Settings::ShiftX(1, 1)) {}
+
+  void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+                  size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+                  size_t thread_id) const final {
+    for (size_t c = 0; c < input_rows.size(); c++) {
+      const float* row = GetInputRow(input_rows, c, 0);
+      float* row_out = GetOutputRow(output_rows, c, 0);
+      for (int64_t x = -xextra; x < (int64_t)(xsize + xextra); x++) {
+        float xp = *(row + x - 1);
+        float xc = *(row + x);
+        float xn = *(row + x + 1);
+        float xout0 = xp * 0.25f + xc * 0.75f;
+        float xout1 = xc * 0.75f + xn * 0.25f;
+        *(row_out + 2 * x + 0) = xout0;
+        *(row_out + 2 * x + 1) = xout1;
+      }
+    }
+  }
+
+  const char* GetName() const override { return "TEST::UpsampleXSlowStage"; }
+
+  RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+    return RenderPipelineChannelMode::kInOut;
+  }
+};
+
+class UpsampleYSlowStage : public RenderPipelineStage {
+ public:
+  UpsampleYSlowStage()
+      : RenderPipelineStage(RenderPipelineStage::Settings::ShiftY(1, 1)) {}
+
+  void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+                  size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+                  size_t thread_id) const final {
+    for (size_t c = 0; c < input_rows.size(); c++) {
+      const float* rowp = GetInputRow(input_rows, c, -1);
+      const float* rowc = GetInputRow(input_rows, c, 0);
+      const float* rown = GetInputRow(input_rows, c, 1);
+      float* row_out0 = GetOutputRow(output_rows, c, 0);
+      float* row_out1 = GetOutputRow(output_rows, c, 1);
+      for (int64_t x = -xextra; x < (int64_t)(xsize + xextra); x++) {
+        float xp = *(rowp + x);
+        float xc = *(rowc + x);
+        float xn = *(rown + x);
+        float yout0 = xp * 0.25f + xc * 0.75f;
+        float yout1 = xc * 0.75f + xn * 0.25f;
+        *(row_out0 + x) = yout0;
+        *(row_out1 + x) = yout1;
+      }
+    }
+  }
+
+  RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+    return RenderPipelineChannelMode::kInOut;
+  }
+
+  const char* GetName() const override { return "TEST::UpsampleYSlowStage"; }
+};
+
+class Check0FinalStage : public RenderPipelineStage {
+ public:
+  Check0FinalStage() : RenderPipelineStage(RenderPipelineStage::Settings()) {}
+
+  void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+                  size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+                  size_t thread_id) const final {
+    for (size_t c = 0; c < input_rows.size(); c++) {
+      for (size_t x = 0; x < xsize; x++) {
+        JXL_CHECK(fabsf(GetInputRow(input_rows, c, 0)[x]) < 1e-8);
+      }
+    }
+  }
+
+  RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+    return RenderPipelineChannelMode::kInput;
+  }
+  const char* GetName() const override { return "TEST::Check0FinalStage"; }
+};
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/roundtrip_test.cc b/third_party/jpeg-xl/lib/jxl/roundtrip_test.cc
new file mode 100644
index 0000000000..f1529b500c
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/roundtrip_test.cc
@@ -0,0 +1,839 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <jxl/codestream_header.h>
+#include <jxl/decode.h>
+#include <jxl/decode_cxx.h>
+#include <jxl/encode.h>
+#include <jxl/encode_cxx.h>
+#include <jxl/types.h>
+
+#include <cmath>  // std::abs
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <vector>
+
+#include "lib/extras/codec.h"
+#include "lib/jxl/dec_external_image.h"
+#include "lib/jxl/enc_butteraugli_comparator.h"
+#include "lib/jxl/enc_color_management.h"
+#include "lib/jxl/enc_comparator.h"
+#include "lib/jxl/enc_external_image.h"
+#include "lib/jxl/encode_internal.h"
+#include "lib/jxl/icc_codec.h"
+#include "lib/jxl/image_test_utils.h"
+#include "lib/jxl/test_utils.h"
+#include "lib/jxl/testing.h"
+
+namespace {
+
+// Converts a test image to a CodecInOut.
+// icc_profile can be empty to automatically deduce profile from the pixel
+// format, or filled in to force this ICC profile
+jxl::CodecInOut ConvertTestImage(const std::vector<uint8_t>& buf,
+                                 const size_t xsize, const size_t ysize,
+                                 const JxlPixelFormat& pixel_format,
+                                 const jxl::PaddedBytes& icc_profile) {
+  jxl::CodecInOut io;
+  io.SetSize(xsize, ysize);
+
+  bool is_gray = pixel_format.num_channels < 3;
+  bool has_alpha =
+      pixel_format.num_channels == 2 || pixel_format.num_channels == 4;
+
+  io.metadata.m.color_encoding.SetColorSpace(is_gray ? jxl::ColorSpace::kGray
+                                                     : jxl::ColorSpace::kRGB);
+  if (has_alpha) {
+    // Note: alpha > 16 not yet supported by the C++ codec
+    switch (pixel_format.data_type) {
+      case JXL_TYPE_UINT8:
+        io.metadata.m.SetAlphaBits(8);
+        break;
+      case JXL_TYPE_UINT16:
+      case JXL_TYPE_FLOAT:
+      case JXL_TYPE_FLOAT16:
+        io.metadata.m.SetAlphaBits(16);
+        break;
+      default:
+        ADD_FAILURE() << "Roundtrip tests for data type "
+                      << pixel_format.data_type << " not yet implemented.";
+    }
+  }
+  size_t bitdepth = 0;
+  switch (pixel_format.data_type) {
+    case JXL_TYPE_FLOAT:
+      bitdepth = 32;
+      io.metadata.m.SetFloat32Samples();
+      break;
+    case JXL_TYPE_FLOAT16:
+      bitdepth = 16;
+      io.metadata.m.SetFloat16Samples();
+      break;
+    case JXL_TYPE_UINT8:
+      bitdepth = 8;
+      io.metadata.m.SetUintSamples(8);
+      break;
+    case JXL_TYPE_UINT16:
+      bitdepth = 16;
+      io.metadata.m.SetUintSamples(16);
+      break;
+    default:
+      ADD_FAILURE() << "Roundtrip tests for data type "
+                    << pixel_format.data_type << " not yet implemented.";
+  }
+  jxl::ColorEncoding color_encoding;
+  if (!icc_profile.empty()) {
+    jxl::PaddedBytes icc_profile_copy(icc_profile);
+    EXPECT_TRUE(color_encoding.SetICC(std::move(icc_profile_copy)));
+  } else if (pixel_format.data_type == JXL_TYPE_FLOAT) {
+    color_encoding = jxl::ColorEncoding::LinearSRGB(is_gray);
+  } else {
+    color_encoding = jxl::ColorEncoding::SRGB(is_gray);
+  }
+  EXPECT_TRUE(
+      ConvertFromExternal(jxl::Span<const uint8_t>(buf.data(), buf.size()),
+                          xsize, ysize, color_encoding,
+                          /*bits_per_sample=*/bitdepth, pixel_format,
+                          /*pool=*/nullptr, &io.Main()));
+  return io;
+}
+
+template <typename T>
+T ConvertTestPixel(float val);
+
+template <>
+float ConvertTestPixel<float>(const float val) {
+  return val;
+}
+
+template <>
+uint16_t ConvertTestPixel<uint16_t>(const float val) {
+  return (uint16_t)(val * UINT16_MAX);
+}
+
+template <>
+uint8_t ConvertTestPixel<uint8_t>(const float val) {
+  return (uint8_t)(val * UINT8_MAX);
+}
+
+// Returns a test image.
+template <typename T>
+std::vector<uint8_t> GetTestImage(const size_t xsize, const size_t ysize,
+                                  const JxlPixelFormat& pixel_format) {
+  std::vector<T> pixels(xsize * ysize * pixel_format.num_channels);
+  for (size_t y = 0; y < ysize; y++) {
+    for (size_t x = 0; x < xsize; x++) {
+      for (size_t chan = 0; chan < pixel_format.num_channels; chan++) {
+        float val;
+        switch (chan % 4) {
+          case 0:
+            val = static_cast<float>(y) / static_cast<float>(ysize);
+            break;
+          case 1:
+            val = static_cast<float>(x) / static_cast<float>(xsize);
+            break;
+          case 2:
+            val = static_cast<float>(x + y) / static_cast<float>(xsize + ysize);
+            break;
+          case 3:
+            val = static_cast<float>(x * y) / static_cast<float>(xsize * ysize);
+            break;
+        }
+        pixels[(y * xsize + x) * pixel_format.num_channels + chan] =
+            ConvertTestPixel<T>(val);
+      }
+    }
+  }
+  std::vector<uint8_t> bytes(pixels.size() * sizeof(T));
+  memcpy(bytes.data(), pixels.data(), sizeof(T) * pixels.size());
+  return bytes;
+}
+
+void EncodeWithEncoder(JxlEncoder* enc, std::vector<uint8_t>* compressed) {
+  compressed->resize(64);
+  uint8_t* next_out = compressed->data();
+  size_t avail_out = compressed->size() - (next_out - compressed->data());
+  JxlEncoderStatus process_result = JXL_ENC_NEED_MORE_OUTPUT;
+  while (process_result == JXL_ENC_NEED_MORE_OUTPUT) {
+    process_result = JxlEncoderProcessOutput(enc, &next_out, &avail_out);
+    if (process_result == JXL_ENC_NEED_MORE_OUTPUT) {
+      size_t offset = next_out - compressed->data();
+      compressed->resize(compressed->size() * 2);
+      next_out = compressed->data() + offset;
+      avail_out = compressed->size() - offset;
+    }
+  }
+  compressed->resize(next_out - compressed->data());
+  EXPECT_EQ(JXL_ENC_SUCCESS, process_result);
+}
+
+// Generates some pixels using some dimensions and pixel_format,
+// compresses them, and verifies that the decoded version is similar to the
+// original pixels.
+// TODO(firsching): change this to be a parameterized test, like in
+// decode_test.cc
+template <typename T>
+void VerifyRoundtripCompression(
+    const size_t xsize, const size_t ysize,
+    const JxlPixelFormat& input_pixel_format,
+    const JxlPixelFormat& output_pixel_format, const bool lossless,
+    const bool use_container, const uint32_t resampling = 1,
+    const bool already_downsampled = false,
+    const std::vector<std::pair<JxlExtraChannelType, std::string>>&
+        extra_channels = {}) {
+  size_t orig_xsize = xsize;
+  size_t orig_ysize = ysize;
+  if (already_downsampled) {
+    orig_xsize = jxl::DivCeil(xsize, resampling);
+    orig_ysize = jxl::DivCeil(ysize, resampling);
+  }
+
+  JxlPixelFormat extra_channel_pixel_format = input_pixel_format;
+  extra_channel_pixel_format.num_channels = 1;
+  const std::vector<uint8_t> extra_channel_bytes =
+      GetTestImage<T>(xsize, ysize, extra_channel_pixel_format);
+  const std::vector<uint8_t> original_bytes =
+      GetTestImage<T>(orig_xsize, orig_ysize, input_pixel_format);
+  jxl::CodecInOut original_io = ConvertTestImage(
+      original_bytes, orig_xsize, orig_ysize, input_pixel_format, {});
+
+  JxlEncoder* enc = JxlEncoderCreate(nullptr);
+  EXPECT_NE(nullptr, enc);
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetCodestreamLevel(enc, 10));
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderUseContainer(enc, use_container));
+  JxlBasicInfo basic_info;
+  jxl::test::JxlBasicInfoSetFromPixelFormat(&basic_info, &input_pixel_format);
+  basic_info.xsize = xsize;
+  basic_info.ysize = ysize;
+  basic_info.uses_original_profile = lossless;
+  uint32_t num_channels = input_pixel_format.num_channels;
+  size_t has_interleaved_alpha = num_channels == 2 || num_channels == 4;
+  JxlPixelFormat output_pixel_format_with_extra_channel_alpha =
+      output_pixel_format;
+
+  // In the case where we have an alpha channel, but it is provided as an extra
+  // channel and not interleaved, we do two things here:
+  // 1. modify the original_io to have the correct alpha channel
+  // 2. change the output_format_with_extra_alpha to have an alpha channel
+  bool alpha_in_extra_channels_vector = false;
+  for (const auto& extra_channel : extra_channels) {
+    if (extra_channel.first == JXL_CHANNEL_ALPHA) {
+      alpha_in_extra_channels_vector = true;
+    }
+  }
+  if (alpha_in_extra_channels_vector && !has_interleaved_alpha) {
+    jxl::ImageF alpha_channel(xsize, ysize);
+    EXPECT_TRUE(jxl::ConvertFromExternal(
+        jxl::Span<const uint8_t>(extra_channel_bytes.data(),
+                                 extra_channel_bytes.size()),
+        xsize, ysize, basic_info.bits_per_sample, extra_channel_pixel_format, 0,
+        /*pool=*/nullptr, &alpha_channel));
+
+    original_io.metadata.m.SetAlphaBits(basic_info.bits_per_sample);
+    original_io.Main().SetAlpha(std::move(alpha_channel));
+    output_pixel_format_with_extra_channel_alpha.num_channels++;
+  }
+  // Those are the num_extra_channels including a potential alpha channel.
+  basic_info.num_extra_channels = extra_channels.size() + has_interleaved_alpha;
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetBasicInfo(enc, &basic_info));
+  EXPECT_EQ(enc->metadata.m.num_extra_channels,
+            extra_channels.size() + has_interleaved_alpha);
+  JxlColorEncoding color_encoding;
+  if (input_pixel_format.data_type == JXL_TYPE_FLOAT) {
+    JxlColorEncodingSetToLinearSRGB(
+        &color_encoding,
+        /*is_gray=*/input_pixel_format.num_channels < 3);
+  } else {
+    JxlColorEncodingSetToSRGB(&color_encoding,
+                              /*is_gray=*/input_pixel_format.num_channels < 3);
+  }
+
+  std::vector<JxlExtraChannelInfo> channel_infos;
+  for (const auto& extra_channel : extra_channels) {
+    auto channel_type = extra_channel.first;
+    JxlExtraChannelInfo channel_info;
+    JxlEncoderInitExtraChannelInfo(channel_type, &channel_info);
+    channel_info.bits_per_sample = (lossless ? basic_info.bits_per_sample : 8);
+    channel_info.exponent_bits_per_sample =
+        (lossless ? basic_info.exponent_bits_per_sample : 0);
+    channel_infos.push_back(channel_info);
+  }
+  for (size_t index = 0; index < channel_infos.size(); index++) {
+    EXPECT_EQ(JXL_ENC_SUCCESS,
+              JxlEncoderSetExtraChannelInfo(enc, index + has_interleaved_alpha,
+                                            &channel_infos[index]));
+    std::string name = extra_channels[index].second;
+    EXPECT_EQ(JXL_ENC_SUCCESS,
+              JxlEncoderSetExtraChannelName(enc, index + has_interleaved_alpha,
+                                            name.c_str(), name.length()));
+  }
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetColorEncoding(enc, &color_encoding));
+  JxlEncoderFrameSettings* frame_settings =
+      JxlEncoderFrameSettingsCreate(enc, nullptr);
+  JxlEncoderSetFrameLossless(frame_settings, lossless);
+  if (resampling > 1) {
+    EXPECT_EQ(
+        JXL_ENC_SUCCESS,
+        JxlEncoderFrameSettingsSetOption(
+            frame_settings, JXL_ENC_FRAME_SETTING_RESAMPLING, resampling));
+    EXPECT_EQ(JXL_ENC_SUCCESS,
+              JxlEncoderFrameSettingsSetOption(
+                  frame_settings, JXL_ENC_FRAME_SETTING_ALREADY_DOWNSAMPLED,
+                  already_downsampled));
+  }
+  EXPECT_EQ(JXL_ENC_SUCCESS,
+            JxlEncoderAddImageFrame(frame_settings, &input_pixel_format,
+                                    (void*)original_bytes.data(),
+                                    original_bytes.size()));
+  EXPECT_EQ(frame_settings->enc->input_queue.back()
+                .frame->frame.extra_channels()
+                .size(),
+            has_interleaved_alpha + extra_channels.size());
+  EXPECT_EQ(frame_settings->enc->input_queue.empty(), false);
+  for (size_t index = 0; index < channel_infos.size(); index++) {
+    EXPECT_EQ(JXL_ENC_SUCCESS,
+              JxlEncoderSetExtraChannelBuffer(
+                  frame_settings, &extra_channel_pixel_format,
+                  (void*)extra_channel_bytes.data(), extra_channel_bytes.size(),
+                  index + has_interleaved_alpha));
+  }
+  JxlEncoderCloseInput(enc);
+  EXPECT_EQ(frame_settings->enc->input_queue.back()
+                .frame->frame.extra_channels()
+                .size(),
+            has_interleaved_alpha + extra_channels.size());
+  std::vector<uint8_t> compressed;
+  EncodeWithEncoder(enc, &compressed);
+  JxlEncoderDestroy(enc);
+
+  JxlDecoder* dec = JxlDecoderCreate(nullptr);
+  EXPECT_NE(nullptr, dec);
+
+  const uint8_t* next_in = compressed.data();
+  size_t avail_in = compressed.size();
+
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSubscribeEvents(dec, JXL_DEC_BASIC_INFO |
+                                               JXL_DEC_COLOR_ENCODING |
+                                               JXL_DEC_FULL_IMAGE));
+
+  JxlDecoderSetInput(dec, next_in, avail_in);
+  EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec));
+  size_t buffer_size;
+  EXPECT_EQ(
+      JXL_DEC_SUCCESS,
+      JxlDecoderImageOutBufferSize(
+          dec, &output_pixel_format_with_extra_channel_alpha, &buffer_size));
+  if (&input_pixel_format == &output_pixel_format_with_extra_channel_alpha &&
+      !already_downsampled) {
+    EXPECT_EQ(buffer_size, original_bytes.size());
+  }
+
+  JxlBasicInfo info;
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info));
+  EXPECT_EQ(xsize, info.xsize);
+  EXPECT_EQ(ysize, info.ysize);
+  EXPECT_EQ(extra_channels.size() + has_interleaved_alpha,
+            info.num_extra_channels);
+
+  EXPECT_EQ(JXL_DEC_COLOR_ENCODING, JxlDecoderProcessInput(dec));
+
+  size_t icc_profile_size;
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderGetICCProfileSize(
+                dec, &output_pixel_format_with_extra_channel_alpha,
+                JXL_COLOR_PROFILE_TARGET_DATA, &icc_profile_size));
+  jxl::PaddedBytes icc_profile(icc_profile_size);
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderGetColorAsICCProfile(
+                dec, &output_pixel_format, JXL_COLOR_PROFILE_TARGET_DATA,
+                icc_profile.data(), icc_profile.size()));
+
+  std::vector<uint8_t> decoded_bytes(buffer_size);
+
+  EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec));
+
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSetImageOutBuffer(
+                dec, &output_pixel_format_with_extra_channel_alpha,
+                decoded_bytes.data(), decoded_bytes.size()));
+  std::vector<std::vector<uint8_t>> extra_channel_decoded_bytes(
+      info.num_extra_channels - has_interleaved_alpha);
+
+  for (size_t index = has_interleaved_alpha; index < info.num_extra_channels;
+       index++) {
+    JxlExtraChannelInfo channel_info;
+    EXPECT_EQ(JXL_DEC_SUCCESS,
+              JxlDecoderGetExtraChannelInfo(dec, index, &channel_info));
+    EXPECT_EQ(channel_info.type,
+              extra_channels[index - has_interleaved_alpha].first);
+    std::string input_name =
+        extra_channels[index - has_interleaved_alpha].second;
+    const size_t name_length = channel_info.name_length;
+    EXPECT_EQ(input_name.size(), name_length);
+    std::vector<char> output_name(name_length + 1);
+    EXPECT_EQ(JXL_DEC_SUCCESS,
+              JxlDecoderGetExtraChannelName(dec, index, output_name.data(),
+                                            output_name.size()));
+    EXPECT_EQ(0,
+              memcmp(input_name.data(), output_name.data(), input_name.size()));
+    size_t extra_buffer_size;
+    EXPECT_EQ(JXL_DEC_SUCCESS,
+              JxlDecoderExtraChannelBufferSize(dec, &output_pixel_format,
+                                               &extra_buffer_size, index));
+    std::vector<uint8_t> extra_decoded_bytes(extra_buffer_size);
+    extra_channel_decoded_bytes[index - has_interleaved_alpha] =
+        std::move(extra_decoded_bytes);
+    EXPECT_EQ(
+        JXL_DEC_SUCCESS,
+        JxlDecoderSetExtraChannelBuffer(
+            dec, &output_pixel_format,
+            extra_channel_decoded_bytes[index - has_interleaved_alpha].data(),
+            extra_channel_decoded_bytes[index - has_interleaved_alpha].size(),
+            index));
+  }
+  EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec));
+  // Check if there are no further errors after getting the full image, e.g.
+  // check that the final codestream box is actually marked as last.
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderProcessInput(dec));
+
+  JxlDecoderDestroy(dec);
+
+  jxl::CodecInOut decoded_io = ConvertTestImage(
+      decoded_bytes, xsize, ysize, output_pixel_format_with_extra_channel_alpha,
+      icc_profile);
+
+  if (already_downsampled) {
+    jxl::Image3F* color = decoded_io.Main().color();
+    jxl::DownsampleImage(color, resampling);
+    if (decoded_io.Main().HasAlpha()) {
+      jxl::ImageF* alpha = decoded_io.Main().alpha();
+      jxl::DownsampleImage(alpha, resampling);
+    }
+    decoded_io.SetSize(color->xsize(), color->ysize());
+  }
+
+  if (lossless && !already_downsampled) {
+    JXL_EXPECT_OK(jxl::SamePixels(*original_io.Main().color(),
+                                  *decoded_io.Main().color(), _));
+  } else {
+    jxl::ButteraugliParams ba;
+    float butteraugli_score = ButteraugliDistance(
+        original_io.frames, decoded_io.frames, ba, jxl::GetJxlCms(),
+        /*distmap=*/nullptr, nullptr);
+    EXPECT_LE(butteraugli_score, 2.0f);
+  }
+  JxlPixelFormat extra_channel_output_pixel_format = output_pixel_format;
+  extra_channel_output_pixel_format.num_channels = 1;
+  for (auto& extra_channel : extra_channel_decoded_bytes) {
+    EXPECT_EQ(extra_channel.size(), extra_channel_bytes.size());
+    if (lossless) {
+      EXPECT_EQ(jxl::test::ComparePixels(extra_channel.data(),
+                                         extra_channel_bytes.data(), xsize,
+                                         ysize, extra_channel_pixel_format,
+                                         extra_channel_output_pixel_format),
+                0u);
+      EXPECT_EQ(extra_channel, extra_channel_bytes);
+    }
+  }
+}
+
+}  // namespace
+
+TEST(RoundtripTest, FloatFrameRoundtripTest) {
+  std::vector<std::vector<std::pair<JxlExtraChannelType, std::string>>>
+      extra_channels_cases = {{},
+                              {{JXL_CHANNEL_ALPHA, "my extra alpha channel"}},
+                              {{JXL_CHANNEL_CFA, "my cfa channel"}},
+                              {{JXL_CHANNEL_DEPTH, "depth"},
+                               {JXL_CHANNEL_SELECTION_MASK, "mask"},
+                               {JXL_CHANNEL_BLACK, "black"},
+                               {JXL_CHANNEL_CFA, "my cfa channel"},
+                               {JXL_CHANNEL_OPTIONAL, "optional channel"}},
+                              {{JXL_CHANNEL_DEPTH, "very deep"}}};
+  for (int use_container = 0; use_container < 2; use_container++) {
+    for (int lossless = 0; lossless < 2; lossless++) {
+      for (uint32_t num_channels = 1; num_channels < 5; num_channels++) {
+        for (auto& extra_channels : extra_channels_cases) {
+          uint32_t has_alpha = static_cast<uint32_t>(num_channels % 2 == 0);
+          uint32_t total_extra_channels = has_alpha + extra_channels.size();
+          // There's no support (yet) for lossless extra float
+          // channels, so we don't test it.
+          if (total_extra_channels == 0 || !lossless) {
+            JxlPixelFormat pixel_format = JxlPixelFormat{
+                num_channels, JXL_TYPE_FLOAT, JXL_NATIVE_ENDIAN, 0};
+            VerifyRoundtripCompression<float>(
+                63, 129, pixel_format, pixel_format, (bool)lossless,
+                (bool)use_container, 1, false, extra_channels);
+          }
+        }
+      }
+    }
+  }
+}
+
+TEST(RoundtripTest, Uint16FrameRoundtripTest) {
+  std::vector<std::vector<std::pair<JxlExtraChannelType, std::string>>>
+      extra_channels_cases = {{},
+                              {{JXL_CHANNEL_ALPHA, "my extra alpha channel"}},
+                              {{JXL_CHANNEL_CFA, "my cfa channel"}},
+                              {{JXL_CHANNEL_CFA, "my cfa channel"},
+                               {JXL_CHANNEL_BLACK, "k_channel"}},
+                              {{JXL_CHANNEL_DEPTH, "very deep"}}};
+  for (int use_container = 0; use_container < 2; use_container++) {
+    for (int lossless = 0; lossless < 2; lossless++) {
+      for (uint32_t num_channels = 1; num_channels < 5; num_channels++) {
+        for (auto& extra_channels : extra_channels_cases) {
+          JxlPixelFormat pixel_format = JxlPixelFormat{
+              num_channels, JXL_TYPE_UINT16, JXL_NATIVE_ENDIAN, 0};
+          VerifyRoundtripCompression<uint16_t>(
+              63, 129, pixel_format, pixel_format, (bool)lossless,
+              (bool)use_container, 1, false, extra_channels);
+        }
+      }
+    }
+  }
+}
+
+TEST(RoundtripTest, Uint8FrameRoundtripTest) {
+  std::vector<std::vector<std::pair<JxlExtraChannelType, std::string>>>
+      extra_channels_cases = {{},
+                              {{JXL_CHANNEL_THERMAL, "temperature"}},
+                              {{JXL_CHANNEL_ALPHA, "my extra alpha channel"}},
+                              {{JXL_CHANNEL_CFA, "my cfa channel"}},
+                              {{JXL_CHANNEL_CFA, "my cfa channel"},
+                               {JXL_CHANNEL_BLACK, "k_channel"}},
+                              {{JXL_CHANNEL_DEPTH, "very deep"}}};
+  for (int use_container = 0; use_container < 2; use_container++) {
+    for (int lossless = 0; lossless < 2; lossless++) {
+      for (uint32_t num_channels = 1; num_channels < 5; num_channels++) {
+        for (auto& extra_channels : extra_channels_cases) {
+          JxlPixelFormat pixel_format = JxlPixelFormat{
+              num_channels, JXL_TYPE_UINT8, JXL_NATIVE_ENDIAN, 0};
+          VerifyRoundtripCompression<uint8_t>(
+              63, 129, pixel_format, pixel_format, (bool)lossless,
+              (bool)use_container, 1, false, extra_channels);
+        }
+      }
+    }
+  }
+}
+
+TEST(RoundtripTest, TestNonlinearSrgbAsXybEncoded) {
+  for (int use_container = 0; use_container < 2; use_container++) {
+    for (uint32_t num_channels = 1; num_channels < 5; num_channels++) {
+      JxlPixelFormat pixel_format_in =
+          JxlPixelFormat{num_channels, JXL_TYPE_UINT8, JXL_NATIVE_ENDIAN, 0};
+      JxlPixelFormat pixel_format_out =
+          JxlPixelFormat{num_channels, JXL_TYPE_FLOAT, JXL_NATIVE_ENDIAN, 0};
+      VerifyRoundtripCompression<uint8_t>(
+          63, 129, pixel_format_in, pixel_format_out,
+          /*lossless=*/false, (bool)use_container, {});
+    }
+  }
+}
+
+TEST(RoundtripTest, Resampling) {
+  JxlPixelFormat pixel_format =
+      JxlPixelFormat{3, JXL_TYPE_UINT8, JXL_NATIVE_ENDIAN, 0};
+  VerifyRoundtripCompression<uint8_t>(63, 129, pixel_format, pixel_format,
+                                      /*lossless=*/false,
+                                      /*use_container=*/false, 2,
+                                      /*already_downsampled=*/false);
+
+  // TODO(lode): also make this work for odd sizes. This requires a fix in
+  // enc_frame.cc to not set custom_size_or_origin to true due to even/odd
+  // mismatch.
+  VerifyRoundtripCompression<uint8_t>(64, 128, pixel_format, pixel_format,
+                                      /*lossless=*/true,
+                                      /*use_container=*/false, 2,
+                                      /*already_downsampled=*/true);
+}
+
+TEST(RoundtripTest, ExtraBoxesTest) {
+  JxlPixelFormat pixel_format =
+      JxlPixelFormat{4, JXL_TYPE_FLOAT, JXL_NATIVE_ENDIAN, 0};
+  const size_t xsize = 61;
+  const size_t ysize = 71;
+
+  const std::vector<uint8_t> original_bytes =
+      GetTestImage<float>(xsize, ysize, pixel_format);
+  jxl::CodecInOut original_io =
+      ConvertTestImage(original_bytes, xsize, ysize, pixel_format, {});
+
+  JxlEncoder* enc = JxlEncoderCreate(nullptr);
+  EXPECT_NE(nullptr, enc);
+
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderUseContainer(enc, true));
+  JxlBasicInfo basic_info;
+  jxl::test::JxlBasicInfoSetFromPixelFormat(&basic_info, &pixel_format);
+  basic_info.xsize = xsize;
+  basic_info.ysize = ysize;
+  basic_info.uses_original_profile = false;
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetCodestreamLevel(enc, 10));
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetBasicInfo(enc, &basic_info));
+  JxlColorEncoding color_encoding;
+  if (pixel_format.data_type == JXL_TYPE_FLOAT) {
+    JxlColorEncodingSetToLinearSRGB(&color_encoding,
+                                    /*is_gray=*/pixel_format.num_channels < 3);
+  } else {
+    JxlColorEncodingSetToSRGB(&color_encoding,
+                              /*is_gray=*/pixel_format.num_channels < 3);
+  }
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetColorEncoding(enc, &color_encoding));
+  JxlEncoderFrameSettings* frame_settings =
+      JxlEncoderFrameSettingsCreate(enc, nullptr);
+  JxlEncoderSetFrameLossless(frame_settings, false);
+  EXPECT_EQ(JXL_ENC_SUCCESS,
+            JxlEncoderAddImageFrame(frame_settings, &pixel_format,
+                                    (void*)original_bytes.data(),
+                                    original_bytes.size()));
+  JxlEncoderCloseInput(enc);
+
+  std::vector<uint8_t> compressed;
+  EncodeWithEncoder(enc, &compressed);
+  JxlEncoderDestroy(enc);
+
+  std::vector<uint8_t> extra_data(1023);
+  jxl::AppendBoxHeader(jxl::MakeBoxType("crud"), extra_data.size(), false,
+                       &compressed);
+  compressed.insert(compressed.end(), extra_data.begin(), extra_data.end());
+
+  JxlDecoder* dec = JxlDecoderCreate(nullptr);
+  EXPECT_NE(nullptr, dec);
+
+  const uint8_t* next_in = compressed.data();
+  size_t avail_in = compressed.size();
+
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSubscribeEvents(dec, JXL_DEC_BASIC_INFO |
+                                               JXL_DEC_COLOR_ENCODING |
+                                               JXL_DEC_FULL_IMAGE));
+
+  JxlDecoderSetInput(dec, next_in, avail_in);
+  EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec));
+  size_t buffer_size;
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderImageOutBufferSize(dec, &pixel_format, &buffer_size));
+  EXPECT_EQ(buffer_size, original_bytes.size());
+
+  JxlBasicInfo info;
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info));
+  EXPECT_EQ(xsize, info.xsize);
+  EXPECT_EQ(ysize, info.ysize);
+
+  EXPECT_EQ(JXL_DEC_COLOR_ENCODING, JxlDecoderProcessInput(dec));
+
+  size_t icc_profile_size;
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderGetICCProfileSize(dec, &pixel_format,
+                                        JXL_COLOR_PROFILE_TARGET_DATA,
+                                        &icc_profile_size));
+  jxl::PaddedBytes icc_profile(icc_profile_size);
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderGetColorAsICCProfile(
+                dec, &pixel_format, JXL_COLOR_PROFILE_TARGET_DATA,
+                icc_profile.data(), icc_profile.size()));
+
+  std::vector<uint8_t> decoded_bytes(buffer_size);
+
+  EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec));
+
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetImageOutBuffer(dec, &pixel_format,
+                                                         decoded_bytes.data(),
+                                                         decoded_bytes.size()));
+
+  EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec));
+
+  JxlDecoderDestroy(dec);
+
+  jxl::CodecInOut decoded_io =
+      ConvertTestImage(decoded_bytes, xsize, ysize, pixel_format, icc_profile);
+
+  jxl::ButteraugliParams ba;
+  float butteraugli_score = ButteraugliDistance(
+      original_io.frames, decoded_io.frames, ba, jxl::GetJxlCms(),
+      /*distmap=*/nullptr, nullptr);
+  EXPECT_LE(butteraugli_score, 2.0f);
+}
+
+static const unsigned char kEncodedTestProfile[] = {
+    0x1f, 0x8b, 0x1,  0x13, 0x10, 0x0,  0x0,  0x0,  0x20, 0x4c, 0xcc, 0x3,
+    0xe7, 0xa0, 0xa5, 0xa2, 0x90, 0xa4, 0x27, 0xe8, 0x79, 0x1d, 0xe3, 0x26,
+    0x57, 0x54, 0xef, 0x0,  0xe8, 0x97, 0x2,  0xce, 0xa1, 0xd7, 0x85, 0x16,
+    0xb4, 0x29, 0x94, 0x58, 0xf2, 0x56, 0xc0, 0x76, 0xea, 0x23, 0xec, 0x7c,
+    0x73, 0x51, 0x41, 0x40, 0x23, 0x21, 0x95, 0x4,  0x75, 0x12, 0xc9, 0xcc,
+    0x16, 0xbd, 0xb6, 0x99, 0xad, 0xf8, 0x75, 0x35, 0xb6, 0x42, 0xae, 0xae,
+    0xae, 0x86, 0x56, 0xf8, 0xcc, 0x16, 0x30, 0xb3, 0x45, 0xad, 0xd,  0x40,
+    0xd6, 0xd1, 0xd6, 0x99, 0x40, 0xbe, 0xe2, 0xdc, 0x31, 0x7,  0xa6, 0xb9,
+    0x27, 0x92, 0x38, 0x0,  0x3,  0x5e, 0x2c, 0xbe, 0xe6, 0xfb, 0x19, 0xbf,
+    0xf3, 0x6d, 0xbc, 0x4d, 0x64, 0xe5, 0xba, 0x76, 0xde, 0x31, 0x65, 0x66,
+    0x14, 0xa6, 0x3a, 0xc5, 0x8f, 0xb1, 0xb4, 0xba, 0x1f, 0xb1, 0xb8, 0xd4,
+    0x75, 0xba, 0x18, 0x86, 0x95, 0x3c, 0x26, 0xf6, 0x25, 0x62, 0x53, 0xfd,
+    0x9c, 0x94, 0x76, 0xf6, 0x95, 0x2c, 0xb1, 0xfd, 0xdc, 0xc0, 0xe4, 0x3f,
+    0xb3, 0xff, 0x67, 0xde, 0xd5, 0x94, 0xcc, 0xb0, 0x83, 0x2f, 0x28, 0x93,
+    0x92, 0x3,  0xa1, 0x41, 0x64, 0x60, 0x62, 0x70, 0x80, 0x87, 0xaf, 0xe7,
+    0x60, 0x4a, 0x20, 0x23, 0xb3, 0x11, 0x7,  0x38, 0x38, 0xd4, 0xa,  0x66,
+    0xb5, 0x93, 0x41, 0x90, 0x19, 0x17, 0x18, 0x60, 0xa5, 0xb,  0x7a, 0x24,
+    0xaa, 0x20, 0x81, 0xac, 0xa9, 0xa1, 0x70, 0xa6, 0x12, 0x8a, 0x4a, 0xa3,
+    0xa0, 0xf9, 0x9a, 0x97, 0xe7, 0xa8, 0xac, 0x8,  0xa8, 0xc4, 0x2a, 0x86,
+    0xa7, 0x69, 0x1e, 0x67, 0xe6, 0xbe, 0xa4, 0xd3, 0xff, 0x91, 0x61, 0xf6,
+    0x8a, 0xe6, 0xb5, 0xb3, 0x61, 0x9f, 0x19, 0x17, 0x98, 0x27, 0x6b, 0xe9,
+    0x8,  0x98, 0xe1, 0x21, 0x4a, 0x9,  0xb5, 0xd7, 0xca, 0xfa, 0x94, 0xd0,
+    0x69, 0x1a, 0xeb, 0x52, 0x1,  0x4e, 0xf5, 0xf6, 0xdf, 0x7f, 0xe7, 0x29,
+    0x70, 0xee, 0x4,  0xda, 0x2f, 0xa4, 0xff, 0xfe, 0xbb, 0x6f, 0xa8, 0xff,
+    0xfe, 0xdb, 0xaf, 0x8,  0xf6, 0x72, 0xa1, 0x40, 0x5d, 0xf0, 0x2d, 0x8,
+    0x82, 0x5b, 0x87, 0xbd, 0x10, 0x8,  0xe9, 0x7,  0xee, 0x4b, 0x80, 0xda,
+    0x4a, 0x4,  0xc5, 0x5e, 0xa0, 0xb7, 0x1e, 0x60, 0xb0, 0x59, 0x76, 0x60,
+    0xb,  0x2e, 0x19, 0x8a, 0x2e, 0x1c, 0xe6, 0x6,  0x20, 0xb8, 0x64, 0x18,
+    0x2a, 0xcf, 0x51, 0x94, 0xd4, 0xee, 0xc3, 0xfe, 0x39, 0x74, 0xd4, 0x2b,
+    0x48, 0xc9, 0x83, 0x4c, 0x9b, 0xd0, 0x4c, 0x35, 0x10, 0xe3, 0x9,  0xf7,
+    0x72, 0xf0, 0x7a, 0xe,  0xbf, 0x7d, 0x36, 0x2e, 0x19, 0x7e, 0x3f, 0xc,
+    0xf7, 0x93, 0xe7, 0xf4, 0x1d, 0x32, 0xc6, 0xb0, 0x89, 0xad, 0xe0, 0x28,
+    0xc1, 0xa7, 0x59, 0xe3, 0x0,
+};
+
+TEST(RoundtripTest, TestICCProfile) {
+  // JxlEncoderSetICCProfile parses the ICC profile, so a valid profile is
+  // needed. The profile should be passed correctly through the roundtrip.
+  jxl::BitReader reader(jxl::Span<const uint8_t>(kEncodedTestProfile,
+                                                 sizeof(kEncodedTestProfile)));
+  jxl::PaddedBytes icc;
+  ASSERT_TRUE(ReadICC(&reader, &icc));
+  ASSERT_TRUE(reader.Close());
+
+  JxlPixelFormat format =
+      JxlPixelFormat{3, JXL_TYPE_UINT8, JXL_NATIVE_ENDIAN, 0};
+
+  size_t xsize = 25;
+  size_t ysize = 37;
+  const std::vector<uint8_t> original_bytes =
+      GetTestImage<uint8_t>(xsize, ysize, format);
+
+  JxlEncoder* enc = JxlEncoderCreate(nullptr);
+  EXPECT_NE(nullptr, enc);
+
+  JxlBasicInfo basic_info;
+  jxl::test::JxlBasicInfoSetFromPixelFormat(&basic_info, &format);
+  basic_info.xsize = xsize;
+  basic_info.ysize = ysize;
+  basic_info.uses_original_profile = JXL_TRUE;
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetBasicInfo(enc, &basic_info));
+
+  EXPECT_EQ(JXL_ENC_SUCCESS,
+            JxlEncoderSetICCProfile(enc, icc.data(), icc.size()));
+  JxlEncoderFrameSettings* frame_settings =
+      JxlEncoderFrameSettingsCreate(enc, nullptr);
+  EXPECT_EQ(JXL_ENC_SUCCESS,
+            JxlEncoderAddImageFrame(frame_settings, &format,
+                                    (void*)original_bytes.data(),
+                                    original_bytes.size()));
+  JxlEncoderCloseInput(enc);
+
+  std::vector<uint8_t> compressed;
+  EncodeWithEncoder(enc, &compressed);
+  JxlEncoderDestroy(enc);
+
+  JxlDecoder* dec = JxlDecoderCreate(nullptr);
+  EXPECT_NE(nullptr, dec);
+
+  const uint8_t* next_in = compressed.data();
+  size_t avail_in = compressed.size();
+
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSubscribeEvents(dec, JXL_DEC_BASIC_INFO |
+                                               JXL_DEC_COLOR_ENCODING |
+                                               JXL_DEC_FULL_IMAGE));
+
+  JxlDecoderSetInput(dec, next_in, avail_in);
+  EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec));
+  size_t buffer_size;
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderImageOutBufferSize(dec, &format, &buffer_size));
+  EXPECT_EQ(buffer_size, original_bytes.size());
+
+  JxlBasicInfo info;
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info));
+  EXPECT_EQ(xsize, info.xsize);
+  EXPECT_EQ(ysize, info.ysize);
+
+  EXPECT_EQ(JXL_DEC_COLOR_ENCODING, JxlDecoderProcessInput(dec));
+
+  size_t dec_icc_size;
+  EXPECT_EQ(
+      JXL_DEC_SUCCESS,
+      JxlDecoderGetICCProfileSize(
+          dec, &format, JXL_COLOR_PROFILE_TARGET_ORIGINAL, &dec_icc_size));
+  EXPECT_EQ(icc.size(), dec_icc_size);
+  jxl::PaddedBytes dec_icc(dec_icc_size);
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderGetColorAsICCProfile(dec, &format,
+                                           JXL_COLOR_PROFILE_TARGET_ORIGINAL,
+                                           dec_icc.data(), dec_icc.size()));
+
+  std::vector<uint8_t> decoded_bytes(buffer_size);
+
+  EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec));
+
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSetImageOutBuffer(dec, &format, decoded_bytes.data(),
+                                        decoded_bytes.size()));
+
+  EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec));
+
+  EXPECT_EQ(icc, dec_icc);
+
+  JxlDecoderDestroy(dec);
+}
+
+#if JPEGXL_ENABLE_JPEG  // Loading .jpg files requires libjpeg support.
+TEST(RoundtripTest, JXL_TRANSCODE_JPEG_TEST(TestJPEGReconstruction)) {
+  const std::string jpeg_path = "jxl/flower/flower.png.im_q85_420.jpg";
+  const jxl::PaddedBytes orig = jxl::test::ReadTestData(jpeg_path);
+  jxl::CodecInOut orig_io;
+  ASSERT_TRUE(
+      SetFromBytes(jxl::Span<const uint8_t>(orig), &orig_io, /*pool=*/nullptr));
+
+  JxlEncoderPtr enc = JxlEncoderMake(nullptr);
+  JxlEncoderFrameSettings* frame_settings =
+      JxlEncoderFrameSettingsCreate(enc.get(), NULL);
+
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderUseContainer(enc.get(), JXL_TRUE));
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderStoreJPEGMetadata(enc.get(), JXL_TRUE));
+  EXPECT_EQ(JXL_ENC_SUCCESS,
+            JxlEncoderAddJPEGFrame(frame_settings, orig.data(), orig.size()));
+  JxlEncoderCloseInput(enc.get());
+
+  std::vector<uint8_t> compressed;
+  EncodeWithEncoder(enc.get(), &compressed);
+
+  JxlDecoderPtr dec = JxlDecoderMake(nullptr);
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSubscribeEvents(
+                dec.get(), JXL_DEC_JPEG_RECONSTRUCTION | JXL_DEC_FULL_IMAGE));
+  JxlDecoderSetInput(dec.get(), compressed.data(), compressed.size());
+  EXPECT_EQ(JXL_DEC_JPEG_RECONSTRUCTION, JxlDecoderProcessInput(dec.get()));
+  std::vector<uint8_t> reconstructed_buffer(128);
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSetJPEGBuffer(dec.get(), reconstructed_buffer.data(),
+                                    reconstructed_buffer.size()));
+  size_t used = 0;
+  JxlDecoderStatus dec_process_result = JXL_DEC_JPEG_NEED_MORE_OUTPUT;
+  while (dec_process_result == JXL_DEC_JPEG_NEED_MORE_OUTPUT) {
+    used = reconstructed_buffer.size() - JxlDecoderReleaseJPEGBuffer(dec.get());
+    reconstructed_buffer.resize(reconstructed_buffer.size() * 2);
+    EXPECT_EQ(
+        JXL_DEC_SUCCESS,
+        JxlDecoderSetJPEGBuffer(dec.get(), reconstructed_buffer.data() + used,
+                                reconstructed_buffer.size() - used));
+    dec_process_result = JxlDecoderProcessInput(dec.get());
+  }
+  ASSERT_EQ(JXL_DEC_FULL_IMAGE, dec_process_result);
+  used = reconstructed_buffer.size() - JxlDecoderReleaseJPEGBuffer(dec.get());
+  ASSERT_EQ(used, orig.size());
+  EXPECT_EQ(0, memcmp(reconstructed_buffer.data(), orig.data(), used));
+}
+#endif  // JPEGXL_ENABLE_JPEG
diff --git a/third_party/jpeg-xl/lib/jxl/sanitizers.h b/third_party/jpeg-xl/lib/jxl/sanitizers.h
new file mode 100644
index 0000000000..ce0bd8dc63
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/sanitizers.h
@@ -0,0 +1,242 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_SANITIZERS_H_
+#define LIB_JXL_SANITIZERS_H_
+
+#include <inttypes.h>
+#include <stddef.h>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/sanitizer_definitions.h"
+#include "lib/jxl/image.h"
+
+#if JXL_MEMORY_SANITIZER
+#include <stdio.h>
+
+#include <algorithm>
+#include <string>
+#include <vector>
+
+#include "lib/jxl/base/status.h"
+#include "sanitizer/msan_interface.h"
+#endif
+
+namespace jxl {
+namespace msan {
+
+#if JXL_MEMORY_SANITIZER
+
+// Chosen so that kSanitizerSentinel is four copies of kSanitizerSentinelByte.
+constexpr uint8_t kSanitizerSentinelByte = 0x48;
+constexpr float kSanitizerSentinel = 205089.125f;
+
+static JXL_INLINE JXL_MAYBE_UNUSED void PoisonMemory(const volatile void* m,
+                                                     size_t size) {
+  __msan_poison(m, size);
+}
+
+static JXL_INLINE JXL_MAYBE_UNUSED void UnpoisonMemory(const volatile void* m,
+                                                       size_t size) {
+  __msan_unpoison(m, size);
+}
+
+static JXL_INLINE JXL_MAYBE_UNUSED void UnpoisonCStr(const char* c) {
+  do {
+    UnpoisonMemory(c, 1);
+  } while (*c++);
+}
+
+static JXL_INLINE JXL_MAYBE_UNUSED void MemoryIsInitialized(
+    const volatile void* m, size_t size) {
+  __msan_check_mem_is_initialized(m, size);
+}
+
+// Mark all the bytes of an image (including padding) as poisoned bytes.
+static JXL_INLINE JXL_MAYBE_UNUSED void PoisonImage(const PlaneBase& im) {
+  PoisonMemory(im.bytes(), im.bytes_per_row() * im.ysize());
+}
+
+template <typename T>
+static JXL_INLINE JXL_MAYBE_UNUSED void PoisonImage(const Image3<T>& im) {
+  PoisonImage(im.Plane(0));
+  PoisonImage(im.Plane(1));
+  PoisonImage(im.Plane(2));
+}
+
+// Print the uninitialized regions of an image.
+template <typename T>
+static JXL_INLINE JXL_MAYBE_UNUSED void PrintImageUninitialized(
+    const Plane<T>& im) {
+  fprintf(stderr,
+          "Uninitialized regions for image of size %" PRIu64 "x%" PRIu64 ":\n",
+          static_cast<uint64_t>(im.xsize()), static_cast<uint64_t>(im.ysize()));
+
+  // A segment of uninitialized pixels in a row, in the format [first, second).
+  typedef std::pair<size_t, size_t> PixelSegment;
+
+  // Helper class to merge and print a list of rows of PixelSegment that may be
+  // the same over big ranges of rows. This compacts the output to ranges of
+  // rows like "[y0, y1): [x0, x1) [x2, x3)".
+  class RowsMerger {
+   public:
+    // Add a new row the list of rows. If the row is the same as the previous
+    // one it will be merged showing a range of rows [y0, y1), but if the new
+    // row is different the current range of rows (if any) will be printed and a
+    // new one will be started.
+    void AddRow(size_t y, std::vector<PixelSegment>&& new_row) {
+      if (start_y_ != -1 && new_row != segments_) {
+        PrintRow(y);
+      }
+      if (new_row.empty()) {
+        // Skip ranges with no uninitialized pixels.
+        start_y_ = -1;
+        segments_.clear();
+        return;
+      }
+      if (start_y_ == -1) {
+        start_y_ = y;
+        segments_ = std::move(new_row);
+      }
+    }
+
+    // Print the contents of the range of rows [start_y_, end_y) if any.
+    void PrintRow(size_t end_y) {
+      if (start_y_ == -1) return;
+      if (segments_.empty()) {
+        start_y_ = -1;
+        return;
+      }
+      if (end_y - start_y_ > 1) {
+        fprintf(stderr, " y=[%" PRId64 ", %" PRIu64 "):",
+                static_cast<int64_t>(start_y_), static_cast<uint64_t>(end_y));
+      } else {
+        fprintf(stderr, " y=[%" PRId64 "]:", static_cast<int64_t>(start_y_));
+      }
+      for (const auto& seg : segments_) {
+        if (seg.first + 1 == seg.second) {
+          fprintf(stderr, " [%" PRId64 "]", static_cast<int64_t>(seg.first));
+        } else {
+          fprintf(stderr, " [%" PRId64 ", %" PRIu64 ")",
+                  static_cast<int64_t>(seg.first),
+                  static_cast<uint64_t>(seg.second));
+        }
+      }
+      fprintf(stderr, "\n");
+      start_y_ = -1;
+    }
+
+   private:
+    std::vector<PixelSegment> segments_;
+    // Row number of the first row in the range of rows that have |segments| as
+    // the undefined segments.
+    ssize_t start_y_ = -1;
+  } rows_merger;
+
+  class SegmentsMerger {
+   public:
+    void AddValue(size_t x) {
+      if (row.empty() || row.back().second != x) {
+        row.emplace_back(x, x + 1);
+      } else {
+        row.back().second = x + 1;
+      }
+    }
+
+    std::vector<PixelSegment> row;
+  };
+
+  for (size_t y = 0; y < im.ysize(); y++) {
+    auto* row = im.Row(y);
+    SegmentsMerger seg_merger;
+    size_t x = 0;
+    while (x < im.xsize()) {
+      intptr_t ret =
+          __msan_test_shadow(row + x, (im.xsize() - x) * sizeof(row[0]));
+      if (ret < 0) break;
+      size_t next_x = x + ret / sizeof(row[0]);
+      seg_merger.AddValue(next_x);
+      x = next_x + 1;
+    }
+    rows_merger.AddRow(y, std::move(seg_merger.row));
+  }
+  rows_merger.PrintRow(im.ysize());
+}
+
+// Check that all the pixels in the provided rect of the image are initialized
+// (not poisoned). If any of the values is poisoned it will abort.
+template <typename T>
+static JXL_INLINE JXL_MAYBE_UNUSED void CheckImageInitialized(
+    const Plane<T>& im, const Rect& r, size_t c, const char* message) {
+  JXL_ASSERT(r.x0() <= im.xsize());
+  JXL_ASSERT(r.x0() + r.xsize() <= im.xsize());
+  JXL_ASSERT(r.y0() <= im.ysize());
+  JXL_ASSERT(r.y0() + r.ysize() <= im.ysize());
+  for (size_t y = r.y0(); y < r.y0() + r.ysize(); y++) {
+    const auto* row = im.Row(y);
+    intptr_t ret = __msan_test_shadow(row + r.x0(), sizeof(*row) * r.xsize());
+    if (ret != -1) {
+      JXL_DEBUG(
+          1,
+          "Checking an image of %" PRIu64 " x %" PRIu64 ", rect x0=%" PRIu64
+          ", y0=%" PRIu64
+          ", "
+          "xsize=%" PRIu64 ", ysize=%" PRIu64,
+          static_cast<uint64_t>(im.xsize()), static_cast<uint64_t>(im.ysize()),
+          static_cast<uint64_t>(r.x0()), static_cast<uint64_t>(r.y0()),
+          static_cast<uint64_t>(r.xsize()), static_cast<uint64_t>(r.ysize()));
+      size_t x = ret / sizeof(*row);
+      JXL_DEBUG(1,
+                "CheckImageInitialized failed at x=%" PRIu64 ", y=%" PRIu64
+                ", c=%" PRIu64 ": %s",
+                static_cast<uint64_t>(r.x0() + x), static_cast<uint64_t>(y),
+                static_cast<uint64_t>(c), message ? message : "");
+      PrintImageUninitialized(im);
+    }
+    // This will report an error if memory is not initialized.
+    __msan_check_mem_is_initialized(row + r.x0(), sizeof(*row) * r.xsize());
+  }
+}
+
+template <typename T>
+static JXL_INLINE JXL_MAYBE_UNUSED void CheckImageInitialized(
+    const Image3<T>& im, const Rect& r, const char* message) {
+  for (size_t c = 0; c < 3; c++) {
+    std::string str_message(message);
+    str_message += " c=" + std::to_string(c);
+    CheckImageInitialized(im.Plane(c), r, c, str_message.c_str());
+  }
+}
+
+#define JXL_CHECK_IMAGE_INITIALIZED(im, r) \
+  ::jxl::msan::CheckImageInitialized(im, r, "im=" #im ", r=" #r);
+
+#define JXL_CHECK_PLANE_INITIALIZED(im, r, c) \
+  ::jxl::msan::CheckImageInitialized(im, r, c, "im=" #im ", r=" #r ", c=" #c);
+
+#else  // JXL_MEMORY_SANITIZER
+
+// In non-msan mode these functions don't use volatile since it is not needed
+// for the empty functions.
+
+static JXL_INLINE JXL_MAYBE_UNUSED void PoisonMemory(const void*, size_t) {}
+static JXL_INLINE JXL_MAYBE_UNUSED void UnpoisonMemory(const void*, size_t) {}
+static JXL_INLINE JXL_MAYBE_UNUSED void UnpoisonCStr(const char*) {}
+static JXL_INLINE JXL_MAYBE_UNUSED void MemoryIsInitialized(const void*,
+                                                            size_t) {}
+
+static JXL_INLINE JXL_MAYBE_UNUSED void PoisonImage(const PlaneBase& im) {}
+template <typename T>
+static JXL_INLINE JXL_MAYBE_UNUSED void PoisonImage(const Plane<T>& im) {}
+
+#define JXL_CHECK_IMAGE_INITIALIZED(im, r)
+#define JXL_CHECK_PLANE_INITIALIZED(im, r, c)
+
+#endif
+
+}  // namespace msan
+}  // namespace jxl
+
+#endif  // LIB_JXL_SANITIZERS_H_
diff --git a/third_party/jpeg-xl/lib/jxl/simd_util-inl.h b/third_party/jpeg-xl/lib/jxl/simd_util-inl.h
new file mode 100644
index 0000000000..77b207ffe8
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/simd_util-inl.h
@@ -0,0 +1,349 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Misc utilities for SIMD operations
+
+#if defined(LIB_JXL_SIMD_UTIL_INL_H_) == defined(HWY_TARGET_TOGGLE)
+#ifdef LIB_JXL_SIMD_UTIL_INL_H_
+#undef LIB_JXL_SIMD_UTIL_INL_H_
+#else
+#define LIB_JXL_SIMD_UTIL_INL_H_
+#endif
+
+#include <hwy/highway.h>
+
+#include "lib/jxl/base/compiler_specific.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+#if HWY_CAP_GE512
+using hwy::HWY_NAMESPACE::Half;
+using hwy::HWY_NAMESPACE::Vec;
+template <size_t i, class DF, class V>
+HWY_INLINE Vec<Half<Half<DF>>> Quarter(const DF df, V v) {
+  using HF = Half<DF>;
+  using HHF = Half<HF>;
+  auto half = i >= 2 ? UpperHalf(HF(), v) : LowerHalf(HF(), v);
+  return i & 1 ? UpperHalf(HHF(), half) : LowerHalf(HHF(), half);
+}
+
+template <class DF, class V>
+HWY_INLINE Vec<DF> Concat4(const DF df, V v0, V v1, V v2, V v3) {
+  using HF = Half<DF>;
+  return Combine(DF(), Combine(HF(), v3, v2), Combine(HF(), v1, v0));
+}
+
+#endif
+
+// Stores v0[0], v1[0], v0[1], v1[1], ... to mem, in this order. Mem must be
+// aligned.
+template <class DF, class V, typename T>
+void StoreInterleaved(const DF df, V v0, V v1, T* mem) {
+  static_assert(sizeof(T) == 4, "only use StoreInterleaved for 4-byte types");
+#if HWY_TARGET == HWY_SCALAR
+  Store(v0, df, mem);
+  Store(v1, df, mem + 1);
+#elif !HWY_CAP_GE256
+  Store(InterleaveLower(df, v0, v1), df, mem);
+  Store(InterleaveUpper(df, v0, v1), df, mem + Lanes(df));
+#else
+  if (!HWY_CAP_GE512 || Lanes(df) == 8) {
+    auto t0 = InterleaveLower(df, v0, v1);
+    auto t1 = InterleaveUpper(df, v0, v1);
+    Store(ConcatLowerLower(df, t1, t0), df, mem);
+    Store(ConcatUpperUpper(df, t1, t0), df, mem + Lanes(df));
+  } else {
+#if HWY_CAP_GE512
+    auto t0 = InterleaveLower(df, v0, v1);
+    auto t1 = InterleaveUpper(df, v0, v1);
+    Store(Concat4(df, Quarter<0>(df, t0), Quarter<0>(df, t1),
+                  Quarter<1>(df, t0), Quarter<1>(df, t1)),
+          df, mem);
+    Store(Concat4(df, Quarter<2>(df, t0), Quarter<2>(df, t1),
+                  Quarter<3>(df, t0), Quarter<3>(df, t1)),
+          df, mem + Lanes(df));
+#endif
+  }
+#endif
+}
+
+// Stores v0[0], v1[0], v2[0], v3[0], v0[1] ... to mem, in this order. Mem must
+// be aligned.
+template <class DF, class V, typename T>
+void StoreInterleaved(const DF df, V v0, V v1, V v2, V v3, T* mem) {
+  static_assert(sizeof(T) == 4, "only use StoreInterleaved for 4-byte types");
+#if HWY_TARGET == HWY_SCALAR
+  Store(v0, df, mem);
+  Store(v1, df, mem + 1);
+  Store(v2, df, mem + 2);
+  Store(v3, df, mem + 3);
+#elif !HWY_CAP_GE256
+  auto t0 = InterleaveLower(df, v0, v2);
+  auto t1 = InterleaveLower(df, v1, v3);
+  auto t2 = InterleaveUpper(df, v0, v2);
+  auto t3 = InterleaveUpper(df, v1, v3);
+  Store(InterleaveLower(df, t0, t1), df, mem);
+  Store(InterleaveUpper(df, t0, t1), df, mem + Lanes(df));
+  Store(InterleaveLower(df, t2, t3), df, mem + 2 * Lanes(df));
+  Store(InterleaveUpper(df, t2, t3), df, mem + 3 * Lanes(df));
+#elif !HWY_CAP_GE512
+  auto t0 = InterleaveLower(df, v0, v2);
+  auto t1 = InterleaveLower(df, v1, v3);
+  auto t2 = InterleaveUpper(df, v0, v2);
+  auto t3 = InterleaveUpper(df, v1, v3);
+
+  auto m0 = InterleaveLower(df, t0, t1);
+  auto m1 = InterleaveUpper(df, t0, t1);
+  auto m2 = InterleaveLower(df, t2, t3);
+  auto m3 = InterleaveUpper(df, t2, t3);
+
+  Store(ConcatLowerLower(df, m1, m0), df, mem);
+  Store(ConcatLowerLower(df, m3, m2), df, mem + Lanes(df));
+  Store(ConcatUpperUpper(df, m1, m0), df, mem + 2 * Lanes(df));
+  Store(ConcatUpperUpper(df, m3, m2), df, mem + 3 * Lanes(df));
+#else
+  auto t0 = InterleaveLower(df, v0, v2);
+  auto t1 = InterleaveLower(df, v1, v3);
+  auto t2 = InterleaveUpper(df, v0, v2);
+  auto t3 = InterleaveUpper(df, v1, v3);
+
+  auto m0 = InterleaveLower(df, t0, t1);
+  auto m1 = InterleaveUpper(df, t0, t1);
+  auto m2 = InterleaveLower(df, t2, t3);
+  auto m3 = InterleaveUpper(df, t2, t3);
+
+  Store(Concat4(df, Quarter<0>(df, m0), Quarter<0>(df, m1), Quarter<0>(df, m2),
+                Quarter<0>(df, m3)),
+        df, mem);
+  Store(Concat4(df, Quarter<1>(df, m0), Quarter<1>(df, m1), Quarter<1>(df, m2),
+                Quarter<1>(df, m3)),
+        df, mem + Lanes(df));
+  Store(Concat4(df, Quarter<2>(df, m0), Quarter<2>(df, m1), Quarter<2>(df, m2),
+                Quarter<2>(df, m3)),
+        df, mem + 2 * Lanes(df));
+  Store(Concat4(df, Quarter<3>(df, m0), Quarter<3>(df, m1), Quarter<3>(df, m2),
+                Quarter<3>(df, m3)),
+        df, mem + 3 * Lanes(df));
+#endif
+}
+
+// Stores v0[0], v1[0], v2[0], v3[0], v4[0], v5[0], v6[0], v7[0], v0[1] ... to
+// mem, in this order. Mem must be aligned.
+template <class DF, class V>
+void StoreInterleaved(const DF df, V v0, V v1, V v2, V v3, V v4, V v5, V v6,
+                      V v7, float* mem) {
+#if HWY_TARGET == HWY_SCALAR
+  Store(v0, df, mem);
+  Store(v1, df, mem + 1);
+  Store(v2, df, mem + 2);
+  Store(v3, df, mem + 3);
+  Store(v4, df, mem + 4);
+  Store(v5, df, mem + 5);
+  Store(v6, df, mem + 6);
+  Store(v7, df, mem + 7);
+#elif !HWY_CAP_GE256
+  auto t0 = InterleaveLower(df, v0, v4);
+  auto t1 = InterleaveLower(df, v1, v5);
+  auto t2 = InterleaveLower(df, v2, v6);
+  auto t3 = InterleaveLower(df, v3, v7);
+  auto t4 = InterleaveUpper(df, v0, v4);
+  auto t5 = InterleaveUpper(df, v1, v5);
+  auto t6 = InterleaveUpper(df, v2, v6);
+  auto t7 = InterleaveUpper(df, v3, v7);
+
+  auto w0 = InterleaveLower(df, t0, t2);
+  auto w1 = InterleaveLower(df, t1, t3);
+  auto w2 = InterleaveUpper(df, t0, t2);
+  auto w3 = InterleaveUpper(df, t1, t3);
+  auto w4 = InterleaveLower(df, t4, t6);
+  auto w5 = InterleaveLower(df, t5, t7);
+  auto w6 = InterleaveUpper(df, t4, t6);
+  auto w7 = InterleaveUpper(df, t5, t7);
+
+  Store(InterleaveLower(df, w0, w1), df, mem);
+  Store(InterleaveUpper(df, w0, w1), df, mem + Lanes(df));
+  Store(InterleaveLower(df, w2, w3), df, mem + 2 * Lanes(df));
+  Store(InterleaveUpper(df, w2, w3), df, mem + 3 * Lanes(df));
+  Store(InterleaveLower(df, w4, w5), df, mem + 4 * Lanes(df));
+  Store(InterleaveUpper(df, w4, w5), df, mem + 5 * Lanes(df));
+  Store(InterleaveLower(df, w6, w7), df, mem + 6 * Lanes(df));
+  Store(InterleaveUpper(df, w6, w7), df, mem + 7 * Lanes(df));
+#elif !HWY_CAP_GE512
+  auto t0 = InterleaveLower(df, v0, v4);
+  auto t1 = InterleaveLower(df, v1, v5);
+  auto t2 = InterleaveLower(df, v2, v6);
+  auto t3 = InterleaveLower(df, v3, v7);
+  auto t4 = InterleaveUpper(df, v0, v4);
+  auto t5 = InterleaveUpper(df, v1, v5);
+  auto t6 = InterleaveUpper(df, v2, v6);
+  auto t7 = InterleaveUpper(df, v3, v7);
+
+  auto w0 = InterleaveLower(df, t0, t2);
+  auto w1 = InterleaveLower(df, t1, t3);
+  auto w2 = InterleaveUpper(df, t0, t2);
+  auto w3 = InterleaveUpper(df, t1, t3);
+  auto w4 = InterleaveLower(df, t4, t6);
+  auto w5 = InterleaveLower(df, t5, t7);
+  auto w6 = InterleaveUpper(df, t4, t6);
+  auto w7 = InterleaveUpper(df, t5, t7);
+
+  auto m0 = InterleaveLower(df, w0, w1);
+  auto m1 = InterleaveUpper(df, w0, w1);
+  auto m2 = InterleaveLower(df, w2, w3);
+  auto m3 = InterleaveUpper(df, w2, w3);
+  auto m4 = InterleaveLower(df, w4, w5);
+  auto m5 = InterleaveUpper(df, w4, w5);
+  auto m6 = InterleaveLower(df, w6, w7);
+  auto m7 = InterleaveUpper(df, w6, w7);
+
+  Store(ConcatLowerLower(df, m1, m0), df, mem);
+  Store(ConcatLowerLower(df, m3, m2), df, mem + Lanes(df));
+  Store(ConcatLowerLower(df, m5, m4), df, mem + 2 * Lanes(df));
+  Store(ConcatLowerLower(df, m7, m6), df, mem + 3 * Lanes(df));
+  Store(ConcatUpperUpper(df, m1, m0), df, mem + 4 * Lanes(df));
+  Store(ConcatUpperUpper(df, m3, m2), df, mem + 5 * Lanes(df));
+  Store(ConcatUpperUpper(df, m5, m4), df, mem + 6 * Lanes(df));
+  Store(ConcatUpperUpper(df, m7, m6), df, mem + 7 * Lanes(df));
+#else
+  auto t0 = InterleaveLower(df, v0, v4);
+  auto t1 = InterleaveLower(df, v1, v5);
+  auto t2 = InterleaveLower(df, v2, v6);
+  auto t3 = InterleaveLower(df, v3, v7);
+  auto t4 = InterleaveUpper(df, v0, v4);
+  auto t5 = InterleaveUpper(df, v1, v5);
+  auto t6 = InterleaveUpper(df, v2, v6);
+  auto t7 = InterleaveUpper(df, v3, v7);
+
+  auto w0 = InterleaveLower(df, t0, t2);
+  auto w1 = InterleaveLower(df, t1, t3);
+  auto w2 = InterleaveUpper(df, t0, t2);
+  auto w3 = InterleaveUpper(df, t1, t3);
+  auto w4 = InterleaveLower(df, t4, t6);
+  auto w5 = InterleaveLower(df, t5, t7);
+  auto w6 = InterleaveUpper(df, t4, t6);
+  auto w7 = InterleaveUpper(df, t5, t7);
+
+  auto m0 = InterleaveLower(df, w0, w1);
+  auto m1 = InterleaveUpper(df, w0, w1);
+  auto m2 = InterleaveLower(df, w2, w3);
+  auto m3 = InterleaveUpper(df, w2, w3);
+  auto m4 = InterleaveLower(df, w4, w5);
+  auto m5 = InterleaveUpper(df, w4, w5);
+  auto m6 = InterleaveLower(df, w6, w7);
+  auto m7 = InterleaveUpper(df, w6, w7);
+
+  Store(Concat4(df, Quarter<0>(df, m0), Quarter<0>(df, m1), Quarter<0>(df, m2),
+                Quarter<0>(df, m3)),
+        df, mem);
+  Store(Concat4(df, Quarter<0>(df, m4), Quarter<0>(df, m5), Quarter<0>(df, m6),
+                Quarter<0>(df, m7)),
+        df, mem + Lanes(df));
+  Store(Concat4(df, Quarter<1>(df, m0), Quarter<1>(df, m1), Quarter<1>(df, m2),
+                Quarter<1>(df, m3)),
+        df, mem + 2 * Lanes(df));
+  Store(Concat4(df, Quarter<1>(df, m4), Quarter<1>(df, m5), Quarter<1>(df, m6),
+                Quarter<1>(df, m7)),
+        df, mem + 3 * Lanes(df));
+  Store(Concat4(df, Quarter<2>(df, m0), Quarter<2>(df, m1), Quarter<2>(df, m2),
+                Quarter<2>(df, m3)),
+        df, mem + 4 * Lanes(df));
+  Store(Concat4(df, Quarter<2>(df, m4), Quarter<2>(df, m5), Quarter<2>(df, m6),
+                Quarter<2>(df, m7)),
+        df, mem + 5 * Lanes(df));
+  Store(Concat4(df, Quarter<3>(df, m0), Quarter<3>(df, m1), Quarter<3>(df, m2),
+                Quarter<3>(df, m3)),
+        df, mem + 6 * Lanes(df));
+  Store(Concat4(df, Quarter<3>(df, m4), Quarter<3>(df, m5), Quarter<3>(df, m6),
+                Quarter<3>(df, m7)),
+        df, mem + 7 * Lanes(df));
+#endif
+}
+
+#if HWY_CAP_GE256
+JXL_INLINE void Transpose8x8Block(const int32_t* JXL_RESTRICT from,
+                                  int32_t* JXL_RESTRICT to, size_t fromstride) {
+  const HWY_CAPPED(int32_t, 8) d;
+  auto i0 = Load(d, from);
+  auto i1 = Load(d, from + 1 * fromstride);
+  auto i2 = Load(d, from + 2 * fromstride);
+  auto i3 = Load(d, from + 3 * fromstride);
+  auto i4 = Load(d, from + 4 * fromstride);
+  auto i5 = Load(d, from + 5 * fromstride);
+  auto i6 = Load(d, from + 6 * fromstride);
+  auto i7 = Load(d, from + 7 * fromstride);
+
+  const auto q0 = InterleaveLower(d, i0, i2);
+  const auto q1 = InterleaveLower(d, i1, i3);
+  const auto q2 = InterleaveUpper(d, i0, i2);
+  const auto q3 = InterleaveUpper(d, i1, i3);
+  const auto q4 = InterleaveLower(d, i4, i6);
+  const auto q5 = InterleaveLower(d, i5, i7);
+  const auto q6 = InterleaveUpper(d, i4, i6);
+  const auto q7 = InterleaveUpper(d, i5, i7);
+
+  const auto r0 = InterleaveLower(d, q0, q1);
+  const auto r1 = InterleaveUpper(d, q0, q1);
+  const auto r2 = InterleaveLower(d, q2, q3);
+  const auto r3 = InterleaveUpper(d, q2, q3);
+  const auto r4 = InterleaveLower(d, q4, q5);
+  const auto r5 = InterleaveUpper(d, q4, q5);
+  const auto r6 = InterleaveLower(d, q6, q7);
+  const auto r7 = InterleaveUpper(d, q6, q7);
+
+  i0 = ConcatLowerLower(d, r4, r0);
+  i1 = ConcatLowerLower(d, r5, r1);
+  i2 = ConcatLowerLower(d, r6, r2);
+  i3 = ConcatLowerLower(d, r7, r3);
+  i4 = ConcatUpperUpper(d, r4, r0);
+  i5 = ConcatUpperUpper(d, r5, r1);
+  i6 = ConcatUpperUpper(d, r6, r2);
+  i7 = ConcatUpperUpper(d, r7, r3);
+
+  Store(i0, d, to);
+  Store(i1, d, to + 1 * 8);
+  Store(i2, d, to + 2 * 8);
+  Store(i3, d, to + 3 * 8);
+  Store(i4, d, to + 4 * 8);
+  Store(i5, d, to + 5 * 8);
+  Store(i6, d, to + 6 * 8);
+  Store(i7, d, to + 7 * 8);
+}
+#elif HWY_TARGET != HWY_SCALAR
+JXL_INLINE void Transpose8x8Block(const int32_t* JXL_RESTRICT from,
+                                  int32_t* JXL_RESTRICT to, size_t fromstride) {
+  const HWY_CAPPED(int32_t, 4) d;
+  for (size_t n = 0; n < 8; n += 4) {
+    for (size_t m = 0; m < 8; m += 4) {
+      auto p0 = Load(d, from + n * fromstride + m);
+      auto p1 = Load(d, from + (n + 1) * fromstride + m);
+      auto p2 = Load(d, from + (n + 2) * fromstride + m);
+      auto p3 = Load(d, from + (n + 3) * fromstride + m);
+      const auto q0 = InterleaveLower(d, p0, p2);
+      const auto q1 = InterleaveLower(d, p1, p3);
+      const auto q2 = InterleaveUpper(d, p0, p2);
+      const auto q3 = InterleaveUpper(d, p1, p3);
+
+      const auto r0 = InterleaveLower(d, q0, q1);
+      const auto r1 = InterleaveUpper(d, q0, q1);
+      const auto r2 = InterleaveLower(d, q2, q3);
+      const auto r3 = InterleaveUpper(d, q2, q3);
+      Store(r0, d, to + m * 8 + n);
+      Store(r1, d, to + (1 + m) * 8 + n);
+      Store(r2, d, to + (2 + m) * 8 + n);
+      Store(r3, d, to + (3 + m) * 8 + n);
+    }
+  }
+}
+
+#endif
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#endif  // LIB_JXL_SIMD_UTIL_INL_H_
diff --git a/third_party/jpeg-xl/lib/jxl/simd_util_test.cc b/third_party/jpeg-xl/lib/jxl/simd_util_test.cc
new file mode 100644
index 0000000000..b81f5d1279
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/simd_util_test.cc
@@ -0,0 +1,84 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <stdio.h>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/simd_util_test.cc"
+#include <hwy/foreach_target.h>
+
+#include "lib/jxl/simd_util-inl.h"
+
+// Test utils
+#include <hwy/highway.h>
+#include <hwy/tests/test_util-inl.h>
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+namespace {
+
+HWY_NOINLINE void TestInterleave2() {
+  HWY_FULL(float) d;
+  auto vec1 = Iota(d, 0 * 128.0);
+  auto vec2 = Iota(d, 1 * 128.0);
+  HWY_ALIGN float mem[MaxLanes(d) * 2];
+  StoreInterleaved(d, vec1, vec2, mem);
+  for (size_t i = 0; i < Lanes(d); i++) {
+    for (size_t j = 0; j < 2; j++) {
+      EXPECT_EQ(mem[2 * i + j], j * 128 + i) << "i: " << i << " j: " << j;
+    }
+  }
+}
+HWY_NOINLINE void TestInterleave4() {
+  HWY_FULL(float) d;
+  auto vec1 = Iota(d, 0 * 128.0);
+  auto vec2 = Iota(d, 1 * 128.0);
+  auto vec3 = Iota(d, 2 * 128.0);
+  auto vec4 = Iota(d, 3 * 128.0);
+  HWY_ALIGN float mem[MaxLanes(d) * 4];
+  StoreInterleaved(d, vec1, vec2, vec3, vec4, mem);
+  for (size_t i = 0; i < Lanes(d); i++) {
+    for (size_t j = 0; j < 4; j++) {
+      EXPECT_EQ(mem[4 * i + j], j * 128 + i) << "i: " << i << " j: " << j;
+    }
+  }
+}
+HWY_NOINLINE void TestInterleave8() {
+  HWY_FULL(float) d;
+  auto vec1 = Iota(d, 0 * 128.0);
+  auto vec2 = Iota(d, 1 * 128.0);
+  auto vec3 = Iota(d, 2 * 128.0);
+  auto vec4 = Iota(d, 3 * 128.0);
+  auto vec5 = Iota(d, 4 * 128.0);
+  auto vec6 = Iota(d, 5 * 128.0);
+  auto vec7 = Iota(d, 6 * 128.0);
+  auto vec8 = Iota(d, 7 * 128.0);
+  HWY_ALIGN float mem[MaxLanes(d) * 8];
+  StoreInterleaved(d, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, mem);
+  for (size_t i = 0; i < Lanes(d); i++) {
+    for (size_t j = 0; j < 8; j++) {
+      EXPECT_EQ(mem[8 * i + j], j * 128 + i) << "i: " << i << " j: " << j;
+    }
+  }
+}
+
+}  // namespace
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+class SimdUtilTargetTest : public hwy::TestWithParamTarget {};
+HWY_TARGET_INSTANTIATE_TEST_SUITE_P(SimdUtilTargetTest);
+
+HWY_EXPORT_AND_TEST_P(SimdUtilTargetTest, TestInterleave2);
+HWY_EXPORT_AND_TEST_P(SimdUtilTargetTest, TestInterleave4);
+HWY_EXPORT_AND_TEST_P(SimdUtilTargetTest, TestInterleave8);
+
+}  // namespace jxl
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jxl/speed_tier_test.cc b/third_party/jpeg-xl/lib/jxl/speed_tier_test.cc
new file mode 100644
index 0000000000..b3f30c3e4c
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/speed_tier_test.cc
@@ -0,0 +1,108 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <string>
+
+#include "lib/extras/codec.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/padded_bytes.h"
+#include "lib/jxl/codec_in_out.h"
+#include "lib/jxl/enc_butteraugli_comparator.h"
+#include "lib/jxl/enc_cache.h"
+#include "lib/jxl/enc_color_management.h"
+#include "lib/jxl/enc_file.h"
+#include "lib/jxl/enc_params.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/image_test_utils.h"
+#include "lib/jxl/test_utils.h"
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+namespace {
+
+struct SpeedTierTestParams {
+  explicit SpeedTierTestParams(const SpeedTier speed_tier,
+                               const bool shrink8 = false)
+      : speed_tier(speed_tier), shrink8(shrink8) {}
+  SpeedTier speed_tier;
+  bool shrink8;
+};
+
+std::ostream& operator<<(std::ostream& os, SpeedTierTestParams params) {
+  auto previous_flags = os.flags();
+  os << std::boolalpha;
+  os << "SpeedTierTestParams{" << static_cast<size_t>(params.speed_tier)
+     << ", /*shrink8=*/" << params.shrink8 << "}";
+  os.flags(previous_flags);
+  return os;
+}
+
+class SpeedTierTest : public testing::TestWithParam<SpeedTierTestParams> {};
+
+JXL_GTEST_INSTANTIATE_TEST_SUITE_P(
+    SpeedTierTestInstantiation, SpeedTierTest,
+    testing::Values(SpeedTierTestParams{SpeedTier::kCheetah,
+                                        /*shrink8=*/true},
+                    SpeedTierTestParams{SpeedTier::kCheetah,
+                                        /*shrink8=*/false},
+                    SpeedTierTestParams{SpeedTier::kThunder,
+                                        /*shrink8=*/true},
+                    SpeedTierTestParams{SpeedTier::kThunder,
+                                        /*shrink8=*/false},
+                    SpeedTierTestParams{SpeedTier::kLightning,
+                                        /*shrink8=*/true},
+                    SpeedTierTestParams{SpeedTier::kLightning,
+                                        /*shrink8=*/false},
+                    SpeedTierTestParams{SpeedTier::kFalcon,
+                                        /*shrink8=*/true},
+                    SpeedTierTestParams{SpeedTier::kFalcon,
+                                        /*shrink8=*/false},
+                    SpeedTierTestParams{SpeedTier::kHare,
+                                        /*shrink8=*/true},
+                    SpeedTierTestParams{SpeedTier::kHare,
+                                        /*shrink8=*/false},
+                    SpeedTierTestParams{SpeedTier::kWombat,
+                                        /*shrink8=*/true},
+                    SpeedTierTestParams{SpeedTier::kWombat,
+                                        /*shrink8=*/false},
+                    SpeedTierTestParams{SpeedTier::kSquirrel,
+                                        /*shrink8=*/true},
+                    SpeedTierTestParams{SpeedTier::kSquirrel,
+                                        /*shrink8=*/false},
+                    SpeedTierTestParams{SpeedTier::kKitten,
+                                        /*shrink8=*/true},
+                    SpeedTierTestParams{SpeedTier::kKitten,
+                                        /*shrink8=*/false},
+                    // Only downscaled image for Tortoise mode.
+                    SpeedTierTestParams{SpeedTier::kTortoise,
+                                        /*shrink8=*/true}));
+
+TEST_P(SpeedTierTest, Roundtrip) {
+  const PaddedBytes orig = jxl::test::ReadTestData(
+      "external/wesaturate/500px/u76c0g_bliznaca_srgb8.png");
+  CodecInOut io;
+  test::ThreadPoolForTests pool(8);
+  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io, &pool));
+
+  const SpeedTierTestParams& params = GetParam();
+
+  if (params.shrink8) {
+    io.ShrinkTo(io.xsize() / 8, io.ysize() / 8);
+  }
+
+  CompressParams cparams;
+  cparams.speed_tier = params.speed_tier;
+
+  CodecInOut io2;
+  JXL_EXPECT_OK(test::Roundtrip(&io, cparams, {}, &io2, _));
+
+  // Can be 2.2 in non-hare mode.
+  EXPECT_LE(
+      ButteraugliDistance(io.frames, io2.frames, cparams.ba_params, GetJxlCms(),
+                          /*distmap=*/nullptr, /*pool=*/nullptr),
+      2.8);
+}
+}  // namespace
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/splines.cc b/third_party/jpeg-xl/lib/jxl/splines.cc
new file mode 100644
index 0000000000..04d1df8e49
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/splines.cc
@@ -0,0 +1,694 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/splines.h"
+
+#include <algorithm>
+#include <cmath>
+
+#include "lib/jxl/ans_params.h"
+#include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/chroma_from_luma.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/dct_scales.h"
+#include "lib/jxl/entropy_coder.h"
+#include "lib/jxl/opsin_params.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/splines.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/fast_math-inl.h"
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+namespace {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Mul;
+using hwy::HWY_NAMESPACE::MulAdd;
+using hwy::HWY_NAMESPACE::MulSub;
+using hwy::HWY_NAMESPACE::Sqrt;
+using hwy::HWY_NAMESPACE::Sub;
+
+// Given a set of DCT coefficients, this returns the result of performing cosine
+// interpolation on the original samples.
+float ContinuousIDCT(const float dct[32], const float t) {
+  // We compute here the DCT-3 of the `dct` vector, rescaled by a factor of
+  // sqrt(32). This is such that an input vector vector {x, 0, ..., 0} produces
+  // a constant result of x. dct[0] was scaled in Dequantize() to allow uniform
+  // treatment of all the coefficients.
+  constexpr float kMultipliers[32] = {
+      kPi / 32 * 0,  kPi / 32 * 1,  kPi / 32 * 2,  kPi / 32 * 3,  kPi / 32 * 4,
+      kPi / 32 * 5,  kPi / 32 * 6,  kPi / 32 * 7,  kPi / 32 * 8,  kPi / 32 * 9,
+      kPi / 32 * 10, kPi / 32 * 11, kPi / 32 * 12, kPi / 32 * 13, kPi / 32 * 14,
+      kPi / 32 * 15, kPi / 32 * 16, kPi / 32 * 17, kPi / 32 * 18, kPi / 32 * 19,
+      kPi / 32 * 20, kPi / 32 * 21, kPi / 32 * 22, kPi / 32 * 23, kPi / 32 * 24,
+      kPi / 32 * 25, kPi / 32 * 26, kPi / 32 * 27, kPi / 32 * 28, kPi / 32 * 29,
+      kPi / 32 * 30, kPi / 32 * 31,
+  };
+  HWY_CAPPED(float, 32) df;
+  auto result = Zero(df);
+  const auto tandhalf = Set(df, t + 0.5f);
+  for (int i = 0; i < 32; i += Lanes(df)) {
+    auto cos_arg = Mul(LoadU(df, kMultipliers + i), tandhalf);
+    auto cos = FastCosf(df, cos_arg);
+    auto local_res = Mul(LoadU(df, dct + i), cos);
+    result = MulAdd(Set(df, kSqrt2), local_res, result);
+  }
+  return GetLane(SumOfLanes(df, result));
+}
+
+template <typename DF>
+void DrawSegment(DF df, const SplineSegment& segment, const bool add,
+                 const size_t y, const size_t x, float* JXL_RESTRICT rows[3]) {
+  Rebind<int32_t, DF> di;
+  const auto inv_sigma = Set(df, segment.inv_sigma);
+  const auto half = Set(df, 0.5f);
+  const auto one_over_2s2 = Set(df, 0.353553391f);
+  const auto sigma_over_4_times_intensity =
+      Set(df, segment.sigma_over_4_times_intensity);
+  const auto dx = Sub(ConvertTo(df, Iota(di, x)), Set(df, segment.center_x));
+  const auto dy = Set(df, y - segment.center_y);
+  const auto sqd = MulAdd(dx, dx, Mul(dy, dy));
+  const auto distance = Sqrt(sqd);
+  const auto one_dimensional_factor =
+      Sub(FastErff(df, Mul(MulAdd(distance, half, one_over_2s2), inv_sigma)),
+          FastErff(df, Mul(MulSub(distance, half, one_over_2s2), inv_sigma)));
+  auto local_intensity =
+      Mul(sigma_over_4_times_intensity,
+          Mul(one_dimensional_factor, one_dimensional_factor));
+  for (size_t c = 0; c < 3; ++c) {
+    const auto cm = Set(df, add ? segment.color[c] : -segment.color[c]);
+    const auto in = LoadU(df, rows[c] + x);
+    StoreU(MulAdd(cm, local_intensity, in), df, rows[c] + x);
+  }
+}
+
+void DrawSegment(const SplineSegment& segment, const bool add, const size_t y,
+                 const ssize_t x0, ssize_t x1, float* JXL_RESTRICT rows[3]) {
+  ssize_t x =
+      std::max<ssize_t>(x0, segment.center_x - segment.maximum_distance + 0.5f);
+  // one-past-the-end
+  x1 =
+      std::min<ssize_t>(x1, segment.center_x + segment.maximum_distance + 1.5f);
+  HWY_FULL(float) df;
+  for (; x + static_cast<ssize_t>(Lanes(df)) <= x1; x += Lanes(df)) {
+    DrawSegment(df, segment, add, y, x, rows);
+  }
+  for (; x < x1; ++x) {
+    DrawSegment(HWY_CAPPED(float, 1)(), segment, add, y, x, rows);
+  }
+}
+
+void ComputeSegments(const Spline::Point& center, const float intensity,
+                     const float color[3], const float sigma,
+                     std::vector<SplineSegment>& segments,
+                     std::vector<std::pair<size_t, size_t>>& segments_by_y) {
+  // Sanity check sigma, inverse sigma and intensity
+  if (!(std::isfinite(sigma) && sigma != 0.0f && std::isfinite(1.0f / sigma) &&
+        std::isfinite(intensity))) {
+    return;
+  }
+#if JXL_HIGH_PRECISION
+  constexpr float kDistanceExp = 5;
+#else
+  // About 30% faster.
+  constexpr float kDistanceExp = 3;
+#endif
+  // We cap from below colors to at least 0.01.
+  float max_color = 0.01f;
+  for (size_t c = 0; c < 3; c++) {
+    max_color = std::max(max_color, std::abs(color[c] * intensity));
+  }
+  // Distance beyond which max_color*intensity*exp(-d^2 / (2 * sigma^2)) drops
+  // below 10^-kDistanceExp.
+  const float maximum_distance =
+      std::sqrt(-2 * sigma * sigma *
+                (std::log(0.1) * kDistanceExp - std::log(max_color)));
+  SplineSegment segment;
+  segment.center_y = center.y;
+  segment.center_x = center.x;
+  memcpy(segment.color, color, sizeof(segment.color));
+  segment.inv_sigma = 1.0f / sigma;
+  segment.sigma_over_4_times_intensity = .25f * sigma * intensity;
+  segment.maximum_distance = maximum_distance;
+  ssize_t y0 = center.y - maximum_distance + .5f;
+  ssize_t y1 = center.y + maximum_distance + 1.5f;  // one-past-the-end
+  for (ssize_t y = std::max<ssize_t>(y0, 0); y < y1; y++) {
+    segments_by_y.emplace_back(y, segments.size());
+  }
+  segments.push_back(segment);
+}
+
+void DrawSegments(float* JXL_RESTRICT row_x, float* JXL_RESTRICT row_y,
+                  float* JXL_RESTRICT row_b, const Rect& image_rect,
+                  const bool add, const SplineSegment* segments,
+                  const size_t* segment_indices,
+                  const size_t* segment_y_start) {
+  JXL_ASSERT(image_rect.ysize() == 1);
+  float* JXL_RESTRICT rows[3] = {row_x - image_rect.x0(),
+                                 row_y - image_rect.x0(),
+                                 row_b - image_rect.x0()};
+  size_t y = image_rect.y0();
+  for (size_t i = segment_y_start[y]; i < segment_y_start[y + 1]; i++) {
+    DrawSegment(segments[segment_indices[i]], add, y, image_rect.x0(),
+                image_rect.x0() + image_rect.xsize(), rows);
+  }
+}
+
+void SegmentsFromPoints(
+    const Spline& spline,
+    const std::vector<std::pair<Spline::Point, float>>& points_to_draw,
+    const float arc_length, std::vector<SplineSegment>& segments,
+    std::vector<std::pair<size_t, size_t>>& segments_by_y) {
+  const float inv_arc_length = 1.0f / arc_length;
+  int k = 0;
+  for (const auto& point_to_draw : points_to_draw) {
+    const Spline::Point& point = point_to_draw.first;
+    const float multiplier = point_to_draw.second;
+    const float progress_along_arc =
+        std::min(1.f, (k * kDesiredRenderingDistance) * inv_arc_length);
+    ++k;
+    float color[3];
+    for (size_t c = 0; c < 3; ++c) {
+      color[c] =
+          ContinuousIDCT(spline.color_dct[c], (32 - 1) * progress_along_arc);
+    }
+    const float sigma =
+        ContinuousIDCT(spline.sigma_dct, (32 - 1) * progress_along_arc);
+    ComputeSegments(point, multiplier, color, sigma, segments, segments_by_y);
+  }
+}
+}  // namespace
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+HWY_EXPORT(SegmentsFromPoints);
+HWY_EXPORT(DrawSegments);
+
+namespace {
+
+// It is not in spec, but reasonable limit to avoid overflows.
+template <typename T>
+Status ValidateSplinePointPos(const T& x, const T& y) {
+  constexpr T kSplinePosLimit = 1u << 23;
+  if ((x >= kSplinePosLimit) || (x <= -kSplinePosLimit) ||
+      (y >= kSplinePosLimit) || (y <= -kSplinePosLimit)) {
+    return JXL_FAILURE("Spline coordinates out of bounds");
+  }
+  return true;
+}
+
+// Maximum number of spline control points per frame is
+//   std::min(kMaxNumControlPoints, xsize * ysize / 2)
+constexpr size_t kMaxNumControlPoints = 1u << 20u;
+constexpr size_t kMaxNumControlPointsPerPixelRatio = 2;
+
+float AdjustedQuant(const int32_t adjustment) {
+  return (adjustment >= 0) ? (1.f + .125f * adjustment)
+                           : 1.f / (1.f - .125f * adjustment);
+}
+
+float InvAdjustedQuant(const int32_t adjustment) {
+  return (adjustment >= 0) ? 1.f / (1.f + .125f * adjustment)
+                           : (1.f - .125f * adjustment);
+}
+
+// X, Y, B, sigma.
+static constexpr float kChannelWeight[] = {0.0042f, 0.075f, 0.07f, .3333f};
+
+Status DecodeAllStartingPoints(std::vector<Spline::Point>* const points,
+                               BitReader* const br, ANSSymbolReader* reader,
+                               const std::vector<uint8_t>& context_map,
+                               const size_t num_splines) {
+  points->clear();
+  points->reserve(num_splines);
+  int64_t last_x = 0;
+  int64_t last_y = 0;
+  for (size_t i = 0; i < num_splines; i++) {
+    int64_t x =
+        reader->ReadHybridUint(kStartingPositionContext, br, context_map);
+    int64_t y =
+        reader->ReadHybridUint(kStartingPositionContext, br, context_map);
+    if (i != 0) {
+      x = UnpackSigned(x) + last_x;
+      y = UnpackSigned(y) + last_y;
+    }
+    JXL_RETURN_IF_ERROR(ValidateSplinePointPos(x, y));
+    points->emplace_back(static_cast<float>(x), static_cast<float>(y));
+    last_x = x;
+    last_y = y;
+  }
+  return true;
+}
+
+struct Vector {
+  float x, y;
+  Vector operator-() const { return {-x, -y}; }
+  Vector operator+(const Vector& other) const {
+    return {x + other.x, y + other.y};
+  }
+  float SquaredNorm() const { return x * x + y * y; }
+};
+Vector operator*(const float k, const Vector& vec) {
+  return {k * vec.x, k * vec.y};
+}
+
+Spline::Point operator+(const Spline::Point& p, const Vector& vec) {
+  return {p.x + vec.x, p.y + vec.y};
+}
+Vector operator-(const Spline::Point& a, const Spline::Point& b) {
+  return {a.x - b.x, a.y - b.y};
+}
+
+// TODO(eustas): avoid making a copy of "points".
+void DrawCentripetalCatmullRomSpline(std::vector<Spline::Point> points,
+                                     std::vector<Spline::Point>& result) {
+  if (points.empty()) return;
+  if (points.size() == 1) {
+    result.push_back(points[0]);
+    return;
+  }
+  // Number of points to compute between each control point.
+  static constexpr int kNumPoints = 16;
+  result.reserve((points.size() - 1) * kNumPoints + 1);
+  points.insert(points.begin(), points[0] + (points[0] - points[1]));
+  points.push_back(points[points.size() - 1] +
+                   (points[points.size() - 1] - points[points.size() - 2]));
+  // points has at least 4 elements at this point.
+  for (size_t start = 0; start < points.size() - 3; ++start) {
+    // 4 of them are used, and we draw from p[1] to p[2].
+    const Spline::Point* const p = &points[start];
+    result.push_back(p[1]);
+    float d[3];
+    float t[4];
+    t[0] = 0;
+    for (int k = 0; k < 3; ++k) {
+      // TODO(eustas): for each segment delta is calculated 3 times...
+      // TODO(eustas): restrict d[k] with reasonable limit and spec it.
+      d[k] = std::sqrt(hypotf(p[k + 1].x - p[k].x, p[k + 1].y - p[k].y));
+      t[k + 1] = t[k] + d[k];
+    }
+    for (int i = 1; i < kNumPoints; ++i) {
+      const float tt = d[0] + (static_cast<float>(i) / kNumPoints) * d[1];
+      Spline::Point a[3];
+      for (int k = 0; k < 3; ++k) {
+        // TODO(eustas): reciprocal multiplication would be faster.
+        a[k] = p[k] + ((tt - t[k]) / d[k]) * (p[k + 1] - p[k]);
+      }
+      Spline::Point b[2];
+      for (int k = 0; k < 2; ++k) {
+        b[k] = a[k] + ((tt - t[k]) / (d[k] + d[k + 1])) * (a[k + 1] - a[k]);
+      }
+      result.push_back(b[0] + ((tt - t[1]) / d[1]) * (b[1] - b[0]));
+    }
+  }
+  result.push_back(points[points.size() - 2]);
+}
+
+// Move along the line segments defined by `points`, `kDesiredRenderingDistance`
+// pixels at a time, and call `functor` with each point and the actual distance
+// to the previous point (which will always be kDesiredRenderingDistance except
+// possibly for the very last point).
+// TODO(eustas): this method always adds the last point, but never the first
+//               (unless those are one); I believe both ends matter.
+template <typename Points, typename Functor>
+void ForEachEquallySpacedPoint(const Points& points, const Functor& functor) {
+  JXL_ASSERT(!points.empty());
+  Spline::Point current = points.front();
+  functor(current, kDesiredRenderingDistance);
+  auto next = points.begin();
+  while (next != points.end()) {
+    const Spline::Point* previous = &current;
+    float arclength_from_previous = 0.f;
+    for (;;) {
+      if (next == points.end()) {
+        functor(*previous, arclength_from_previous);
+        return;
+      }
+      const float arclength_to_next =
+          std::sqrt((*next - *previous).SquaredNorm());
+      if (arclength_from_previous + arclength_to_next >=
+          kDesiredRenderingDistance) {
+        current =
+            *previous + ((kDesiredRenderingDistance - arclength_from_previous) /
+                         arclength_to_next) *
+                            (*next - *previous);
+        functor(current, kDesiredRenderingDistance);
+        break;
+      }
+      arclength_from_previous += arclength_to_next;
+      previous = &*next;
+      ++next;
+    }
+  }
+}
+
+}  // namespace
+
+QuantizedSpline::QuantizedSpline(const Spline& original,
+                                 const int32_t quantization_adjustment,
+                                 const float y_to_x, const float y_to_b) {
+  JXL_ASSERT(!original.control_points.empty());
+  control_points_.reserve(original.control_points.size() - 1);
+  const Spline::Point& starting_point = original.control_points.front();
+  int previous_x = static_cast<int>(roundf(starting_point.x)),
+      previous_y = static_cast<int>(roundf(starting_point.y));
+  int previous_delta_x = 0, previous_delta_y = 0;
+  for (auto it = original.control_points.begin() + 1;
+       it != original.control_points.end(); ++it) {
+    const int new_x = static_cast<int>(roundf(it->x));
+    const int new_y = static_cast<int>(roundf(it->y));
+    const int new_delta_x = new_x - previous_x;
+    const int new_delta_y = new_y - previous_y;
+    control_points_.emplace_back(new_delta_x - previous_delta_x,
+                                 new_delta_y - previous_delta_y);
+    previous_delta_x = new_delta_x;
+    previous_delta_y = new_delta_y;
+    previous_x = new_x;
+    previous_y = new_y;
+  }
+
+  const auto to_int = [](float v) -> int {
+    return static_cast<int>(roundf(v));
+  };
+
+  const auto quant = AdjustedQuant(quantization_adjustment);
+  const auto inv_quant = InvAdjustedQuant(quantization_adjustment);
+  for (int c : {1, 0, 2}) {
+    float factor = (c == 0) ? y_to_x : (c == 1) ? 0 : y_to_b;
+    for (int i = 0; i < 32; ++i) {
+      const float dct_factor = (i == 0) ? kSqrt2 : 1.0f;
+      const float inv_dct_factor = (i == 0) ? kSqrt0_5 : 1.0f;
+      auto restored_y =
+          color_dct_[1][i] * inv_dct_factor * kChannelWeight[1] * inv_quant;
+      auto decorellated = original.color_dct[c][i] - factor * restored_y;
+      color_dct_[c][i] =
+          to_int(decorellated * dct_factor * quant / kChannelWeight[c]);
+    }
+  }
+  for (int i = 0; i < 32; ++i) {
+    const float dct_factor = (i == 0) ? kSqrt2 : 1.0f;
+    sigma_dct_[i] =
+        to_int(original.sigma_dct[i] * dct_factor * quant / kChannelWeight[3]);
+  }
+}
+
+Status QuantizedSpline::Dequantize(const Spline::Point& starting_point,
+                                   const int32_t quantization_adjustment,
+                                   const float y_to_x, const float y_to_b,
+                                   const uint64_t image_size,
+                                   uint64_t* total_estimated_area_reached,
+                                   Spline& result) const {
+  result.control_points.clear();
+  result.control_points.reserve(control_points_.size() + 1);
+  float px = roundf(starting_point.x);
+  float py = roundf(starting_point.y);
+  JXL_RETURN_IF_ERROR(ValidateSplinePointPos(px, py));
+  int current_x = static_cast<int>(px);
+  int current_y = static_cast<int>(py);
+  result.control_points.push_back(Spline::Point{static_cast<float>(current_x),
+                                                static_cast<float>(current_y)});
+  int current_delta_x = 0, current_delta_y = 0;
+  size_t manhattan_distance = 0;
+  for (const auto& point : control_points_) {
+    current_delta_x += point.first;
+    current_delta_y += point.second;
+    manhattan_distance += abs(current_delta_x) + abs(current_delta_y);
+    JXL_RETURN_IF_ERROR(
+        ValidateSplinePointPos(current_delta_x, current_delta_y));
+    current_x += current_delta_x;
+    current_y += current_delta_y;
+    JXL_RETURN_IF_ERROR(ValidateSplinePointPos(current_x, current_y));
+    result.control_points.push_back(Spline::Point{
+        static_cast<float>(current_x), static_cast<float>(current_y)});
+  }
+
+  const auto inv_quant = InvAdjustedQuant(quantization_adjustment);
+  for (int c = 0; c < 3; ++c) {
+    for (int i = 0; i < 32; ++i) {
+      const float inv_dct_factor = (i == 0) ? kSqrt0_5 : 1.0f;
+      result.color_dct[c][i] =
+          color_dct_[c][i] * inv_dct_factor * kChannelWeight[c] * inv_quant;
+    }
+  }
+  for (int i = 0; i < 32; ++i) {
+    result.color_dct[0][i] += y_to_x * result.color_dct[1][i];
+    result.color_dct[2][i] += y_to_b * result.color_dct[1][i];
+  }
+  uint64_t width_estimate = 0;
+
+  uint64_t color[3] = {};
+  for (int c = 0; c < 3; ++c) {
+    for (int i = 0; i < 32; ++i) {
+      color[c] +=
+          static_cast<uint64_t>(ceil(inv_quant * std::abs(color_dct_[c][i])));
+    }
+  }
+  color[0] += static_cast<uint64_t>(ceil(abs(y_to_x))) * color[1];
+  color[2] += static_cast<uint64_t>(ceil(abs(y_to_b))) * color[1];
+  // This is not taking kChannelWeight into account, but up to constant factors
+  // it gives an indication of the influence of the color values on the area
+  // that will need to be rendered.
+  uint64_t logcolor = std::max(
+      uint64_t(1),
+      static_cast<uint64_t>(CeilLog2Nonzero(
+          uint64_t(1) + std::max(color[1], std::max(color[0], color[2])))));
+
+  for (int i = 0; i < 32; ++i) {
+    const float inv_dct_factor = (i == 0) ? kSqrt0_5 : 1.0f;
+    result.sigma_dct[i] =
+        sigma_dct_[i] * inv_dct_factor * kChannelWeight[3] * inv_quant;
+    // If we include the factor kChannelWeight[3]=.3333f here, we get a
+    // realistic area estimate. We leave it out to simplify the calculations,
+    // and understand that this way we underestimate the area by a factor of
+    // 1/(0.3333*0.3333). This is taken into account in the limits below.
+    uint64_t weight = std::max(
+        uint64_t(1),
+        static_cast<uint64_t>(ceil(inv_quant * std::abs(sigma_dct_[i]))));
+    width_estimate += weight * weight * logcolor;
+  }
+  *total_estimated_area_reached += (width_estimate * manhattan_distance);
+  if (*total_estimated_area_reached >
+      std::min((1024 * image_size + (uint64_t(1) << 32)),
+               (uint64_t(1) << 42))) {
+    return JXL_FAILURE("Too large total_estimated_area_reached: %" PRIu64,
+                       *total_estimated_area_reached);
+  }
+
+  return true;
+}
+
+Status QuantizedSpline::Decode(const std::vector<uint8_t>& context_map,
+                               ANSSymbolReader* const decoder,
+                               BitReader* const br,
+                               const size_t max_control_points,
+                               size_t* total_num_control_points) {
+  const size_t num_control_points =
+      decoder->ReadHybridUint(kNumControlPointsContext, br, context_map);
+  *total_num_control_points += num_control_points;
+  if (*total_num_control_points > max_control_points) {
+    return JXL_FAILURE("Too many control points: %" PRIuS,
+                       *total_num_control_points);
+  }
+  control_points_.resize(num_control_points);
+  // Maximal image dimension.
+  constexpr int64_t kDeltaLimit = 1u << 30;
+  for (std::pair<int64_t, int64_t>& control_point : control_points_) {
+    control_point.first = UnpackSigned(
+        decoder->ReadHybridUint(kControlPointsContext, br, context_map));
+    control_point.second = UnpackSigned(
+        decoder->ReadHybridUint(kControlPointsContext, br, context_map));
+    // Check delta-deltas are not outrageous; it is not in spec, but there is
+    // no reason to allow larger values.
+    if ((control_point.first >= kDeltaLimit) ||
+        (control_point.first <= -kDeltaLimit) ||
+        (control_point.second >= kDeltaLimit) ||
+        (control_point.second <= -kDeltaLimit)) {
+      return JXL_FAILURE("Spline delta-delta is out of bounds");
+    }
+  }
+
+  const auto decode_dct = [decoder, br, &context_map](int dct[32]) -> Status {
+    for (int i = 0; i < 32; ++i) {
+      dct[i] =
+          UnpackSigned(decoder->ReadHybridUint(kDCTContext, br, context_map));
+    }
+    return true;
+  };
+  for (int c = 0; c < 3; ++c) {
+    JXL_RETURN_IF_ERROR(decode_dct(color_dct_[c]));
+  }
+  JXL_RETURN_IF_ERROR(decode_dct(sigma_dct_));
+  return true;
+}
+
+void Splines::Clear() {
+  quantization_adjustment_ = 0;
+  splines_.clear();
+  starting_points_.clear();
+  segments_.clear();
+  segment_indices_.clear();
+  segment_y_start_.clear();
+}
+
+Status Splines::Decode(jxl::BitReader* br, const size_t num_pixels) {
+  std::vector<uint8_t> context_map;
+  ANSCode code;
+  JXL_RETURN_IF_ERROR(
+      DecodeHistograms(br, kNumSplineContexts, &code, &context_map));
+  ANSSymbolReader decoder(&code, br);
+  const size_t num_splines =
+      1 + decoder.ReadHybridUint(kNumSplinesContext, br, context_map);
+  size_t max_control_points = std::min(
+      kMaxNumControlPoints, num_pixels / kMaxNumControlPointsPerPixelRatio);
+  if (num_splines > max_control_points) {
+    return JXL_FAILURE("Too many splines: %" PRIuS, num_splines);
+  }
+  JXL_RETURN_IF_ERROR(DecodeAllStartingPoints(&starting_points_, br, &decoder,
+                                              context_map, num_splines));
+
+  quantization_adjustment_ = UnpackSigned(
+      decoder.ReadHybridUint(kQuantizationAdjustmentContext, br, context_map));
+
+  splines_.clear();
+  splines_.reserve(num_splines);
+  size_t num_control_points = num_splines;
+  for (size_t i = 0; i < num_splines; ++i) {
+    QuantizedSpline spline;
+    JXL_RETURN_IF_ERROR(spline.Decode(context_map, &decoder, br,
+                                      max_control_points, &num_control_points));
+    splines_.push_back(std::move(spline));
+  }
+
+  JXL_RETURN_IF_ERROR(decoder.CheckANSFinalState());
+
+  if (!HasAny()) {
+    return JXL_FAILURE("Decoded splines but got none");
+  }
+
+  return true;
+}
+
+void Splines::AddTo(Image3F* const opsin, const Rect& opsin_rect,
+                    const Rect& image_rect) const {
+  return Apply</*add=*/true>(opsin, opsin_rect, image_rect);
+}
+void Splines::AddToRow(float* JXL_RESTRICT row_x, float* JXL_RESTRICT row_y,
+                       float* JXL_RESTRICT row_b, const Rect& image_row) const {
+  return ApplyToRow</*add=*/true>(row_x, row_y, row_b, image_row);
+}
+
+void Splines::SubtractFrom(Image3F* const opsin) const {
+  return Apply</*add=*/false>(opsin, Rect(*opsin), Rect(*opsin));
+}
+
+Status Splines::InitializeDrawCache(const size_t image_xsize,
+                                    const size_t image_ysize,
+                                    const ColorCorrelationMap& cmap) {
+  // TODO(veluca): avoid storing segments that are entirely outside image
+  // boundaries.
+  segments_.clear();
+  segment_indices_.clear();
+  segment_y_start_.clear();
+  std::vector<std::pair<size_t, size_t>> segments_by_y;
+  std::vector<Spline::Point> intermediate_points;
+  uint64_t total_estimated_area_reached = 0;
+  std::vector<Spline> splines;
+  for (size_t i = 0; i < splines_.size(); ++i) {
+    Spline spline;
+    JXL_RETURN_IF_ERROR(splines_[i].Dequantize(
+        starting_points_[i], quantization_adjustment_, cmap.YtoXRatio(0),
+        cmap.YtoBRatio(0), image_xsize * image_ysize,
+        &total_estimated_area_reached, spline));
+    if (std::adjacent_find(spline.control_points.begin(),
+                           spline.control_points.end()) !=
+        spline.control_points.end()) {
+      // Otherwise division by zero might occur. Once control points coincide,
+      // the direction of curve is undefined...
+      return JXL_FAILURE(
+          "identical successive control points in spline %" PRIuS, i);
+    }
+    splines.push_back(spline);
+  }
+  // TODO(firsching) Change this into a JXL_FAILURE for level 5 codestreams.
+  if (total_estimated_area_reached >
+      std::min((8 * image_xsize * image_ysize + (uint64_t(1) << 25)),
+               (uint64_t(1) << 30))) {
+    JXL_WARNING(
+        "Large total_estimated_area_reached, expect slower decoding: %" PRIu64,
+        total_estimated_area_reached);
+  }
+
+  for (Spline& spline : splines) {
+    std::vector<std::pair<Spline::Point, float>> points_to_draw;
+    auto add_point = [&](const Spline::Point& point, const float multiplier) {
+      points_to_draw.emplace_back(point, multiplier);
+    };
+    intermediate_points.clear();
+    DrawCentripetalCatmullRomSpline(spline.control_points, intermediate_points);
+    ForEachEquallySpacedPoint(intermediate_points, add_point);
+    const float arc_length =
+        (points_to_draw.size() - 2) * kDesiredRenderingDistance +
+        points_to_draw.back().second;
+    if (arc_length <= 0.f) {
+      // This spline wouldn't have any effect.
+      continue;
+    }
+    HWY_DYNAMIC_DISPATCH(SegmentsFromPoints)
+    (spline, points_to_draw, arc_length, segments_, segments_by_y);
+  }
+
+  // TODO(eustas): consider linear sorting here.
+  std::sort(segments_by_y.begin(), segments_by_y.end());
+  segment_indices_.resize(segments_by_y.size());
+  segment_y_start_.resize(image_ysize + 1);
+  for (size_t i = 0; i < segments_by_y.size(); i++) {
+    segment_indices_[i] = segments_by_y[i].second;
+    size_t y = segments_by_y[i].first;
+    if (y < image_ysize) {
+      segment_y_start_[y + 1]++;
+    }
+  }
+  for (size_t y = 0; y < image_ysize; y++) {
+    segment_y_start_[y + 1] += segment_y_start_[y];
+  }
+  return true;
+}
+
+template <bool add>
+void Splines::ApplyToRow(float* JXL_RESTRICT row_x, float* JXL_RESTRICT row_y,
+                         float* JXL_RESTRICT row_b,
+                         const Rect& image_row) const {
+  if (segments_.empty()) return;
+  JXL_ASSERT(image_row.ysize() == 1);
+  for (size_t iy = 0; iy < image_row.ysize(); iy++) {
+    HWY_DYNAMIC_DISPATCH(DrawSegments)
+    (row_x, row_y, row_b, image_row.Line(iy), add, segments_.data(),
+     segment_indices_.data(), segment_y_start_.data());
+  }
+}
+
+template <bool add>
+void Splines::Apply(Image3F* const opsin, const Rect& opsin_rect,
+                    const Rect& image_rect) const {
+  if (segments_.empty()) return;
+  for (size_t iy = 0; iy < image_rect.ysize(); iy++) {
+    const size_t y0 = opsin_rect.Line(iy).y0();
+    const size_t x0 = opsin_rect.x0();
+    ApplyToRow<add>(opsin->PlaneRow(0, y0) + x0, opsin->PlaneRow(1, y0) + x0,
+                    opsin->PlaneRow(2, y0) + x0, image_rect.Line(iy));
+  }
+}
+
+}  // namespace jxl
+#endif  // HWY_ONCE
diff --git a/third_party/jpeg-xl/lib/jxl/splines.h b/third_party/jpeg-xl/lib/jxl/splines.h
new file mode 100644
index 0000000000..c8dad3417c
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/splines.h
@@ -0,0 +1,148 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_SPLINES_H_
+#define LIB_JXL_SPLINES_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <utility>
+#include <vector>
+
+#include "lib/jxl/ans_params.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/chroma_from_luma.h"
+#include "lib/jxl/dec_ans.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/entropy_coder.h"
+#include "lib/jxl/image.h"
+
+namespace jxl {
+
+static constexpr float kDesiredRenderingDistance = 1.f;
+
+enum SplineEntropyContexts : size_t {
+  kQuantizationAdjustmentContext = 0,
+  kStartingPositionContext,
+  kNumSplinesContext,
+  kNumControlPointsContext,
+  kControlPointsContext,
+  kDCTContext,
+  kNumSplineContexts
+};
+
+struct Spline {
+  struct Point {
+    Point() : x(0.0f), y(0.0f) {}
+    Point(float x, float y) : x(x), y(y) {}
+    float x, y;
+    bool operator==(const Point& other) const {
+      return std::fabs(x - other.x) < 1e-3f && std::fabs(y - other.y) < 1e-3f;
+    }
+  };
+  std::vector<Point> control_points;
+  // X, Y, B.
+  float color_dct[3][32];
+  // Splines are draws by normalized Gaussian splatting. This controls the
+  // Gaussian's parameter along the spline.
+  float sigma_dct[32];
+};
+
+class QuantizedSplineEncoder;
+
+class QuantizedSpline {
+ public:
+  QuantizedSpline() = default;
+  explicit QuantizedSpline(const Spline& original,
+                           int32_t quantization_adjustment, float y_to_x,
+                           float y_to_b);
+
+  Status Dequantize(const Spline::Point& starting_point,
+                    int32_t quantization_adjustment, float y_to_x, float y_to_b,
+                    uint64_t image_size, uint64_t* total_estimated_area_reached,
+                    Spline& result) const;
+
+  Status Decode(const std::vector<uint8_t>& context_map,
+                ANSSymbolReader* decoder, BitReader* br,
+                size_t max_control_points, size_t* total_num_control_points);
+
+ private:
+  friend class QuantizedSplineEncoder;
+
+  std::vector<std::pair<int64_t, int64_t>>
+      control_points_;  // Double delta-encoded.
+  int color_dct_[3][32] = {};
+  int sigma_dct_[32] = {};
+};
+
+// A single "drawable unit" of a spline, i.e. a line of the region in which we
+// render each Gaussian. The structure doesn't actually depend on the exact
+// row, which allows reuse for different y values (which are tracked
+// separately).
+struct SplineSegment {
+  float center_x, center_y;
+  float maximum_distance;
+  float inv_sigma;
+  float sigma_over_4_times_intensity;
+  float color[3];
+};
+
+class Splines {
+ public:
+  Splines() = default;
+  explicit Splines(const int32_t quantization_adjustment,
+                   std::vector<QuantizedSpline> splines,
+                   std::vector<Spline::Point> starting_points)
+      : quantization_adjustment_(quantization_adjustment),
+        splines_(std::move(splines)),
+        starting_points_(std::move(starting_points)) {}
+
+  bool HasAny() const { return !splines_.empty(); }
+
+  void Clear();
+
+  Status Decode(BitReader* br, size_t num_pixels);
+
+  void AddTo(Image3F* opsin, const Rect& opsin_rect,
+             const Rect& image_rect) const;
+  void AddToRow(float* JXL_RESTRICT row_x, float* JXL_RESTRICT row_y,
+                float* JXL_RESTRICT row_b, const Rect& image_row) const;
+  void SubtractFrom(Image3F* opsin) const;
+
+  const std::vector<QuantizedSpline>& QuantizedSplines() const {
+    return splines_;
+  }
+  const std::vector<Spline::Point>& StartingPoints() const {
+    return starting_points_;
+  }
+
+  int32_t GetQuantizationAdjustment() const { return quantization_adjustment_; }
+
+  Status InitializeDrawCache(size_t image_xsize, size_t image_ysize,
+                             const ColorCorrelationMap& cmap);
+
+ private:
+  template <bool>
+  void ApplyToRow(float* JXL_RESTRICT row_x, float* JXL_RESTRICT row_y,
+                  float* JXL_RESTRICT row_b, const Rect& image_row) const;
+  template <bool>
+  void Apply(Image3F* opsin, const Rect& opsin_rect,
+             const Rect& image_rect) const;
+
+  // If positive, quantization weights are multiplied by 1 + this/8, which
+  // increases precision. If negative, they are divided by 1 - this/8. If 0,
+  // they are unchanged.
+  int32_t quantization_adjustment_ = 0;
+  std::vector<QuantizedSpline> splines_;
+  std::vector<Spline::Point> starting_points_;
+  std::vector<SplineSegment> segments_;
+  std::vector<size_t> segment_indices_;
+  std::vector<size_t> segment_y_start_;
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_SPLINES_H_
diff --git a/third_party/jpeg-xl/lib/jxl/splines_gbench.cc b/third_party/jpeg-xl/lib/jxl/splines_gbench.cc
new file mode 100644
index 0000000000..78ff6d41c0
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/splines_gbench.cc
@@ -0,0 +1,52 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "benchmark/benchmark.h"
+#include "lib/jxl/splines.h"
+
+namespace jxl {
+namespace {
+
+constexpr int kQuantizationAdjustment = 0;
+const ColorCorrelationMap* const cmap = new ColorCorrelationMap;
+const float kYToX = cmap->YtoXRatio(0);
+const float kYToB = cmap->YtoBRatio(0);
+
+void BM_Splines(benchmark::State& state) {
+  const size_t n = state.range();
+
+  std::vector<Spline> spline_data = {
+      {/*control_points=*/{
+           {9, 54}, {118, 159}, {97, 3}, {10, 40}, {150, 25}, {120, 300}},
+       /*color_dct=*/
+       {{0.03125f, 0.00625f, 0.003125f}, {1.f, 0.321875f}, {1.f, 0.24375f}},
+       /*sigma_dct=*/{0.3125f, 0.f, 0.f, 0.0625f}}};
+  std::vector<QuantizedSpline> quantized_splines;
+  std::vector<Spline::Point> starting_points;
+  for (const Spline& spline : spline_data) {
+    quantized_splines.emplace_back(spline, kQuantizationAdjustment, kYToX,
+                                   kYToB);
+    starting_points.push_back(spline.control_points.front());
+  }
+  Splines splines(kQuantizationAdjustment, std::move(quantized_splines),
+                  std::move(starting_points));
+
+  Image3F drawing_area(320, 320);
+  ZeroFillImage(&drawing_area);
+  for (auto _ : state) {
+    for (size_t i = 0; i < n; ++i) {
+      JXL_CHECK(splines.InitializeDrawCache(drawing_area.xsize(),
+                                            drawing_area.ysize(), *cmap));
+      splines.AddTo(&drawing_area, Rect(drawing_area), Rect(drawing_area));
+    }
+  }
+
+  state.SetItemsProcessed(n * state.iterations());
+}
+
+BENCHMARK(BM_Splines)->Range(1, 1 << 10);
+
+}  // namespace
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/splines_test.cc b/third_party/jpeg-xl/lib/jxl/splines_test.cc
new file mode 100644
index 0000000000..8d6bc7ed1c
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/splines_test.cc
@@ -0,0 +1,348 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/splines.h"
+
+#include "lib/extras/codec.h"
+#include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/enc_aux_out.h"
+#include "lib/jxl/enc_butteraugli_comparator.h"
+#include "lib/jxl/enc_color_management.h"
+#include "lib/jxl/enc_splines.h"
+#include "lib/jxl/image_test_utils.h"
+#include "lib/jxl/test_utils.h"
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+
+std::ostream& operator<<(std::ostream& os, const Spline::Point& p) {
+  return os << "(" << p.x << ", " << p.y << ")";
+}
+
+std::ostream& operator<<(std::ostream& os, const Spline& spline) {
+  return os << "(spline with " << spline.control_points.size()
+            << " control points)";
+}
+
+namespace {
+
+using ::testing::AllOf;
+using ::testing::Field;
+using ::testing::FloatNear;
+using ::testing::Pointwise;
+
+constexpr int kQuantizationAdjustment = 0;
+const ColorCorrelationMap* const cmap = new ColorCorrelationMap;
+const float kYToX = cmap->YtoXRatio(0);
+const float kYToB = cmap->YtoBRatio(0);
+
+constexpr float kTolerance = 0.003125;
+
+std::vector<Spline> DequantizeSplines(const Splines& splines) {
+  const auto& quantized_splines = splines.QuantizedSplines();
+  const auto& starting_points = splines.StartingPoints();
+  JXL_CHECK(quantized_splines.size() == starting_points.size());
+
+  std::vector<Spline> dequantized;
+  uint64_t total = 0;
+  for (size_t i = 0; i < quantized_splines.size(); ++i) {
+    dequantized.emplace_back();
+    JXL_CHECK(quantized_splines[i].Dequantize(
+        starting_points[i], kQuantizationAdjustment, kYToX, kYToB, 2u << 30u,
+        &total, dequantized.back()));
+  }
+  return dequantized;
+}
+
+MATCHER(ControlPointIs, "") {
+  const Spline::Point& actual = std::get<0>(arg);
+  const Spline::Point& expected = std::get<1>(arg);
+  return testing::ExplainMatchResult(
+      AllOf(Field(&Spline::Point::x, FloatNear(expected.x, kTolerance)),
+            Field(&Spline::Point::y, FloatNear(expected.y, kTolerance))),
+      actual, result_listener);
+}
+
+MATCHER(ControlPointsMatch, "") {
+  const Spline& actual = std::get<0>(arg);
+  const Spline& expected = std::get<1>(arg);
+  return testing::ExplainMatchResult(
+      Field(&Spline::control_points,
+            Pointwise(ControlPointIs(), expected.control_points)),
+      actual, result_listener);
+}
+
+MATCHER(SplinesMatch, "") {
+  const Spline& actual = std::get<0>(arg);
+  const Spline& expected = std::get<1>(arg);
+  if (!testing::ExplainMatchResult(ControlPointsMatch(), arg,
+                                   result_listener)) {
+    return false;
+  }
+  for (int i = 0; i < 3; ++i) {
+    size_t color_dct_size =
+        sizeof(expected.color_dct[i]) / sizeof(expected.color_dct[i][0]);
+    for (size_t j = 0; j < color_dct_size; j++) {
+      testing::StringMatchResultListener color_dct_listener;
+      if (!testing::ExplainMatchResult(
+              FloatNear(expected.color_dct[i][j], kTolerance),
+              actual.color_dct[i][j], &color_dct_listener)) {
+        *result_listener << ", where color_dct[" << i << "][" << j
+                         << "] don't match, " << color_dct_listener.str();
+        return false;
+      }
+    }
+  }
+  size_t sigma_dct_size =
+      sizeof(expected.sigma_dct) / sizeof(expected.sigma_dct[0]);
+  for (size_t i = 0; i < sigma_dct_size; i++) {
+    testing::StringMatchResultListener sigma_listener;
+    if (!testing::ExplainMatchResult(
+            FloatNear(expected.sigma_dct[i], kTolerance), actual.sigma_dct[i],
+            &sigma_listener)) {
+      *result_listener << ", where sigma_dct[" << i << "] don't match, "
+                       << sigma_listener.str();
+      return false;
+    }
+  }
+  return true;
+}
+
+}  // namespace
+
+TEST(SplinesTest, Serialization) {
+  std::vector<Spline> spline_data = {
+      {/*control_points=*/{
+           {109, 54}, {218, 159}, {80, 3}, {110, 274}, {94, 185}, {17, 277}},
+       /*color_dct=*/
+       {{36.3, 39.7, 23.2, 67.5, 4.4,  71.5, 62.3, 32.3, 92.2, 10.1, 10.8,
+         9.2,  6.1,  10.5, 79.1, 7,    24.6, 90.8, 5.5,  84,   43.8, 49,
+         33.5, 78.9, 54.5, 77.9, 62.1, 51.4, 36.4, 14.3, 83.7, 35.4},
+        {9.4,  53.4, 9.5,  74.9, 72.7, 26.7, 7.9,  0.9, 84.9, 23.2, 26.5,
+         31.1, 91,   11.7, 74.1, 39.3, 23.7, 82.5, 4.8, 2.7,  61.2, 96.4,
+         13.7, 66.7, 62.9, 82.4, 5.9,  98.7, 21.5, 7.9, 51.7, 63.1},
+        {48,   39.3, 6.9,  26.3, 33.3, 6.2,  1.7,  98.9, 59.9, 59.6, 95,
+         61.3, 82.7, 53,   6.1,  30.4, 34.7, 96.9, 93.4, 17,   38.8, 80.8,
+         63,   18.6, 43.6, 32.3, 61,   20.2, 24.3, 28.3, 69.1, 62.4}},
+       /*sigma_dct=*/{32.7, 21.5, 44.4, 1.8,  45.8, 90.6, 29.3, 59.2,
+                      23.7, 85.2, 84.8, 27.2, 42.1, 84.1, 50.6, 17.6,
+                      93.7, 4.9,  2.6,  69.8, 94.9, 52,   24.3, 18.8,
+                      12.1, 95.7, 28.5, 81.4, 89.9, 31.4, 74.8, 52}},
+      {/*control_points=*/{{172, 309},
+                           {196, 277},
+                           {42, 238},
+                           {114, 350},
+                           {307, 290},
+                           {316, 269},
+                           {124, 66},
+                           {233, 267}},
+       /*color_dct=*/
+       {{15,   28.9, 22, 6.6,  41.8, 83,   8.6,  56.8, 68.9, 9.7,  5.4,
+         19.8, 70.8, 90, 52.5, 65.2, 7.8,  23.5, 26.4, 72.2, 64.7, 87.1,
+         1.3,  67.5, 46, 68.4, 65.4, 35.5, 29.1, 13,   41.6, 23.9},
+        {47.7, 79.4, 62.7, 29.1, 96.8, 18.5, 17.6, 15.2, 80.5, 56,  96.2,
+         59.9, 26.7, 96.1, 92.3, 42.1, 35.8, 54,   23.2, 55,   76,  35.8,
+         58.4, 88.7, 2.4,  78.1, 95.6, 27.5, 6.6,  78.5, 24.1, 69.8},
+        {43.8, 96.5, 0.9,  95.1, 49.1, 71.2, 25.1, 33.6, 75.2, 95,  82.1,
+         19.7, 10.5, 44.9, 50,   93.3, 83.5, 99.5, 64.6, 54,   3.5, 99.7,
+         45.3, 82.1, 22.4, 37.9, 60,   32.2, 12.6, 4.6,  65.5, 96.4}},
+       /*sigma_dct=*/{72.5, 2.6,  41.7, 2.2,  39.7, 79.1, 69.6, 19.9,
+                      92.3, 71.5, 41.9, 62.1, 30,   49.4, 70.3, 45.3,
+                      62.5, 47.2, 46.7, 41.2, 90.8, 46.8, 91.2, 55,
+                      8.1,  69.6, 25.4, 84.7, 61.7, 27.6, 3.7,  46.9}},
+      {/*control_points=*/{{100, 186},
+                           {257, 97},
+                           {170, 49},
+                           {25, 169},
+                           {309, 104},
+                           {232, 237},
+                           {385, 101},
+                           {122, 168},
+                           {26, 300},
+                           {390, 88}},
+       /*color_dct=*/
+       {{16.9, 64.8, 4.2,  10.6, 23.5, 17,   79.3, 5.7,  60.4, 16.6, 94.9,
+         63.7, 87.6, 10.5, 3.8,  61.1, 22.9, 81.9, 80.4, 40.5, 45.9, 25.4,
+         39.8, 30,   50.2, 90.4, 27.9, 93.7, 65.1, 48.2, 22.3, 43.9},
+        {24.9, 66,   3.5,  90.2, 97.1, 15.8, 35.6, 0.6,  68,   39.6, 24.4,
+         85.9, 57.7, 77.6, 47.5, 67.9, 4.3,  5.4,  91.2, 58.5, 0.1,  52.2,
+         3.5,  47.8, 63.2, 43.5, 85.8, 35.8, 50.2, 35.9, 19.2, 48.2},
+        {82.8, 44.9, 76.4, 39.5, 94.1, 14.3, 89.8, 10,   10.5, 74.5, 56.3,
+         65.8, 7.8,  23.3, 52.8, 99.3, 56.8, 46,   76.7, 13.5, 67,   22.4,
+         29.9, 43.3, 70.3, 26,   74.3, 53.9, 62,   19.1, 49.3, 46.7}},
+       /*sigma_dct=*/{83.5, 1.7,  25.1, 18.7, 46.5, 75.3, 28,   62.3,
+                      50.3, 23.3, 85.6, 96,   45.8, 33.1, 33.4, 52.9,
+                      26.3, 58.5, 19.6, 70,   92.6, 22.5, 57,   21.6,
+                      76.8, 87.5, 22.9, 66.3, 35.7, 35.6, 56.8, 67.2}},
+  };
+
+  std::vector<QuantizedSpline> quantized_splines;
+  std::vector<Spline::Point> starting_points;
+  for (const Spline& spline : spline_data) {
+    quantized_splines.emplace_back(spline, kQuantizationAdjustment, kYToX,
+                                   kYToB);
+    starting_points.push_back(spline.control_points.front());
+  }
+
+  Splines splines(kQuantizationAdjustment, std::move(quantized_splines),
+                  std::move(starting_points));
+  const std::vector<Spline> quantized_spline_data = DequantizeSplines(splines);
+  EXPECT_THAT(quantized_spline_data,
+              Pointwise(ControlPointsMatch(), spline_data));
+
+  BitWriter writer;
+  EncodeSplines(splines, &writer, kLayerSplines, HistogramParams(), nullptr);
+  writer.ZeroPadToByte();
+  const size_t bits_written = writer.BitsWritten();
+
+  printf("Wrote %" PRIuS " bits of splines.\n", bits_written);
+
+  BitReader reader(writer.GetSpan());
+  Splines decoded_splines;
+  ASSERT_TRUE(decoded_splines.Decode(&reader, /*num_pixels=*/1000));
+  ASSERT_TRUE(reader.JumpToByteBoundary());
+  EXPECT_EQ(reader.TotalBitsConsumed(), bits_written);
+  ASSERT_TRUE(reader.Close());
+
+  const std::vector<Spline> decoded_spline_data =
+      DequantizeSplines(decoded_splines);
+  EXPECT_THAT(decoded_spline_data,
+              Pointwise(SplinesMatch(), quantized_spline_data));
+}
+
+#ifdef JXL_CRASH_ON_ERROR
+TEST(SplinesTest, DISABLED_TooManySplinesTest) {
+#else
+TEST(SplinesTest, TooManySplinesTest) {
+#endif
+  // This is more than the limit for 1000 pixels.
+  const size_t kNumSplines = 300;
+
+  std::vector<QuantizedSpline> quantized_splines;
+  std::vector<Spline::Point> starting_points;
+  for (size_t i = 0; i < kNumSplines; i++) {
+    Spline spline = {
+        /*control_points=*/{{1.f + i, 2}, {10.f + i, 25}, {30.f + i, 300}},
+        /*color_dct=*/
+        {{1.f, 0.2f, 0.1f}, {35.7f, 10.3f}, {35.7f, 7.8f}},
+        /*sigma_dct=*/{10.f, 0.f, 0.f, 2.f}};
+    quantized_splines.emplace_back(spline, kQuantizationAdjustment, kYToX,
+                                   kYToB);
+    starting_points.push_back(spline.control_points.front());
+  }
+
+  Splines splines(kQuantizationAdjustment, std::move(quantized_splines),
+                  std::move(starting_points));
+  BitWriter writer;
+  EncodeSplines(splines, &writer, kLayerSplines,
+                HistogramParams(SpeedTier::kFalcon, 1), nullptr);
+  writer.ZeroPadToByte();
+  // Re-read splines.
+  BitReader reader(writer.GetSpan());
+  Splines decoded_splines;
+  EXPECT_FALSE(decoded_splines.Decode(&reader, /*num_pixels=*/1000));
+  EXPECT_TRUE(reader.Close());
+}
+
+#ifdef JXL_CRASH_ON_ERROR
+TEST(SplinesTest, DISABLED_DuplicatePoints) {
+#else
+TEST(SplinesTest, DuplicatePoints) {
+#endif
+  std::vector<Spline::Point> control_points{
+      {9, 54}, {118, 159}, {97, 3},  // Repeated.
+      {97, 3}, {10, 40},   {150, 25}, {120, 300}};
+  Spline spline{control_points,
+                /*color_dct=*/
+                {{1.f, 0.2f, 0.1f}, {35.7f, 10.3f}, {35.7f, 7.8f}},
+                /*sigma_dct=*/{10.f, 0.f, 0.f, 2.f}};
+  std::vector<Spline> spline_data{spline};
+  std::vector<QuantizedSpline> quantized_splines;
+  std::vector<Spline::Point> starting_points;
+  for (const Spline& spline : spline_data) {
+    quantized_splines.emplace_back(spline, kQuantizationAdjustment, kYToX,
+                                   kYToB);
+    starting_points.push_back(spline.control_points.front());
+  }
+  Splines splines(kQuantizationAdjustment, std::move(quantized_splines),
+                  std::move(starting_points));
+
+  Image3F image(320, 320);
+  ZeroFillImage(&image);
+  EXPECT_FALSE(
+      splines.InitializeDrawCache(image.xsize(), image.ysize(), *cmap));
+}
+
+TEST(SplinesTest, Drawing) {
+  CodecInOut io_expected;
+  const PaddedBytes orig = jxl::test::ReadTestData("jxl/splines.pfm");
+  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io_expected,
+                           /*pool=*/nullptr));
+
+  std::vector<Spline::Point> control_points{{9, 54},  {118, 159}, {97, 3},
+                                            {10, 40}, {150, 25},  {120, 300}};
+  // Use values that survive quant/decorellation roundtrip.
+  const Spline spline{
+      control_points,
+      /*color_dct=*/
+      {{0.4989345073699951171875000f, 0.4997999966144561767578125f},
+       {0.4772970676422119140625000f, 0.f, 0.5250000357627868652343750f},
+       {-0.0176776945590972900390625f, 0.4900000095367431640625000f,
+        0.5250000357627868652343750f}},
+      /*sigma_dct=*/
+      {0.9427147507667541503906250f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+       0.6665999889373779296875000f}};
+  std::vector<Spline> spline_data = {spline};
+  std::vector<QuantizedSpline> quantized_splines;
+  std::vector<Spline::Point> starting_points;
+  for (const Spline& spline : spline_data) {
+    quantized_splines.emplace_back(spline, kQuantizationAdjustment, kYToX,
+                                   kYToB);
+    starting_points.push_back(spline.control_points.front());
+  }
+  Splines splines(kQuantizationAdjustment, std::move(quantized_splines),
+                  std::move(starting_points));
+
+  Image3F image(320, 320);
+  ZeroFillImage(&image);
+  ASSERT_TRUE(splines.InitializeDrawCache(image.xsize(), image.ysize(), *cmap));
+  splines.AddTo(&image, Rect(image), Rect(image));
+
+  CodecInOut io_actual;
+  io_actual.SetFromImage(CopyImage(image), ColorEncoding::SRGB());
+  ASSERT_TRUE(io_actual.frames[0].TransformTo(io_expected.Main().c_current(),
+                                              GetJxlCms()));
+
+  JXL_ASSERT_OK(VerifyRelativeError(
+      *io_expected.Main().color(), *io_actual.Main().color(), 1e-2f, 1e-1f, _));
+}
+
+TEST(SplinesTest, ClearedEveryFrame) {
+  CodecInOut io_expected;
+  const PaddedBytes bytes_expected =
+      jxl::test::ReadTestData("jxl/spline_on_first_frame.png");
+  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(bytes_expected), &io_expected,
+                           /*pool=*/nullptr));
+  CodecInOut io_actual;
+  const PaddedBytes bytes_actual =
+      jxl::test::ReadTestData("jxl/spline_on_first_frame.jxl");
+  ASSERT_TRUE(
+      test::DecodeFile({}, Span<const uint8_t>(bytes_actual), &io_actual));
+
+  ASSERT_TRUE(
+      io_actual.frames[0].TransformTo(ColorEncoding::SRGB(), GetJxlCms()));
+  for (size_t c = 0; c < 3; ++c) {
+    for (size_t y = 0; y < io_actual.ysize(); ++y) {
+      float* const JXL_RESTRICT row = io_actual.Main().color()->PlaneRow(c, y);
+      for (size_t x = 0; x < io_actual.xsize(); ++x) {
+        row[x] = Clamp1(row[x], 0.f, 1.f);
+      }
+    }
+  }
+  JXL_ASSERT_OK(VerifyRelativeError(
+      *io_expected.Main().color(), *io_actual.Main().color(), 1e-2f, 1e-1f, _));
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/test_image.cc b/third_party/jpeg-xl/lib/jxl/test_image.cc
new file mode 100644
index 0000000000..af1d1293ef
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/test_image.cc
@@ -0,0 +1,453 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/test_image.h"
+
+#include <jxl/encode.h>
+
+#include <algorithm>
+#include <cstring>
+#include <utility>
+
+#include "lib/extras/dec/color_description.h"
+#include "lib/extras/dec/color_hints.h"
+#include "lib/extras/dec/decode.h"
+#include "lib/jxl/base/byte_order.h"
+#include "lib/jxl/base/random.h"
+#include "lib/jxl/base/span.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/color_encoding_internal.h"
+
+namespace jxl {
+namespace test {
+
+namespace {
+
+void StoreValue(float val, size_t bits_per_sample, JxlPixelFormat format,
+                uint8_t** out) {
+  const float mul = (1u << bits_per_sample) - 1;
+  if (format.data_type == JXL_TYPE_UINT8) {
+    **out = val * mul;
+  } else if (format.data_type == JXL_TYPE_UINT16) {
+    uint16_t uval = val * mul;
+    if (SwapEndianness(format.endianness)) {
+      uval = JXL_BSWAP16(uval);
+    }
+    memcpy(*out, &uval, 2);
+  } else if (format.data_type == JXL_TYPE_FLOAT) {
+    // TODO(szabadka) Add support for custom bits / exponent bits floats.
+    if (SwapEndianness(format.endianness)) {
+      val = BSwapFloat(val);
+    }
+    memcpy(*out, &val, 4);
+  } else {
+    // TODO(szabadka) Add support for FLOAT16.
+  }
+  *out += extras::PackedImage::BitsPerChannel(format.data_type) / 8;
+}
+
+void FillPackedImage(size_t bits_per_sample, uint16_t seed,
+                     extras::PackedImage* image) {
+  const size_t xsize = image->xsize;
+  const size_t ysize = image->ysize;
+  const JxlPixelFormat format = image->format;
+
+  // Cause more significant image difference for successive seeds.
+  Rng generator(seed);
+
+  // Returns random integer in interval [0, max_value)
+  auto rngu = [&generator](size_t max_value) -> size_t {
+    return generator.UniformU(0, max_value);
+  };
+
+  // Returns random float in interval [0.0, max_value)
+  auto rngf = [&generator](float max_value) {
+    return generator.UniformF(0.0f, max_value);
+  };
+
+  // Dark background gradient color
+  float r0 = rngf(0.5f);
+  float g0 = rngf(0.5f);
+  float b0 = rngf(0.5f);
+  float a0 = rngf(0.5f);
+  float r1 = rngf(0.5f);
+  float g1 = rngf(0.5f);
+  float b1 = rngf(0.5f);
+  float a1 = rngf(0.5f);
+
+  // Circle with different color
+  size_t circle_x = rngu(xsize);
+  size_t circle_y = rngu(ysize);
+  size_t circle_r = rngu(std::min(xsize, ysize));
+
+  // Rectangle with random noise
+  size_t rect_x0 = rngu(xsize);
+  size_t rect_y0 = rngu(ysize);
+  size_t rect_x1 = rngu(xsize);
+  size_t rect_y1 = rngu(ysize);
+  if (rect_x1 < rect_x0) std::swap(rect_x0, rect_y1);
+  if (rect_y1 < rect_y0) std::swap(rect_y0, rect_y1);
+
+  // Create pixel content to test, actual content does not matter as long as it
+  // can be compared after roundtrip.
+  uint8_t* out = reinterpret_cast<uint8_t*>(image->pixels());
+  const float imul16 = 1.0f / 65536.0f;
+  for (size_t y = 0; y < ysize; y++) {
+    for (size_t x = 0; x < xsize; x++) {
+      float r = r0 * (ysize - y - 1) / ysize + r1 * y / ysize;
+      float g = g0 * (ysize - y - 1) / ysize + g1 * y / ysize;
+      float b = b0 * (ysize - y - 1) / ysize + b1 * y / ysize;
+      float a = a0 * (ysize - y - 1) / ysize + a1 * y / ysize;
+      // put some shape in there for visual debugging
+      if ((x - circle_x) * (x - circle_x) + (y - circle_y) * (y - circle_y) <
+          circle_r * circle_r) {
+        r = std::min(1.0f, ((65535 - x * y) ^ seed) * imul16);
+        g = std::min(1.0f, ((x << 8) + y + seed) * imul16);
+        b = std::min(1.0f, ((y << 8) + x * seed) * imul16);
+        a = std::min(1.0f, (32768 + x * 256 - y) * imul16);
+      } else if (x > rect_x0 && x < rect_x1 && y > rect_y0 && y < rect_y1) {
+        r = rngf(1.0f);
+        g = rngf(1.0f);
+        b = rngf(1.0f);
+        a = rngf(1.0f);
+      }
+      if (format.num_channels == 1) {
+        StoreValue(g, bits_per_sample, format, &out);
+      } else if (format.num_channels == 2) {
+        StoreValue(g, bits_per_sample, format, &out);
+        StoreValue(a, bits_per_sample, format, &out);
+      } else if (format.num_channels == 3) {
+        StoreValue(r, bits_per_sample, format, &out);
+        StoreValue(g, bits_per_sample, format, &out);
+        StoreValue(b, bits_per_sample, format, &out);
+      } else if (format.num_channels == 4) {
+        StoreValue(r, bits_per_sample, format, &out);
+        StoreValue(g, bits_per_sample, format, &out);
+        StoreValue(b, bits_per_sample, format, &out);
+        StoreValue(a, bits_per_sample, format, &out);
+      }
+    }
+  }
+}
+
+}  // namespace
+
+std::vector<uint8_t> GetSomeTestImage(size_t xsize, size_t ysize,
+                                      size_t num_channels, uint16_t seed) {
+  // Cause more significant image difference for successive seeds.
+  Rng generator(seed);
+
+  // Returns random integer in interval [0, max_value)
+  auto rng = [&generator](size_t max_value) -> size_t {
+    return generator.UniformU(0, max_value);
+  };
+
+  // Dark background gradient color
+  uint16_t r0 = rng(32768);
+  uint16_t g0 = rng(32768);
+  uint16_t b0 = rng(32768);
+  uint16_t a0 = rng(32768);
+  uint16_t r1 = rng(32768);
+  uint16_t g1 = rng(32768);
+  uint16_t b1 = rng(32768);
+  uint16_t a1 = rng(32768);
+
+  // Circle with different color
+  size_t circle_x = rng(xsize);
+  size_t circle_y = rng(ysize);
+  size_t circle_r = rng(std::min(xsize, ysize));
+
+  // Rectangle with random noise
+  size_t rect_x0 = rng(xsize);
+  size_t rect_y0 = rng(ysize);
+  size_t rect_x1 = rng(xsize);
+  size_t rect_y1 = rng(ysize);
+  if (rect_x1 < rect_x0) std::swap(rect_x0, rect_y1);
+  if (rect_y1 < rect_y0) std::swap(rect_y0, rect_y1);
+
+  size_t num_pixels = xsize * ysize;
+  // 16 bits per channel, big endian, 4 channels
+  std::vector<uint8_t> pixels(num_pixels * num_channels * 2);
+  // Create pixel content to test, actual content does not matter as long as it
+  // can be compared after roundtrip.
+  for (size_t y = 0; y < ysize; y++) {
+    for (size_t x = 0; x < xsize; x++) {
+      uint16_t r = r0 * (ysize - y - 1) / ysize + r1 * y / ysize;
+      uint16_t g = g0 * (ysize - y - 1) / ysize + g1 * y / ysize;
+      uint16_t b = b0 * (ysize - y - 1) / ysize + b1 * y / ysize;
+      uint16_t a = a0 * (ysize - y - 1) / ysize + a1 * y / ysize;
+      // put some shape in there for visual debugging
+      if ((x - circle_x) * (x - circle_x) + (y - circle_y) * (y - circle_y) <
+          circle_r * circle_r) {
+        r = (65535 - x * y) ^ seed;
+        g = (x << 8) + y + seed;
+        b = (y << 8) + x * seed;
+        a = 32768 + x * 256 - y;
+      } else if (x > rect_x0 && x < rect_x1 && y > rect_y0 && y < rect_y1) {
+        r = rng(65536);
+        g = rng(65536);
+        b = rng(65536);
+        a = rng(65536);
+      }
+      size_t i = (y * xsize + x) * 2 * num_channels;
+      pixels[i + 0] = (r >> 8);
+      pixels[i + 1] = (r & 255);
+      if (num_channels >= 2) {
+        // This may store what is called 'g' in the alpha channel of a 2-channel
+        // image, but that's ok since the content is arbitrary
+        pixels[i + 2] = (g >> 8);
+        pixels[i + 3] = (g & 255);
+      }
+      if (num_channels >= 3) {
+        pixels[i + 4] = (b >> 8);
+        pixels[i + 5] = (b & 255);
+      }
+      if (num_channels >= 4) {
+        pixels[i + 6] = (a >> 8);
+        pixels[i + 7] = (a & 255);
+      }
+    }
+  }
+  return pixels;
+}
+
+TestImage::TestImage() {
+  SetChannels(3);
+  SetAllBitDepths(8);
+  SetColorEncoding("RGB_D65_SRG_Rel_SRG");
+}
+
+TestImage& TestImage::DecodeFromBytes(const PaddedBytes& bytes) {
+  ColorEncoding c_enc;
+  JXL_CHECK(
+      ConvertExternalToInternalColorEncoding(ppf_.color_encoding, &c_enc));
+  extras::ColorHints color_hints;
+  color_hints.Add("color_space", Description(c_enc));
+  JXL_CHECK(
+      extras::DecodeBytes(Span<const uint8_t>(bytes), color_hints, &ppf_));
+  return *this;
+}
+
+TestImage& TestImage::ClearMetadata() {
+  ppf_.metadata = extras::PackedMetadata();
+  return *this;
+}
+
+TestImage& TestImage::SetDimensions(size_t xsize, size_t ysize) {
+  if (xsize <= ppf_.info.xsize && ysize <= ppf_.info.ysize) {
+    for (auto& frame : ppf_.frames) {
+      CropLayerInfo(xsize, ysize, &frame.frame_info.layer_info);
+      CropImage(xsize, ysize, &frame.color);
+      for (auto& ec : frame.extra_channels) {
+        CropImage(xsize, ysize, &ec);
+      }
+    }
+  } else {
+    JXL_CHECK(ppf_.info.xsize == 0 && ppf_.info.ysize == 0);
+  }
+  ppf_.info.xsize = xsize;
+  ppf_.info.ysize = ysize;
+  return *this;
+}
+
+TestImage& TestImage::SetChannels(size_t num_channels) {
+  JXL_CHECK(ppf_.frames.empty());
+  JXL_CHECK(!ppf_.preview_frame);
+  ppf_.info.num_color_channels = num_channels < 3 ? 1 : 3;
+  ppf_.info.num_extra_channels = num_channels - ppf_.info.num_color_channels;
+  if (ppf_.info.num_extra_channels > 0 && ppf_.info.alpha_bits == 0) {
+    ppf_.info.alpha_bits = ppf_.info.bits_per_sample;
+    ppf_.info.alpha_exponent_bits = ppf_.info.exponent_bits_per_sample;
+  }
+  ppf_.extra_channels_info.clear();
+  for (size_t i = 1; i < ppf_.info.num_extra_channels; ++i) {
+    extras::PackedExtraChannel ec;
+    ec.index = i;
+    JxlEncoderInitExtraChannelInfo(JXL_CHANNEL_ALPHA, &ec.ec_info);
+    if (ec.ec_info.bits_per_sample == 0) {
+      ec.ec_info.bits_per_sample = ppf_.info.bits_per_sample;
+      ec.ec_info.exponent_bits_per_sample = ppf_.info.exponent_bits_per_sample;
+    }
+    ppf_.extra_channels_info.emplace_back(std::move(ec));
+  }
+  format_.num_channels = std::min(static_cast<size_t>(4), num_channels);
+  if (ppf_.info.num_color_channels == 1 &&
+      ppf_.color_encoding.color_space != JXL_COLOR_SPACE_GRAY) {
+    SetColorEncoding("Gra_D65_Rel_SRG");
+  }
+  return *this;
+}
+
+// Sets the same bit depth on color, alpha and all extra channels.
+TestImage& TestImage::SetAllBitDepths(uint32_t bits_per_sample,
+                                      uint32_t exponent_bits_per_sample) {
+  ppf_.info.bits_per_sample = bits_per_sample;
+  ppf_.info.exponent_bits_per_sample = exponent_bits_per_sample;
+  if (ppf_.info.num_extra_channels > 0) {
+    ppf_.info.alpha_bits = bits_per_sample;
+    ppf_.info.alpha_exponent_bits = exponent_bits_per_sample;
+  }
+  for (size_t i = 0; i < ppf_.extra_channels_info.size(); ++i) {
+    extras::PackedExtraChannel& ec = ppf_.extra_channels_info[i];
+    ec.ec_info.bits_per_sample = bits_per_sample;
+    ec.ec_info.exponent_bits_per_sample = exponent_bits_per_sample;
+  }
+  format_.data_type = DefaultDataType(ppf_.info);
+  return *this;
+}
+
+TestImage& TestImage::SetDataType(JxlDataType data_type) {
+  format_.data_type = data_type;
+  return *this;
+}
+
+TestImage& TestImage::SetEndianness(JxlEndianness endianness) {
+  format_.endianness = endianness;
+  return *this;
+}
+
+TestImage& TestImage::SetColorEncoding(const std::string& description) {
+  JXL_CHECK(ParseDescription(description, &ppf_.color_encoding));
+  ColorEncoding c_enc;
+  JXL_CHECK(
+      ConvertExternalToInternalColorEncoding(ppf_.color_encoding, &c_enc));
+  JXL_CHECK(c_enc.CreateICC());
+  PaddedBytes icc = c_enc.ICC();
+  ppf_.icc.assign(icc.begin(), icc.end());
+  return *this;
+}
+
+TestImage& TestImage::CoalesceGIFAnimationWithAlpha() {
+  extras::PackedFrame canvas = ppf_.frames[0].Copy();
+  JXL_CHECK(canvas.color.format.num_channels == 3);
+  JXL_CHECK(canvas.color.format.data_type == JXL_TYPE_UINT8);
+  JXL_CHECK(canvas.extra_channels.size() == 1);
+  for (size_t i = 1; i < ppf_.frames.size(); i++) {
+    const extras::PackedFrame& frame = ppf_.frames[i];
+    JXL_CHECK(frame.extra_channels.size() == 1);
+    const JxlLayerInfo& layer_info = frame.frame_info.layer_info;
+    extras::PackedFrame rendered = canvas.Copy();
+    uint8_t* pixels_rendered =
+        reinterpret_cast<uint8_t*>(rendered.color.pixels());
+    const uint8_t* pixels_frame =
+        reinterpret_cast<const uint8_t*>(frame.color.pixels());
+    uint8_t* alpha_rendered =
+        reinterpret_cast<uint8_t*>(rendered.extra_channels[0].pixels());
+    const uint8_t* alpha_frame =
+        reinterpret_cast<const uint8_t*>(frame.extra_channels[0].pixels());
+    for (size_t y = 0; y < frame.color.ysize; y++) {
+      for (size_t x = 0; x < frame.color.xsize; x++) {
+        size_t idx_frame = y * frame.color.xsize + x;
+        size_t idx_rendered = ((layer_info.crop_y0 + y) * rendered.color.xsize +
+                               (layer_info.crop_x0 + x));
+        if (alpha_frame[idx_frame] != 0) {
+          memcpy(&pixels_rendered[idx_rendered * 3],
+                 &pixels_frame[idx_frame * 3], 3);
+          alpha_rendered[idx_rendered] = alpha_frame[idx_frame];
+        }
+      }
+    }
+    if (layer_info.save_as_reference != 0) {
+      canvas = rendered.Copy();
+    }
+    ppf_.frames[i] = std::move(rendered);
+  }
+  return *this;
+}
+
+TestImage::Frame::Frame(TestImage* parent, bool is_preview, size_t index)
+    : parent_(parent), is_preview_(is_preview), index_(index) {}
+
+void TestImage::Frame::ZeroFill() {
+  memset(frame().color.pixels(), 0, frame().color.pixels_size);
+  for (auto& ec : frame().extra_channels) {
+    memset(ec.pixels(), 0, ec.pixels_size);
+  }
+}
+
+void TestImage::Frame::RandomFill(uint16_t seed) {
+  FillPackedImage(ppf().info.bits_per_sample, seed, &frame().color);
+  for (size_t i = 0; i < ppf().extra_channels_info.size(); ++i) {
+    FillPackedImage(ppf().extra_channels_info[i].ec_info.bits_per_sample,
+                    seed + 1 + i, &frame().extra_channels[i]);
+  }
+}
+
+void TestImage::Frame::SetValue(size_t y, size_t x, size_t c, float val) {
+  const extras::PackedImage& color = frame().color;
+  JxlPixelFormat format = color.format;
+  JXL_CHECK(y < ppf().info.ysize);
+  JXL_CHECK(x < ppf().info.xsize);
+  JXL_CHECK(c < format.num_channels);
+  size_t pwidth = extras::PackedImage::BitsPerChannel(format.data_type) / 8;
+  size_t idx = ((y * color.xsize + x) * format.num_channels + c) * pwidth;
+  uint8_t* pixels = reinterpret_cast<uint8_t*>(frame().color.pixels());
+  uint8_t* p = pixels + idx;
+  StoreValue(val, ppf().info.bits_per_sample, frame().color.format, &p);
+}
+
+TestImage::Frame TestImage::AddFrame() {
+  size_t index = ppf_.frames.size();
+  extras::PackedFrame frame(ppf_.info.xsize, ppf_.info.ysize, format_);
+  for (size_t i = 0; i < ppf_.extra_channels_info.size(); ++i) {
+    JxlPixelFormat ec_format = {1, format_.data_type, format_.endianness, 0};
+    extras::PackedImage image(ppf_.info.xsize, ppf_.info.ysize, ec_format);
+    frame.extra_channels.emplace_back(std::move(image));
+  }
+  ppf_.frames.emplace_back(std::move(frame));
+  return Frame(this, false, index);
+}
+
+TestImage::Frame TestImage::AddPreview(size_t xsize, size_t ysize) {
+  extras::PackedFrame frame(xsize, ysize, format_);
+  for (size_t i = 0; i < ppf_.extra_channels_info.size(); ++i) {
+    JxlPixelFormat ec_format = {1, format_.data_type, format_.endianness, 0};
+    extras::PackedImage image(xsize, ysize, ec_format);
+    frame.extra_channels.emplace_back(std::move(image));
+  }
+  ppf_.preview_frame = make_unique<extras::PackedFrame>(std::move(frame));
+  return Frame(this, true, 0);
+}
+
+void TestImage::CropLayerInfo(size_t xsize, size_t ysize, JxlLayerInfo* info) {
+  if (info->crop_x0 < static_cast<ssize_t>(xsize)) {
+    info->xsize = std::min<size_t>(info->xsize, xsize - info->crop_x0);
+  } else {
+    info->xsize = 0;
+  }
+  if (info->crop_y0 < static_cast<ssize_t>(ysize)) {
+    info->ysize = std::min<size_t>(info->ysize, ysize - info->crop_y0);
+  } else {
+    info->ysize = 0;
+  }
+}
+
+void TestImage::CropImage(size_t xsize, size_t ysize,
+                          extras::PackedImage* image) {
+  size_t new_stride = (image->stride / image->xsize) * xsize;
+  uint8_t* buf = reinterpret_cast<uint8_t*>(image->pixels());
+  for (size_t y = 0; y < ysize; ++y) {
+    memmove(&buf[y * new_stride], &buf[y * image->stride], new_stride);
+  }
+  image->xsize = xsize;
+  image->ysize = ysize;
+  image->stride = new_stride;
+  image->pixels_size = ysize * new_stride;
+}
+
+JxlDataType TestImage::DefaultDataType(const JxlBasicInfo& info) {
+  if (info.bits_per_sample == 16 && info.exponent_bits_per_sample == 5) {
+    return JXL_TYPE_FLOAT16;
+  } else if (info.exponent_bits_per_sample > 0 || info.bits_per_sample > 16) {
+    return JXL_TYPE_FLOAT;
+  } else if (info.bits_per_sample > 8) {
+    return JXL_TYPE_UINT16;
+  } else {
+    return JXL_TYPE_UINT8;
+  }
+}
+
+}  // namespace test
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/test_image.h b/third_party/jpeg-xl/lib/jxl/test_image.h
new file mode 100644
index 0000000000..0106a4b341
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/test_image.h
@@ -0,0 +1,94 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_TEST_IMAGE_H_
+#define LIB_JXL_TEST_IMAGE_H_
+
+#include <jxl/codestream_header.h>
+#include <jxl/types.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include <string>
+#include <vector>
+
+#include "lib/extras/packed_image.h"
+#include "lib/jxl/base/padded_bytes.h"
+
+namespace jxl {
+namespace test {
+
+// Returns a test image with some autogenerated pixel content, using 16 bits per
+// channel, big endian order, 1 to 4 channels
+// The seed parameter allows to create images with different pixel content.
+std::vector<uint8_t> GetSomeTestImage(size_t xsize, size_t ysize,
+                                      size_t num_channels, uint16_t seed);
+
+class TestImage {
+ public:
+  TestImage();
+
+  extras::PackedPixelFile& ppf() { return ppf_; }
+
+  TestImage& DecodeFromBytes(const PaddedBytes& bytes);
+
+  TestImage& ClearMetadata();
+
+  TestImage& SetDimensions(size_t xsize, size_t ysize);
+
+  TestImage& SetChannels(size_t num_channels);
+
+  // Sets the same bit depth on color, alpha and all extra channels.
+  TestImage& SetAllBitDepths(uint32_t bits_per_sample,
+                             uint32_t exponent_bits_per_sample = 0);
+
+  TestImage& SetDataType(JxlDataType data_type);
+
+  TestImage& SetEndianness(JxlEndianness endianness);
+
+  TestImage& SetColorEncoding(const std::string& description);
+
+  TestImage& CoalesceGIFAnimationWithAlpha();
+
+  class Frame {
+   public:
+    Frame(TestImage* parent, bool is_preview, size_t index);
+
+    void ZeroFill();
+    void RandomFill(uint16_t seed = 177);
+
+    void SetValue(size_t y, size_t x, size_t c, float val);
+
+   private:
+    extras::PackedPixelFile& ppf() const { return parent_->ppf(); }
+
+    extras::PackedFrame& frame() {
+      return is_preview_ ? *ppf().preview_frame : ppf().frames[index_];
+    }
+
+    TestImage* parent_;
+    bool is_preview_;
+    size_t index_;
+  };
+
+  Frame AddFrame();
+
+  Frame AddPreview(size_t xsize, size_t ysize);
+
+ private:
+  extras::PackedPixelFile ppf_;
+  JxlPixelFormat format_ = {3, JXL_TYPE_UINT8, JXL_LITTLE_ENDIAN, 0};
+
+  static void CropLayerInfo(size_t xsize, size_t ysize, JxlLayerInfo* info);
+
+  static void CropImage(size_t xsize, size_t ysize, extras::PackedImage* image);
+
+  static JxlDataType DefaultDataType(const JxlBasicInfo& info);
+};
+
+}  // namespace test
+}  // namespace jxl
+
+#endif  // LIB_JXL_TEST_IMAGE_H_
diff --git a/third_party/jpeg-xl/lib/jxl/test_utils.cc b/third_party/jpeg-xl/lib/jxl/test_utils.cc
new file mode 100644
index 0000000000..223641a6a5
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/test_utils.cc
@@ -0,0 +1,673 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/test_utils.h"
+
+#include <memory>
+#include <string>
+
+#include "lib/extras/packed_image_convert.h"
+#include "lib/jxl/base/file_io.h"
+#include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/enc_butteraugli_comparator.h"
+#include "lib/jxl/enc_butteraugli_pnorm.h"
+#include "lib/jxl/enc_cache.h"
+#include "lib/jxl/enc_color_management.h"
+#include "lib/jxl/enc_external_image.h"
+#include "lib/jxl/enc_file.h"
+
+#if !defined(TEST_DATA_PATH)
+#include "tools/cpp/runfiles/runfiles.h"
+#endif
+
+namespace jxl {
+namespace test {
+
+#if defined(TEST_DATA_PATH)
+std::string GetTestDataPath(const std::string& filename) {
+  return std::string(TEST_DATA_PATH "/") + filename;
+}
+#else
+using bazel::tools::cpp::runfiles::Runfiles;
+const std::unique_ptr<Runfiles> kRunfiles(Runfiles::Create(""));
+std::string GetTestDataPath(const std::string& filename) {
+  std::string root(JPEGXL_ROOT_PACKAGE "/testdata/");
+  return kRunfiles->Rlocation(root + filename);
+}
+#endif
+
+PaddedBytes ReadTestData(const std::string& filename) {
+  std::string full_path = GetTestDataPath(filename);
+  PaddedBytes data;
+  fprintf(stderr, "ReadTestData %s\n", full_path.c_str());
+  JXL_CHECK(jxl::ReadFile(full_path, &data));
+  printf("Test data %s is %d bytes long.\n", filename.c_str(),
+         static_cast<int>(data.size()));
+  return data;
+}
+
+Status DecodeFile(extras::JXLDecompressParams dparams,
+                  const Span<const uint8_t> file, CodecInOut* JXL_RESTRICT io,
+                  ThreadPool* pool) {
+  SetThreadParallelRunner(dparams, pool);
+  extras::PackedPixelFile ppf;
+  JXL_RETURN_IF_ERROR(DecodeImageJXL(file.data(), file.size(), dparams,
+                                     /*decoded_bytes=*/nullptr, &ppf));
+  JXL_RETURN_IF_ERROR(ConvertPackedPixelFileToCodecInOut(ppf, pool, io));
+  return true;
+}
+
+void JxlBasicInfoSetFromPixelFormat(JxlBasicInfo* basic_info,
+                                    const JxlPixelFormat* pixel_format) {
+  JxlEncoderInitBasicInfo(basic_info);
+  switch (pixel_format->data_type) {
+    case JXL_TYPE_FLOAT:
+      basic_info->bits_per_sample = 32;
+      basic_info->exponent_bits_per_sample = 8;
+      break;
+    case JXL_TYPE_FLOAT16:
+      basic_info->bits_per_sample = 16;
+      basic_info->exponent_bits_per_sample = 5;
+      break;
+    case JXL_TYPE_UINT8:
+      basic_info->bits_per_sample = 8;
+      basic_info->exponent_bits_per_sample = 0;
+      break;
+    case JXL_TYPE_UINT16:
+      basic_info->bits_per_sample = 16;
+      basic_info->exponent_bits_per_sample = 0;
+      break;
+    default:
+      JXL_ABORT("Unhandled JxlDataType");
+  }
+  if (pixel_format->num_channels < 3) {
+    basic_info->num_color_channels = 1;
+  } else {
+    basic_info->num_color_channels = 3;
+  }
+  if (pixel_format->num_channels == 2 || pixel_format->num_channels == 4) {
+    basic_info->alpha_exponent_bits = basic_info->exponent_bits_per_sample;
+    basic_info->alpha_bits = basic_info->bits_per_sample;
+    basic_info->num_extra_channels = 1;
+  } else {
+    basic_info->alpha_exponent_bits = 0;
+    basic_info->alpha_bits = 0;
+  }
+}
+
+ColorEncoding ColorEncodingFromDescriptor(const ColorEncodingDescriptor& desc) {
+  ColorEncoding c;
+  c.SetColorSpace(desc.color_space);
+  if (desc.color_space != ColorSpace::kXYB) {
+    c.white_point = desc.white_point;
+    c.primaries = desc.primaries;
+    c.tf.SetTransferFunction(desc.tf);
+  }
+  c.rendering_intent = desc.rendering_intent;
+  JXL_CHECK(c.CreateICC());
+  return c;
+}
+
+namespace {
+void CheckSameEncodings(const std::vector<ColorEncoding>& a,
+                        const std::vector<ColorEncoding>& b,
+                        const std::string& check_name,
+                        std::stringstream& failures) {
+  JXL_CHECK(a.size() == b.size());
+  for (size_t i = 0; i < a.size(); ++i) {
+    if ((a[i].ICC() == b[i].ICC()) ||
+        ((a[i].primaries == b[i].primaries) && a[i].tf.IsSame(b[i].tf))) {
+      continue;
+    }
+    failures << "CheckSameEncodings " << check_name << ": " << i
+             << "-th encoding mismatch\n";
+  }
+}
+}  // namespace
+
+bool Roundtrip(const CodecInOut* io, const CompressParams& cparams,
+               extras::JXLDecompressParams dparams,
+               CodecInOut* JXL_RESTRICT io2, std::stringstream& failures,
+               size_t* compressed_size, ThreadPool* pool, AuxOut* aux_out) {
+  if (compressed_size) {
+    *compressed_size = static_cast<size_t>(-1);
+  }
+  PaddedBytes compressed;
+
+  std::vector<ColorEncoding> original_metadata_encodings;
+  std::vector<ColorEncoding> original_current_encodings;
+  std::vector<ColorEncoding> metadata_encodings_1;
+  std::vector<ColorEncoding> metadata_encodings_2;
+  std::vector<ColorEncoding> current_encodings_2;
+  original_metadata_encodings.reserve(io->frames.size());
+  original_current_encodings.reserve(io->frames.size());
+  metadata_encodings_1.reserve(io->frames.size());
+  metadata_encodings_2.reserve(io->frames.size());
+  current_encodings_2.reserve(io->frames.size());
+
+  for (const ImageBundle& ib : io->frames) {
+    // Remember original encoding, will be returned by decoder.
+    original_metadata_encodings.push_back(ib.metadata()->color_encoding);
+    // c_current should not change during encoding.
+    original_current_encodings.push_back(ib.c_current());
+  }
+
+  std::unique_ptr<PassesEncoderState> enc_state =
+      jxl::make_unique<PassesEncoderState>();
+  JXL_CHECK(EncodeFile(cparams, io, enc_state.get(), &compressed, GetJxlCms(),
+                       aux_out, pool));
+
+  for (const ImageBundle& ib1 : io->frames) {
+    metadata_encodings_1.push_back(ib1.metadata()->color_encoding);
+  }
+
+  // Should still be in the same color space after encoding.
+  CheckSameEncodings(metadata_encodings_1, original_metadata_encodings,
+                     "original vs after encoding", failures);
+
+  JXL_CHECK(DecodeFile(dparams, Span<const uint8_t>(compressed), io2, pool));
+  JXL_CHECK(io2->frames.size() == io->frames.size());
+
+  for (const ImageBundle& ib2 : io2->frames) {
+    metadata_encodings_2.push_back(ib2.metadata()->color_encoding);
+    current_encodings_2.push_back(ib2.c_current());
+  }
+
+  // We always produce the original color encoding if a color transform hook is
+  // set.
+  CheckSameEncodings(current_encodings_2, original_current_encodings,
+                     "current: original vs decoded", failures);
+
+  // Decoder returns the originals passed to the encoder.
+  CheckSameEncodings(metadata_encodings_2, original_metadata_encodings,
+                     "metadata: original vs decoded", failures);
+
+  if (compressed_size) {
+    *compressed_size = compressed.size();
+  }
+
+  return failures.str().empty();
+}
+
+size_t Roundtrip(const extras::PackedPixelFile& ppf_in,
+                 extras::JXLCompressParams cparams,
+                 extras::JXLDecompressParams dparams, ThreadPool* pool,
+                 extras::PackedPixelFile* ppf_out) {
+  SetThreadParallelRunner(cparams, pool);
+  SetThreadParallelRunner(dparams, pool);
+  std::vector<uint8_t> compressed;
+  JXL_CHECK(extras::EncodeImageJXL(cparams, ppf_in, /*jpeg_bytes=*/nullptr,
+                                   &compressed));
+  size_t decoded_bytes = 0;
+  JXL_CHECK(extras::DecodeImageJXL(compressed.data(), compressed.size(),
+                                   dparams, &decoded_bytes, ppf_out));
+  JXL_CHECK(decoded_bytes == compressed.size());
+  return compressed.size();
+}
+
+std::vector<ColorEncodingDescriptor> AllEncodings() {
+  std::vector<ColorEncodingDescriptor> all_encodings;
+  all_encodings.reserve(300);
+  ColorEncoding c;
+
+  for (ColorSpace cs : Values<ColorSpace>()) {
+    if (cs == ColorSpace::kUnknown || cs == ColorSpace::kXYB) continue;
+    c.SetColorSpace(cs);
+
+    for (WhitePoint wp : Values<WhitePoint>()) {
+      if (wp == WhitePoint::kCustom) continue;
+      if (c.ImplicitWhitePoint() && c.white_point != wp) continue;
+      c.white_point = wp;
+
+      for (Primaries primaries : Values<Primaries>()) {
+        if (primaries == Primaries::kCustom) continue;
+        if (!c.HasPrimaries()) continue;
+        c.primaries = primaries;
+
+        for (TransferFunction tf : Values<TransferFunction>()) {
+          if (tf == TransferFunction::kUnknown) continue;
+          if (c.tf.SetImplicit() &&
+              (c.tf.IsGamma() || c.tf.GetTransferFunction() != tf)) {
+            continue;
+          }
+          c.tf.SetTransferFunction(tf);
+
+          for (RenderingIntent ri : Values<RenderingIntent>()) {
+            ColorEncodingDescriptor cdesc;
+            cdesc.color_space = cs;
+            cdesc.white_point = wp;
+            cdesc.primaries = primaries;
+            cdesc.tf = tf;
+            cdesc.rendering_intent = ri;
+            all_encodings.push_back(cdesc);
+          }
+        }
+      }
+    }
+  }
+
+  return all_encodings;
+}
+
+jxl::CodecInOut SomeTestImageToCodecInOut(const std::vector<uint8_t>& buf,
+                                          size_t num_channels, size_t xsize,
+                                          size_t ysize) {
+  jxl::CodecInOut io;
+  io.SetSize(xsize, ysize);
+  io.metadata.m.SetAlphaBits(16);
+  io.metadata.m.color_encoding = jxl::ColorEncoding::SRGB(
+      /*is_gray=*/num_channels == 1 || num_channels == 2);
+  JxlPixelFormat format = {static_cast<uint32_t>(num_channels), JXL_TYPE_UINT16,
+                           JXL_BIG_ENDIAN, 0};
+  JXL_CHECK(ConvertFromExternal(
+      jxl::Span<const uint8_t>(buf.data(), buf.size()), xsize, ysize,
+      jxl::ColorEncoding::SRGB(/*is_gray=*/num_channels < 3),
+      /*bits_per_sample=*/16, format,
+      /*pool=*/nullptr,
+      /*ib=*/&io.Main()));
+  return io;
+}
+
+bool Near(double expected, double value, double max_dist) {
+  double dist = expected > value ? expected - value : value - expected;
+  return dist <= max_dist;
+}
+
+float LoadFloat16(uint16_t bits16) {
+  const uint32_t sign = bits16 >> 15;
+  const uint32_t biased_exp = (bits16 >> 10) & 0x1F;
+  const uint32_t mantissa = bits16 & 0x3FF;
+
+  // Subnormal or zero
+  if (biased_exp == 0) {
+    const float subnormal = (1.0f / 16384) * (mantissa * (1.0f / 1024));
+    return sign ? -subnormal : subnormal;
+  }
+
+  // Normalized: convert the representation directly (faster than ldexp/tables).
+  const uint32_t biased_exp32 = biased_exp + (127 - 15);
+  const uint32_t mantissa32 = mantissa << (23 - 10);
+  const uint32_t bits32 = (sign << 31) | (biased_exp32 << 23) | mantissa32;
+
+  float result;
+  memcpy(&result, &bits32, 4);
+  return result;
+}
+
+float LoadLEFloat16(const uint8_t* p) {
+  uint16_t bits16 = LoadLE16(p);
+  return LoadFloat16(bits16);
+}
+
+float LoadBEFloat16(const uint8_t* p) {
+  uint16_t bits16 = LoadBE16(p);
+  return LoadFloat16(bits16);
+}
+
+size_t GetPrecision(JxlDataType data_type) {
+  switch (data_type) {
+    case JXL_TYPE_UINT8:
+      return 8;
+    case JXL_TYPE_UINT16:
+      return 16;
+    case JXL_TYPE_FLOAT:
+      // Floating point mantissa precision
+      return 24;
+    case JXL_TYPE_FLOAT16:
+      return 11;
+    default:
+      JXL_ABORT("Unhandled JxlDataType");
+  }
+}
+
+size_t GetDataBits(JxlDataType data_type) {
+  switch (data_type) {
+    case JXL_TYPE_UINT8:
+      return 8;
+    case JXL_TYPE_UINT16:
+      return 16;
+    case JXL_TYPE_FLOAT:
+      return 32;
+    case JXL_TYPE_FLOAT16:
+      return 16;
+    default:
+      JXL_ABORT("Unhandled JxlDataType");
+  }
+}
+
+std::vector<double> ConvertToRGBA32(const uint8_t* pixels, size_t xsize,
+                                    size_t ysize, const JxlPixelFormat& format,
+                                    double factor) {
+  std::vector<double> result(xsize * ysize * 4);
+  size_t num_channels = format.num_channels;
+  bool gray = num_channels == 1 || num_channels == 2;
+  bool alpha = num_channels == 2 || num_channels == 4;
+  JxlEndianness endianness = format.endianness;
+  // Compute actual type:
+  if (endianness == JXL_NATIVE_ENDIAN) {
+    endianness = IsLittleEndian() ? JXL_LITTLE_ENDIAN : JXL_BIG_ENDIAN;
+  }
+
+  size_t stride =
+      xsize * jxl::DivCeil(GetDataBits(format.data_type) * num_channels,
+                           jxl::kBitsPerByte);
+  if (format.align > 1) stride = jxl::RoundUpTo(stride, format.align);
+
+  if (format.data_type == JXL_TYPE_UINT8) {
+    // Multiplier to bring to 0-1.0 range
+    double mul = factor > 0.0 ? factor : 1.0 / 255.0;
+    for (size_t y = 0; y < ysize; ++y) {
+      for (size_t x = 0; x < xsize; ++x) {
+        size_t j = (y * xsize + x) * 4;
+        size_t i = y * stride + x * num_channels;
+        double r = pixels[i];
+        double g = gray ? r : pixels[i + 1];
+        double b = gray ? r : pixels[i + 2];
+        double a = alpha ? pixels[i + num_channels - 1] : 255;
+        result[j + 0] = r * mul;
+        result[j + 1] = g * mul;
+        result[j + 2] = b * mul;
+        result[j + 3] = a * mul;
+      }
+    }
+  } else if (format.data_type == JXL_TYPE_UINT16) {
+    JXL_ASSERT(endianness != JXL_NATIVE_ENDIAN);
+    // Multiplier to bring to 0-1.0 range
+    double mul = factor > 0.0 ? factor : 1.0 / 65535.0;
+    for (size_t y = 0; y < ysize; ++y) {
+      for (size_t x = 0; x < xsize; ++x) {
+        size_t j = (y * xsize + x) * 4;
+        size_t i = y * stride + x * num_channels * 2;
+        double r, g, b, a;
+        if (endianness == JXL_BIG_ENDIAN) {
+          r = (pixels[i + 0] << 8) + pixels[i + 1];
+          g = gray ? r : (pixels[i + 2] << 8) + pixels[i + 3];
+          b = gray ? r : (pixels[i + 4] << 8) + pixels[i + 5];
+          a = alpha ? (pixels[i + num_channels * 2 - 2] << 8) +
+                          pixels[i + num_channels * 2 - 1]
+                    : 65535;
+        } else {
+          r = (pixels[i + 1] << 8) + pixels[i + 0];
+          g = gray ? r : (pixels[i + 3] << 8) + pixels[i + 2];
+          b = gray ? r : (pixels[i + 5] << 8) + pixels[i + 4];
+          a = alpha ? (pixels[i + num_channels * 2 - 1] << 8) +
+                          pixels[i + num_channels * 2 - 2]
+                    : 65535;
+        }
+        result[j + 0] = r * mul;
+        result[j + 1] = g * mul;
+        result[j + 2] = b * mul;
+        result[j + 3] = a * mul;
+      }
+    }
+  } else if (format.data_type == JXL_TYPE_FLOAT) {
+    JXL_ASSERT(endianness != JXL_NATIVE_ENDIAN);
+    for (size_t y = 0; y < ysize; ++y) {
+      for (size_t x = 0; x < xsize; ++x) {
+        size_t j = (y * xsize + x) * 4;
+        size_t i = y * stride + x * num_channels * 4;
+        double r, g, b, a;
+        if (endianness == JXL_BIG_ENDIAN) {
+          r = LoadBEFloat(pixels + i);
+          g = gray ? r : LoadBEFloat(pixels + i + 4);
+          b = gray ? r : LoadBEFloat(pixels + i + 8);
+          a = alpha ? LoadBEFloat(pixels + i + num_channels * 4 - 4) : 1.0;
+        } else {
+          r = LoadLEFloat(pixels + i);
+          g = gray ? r : LoadLEFloat(pixels + i + 4);
+          b = gray ? r : LoadLEFloat(pixels + i + 8);
+          a = alpha ? LoadLEFloat(pixels + i + num_channels * 4 - 4) : 1.0;
+        }
+        result[j + 0] = r;
+        result[j + 1] = g;
+        result[j + 2] = b;
+        result[j + 3] = a;
+      }
+    }
+  } else if (format.data_type == JXL_TYPE_FLOAT16) {
+    JXL_ASSERT(endianness != JXL_NATIVE_ENDIAN);
+    for (size_t y = 0; y < ysize; ++y) {
+      for (size_t x = 0; x < xsize; ++x) {
+        size_t j = (y * xsize + x) * 4;
+        size_t i = y * stride + x * num_channels * 2;
+        double r, g, b, a;
+        if (endianness == JXL_BIG_ENDIAN) {
+          r = LoadBEFloat16(pixels + i);
+          g = gray ? r : LoadBEFloat16(pixels + i + 2);
+          b = gray ? r : LoadBEFloat16(pixels + i + 4);
+          a = alpha ? LoadBEFloat16(pixels + i + num_channels * 2 - 2) : 1.0;
+        } else {
+          r = LoadLEFloat16(pixels + i);
+          g = gray ? r : LoadLEFloat16(pixels + i + 2);
+          b = gray ? r : LoadLEFloat16(pixels + i + 4);
+          a = alpha ? LoadLEFloat16(pixels + i + num_channels * 2 - 2) : 1.0;
+        }
+        result[j + 0] = r;
+        result[j + 1] = g;
+        result[j + 2] = b;
+        result[j + 3] = a;
+      }
+    }
+  } else {
+    JXL_ASSERT(false);  // Unsupported type
+  }
+  return result;
+}
+
+size_t ComparePixels(const uint8_t* a, const uint8_t* b, size_t xsize,
+                     size_t ysize, const JxlPixelFormat& format_a,
+                     const JxlPixelFormat& format_b,
+                     double threshold_multiplier) {
+  // Convert both images to equal full precision for comparison.
+  std::vector<double> a_full = ConvertToRGBA32(a, xsize, ysize, format_a);
+  std::vector<double> b_full = ConvertToRGBA32(b, xsize, ysize, format_b);
+  bool gray_a = format_a.num_channels < 3;
+  bool gray_b = format_b.num_channels < 3;
+  bool alpha_a = !(format_a.num_channels & 1);
+  bool alpha_b = !(format_b.num_channels & 1);
+  size_t bits_a = GetPrecision(format_a.data_type);
+  size_t bits_b = GetPrecision(format_b.data_type);
+  size_t bits = std::min(bits_a, bits_b);
+  // How much distance is allowed in case of pixels with lower bit depths, given
+  // that the double precision float images use range 0-1.0.
+  // E.g. in case of 1-bit this is 0.5 since 0.499 must map to 0 and 0.501 must
+  // map to 1.
+  double precision = 0.5 * threshold_multiplier / ((1ull << bits) - 1ull);
+  if (format_a.data_type == JXL_TYPE_FLOAT16 ||
+      format_b.data_type == JXL_TYPE_FLOAT16) {
+    // Lower the precision for float16, because it currently looks like the
+    // scalar and wasm implementations of hwy have 1 less bit of precision
+    // than the x86 implementations.
+    // TODO(lode): Set the required precision back to 11 bits when possible.
+    precision = 0.5 * threshold_multiplier / ((1ull << (bits - 1)) - 1ull);
+  }
+  size_t numdiff = 0;
+  for (size_t y = 0; y < ysize; y++) {
+    for (size_t x = 0; x < xsize; x++) {
+      size_t i = (y * xsize + x) * 4;
+      bool ok = true;
+      if (gray_a || gray_b) {
+        if (!Near(a_full[i + 0], b_full[i + 0], precision)) ok = false;
+        // If the input was grayscale and the output not, then the output must
+        // have all channels equal.
+        if (gray_a && b_full[i + 0] != b_full[i + 1] &&
+            b_full[i + 2] != b_full[i + 2]) {
+          ok = false;
+        }
+      } else {
+        if (!Near(a_full[i + 0], b_full[i + 0], precision) ||
+            !Near(a_full[i + 1], b_full[i + 1], precision) ||
+            !Near(a_full[i + 2], b_full[i + 2], precision)) {
+          ok = false;
+        }
+      }
+      if (alpha_a && alpha_b) {
+        if (!Near(a_full[i + 3], b_full[i + 3], precision)) ok = false;
+      } else {
+        // If the input had no alpha channel, the output should be opaque
+        // after roundtrip.
+        if (alpha_b && !Near(1.0, b_full[i + 3], precision)) ok = false;
+      }
+      if (!ok) numdiff++;
+    }
+  }
+  return numdiff;
+}
+
+double DistanceRMS(const uint8_t* a, const uint8_t* b, size_t xsize,
+                   size_t ysize, const JxlPixelFormat& format) {
+  // Convert both images to equal full precision for comparison.
+  std::vector<double> a_full = ConvertToRGBA32(a, xsize, ysize, format);
+  std::vector<double> b_full = ConvertToRGBA32(b, xsize, ysize, format);
+  double sum = 0.0;
+  for (size_t y = 0; y < ysize; y++) {
+    double row_sum = 0.0;
+    for (size_t x = 0; x < xsize; x++) {
+      size_t i = (y * xsize + x) * 4;
+      for (size_t c = 0; c < format.num_channels; ++c) {
+        double diff = a_full[i + c] - b_full[i + c];
+        row_sum += diff * diff;
+      }
+    }
+    sum += row_sum;
+  }
+  sum /= (xsize * ysize);
+  return sqrt(sum);
+}
+
+float ButteraugliDistance(const extras::PackedPixelFile& a,
+                          const extras::PackedPixelFile& b, ThreadPool* pool) {
+  CodecInOut io0;
+  JXL_CHECK(ConvertPackedPixelFileToCodecInOut(a, pool, &io0));
+  CodecInOut io1;
+  JXL_CHECK(ConvertPackedPixelFileToCodecInOut(b, pool, &io1));
+  // TODO(eustas): simplify?
+  return ButteraugliDistance(io0.frames, io1.frames, ButteraugliParams(),
+                             GetJxlCms(),
+                             /*distmap=*/nullptr, pool);
+}
+
+float Butteraugli3Norm(const extras::PackedPixelFile& a,
+                       const extras::PackedPixelFile& b, ThreadPool* pool) {
+  CodecInOut io0;
+  JXL_CHECK(ConvertPackedPixelFileToCodecInOut(a, pool, &io0));
+  CodecInOut io1;
+  JXL_CHECK(ConvertPackedPixelFileToCodecInOut(b, pool, &io1));
+  ButteraugliParams ba;
+  ImageF distmap;
+  ButteraugliDistance(io0.frames, io1.frames, ba, GetJxlCms(), &distmap, pool);
+  return ComputeDistanceP(distmap, ba, 3);
+}
+
+float ComputeDistance2(const extras::PackedPixelFile& a,
+                       const extras::PackedPixelFile& b) {
+  CodecInOut io0;
+  JXL_CHECK(ConvertPackedPixelFileToCodecInOut(a, nullptr, &io0));
+  CodecInOut io1;
+  JXL_CHECK(ConvertPackedPixelFileToCodecInOut(b, nullptr, &io1));
+  return ComputeDistance2(io0.Main(), io1.Main(), GetJxlCms());
+}
+
+bool SameAlpha(const extras::PackedPixelFile& a,
+               const extras::PackedPixelFile& b) {
+  JXL_CHECK(a.info.xsize == b.info.xsize);
+  JXL_CHECK(a.info.ysize == b.info.ysize);
+  JXL_CHECK(a.info.alpha_bits == b.info.alpha_bits);
+  JXL_CHECK(a.info.alpha_exponent_bits == b.info.alpha_exponent_bits);
+  JXL_CHECK(a.info.alpha_bits > 0);
+  JXL_CHECK(a.frames.size() == b.frames.size());
+  for (size_t i = 0; i < a.frames.size(); ++i) {
+    const extras::PackedImage& color_a = a.frames[i].color;
+    const extras::PackedImage& color_b = b.frames[i].color;
+    JXL_CHECK(color_a.format.num_channels == color_b.format.num_channels);
+    JXL_CHECK(color_a.format.data_type == color_b.format.data_type);
+    JXL_CHECK(color_a.format.endianness == color_b.format.endianness);
+    JXL_CHECK(color_a.pixels_size == color_b.pixels_size);
+    size_t pwidth =
+        extras::PackedImage::BitsPerChannel(color_a.format.data_type) / 8;
+    size_t num_color = color_a.format.num_channels < 3 ? 1 : 3;
+    const uint8_t* p_a = reinterpret_cast<const uint8_t*>(color_a.pixels());
+    const uint8_t* p_b = reinterpret_cast<const uint8_t*>(color_b.pixels());
+    for (size_t y = 0; y < a.info.ysize; ++y) {
+      for (size_t x = 0; x < a.info.xsize; ++x) {
+        size_t idx =
+            ((y * a.info.xsize + x) * color_a.format.num_channels + num_color) *
+            pwidth;
+        if (memcmp(&p_a[idx], &p_b[idx], pwidth) != 0) {
+          return false;
+        }
+      }
+    }
+  }
+  return true;
+}
+
+bool SamePixels(const extras::PackedImage& a, const extras::PackedImage& b) {
+  JXL_CHECK(a.xsize == b.xsize);
+  JXL_CHECK(a.ysize == b.ysize);
+  JXL_CHECK(a.format.num_channels == b.format.num_channels);
+  JXL_CHECK(a.format.data_type == b.format.data_type);
+  JXL_CHECK(a.format.endianness == b.format.endianness);
+  JXL_CHECK(a.pixels_size == b.pixels_size);
+  const uint8_t* p_a = reinterpret_cast<const uint8_t*>(a.pixels());
+  const uint8_t* p_b = reinterpret_cast<const uint8_t*>(b.pixels());
+  for (size_t y = 0; y < a.ysize; ++y) {
+    for (size_t x = 0; x < a.xsize; ++x) {
+      size_t idx = (y * a.xsize + x) * a.pixel_stride();
+      if (memcmp(&p_a[idx], &p_b[idx], a.pixel_stride()) != 0) {
+        printf("Mismatch at row %" PRIuS " col %" PRIuS "\n", y, x);
+        printf("  a: ");
+        for (size_t j = 0; j < a.pixel_stride(); ++j) {
+          printf(" %3u", p_a[idx + j]);
+        }
+        printf("\n  b: ");
+        for (size_t j = 0; j < a.pixel_stride(); ++j) {
+          printf(" %3u", p_b[idx + j]);
+        }
+        printf("\n");
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
+bool SamePixels(const extras::PackedPixelFile& a,
+                const extras::PackedPixelFile& b) {
+  JXL_CHECK(a.info.xsize == b.info.xsize);
+  JXL_CHECK(a.info.ysize == b.info.ysize);
+  JXL_CHECK(a.info.bits_per_sample == b.info.bits_per_sample);
+  JXL_CHECK(a.info.exponent_bits_per_sample == b.info.exponent_bits_per_sample);
+  JXL_CHECK(a.frames.size() == b.frames.size());
+  for (size_t i = 0; i < a.frames.size(); ++i) {
+    const auto& frame_a = a.frames[i];
+    const auto& frame_b = b.frames[i];
+    if (!SamePixels(frame_a.color, frame_b.color)) {
+      return false;
+    }
+    JXL_CHECK(frame_a.extra_channels.size() == frame_b.extra_channels.size());
+    for (size_t j = 0; j < frame_a.extra_channels.size(); ++j) {
+      if (!SamePixels(frame_a.extra_channels[i], frame_b.extra_channels[i])) {
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
+}  // namespace test
+
+bool operator==(const jxl::PaddedBytes& a, const jxl::PaddedBytes& b) {
+  if (a.size() != b.size()) return false;
+  if (memcmp(a.data(), b.data(), a.size()) != 0) return false;
+  return true;
+}
+
+// Allow using EXPECT_EQ on jxl::PaddedBytes
+bool operator!=(const jxl::PaddedBytes& a, const jxl::PaddedBytes& b) {
+  return !(a == b);
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/test_utils.h b/third_party/jpeg-xl/lib/jxl/test_utils.h
new file mode 100644
index 0000000000..8c5cd434f5
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/test_utils.h
@@ -0,0 +1,175 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_TEST_UTILS_H_
+#define LIB_JXL_TEST_UTILS_H_
+
+// TODO(eustas): reduce includes (move to .cc)
+
+// Macros and functions useful for tests.
+
+#include <jxl/codestream_header.h>
+#include <jxl/thread_parallel_runner_cxx.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include <ostream>
+#include <vector>
+
+#include "lib/extras/dec/jxl.h"
+#include "lib/extras/enc/jxl.h"
+#include "lib/extras/packed_image.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/padded_bytes.h"
+#include "lib/jxl/base/span.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/codec_in_out.h"
+#include "lib/jxl/color_encoding_internal.h"
+#include "lib/jxl/enc_params.h"
+
+namespace jxl {
+
+struct AuxOut;
+
+namespace test {
+
+std::string GetTestDataPath(const std::string& filename);
+PaddedBytes ReadTestData(const std::string& filename);
+
+void JxlBasicInfoSetFromPixelFormat(JxlBasicInfo* basic_info,
+                                    const JxlPixelFormat* pixel_format);
+
+template <typename Params>
+void SetThreadParallelRunner(Params params, ThreadPool* pool) {
+  if (pool && !params.runner_opaque) {
+    params.runner = pool->runner();
+    params.runner_opaque = pool->runner_opaque();
+  }
+}
+
+Status DecodeFile(extras::JXLDecompressParams dparams,
+                  const Span<const uint8_t> file, CodecInOut* JXL_RESTRICT io,
+                  ThreadPool* pool = nullptr);
+
+bool Roundtrip(const CodecInOut* io, const CompressParams& cparams,
+               extras::JXLDecompressParams dparams,
+               CodecInOut* JXL_RESTRICT io2, std::stringstream& failures,
+               size_t* compressed_size = nullptr, ThreadPool* pool = nullptr,
+               AuxOut* aux_out = nullptr);
+
+// Returns compressed size [bytes].
+size_t Roundtrip(const extras::PackedPixelFile& ppf_in,
+                 extras::JXLCompressParams cparams,
+                 extras::JXLDecompressParams dparams, ThreadPool* pool,
+                 extras::PackedPixelFile* ppf_out);
+
+// A POD descriptor of a ColorEncoding. Only used in tests as the return value
+// of AllEncodings().
+struct ColorEncodingDescriptor {
+  ColorSpace color_space;
+  WhitePoint white_point;
+  Primaries primaries;
+  TransferFunction tf;
+  RenderingIntent rendering_intent;
+};
+
+ColorEncoding ColorEncodingFromDescriptor(const ColorEncodingDescriptor& desc);
+
+// Define the operator<< for tests.
+static inline ::std::ostream& operator<<(::std::ostream& os,
+                                         const ColorEncodingDescriptor& c) {
+  return os << "ColorEncoding/" << Description(ColorEncodingFromDescriptor(c));
+}
+
+// Returns ColorEncodingDescriptors, which are only used in tests. To obtain a
+// ColorEncoding object call ColorEncodingFromDescriptor and then call
+// ColorEncoding::CreateProfile() on that object to generate a profile.
+std::vector<ColorEncodingDescriptor> AllEncodings();
+
+// Returns a CodecInOut based on the buf, xsize, ysize, and the assumption
+// that the buffer was created using `GetSomeTestImage`.
+jxl::CodecInOut SomeTestImageToCodecInOut(const std::vector<uint8_t>& buf,
+                                          size_t num_channels, size_t xsize,
+                                          size_t ysize);
+
+bool Near(double expected, double value, double max_dist);
+
+// Based on highway scalar implementation, for testing
+float LoadFloat16(uint16_t bits16);
+
+float LoadLEFloat16(const uint8_t* p);
+
+float LoadBEFloat16(const uint8_t* p);
+
+size_t GetPrecision(JxlDataType data_type);
+
+size_t GetDataBits(JxlDataType data_type);
+
+// Procedure to convert pixels to double precision, not efficient, but
+// well-controlled for testing. It uses double, to be able to represent all
+// precisions needed for the maximum data types the API supports: uint32_t
+// integers, and, single precision float. The values are in range 0-1 for SDR.
+std::vector<double> ConvertToRGBA32(const uint8_t* pixels, size_t xsize,
+                                    size_t ysize, const JxlPixelFormat& format,
+                                    double factor = 0.0);
+
+// Returns amount of pixels which differ between the two pictures. Image b is
+// the image after roundtrip after roundtrip, image a before roundtrip. There
+// are more strict requirements for the alpha channel and grayscale values of
+// the output image.
+size_t ComparePixels(const uint8_t* a, const uint8_t* b, size_t xsize,
+                     size_t ysize, const JxlPixelFormat& format_a,
+                     const JxlPixelFormat& format_b,
+                     double threshold_multiplier = 1.0);
+
+double DistanceRMS(const uint8_t* a, const uint8_t* b, size_t xsize,
+                   size_t ysize, const JxlPixelFormat& format);
+
+float ButteraugliDistance(const extras::PackedPixelFile& a,
+                          const extras::PackedPixelFile& b,
+                          ThreadPool* pool = nullptr);
+
+float Butteraugli3Norm(const extras::PackedPixelFile& a,
+                       const extras::PackedPixelFile& b,
+                       ThreadPool* pool = nullptr);
+
+float ComputeDistance2(const extras::PackedPixelFile& a,
+                       const extras::PackedPixelFile& b);
+
+bool SameAlpha(const extras::PackedPixelFile& a,
+               const extras::PackedPixelFile& b);
+
+bool SamePixels(const extras::PackedImage& a, const extras::PackedImage& b);
+
+bool SamePixels(const extras::PackedPixelFile& a,
+                const extras::PackedPixelFile& b);
+
+class ThreadPoolForTests {
+ public:
+  explicit ThreadPoolForTests(int num_threads) {
+    runner_ =
+        JxlThreadParallelRunnerMake(/* memory_manager */ nullptr, num_threads);
+    pool_ =
+        jxl::make_unique<ThreadPool>(JxlThreadParallelRunner, runner_.get());
+  }
+  ThreadPoolForTests(const ThreadPoolForTests&) = delete;
+  ThreadPoolForTests& operator&(const ThreadPoolForTests&) = delete;
+  ThreadPool* operator&() { return pool_.get(); }
+
+ private:
+  JxlThreadParallelRunnerPtr runner_;
+  std::unique_ptr<ThreadPool> pool_;
+};
+
+}  // namespace test
+
+bool operator==(const jxl::PaddedBytes& a, const jxl::PaddedBytes& b);
+
+// Allow using EXPECT_EQ on jxl::PaddedBytes
+bool operator!=(const jxl::PaddedBytes& a, const jxl::PaddedBytes& b);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_TEST_UTILS_H_
diff --git a/third_party/jpeg-xl/lib/jxl/testing.h b/third_party/jpeg-xl/lib/jxl/testing.h
new file mode 100644
index 0000000000..d10b0c3c54
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/testing.h
@@ -0,0 +1,73 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_TESTING_H_
+#define LIB_JXL_TESTING_H_
+
+// GTest/GMock specific macros / wrappers.
+
+// gmock unconditionally redefines those macros (to wrong values).
+// Lets include it only here and mitigate the problem.
+#pragma push_macro("PRIdS")
+#pragma push_macro("PRIuS")
+#include "gmock/gmock.h"
+#pragma pop_macro("PRIuS")
+#pragma pop_macro("PRIdS")
+
+#include <sstream>
+
+#include "gtest/gtest.h"
+
+#ifdef JXL_DISABLE_SLOW_TESTS
+#define JXL_SLOW_TEST(X) DISABLED_##X
+#else
+#define JXL_SLOW_TEST(X) X
+#endif  // JXL_DISABLE_SLOW_TESTS
+
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
+#define JXL_TRANSCODE_JPEG_TEST(X) X
+#else
+#define JXL_TRANSCODE_JPEG_TEST(X) DISABLED_##X
+#endif  // JPEGXL_ENABLE_TRANSCODE_JPEG
+
+#if JPEGXL_ENABLE_BOXES
+#define JXL_BOXES_TEST(X) X
+#else
+#define JXL_BOXES_TEST(X) DISABLED_##X
+#endif  // JPEGXL_ENABLE_BOXES
+
+#ifdef THREAD_SANITIZER
+#define JXL_TSAN_SLOW_TEST(X) DISABLED_##X
+#else
+#define JXL_TSAN_SLOW_TEST(X) X
+#endif  // THREAD_SANITIZER
+
+// googletest before 1.10 didn't define INSTANTIATE_TEST_SUITE_P() but instead
+// used INSTANTIATE_TEST_CASE_P which is now deprecated.
+#ifdef INSTANTIATE_TEST_SUITE_P
+#define JXL_GTEST_INSTANTIATE_TEST_SUITE_P INSTANTIATE_TEST_SUITE_P
+#else
+#define JXL_GTEST_INSTANTIATE_TEST_SUITE_P INSTANTIATE_TEST_CASE_P
+#endif
+
+// Ensures that we don't make our test bounds too lax, effectively disabling the
+// tests.
+MATCHER_P(IsSlightlyBelow, max, "") {
+  return max * 0.75 <= arg && arg <= max * 1.0;
+}
+
+#define JXL_EXPECT_OK(F)       \
+  {                            \
+    std::stringstream _;       \
+    EXPECT_TRUE(F) << _.str(); \
+  }
+
+#define JXL_ASSERT_OK(F)       \
+  {                            \
+    std::stringstream _;       \
+    ASSERT_TRUE(F) << _.str(); \
+  }
+
+#endif  // LIB_JXL_TESTING_H_
diff --git a/third_party/jpeg-xl/lib/jxl/tf_gbench.cc b/third_party/jpeg-xl/lib/jxl/tf_gbench.cc
new file mode 100644
index 0000000000..9c010d460a
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/tf_gbench.cc
@@ -0,0 +1,143 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "benchmark/benchmark.h"
+#include "lib/jxl/image_ops.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/tf_gbench.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/transfer_functions-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+namespace {
+
+#define RUN_BENCHMARK(F)                                            \
+  constexpr size_t kNum = 1 << 12;                                  \
+  HWY_FULL(float) d;                                                \
+  /* Three parallel runs, as this will run on R, G and B. */        \
+  auto sum1 = Zero(d);                                              \
+  auto sum2 = Zero(d);                                              \
+  auto sum3 = Zero(d);                                              \
+  for (auto _ : state) {                                            \
+    auto x = Set(d, 1e-5);                                          \
+    auto v1 = Set(d, 1e-5);                                         \
+    auto v2 = Set(d, 1.1e-5);                                       \
+    auto v3 = Set(d, 1.2e-5);                                       \
+    for (size_t i = 0; i < kNum; i++) {                             \
+      sum1 += F(d, v1);                                             \
+      sum2 += F(d, v2);                                             \
+      sum3 += F(d, v3);                                             \
+      v1 += x;                                                      \
+      v2 += x;                                                      \
+      v3 += x;                                                      \
+    }                                                               \
+  }                                                                 \
+  /* floats per second */                                           \
+  state.SetItemsProcessed(kNum* state.iterations() * Lanes(d) * 3); \
+  benchmark::DoNotOptimize(sum1 + sum2 + sum3);
+
+#define RUN_BENCHMARK_SCALAR(F)                              \
+  constexpr size_t kNum = 1 << 12;                           \
+  /* Three parallel runs, as this will run on R, G and B. */ \
+  float sum1 = 0, sum2 = 0, sum3 = 0;                        \
+  for (auto _ : state) {                                     \
+    float x = 1e-5;                                          \
+    float v1 = 1e-5;                                         \
+    float v2 = 1.1e-5;                                       \
+    float v3 = 1.2e-5;                                       \
+    for (size_t i = 0; i < kNum; i++) {                      \
+      sum1 += F(v1);                                         \
+      sum2 += F(v2);                                         \
+      sum3 += F(v3);                                         \
+      v1 += x;                                               \
+      v2 += x;                                               \
+      v3 += x;                                               \
+    }                                                        \
+  }                                                          \
+  /* floats per second */                                    \
+  state.SetItemsProcessed(kNum* state.iterations() * 3);     \
+  benchmark::DoNotOptimize(sum1 + sum2 + sum3);
+
+HWY_NOINLINE void BM_FastSRGB(benchmark::State& state) {
+  RUN_BENCHMARK(FastLinearToSRGB);
+}
+
+HWY_NOINLINE void BM_TFSRGB(benchmark::State& state) {
+  RUN_BENCHMARK(TF_SRGB().EncodedFromDisplay);
+}
+
+HWY_NOINLINE void BM_PQDFE(benchmark::State& state) {
+  RUN_BENCHMARK(TF_PQ().DisplayFromEncoded);
+}
+
+HWY_NOINLINE void BM_PQEFD(benchmark::State& state) {
+  RUN_BENCHMARK(TF_PQ().EncodedFromDisplay);
+}
+
+HWY_NOINLINE void BM_PQSlowDFE(benchmark::State& state) {
+  RUN_BENCHMARK_SCALAR(TF_PQ().DisplayFromEncoded);
+}
+
+HWY_NOINLINE void BM_PQSlowEFD(benchmark::State& state) {
+  RUN_BENCHMARK_SCALAR(TF_PQ().EncodedFromDisplay);
+}
+}  // namespace
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+namespace {
+
+HWY_EXPORT(BM_FastSRGB);
+HWY_EXPORT(BM_TFSRGB);
+HWY_EXPORT(BM_PQDFE);
+HWY_EXPORT(BM_PQEFD);
+HWY_EXPORT(BM_PQSlowDFE);
+HWY_EXPORT(BM_PQSlowEFD);
+
+float SRGB_pow(float x) {
+  return x < 0.0031308f ? 12.92f * x : 1.055f * powf(x, 1.0f / 2.4f) - 0.055f;
+}
+
+void BM_FastSRGB(benchmark::State& state) {
+  HWY_DYNAMIC_DISPATCH(BM_FastSRGB)(state);
+}
+void BM_TFSRGB(benchmark::State& state) {
+  HWY_DYNAMIC_DISPATCH(BM_TFSRGB)(state);
+}
+void BM_PQDFE(benchmark::State& state) {
+  HWY_DYNAMIC_DISPATCH(BM_PQDFE)(state);
+}
+void BM_PQEFD(benchmark::State& state) {
+  HWY_DYNAMIC_DISPATCH(BM_PQEFD)(state);
+}
+void BM_PQSlowDFE(benchmark::State& state) {
+  HWY_DYNAMIC_DISPATCH(BM_PQSlowDFE)(state);
+}
+void BM_PQSlowEFD(benchmark::State& state) {
+  HWY_DYNAMIC_DISPATCH(BM_PQSlowEFD)(state);
+}
+
+void BM_SRGB_pow(benchmark::State& state) { RUN_BENCHMARK_SCALAR(SRGB_pow); }
+
+BENCHMARK(BM_FastSRGB);
+BENCHMARK(BM_TFSRGB);
+BENCHMARK(BM_SRGB_pow);
+BENCHMARK(BM_PQDFE);
+BENCHMARK(BM_PQEFD);
+BENCHMARK(BM_PQSlowDFE);
+BENCHMARK(BM_PQSlowEFD);
+
+}  // namespace
+}  // namespace jxl
+#endif
diff --git a/third_party/jpeg-xl/lib/jxl/toc.cc b/third_party/jpeg-xl/lib/jxl/toc.cc
new file mode 100644
index 0000000000..fd7740c144
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/toc.cc
@@ -0,0 +1,105 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/toc.h"
+
+#include <stdint.h>
+
+#include "lib/jxl/coeff_order.h"
+#include "lib/jxl/coeff_order_fwd.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/fields.h"
+
+namespace jxl {
+size_t MaxBits(const size_t num_sizes) {
+  const size_t entry_bits = U32Coder::MaxEncodedBits(kTocDist) * num_sizes;
+  // permutation bit (not its tokens!), padding, entries, padding.
+  return 1 + kBitsPerByte + entry_bits + kBitsPerByte;
+}
+
+Status ReadToc(size_t toc_entries, BitReader* JXL_RESTRICT reader,
+               std::vector<uint32_t>* JXL_RESTRICT sizes,
+               std::vector<coeff_order_t>* JXL_RESTRICT permutation) {
+  if (toc_entries > 65536) {
+    // Prevent out of memory if invalid JXL codestream causes a bogus amount
+    // of toc_entries such as 2720436919446 to be computed.
+    // TODO(lode): verify whether 65536 is a reasonable upper bound
+    return JXL_FAILURE("too many toc entries");
+  }
+
+  sizes->clear();
+  sizes->resize(toc_entries);
+  if (reader->TotalBitsConsumed() >= reader->TotalBytes() * kBitsPerByte) {
+    return JXL_STATUS(StatusCode::kNotEnoughBytes, "Not enough bytes for TOC");
+  }
+  const auto check_bit_budget = [&](size_t num_entries) -> Status {
+    // U32Coder reads 2 bits to recognize variant and kTocDist cheapest variant
+    // is Bits(10), this way at least 12 bits are required per toc-entry.
+    size_t minimal_bit_cost = num_entries * (2 + 10);
+    size_t bit_budget = reader->TotalBytes() * 8;
+    size_t expenses = reader->TotalBitsConsumed();
+    if ((expenses <= bit_budget) &&
+        (minimal_bit_cost <= bit_budget - expenses)) {
+      return true;
+    }
+    return JXL_STATUS(StatusCode::kNotEnoughBytes, "Not enough bytes for TOC");
+  };
+
+  JXL_DASSERT(toc_entries > 0);
+  if (reader->ReadFixedBits<1>() == 1) {
+    JXL_RETURN_IF_ERROR(check_bit_budget(toc_entries));
+    permutation->resize(toc_entries);
+    JXL_RETURN_IF_ERROR(DecodePermutation(/*skip=*/0, toc_entries,
+                                          permutation->data(), reader));
+  }
+  JXL_RETURN_IF_ERROR(reader->JumpToByteBoundary());
+  JXL_RETURN_IF_ERROR(check_bit_budget(toc_entries));
+  for (size_t i = 0; i < toc_entries; ++i) {
+    (*sizes)[i] = U32Coder::Read(kTocDist, reader);
+  }
+  JXL_RETURN_IF_ERROR(reader->JumpToByteBoundary());
+  JXL_RETURN_IF_ERROR(check_bit_budget(0));
+  return true;
+}
+
+Status ReadGroupOffsets(size_t toc_entries, BitReader* JXL_RESTRICT reader,
+                        std::vector<uint64_t>* JXL_RESTRICT offsets,
+                        std::vector<uint32_t>* JXL_RESTRICT sizes,
+                        uint64_t* total_size) {
+  std::vector<coeff_order_t> permutation;
+  JXL_RETURN_IF_ERROR(ReadToc(toc_entries, reader, sizes, &permutation));
+
+  offsets->clear();
+  offsets->resize(toc_entries);
+
+  // Prefix sum starting with 0 and ending with the offset of the last group
+  uint64_t offset = 0;
+  for (size_t i = 0; i < toc_entries; ++i) {
+    if (offset + (*sizes)[i] < offset) {
+      return JXL_FAILURE("group offset overflow");
+    }
+    (*offsets)[i] = offset;
+    offset += (*sizes)[i];
+  }
+  if (total_size) {
+    *total_size = offset;
+  }
+
+  if (!permutation.empty()) {
+    std::vector<uint64_t> permuted_offsets;
+    std::vector<uint32_t> permuted_sizes;
+    permuted_offsets.reserve(toc_entries);
+    permuted_sizes.reserve(toc_entries);
+    for (coeff_order_t index : permutation) {
+      permuted_offsets.push_back((*offsets)[index]);
+      permuted_sizes.push_back((*sizes)[index]);
+    }
+    std::swap(*offsets, permuted_offsets);
+    std::swap(*sizes, permuted_sizes);
+  }
+
+  return true;
+}
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/toc.h b/third_party/jpeg-xl/lib/jxl/toc.h
new file mode 100644
index 0000000000..a97197ad45
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/toc.h
@@ -0,0 +1,55 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_TOC_H_
+#define LIB_JXL_TOC_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <vector>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/coeff_order_fwd.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/field_encodings.h"
+
+namespace jxl {
+
+// (2+bits) = 2,3,4 bytes so encoders can patch TOC after encoding.
+// 30 is sufficient for 4K channels of uncompressed 16-bit samples.
+constexpr U32Enc kTocDist(Bits(10), BitsOffset(14, 1024), BitsOffset(22, 17408),
+                          BitsOffset(30, 4211712));
+
+size_t MaxBits(const size_t num_sizes);
+
+// TODO(veluca): move these to FrameDimensions.
+static JXL_INLINE size_t AcGroupIndex(size_t pass, size_t group,
+                                      size_t num_groups, size_t num_dc_groups,
+                                      bool has_ac_global) {
+  return 1 + num_dc_groups + static_cast<size_t>(has_ac_global) +
+         pass * num_groups + group;
+}
+
+static JXL_INLINE size_t NumTocEntries(size_t num_groups, size_t num_dc_groups,
+                                       size_t num_passes, bool has_ac_global) {
+  if (num_groups == 1 && num_passes == 1) return 1;
+  return AcGroupIndex(0, 0, num_groups, num_dc_groups, has_ac_global) +
+         num_groups * num_passes;
+}
+
+Status ReadToc(size_t toc_entries, BitReader* JXL_RESTRICT reader,
+               std::vector<uint32_t>* JXL_RESTRICT sizes,
+               std::vector<coeff_order_t>* JXL_RESTRICT permutation);
+
+Status ReadGroupOffsets(size_t toc_entries, BitReader* JXL_RESTRICT reader,
+                        std::vector<uint64_t>* JXL_RESTRICT offsets,
+                        std::vector<uint32_t>* JXL_RESTRICT sizes,
+                        uint64_t* total_size);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_TOC_H_
diff --git a/third_party/jpeg-xl/lib/jxl/toc_test.cc b/third_party/jpeg-xl/lib/jxl/toc_test.cc
new file mode 100644
index 0000000000..a7f0f2c27b
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/toc_test.cc
@@ -0,0 +1,92 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/toc.h"
+
+#include "lib/jxl/base/random.h"
+#include "lib/jxl/base/span.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/enc_aux_out.h"
+#include "lib/jxl/enc_toc.h"
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+namespace {
+
+void Roundtrip(size_t num_entries, bool permute, Rng* rng) {
+  // Generate a random permutation.
+  std::vector<coeff_order_t> permutation(num_entries);
+  std::vector<coeff_order_t> inv_permutation(num_entries);
+  for (size_t i = 0; i < num_entries; i++) {
+    permutation[i] = i;
+    inv_permutation[i] = i;
+  }
+  if (permute) {
+    rng->Shuffle(permutation.data(), permutation.size());
+    for (size_t i = 0; i < num_entries; i++) {
+      inv_permutation[permutation[i]] = i;
+    }
+  }
+
+  // Generate num_entries groups of random (byte-aligned) length
+  std::vector<BitWriter> group_codes(num_entries);
+  for (BitWriter& writer : group_codes) {
+    const size_t max_bits = (*rng)() & 0xFFF;
+    BitWriter::Allotment allotment(&writer, max_bits + kBitsPerByte);
+    size_t i = 0;
+    for (; i + BitWriter::kMaxBitsPerCall < max_bits;
+         i += BitWriter::kMaxBitsPerCall) {
+      writer.Write(BitWriter::kMaxBitsPerCall, 0);
+    }
+    for (; i < max_bits; i += 1) {
+      writer.Write(/*n_bits=*/1, 0);
+    }
+    writer.ZeroPadToByte();
+    AuxOut aux_out;
+    allotment.ReclaimAndCharge(&writer, 0, &aux_out);
+  }
+
+  BitWriter writer;
+  AuxOut aux_out;
+  ASSERT_TRUE(WriteGroupOffsets(group_codes, permute ? &permutation : nullptr,
+                                &writer, &aux_out));
+
+  BitReader reader(writer.GetSpan());
+  std::vector<uint64_t> group_offsets;
+  std::vector<uint32_t> group_sizes;
+  uint64_t total_size;
+  ASSERT_TRUE(ReadGroupOffsets(num_entries, &reader, &group_offsets,
+                               &group_sizes, &total_size));
+  ASSERT_EQ(num_entries, group_offsets.size());
+  ASSERT_EQ(num_entries, group_sizes.size());
+  EXPECT_TRUE(reader.Close());
+
+  uint64_t prefix_sum = 0;
+  for (size_t i = 0; i < num_entries; ++i) {
+    EXPECT_EQ(prefix_sum, group_offsets[inv_permutation[i]]);
+
+    EXPECT_EQ(0u, group_codes[i].BitsWritten() % kBitsPerByte);
+    prefix_sum += group_codes[i].BitsWritten() / kBitsPerByte;
+
+    if (i + 1 < num_entries) {
+      EXPECT_EQ(
+          group_offsets[inv_permutation[i]] + group_sizes[inv_permutation[i]],
+          group_offsets[inv_permutation[i + 1]]);
+    }
+  }
+  EXPECT_EQ(prefix_sum, total_size);
+}
+
+TEST(TocTest, Test) {
+  Rng rng(0);
+  for (size_t num_entries = 1; num_entries < 10; ++num_entries) {
+    for (bool permute : std::vector<bool>{false, true}) {
+      Roundtrip(num_entries, permute, &rng);
+    }
+  }
+}
+
+}  // namespace
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/lib/jxl/transfer_functions-inl.h b/third_party/jpeg-xl/lib/jxl/transfer_functions-inl.h
new file mode 100644
index 0000000000..9f4c10c76d
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/transfer_functions-inl.h
@@ -0,0 +1,413 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Transfer functions for color encodings.
+
+#if defined(LIB_JXL_TRANSFER_FUNCTIONS_INL_H_) == defined(HWY_TARGET_TOGGLE)
+#ifdef LIB_JXL_TRANSFER_FUNCTIONS_INL_H_
+#undef LIB_JXL_TRANSFER_FUNCTIONS_INL_H_
+#else
+#define LIB_JXL_TRANSFER_FUNCTIONS_INL_H_
+#endif
+
+#include <algorithm>
+#include <cmath>
+#include <hwy/highway.h>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/fast_math-inl.h"
+#include "lib/jxl/rational_polynomial-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::And;
+using hwy::HWY_NAMESPACE::AndNot;
+using hwy::HWY_NAMESPACE::Gt;
+using hwy::HWY_NAMESPACE::IfThenElse;
+using hwy::HWY_NAMESPACE::Lt;
+using hwy::HWY_NAMESPACE::Or;
+using hwy::HWY_NAMESPACE::Sqrt;
+using hwy::HWY_NAMESPACE::TableLookupBytes;
+
+// Definitions for BT.2100-2 transfer functions (used inside/outside SIMD):
+// "display" is linear light (nits) normalized to [0, 1].
+// "encoded" is a nonlinear encoding (e.g. PQ) in [0, 1].
+// "scene" is a linear function of photon counts, normalized to [0, 1].
+
+// Despite the stated ranges, we need unbounded transfer functions: see
+// http://www.littlecms.com/CIC18_UnboundedCMM.pdf. Inputs can be negative or
+// above 1 due to chromatic adaptation. To avoid severe round-trip errors caused
+// by clamping, we mirror negative inputs via copysign (f(-x) = -f(x), see
+// https://developer.apple.com/documentation/coregraphics/cgcolorspace/1644735-extendedsrgb)
+// and extend the function domains above 1.
+
+// Hybrid Log-Gamma.
+class TF_HLG {
+ public:
+  // EOTF. e = encoded.
+  JXL_INLINE double DisplayFromEncoded(const double e) const {
+    return OOTF(InvOETF(e));
+  }
+
+  // Inverse EOTF. d = display.
+  JXL_INLINE double EncodedFromDisplay(const double d) const {
+    return OETF(InvOOTF(d));
+  }
+
+  // Maximum error 5e-7.
+  template <class D, class V>
+  JXL_INLINE V EncodedFromDisplay(D d, V x) const {
+    const hwy::HWY_NAMESPACE::Rebind<uint32_t, D> du;
+    const V kSign = BitCast(d, Set(du, 0x80000000u));
+    const V original_sign = And(x, kSign);
+    x = AndNot(kSign, x);  // abs
+    const V below_div12 = Sqrt(Mul(Set(d, 3.0f), x));
+    const V e =
+        MulAdd(Set(d, kA * 0.693147181f),
+               FastLog2f(d, MulAdd(Set(d, 12), x, Set(d, -kB))), Set(d, kC));
+    const V magnitude = IfThenElse(Le(x, Set(d, kDiv12)), below_div12, e);
+    return Or(AndNot(kSign, magnitude), original_sign);
+  }
+
+ private:
+  // OETF (defines the HLG approach). s = scene, returns encoded.
+  JXL_INLINE double OETF(double s) const {
+    if (s == 0.0) return 0.0;
+    const double original_sign = s;
+    s = std::abs(s);
+
+    if (s <= kDiv12) return copysignf(std::sqrt(3.0 * s), original_sign);
+
+    const double e = kA * std::log(12 * s - kB) + kC;
+    JXL_ASSERT(e > 0.0);
+    return copysignf(e, original_sign);
+  }
+
+  // e = encoded, returns scene.
+  JXL_INLINE double InvOETF(double e) const {
+    if (e == 0.0) return 0.0;
+    const double original_sign = e;
+    e = std::abs(e);
+
+    if (e <= 0.5) return copysignf(e * e * (1.0 / 3), original_sign);
+
+    const double s = (std::exp((e - kC) * kRA) + kB) * kDiv12;
+    JXL_ASSERT(s >= 0);
+    return copysignf(s, original_sign);
+  }
+
+  // s = scene, returns display.
+  JXL_INLINE double OOTF(const double s) const {
+    // The actual (red channel) OOTF is RD = alpha * YS^(gamma-1) * RS, where
+    // YS = 0.2627 * RS + 0.6780 * GS + 0.0593 * BS. Let alpha = 1 so we return
+    // "display" (normalized [0, 1]) instead of nits. Our transfer function
+    // interface does not allow a dependency on YS. Fortunately, the system
+    // gamma at 334 nits is 1.0, so this reduces to RD = RS.
+    return s;
+  }
+
+  // d = display, returns scene.
+  JXL_INLINE double InvOOTF(const double d) const {
+    return d;  // see OOTF().
+  }
+
+  static constexpr double kA = 0.17883277;
+  static constexpr double kRA = 1.0 / kA;
+  static constexpr double kB = 1 - 4 * kA;
+  static constexpr double kC = 0.5599107295;
+  static constexpr double kDiv12 = 1.0 / 12;
+};
+
+class TF_709 {
+ public:
+  JXL_INLINE double EncodedFromDisplay(const double d) const {
+    if (d < kThresh) return kMulLow * d;
+    return kMulHi * std::pow(d, kPowHi) + kSub;
+  }
+
+  // Maximum error 1e-6.
+  template <class D, class V>
+  JXL_INLINE V EncodedFromDisplay(D d, V x) const {
+    auto low = Mul(Set(d, kMulLow), x);
+    auto hi =
+        MulAdd(Set(d, kMulHi), FastPowf(d, x, Set(d, kPowHi)), Set(d, kSub));
+    return IfThenElse(Le(x, Set(d, kThresh)), low, hi);
+  }
+
+  template <class D, class V>
+  JXL_INLINE V DisplayFromEncoded(D d, V x) const {
+    auto low = Mul(Set(d, kInvMulLow), x);
+    auto hi = FastPowf(d, MulAdd(x, Set(d, kInvMulHi), Set(d, kInvAdd)),
+                       Set(d, kInvPowHi));
+    return IfThenElse(Lt(x, Set(d, kInvThresh)), low, hi);
+  }
+
+ private:
+  static constexpr double kThresh = 0.018;
+  static constexpr double kMulLow = 4.5;
+  static constexpr double kMulHi = 1.099;
+  static constexpr double kPowHi = 0.45;
+  static constexpr double kSub = -0.099;
+
+  static constexpr double kInvThresh = 0.081;
+  static constexpr double kInvMulLow = 1 / 4.5;
+  static constexpr double kInvMulHi = 1 / 1.099;
+  static constexpr double kInvPowHi = 1 / 0.45;
+  static constexpr double kInvAdd = 0.099 * kInvMulHi;
+};
+
+// Perceptual Quantization
+class TF_PQ {
+ public:
+  // EOTF (defines the PQ approach). e = encoded.
+  JXL_INLINE double DisplayFromEncoded(double e) const {
+    if (e == 0.0) return 0.0;
+    const double original_sign = e;
+    e = std::abs(e);
+
+    const double xp = std::pow(e, 1.0 / kM2);
+    const double num = std::max(xp - kC1, 0.0);
+    const double den = kC2 - kC3 * xp;
+    JXL_DASSERT(den != 0.0);
+    const double d = std::pow(num / den, 1.0 / kM1);
+    JXL_DASSERT(d >= 0.0);  // Equal for e ~= 1E-9
+    return copysignf(d, original_sign);
+  }
+
+  // Maximum error 3e-6
+  template <class D, class V>
+  JXL_INLINE V DisplayFromEncoded(D d, V x) const {
+    const hwy::HWY_NAMESPACE::Rebind<uint32_t, D> du;
+    const V kSign = BitCast(d, Set(du, 0x80000000u));
+    const V original_sign = And(x, kSign);
+    x = AndNot(kSign, x);  // abs
+    // 4-over-4-degree rational polynomial approximation on x+x*x. This improves
+    // the maximum error by about 5x over a rational polynomial for x.
+    auto xpxx = MulAdd(x, x, x);
+    HWY_ALIGN constexpr float p[(4 + 1) * 4] = {
+        HWY_REP4(2.62975656e-04f), HWY_REP4(-6.23553089e-03f),
+        HWY_REP4(7.38602301e-01f), HWY_REP4(2.64553172e+00f),
+        HWY_REP4(5.50034862e-01f),
+    };
+    HWY_ALIGN constexpr float q[(4 + 1) * 4] = {
+        HWY_REP4(4.21350107e+02f), HWY_REP4(-4.28736818e+02f),
+        HWY_REP4(1.74364667e+02f), HWY_REP4(-3.39078883e+01f),
+        HWY_REP4(2.67718770e+00f),
+    };
+    auto magnitude = EvalRationalPolynomial(d, xpxx, p, q);
+    return Or(AndNot(kSign, magnitude), original_sign);
+  }
+
+  // Inverse EOTF. d = display.
+  JXL_INLINE double EncodedFromDisplay(double d) const {
+    if (d == 0.0) return 0.0;
+    const double original_sign = d;
+    d = std::abs(d);
+
+    const double xp = std::pow(d, kM1);
+    const double num = kC1 + xp * kC2;
+    const double den = 1.0 + xp * kC3;
+    const double e = std::pow(num / den, kM2);
+    JXL_DASSERT(e > 0.0);
+    return copysignf(e, original_sign);
+  }
+
+  // Maximum error 7e-7.
+  template <class D, class V>
+  JXL_INLINE V EncodedFromDisplay(D d, V x) const {
+    const hwy::HWY_NAMESPACE::Rebind<uint32_t, D> du;
+    const V kSign = BitCast(d, Set(du, 0x80000000u));
+    const V original_sign = And(x, kSign);
+    x = AndNot(kSign, x);  // abs
+    // 4-over-4-degree rational polynomial approximation on x**0.25, with two
+    // different polynomials above and below 1e-4.
+    auto xto025 = Sqrt(Sqrt(x));
+    HWY_ALIGN constexpr float p[(4 + 1) * 4] = {
+        HWY_REP4(1.351392e-02f), HWY_REP4(-1.095778e+00f),
+        HWY_REP4(5.522776e+01f), HWY_REP4(1.492516e+02f),
+        HWY_REP4(4.838434e+01f),
+    };
+    HWY_ALIGN constexpr float q[(4 + 1) * 4] = {
+        HWY_REP4(1.012416e+00f), HWY_REP4(2.016708e+01f),
+        HWY_REP4(9.263710e+01f), HWY_REP4(1.120607e+02f),
+        HWY_REP4(2.590418e+01f),
+    };
+
+    HWY_ALIGN constexpr float plo[(4 + 1) * 4] = {
+        HWY_REP4(9.863406e-06f),  HWY_REP4(3.881234e-01f),
+        HWY_REP4(1.352821e+02f),  HWY_REP4(6.889862e+04f),
+        HWY_REP4(-2.864824e+05f),
+    };
+    HWY_ALIGN constexpr float qlo[(4 + 1) * 4] = {
+        HWY_REP4(3.371868e+01f),  HWY_REP4(1.477719e+03f),
+        HWY_REP4(1.608477e+04f),  HWY_REP4(-4.389884e+04f),
+        HWY_REP4(-2.072546e+05f),
+    };
+
+    auto magnitude = IfThenElse(Lt(x, Set(d, 1e-4f)),
+                                EvalRationalPolynomial(d, xto025, plo, qlo),
+                                EvalRationalPolynomial(d, xto025, p, q));
+    return Or(AndNot(kSign, magnitude), original_sign);
+  }
+
+ private:
+  static constexpr double kM1 = 2610.0 / 16384;
+  static constexpr double kM2 = (2523.0 / 4096) * 128;
+  static constexpr double kC1 = 3424.0 / 4096;
+  static constexpr double kC2 = (2413.0 / 4096) * 32;
+  static constexpr double kC3 = (2392.0 / 4096) * 32;
+};
+
+// sRGB
+class TF_SRGB {
+ public:
+  template <typename V>
+  JXL_INLINE V DisplayFromEncoded(V x) const {
+    const HWY_FULL(float) d;
+    const HWY_FULL(uint32_t) du;
+    const V kSign = BitCast(d, Set(du, 0x80000000u));
+    const V original_sign = And(x, kSign);
+    x = AndNot(kSign, x);  // abs
+
+    // TODO(janwas): range reduction
+    // Computed via af_cheb_rational (k=100); replicated 4x.
+    HWY_ALIGN constexpr float p[(4 + 1) * 4] = {
+        2.200248328e-04f, 2.200248328e-04f, 2.200248328e-04f, 2.200248328e-04f,
+        1.043637593e-02f, 1.043637593e-02f, 1.043637593e-02f, 1.043637593e-02f,
+        1.624820318e-01f, 1.624820318e-01f, 1.624820318e-01f, 1.624820318e-01f,
+        7.961564959e-01f, 7.961564959e-01f, 7.961564959e-01f, 7.961564959e-01f,
+        8.210152774e-01f, 8.210152774e-01f, 8.210152774e-01f, 8.210152774e-01f,
+    };
+    HWY_ALIGN constexpr float q[(4 + 1) * 4] = {
+        2.631846970e-01f,  2.631846970e-01f,  2.631846970e-01f,
+        2.631846970e-01f,  1.076976492e+00f,  1.076976492e+00f,
+        1.076976492e+00f,  1.076976492e+00f,  4.987528350e-01f,
+        4.987528350e-01f,  4.987528350e-01f,  4.987528350e-01f,
+        -5.512498495e-02f, -5.512498495e-02f, -5.512498495e-02f,
+        -5.512498495e-02f, 6.521209011e-03f,  6.521209011e-03f,
+        6.521209011e-03f,  6.521209011e-03f,
+    };
+    const V linear = Mul(x, Set(d, kLowDivInv));
+    const V poly = EvalRationalPolynomial(d, x, p, q);
+    const V magnitude =
+        IfThenElse(Gt(x, Set(d, kThreshSRGBToLinear)), poly, linear);
+    return Or(AndNot(kSign, magnitude), original_sign);
+  }
+
+  // Error ~5e-07
+  template <class D, class V>
+  JXL_INLINE V EncodedFromDisplay(D d, V x) const {
+    const hwy::HWY_NAMESPACE::Rebind<uint32_t, D> du;
+    const V kSign = BitCast(d, Set(du, 0x80000000u));
+    const V original_sign = And(x, kSign);
+    x = AndNot(kSign, x);  // abs
+
+    // Computed via af_cheb_rational (k=100); replicated 4x.
+    HWY_ALIGN constexpr float p[(4 + 1) * 4] = {
+        -5.135152395e-04f, -5.135152395e-04f, -5.135152395e-04f,
+        -5.135152395e-04f, 5.287254571e-03f,  5.287254571e-03f,
+        5.287254571e-03f,  5.287254571e-03f,  3.903842876e-01f,
+        3.903842876e-01f,  3.903842876e-01f,  3.903842876e-01f,
+        1.474205315e+00f,  1.474205315e+00f,  1.474205315e+00f,
+        1.474205315e+00f,  7.352629620e-01f,  7.352629620e-01f,
+        7.352629620e-01f,  7.352629620e-01f,
+    };
+    HWY_ALIGN constexpr float q[(4 + 1) * 4] = {
+        1.004519624e-02f, 1.004519624e-02f, 1.004519624e-02f, 1.004519624e-02f,
+        3.036675394e-01f, 3.036675394e-01f, 3.036675394e-01f, 3.036675394e-01f,
+        1.340816930e+00f, 1.340816930e+00f, 1.340816930e+00f, 1.340816930e+00f,
+        9.258482155e-01f, 9.258482155e-01f, 9.258482155e-01f, 9.258482155e-01f,
+        2.424867759e-02f, 2.424867759e-02f, 2.424867759e-02f, 2.424867759e-02f,
+    };
+    const V linear = Mul(x, Set(d, kLowDiv));
+    const V poly = EvalRationalPolynomial(d, Sqrt(x), p, q);
+    const V magnitude =
+        IfThenElse(Gt(x, Set(d, kThreshLinearToSRGB)), poly, linear);
+    return Or(AndNot(kSign, magnitude), original_sign);
+  }
+
+ private:
+  static constexpr float kThreshSRGBToLinear = 0.04045f;
+  static constexpr float kThreshLinearToSRGB = 0.0031308f;
+  static constexpr float kLowDiv = 12.92f;
+  static constexpr float kLowDivInv = 1.0f / kLowDiv;
+};
+
+// Linear to sRGB conversion with error of at most 1.2e-4.
+template <typename D, typename V>
+V FastLinearToSRGB(D d, V v) {
+  const hwy::HWY_NAMESPACE::Rebind<uint32_t, D> du;
+  const hwy::HWY_NAMESPACE::Rebind<int32_t, D> di;
+  // Convert to 0.25 - 0.5 range.
+  auto v025_05 = BitCast(
+      d, And(Or(BitCast(du, v), Set(du, 0x3e800000)), Set(du, 0x3effffff)));
+  // third degree polynomial approximation between 0.25 and 0.5
+  // of 1.055/2^(7/2.4) * x^(1/2.4) * 0.5. A degree 4 polynomial only improves
+  // accuracy by about 3x.
+  auto d1 = MulAdd(v025_05, Set(d, 0.059914046f), Set(d, -0.108894556f));
+  auto d2 = MulAdd(d1, v025_05, Set(d, 0.107963754f));
+  auto pow = MulAdd(d2, v025_05, Set(d, 0.018092343f));
+  // Compute extra multiplier depending on exponent. Valid exponent range for
+  // [0.0031308f, 1.0) is 0...8 after subtracting 118.
+  // The next three constants contain a representation of the powers of
+  // 2**(1/2.4) = 2**(5/12) times two; in particular, bits from 26 to 31 are
+  // always the same and in k2to512powers_basebits, and the two arrays contain
+  // the next groups of 8 bits. This ends up being a 22-bit representation (with
+  // a mantissa of 13 bits). The choice of polynomial to approximate is such
+  // that the multiplication factor has the highest 5 bits constant, and that
+  // the factor for the lowest possible exponent is a power of two (thus making
+  // the additional bits 0, which is used to correctly merge back together the
+  // floats).
+  constexpr uint32_t k2to512powers_basebits = 0x40000000;
+  HWY_ALIGN constexpr uint8_t k2to512powers_25to18bits[16] = {
+      0x0,  0xa,  0x19, 0x26, 0x32, 0x41, 0x4d, 0x5c,
+      0x68, 0x75, 0x83, 0x8f, 0xa0, 0xaa, 0xb9, 0xc6,
+  };
+  HWY_ALIGN constexpr uint8_t k2to512powers_17to10bits[16] = {
+      0x0,  0xb7, 0x4,  0xd,  0xcb, 0xe7, 0x41, 0x68,
+      0x51, 0xd1, 0xeb, 0xf2, 0x0,  0xb7, 0x4,  0xd,
+  };
+  // Note that vld1q_s8_x2 on ARM seems to actually be slower.
+#if HWY_TARGET != HWY_SCALAR
+  using hwy::HWY_NAMESPACE::ShiftLeft;
+  using hwy::HWY_NAMESPACE::ShiftRight;
+  // Every lane of exp is now (if cast to byte) {0, 0, 0, <index for lookup>}.
+  auto exp = Sub(ShiftRight<23>(BitCast(di, v)), Set(di, 118));
+  auto pow25to18bits = TableLookupBytes(
+      LoadDup128(di,
+                 reinterpret_cast<const int32_t*>(k2to512powers_25to18bits)),
+      exp);
+  auto pow17to10bits = TableLookupBytes(
+      LoadDup128(di,
+                 reinterpret_cast<const int32_t*>(k2to512powers_17to10bits)),
+      exp);
+  // Now, pow* contain {0, 0, 0, <part of float repr of multiplier>}. Here
+  // we take advantage of the fact that each table has its position 0 equal to
+  // 0.
+  // We can now just reassemble the float.
+  auto mul = BitCast(
+      d, Or(Or(ShiftLeft<18>(pow25to18bits), ShiftLeft<10>(pow17to10bits)),
+            Set(di, k2to512powers_basebits)));
+#else
+  // Fallback for scalar.
+  uint32_t exp = ((BitCast(di, v).raw >> 23) - 118) & 0xf;
+  auto mul = BitCast(d, Set(di, (k2to512powers_25to18bits[exp] << 18) |
+                                    (k2to512powers_17to10bits[exp] << 10) |
+                                    k2to512powers_basebits));
+#endif
+  return IfThenElse(Lt(v, Set(d, 0.0031308f)), Mul(v, Set(d, 12.92f)),
+                    MulAdd(pow, mul, Set(d, -0.055)));
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#endif  // LIB_JXL_TRANSFER_FUNCTIONS_INL_H_
diff --git a/third_party/jpeg-xl/lib/jxl/transpose-inl.h b/third_party/jpeg-xl/lib/jxl/transpose-inl.h
new file mode 100644
index 0000000000..4674420737
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/transpose-inl.h
@@ -0,0 +1,203 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Block transpose for DCT/IDCT
+
+#if defined(LIB_JXL_TRANSPOSE_INL_H_) == defined(HWY_TARGET_TOGGLE)
+#ifdef LIB_JXL_TRANSPOSE_INL_H_
+#undef LIB_JXL_TRANSPOSE_INL_H_
+#else
+#define LIB_JXL_TRANSPOSE_INL_H_
+#endif
+
+#include <stddef.h>
+
+#include <hwy/highway.h>
+#include <type_traits>
+
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/dct_block-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+namespace {
+
+#ifndef JXL_INLINE_TRANSPOSE
+// Workaround for issue #42 - (excessive?) inlining causes invalid codegen.
+#if defined(__arm__)
+#define JXL_INLINE_TRANSPOSE HWY_NOINLINE
+#else
+#define JXL_INLINE_TRANSPOSE HWY_INLINE
+#endif
+#endif  // JXL_INLINE_TRANSPOSE
+
+// Simple wrapper that ensures that a function will not be inlined.
+template <typename T, typename... Args>
+JXL_NOINLINE void NoInlineWrapper(const T& f, const Args&... args) {
+  return f(args...);
+}
+
+template <bool enabled>
+struct TransposeSimdTag {};
+
+// TODO(veluca): it's not super useful to have this in the SIMD namespace.
+template <size_t ROWS_or_0, size_t COLS_or_0, class From, class To>
+JXL_INLINE_TRANSPOSE void GenericTransposeBlock(TransposeSimdTag<false>,
+                                                const From& from, const To& to,
+                                                size_t ROWSp, size_t COLSp) {
+  size_t ROWS = ROWS_or_0 == 0 ? ROWSp : ROWS_or_0;
+  size_t COLS = COLS_or_0 == 0 ? COLSp : COLS_or_0;
+  for (size_t n = 0; n < ROWS; ++n) {
+    for (size_t m = 0; m < COLS; ++m) {
+      to.Write(from.Read(n, m), m, n);
+    }
+  }
+}
+
+// TODO(veluca): AVX3?
+#if HWY_CAP_GE256
+constexpr bool TransposeUseSimd(size_t ROWS, size_t COLS) {
+  return ROWS % 8 == 0 && COLS % 8 == 0;
+}
+
+template <size_t ROWS_or_0, size_t COLS_or_0, class From, class To>
+JXL_INLINE_TRANSPOSE void GenericTransposeBlock(TransposeSimdTag<true>,
+                                                const From& from, const To& to,
+                                                size_t ROWSp, size_t COLSp) {
+  size_t ROWS = ROWS_or_0 == 0 ? ROWSp : ROWS_or_0;
+  size_t COLS = COLS_or_0 == 0 ? COLSp : COLS_or_0;
+  static_assert(MaxLanes(BlockDesc<8>()) == 8, "Invalid descriptor size");
+  static_assert(ROWS_or_0 % 8 == 0, "Invalid number of rows");
+  static_assert(COLS_or_0 % 8 == 0, "Invalid number of columns");
+  for (size_t n = 0; n < ROWS; n += 8) {
+    for (size_t m = 0; m < COLS; m += 8) {
+      const BlockDesc<8> d;
+      auto i0 = from.LoadPart(d, n + 0, m + 0);
+      auto i1 = from.LoadPart(d, n + 1, m + 0);
+      auto i2 = from.LoadPart(d, n + 2, m + 0);
+      auto i3 = from.LoadPart(d, n + 3, m + 0);
+      auto i4 = from.LoadPart(d, n + 4, m + 0);
+      auto i5 = from.LoadPart(d, n + 5, m + 0);
+      auto i6 = from.LoadPart(d, n + 6, m + 0);
+      auto i7 = from.LoadPart(d, n + 7, m + 0);
+      // Surprisingly, this straightforward implementation (24 cycles on port5)
+      // is faster than load128+insert and LoadDup128+ConcatUpperLower+blend.
+      const auto q0 = InterleaveLower(d, i0, i2);
+      const auto q1 = InterleaveLower(d, i1, i3);
+      const auto q2 = InterleaveUpper(d, i0, i2);
+      const auto q3 = InterleaveUpper(d, i1, i3);
+      const auto q4 = InterleaveLower(d, i4, i6);
+      const auto q5 = InterleaveLower(d, i5, i7);
+      const auto q6 = InterleaveUpper(d, i4, i6);
+      const auto q7 = InterleaveUpper(d, i5, i7);
+
+      const auto r0 = InterleaveLower(d, q0, q1);
+      const auto r1 = InterleaveUpper(d, q0, q1);
+      const auto r2 = InterleaveLower(d, q2, q3);
+      const auto r3 = InterleaveUpper(d, q2, q3);
+      const auto r4 = InterleaveLower(d, q4, q5);
+      const auto r5 = InterleaveUpper(d, q4, q5);
+      const auto r6 = InterleaveLower(d, q6, q7);
+      const auto r7 = InterleaveUpper(d, q6, q7);
+
+      i0 = ConcatLowerLower(d, r4, r0);
+      i1 = ConcatLowerLower(d, r5, r1);
+      i2 = ConcatLowerLower(d, r6, r2);
+      i3 = ConcatLowerLower(d, r7, r3);
+      i4 = ConcatUpperUpper(d, r4, r0);
+      i5 = ConcatUpperUpper(d, r5, r1);
+      i6 = ConcatUpperUpper(d, r6, r2);
+      i7 = ConcatUpperUpper(d, r7, r3);
+      to.StorePart(d, i0, m + 0, n + 0);
+      to.StorePart(d, i1, m + 1, n + 0);
+      to.StorePart(d, i2, m + 2, n + 0);
+      to.StorePart(d, i3, m + 3, n + 0);
+      to.StorePart(d, i4, m + 4, n + 0);
+      to.StorePart(d, i5, m + 5, n + 0);
+      to.StorePart(d, i6, m + 6, n + 0);
+      to.StorePart(d, i7, m + 7, n + 0);
+    }
+  }
+}
+#elif HWY_TARGET != HWY_SCALAR
+constexpr bool TransposeUseSimd(size_t ROWS, size_t COLS) {
+  return ROWS % 4 == 0 && COLS % 4 == 0;
+}
+
+template <size_t ROWS_or_0, size_t COLS_or_0, class From, class To>
+JXL_INLINE_TRANSPOSE void GenericTransposeBlock(TransposeSimdTag<true>,
+                                                const From& from, const To& to,
+                                                size_t ROWSp, size_t COLSp) {
+  size_t ROWS = ROWS_or_0 == 0 ? ROWSp : ROWS_or_0;
+  size_t COLS = COLS_or_0 == 0 ? COLSp : COLS_or_0;
+  static_assert(MaxLanes(BlockDesc<4>()) == 4, "Invalid descriptor size");
+  static_assert(ROWS_or_0 % 4 == 0, "Invalid number of rows");
+  static_assert(COLS_or_0 % 4 == 0, "Invalid number of columns");
+  for (size_t n = 0; n < ROWS; n += 4) {
+    for (size_t m = 0; m < COLS; m += 4) {
+      const BlockDesc<4> d;
+      const auto p0 = from.LoadPart(d, n + 0, m + 0);
+      const auto p1 = from.LoadPart(d, n + 1, m + 0);
+      const auto p2 = from.LoadPart(d, n + 2, m + 0);
+      const auto p3 = from.LoadPart(d, n + 3, m + 0);
+
+      const auto q0 = InterleaveLower(d, p0, p2);
+      const auto q1 = InterleaveLower(d, p1, p3);
+      const auto q2 = InterleaveUpper(d, p0, p2);
+      const auto q3 = InterleaveUpper(d, p1, p3);
+
+      const auto r0 = InterleaveLower(d, q0, q1);
+      const auto r1 = InterleaveUpper(d, q0, q1);
+      const auto r2 = InterleaveLower(d, q2, q3);
+      const auto r3 = InterleaveUpper(d, q2, q3);
+
+      to.StorePart(d, r0, m + 0, n + 0);
+      to.StorePart(d, r1, m + 1, n + 0);
+      to.StorePart(d, r2, m + 2, n + 0);
+      to.StorePart(d, r3, m + 3, n + 0);
+    }
+  }
+}
+#else
+constexpr bool TransposeUseSimd(size_t ROWS, size_t COLS) { return false; }
+#endif
+
+template <size_t N, size_t M, typename = void>
+struct Transpose {
+  template <typename From, typename To>
+  static void Run(const From& from, const To& to) {
+    // This does not guarantee anything, just saves from the most stupid
+    // mistakes.
+    JXL_DASSERT(from.Address(0, 0) != to.Address(0, 0));
+    TransposeSimdTag<TransposeUseSimd(N, M)> tag;
+    GenericTransposeBlock<N, M>(tag, from, to, N, M);
+  }
+};
+
+// Avoid inlining and unrolling transposes for large blocks.
+template <size_t N, size_t M>
+struct Transpose<
+    N, M, typename std::enable_if<(N >= 8 && M >= 8 && N * M >= 512)>::type> {
+  template <typename From, typename To>
+  static void Run(const From& from, const To& to) {
+    // This does not guarantee anything, just saves from the most stupid
+    // mistakes.
+    JXL_DASSERT(from.Address(0, 0) != to.Address(0, 0));
+    TransposeSimdTag<TransposeUseSimd(N, M)> tag;
+    constexpr void (*transpose)(TransposeSimdTag<TransposeUseSimd(N, M)>,
+                                const From&, const To&, size_t, size_t) =
+        GenericTransposeBlock<0, 0, From, To>;
+    NoInlineWrapper(transpose, tag, from, to, N, M);
+  }
+};
+
+}  // namespace
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#endif  // LIB_JXL_TRANSPOSE_INL_H_
diff --git a/third_party/jpeg-xl/lib/jxl/version.h.in b/third_party/jpeg-xl/lib/jxl/version.h.in
new file mode 100644
index 0000000000..d077abec79
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/version.h.in
@@ -0,0 +1,39 @@
+/* Copyright (c) the JPEG XL Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style
+ * license that can be found in the LICENSE file.
+ */
+
+/** @addtogroup libjxl_common
+ * @{
+ * @file version.h
+ * @brief libjxl version information
+ */
+
+#ifndef JXL_VERSION_H_
+#define JXL_VERSION_H_
+
+#define JPEGXL_MAJOR_VERSION @JPEGXL_MAJOR_VERSION@ ///< JPEG XL Major version
+#define JPEGXL_MINOR_VERSION @JPEGXL_MINOR_VERSION@ ///< JPEG XL Minor version
+#define JPEGXL_PATCH_VERSION @JPEGXL_PATCH_VERSION@ ///< JPEG XL Patch version
+
+/** Can be used to conditionally compile code for a specific JXL version
+ * @param[maj] major version
+ * @param[min] minor version
+ *
+ * @code
+ * #if JPEGXL_NUMERIC_VERSION < JPEGXL_COMPUTE_NUMERIC_VERSION(0,8,0)
+ * // use old/deprecated api
+ * #else
+ * // use current api
+ * #endif
+ * @endcode
+ */
+#define JPEGXL_COMPUTE_NUMERIC_VERSION(major,minor,patch) ((major<<24) | (minor<<16) | (patch<<8) | 0)
+
+/* Numeric representation of the version */
+#define JPEGXL_NUMERIC_VERSION JPEGXL_COMPUTE_NUMERIC_VERSION(JPEGXL_MAJOR_VERSION,JPEGXL_MINOR_VERSION,JPEGXL_PATCH_VERSION)
+
+#endif /* JXL_VERSION_H_ */
+
+/** @}*/
diff --git a/third_party/jpeg-xl/lib/jxl/xorshift128plus-inl.h b/third_party/jpeg-xl/lib/jxl/xorshift128plus-inl.h
new file mode 100644
index 0000000000..a473d591f2
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/xorshift128plus-inl.h
@@ -0,0 +1,103 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Fast but weak random generator.
+
+#if defined(LIB_JXL_XORSHIFT128PLUS_INL_H_) == defined(HWY_TARGET_TOGGLE)
+#ifdef LIB_JXL_XORSHIFT128PLUS_INL_H_
+#undef LIB_JXL_XORSHIFT128PLUS_INL_H_
+#else
+#define LIB_JXL_XORSHIFT128PLUS_INL_H_
+#endif
+
+#include <stddef.h>
+
+#include <hwy/highway.h>
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+namespace {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Add;
+using hwy::HWY_NAMESPACE::ShiftLeft;
+using hwy::HWY_NAMESPACE::ShiftRight;
+using hwy::HWY_NAMESPACE::Xor;
+
+// Adapted from https://github.com/vpxyz/xorshift/blob/master/xorshift128plus/
+// (MIT-license)
+class Xorshift128Plus {
+ public:
+  // 8 independent generators (= single iteration for AVX-512)
+  enum { N = 8 };
+
+  explicit HWY_MAYBE_UNUSED Xorshift128Plus(const uint64_t seed) {
+    // Init state using SplitMix64 generator
+    s0_[0] = SplitMix64(seed + 0x9E3779B97F4A7C15ull);
+    s1_[0] = SplitMix64(s0_[0]);
+    for (size_t i = 1; i < N; ++i) {
+      s0_[i] = SplitMix64(s1_[i - 1]);
+      s1_[i] = SplitMix64(s0_[i]);
+    }
+  }
+
+  HWY_MAYBE_UNUSED Xorshift128Plus(const uint32_t seed1, const uint32_t seed2,
+                                   const uint32_t seed3, const uint32_t seed4) {
+    // Init state using SplitMix64 generator
+    s0_[0] = SplitMix64(((static_cast<uint64_t>(seed1) << 32) + seed2) +
+                        0x9E3779B97F4A7C15ull);
+    s1_[0] = SplitMix64(((static_cast<uint64_t>(seed3) << 32) + seed4) +
+                        0x9E3779B97F4A7C15ull);
+    for (size_t i = 1; i < N; ++i) {
+      s0_[i] = SplitMix64(s0_[i - 1]);
+      s1_[i] = SplitMix64(s1_[i - 1]);
+    }
+  }
+
+  HWY_INLINE HWY_MAYBE_UNUSED void Fill(uint64_t* HWY_RESTRICT random_bits) {
+#if HWY_CAP_INTEGER64
+    const HWY_FULL(uint64_t) d;
+    for (size_t i = 0; i < N; i += Lanes(d)) {
+      auto s1 = Load(d, s0_ + i);
+      const auto s0 = Load(d, s1_ + i);
+      const auto bits = Add(s1, s0);  // b, c
+      Store(s0, d, s0_ + i);
+      s1 = Xor(s1, ShiftLeft<23>(s1));
+      Store(bits, d, random_bits + i);
+      s1 = Xor(s1, Xor(s0, Xor(ShiftRight<18>(s1), ShiftRight<5>(s0))));
+      Store(s1, d, s1_ + i);
+    }
+#else
+    for (size_t i = 0; i < N; ++i) {
+      auto s1 = s0_[i];
+      const auto s0 = s1_[i];
+      const auto bits = s1 + s0;  // b, c
+      s0_[i] = s0;
+      s1 ^= s1 << 23;
+      random_bits[i] = bits;
+      s1 ^= s0 ^ (s1 >> 18) ^ (s0 >> 5);
+      s1_[i] = s1;
+    }
+#endif
+  }
+
+ private:
+  static uint64_t SplitMix64(uint64_t z) {
+    z = (z ^ (z >> 30)) * 0xBF58476D1CE4E5B9ull;
+    z = (z ^ (z >> 27)) * 0x94D049BB133111EBull;
+    return z ^ (z >> 31);
+  }
+
+  HWY_ALIGN uint64_t s0_[N];
+  HWY_ALIGN uint64_t s1_[N];
+};
+
+}  // namespace
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#endif  // LIB_JXL_XORSHIFT128PLUS_INL_H_
diff --git a/third_party/jpeg-xl/lib/jxl/xorshift128plus_test.cc b/third_party/jpeg-xl/lib/jxl/xorshift128plus_test.cc
new file mode 100644
index 0000000000..2b0c78b1d1
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/xorshift128plus_test.cc
@@ -0,0 +1,378 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <stdint.h>
+#include <stdio.h>
+
+#include <algorithm>
+#include <vector>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/xorshift128plus_test.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+#include <hwy/tests/test_util-inl.h>
+
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/test_utils.h"
+#include "lib/jxl/xorshift128plus-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Or;
+using hwy::HWY_NAMESPACE::ShiftRight;
+using hwy::HWY_NAMESPACE::Sub;
+
+// Define to nonzero in order to print the (new) golden outputs.
+#define PRINT_RESULTS 0
+
+const size_t kVectors = 64;
+
+#if PRINT_RESULTS
+
+template <int kNumLanes>
+void Print(const uint64_t (&result)[kNumLanes]) {
+  printf("{ ");
+  for (int i = 0; i < kNumLanes; ++i) {
+    if (i != 0) {
+      printf(", ");
+    }
+    printf("0x%016llXull", result[i]);
+  }
+  printf("},\n");
+}
+
+#else  // PRINT_RESULTS
+
+const uint64_t kExpected[kVectors][Xorshift128Plus::N] = {
+    {0x6E901576D477CBB1ull, 0xE9E53789195DA2A2ull, 0xB681F6DDA5E0AE99ull,
+     0x8EFD18CE21FD6896ull, 0xA898A80DF75CF532ull, 0x50CEB2C9E2DE7E32ull,
+     0x3CA7C2FEB25C0DD0ull, 0xA4D0866B80B4D836ull},
+    {0x8CD6A1E6233D3A26ull, 0x3D4603ADE98B112Dull, 0xDC427AF674019E36ull,
+     0xE28B4D230705AC53ull, 0x7297E9BBA88783DDull, 0x34D3D23CFCD9B41Aull,
+     0x5A223615ADBE96B8ull, 0xE5EB529027CFBD01ull},
+    {0xC1894CF00DFAC6A2ull, 0x18EDF8AE9085E404ull, 0x8E936625296B4CCDull,
+     0x31971EF3A14A899Bull, 0xBE87535FCE0BF26Aull, 0x576F7A752BC6649Full,
+     0xA44CBADCE0C6B937ull, 0x3DBA819BB17A353Aull},
+    {0x27CE38DFCC1C5EB6ull, 0x920BEB5606340256ull, 0x3986CBC40C9AFC2Cull,
+     0xE22BCB3EEB1E191Eull, 0x6E1FCDD3602A8FBAull, 0x052CB044E5415A29ull,
+     0x46266646EFB9ECD7ull, 0x8F44914618D29335ull},
+    {0xDD30AEDF72A362C5ull, 0xBC1D824E16BB98F4ull, 0x9EA6009C2AA3D2F1ull,
+     0xF65C0FBBE17AF081ull, 0x22424D06A8738991ull, 0x8A62763F2B7611D2ull,
+     0x2F3E89F722637939ull, 0x84D338BEF50AFD50ull},
+    {0x00F46494898E2B0Bull, 0x81239DC4FB8E8003ull, 0x414AD93EC5773FE7ull,
+     0x791473C450E4110Full, 0x87F127BF68C959ACull, 0x6429282D695EF67Bull,
+     0x661082E11546CBA8ull, 0x5815D53FA5436BFDull},
+    {0xB3DEADAB9BE6E0F9ull, 0xAA1B7B8F7CED0202ull, 0x4C5ED437699D279Eull,
+     0xA4471727F1CB39D3ull, 0xE439DA193F802F70ull, 0xF89401BB04FA6493ull,
+     0x3B08045A4FE898BAull, 0x32137BFE98227950ull},
+    {0xFBAE4A092897FEF3ull, 0x0639F6CE56E71C8Eull, 0xF0AD6465C07F0C1Eull,
+     0xFF8E28563361DCE5ull, 0xC2013DB7F86BC6B9ull, 0x8EFCC0503330102Full,
+     0x3F6B767EA5C4DA40ull, 0xB9864B950B2232E1ull},
+    {0x76EB58DE8E5EC22Aull, 0x9BBBF49A18B32F4Full, 0xC8405F02B2B2FAB9ull,
+     0xC3E122A5F146BC34ull, 0xC90BB046660F5765ull, 0xB933981310DBECCFull,
+     0x5A2A7BFC9126FD1Cull, 0x8BB388C94DF87901ull},
+    {0x753EB89AD63EF3C3ull, 0xF24AAF40C89D65ADull, 0x23F68931C1A6AA6Dull,
+     0xF47E79BF702C6DD0ull, 0xA3AD113244EE7EAEull, 0xD42CBEA28F793DC3ull,
+     0xD896FCF1820F497Cull, 0x042B86D2818948C1ull},
+    {0x8F2A4FC5A4265763ull, 0xEC499E6F95EAA10Cull, 0xE3786D4ECCD0DEB5ull,
+     0xC725C53D3AC4CC43ull, 0x065A4ACBBF83610Eull, 0x35C61C9FEF167129ull,
+     0x7B720AEAA7D70048ull, 0x14206B841377D039ull},
+    {0xAD27D78BF96055F6ull, 0x5F43B20FF47ADCD4ull, 0xE184C2401E2BF71Eull,
+     0x30B263D78990045Dull, 0xC22F00EBFF9BA201ull, 0xAE7F86522B53A562ull,
+     0x2853312BC039F0A4ull, 0x868D619E6549C3C8ull},
+    {0xFD5493D8AE9A8371ull, 0x773D5E224DF61B3Bull, 0x5377C54FBB1A8280ull,
+     0xCAD4DE3B8265CAFAull, 0xCDF3F19C91EBD5F6ull, 0xC8EA0F182D73BD78ull,
+     0x220502D593433FF1ull, 0xB81205E612DC31B1ull},
+    {0x8F32A39EAEDA4C70ull, 0x1D4B0914AA4DAC7Full, 0x56EF1570F3A8B405ull,
+     0x29812CB17404A592ull, 0x97A2AAF69CAE90F2ull, 0x12BF5E02778BBFE5ull,
+     0x9D4B55AD42A05FD2ull, 0x06C2BAB5E6086620ull},
+    {0x8DB4B9648302B253ull, 0xD756AD9E3AEA12C7ull, 0x68709B7F11D4B188ull,
+     0x7CC299DDCD707A4Bull, 0x97B860C370A7661Dull, 0xCECD314FC20E64F5ull,
+     0x55F412CDFB4C7EC3ull, 0x55EE97591193B525ull},
+    {0xCF70F3ACA96E6254ull, 0x022FEDECA2E09F46ull, 0x686823DB60AE1ECFull,
+     0xFD36190D3739830Eull, 0x74E1C09027F68120ull, 0xB5883A835C093842ull,
+     0x93E1EFB927E9E4E3ull, 0xB2721E249D7E5EBEull},
+    {0x69B6E21C44188CB8ull, 0x5D6CFB853655A7AAull, 0x3E001A0B425A66DCull,
+     0x8C57451103A5138Full, 0x7BF8B4BE18EAB402ull, 0x494102EB8761A365ull,
+     0xB33796A9F6A81F0Eull, 0x10005AB3BCCFD960ull},
+    {0xB2CF25740AE965DCull, 0x6F7C1DF7EF53D670ull, 0x648DD6087AC2251Eull,
+     0x040955D9851D487Dull, 0xBD550FC7E21A7F66ull, 0x57408F484DEB3AB5ull,
+     0x481E24C150B506C1ull, 0x72C0C3EAF91A40D6ull},
+    {0x1997A481858A5D39ull, 0x539718F4BEF50DC1ull, 0x2EC4DC4787E7E368ull,
+     0xFF1CE78879419845ull, 0xE219A93DD6F6DD30ull, 0x85328618D02FEC1Aull,
+     0xC86E02D969181B20ull, 0xEBEC8CD8BBA34E6Eull},
+    {0x28B55088A16CE947ull, 0xDD25AC11E6350195ull, 0xBD1F176694257B1Cull,
+     0x09459CCF9FCC9402ull, 0xF8047341E386C4E4ull, 0x7E8E9A9AD984C6C0ull,
+     0xA4661E95062AA092ull, 0x70A9947005ED1152ull},
+    {0x4C01CF75DBE98CCDull, 0x0BA076CDFC7373B9ull, 0x6C5E7A004B57FB59ull,
+     0x336B82297FD3BC56ull, 0x7990C0BE74E8D60Full, 0xF0275CC00EC5C8C8ull,
+     0x6CF29E682DFAD2E9ull, 0xFA4361524BD95D72ull},
+    {0x631D2A19FF62F018ull, 0x41C43863B985B3FAull, 0xE052B2267038EFD9ull,
+     0xE2A535FAC575F430ull, 0xE004EEA90B1FF5B8ull, 0x42DFE2CA692A1F26ull,
+     0x90FB0BFC9A189ECCull, 0x4484102BD3536BD0ull},
+    {0xD027134E9ACCA5A5ull, 0xBBAB4F966D476A9Bull, 0x713794A96E03D693ull,
+     0x9F6335E6B94CD44Aull, 0xC5090C80E7471617ull, 0x6D9C1B0C87B58E33ull,
+     0x1969CE82E31185A5ull, 0x2099B97E87754EBEull},
+    {0x60EBAF4ED934350Full, 0xC26FBF0BA5E6ECFFull, 0x9E54150F0312EC57ull,
+     0x0973B48364ED0041ull, 0x800A523241426CFCull, 0x03AB5EC055F75989ull,
+     0x8CF315935DEEB40Aull, 0x83D3FC0190BD1409ull},
+    {0x26D35394CF720A51ull, 0xCE9EAA15243CBAFEull, 0xE2B45FBAF21B29E0ull,
+     0xDB92E98EDE73F9E0ull, 0x79B16F5101C26387ull, 0x1AC15959DE88C86Full,
+     0x387633AEC6D6A580ull, 0xA6FC05807BFC5EB8ull},
+    {0x2D26C8E47C6BADA9ull, 0x820E6EC832D52D73ull, 0xB8432C3E0ED0EE5Bull,
+     0x0F84B3C4063AAA87ull, 0xF393E4366854F651ull, 0x749E1B4D2366A567ull,
+     0x805EACA43480D004ull, 0x244EBF3AA54400A5ull},
+    {0xBFDC3763AA79F75Aull, 0x9E3A74CC751F41DBull, 0xF401302A149DBC55ull,
+     0x6B25F7973D7BF7BCull, 0x13371D34FDBC3DAEull, 0xC5E1998C8F484DCDull,
+     0x7031B8AE5C364464ull, 0x3847F0C4F3DA2C25ull},
+    {0x24C6387D2C0F1225ull, 0x77CCE960255C67A4ull, 0x21A0947E497B10EBull,
+     0xBB5DB73A825A9D7Eull, 0x26294A41999E553Dull, 0x3953E0089F87D925ull,
+     0x3DAE6E5D4E5EAAFEull, 0x74B545460341A7AAull},
+    {0x710E5EB08A7DB820ull, 0x7E43C4E77CAEA025ull, 0xD4C91529C8B060C1ull,
+     0x09AE26D8A7B0CA29ull, 0xAB9F356BB360A772ull, 0xB68834A25F19F6E9ull,
+     0x79B8D9894C5734E2ull, 0xC6847E7C8FFD265Full},
+    {0x10C4BCB06A5111E6ull, 0x57CB50955B6A2516ull, 0xEF53C87798B6995Full,
+     0xAB38E15BBD8D0197ull, 0xA51C6106EFF73C93ull, 0x83D7F0E2270A7134ull,
+     0x0923FD330397FCE5ull, 0xF9DE54EDFE58FB45ull},
+    {0x07D44833ACCD1A94ull, 0xAAD3C9E945E2F9F3ull, 0xABF4C879B876AA37ull,
+     0xF29C69A21B301619ull, 0x2DDCE959111C788Bull, 0x7CEDB48F8AC1729Bull,
+     0x93F3BA9A02B659BEull, 0xF20A87FF17933CBEull},
+    {0x8E96EBE93180CFE6ull, 0x94CAA12873937079ull, 0x05F613D9380D4189ull,
+     0xBCAB40C1DC79F38Aull, 0x0AD8907B7C61D19Eull, 0x88534E189D103910ull,
+     0x2DB2FAABA160AB8Full, 0xA070E7506B06F15Cull},
+    {0x6FB1FCDAFFEF87A9ull, 0xE735CF25337A090Dull, 0x172C6EDCEFEF1825ull,
+     0x76957EA49EF0542Dull, 0x819BF4CD250F7C49ull, 0xD6FF23E4AD00C4D4ull,
+     0xE79673C1EC358FF0ull, 0xAC9C048144337938ull},
+    {0x4C5387FF258B3AF4ull, 0xEDB68FAEC2CB1AA3ull, 0x02A624E67B4E1DA4ull,
+     0x5C44797A38E08AF2ull, 0x36546A70E9411B4Bull, 0x47C17B24D2FD9675ull,
+     0x101957AAA020CA26ull, 0x47A1619D4779F122ull},
+    {0xF84B8BCDC92D9A3Cull, 0x951D7D2C74B3066Bull, 0x7AC287C06EDDD9B2ull,
+     0x4C38FC476608D38Full, 0x224D793B19CB4BCDull, 0x835A255899BF1A41ull,
+     0x4AD250E9F62DB4ABull, 0xD9B44F4B58781096ull},
+    {0xABBAF99A8EB5C6B8ull, 0xFB568E900D3A9F56ull, 0x11EDF63D23C5DF11ull,
+     0xA9C3011D3FA7C5A8ull, 0xAEDD3CF11AFFF725ull, 0xABCA472B5F1EDD6Bull,
+     0x0600B6BB5D879804ull, 0xDB4DE007F22191A0ull},
+    {0xD76CC9EFF0CE9392ull, 0xF5E0A772B59BA49Aull, 0x7D1AE1ED0C1261B5ull,
+     0x79224A33B5EA4F4Aull, 0x6DD825D80C40EA60ull, 0x47FC8E747E51C953ull,
+     0x695C05F72888BF98ull, 0x1A012428440B9015ull},
+    {0xD754DD61F9B772BFull, 0xC4A2FCF4C0F9D4EBull, 0x461167CDF67A24A2ull,
+     0x434748490EBCB9D4ull, 0x274DD9CDCA5781DEull, 0x36BAC63BA9A85209ull,
+     0x30324DAFDA36B70Full, 0x337570DB4FE6DAB3ull},
+    {0xF46CBDD57C551546ull, 0x8E02507E676DA3E3ull, 0xD826245A8C15406Dull,
+     0xDFB38A5B71113B72ull, 0x5EA38454C95B16B5ull, 0x28C054FB87ABF3E1ull,
+     0xAA2724C0BA1A8096ull, 0xECA83EC980304F2Full},
+    {0x6AA76EC294EB3303ull, 0x42D4CDB2A8032E3Bull, 0x7999EDF75DCD8735ull,
+     0xB422BFFE696CCDCCull, 0x8F721461FD7CCDFEull, 0x148E1A5814FDE253ull,
+     0x4DC941F4375EF8FFull, 0x27B2A9E0EB5B49CFull},
+    {0xCEA592EF9343EBE1ull, 0xF7D38B5FA7698903ull, 0x6CCBF352203FEAB6ull,
+     0x830F3095FCCDA9C5ull, 0xDBEEF4B81B81C8F4ull, 0x6D7EB9BCEECA5CF9ull,
+     0xC58ABB0FBE436C69ull, 0xE4B97E6DB2041A4Bull},
+    {0x7E40FC772978AF14ull, 0xCDDA4BBAE28354A1ull, 0xE4F993B832C32613ull,
+     0xD3608093C68A4B35ull, 0x9A3B60E01BEE3699ull, 0x03BEF248F3288713ull,
+     0x70B9294318F3E9B4ull, 0x8D2ABB913B8610DEull},
+    {0x37F209128E7D8B2Cull, 0x81D2AB375BD874BCull, 0xA716A1B7373F7408ull,
+     0x0CEE97BEC4706540ull, 0xA40C5FD9CDBC1512ull, 0x73CAF6C8918409E7ull,
+     0x45E11BCEDF0BBAA1ull, 0x612C612BFF6E6605ull},
+    {0xF8ECB14A12D0F649ull, 0xDA683CD7C01BA1ACull, 0xA2203F7510E124C1ull,
+     0x7F83E52E162F3C78ull, 0x77D2BB73456ACADBull, 0x37FC34FC840BBA6Full,
+     0x3076BC7D4C6EBC1Full, 0x4F514123632B5FA9ull},
+    {0x44D789DED935E884ull, 0xF8291591E09FEC9Full, 0xD9CED2CF32A2E4B7ull,
+     0x95F70E1EB604904Aull, 0xDE438FE43C14F6ABull, 0x4C8D23E4FAFCF8D8ull,
+     0xC716910A3067EB86ull, 0x3D6B7915315095D3ull},
+    {0x3170FDBADAB92095ull, 0x8F1963933FC5650Bull, 0x72F94F00ABECFEABull,
+     0x6E3AE826C6AAB4CEull, 0xA677A2BF31068258ull, 0x9660CDC4F363AF10ull,
+     0xD81A15A152379EF1ull, 0x5D7D285E1080A3F9ull},
+    {0xDAD5DDFF9A2249B3ull, 0x6F9721D926103FAEull, 0x1418CBB83FFA349Aull,
+     0xE71A30AD48C012B2ull, 0xBE76376C63751132ull, 0x3496467ACA713AE6ull,
+     0x8D7EC01369F991A3ull, 0xD8C73A88B96B154Eull},
+    {0x8B5D9C74AEB4833Aull, 0xF914FB3F867B912Full, 0xB894EA034936B1DCull,
+     0x8A16D21BE51C4F5Bull, 0x31FF048ED582D98Eull, 0xB95AB2F4DC65B820ull,
+     0x04082B9170561AF7ull, 0xA215610A5DC836FAull},
+    {0xB2ADE592C092FAACull, 0x7A1E683BCBF13294ull, 0xC7A4DBF86858C096ull,
+     0x3A49940F97BFF316ull, 0xCAE5C06B82C46703ull, 0xC7F413A0F951E2BDull,
+     0x6665E7BB10EB5916ull, 0x86F84A5A94EDE319ull},
+    {0x4EA199D8FAA79CA3ull, 0xDFA26E5BF1981704ull, 0x0F5E081D37FA4E01ull,
+     0x9CB632F89CD675CDull, 0x4A09DB89D48C0304ull, 0x88142742EA3C7672ull,
+     0xAC4F149E6D2E9BDBull, 0x6D9E1C23F8B1C6C6ull},
+    {0xD58BE47B92DEC0E9ull, 0x8E57573645E34328ull, 0x4CC094CCB5FB5126ull,
+     0x5F1D66AF6FB40E3Cull, 0x2BA15509132D3B00ull, 0x0D6545646120E567ull,
+     0x3CF680C45C223666ull, 0x96B28E32930179DAull},
+    {0x5900C45853AC7990ull, 0x61881E3E8B7FF169ull, 0x4DE5F835DF2230FFull,
+     0x4427A9E7932F73FFull, 0x9B641BAD379A8C8Dull, 0xDF271E5BF98F4E5Cull,
+     0xDFDA16DB830FF5EEull, 0x371C7E7CFB89C0E9ull},
+    {0x4410A8576247A250ull, 0x6AD2DA12B45AC0D9ull, 0x18DFC72AAC85EECCull,
+     0x06FC8BB2A0EF25C8ull, 0xEB287619C85E6118ull, 0x19553ECA67F25A2Cull,
+     0x3B9557F1DCEC5BAAull, 0x7BAD9E8B710D1079ull},
+    {0x34F365D66BD22B28ull, 0xE6E124B9F10F835Dull, 0x0573C38ABF2B24DCull,
+     0xD32E6AF10A0125AEull, 0x383590ACEA979519ull, 0x8376ED7A39E28205ull,
+     0xF0B7F184DCBDA435ull, 0x062A203390E31794ull},
+    {0xA2AFFD7E41918760ull, 0x7F90FC1BD0819C86ull, 0x5033C08E5A969533ull,
+     0x2707AF5C6D039590ull, 0x57BBD5980F17DF9Cull, 0xD3FE6E61D763268Aull,
+     0x9E0A0AE40F335A3Bull, 0x43CF4EB0A99613C5ull},
+    {0xD4D2A397CE1A7C2Eull, 0x3DF7CE7CC3212DADull, 0x0880F0D5D356C75Aull,
+     0xA8AFC44DD03B1346ull, 0x79263B46C13A29E0ull, 0x11071B3C0ED58E7Aull,
+     0xED46DC9F538406BFull, 0x2C94974F2B94843Dull},
+    {0xE246E13C39AB5D5Eull, 0xAC1018489D955B20ull, 0x8601B558771852B8ull,
+     0x110BD4C06DB40173ull, 0x738FC8A18CCA0EBBull, 0x6673E09BE0EA76E5ull,
+     0x024BC7A0C7527877ull, 0x45E6B4652E2EC34Eull},
+    {0xD1ED26A1A375CDC8ull, 0xAABC4E896A617CB8ull, 0x0A9C9E8E57D753C6ull,
+     0xA3774A75FEB4C30Eull, 0x30B816C01C93E49Eull, 0xF405BABC06D2408Cull,
+     0xCC0CE6B4CE788ABCull, 0x75E7922D0447956Cull},
+    {0xD07C1676A698BC95ull, 0x5F9AEA4840E2D860ull, 0xD5FC10D58BDF6F02ull,
+     0xF190A2AD4BC2EEA7ull, 0x0C24D11F51726931ull, 0xDB646899A16B6512ull,
+     0x7BC10670047B1DD8ull, 0x2413A5ABCD45F092ull},
+    {0x4E66892190CFD923ull, 0xF10162440365EC8Eull, 0x158ACA5A6A2280AEull,
+     0x0D60ED11C0224166ull, 0x7CD2E9A71B9D7488ull, 0x450D7289706AB2A3ull,
+     0x88FAE34EC9A0D7DCull, 0x96FF9103575A97DAull},
+    {0x77990FAC6046C446ull, 0xB174B5FB30C76676ull, 0xE352CE3EB56CF82Aull,
+     0xC6039B6873A9A082ull, 0xE3F80F3AE333148Aull, 0xB853BA24BA3539B9ull,
+     0xE8863E52ECCB0C74ull, 0x309B4CC1092CC245ull},
+    {0xBC2B70BEE8388D9Full, 0xE48D92AE22216DCEull, 0xF15F3BF3E2C15D8Full,
+     0x1DD964D4812D8B24ull, 0xD56AF02FB4665E4Cull, 0x98002200595BD9A3ull,
+     0x049246D50BB8FA12ull, 0x1B542DF485B579B9ull},
+    {0x2347409ADFA8E497ull, 0x36015C2211D62498ull, 0xE9F141F32EB82690ull,
+     0x1F839912D0449FB9ull, 0x4E4DCFFF2D02D97Cull, 0xF8A03AB4C0F625C9ull,
+     0x0605F575795DAC5Cull, 0x4746C9BEA0DDA6B1ull},
+    {0xCA5BB519ECE7481Bull, 0xFD496155E55CA945ull, 0xF753B9DBB1515F81ull,
+     0x50549E8BAC0F70E7ull, 0x8614FB0271E21C60ull, 0x60C72947EB0F0070ull,
+     0xA6511C10AEE742B6ull, 0x48FB48F2CACCB43Eull}};
+
+#endif  // PRINT_RESULTS
+
+// Ensures Xorshift128+ returns consistent and unchanging values.
+void TestGolden() {
+  HWY_ALIGN Xorshift128Plus rng(12345);
+  for (uint64_t vector = 0; vector < kVectors; ++vector) {
+    HWY_ALIGN uint64_t lanes[Xorshift128Plus::N];
+    rng.Fill(lanes);
+#if PRINT_RESULTS
+    Print(lanes);
+#else
+    for (size_t i = 0; i < Xorshift128Plus::N; ++i) {
+      ASSERT_EQ(kExpected[vector][i], lanes[i])
+          << "Where vector=" << vector << " i=" << i;
+    }
+#endif
+  }
+}
+
+// Output changes when given different seeds
+void TestSeedChanges() {
+  HWY_ALIGN uint64_t lanes[Xorshift128Plus::N];
+
+  std::vector<uint64_t> first;
+  constexpr size_t kNumSeeds = 16384;
+  first.reserve(kNumSeeds);
+
+  // All 14-bit seeds
+  for (size_t seed = 0; seed < kNumSeeds; ++seed) {
+    HWY_ALIGN Xorshift128Plus rng(seed);
+
+    rng.Fill(lanes);
+    first.push_back(lanes[0]);
+  }
+
+  // All outputs are unique
+  ASSERT_EQ(kNumSeeds, first.size());
+  std::sort(first.begin(), first.end());
+  first.erase(std::unique(first.begin(), first.end()), first.end());
+  EXPECT_EQ(kNumSeeds, first.size());
+}
+
+void TestFloat() {
+  test::ThreadPoolForTests pool(8);
+
+#ifdef JXL_DISABLE_SLOW_TESTS
+  const uint32_t kMaxSeed = 256;
+#else   // JXL_DISABLE_SLOW_TESTS
+  const uint32_t kMaxSeed = 4096;
+#endif  // JXL_DISABLE_SLOW_TESTS
+  EXPECT_TRUE(RunOnPool(
+      &pool, 0, kMaxSeed, ThreadPool::NoInit,
+      [](const uint32_t seed, size_t /*thread*/) {
+        HWY_ALIGN Xorshift128Plus rng(seed);
+
+        const HWY_FULL(uint32_t) du;
+        const HWY_FULL(float) df;
+        HWY_ALIGN uint64_t batch[Xorshift128Plus::N];
+        HWY_ALIGN float lanes[MaxLanes(df)];
+        double sum = 0.0;
+        size_t count = 0;
+        const size_t kReps = 2000;
+        for (size_t reps = 0; reps < kReps; ++reps) {
+          rng.Fill(batch);
+          for (size_t i = 0; i < Xorshift128Plus::N * 2; i += Lanes(df)) {
+            const auto bits =
+                Load(du, reinterpret_cast<const uint32_t*>(batch) + i);
+            // 1.0 + 23 random mantissa bits = [1, 2)
+            const auto rand12 =
+                BitCast(df, Or(ShiftRight<9>(bits), Set(du, 0x3F800000)));
+            const auto rand01 = Sub(rand12, Set(df, 1.0f));
+            Store(rand01, df, lanes);
+            for (float lane : lanes) {
+              sum += lane;
+              count += 1;
+              EXPECT_LE(lane, 1.0f);
+              EXPECT_GE(lane, 0.0f);
+            }
+          }
+        }
+
+        // Verify average (uniform distribution)
+        EXPECT_NEAR(0.5, sum / count, 0.00702);
+      },
+      "TestXorShift"));
+}
+
+// Not more than one 64-bit zero
+void TestNotZero() {
+  test::ThreadPoolForTests pool(8);
+
+#ifdef JXL_DISABLE_SLOW_TESTS
+  const uint32_t kMaxSeed = 500;
+#else   // JXL_DISABLE_SLOW_TESTS
+  const uint32_t kMaxSeed = 2000;
+#endif  // JXL_DISABLE_SLOW_TESTS
+  EXPECT_TRUE(RunOnPool(
+      &pool, 0, kMaxSeed, ThreadPool::NoInit,
+      [](const uint32_t task, size_t /*thread*/) {
+        HWY_ALIGN uint64_t lanes[Xorshift128Plus::N];
+
+        HWY_ALIGN Xorshift128Plus rng(task);
+        size_t num_zero = 0;
+        for (size_t vectors = 0; vectors < 10000; ++vectors) {
+          rng.Fill(lanes);
+          for (uint64_t lane : lanes) {
+            num_zero += static_cast<size_t>(lane == 0);
+          }
+        }
+        EXPECT_LE(num_zero, 1u);
+      },
+      "TestNotZero"));
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+class Xorshift128Test : public hwy::TestWithParamTarget {};
+
+HWY_TARGET_INSTANTIATE_TEST_SUITE_P(Xorshift128Test);
+
+HWY_EXPORT_AND_TEST_P(Xorshift128Test, TestNotZero);
+HWY_EXPORT_AND_TEST_P(Xorshift128Test, TestGolden);
+HWY_EXPORT_AND_TEST_P(Xorshift128Test, TestSeedChanges);
+HWY_EXPORT_AND_TEST_P(Xorshift128Test, TestFloat);
+
+}  // namespace jxl
+#endif
diff --git a/third_party/jpeg-xl/lib/jxl_benchmark.cmake b/third_party/jpeg-xl/lib/jxl_benchmark.cmake
new file mode 100644
index 0000000000..10871e3073
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl_benchmark.cmake
@@ -0,0 +1,36 @@
+# Copyright (c) the JPEG XL Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+include(jxl_lists.cmake)
+
+# benchmark.h doesn't work in our MINGW set up since it ends up including the
+# wrong stdlib header. We don't run gbench on MINGW targets anyway.
+if(NOT MINGW)
+
+# This is the Google benchmark project (https://github.com/google/benchmark).
+find_package(benchmark QUIET)
+
+if(benchmark_FOUND)
+  if(JPEGXL_STATIC AND NOT MINGW)
+    # benchmark::benchmark hardcodes the librt.so which obviously doesn't
+    # compile in static mode.
+    set_target_properties(benchmark::benchmark PROPERTIES
+      INTERFACE_LINK_LIBRARIES "Threads::Threads;-lrt")
+  endif()
+
+  # Compiles all the benchmark files into a single binary. Individual benchmarks
+  # can be run with --benchmark_filter.
+  add_executable(jxl_gbench "${JPEGXL_INTERNAL_GBENCH_SOURCES}" gbench_main.cc)
+
+  target_compile_definitions(jxl_gbench PRIVATE
+    -DTEST_DATA_PATH="${JPEGXL_TEST_DATA_PATH}")
+  target_link_libraries(jxl_gbench
+    jxl_extras-static
+    jxl-static
+    benchmark::benchmark
+  )
+endif() # benchmark_FOUND
+
+endif() # MINGW
diff --git a/third_party/jpeg-xl/lib/jxl_extras.cmake b/third_party/jpeg-xl/lib/jxl_extras.cmake
new file mode 100644
index 0000000000..c1071278e4
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl_extras.cmake
@@ -0,0 +1,169 @@
+# Copyright (c) the JPEG XL Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+include(compatibility.cmake)
+include(jxl_lists.cmake)
+
+list(APPEND JPEGXL_EXTRAS_CORE_SOURCES
+  "${JPEGXL_INTERNAL_EXTRAS_SOURCES}"
+  "${JPEGXL_INTERNAL_CODEC_JXL_SOURCES}"
+  "${JPEGXL_INTERNAL_CODEC_PGX_SOURCES}"
+  "${JPEGXL_INTERNAL_CODEC_PNM_SOURCES}"
+  "${JPEGXL_INTERNAL_CODEC_NPY_SOURCES}"
+)
+
+add_library(jxl_extras_codec-obj OBJECT "${JPEGXL_EXTRAS_CORE_SOURCES}")
+target_compile_options(jxl_extras_codec-obj PRIVATE "${JPEGXL_INTERNAL_FLAGS}")
+target_compile_definitions(jxl_extras_codec-obj PRIVATE -DJXL_EXPORT=)
+set_property(TARGET jxl_extras_codec-obj PROPERTY POSITION_INDEPENDENT_CODE ON)
+target_include_directories(jxl_extras_codec-obj PUBLIC
+  ${PROJECT_SOURCE_DIR}
+  ${CMAKE_CURRENT_SOURCE_DIR}/include
+  ${CMAKE_CURRENT_BINARY_DIR}/include
+  ${JXL_HWY_INCLUDE_DIRS}
+)
+set(JXL_EXTRAS_CODEC_INTERNAL_LIBRARIES)
+set(JXL_EXTRAS_CODEC_PUBLIC_COMPILE_DEFINITIONS)
+
+# We only define a static library for jxl_extras since it uses internal parts
+# of jxl library which are not accessible from outside the library in the
+# shared library case.
+add_library(jxl_extras-static STATIC EXCLUDE_FROM_ALL
+  "${JPEGXL_EXTRAS_CORE_SOURCES}"
+  "${JPEGXL_INTERNAL_EXTRAS_FOR_TOOLS_SOURCES}"
+)
+target_compile_options(jxl_extras-static PRIVATE "${JPEGXL_INTERNAL_FLAGS}")
+set_property(TARGET jxl_extras-static PROPERTY POSITION_INDEPENDENT_CODE ON)
+target_include_directories(jxl_extras-static PUBLIC "${PROJECT_SOURCE_DIR}")
+target_link_libraries(jxl_extras-static PUBLIC
+  jxl-static
+  jxl_threads-static
+)
+
+# Define an extras library that does not have the image codecs, only the core
+# extras code. This is needed for some of the fuzzers.
+add_library(jxl_extras_nocodec-static STATIC EXCLUDE_FROM_ALL
+  "${JPEGXL_EXTRAS_CORE_SOURCES}"
+  "${JPEGXL_INTERNAL_EXTRAS_FOR_TOOLS_SOURCES}"
+)
+target_compile_options(jxl_extras_nocodec-static PRIVATE "${JPEGXL_INTERNAL_FLAGS}")
+set_property(TARGET jxl_extras_nocodec-static PROPERTY POSITION_INDEPENDENT_CODE ON)
+target_include_directories(jxl_extras_nocodec-static PUBLIC "${PROJECT_SOURCE_DIR}")
+target_link_libraries(jxl_extras_nocodec-static PUBLIC
+  jxl-static
+  jxl_threads-static
+)
+
+find_package(GIF 5.1)
+if(GIF_FOUND)
+  target_sources(jxl_extras_codec-obj PRIVATE
+    extras/dec/gif.cc
+    extras/dec/gif.h
+  )
+  target_include_directories(jxl_extras_codec-obj PRIVATE "${GIF_INCLUDE_DIRS}")
+  list(APPEND JXL_EXTRAS_CODEC_INTERNAL_LIBRARIES ${GIF_LIBRARIES})
+  list(APPEND JXL_EXTRAS_CODEC_PUBLIC_DEFINITIONS -DJPEGXL_ENABLE_GIF=1)
+  if(JPEGXL_DEP_LICENSE_DIR)
+    configure_file("${JPEGXL_DEP_LICENSE_DIR}/libgif-dev/copyright"
+                   ${PROJECT_BINARY_DIR}/LICENSE.libgif COPYONLY)
+  endif()  # JPEGXL_DEP_LICENSE_DIR
+endif()
+
+find_package(JPEG)
+if(JPEG_FOUND)
+  target_sources(jxl_extras_codec-obj PRIVATE
+    "${JPEGXL_INTERNAL_CODEC_JPG_SOURCES}"
+  )
+  target_include_directories(jxl_extras_codec-obj PRIVATE "${JPEG_INCLUDE_DIRS}")
+  list(APPEND JXL_EXTRAS_CODEC_INTERNAL_LIBRARIES ${JPEG_LIBRARIES})
+  list(APPEND JXL_EXTRAS_CODEC_PUBLIC_DEFINITIONS -DJPEGXL_ENABLE_JPEG=1)
+  target_sources(jxl_extras-static PRIVATE
+    "${JPEGXL_INTERNAL_CODEC_JPG_SOURCES}"
+  )
+  target_include_directories(jxl_extras-static PRIVATE "${JPEG_INCLUDE_DIRS}")
+  target_link_libraries(jxl_extras-static PRIVATE ${JPEG_LIBRARIES})
+  target_compile_definitions(jxl_extras-static PUBLIC -DJPEGXL_ENABLE_JPEG=1)
+  if(JPEGXL_ENABLE_JPEGLI)
+    target_sources(jxl_extras-static PRIVATE
+      "${JPEGXL_INTERNAL_CODEC_JPEGLI_SOURCES}"
+    )
+    target_link_libraries(jxl_extras-static PRIVATE jpegli-static)
+    target_compile_definitions(jxl_extras-static PUBLIC -DJPEGXL_ENABLE_JPEGLI=1)
+  endif()
+  if(JPEGXL_DEP_LICENSE_DIR)
+    configure_file("${JPEGXL_DEP_LICENSE_DIR}/libjpeg-dev/copyright"
+                   ${PROJECT_BINARY_DIR}/LICENSE.libjpeg COPYONLY)
+  endif()  # JPEGXL_DEP_LICENSE_DIR
+endif()
+
+if(NOT JPEGXL_BUNDLE_LIBPNG)
+  find_package(PNG)
+endif()
+if(PNG_FOUND)
+  target_sources(jxl_extras_codec-obj PRIVATE
+    "${JPEGXL_INTERNAL_CODEC_APNG_SOURCES}"
+  )
+  target_include_directories(jxl_extras_codec-obj PRIVATE "${PNG_INCLUDE_DIRS}")
+  list(APPEND JXL_EXTRAS_CODEC_INTERNAL_LIBRARIES ${PNG_LIBRARIES})
+  list(APPEND JXL_EXTRAS_CODEC_PUBLIC_DEFINITIONS -DJPEGXL_ENABLE_APNG=1)
+  target_sources(jxl_extras-static PRIVATE
+    "${JPEGXL_INTERNAL_CODEC_APNG_SOURCES}"
+  )
+  target_include_directories(jxl_extras-static PUBLIC "${PNG_INCLUDE_DIRS}")
+  target_link_libraries(jxl_extras-static PUBLIC ${PNG_LIBRARIES})
+  target_compile_definitions(jxl_extras-static PUBLIC -DJPEGXL_ENABLE_APNG=1)
+  configure_file(extras/LICENSE.apngdis
+                 ${PROJECT_BINARY_DIR}/LICENSE.apngdis COPYONLY)
+endif()
+
+if (JPEGXL_ENABLE_SJPEG)
+  target_compile_definitions(jxl_extras-static PUBLIC -DJPEGXL_ENABLE_SJPEG=1)
+  target_link_libraries(jxl_extras-static PRIVATE sjpeg)
+endif ()
+
+if (JPEGXL_ENABLE_OPENEXR)
+pkg_check_modules(OpenEXR IMPORTED_TARGET OpenEXR)
+if (OpenEXR_FOUND)
+  target_sources(jxl_extras_codec-obj PRIVATE
+    "${JPEGXL_INTERNAL_CODEC_EXR_SOURCES}"
+  )
+  list(APPEND JXL_EXTRAS_CODEC_PUBLIC_DEFINITIONS -DJPEGXL_ENABLE_EXR=1)
+  target_include_directories(jxl_extras_codec-obj PRIVATE "${OpenEXR_INCLUDE_DIRS}")
+  list(APPEND JXL_EXTRAS_CODEC_INTERNAL_LIBRARIES PkgConfig::OpenEXR)
+  target_sources(jxl_extras-static PRIVATE
+    "${JPEGXL_INTERNAL_CODEC_EXR_SOURCES}"
+  )
+  target_compile_definitions(jxl_extras-static PUBLIC -DJPEGXL_ENABLE_EXR=1)
+  target_link_libraries(jxl_extras-static PRIVATE PkgConfig::OpenEXR)
+  if(JPEGXL_DEP_LICENSE_DIR)
+    configure_file("${JPEGXL_DEP_LICENSE_DIR}/libopenexr-dev/copyright"
+                   ${PROJECT_BINARY_DIR}/LICENSE.libopenexr COPYONLY)
+  endif()  # JPEGXL_DEP_LICENSE_DIR
+  # OpenEXR generates exceptions, so we need exception support to catch them.
+  # Actually those flags counteract the ones set in JPEGXL_INTERNAL_FLAGS.
+  if (NOT WIN32)
+    set_source_files_properties(extras/dec/exr.cc extras/enc/exr.cc PROPERTIES COMPILE_FLAGS -fexceptions)
+    if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+      set_source_files_properties(extras/dec/exr.cc extras/enc/exr.cc PROPERTIES COMPILE_FLAGS -fcxx-exceptions)
+    endif()
+  endif()
+endif() # OpenEXR_FOUND
+endif() # JPEGXL_ENABLE_OPENEXR
+
+target_compile_definitions(jxl_extras_codec-obj PRIVATE ${JXL_EXTRAS_CODEC_PUBLIC_DEFINITIONS})
+
+### Static library.
+add_library(jxl_extras_codec-static STATIC $<TARGET_OBJECTS:jxl_extras_codec-obj>)
+target_compile_definitions(jxl_extras_codec-static PUBLIC ${JXL_EXTRAS_CODEC_PUBLIC_DEFINITIONS})
+target_link_libraries(jxl_extras_codec-static PRIVATE ${JXL_EXTRAS_CODEC_INTERNAL_LIBRARIES} jxl)
+
+### Shared library.
+if (BUILD_SHARED_LIBS)
+add_library(jxl_extras_codec SHARED $<TARGET_OBJECTS:jxl_extras_codec-obj>)
+target_compile_definitions(jxl_extras_codec PUBLIC ${JXL_EXTRAS_CODEC_PUBLIC_DEFINITIONS})
+target_link_libraries(jxl_extras_codec PRIVATE ${JXL_EXTRAS_CODEC_INTERNAL_LIBRARIES} jxl)
+else()
+add_library(jxl_extras_codec ALIAS jxl_extras_codec-static)
+endif()  # BUILD_SHARED_LIBS
diff --git a/third_party/jpeg-xl/lib/jxl_lists.bzl b/third_party/jpeg-xl/lib/jxl_lists.bzl
new file mode 100644
index 0000000000..dbaf90a659
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl_lists.bzl
@@ -0,0 +1,637 @@
+# Copyright (c) the JPEG XL Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+# This file is generated, do not modify by manually.
+# Run `tools/scripts/build_cleaner.py --update` to regenerate it.
+
+libjxl_base_sources = [
+    "jxl/base/arch_macros.h",
+    "jxl/base/bits.h",
+    "jxl/base/byte_order.h",
+    "jxl/base/cache_aligned.cc",
+    "jxl/base/cache_aligned.h",
+    "jxl/base/compiler_specific.h",
+    "jxl/base/data_parallel.cc",
+    "jxl/base/data_parallel.h",
+    "jxl/base/file_io.h",
+    "jxl/base/float.h",
+    "jxl/base/iaca.h",
+    "jxl/base/os_macros.h",
+    "jxl/base/override.h",
+    "jxl/base/padded_bytes.cc",
+    "jxl/base/padded_bytes.h",
+    "jxl/base/printf_macros.h",
+    "jxl/base/profiler.cc",
+    "jxl/base/profiler.h",
+    "jxl/base/random.cc",
+    "jxl/base/random.h",
+    "jxl/base/sanitizer_definitions.h",
+    "jxl/base/scope_guard.h",
+    "jxl/base/span.h",
+    "jxl/base/status.h",
+    "jxl/base/tsc_timer.h",
+]
+
+libjxl_codec_apng_sources = [
+    "extras/dec/apng.cc",
+    "extras/dec/apng.h",
+    "extras/enc/apng.cc",
+    "extras/enc/apng.h",
+]
+
+libjxl_codec_exr_sources = [
+    "extras/dec/exr.cc",
+    "extras/dec/exr.h",
+    "extras/enc/exr.cc",
+    "extras/enc/exr.h",
+]
+
+libjxl_codec_gif_sources = [
+    "extras/dec/gif.cc",
+    "extras/dec/gif.h",
+]
+
+libjxl_codec_jpegli_sources = [
+    "extras/dec/jpegli.cc",
+    "extras/dec/jpegli.h",
+    "extras/enc/jpegli.cc",
+    "extras/enc/jpegli.h",
+]
+
+libjxl_codec_jpg_sources = [
+    "extras/dec/jpg.cc",
+    "extras/dec/jpg.h",
+    "extras/enc/jpg.cc",
+    "extras/enc/jpg.h",
+]
+
+libjxl_codec_jxl_sources = [
+    "extras/dec/jxl.cc",
+    "extras/dec/jxl.h",
+    "extras/enc/jxl.cc",
+    "extras/enc/jxl.h",
+]
+
+libjxl_codec_npy_sources = [
+    "extras/enc/npy.cc",
+    "extras/enc/npy.h",
+]
+
+libjxl_codec_pgx_sources = [
+    "extras/dec/pgx.cc",
+    "extras/dec/pgx.h",
+    "extras/enc/pgx.cc",
+    "extras/enc/pgx.h",
+]
+
+libjxl_codec_pnm_sources = [
+    "extras/dec/pnm.cc",
+    "extras/dec/pnm.h",
+    "extras/enc/pnm.cc",
+    "extras/enc/pnm.h",
+]
+
+libjxl_dec_box_sources = [
+    "jxl/box_content_decoder.cc",
+    "jxl/box_content_decoder.h",
+]
+
+libjxl_dec_jpeg_sources = [
+    "jxl/decode_to_jpeg.cc",
+    "jxl/decode_to_jpeg.h",
+    "jxl/jpeg/dec_jpeg_data.cc",
+    "jxl/jpeg/dec_jpeg_data.h",
+    "jxl/jpeg/dec_jpeg_data_writer.cc",
+    "jxl/jpeg/dec_jpeg_data_writer.h",
+    "jxl/jpeg/dec_jpeg_output_chunk.h",
+    "jxl/jpeg/dec_jpeg_serialization_state.h",
+    "jxl/jpeg/jpeg_data.cc",
+    "jxl/jpeg/jpeg_data.h",
+]
+
+libjxl_dec_sources = [
+    "jxl/ac_context.h",
+    "jxl/ac_strategy.cc",
+    "jxl/ac_strategy.h",
+    "jxl/alpha.cc",
+    "jxl/alpha.h",
+    "jxl/ans_common.cc",
+    "jxl/ans_common.h",
+    "jxl/ans_params.h",
+    "jxl/blending.cc",
+    "jxl/blending.h",
+    "jxl/chroma_from_luma.cc",
+    "jxl/chroma_from_luma.h",
+    "jxl/codec_in_out.h",
+    "jxl/coeff_order.cc",
+    "jxl/coeff_order.h",
+    "jxl/coeff_order_fwd.h",
+    "jxl/color_encoding_internal.cc",
+    "jxl/color_encoding_internal.h",
+    "jxl/color_management.cc",
+    "jxl/color_management.h",
+    "jxl/common.h",
+    "jxl/compressed_dc.cc",
+    "jxl/compressed_dc.h",
+    "jxl/convolve-inl.h",
+    "jxl/convolve.h",
+    "jxl/convolve_separable5.cc",
+    "jxl/convolve_separable7.cc",
+    "jxl/convolve_slow.cc",
+    "jxl/convolve_symmetric3.cc",
+    "jxl/convolve_symmetric5.cc",
+    "jxl/dct-inl.h",
+    "jxl/dct_block-inl.h",
+    "jxl/dct_scales.cc",
+    "jxl/dct_scales.h",
+    "jxl/dct_util.h",
+    "jxl/dec_ans.cc",
+    "jxl/dec_ans.h",
+    "jxl/dec_bit_reader.h",
+    "jxl/dec_cache.cc",
+    "jxl/dec_cache.h",
+    "jxl/dec_context_map.cc",
+    "jxl/dec_context_map.h",
+    "jxl/dec_external_image.cc",
+    "jxl/dec_external_image.h",
+    "jxl/dec_frame.cc",
+    "jxl/dec_frame.h",
+    "jxl/dec_group.cc",
+    "jxl/dec_group.h",
+    "jxl/dec_group_border.cc",
+    "jxl/dec_group_border.h",
+    "jxl/dec_huffman.cc",
+    "jxl/dec_huffman.h",
+    "jxl/dec_modular.cc",
+    "jxl/dec_modular.h",
+    "jxl/dec_noise.cc",
+    "jxl/dec_noise.h",
+    "jxl/dec_patch_dictionary.cc",
+    "jxl/dec_patch_dictionary.h",
+    "jxl/dec_tone_mapping-inl.h",
+    "jxl/dec_transforms-inl.h",
+    "jxl/dec_xyb-inl.h",
+    "jxl/dec_xyb.cc",
+    "jxl/dec_xyb.h",
+    "jxl/decode.cc",
+    "jxl/entropy_coder.cc",
+    "jxl/entropy_coder.h",
+    "jxl/epf.cc",
+    "jxl/epf.h",
+    "jxl/exif.h",
+    "jxl/fast_dct-inl.h",
+    "jxl/fast_dct.cc",
+    "jxl/fast_dct.h",
+    "jxl/fast_dct128-inl.h",
+    "jxl/fast_dct16-inl.h",
+    "jxl/fast_dct256-inl.h",
+    "jxl/fast_dct32-inl.h",
+    "jxl/fast_dct64-inl.h",
+    "jxl/fast_dct8-inl.h",
+    "jxl/fast_math-inl.h",
+    "jxl/field_encodings.h",
+    "jxl/fields.cc",
+    "jxl/fields.h",
+    "jxl/frame_header.cc",
+    "jxl/frame_header.h",
+    "jxl/gauss_blur.cc",
+    "jxl/gauss_blur.h",
+    "jxl/headers.cc",
+    "jxl/headers.h",
+    "jxl/huffman_table.cc",
+    "jxl/huffman_table.h",
+    "jxl/icc_codec.cc",
+    "jxl/icc_codec.h",
+    "jxl/icc_codec_common.cc",
+    "jxl/icc_codec_common.h",
+    "jxl/image.cc",
+    "jxl/image.h",
+    "jxl/image_bundle.cc",
+    "jxl/image_bundle.h",
+    "jxl/image_metadata.cc",
+    "jxl/image_metadata.h",
+    "jxl/image_ops.h",
+    "jxl/inverse_mtf-inl.h",
+    "jxl/jxl_inspection.h",
+    "jxl/lehmer_code.h",
+    "jxl/loop_filter.cc",
+    "jxl/loop_filter.h",
+    "jxl/luminance.cc",
+    "jxl/luminance.h",
+    "jxl/matrix_ops.h",
+    "jxl/memory_manager_internal.cc",
+    "jxl/memory_manager_internal.h",
+    "jxl/modular/encoding/context_predict.h",
+    "jxl/modular/encoding/dec_ma.cc",
+    "jxl/modular/encoding/dec_ma.h",
+    "jxl/modular/encoding/encoding.cc",
+    "jxl/modular/encoding/encoding.h",
+    "jxl/modular/encoding/ma_common.h",
+    "jxl/modular/modular_image.cc",
+    "jxl/modular/modular_image.h",
+    "jxl/modular/options.h",
+    "jxl/modular/transform/palette.cc",
+    "jxl/modular/transform/palette.h",
+    "jxl/modular/transform/rct.cc",
+    "jxl/modular/transform/rct.h",
+    "jxl/modular/transform/squeeze.cc",
+    "jxl/modular/transform/squeeze.h",
+    "jxl/modular/transform/transform.cc",
+    "jxl/modular/transform/transform.h",
+    "jxl/noise.h",
+    "jxl/opsin_params.cc",
+    "jxl/opsin_params.h",
+    "jxl/passes_state.cc",
+    "jxl/passes_state.h",
+    "jxl/patch_dictionary_internal.h",
+    "jxl/quant_weights.cc",
+    "jxl/quant_weights.h",
+    "jxl/quantizer-inl.h",
+    "jxl/quantizer.cc",
+    "jxl/quantizer.h",
+    "jxl/rational_polynomial-inl.h",
+    "jxl/render_pipeline/low_memory_render_pipeline.cc",
+    "jxl/render_pipeline/low_memory_render_pipeline.h",
+    "jxl/render_pipeline/render_pipeline.cc",
+    "jxl/render_pipeline/render_pipeline.h",
+    "jxl/render_pipeline/render_pipeline_stage.h",
+    "jxl/render_pipeline/simple_render_pipeline.cc",
+    "jxl/render_pipeline/simple_render_pipeline.h",
+    "jxl/render_pipeline/stage_blending.cc",
+    "jxl/render_pipeline/stage_blending.h",
+    "jxl/render_pipeline/stage_chroma_upsampling.cc",
+    "jxl/render_pipeline/stage_chroma_upsampling.h",
+    "jxl/render_pipeline/stage_epf.cc",
+    "jxl/render_pipeline/stage_epf.h",
+    "jxl/render_pipeline/stage_from_linear.cc",
+    "jxl/render_pipeline/stage_from_linear.h",
+    "jxl/render_pipeline/stage_gaborish.cc",
+    "jxl/render_pipeline/stage_gaborish.h",
+    "jxl/render_pipeline/stage_noise.cc",
+    "jxl/render_pipeline/stage_noise.h",
+    "jxl/render_pipeline/stage_patches.cc",
+    "jxl/render_pipeline/stage_patches.h",
+    "jxl/render_pipeline/stage_splines.cc",
+    "jxl/render_pipeline/stage_splines.h",
+    "jxl/render_pipeline/stage_spot.cc",
+    "jxl/render_pipeline/stage_spot.h",
+    "jxl/render_pipeline/stage_to_linear.cc",
+    "jxl/render_pipeline/stage_to_linear.h",
+    "jxl/render_pipeline/stage_tone_mapping.cc",
+    "jxl/render_pipeline/stage_tone_mapping.h",
+    "jxl/render_pipeline/stage_upsampling.cc",
+    "jxl/render_pipeline/stage_upsampling.h",
+    "jxl/render_pipeline/stage_write.cc",
+    "jxl/render_pipeline/stage_write.h",
+    "jxl/render_pipeline/stage_xyb.cc",
+    "jxl/render_pipeline/stage_xyb.h",
+    "jxl/render_pipeline/stage_ycbcr.cc",
+    "jxl/render_pipeline/stage_ycbcr.h",
+    "jxl/sanitizers.h",
+    "jxl/simd_util-inl.h",
+    "jxl/splines.cc",
+    "jxl/splines.h",
+    "jxl/toc.cc",
+    "jxl/toc.h",
+    "jxl/transfer_functions-inl.h",
+    "jxl/transpose-inl.h",
+    "jxl/xorshift128plus-inl.h",
+]
+
+libjxl_enc_sources = [
+    "jxl/butteraugli/butteraugli.cc",
+    "jxl/butteraugli/butteraugli.h",
+    "jxl/butteraugli_wrapper.cc",
+    "jxl/enc_ac_strategy.cc",
+    "jxl/enc_ac_strategy.h",
+    "jxl/enc_adaptive_quantization.cc",
+    "jxl/enc_adaptive_quantization.h",
+    "jxl/enc_ans.cc",
+    "jxl/enc_ans.h",
+    "jxl/enc_ans_params.h",
+    "jxl/enc_ar_control_field.cc",
+    "jxl/enc_ar_control_field.h",
+    "jxl/enc_aux_out.cc",
+    "jxl/enc_aux_out.h",
+    "jxl/enc_bit_writer.cc",
+    "jxl/enc_bit_writer.h",
+    "jxl/enc_butteraugli_comparator.cc",
+    "jxl/enc_butteraugli_comparator.h",
+    "jxl/enc_butteraugli_pnorm.cc",
+    "jxl/enc_butteraugli_pnorm.h",
+    "jxl/enc_cache.cc",
+    "jxl/enc_cache.h",
+    "jxl/enc_chroma_from_luma.cc",
+    "jxl/enc_chroma_from_luma.h",
+    "jxl/enc_cluster.cc",
+    "jxl/enc_cluster.h",
+    "jxl/enc_coeff_order.cc",
+    "jxl/enc_coeff_order.h",
+    "jxl/enc_color_management.cc",
+    "jxl/enc_color_management.h",
+    "jxl/enc_comparator.cc",
+    "jxl/enc_comparator.h",
+    "jxl/enc_context_map.cc",
+    "jxl/enc_context_map.h",
+    "jxl/enc_detect_dots.cc",
+    "jxl/enc_detect_dots.h",
+    "jxl/enc_dot_dictionary.cc",
+    "jxl/enc_dot_dictionary.h",
+    "jxl/enc_entropy_coder.cc",
+    "jxl/enc_entropy_coder.h",
+    "jxl/enc_external_image.cc",
+    "jxl/enc_external_image.h",
+    "jxl/enc_fast_lossless.cc",
+    "jxl/enc_fast_lossless.h",
+    "jxl/enc_fields.cc",
+    "jxl/enc_fields.h",
+    "jxl/enc_file.cc",
+    "jxl/enc_file.h",
+    "jxl/enc_frame.cc",
+    "jxl/enc_frame.h",
+    "jxl/enc_gaborish.cc",
+    "jxl/enc_gaborish.h",
+    "jxl/enc_gamma_correct.h",
+    "jxl/enc_group.cc",
+    "jxl/enc_group.h",
+    "jxl/enc_heuristics.cc",
+    "jxl/enc_heuristics.h",
+    "jxl/enc_huffman.cc",
+    "jxl/enc_huffman.h",
+    "jxl/enc_huffman_tree.cc",
+    "jxl/enc_huffman_tree.h",
+    "jxl/enc_icc_codec.cc",
+    "jxl/enc_icc_codec.h",
+    "jxl/enc_image_bundle.cc",
+    "jxl/enc_image_bundle.h",
+    "jxl/enc_jxl_skcms.h",
+    "jxl/enc_linalg.cc",
+    "jxl/enc_linalg.h",
+    "jxl/enc_modular.cc",
+    "jxl/enc_modular.h",
+    "jxl/enc_noise.cc",
+    "jxl/enc_noise.h",
+    "jxl/enc_optimize.cc",
+    "jxl/enc_optimize.h",
+    "jxl/enc_params.h",
+    "jxl/enc_patch_dictionary.cc",
+    "jxl/enc_patch_dictionary.h",
+    "jxl/enc_photon_noise.cc",
+    "jxl/enc_photon_noise.h",
+    "jxl/enc_progressive_split.cc",
+    "jxl/enc_progressive_split.h",
+    "jxl/enc_quant_weights.cc",
+    "jxl/enc_quant_weights.h",
+    "jxl/enc_splines.cc",
+    "jxl/enc_splines.h",
+    "jxl/enc_toc.cc",
+    "jxl/enc_toc.h",
+    "jxl/enc_transforms-inl.h",
+    "jxl/enc_transforms.cc",
+    "jxl/enc_transforms.h",
+    "jxl/enc_xyb.cc",
+    "jxl/enc_xyb.h",
+    "jxl/encode.cc",
+    "jxl/encode_internal.h",
+    "jxl/jpeg/enc_jpeg_data.cc",
+    "jxl/jpeg/enc_jpeg_data.h",
+    "jxl/jpeg/enc_jpeg_data_reader.cc",
+    "jxl/jpeg/enc_jpeg_data_reader.h",
+    "jxl/jpeg/enc_jpeg_huffman_decode.cc",
+    "jxl/jpeg/enc_jpeg_huffman_decode.h",
+    "jxl/modular/encoding/enc_debug_tree.cc",
+    "jxl/modular/encoding/enc_debug_tree.h",
+    "jxl/modular/encoding/enc_encoding.cc",
+    "jxl/modular/encoding/enc_encoding.h",
+    "jxl/modular/encoding/enc_ma.cc",
+    "jxl/modular/encoding/enc_ma.h",
+    "jxl/modular/transform/enc_palette.cc",
+    "jxl/modular/transform/enc_palette.h",
+    "jxl/modular/transform/enc_rct.cc",
+    "jxl/modular/transform/enc_rct.h",
+    "jxl/modular/transform/enc_squeeze.cc",
+    "jxl/modular/transform/enc_squeeze.h",
+    "jxl/modular/transform/enc_transform.cc",
+    "jxl/modular/transform/enc_transform.h",
+]
+
+libjxl_extras_for_tools_sources = [
+    "extras/codec.cc",
+    "extras/codec.h",
+    "extras/hlg.cc",
+    "extras/hlg.h",
+    "extras/packed_image_convert.cc",
+    "extras/packed_image_convert.h",
+    "extras/tone_mapping.cc",
+    "extras/tone_mapping.h",
+]
+
+libjxl_extras_sources = [
+    "extras/dec/color_description.cc",
+    "extras/dec/color_description.h",
+    "extras/dec/color_hints.cc",
+    "extras/dec/color_hints.h",
+    "extras/dec/decode.cc",
+    "extras/dec/decode.h",
+    "extras/enc/encode.cc",
+    "extras/enc/encode.h",
+    "extras/exif.cc",
+    "extras/exif.h",
+    "extras/packed_image.h",
+    "extras/size_constraints.h",
+    "extras/time.cc",
+    "extras/time.h",
+]
+
+libjxl_gbench_sources = [
+    "extras/tone_mapping_gbench.cc",
+    "jxl/dec_external_image_gbench.cc",
+    "jxl/enc_external_image_gbench.cc",
+    "jxl/gauss_blur_gbench.cc",
+    "jxl/splines_gbench.cc",
+    "jxl/tf_gbench.cc",
+]
+
+libjxl_jpegli_sources = [
+    "jpegli/adaptive_quantization.cc",
+    "jpegli/adaptive_quantization.h",
+    "jpegli/bit_writer.cc",
+    "jpegli/bit_writer.h",
+    "jpegli/bitstream.cc",
+    "jpegli/bitstream.h",
+    "jpegli/color_quantize.cc",
+    "jpegli/color_quantize.h",
+    "jpegli/color_transform.cc",
+    "jpegli/color_transform.h",
+    "jpegli/common.cc",
+    "jpegli/common.h",
+    "jpegli/common_internal.h",
+    "jpegli/dct-inl.h",
+    "jpegli/dct.cc",
+    "jpegli/dct.h",
+    "jpegli/decode.cc",
+    "jpegli/decode.h",
+    "jpegli/decode_internal.h",
+    "jpegli/decode_marker.cc",
+    "jpegli/decode_marker.h",
+    "jpegli/decode_scan.cc",
+    "jpegli/decode_scan.h",
+    "jpegli/destination_manager.cc",
+    "jpegli/downsample.cc",
+    "jpegli/downsample.h",
+    "jpegli/encode.cc",
+    "jpegli/encode.h",
+    "jpegli/encode_internal.h",
+    "jpegli/entropy_coding.cc",
+    "jpegli/entropy_coding.h",
+    "jpegli/error.cc",
+    "jpegli/error.h",
+    "jpegli/huffman.cc",
+    "jpegli/huffman.h",
+    "jpegli/idct.cc",
+    "jpegli/idct.h",
+    "jpegli/input.cc",
+    "jpegli/input.h",
+    "jpegli/memory_manager.cc",
+    "jpegli/memory_manager.h",
+    "jpegli/quant.cc",
+    "jpegli/quant.h",
+    "jpegli/render.cc",
+    "jpegli/render.h",
+    "jpegli/simd.cc",
+    "jpegli/simd.h",
+    "jpegli/source_manager.cc",
+    "jpegli/transpose-inl.h",
+    "jpegli/upsample.cc",
+    "jpegli/upsample.h",
+]
+
+libjxl_jpegli_testlib_files = [
+    "jpegli/test_utils.cc",
+    "jpegli/test_utils.h",
+]
+
+libjxl_jpegli_tests = [
+    "jpegli/decode_api_test.cc",
+    "jpegli/encode_api_test.cc",
+    "jpegli/error_handling_test.cc",
+    "jpegli/input_suspension_test.cc",
+    "jpegli/output_suspension_test.cc",
+    "jpegli/source_manager_test.cc",
+    "jpegli/streaming_test.cc",
+    "jpegli/transcode_api_test.cc",
+]
+
+libjxl_jpegli_wrapper_sources = [
+    "jpegli/libjpeg_wrapper.cc",
+]
+
+libjxl_major_version = 0
+
+libjxl_minor_version = 9
+
+libjxl_patch_version = 0
+
+libjxl_public_headers = [
+    "include/jxl/butteraugli.h",
+    "include/jxl/butteraugli_cxx.h",
+    "include/jxl/cms_interface.h",
+    "include/jxl/codestream_header.h",
+    "include/jxl/color_encoding.h",
+    "include/jxl/decode.h",
+    "include/jxl/decode_cxx.h",
+    "include/jxl/encode.h",
+    "include/jxl/encode_cxx.h",
+    "include/jxl/memory_manager.h",
+    "include/jxl/parallel_runner.h",
+    "include/jxl/types.h",
+]
+
+libjxl_testlib_files = [
+    "jxl/dct_for_test.h",
+    "jxl/dec_transforms_testonly.cc",
+    "jxl/dec_transforms_testonly.h",
+    "jxl/fake_parallel_runner_testonly.h",
+    "jxl/image_test_utils.h",
+    "jxl/render_pipeline/test_render_pipeline_stages.h",
+    "jxl/test_image.cc",
+    "jxl/test_image.h",
+    "jxl/test_utils.cc",
+    "jxl/test_utils.h",
+]
+
+libjxl_tests = [
+    "extras/codec_test.cc",
+    "extras/dec/color_description_test.cc",
+    "extras/dec/pgx_test.cc",
+    "extras/jpegli_test.cc",
+    "jxl/ac_strategy_test.cc",
+    "jxl/alpha_test.cc",
+    "jxl/ans_common_test.cc",
+    "jxl/ans_test.cc",
+    "jxl/bit_reader_test.cc",
+    "jxl/bits_test.cc",
+    "jxl/blending_test.cc",
+    "jxl/butteraugli_test.cc",
+    "jxl/byte_order_test.cc",
+    "jxl/coeff_order_test.cc",
+    "jxl/color_encoding_internal_test.cc",
+    "jxl/color_management_test.cc",
+    "jxl/convolve_test.cc",
+    "jxl/data_parallel_test.cc",
+    "jxl/dct_test.cc",
+    "jxl/decode_test.cc",
+    "jxl/enc_external_image_test.cc",
+    "jxl/enc_gaborish_test.cc",
+    "jxl/enc_linalg_test.cc",
+    "jxl/enc_optimize_test.cc",
+    "jxl/enc_photon_noise_test.cc",
+    "jxl/encode_test.cc",
+    "jxl/entropy_coder_test.cc",
+    "jxl/fast_dct_test.cc",
+    "jxl/fast_math_test.cc",
+    "jxl/fields_test.cc",
+    "jxl/gamma_correct_test.cc",
+    "jxl/gauss_blur_test.cc",
+    "jxl/gradient_test.cc",
+    "jxl/iaca_test.cc",
+    "jxl/icc_codec_test.cc",
+    "jxl/image_bundle_test.cc",
+    "jxl/image_ops_test.cc",
+    "jxl/jxl_test.cc",
+    "jxl/lehmer_code_test.cc",
+    "jxl/modular_test.cc",
+    "jxl/opsin_image_test.cc",
+    "jxl/opsin_inverse_test.cc",
+    "jxl/padded_bytes_test.cc",
+    "jxl/passes_test.cc",
+    "jxl/patch_dictionary_test.cc",
+    "jxl/preview_test.cc",
+    "jxl/quant_weights_test.cc",
+    "jxl/quantizer_test.cc",
+    "jxl/rational_polynomial_test.cc",
+    "jxl/render_pipeline/render_pipeline_test.cc",
+    "jxl/roundtrip_test.cc",
+    "jxl/simd_util_test.cc",
+    "jxl/speed_tier_test.cc",
+    "jxl/splines_test.cc",
+    "jxl/toc_test.cc",
+    "jxl/xorshift128plus_test.cc",
+    "threads/thread_parallel_runner_test.cc",
+]
+
+libjxl_threads_public_headers = [
+    "include/jxl/resizable_parallel_runner.h",
+    "include/jxl/resizable_parallel_runner_cxx.h",
+    "include/jxl/thread_parallel_runner.h",
+    "include/jxl/thread_parallel_runner_cxx.h",
+]
+
+libjxl_threads_sources = [
+    "threads/resizable_parallel_runner.cc",
+    "threads/thread_parallel_runner.cc",
+    "threads/thread_parallel_runner_internal.cc",
+    "threads/thread_parallel_runner_internal.h",
+]
diff --git a/third_party/jpeg-xl/lib/jxl_lists.cmake b/third_party/jpeg-xl/lib/jxl_lists.cmake
new file mode 100644
index 0000000000..787684d249
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl_lists.cmake
@@ -0,0 +1,631 @@
+# Copyright (c) the JPEG XL Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+# This file is generated, do not modify by manually.
+# Run `tools/scripts/build_cleaner.py --update` to regenerate it.
+
+set(JPEGXL_INTERNAL_BASE_SOURCES
+  jxl/base/arch_macros.h
+  jxl/base/bits.h
+  jxl/base/byte_order.h
+  jxl/base/cache_aligned.cc
+  jxl/base/cache_aligned.h
+  jxl/base/compiler_specific.h
+  jxl/base/data_parallel.cc
+  jxl/base/data_parallel.h
+  jxl/base/file_io.h
+  jxl/base/float.h
+  jxl/base/iaca.h
+  jxl/base/os_macros.h
+  jxl/base/override.h
+  jxl/base/padded_bytes.cc
+  jxl/base/padded_bytes.h
+  jxl/base/printf_macros.h
+  jxl/base/profiler.cc
+  jxl/base/profiler.h
+  jxl/base/random.cc
+  jxl/base/random.h
+  jxl/base/sanitizer_definitions.h
+  jxl/base/scope_guard.h
+  jxl/base/span.h
+  jxl/base/status.h
+  jxl/base/tsc_timer.h
+)
+
+set(JPEGXL_INTERNAL_CODEC_APNG_SOURCES
+  extras/dec/apng.cc
+  extras/dec/apng.h
+  extras/enc/apng.cc
+  extras/enc/apng.h
+)
+
+set(JPEGXL_INTERNAL_CODEC_EXR_SOURCES
+  extras/dec/exr.cc
+  extras/dec/exr.h
+  extras/enc/exr.cc
+  extras/enc/exr.h
+)
+
+set(JPEGXL_INTERNAL_CODEC_GIF_SOURCES
+  extras/dec/gif.cc
+  extras/dec/gif.h
+)
+
+set(JPEGXL_INTERNAL_CODEC_JPEGLI_SOURCES
+  extras/dec/jpegli.cc
+  extras/dec/jpegli.h
+  extras/enc/jpegli.cc
+  extras/enc/jpegli.h
+)
+
+set(JPEGXL_INTERNAL_CODEC_JPG_SOURCES
+  extras/dec/jpg.cc
+  extras/dec/jpg.h
+  extras/enc/jpg.cc
+  extras/enc/jpg.h
+)
+
+set(JPEGXL_INTERNAL_CODEC_JXL_SOURCES
+  extras/dec/jxl.cc
+  extras/dec/jxl.h
+  extras/enc/jxl.cc
+  extras/enc/jxl.h
+)
+
+set(JPEGXL_INTERNAL_CODEC_NPY_SOURCES
+  extras/enc/npy.cc
+  extras/enc/npy.h
+)
+
+set(JPEGXL_INTERNAL_CODEC_PGX_SOURCES
+  extras/dec/pgx.cc
+  extras/dec/pgx.h
+  extras/enc/pgx.cc
+  extras/enc/pgx.h
+)
+
+set(JPEGXL_INTERNAL_CODEC_PNM_SOURCES
+  extras/dec/pnm.cc
+  extras/dec/pnm.h
+  extras/enc/pnm.cc
+  extras/enc/pnm.h
+)
+
+set(JPEGXL_INTERNAL_DEC_BOX_SOURCES
+  jxl/box_content_decoder.cc
+  jxl/box_content_decoder.h
+)
+
+set(JPEGXL_INTERNAL_DEC_JPEG_SOURCES
+  jxl/decode_to_jpeg.cc
+  jxl/decode_to_jpeg.h
+  jxl/jpeg/dec_jpeg_data.cc
+  jxl/jpeg/dec_jpeg_data.h
+  jxl/jpeg/dec_jpeg_data_writer.cc
+  jxl/jpeg/dec_jpeg_data_writer.h
+  jxl/jpeg/dec_jpeg_output_chunk.h
+  jxl/jpeg/dec_jpeg_serialization_state.h
+  jxl/jpeg/jpeg_data.cc
+  jxl/jpeg/jpeg_data.h
+)
+
+set(JPEGXL_INTERNAL_DEC_SOURCES
+  jxl/ac_context.h
+  jxl/ac_strategy.cc
+  jxl/ac_strategy.h
+  jxl/alpha.cc
+  jxl/alpha.h
+  jxl/ans_common.cc
+  jxl/ans_common.h
+  jxl/ans_params.h
+  jxl/blending.cc
+  jxl/blending.h
+  jxl/chroma_from_luma.cc
+  jxl/chroma_from_luma.h
+  jxl/codec_in_out.h
+  jxl/coeff_order.cc
+  jxl/coeff_order.h
+  jxl/coeff_order_fwd.h
+  jxl/color_encoding_internal.cc
+  jxl/color_encoding_internal.h
+  jxl/color_management.cc
+  jxl/color_management.h
+  jxl/common.h
+  jxl/compressed_dc.cc
+  jxl/compressed_dc.h
+  jxl/convolve-inl.h
+  jxl/convolve.h
+  jxl/convolve_separable5.cc
+  jxl/convolve_separable7.cc
+  jxl/convolve_slow.cc
+  jxl/convolve_symmetric3.cc
+  jxl/convolve_symmetric5.cc
+  jxl/dct-inl.h
+  jxl/dct_block-inl.h
+  jxl/dct_scales.cc
+  jxl/dct_scales.h
+  jxl/dct_util.h
+  jxl/dec_ans.cc
+  jxl/dec_ans.h
+  jxl/dec_bit_reader.h
+  jxl/dec_cache.cc
+  jxl/dec_cache.h
+  jxl/dec_context_map.cc
+  jxl/dec_context_map.h
+  jxl/dec_external_image.cc
+  jxl/dec_external_image.h
+  jxl/dec_frame.cc
+  jxl/dec_frame.h
+  jxl/dec_group.cc
+  jxl/dec_group.h
+  jxl/dec_group_border.cc
+  jxl/dec_group_border.h
+  jxl/dec_huffman.cc
+  jxl/dec_huffman.h
+  jxl/dec_modular.cc
+  jxl/dec_modular.h
+  jxl/dec_noise.cc
+  jxl/dec_noise.h
+  jxl/dec_patch_dictionary.cc
+  jxl/dec_patch_dictionary.h
+  jxl/dec_tone_mapping-inl.h
+  jxl/dec_transforms-inl.h
+  jxl/dec_xyb-inl.h
+  jxl/dec_xyb.cc
+  jxl/dec_xyb.h
+  jxl/decode.cc
+  jxl/entropy_coder.cc
+  jxl/entropy_coder.h
+  jxl/epf.cc
+  jxl/epf.h
+  jxl/exif.h
+  jxl/fast_dct-inl.h
+  jxl/fast_dct.cc
+  jxl/fast_dct.h
+  jxl/fast_dct128-inl.h
+  jxl/fast_dct16-inl.h
+  jxl/fast_dct256-inl.h
+  jxl/fast_dct32-inl.h
+  jxl/fast_dct64-inl.h
+  jxl/fast_dct8-inl.h
+  jxl/fast_math-inl.h
+  jxl/field_encodings.h
+  jxl/fields.cc
+  jxl/fields.h
+  jxl/frame_header.cc
+  jxl/frame_header.h
+  jxl/gauss_blur.cc
+  jxl/gauss_blur.h
+  jxl/headers.cc
+  jxl/headers.h
+  jxl/huffman_table.cc
+  jxl/huffman_table.h
+  jxl/icc_codec.cc
+  jxl/icc_codec.h
+  jxl/icc_codec_common.cc
+  jxl/icc_codec_common.h
+  jxl/image.cc
+  jxl/image.h
+  jxl/image_bundle.cc
+  jxl/image_bundle.h
+  jxl/image_metadata.cc
+  jxl/image_metadata.h
+  jxl/image_ops.h
+  jxl/inverse_mtf-inl.h
+  jxl/jxl_inspection.h
+  jxl/lehmer_code.h
+  jxl/loop_filter.cc
+  jxl/loop_filter.h
+  jxl/luminance.cc
+  jxl/luminance.h
+  jxl/matrix_ops.h
+  jxl/memory_manager_internal.cc
+  jxl/memory_manager_internal.h
+  jxl/modular/encoding/context_predict.h
+  jxl/modular/encoding/dec_ma.cc
+  jxl/modular/encoding/dec_ma.h
+  jxl/modular/encoding/encoding.cc
+  jxl/modular/encoding/encoding.h
+  jxl/modular/encoding/ma_common.h
+  jxl/modular/modular_image.cc
+  jxl/modular/modular_image.h
+  jxl/modular/options.h
+  jxl/modular/transform/palette.cc
+  jxl/modular/transform/palette.h
+  jxl/modular/transform/rct.cc
+  jxl/modular/transform/rct.h
+  jxl/modular/transform/squeeze.cc
+  jxl/modular/transform/squeeze.h
+  jxl/modular/transform/transform.cc
+  jxl/modular/transform/transform.h
+  jxl/noise.h
+  jxl/opsin_params.cc
+  jxl/opsin_params.h
+  jxl/passes_state.cc
+  jxl/passes_state.h
+  jxl/patch_dictionary_internal.h
+  jxl/quant_weights.cc
+  jxl/quant_weights.h
+  jxl/quantizer-inl.h
+  jxl/quantizer.cc
+  jxl/quantizer.h
+  jxl/rational_polynomial-inl.h
+  jxl/render_pipeline/low_memory_render_pipeline.cc
+  jxl/render_pipeline/low_memory_render_pipeline.h
+  jxl/render_pipeline/render_pipeline.cc
+  jxl/render_pipeline/render_pipeline.h
+  jxl/render_pipeline/render_pipeline_stage.h
+  jxl/render_pipeline/simple_render_pipeline.cc
+  jxl/render_pipeline/simple_render_pipeline.h
+  jxl/render_pipeline/stage_blending.cc
+  jxl/render_pipeline/stage_blending.h
+  jxl/render_pipeline/stage_chroma_upsampling.cc
+  jxl/render_pipeline/stage_chroma_upsampling.h
+  jxl/render_pipeline/stage_epf.cc
+  jxl/render_pipeline/stage_epf.h
+  jxl/render_pipeline/stage_from_linear.cc
+  jxl/render_pipeline/stage_from_linear.h
+  jxl/render_pipeline/stage_gaborish.cc
+  jxl/render_pipeline/stage_gaborish.h
+  jxl/render_pipeline/stage_noise.cc
+  jxl/render_pipeline/stage_noise.h
+  jxl/render_pipeline/stage_patches.cc
+  jxl/render_pipeline/stage_patches.h
+  jxl/render_pipeline/stage_splines.cc
+  jxl/render_pipeline/stage_splines.h
+  jxl/render_pipeline/stage_spot.cc
+  jxl/render_pipeline/stage_spot.h
+  jxl/render_pipeline/stage_to_linear.cc
+  jxl/render_pipeline/stage_to_linear.h
+  jxl/render_pipeline/stage_tone_mapping.cc
+  jxl/render_pipeline/stage_tone_mapping.h
+  jxl/render_pipeline/stage_upsampling.cc
+  jxl/render_pipeline/stage_upsampling.h
+  jxl/render_pipeline/stage_write.cc
+  jxl/render_pipeline/stage_write.h
+  jxl/render_pipeline/stage_xyb.cc
+  jxl/render_pipeline/stage_xyb.h
+  jxl/render_pipeline/stage_ycbcr.cc
+  jxl/render_pipeline/stage_ycbcr.h
+  jxl/sanitizers.h
+  jxl/simd_util-inl.h
+  jxl/splines.cc
+  jxl/splines.h
+  jxl/toc.cc
+  jxl/toc.h
+  jxl/transfer_functions-inl.h
+  jxl/transpose-inl.h
+  jxl/xorshift128plus-inl.h
+)
+
+set(JPEGXL_INTERNAL_ENC_SOURCES
+  jxl/butteraugli/butteraugli.cc
+  jxl/butteraugli/butteraugli.h
+  jxl/butteraugli_wrapper.cc
+  jxl/enc_ac_strategy.cc
+  jxl/enc_ac_strategy.h
+  jxl/enc_adaptive_quantization.cc
+  jxl/enc_adaptive_quantization.h
+  jxl/enc_ans.cc
+  jxl/enc_ans.h
+  jxl/enc_ans_params.h
+  jxl/enc_ar_control_field.cc
+  jxl/enc_ar_control_field.h
+  jxl/enc_aux_out.cc
+  jxl/enc_aux_out.h
+  jxl/enc_bit_writer.cc
+  jxl/enc_bit_writer.h
+  jxl/enc_butteraugli_comparator.cc
+  jxl/enc_butteraugli_comparator.h
+  jxl/enc_butteraugli_pnorm.cc
+  jxl/enc_butteraugli_pnorm.h
+  jxl/enc_cache.cc
+  jxl/enc_cache.h
+  jxl/enc_chroma_from_luma.cc
+  jxl/enc_chroma_from_luma.h
+  jxl/enc_cluster.cc
+  jxl/enc_cluster.h
+  jxl/enc_coeff_order.cc
+  jxl/enc_coeff_order.h
+  jxl/enc_color_management.cc
+  jxl/enc_color_management.h
+  jxl/enc_comparator.cc
+  jxl/enc_comparator.h
+  jxl/enc_context_map.cc
+  jxl/enc_context_map.h
+  jxl/enc_detect_dots.cc
+  jxl/enc_detect_dots.h
+  jxl/enc_dot_dictionary.cc
+  jxl/enc_dot_dictionary.h
+  jxl/enc_entropy_coder.cc
+  jxl/enc_entropy_coder.h
+  jxl/enc_external_image.cc
+  jxl/enc_external_image.h
+  jxl/enc_fast_lossless.cc
+  jxl/enc_fast_lossless.h
+  jxl/enc_fields.cc
+  jxl/enc_fields.h
+  jxl/enc_file.cc
+  jxl/enc_file.h
+  jxl/enc_frame.cc
+  jxl/enc_frame.h
+  jxl/enc_gaborish.cc
+  jxl/enc_gaborish.h
+  jxl/enc_gamma_correct.h
+  jxl/enc_group.cc
+  jxl/enc_group.h
+  jxl/enc_heuristics.cc
+  jxl/enc_heuristics.h
+  jxl/enc_huffman.cc
+  jxl/enc_huffman.h
+  jxl/enc_huffman_tree.cc
+  jxl/enc_huffman_tree.h
+  jxl/enc_icc_codec.cc
+  jxl/enc_icc_codec.h
+  jxl/enc_image_bundle.cc
+  jxl/enc_image_bundle.h
+  jxl/enc_jxl_skcms.h
+  jxl/enc_linalg.cc
+  jxl/enc_linalg.h
+  jxl/enc_modular.cc
+  jxl/enc_modular.h
+  jxl/enc_noise.cc
+  jxl/enc_noise.h
+  jxl/enc_optimize.cc
+  jxl/enc_optimize.h
+  jxl/enc_params.h
+  jxl/enc_patch_dictionary.cc
+  jxl/enc_patch_dictionary.h
+  jxl/enc_photon_noise.cc
+  jxl/enc_photon_noise.h
+  jxl/enc_progressive_split.cc
+  jxl/enc_progressive_split.h
+  jxl/enc_quant_weights.cc
+  jxl/enc_quant_weights.h
+  jxl/enc_splines.cc
+  jxl/enc_splines.h
+  jxl/enc_toc.cc
+  jxl/enc_toc.h
+  jxl/enc_transforms-inl.h
+  jxl/enc_transforms.cc
+  jxl/enc_transforms.h
+  jxl/enc_xyb.cc
+  jxl/enc_xyb.h
+  jxl/encode.cc
+  jxl/encode_internal.h
+  jxl/jpeg/enc_jpeg_data.cc
+  jxl/jpeg/enc_jpeg_data.h
+  jxl/jpeg/enc_jpeg_data_reader.cc
+  jxl/jpeg/enc_jpeg_data_reader.h
+  jxl/jpeg/enc_jpeg_huffman_decode.cc
+  jxl/jpeg/enc_jpeg_huffman_decode.h
+  jxl/modular/encoding/enc_debug_tree.cc
+  jxl/modular/encoding/enc_debug_tree.h
+  jxl/modular/encoding/enc_encoding.cc
+  jxl/modular/encoding/enc_encoding.h
+  jxl/modular/encoding/enc_ma.cc
+  jxl/modular/encoding/enc_ma.h
+  jxl/modular/transform/enc_palette.cc
+  jxl/modular/transform/enc_palette.h
+  jxl/modular/transform/enc_rct.cc
+  jxl/modular/transform/enc_rct.h
+  jxl/modular/transform/enc_squeeze.cc
+  jxl/modular/transform/enc_squeeze.h
+  jxl/modular/transform/enc_transform.cc
+  jxl/modular/transform/enc_transform.h
+)
+
+set(JPEGXL_INTERNAL_EXTRAS_FOR_TOOLS_SOURCES
+  extras/codec.cc
+  extras/codec.h
+  extras/hlg.cc
+  extras/hlg.h
+  extras/packed_image_convert.cc
+  extras/packed_image_convert.h
+  extras/tone_mapping.cc
+  extras/tone_mapping.h
+)
+
+set(JPEGXL_INTERNAL_EXTRAS_SOURCES
+  extras/dec/color_description.cc
+  extras/dec/color_description.h
+  extras/dec/color_hints.cc
+  extras/dec/color_hints.h
+  extras/dec/decode.cc
+  extras/dec/decode.h
+  extras/enc/encode.cc
+  extras/enc/encode.h
+  extras/exif.cc
+  extras/exif.h
+  extras/packed_image.h
+  extras/size_constraints.h
+  extras/time.cc
+  extras/time.h
+)
+
+set(JPEGXL_INTERNAL_GBENCH_SOURCES
+  extras/tone_mapping_gbench.cc
+  jxl/dec_external_image_gbench.cc
+  jxl/enc_external_image_gbench.cc
+  jxl/gauss_blur_gbench.cc
+  jxl/splines_gbench.cc
+  jxl/tf_gbench.cc
+)
+
+set(JPEGXL_INTERNAL_JPEGLI_SOURCES
+  jpegli/adaptive_quantization.cc
+  jpegli/adaptive_quantization.h
+  jpegli/bit_writer.cc
+  jpegli/bit_writer.h
+  jpegli/bitstream.cc
+  jpegli/bitstream.h
+  jpegli/color_quantize.cc
+  jpegli/color_quantize.h
+  jpegli/color_transform.cc
+  jpegli/color_transform.h
+  jpegli/common.cc
+  jpegli/common.h
+  jpegli/common_internal.h
+  jpegli/dct-inl.h
+  jpegli/dct.cc
+  jpegli/dct.h
+  jpegli/decode.cc
+  jpegli/decode.h
+  jpegli/decode_internal.h
+  jpegli/decode_marker.cc
+  jpegli/decode_marker.h
+  jpegli/decode_scan.cc
+  jpegli/decode_scan.h
+  jpegli/destination_manager.cc
+  jpegli/downsample.cc
+  jpegli/downsample.h
+  jpegli/encode.cc
+  jpegli/encode.h
+  jpegli/encode_internal.h
+  jpegli/entropy_coding.cc
+  jpegli/entropy_coding.h
+  jpegli/error.cc
+  jpegli/error.h
+  jpegli/huffman.cc
+  jpegli/huffman.h
+  jpegli/idct.cc
+  jpegli/idct.h
+  jpegli/input.cc
+  jpegli/input.h
+  jpegli/memory_manager.cc
+  jpegli/memory_manager.h
+  jpegli/quant.cc
+  jpegli/quant.h
+  jpegli/render.cc
+  jpegli/render.h
+  jpegli/simd.cc
+  jpegli/simd.h
+  jpegli/source_manager.cc
+  jpegli/transpose-inl.h
+  jpegli/upsample.cc
+  jpegli/upsample.h
+)
+
+set(JPEGXL_INTERNAL_JPEGLI_TESTLIB_FILES
+  jpegli/test_utils.cc
+  jpegli/test_utils.h
+)
+
+set(JPEGXL_INTERNAL_JPEGLI_TESTS
+  jpegli/decode_api_test.cc
+  jpegli/encode_api_test.cc
+  jpegli/error_handling_test.cc
+  jpegli/input_suspension_test.cc
+  jpegli/output_suspension_test.cc
+  jpegli/source_manager_test.cc
+  jpegli/streaming_test.cc
+  jpegli/transcode_api_test.cc
+)
+
+set(JPEGXL_INTERNAL_JPEGLI_WRAPPER_SOURCES
+  jpegli/libjpeg_wrapper.cc
+)
+
+set(JPEGXL_INTERNAL_PUBLIC_HEADERS
+  include/jxl/butteraugli.h
+  include/jxl/butteraugli_cxx.h
+  include/jxl/cms_interface.h
+  include/jxl/codestream_header.h
+  include/jxl/color_encoding.h
+  include/jxl/decode.h
+  include/jxl/decode_cxx.h
+  include/jxl/encode.h
+  include/jxl/encode_cxx.h
+  include/jxl/memory_manager.h
+  include/jxl/parallel_runner.h
+  include/jxl/types.h
+)
+
+set(JPEGXL_INTERNAL_TESTLIB_FILES
+  jxl/dct_for_test.h
+  jxl/dec_transforms_testonly.cc
+  jxl/dec_transforms_testonly.h
+  jxl/fake_parallel_runner_testonly.h
+  jxl/image_test_utils.h
+  jxl/render_pipeline/test_render_pipeline_stages.h
+  jxl/test_image.cc
+  jxl/test_image.h
+  jxl/test_utils.cc
+  jxl/test_utils.h
+)
+
+set(JPEGXL_INTERNAL_TESTS
+  extras/codec_test.cc
+  extras/dec/color_description_test.cc
+  extras/dec/pgx_test.cc
+  extras/jpegli_test.cc
+  jxl/ac_strategy_test.cc
+  jxl/alpha_test.cc
+  jxl/ans_common_test.cc
+  jxl/ans_test.cc
+  jxl/bit_reader_test.cc
+  jxl/bits_test.cc
+  jxl/blending_test.cc
+  jxl/butteraugli_test.cc
+  jxl/byte_order_test.cc
+  jxl/coeff_order_test.cc
+  jxl/color_encoding_internal_test.cc
+  jxl/color_management_test.cc
+  jxl/convolve_test.cc
+  jxl/data_parallel_test.cc
+  jxl/dct_test.cc
+  jxl/decode_test.cc
+  jxl/enc_external_image_test.cc
+  jxl/enc_gaborish_test.cc
+  jxl/enc_linalg_test.cc
+  jxl/enc_optimize_test.cc
+  jxl/enc_photon_noise_test.cc
+  jxl/encode_test.cc
+  jxl/entropy_coder_test.cc
+  jxl/fast_dct_test.cc
+  jxl/fast_math_test.cc
+  jxl/fields_test.cc
+  jxl/gamma_correct_test.cc
+  jxl/gauss_blur_test.cc
+  jxl/gradient_test.cc
+  jxl/iaca_test.cc
+  jxl/icc_codec_test.cc
+  jxl/image_bundle_test.cc
+  jxl/image_ops_test.cc
+  jxl/jxl_test.cc
+  jxl/lehmer_code_test.cc
+  jxl/modular_test.cc
+  jxl/opsin_image_test.cc
+  jxl/opsin_inverse_test.cc
+  jxl/padded_bytes_test.cc
+  jxl/passes_test.cc
+  jxl/patch_dictionary_test.cc
+  jxl/preview_test.cc
+  jxl/quant_weights_test.cc
+  jxl/quantizer_test.cc
+  jxl/rational_polynomial_test.cc
+  jxl/render_pipeline/render_pipeline_test.cc
+  jxl/roundtrip_test.cc
+  jxl/simd_util_test.cc
+  jxl/speed_tier_test.cc
+  jxl/splines_test.cc
+  jxl/toc_test.cc
+  jxl/xorshift128plus_test.cc
+  threads/thread_parallel_runner_test.cc
+)
+
+set(JPEGXL_INTERNAL_THREADS_PUBLIC_HEADERS
+  include/jxl/resizable_parallel_runner.h
+  include/jxl/resizable_parallel_runner_cxx.h
+  include/jxl/thread_parallel_runner.h
+  include/jxl/thread_parallel_runner_cxx.h
+)
+
+set(JPEGXL_INTERNAL_THREADS_SOURCES
+  threads/resizable_parallel_runner.cc
+  threads/thread_parallel_runner.cc
+  threads/thread_parallel_runner_internal.cc
+  threads/thread_parallel_runner_internal.h
+)
diff --git a/third_party/jpeg-xl/lib/jxl_tests.cmake b/third_party/jpeg-xl/lib/jxl_tests.cmake
new file mode 100644
index 0000000000..88c5a89f5c
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl_tests.cmake
@@ -0,0 +1,84 @@
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+include(compatibility.cmake)
+include(jxl_lists.cmake)
+
+if(BUILD_TESTING OR JPEGXL_ENABLE_TOOLS)
+# Library with test-only code shared between all tests / fuzzers.
+add_library(jxl_testlib-static STATIC ${JPEGXL_INTERNAL_TESTLIB_FILES})
+target_compile_options(jxl_testlib-static PRIVATE
+  ${JPEGXL_INTERNAL_FLAGS}
+  ${JPEGXL_COVERAGE_FLAGS}
+)
+target_compile_definitions(jxl_testlib-static PUBLIC
+  -DTEST_DATA_PATH="${JPEGXL_TEST_DATA_PATH}")
+target_include_directories(jxl_testlib-static PUBLIC
+  "${PROJECT_SOURCE_DIR}"
+)
+target_link_libraries(jxl_testlib-static
+  hwy
+  jxl_extras_nocodec-static
+  jxl-static
+)
+endif()
+
+if(NOT BUILD_TESTING)
+  return()
+endif()
+
+list(APPEND JPEGXL_INTERNAL_TESTS
+  # TODO(deymo): Move this to tools/
+  ../tools/box/box_test.cc
+  ../tools/djxl_fuzzer_test.cc
+)
+
+find_package(GTest)
+
+# Individual test binaries:
+file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/tests)
+foreach (TESTFILE IN LISTS JPEGXL_INTERNAL_TESTS)
+  # The TESTNAME is the name without the extension or directory.
+  get_filename_component(TESTNAME ${TESTFILE} NAME_WE)
+  if(TESTFILE STREQUAL ../tools/djxl_fuzzer_test.cc)
+    add_executable(${TESTNAME} ${TESTFILE} ../tools/djxl_fuzzer.cc)
+  else()
+    add_executable(${TESTNAME} ${TESTFILE})
+  endif()
+  if(JPEGXL_EMSCRIPTEN)
+    # The emscripten linking step takes too much memory and crashes during the
+    # wasm-opt step when using -O2 optimization level
+    set_target_properties(${TESTNAME} PROPERTIES LINK_FLAGS "\
+      -O1 \
+      -s USE_LIBPNG=1 \
+      -s ALLOW_MEMORY_GROWTH=1 \
+      -s SINGLE_FILE=1 \
+      -s PROXY_TO_PTHREAD \
+      -s EXIT_RUNTIME=1 \
+      -s USE_PTHREADS=1 \
+      -s NODERAWFS=1 \
+    ")
+  else()
+    set_target_properties(${TESTNAME} PROPERTIES LINK_FLAGS "${JPEGXL_COVERAGE_LINK_FLAGS}")
+  endif()
+  target_compile_options(${TESTNAME} PRIVATE
+    ${JPEGXL_INTERNAL_FLAGS}
+    # Add coverage flags to the test binary so code in the private headers of
+    # the library is also instrumented when running tests that execute it.
+    ${JPEGXL_COVERAGE_FLAGS}
+  )
+  target_link_libraries(${TESTNAME}
+    box
+    gmock
+    GTest::GTest
+    GTest::Main
+    jxl_extras-static
+    jxl_testlib-static
+  )
+  # Output test targets in the test directory.
+  set_target_properties(${TESTNAME} PROPERTIES PREFIX "tests/")
+  if (WIN32 AND CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
+    set_target_properties(${TESTNAME} PROPERTIES COMPILE_FLAGS "-Wno-error")
+  endif ()
+  jxl_discover_tests(${TESTNAME})
+endforeach ()
diff --git a/third_party/jpeg-xl/lib/jxl_threads.cmake b/third_party/jpeg-xl/lib/jxl_threads.cmake
new file mode 100644
index 0000000000..2f5ac17c83
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl_threads.cmake
@@ -0,0 +1,120 @@
+# Copyright (c) the JPEG XL Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+find_package(Threads REQUIRED)
+
+include(jxl_lists.cmake)
+
+### Define the jxl_threads shared or static target library. The ${target}
+# parameter should already be created with add_library(), but this function
+# sets all the remaining common properties.
+function(_set_jxl_threads _target)
+  target_compile_options(${_target} PRIVATE ${JPEGXL_INTERNAL_FLAGS})
+  target_compile_options(${_target} PUBLIC ${JPEGXL_COVERAGE_FLAGS})
+  set_property(TARGET ${_target} PROPERTY POSITION_INDEPENDENT_CODE ON)
+
+  target_include_directories(${_target}
+    PRIVATE
+      "${PROJECT_SOURCE_DIR}"
+    PUBLIC
+      "${CMAKE_CURRENT_SOURCE_DIR}/include"
+      "${CMAKE_CURRENT_BINARY_DIR}/include")
+
+  target_link_libraries(${_target}
+    PUBLIC ${JPEGXL_COVERAGE_FLAGS} Threads::Threads
+  )
+
+  set_target_properties(${_target} PROPERTIES
+    CXX_VISIBILITY_PRESET hidden
+    VISIBILITY_INLINES_HIDDEN 1
+    DEFINE_SYMBOL JXL_THREADS_INTERNAL_LIBRARY_BUILD
+  )
+
+  # Always install the library as jxl_threads.{a,so} file without the "-static"
+  # suffix, except in Windows.
+  if (NOT WIN32 OR MINGW)
+    set_target_properties(${_target} PROPERTIES OUTPUT_NAME "jxl_threads")
+  endif()
+  install(TARGETS ${_target}
+    RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
+    LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+    ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR})
+endfunction()
+
+### Static library.
+add_library(jxl_threads-static STATIC ${JPEGXL_INTERNAL_THREADS_SOURCES})
+_set_jxl_threads(jxl_threads-static)
+
+# Make jxl_threads symbols neither imported nor exported when using the static
+# library. These will have hidden visibility anyway in the static library case
+# in unix.
+target_compile_definitions(jxl_threads-static
+  PUBLIC -DJXL_THREADS_STATIC_DEFINE)
+
+
+### Public shared library.
+if (BUILD_SHARED_LIBS)
+add_library(jxl_threads SHARED ${JPEGXL_INTERNAL_THREADS_SOURCES})
+_set_jxl_threads(jxl_threads)
+
+set_target_properties(jxl_threads PROPERTIES
+  VERSION ${JPEGXL_LIBRARY_VERSION}
+  SOVERSION ${JPEGXL_LIBRARY_SOVERSION}
+  LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
+  RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}")
+
+  set_target_properties(jxl_threads PROPERTIES
+      LINK_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/jxl/jxl.version)
+  if(APPLE)
+  set_property(TARGET ${target} APPEND_STRING PROPERTY
+      LINK_FLAGS "-Wl,-exported_symbols_list,${CMAKE_CURRENT_SOURCE_DIR}/jxl/jxl_osx.syms")
+  elseif(WIN32)
+    # Nothing needed here, we use __declspec(dllexport) (jxl_threads_export.h)
+  else()
+  set_property(TARGET jxl_threads APPEND_STRING PROPERTY
+      LINK_FLAGS " -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/jxl/jxl.version")
+  endif()  # APPLE
+
+# Compile the shared library such that the JXL_THREADS_EXPORT symbols are
+# exported. Users of the library will not set this flag and therefore import
+# those symbols.
+target_compile_definitions(jxl_threads
+  PRIVATE -DJXL_THREADS_INTERNAL_LIBRARY_BUILD)
+
+# Generate the jxl/jxl_threads_export.h header, we only need to generate it once
+# but we can use it from both libraries.
+generate_export_header(jxl_threads
+  BASE_NAME JXL_THREADS
+  EXPORT_FILE_NAME include/jxl/jxl_threads_export.h)
+else()
+add_library(jxl_threads ALIAS jxl_threads-static)
+# When not building the shared library generate the jxl_threads_export.h header
+# only based on the static target.
+generate_export_header(jxl_threads-static
+  BASE_NAME JXL_THREADS
+  EXPORT_FILE_NAME include/jxl/jxl_threads_export.h)
+endif()  # BUILD_SHARED_LIBS
+
+
+### Add a pkg-config file for libjxl_threads.
+
+# Allow adding prefix if CMAKE_INSTALL_INCLUDEDIR not absolute.
+if(IS_ABSOLUTE "${CMAKE_INSTALL_INCLUDEDIR}")
+    set(PKGCONFIG_TARGET_INCLUDES "${CMAKE_INSTALL_INCLUDEDIR}")
+else()
+    set(PKGCONFIG_TARGET_INCLUDES "\${prefix}/${CMAKE_INSTALL_INCLUDEDIR}")
+endif()
+# Allow adding prefix if CMAKE_INSTALL_LIBDIR not absolute.
+if(IS_ABSOLUTE "${CMAKE_INSTALL_LIBDIR}")
+    set(PKGCONFIG_TARGET_LIBS "${CMAKE_INSTALL_LIBDIR}")
+else()
+    set(PKGCONFIG_TARGET_LIBS "\${exec_prefix}/${CMAKE_INSTALL_LIBDIR}")
+endif()
+
+set(JPEGXL_THREADS_LIBRARY_REQUIRES "")
+configure_file("${CMAKE_CURRENT_SOURCE_DIR}/threads/libjxl_threads.pc.in"
+               "libjxl_threads.pc" @ONLY)
+install(FILES "${CMAKE_CURRENT_BINARY_DIR}/libjxl_threads.pc"
+  DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig")
diff --git a/third_party/jpeg-xl/lib/jxl_vars.bzl b/third_party/jpeg-xl/lib/jxl_vars.bzl
new file mode 100644
index 0000000000..7efa84cc44
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl_vars.bzl
@@ -0,0 +1,46 @@
+# Copyright (c) the JPEG XL Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+# Extra build variables.
+
+libjxl_root_package = "__main__"
+
+libjxl_deps_brotli = ["@brotli//:brotlidec", "@brotli//:brotlienc"]
+libjxl_deps_gif = ["@gif//:gif"]
+libjxl_deps_gtest = ["@googletest//:gtest_main"]
+libjxl_deps_hwy = ["@highway//:hwy"]
+libjxl_deps_hwy_nanobenchmark = ["@highway//:nanobenchmark"]
+libjxl_deps_hwy_test_util = ["@highway//:hwy_test_util"]
+libjxl_deps_jpeg = ["@libjpeg_turbo//:jpeg"]
+libjxl_deps_jxl_box = ["//tools:box"]
+libjxl_deps_exr = ["@openexr//:OpenEXR"]
+libjxl_deps_png = ["@png//:png"]
+libjxl_deps_runfiles = ["@bazel_tools//tools/cpp/runfiles"]
+libjxl_deps_skcms = ["@skcms//:skcms"]
+libjxl_deps_testdata = ["//:testdata"]
+
+libjxl_test_shards = {
+    "jpegli/decode_api_test": 10,
+    "jpegli/encode_api_test": 4,
+    "jpegli/input_suspension_test": 6,
+    "jpegli/output_suspension_test": 2,
+    "jxl/ans_test": 2,
+    "jxl/linalg_test": 2,
+    "jxl/modular_test": 4,
+    "jxl/roundtrip_test": 4,
+    "jxl/xorshift128plus_test": 2,
+    "jxl/ac_strategy_test": 10,  # TODO(eustas): separate heavy shard
+    "jxl/dct_test": 32,
+    "jxl/decode_test": 10,  # TODO(eustas): separate heavy shard
+    "jxl/fast_dct_test": 8,  # TODO(eustas): separate ultra-heavy shard
+    "jxl/fast_math_test": 10,  # TODO(eustas): separate heavy shard
+    "jxl/jxl_test": 10,  # TODO(eustas): separate heavy shard
+    "jxl/render_pipeline/render_pipeline_test": 10,
+}
+
+libjxl_test_timeouts = {
+    "jxl/fast_dct_test": "long",
+    "jxl/dct_test": "long",
+}
diff --git a/third_party/jpeg-xl/lib/threads/libjxl_threads.pc.in b/third_party/jpeg-xl/lib/threads/libjxl_threads.pc.in
new file mode 100644
index 0000000000..50b937a840
--- /dev/null
+++ b/third_party/jpeg-xl/lib/threads/libjxl_threads.pc.in
@@ -0,0 +1,13 @@
+prefix=@CMAKE_INSTALL_PREFIX@
+exec_prefix=${prefix}
+libdir=@PKGCONFIG_TARGET_LIBS@
+includedir=@PKGCONFIG_TARGET_INCLUDES@
+
+Name: libjxl_threads
+Description: JPEG XL multi-thread runner using std::threads.
+Version: @JPEGXL_LIBRARY_VERSION@
+Requires.private: @JPEGXL_THREADS_LIBRARY_REQUIRES@
+Libs: -L${libdir} -ljxl_threads
+Libs.private: -lm
+Cflags: -I${includedir}
+Cflags.private: -DJXL_THREADS_STATIC_DEFINE
diff --git a/third_party/jpeg-xl/lib/threads/resizable_parallel_runner.cc b/third_party/jpeg-xl/lib/threads/resizable_parallel_runner.cc
new file mode 100644
index 0000000000..db27286dea
--- /dev/null
+++ b/third_party/jpeg-xl/lib/threads/resizable_parallel_runner.cc
@@ -0,0 +1,195 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <jxl/resizable_parallel_runner.h>
+
+#include <algorithm>
+#include <atomic>
+#include <condition_variable>
+#include <mutex>
+#include <thread>
+#include <vector>
+
+namespace jpegxl {
+namespace {
+
+// A thread pool that allows changing the number of threads it runs. It also
+// runs tasks on the calling thread, which can work better on schedulers for
+// heterogeneous architectures.
+struct ResizeableParallelRunner {
+  void SetNumThreads(size_t num) {
+    if (num > 0) {
+      num -= 1;
+    }
+    {
+      std::unique_lock<std::mutex> l(state_mutex_);
+      num_desired_workers_ = num;
+      workers_can_proceed_.notify_all();
+    }
+    if (workers_.size() < num) {
+      for (size_t i = workers_.size(); i < num; i++) {
+        workers_.emplace_back([this, i]() { WorkerBody(i); });
+      }
+    }
+    if (workers_.size() > num) {
+      for (size_t i = num; i < workers_.size(); i++) {
+        workers_[i].join();
+      }
+      workers_.resize(num);
+    }
+  }
+
+  ~ResizeableParallelRunner() { SetNumThreads(0); }
+
+  JxlParallelRetCode Run(void* jxl_opaque, JxlParallelRunInit init,
+                         JxlParallelRunFunction func, uint32_t start,
+                         uint32_t end) {
+    if (start + 1 == end) {
+      JxlParallelRetCode ret = init(jxl_opaque, 1);
+      if (ret != 0) return ret;
+
+      func(jxl_opaque, start, 0);
+      return ret;
+    }
+
+    size_t num_workers = std::min<size_t>(workers_.size() + 1, end - start);
+    JxlParallelRetCode ret = init(jxl_opaque, num_workers);
+    if (ret != 0) {
+      return ret;
+    }
+
+    {
+      std::unique_lock<std::mutex> l(state_mutex_);
+      // Avoid waking up more workers than needed.
+      max_running_workers_ = end - start - 1;
+      next_task_ = start;
+      end_task_ = end;
+      func_ = func;
+      jxl_opaque_ = jxl_opaque;
+      work_available_ = true;
+      num_running_workers_++;
+      workers_can_proceed_.notify_all();
+    }
+
+    DequeueTasks(0);
+
+    while (true) {
+      std::unique_lock<std::mutex> l(state_mutex_);
+      if (num_running_workers_ == 0) break;
+      work_done_.wait(l);
+    }
+
+    return ret;
+  }
+
+ private:
+  void WorkerBody(size_t worker_id) {
+    while (true) {
+      {
+        std::unique_lock<std::mutex> l(state_mutex_);
+        // Worker pool was reduced, resize down.
+        if (worker_id >= num_desired_workers_) {
+          return;
+        }
+        // Nothing to do this time.
+        if (!work_available_ || worker_id >= max_running_workers_) {
+          workers_can_proceed_.wait(l);
+          continue;
+        }
+        num_running_workers_++;
+      }
+      DequeueTasks(worker_id + 1);
+    }
+  }
+
+  void DequeueTasks(size_t thread_id) {
+    while (true) {
+      uint32_t task = next_task_++;
+      if (task >= end_task_) {
+        std::unique_lock<std::mutex> l(state_mutex_);
+        num_running_workers_--;
+        work_available_ = false;
+        if (num_running_workers_ == 0) {
+          work_done_.notify_all();
+        }
+        break;
+      }
+      func_(jxl_opaque_, task, thread_id);
+    }
+  }
+
+  // Checks when the worker has something to do, which can be one of:
+  // - quitting (when worker_id >= num_desired_workers_)
+  // - having work available for them (work_available_ is true and worker_id >=
+  // max_running_workers_)
+  std::condition_variable workers_can_proceed_;
+
+  // Workers are done, and the main thread can proceed (num_running_workers_ ==
+  // 0)
+  std::condition_variable work_done_;
+
+  std::vector<std::thread> workers_;
+
+  // Protects all the remaining variables, except for func_, jxl_opaque_ and
+  // end_task_ (for which only the write by the main thread is protected, and
+  // subsequent uses by workers happen-after it) and next_task_ (which is
+  // atomic).
+  std::mutex state_mutex_;
+
+  // Range of tasks still need to be done.
+  std::atomic<uint32_t> next_task_;
+  uint32_t end_task_;
+
+  // Function to run and its argument.
+  JxlParallelRunFunction func_;
+  void* jxl_opaque_;  // not owned
+
+  // Variables that control the workers:
+  // - work_available_ is set to true after a call to Run() and to false at the
+  // end of it.
+  // - num_desired_workers_ represents the number of workers that should be
+  // present.
+  // - max_running_workers_ represents the number of workers that should be
+  // executing tasks.
+  // - num_running_workers_ represents the number of workers that are executing
+  // tasks.
+  size_t num_desired_workers_ = 0;
+  size_t max_running_workers_ = 0;
+  size_t num_running_workers_ = 0;
+  bool work_available_ = false;
+};
+}  // namespace
+}  // namespace jpegxl
+
+extern "C" {
+JXL_THREADS_EXPORT JxlParallelRetCode JxlResizableParallelRunner(
+    void* runner_opaque, void* jpegxl_opaque, JxlParallelRunInit init,
+    JxlParallelRunFunction func, uint32_t start_range, uint32_t end_range) {
+  return static_cast<jpegxl::ResizeableParallelRunner*>(runner_opaque)
+      ->Run(jpegxl_opaque, init, func, start_range, end_range);
+}
+
+JXL_THREADS_EXPORT void* JxlResizableParallelRunnerCreate(
+    const JxlMemoryManager* memory_manager) {
+  return new jpegxl::ResizeableParallelRunner();
+}
+
+JXL_THREADS_EXPORT void JxlResizableParallelRunnerSetThreads(
+    void* runner_opaque, size_t num_threads) {
+  static_cast<jpegxl::ResizeableParallelRunner*>(runner_opaque)
+      ->SetNumThreads(num_threads);
+}
+
+JXL_THREADS_EXPORT void JxlResizableParallelRunnerDestroy(void* runner_opaque) {
+  delete static_cast<jpegxl::ResizeableParallelRunner*>(runner_opaque);
+}
+
+JXL_THREADS_EXPORT uint32_t
+JxlResizableParallelRunnerSuggestThreads(uint64_t xsize, uint64_t ysize) {
+  // ~one thread per group.
+  return std::min<uint64_t>(std::thread::hardware_concurrency(),
+                            xsize * ysize / (256 * 256));
+}
+}
diff --git a/third_party/jpeg-xl/lib/threads/thread_parallel_runner.cc b/third_party/jpeg-xl/lib/threads/thread_parallel_runner.cc
new file mode 100644
index 0000000000..47b81bdb16
--- /dev/null
+++ b/third_party/jpeg-xl/lib/threads/thread_parallel_runner.cc
@@ -0,0 +1,101 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <jxl/thread_parallel_runner.h>
+#include <string.h>
+
+#include "lib/threads/thread_parallel_runner_internal.h"
+
+namespace {
+
+// Default JxlMemoryManager using malloc and free for the jpegxl_threads
+// library. Same as the default JxlMemoryManager for the jpegxl library
+// itself.
+
+// Default alloc and free functions.
+void* ThreadMemoryManagerDefaultAlloc(void* opaque, size_t size) {
+  return malloc(size);
+}
+
+void ThreadMemoryManagerDefaultFree(void* opaque, void* address) {
+  free(address);
+}
+
+// Initializes the memory manager instance with the passed one. The
+// MemoryManager passed in |memory_manager| may be NULL or contain NULL
+// functions which will be initialized with the default ones. If either alloc
+// or free are NULL, then both must be NULL, otherwise this function returns an
+// error.
+bool ThreadMemoryManagerInit(JxlMemoryManager* self,
+                             const JxlMemoryManager* memory_manager) {
+  if (memory_manager) {
+    *self = *memory_manager;
+  } else {
+    memset(self, 0, sizeof(*self));
+  }
+  if (!self->alloc != !self->free) {
+    return false;
+  }
+  if (!self->alloc) self->alloc = ThreadMemoryManagerDefaultAlloc;
+  if (!self->free) self->free = ThreadMemoryManagerDefaultFree;
+
+  return true;
+}
+
+void* ThreadMemoryManagerAlloc(const JxlMemoryManager* memory_manager,
+                               size_t size) {
+  return memory_manager->alloc(memory_manager->opaque, size);
+}
+
+void ThreadMemoryManagerFree(const JxlMemoryManager* memory_manager,
+                             void* address) {
+  return memory_manager->free(memory_manager->opaque, address);
+}
+
+}  // namespace
+
+JxlParallelRetCode JxlThreadParallelRunner(
+    void* runner_opaque, void* jpegxl_opaque, JxlParallelRunInit init,
+    JxlParallelRunFunction func, uint32_t start_range, uint32_t end_range) {
+  return jpegxl::ThreadParallelRunner::Runner(
+      runner_opaque, jpegxl_opaque, init, func, start_range, end_range);
+}
+
+/// Starts the given number of worker threads and blocks until they are ready.
+/// "num_worker_threads" defaults to one per hyperthread. If zero, all tasks
+/// run on the main thread.
+void* JxlThreadParallelRunnerCreate(const JxlMemoryManager* memory_manager,
+                                    size_t num_worker_threads) {
+  JxlMemoryManager local_memory_manager;
+  if (!ThreadMemoryManagerInit(&local_memory_manager, memory_manager))
+    return nullptr;
+
+  void* alloc = ThreadMemoryManagerAlloc(&local_memory_manager,
+                                         sizeof(jpegxl::ThreadParallelRunner));
+  if (!alloc) return nullptr;
+  // Placement new constructor on allocated memory
+  jpegxl::ThreadParallelRunner* runner =
+      new (alloc) jpegxl::ThreadParallelRunner(num_worker_threads);
+  runner->memory_manager = local_memory_manager;
+
+  return runner;
+}
+
+void JxlThreadParallelRunnerDestroy(void* runner_opaque) {
+  jpegxl::ThreadParallelRunner* runner =
+      reinterpret_cast<jpegxl::ThreadParallelRunner*>(runner_opaque);
+  if (runner) {
+    JxlMemoryManager local_memory_manager = runner->memory_manager;
+    // Call destructor directly since custom free function is used.
+    runner->~ThreadParallelRunner();
+    ThreadMemoryManagerFree(&local_memory_manager, runner);
+  }
+}
+
+// Get default value for num_worker_threads parameter of
+// InitJxlThreadParallelRunner.
+size_t JxlThreadParallelRunnerDefaultNumWorkerThreads() {
+  return std::thread::hardware_concurrency();
+}
diff --git a/third_party/jpeg-xl/lib/threads/thread_parallel_runner_internal.cc b/third_party/jpeg-xl/lib/threads/thread_parallel_runner_internal.cc
new file mode 100644
index 0000000000..f26a9ba263
--- /dev/null
+++ b/third_party/jpeg-xl/lib/threads/thread_parallel_runner_internal.cc
@@ -0,0 +1,215 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/threads/thread_parallel_runner_internal.h"
+
+#include <algorithm>
+
+#if defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER) || \
+    defined(THREAD_SANITIZER)
+#include "sanitizer/common_interface_defs.h"  // __sanitizer_print_stack_trace
+#endif                                        // defined(*_SANITIZER)
+
+#include <jxl/thread_parallel_runner.h>
+
+#include "lib/jxl/base/profiler.h"
+
+namespace {
+
+// Important: JXL_ASSERT does not guarantee running the `condition` code,
+// use only for debug mode checks.
+
+#if JXL_ENABLE_ASSERT
+// Exits the program after printing a stack trace when possible.
+bool Abort() {
+#if defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER) || \
+    defined(THREAD_SANITIZER)
+  // If compiled with any sanitizer print a stack trace. This call doesn't crash
+  // the program, instead the trap below will crash it also allowing gdb to
+  // break there.
+  __sanitizer_print_stack_trace();
+#endif  // defined(*_SANITIZER)
+
+#ifdef _MSC_VER
+  __debugbreak();
+  abort();
+#else
+  __builtin_trap();
+#endif
+}
+#define JXL_ASSERT(condition) \
+  do {                        \
+    if (!(condition)) {       \
+      Abort();                \
+    }                         \
+  } while (0)
+#else
+#define JXL_ASSERT(condition) \
+  do {                        \
+  } while (0)
+#endif
+}  // namespace
+
+namespace jpegxl {
+
+// static
+JxlParallelRetCode ThreadParallelRunner::Runner(
+    void* runner_opaque, void* jpegxl_opaque, JxlParallelRunInit init,
+    JxlParallelRunFunction func, uint32_t start_range, uint32_t end_range) {
+  ThreadParallelRunner* self =
+      static_cast<ThreadParallelRunner*>(runner_opaque);
+  if (start_range > end_range) return -1;
+  if (start_range == end_range) return 0;
+
+  int ret = init(jpegxl_opaque, std::max<size_t>(self->num_worker_threads_, 1));
+  if (ret != 0) return ret;
+
+  // Use a sequential run when num_worker_threads_ is zero since we have no
+  // worker threads.
+  if (self->num_worker_threads_ == 0) {
+    const size_t thread = 0;
+    for (uint32_t task = start_range; task < end_range; ++task) {
+      func(jpegxl_opaque, task, thread);
+    }
+    return 0;
+  }
+
+  if (self->depth_.fetch_add(1, std::memory_order_acq_rel) != 0) {
+    return -1;  // Must not re-enter.
+  }
+
+  const WorkerCommand worker_command =
+      (static_cast<WorkerCommand>(start_range) << 32) + end_range;
+  // Ensure the inputs do not result in a reserved command.
+  JXL_ASSERT(worker_command != kWorkerWait);
+  JXL_ASSERT(worker_command != kWorkerOnce);
+  JXL_ASSERT(worker_command != kWorkerExit);
+
+  self->data_func_ = func;
+  self->jpegxl_opaque_ = jpegxl_opaque;
+  self->num_reserved_.store(0, std::memory_order_relaxed);
+
+  self->StartWorkers(worker_command);
+  self->WorkersReadyBarrier();
+
+  if (self->depth_.fetch_add(-1, std::memory_order_acq_rel) != 1) {
+    return -1;
+  }
+  return 0;
+}
+
+// static
+void ThreadParallelRunner::RunRange(ThreadParallelRunner* self,
+                                    const WorkerCommand command,
+                                    const int thread) {
+  const uint32_t begin = command >> 32;
+  const uint32_t end = command & 0xFFFFFFFF;
+  const uint32_t num_tasks = end - begin;
+  const uint32_t num_worker_threads = self->num_worker_threads_;
+
+  // OpenMP introduced several "schedule" strategies:
+  // "single" (static assignment of exactly one chunk per thread): slower.
+  // "dynamic" (allocates k tasks at a time): competitive for well-chosen k.
+  // "guided" (allocates k tasks, decreases k): computing k = remaining/n
+  //   is faster than halving k each iteration. We prefer this strategy
+  //   because it avoids user-specified parameters.
+
+  for (;;) {
+#if 0
+      // dynamic
+      const uint32_t my_size = std::max(num_tasks / (num_worker_threads * 4), 1);
+#else
+    // guided
+    const uint32_t num_reserved =
+        self->num_reserved_.load(std::memory_order_relaxed);
+    // It is possible that more tasks are reserved than ready to run.
+    const uint32_t num_remaining =
+        num_tasks - std::min(num_reserved, num_tasks);
+    const uint32_t my_size =
+        std::max(num_remaining / (num_worker_threads * 4), 1u);
+#endif
+    const uint32_t my_begin = begin + self->num_reserved_.fetch_add(
+                                          my_size, std::memory_order_relaxed);
+    const uint32_t my_end = std::min(my_begin + my_size, begin + num_tasks);
+    // Another thread already reserved the last task.
+    if (my_begin >= my_end) {
+      break;
+    }
+    for (uint32_t task = my_begin; task < my_end; ++task) {
+      self->data_func_(self->jpegxl_opaque_, task, thread);
+    }
+  }
+}
+
+// static
+void ThreadParallelRunner::ThreadFunc(ThreadParallelRunner* self,
+                                      const int thread) {
+  // Until kWorkerExit command received:
+  for (;;) {
+    std::unique_lock<std::mutex> lock(self->mutex_);
+    // Notify main thread that this thread is ready.
+    if (++self->workers_ready_ == self->num_threads_) {
+      self->workers_ready_cv_.notify_one();
+    }
+  RESUME_WAIT:
+    // Wait for a command.
+    self->worker_start_cv_.wait(lock);
+    const WorkerCommand command = self->worker_start_command_;
+    switch (command) {
+      case kWorkerWait:    // spurious wakeup:
+        goto RESUME_WAIT;  // lock still held, avoid incrementing ready.
+      case kWorkerOnce:
+        lock.unlock();
+        self->data_func_(self->jpegxl_opaque_, thread, thread);
+        break;
+      case kWorkerExit:
+        return;  // exits thread
+      default:
+        lock.unlock();
+        RunRange(self, command, thread);
+        break;
+    }
+  }
+}
+
+ThreadParallelRunner::ThreadParallelRunner(const int num_worker_threads)
+    : num_worker_threads_(num_worker_threads),
+      num_threads_(std::max(num_worker_threads, 1)) {
+  PROFILER_ZONE("ThreadParallelRunner ctor");
+
+  threads_.reserve(num_worker_threads_);
+
+  // Suppress "unused-private-field" warning.
+  (void)padding1;
+  (void)padding2;
+
+  // Safely handle spurious worker wakeups.
+  worker_start_command_ = kWorkerWait;
+
+  for (uint32_t i = 0; i < num_worker_threads_; ++i) {
+    threads_.emplace_back(ThreadFunc, this, i);
+  }
+
+  if (num_worker_threads_ != 0) {
+    WorkersReadyBarrier();
+  }
+
+  // Warm up profiler on worker threads so its expensive initialization
+  // doesn't count towards other timer measurements.
+  RunOnEachThread(
+      [](const int task, const int thread) { PROFILER_ZONE("@InitWorkers"); });
+}
+
+ThreadParallelRunner::~ThreadParallelRunner() {
+  if (num_worker_threads_ != 0) {
+    StartWorkers(kWorkerExit);
+  }
+
+  for (std::thread& thread : threads_) {
+    JXL_ASSERT(thread.joinable());
+    thread.join();
+  }
+}
+}  // namespace jpegxl
diff --git a/third_party/jpeg-xl/lib/threads/thread_parallel_runner_internal.h b/third_party/jpeg-xl/lib/threads/thread_parallel_runner_internal.h
new file mode 100644
index 0000000000..199a5f2a8b
--- /dev/null
+++ b/third_party/jpeg-xl/lib/threads/thread_parallel_runner_internal.h
@@ -0,0 +1,166 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+//
+
+// C++ implementation using std::thread of a ::JxlParallelRunner.
+
+// The main class in this module, ThreadParallelRunner, implements a static
+// method ThreadParallelRunner::Runner than can be passed as a
+// JxlParallelRunner when using the JPEG XL library. This uses std::thread
+// internally and related synchronization functions. The number of threads
+// created is fixed at construction time and the threads are re-used for every
+// ThreadParallelRunner::Runner call. Only one concurrent Runner() call per
+// instance is allowed at a time.
+//
+// This is a scalable, lower-overhead thread pool runner, especially suitable
+// for data-parallel computations in the fork-join model, where clients need to
+// know when all tasks have completed.
+//
+// This thread pool can efficiently load-balance millions of tasks using an
+// atomic counter, thus avoiding per-task virtual or system calls. With 48
+// hyperthreads and 1M tasks that add to an atomic counter, overall runtime is
+// 10-20x higher when using std::async, and ~200x for a queue-based thread
+// pool.
+//
+// Usage:
+//   ThreadParallelRunner runner;
+//   JxlDecode(
+//       ... , &ThreadParallelRunner::Runner, static_cast<void*>(&runner));
+
+#ifndef LIB_THREADS_THREAD_PARALLEL_RUNNER_INTERNAL_H_
+#define LIB_THREADS_THREAD_PARALLEL_RUNNER_INTERNAL_H_
+
+#include <jxl/memory_manager.h>
+#include <jxl/parallel_runner.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+#include <atomic>
+#include <condition_variable>  //NOLINT
+#include <mutex>               //NOLINT
+#include <thread>              //NOLINT
+#include <vector>
+
+namespace jpegxl {
+
+// Main helper class implementing the ::JxlParallelRunner interface.
+class ThreadParallelRunner {
+ public:
+  // ::JxlParallelRunner interface.
+  static JxlParallelRetCode Runner(void* runner_opaque, void* jpegxl_opaque,
+                                   JxlParallelRunInit init,
+                                   JxlParallelRunFunction func,
+                                   uint32_t start_range, uint32_t end_range);
+
+  // Starts the given number of worker threads and blocks until they are ready.
+  // "num_worker_threads" defaults to one per hyperthread. If zero, all tasks
+  // run on the main thread.
+  explicit ThreadParallelRunner(
+      int num_worker_threads = std::thread::hardware_concurrency());
+
+  // Waits for all threads to exit.
+  ~ThreadParallelRunner();
+
+  // Returns maximum number of main/worker threads that may call Func. Useful
+  // for allocating per-thread storage.
+  size_t NumThreads() const { return num_threads_; }
+
+  // Runs func(thread, thread) on all thread(s) that may participate in Run.
+  // If NumThreads() == 0, runs on the main thread with thread == 0, otherwise
+  // concurrently called by each worker thread in [0, NumThreads()).
+  template <class Func>
+  void RunOnEachThread(const Func& func) {
+    if (num_worker_threads_ == 0) {
+      const int thread = 0;
+      func(thread, thread);
+      return;
+    }
+
+    data_func_ = reinterpret_cast<JxlParallelRunFunction>(&CallClosure<Func>);
+    jpegxl_opaque_ = const_cast<void*>(static_cast<const void*>(&func));
+    StartWorkers(kWorkerOnce);
+    WorkersReadyBarrier();
+  }
+
+  JxlMemoryManager memory_manager;
+
+ private:
+  // After construction and between calls to Run, workers are "ready", i.e.
+  // waiting on worker_start_cv_. They are "started" by sending a "command"
+  // and notifying all worker_start_cv_ waiters. (That is why all workers
+  // must be ready/waiting - otherwise, the notification will not reach all of
+  // them and the main thread waits in vain for them to report readiness.)
+  using WorkerCommand = uint64_t;
+
+  // Special values; all others encode the begin/end parameters. Note that all
+  // these are no-op ranges (begin >= end) and therefore never used to encode
+  // ranges.
+  static constexpr WorkerCommand kWorkerWait = ~1ULL;
+  static constexpr WorkerCommand kWorkerOnce = ~2ULL;
+  static constexpr WorkerCommand kWorkerExit = ~3ULL;
+
+  // Calls f(task, thread). Used for type erasure of Func arguments. The
+  // signature must match JxlParallelRunFunction, hence a void* argument.
+  template <class Closure>
+  static void CallClosure(void* f, const uint32_t task, const size_t thread) {
+    (*reinterpret_cast<const Closure*>(f))(task, thread);
+  }
+
+  void WorkersReadyBarrier() {
+    std::unique_lock<std::mutex> lock(mutex_);
+    // Typically only a single iteration.
+    while (workers_ready_ != threads_.size()) {
+      workers_ready_cv_.wait(lock);
+    }
+    workers_ready_ = 0;
+
+    // Safely handle spurious worker wakeups.
+    worker_start_command_ = kWorkerWait;
+  }
+
+  // Precondition: all workers are ready.
+  void StartWorkers(const WorkerCommand worker_command) {
+    mutex_.lock();
+    worker_start_command_ = worker_command;
+    // Workers will need this lock, so release it before they wake up.
+    mutex_.unlock();
+    worker_start_cv_.notify_all();
+  }
+
+  // Attempts to reserve and perform some work from the global range of tasks,
+  // which is encoded within "command". Returns after all tasks are reserved.
+  static void RunRange(ThreadParallelRunner* self, const WorkerCommand command,
+                       const int thread);
+
+  static void ThreadFunc(ThreadParallelRunner* self, int thread);
+
+  // Unmodified after ctor, but cannot be const because we call thread::join().
+  std::vector<std::thread> threads_;
+
+  const uint32_t num_worker_threads_;  // == threads_.size()
+  const uint32_t num_threads_;
+
+  std::atomic<int> depth_{0};  // detects if Run is re-entered (not supported).
+
+  std::mutex mutex_;  // guards both cv and their variables.
+  std::condition_variable workers_ready_cv_;
+  uint32_t workers_ready_ = 0;
+  std::condition_variable worker_start_cv_;
+  WorkerCommand worker_start_command_;
+
+  // Written by main thread, read by workers (after mutex lock/unlock).
+  JxlParallelRunFunction data_func_;
+  void* jpegxl_opaque_;
+
+  // Updated by workers; padding avoids false sharing.
+  uint8_t padding1[64];
+  std::atomic<uint32_t> num_reserved_{0};
+  uint8_t padding2[64];
+};
+
+}  // namespace jpegxl
+
+#endif  // LIB_THREADS_THREAD_PARALLEL_RUNNER_INTERNAL_H_
diff --git a/third_party/jpeg-xl/lib/threads/thread_parallel_runner_test.cc b/third_party/jpeg-xl/lib/threads/thread_parallel_runner_test.cc
new file mode 100644
index 0000000000..a757c3018b
--- /dev/null
+++ b/third_party/jpeg-xl/lib/threads/thread_parallel_runner_test.cc
@@ -0,0 +1,123 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <atomic>
+
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/test_utils.h"
+#include "lib/jxl/testing.h"
+
+using jxl::test::ThreadPoolForTests;
+
+namespace jpegxl {
+namespace {
+
+int PopulationCount(uint64_t bits) {
+  int num_set = 0;
+  while (bits != 0) {
+    num_set += bits & 1;
+    bits >>= 1;
+  }
+  return num_set;
+}
+
+// Ensures task parameter is in bounds, every parameter is reached,
+// pool can be reused (multiple consecutive Run calls), pool can be destroyed
+// (joining with its threads), num_threads=0 works (runs on current thread).
+TEST(ThreadParallelRunnerTest, TestPool) {
+  for (int num_threads = 0; num_threads <= 18; ++num_threads) {
+    ThreadPoolForTests pool(num_threads);
+    for (int num_tasks = 0; num_tasks < 32; ++num_tasks) {
+      std::vector<int> mementos(num_tasks);
+      for (int begin = 0; begin < 32; ++begin) {
+        std::fill(mementos.begin(), mementos.end(), 0);
+        EXPECT_TRUE(RunOnPool(
+            &pool, begin, begin + num_tasks, jxl::ThreadPool::NoInit,
+            [begin, num_tasks, &mementos](const int task, const int thread) {
+              // Parameter is in the given range
+              EXPECT_GE(task, begin);
+              EXPECT_LT(task, begin + num_tasks);
+
+              // Store mementos to be sure we visited each task.
+              mementos.at(task - begin) = 1000 + task;
+            },
+            "TestPool"));
+        for (int task = begin; task < begin + num_tasks; ++task) {
+          EXPECT_EQ(1000 + task, mementos.at(task - begin));
+        }
+      }
+    }
+  }
+}
+
+// Verify "thread" parameter when processing few tasks.
+TEST(ThreadParallelRunnerTest, TestSmallAssignments) {
+  // WARNING: cumulative total threads must not exceed profiler.h kMaxThreads.
+  const int kMaxThreads = 8;
+  for (int num_threads = 1; num_threads <= kMaxThreads; ++num_threads) {
+    ThreadPoolForTests pool(num_threads);
+
+    // (Avoid mutex because it may perturb the worker thread scheduling)
+    std::atomic<uint64_t> id_bits{0};
+    std::atomic<int> num_calls{0};
+
+    EXPECT_TRUE(RunOnPool(
+        &pool, 0, num_threads, jxl::ThreadPool::NoInit,
+        [&num_calls, num_threads, &id_bits](const int task, const int thread) {
+          num_calls.fetch_add(1, std::memory_order_relaxed);
+
+          EXPECT_LT(thread, num_threads);
+          uint64_t bits = id_bits.load(std::memory_order_relaxed);
+          while (
+              !id_bits.compare_exchange_weak(bits, bits | (1ULL << thread))) {
+          }
+        },
+        "TestSmallAssignments"));
+
+    // Correct number of tasks.
+    EXPECT_EQ(num_threads, num_calls.load());
+
+    const int num_participants = PopulationCount(id_bits.load());
+    // Can't expect equality because other workers may have woken up too late.
+    EXPECT_LE(num_participants, num_threads);
+  }
+}
+
+struct Counter {
+  Counter() {
+    // Suppress "unused-field" warning.
+    (void)padding;
+  }
+  void Assimilate(const Counter& victim) { counter += victim.counter; }
+  int counter = 0;
+  int padding[31];
+};
+
+TEST(ThreadParallelRunnerTest, TestCounter) {
+  const int kNumThreads = 12;
+  ThreadPoolForTests pool(kNumThreads);
+  alignas(128) Counter counters[kNumThreads];
+
+  const int kNumTasks = kNumThreads * 19;
+  EXPECT_TRUE(RunOnPool(
+      &pool, 0, kNumTasks, jxl::ThreadPool::NoInit,
+      [&counters](const int task, const int thread) {
+        counters[thread].counter += task;
+      },
+      "TestCounter"));
+
+  int expected = 0;
+  for (int i = 0; i < kNumTasks; ++i) {
+    expected += i;
+  }
+
+  for (int i = 1; i < kNumThreads; ++i) {
+    counters[0].Assimilate(counters[i]);
+  }
+  EXPECT_EQ(expected, counters[0].counter);
+}
+
+}  // namespace
+}  // namespace jpegxl
diff --git a/third_party/jpeg-xl/plugins/CMakeLists.txt b/third_party/jpeg-xl/plugins/CMakeLists.txt
new file mode 100644
index 0000000000..bff1bff29d
--- /dev/null
+++ b/third_party/jpeg-xl/plugins/CMakeLists.txt
@@ -0,0 +1,21 @@
+# Copyright (c) the JPEG XL Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+if(NOT MSVC)
+  option(JPEGXL_ENABLE_PLUGIN_GDKPIXBUF "Enable plugin for GdkPixbuf image loading library" ON)
+  if(JPEGXL_ENABLE_PLUGIN_GDKPIXBUF)
+    add_subdirectory(gdk-pixbuf)
+  endif()
+endif()
+
+option(JPEGXL_ENABLE_PLUGIN_GIMP210 "Enable plugin for GIMP 2.10.x series" ON)
+if(JPEGXL_ENABLE_PLUGIN_GIMP210)
+  add_subdirectory(gimp)
+endif()
+
+option(JPEGXL_ENABLE_PLUGIN_MIME "Enable image/jxl declaration for shared-mime-info" ON)
+if(JPEGXL_ENABLE_PLUGIN_MIME)
+  add_subdirectory(mime)
+endif()
diff --git a/third_party/jpeg-xl/plugins/gdk-pixbuf/CMakeLists.txt b/third_party/jpeg-xl/plugins/gdk-pixbuf/CMakeLists.txt
new file mode 100644
index 0000000000..7b53b98c66
--- /dev/null
+++ b/third_party/jpeg-xl/plugins/gdk-pixbuf/CMakeLists.txt
@@ -0,0 +1,83 @@
+# Copyright (c) the JPEG XL Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+find_package(PkgConfig)
+pkg_check_modules(Gdk-Pixbuf IMPORTED_TARGET gdk-pixbuf-2.0>=2.36)
+
+include(GNUInstallDirs)
+
+if (NOT Gdk-Pixbuf_FOUND)
+  message(WARNING "GDK Pixbuf development libraries not found, \
+                   the Gdk-Pixbuf plugin will not be built")
+  return ()
+endif ()
+
+add_library(pixbufloader-jxl MODULE pixbufloader-jxl.c)
+
+# Mark all symbols as hidden by default. The PkgConfig::Gdk-Pixbuf dependency
+# will cause fill_info and fill_vtable entry points to be made public.
+set_target_properties(pixbufloader-jxl PROPERTIES
+  CXX_VISIBILITY_PRESET hidden
+  VISIBILITY_INLINES_HIDDEN 1
+)
+
+# Note: This only needs the decoder library, but we don't install the decoder
+# shared library.
+target_link_libraries(pixbufloader-jxl jxl jxl_threads lcms2 PkgConfig::Gdk-Pixbuf)
+
+execute_process(COMMAND ${PKG_CONFIG_EXECUTABLE} gdk-pixbuf-2.0 --variable gdk_pixbuf_moduledir --define-variable=prefix=${CMAKE_INSTALL_PREFIX} OUTPUT_VARIABLE GDK_PIXBUF_MODULEDIR OUTPUT_STRIP_TRAILING_WHITESPACE)
+install(TARGETS pixbufloader-jxl DESTINATION "${GDK_PIXBUF_MODULEDIR}")
+
+# Instead of the following, we might instead add the
+# mime type image/jxl to
+# /usr/share/thumbnailers/gdk-pixbuf-thumbnailer.thumbnailer
+install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/jxl.thumbnailer DESTINATION "${CMAKE_INSTALL_DATADIR}/thumbnailers/")
+
+if(BUILD_TESTING AND NOT CMAKE_CROSSCOMPILING)
+  pkg_check_modules(Gdk IMPORTED_TARGET gdk-2.0)
+  if (Gdk_FOUND)
+    # Test for loading a .jxl file using the pixbufloader library via GDK. This
+    # requires to have the image/jxl mime type and loader library configured,
+    # which we do in a fake environment in the CMAKE_CURRENT_BINARY_DIR.
+    add_executable(pixbufloader_test pixbufloader_test.cc)
+    target_link_libraries(pixbufloader_test PkgConfig::Gdk)
+
+    # Create a mime cache for test.
+    add_custom_command(
+      OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/mime/mime.cache"
+      COMMAND env XDG_DATA_HOME=${CMAKE_CURRENT_BINARY_DIR}
+        xdg-mime install --novendor
+        "${CMAKE_SOURCE_DIR}/plugins/mime/image-jxl.xml"
+      DEPENDS "${CMAKE_SOURCE_DIR}/plugins/mime/image-jxl.xml"
+    )
+    add_custom_target(pixbufloader_test_mime
+      DEPENDS "${CMAKE_CURRENT_BINARY_DIR}/mime/mime.cache"
+    )
+    add_dependencies(pixbufloader_test pixbufloader_test_mime)
+
+    # Use a fake X server to run the test if xvfb is installed.
+    find_program (XVFB_PROGRAM xvfb-run)
+    if(XVFB_PROGRAM)
+      set(XVFB_PROGRAM_PREFIX "${XVFB_PROGRAM};-a")
+    else()
+      set(XVFB_PROGRAM_PREFIX "")
+    endif()
+
+    # libX11.so and libgdk-x11-2.0.so are not compiled with MSAN -> report
+    # use-of-uninitialized-value for string some internal string value.
+    # TODO(eustas): investigate direct memory leak (32 bytes).
+    if (NOT (SANITIZER STREQUAL "msan") AND NOT (SANITIZER STREQUAL "asan"))
+      add_test(
+        NAME pixbufloader_test_jxl
+        COMMAND
+          ${XVFB_PROGRAM_PREFIX} $<TARGET_FILE:pixbufloader_test>
+          "${CMAKE_CURRENT_SOURCE_DIR}/loaders_test.cache"
+          "${CMAKE_SOURCE_DIR}/testdata/jxl/blending/cropped_traffic_light.jxl"
+        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+      )
+      set_tests_properties(pixbufloader_test_jxl PROPERTIES SKIP_RETURN_CODE 254)
+    endif()
+  endif()  # Gdk_FOUND
+endif()  # BUILD_TESTING
diff --git a/third_party/jpeg-xl/plugins/gdk-pixbuf/README.md b/third_party/jpeg-xl/plugins/gdk-pixbuf/README.md
new file mode 100644
index 0000000000..185919436f
--- /dev/null
+++ b/third_party/jpeg-xl/plugins/gdk-pixbuf/README.md
@@ -0,0 +1,50 @@
+## JPEG XL GDK Pixbuf
+
+
+The plugin may already have been installed when following the instructions from the
+[Installing section of BUILDING.md](../../BUILDING.md#installing), in which case it should
+already be in the correct place, e.g.
+
+```/usr/lib/x86_64-linux-gnu/gdk-pixbuf-2.0/2.10.0/loaders/libpixbufloader-jxl.so```
+
+Otherwise we can copy it manually:
+
+```bash
+sudo cp $your_build_directory/plugins/gdk-pixbuf/libpixbufloader-jxl.so /usr/lib/x86_64-linux-gnu/gdk-pixbuf-2.0/2.10.0/loaders/libpixbufloader-jxl.so
+```
+
+
+Then we need to update the cache, for example with:
+
+```bash
+sudo /usr/lib/x86_64-linux-gnu/gdk-pixbuf-2.0/gdk-pixbuf-query-loaders --update-cache
+```
+
+In order to get thumbnails with this, first one has to add the jxl MIME type, see
+[../mime/README.md](../mime/README.md).
+
+Ensure that the thumbnailer file is installed in the correct place,
+`/usr/share/thumbnailers/jxl.thumbnailer` or `/usr/local/share/thumbnailers/jxl.thumbnailer`.
+
+The file should have been copied automatically when following the instructions
+in the [Installing section of README.md](../../README.md#installing), but
+otherwise it can be copied manually:
+
+```bash
+sudo cp plugins/gdk-pixbuf/jxl.thumbnailer /usr/local/share/thumbnailers/jxl.thumbnailer
+```
+
+Update the Mime database with
+```bash
+update-mime --local
+```
+or
+```bash
+sudo update-desktop-database
+```
+
+Then possibly delete the thumbnail cache with
+```bash
+rm -r ~/.cache/thumbnails
+```
+and restart the application displaying thumbnails, e.g. `nautilus -q` to display thumbnails.
diff --git a/third_party/jpeg-xl/plugins/gdk-pixbuf/jxl.thumbnailer b/third_party/jpeg-xl/plugins/gdk-pixbuf/jxl.thumbnailer
new file mode 100644
index 0000000000..1bcaab61fc
--- /dev/null
+++ b/third_party/jpeg-xl/plugins/gdk-pixbuf/jxl.thumbnailer
@@ -0,0 +1,4 @@
+[Thumbnailer Entry]
+TryExec=/usr/bin/gdk-pixbuf-thumbnailer
+Exec=/usr/bin/gdk-pixbuf-thumbnailer -s %s %u %o
+MimeType=image/jxl;
diff --git a/third_party/jpeg-xl/plugins/gdk-pixbuf/loaders_test.cache b/third_party/jpeg-xl/plugins/gdk-pixbuf/loaders_test.cache
new file mode 100644
index 0000000000..95c62c8fc3
--- /dev/null
+++ b/third_party/jpeg-xl/plugins/gdk-pixbuf/loaders_test.cache
@@ -0,0 +1,16 @@
+# GdkPixbuf Image Loader Modules file for testing
+# Automatically generated file, do not edit
+# Created by gdk-pixbuf-query-loaders from gdk-pixbuf-2.42.2
+#
+# Generated with:
+#  GDK_PIXBUF_MODULEDIR=`pwd`/build/plugins/gdk-pixbuf/ gdk-pixbuf-query-loaders
+#
+# Modified to use the library from the current working directory at runtime.
+"./libpixbufloader-jxl.so"
+"jxl" 4 "gdk-pixbuf" "JPEG XL image" "BSD-3"
+"image/jxl" ""
+"jxl" ""
+"\377\n" "  " 100
+"...\fJXL \r\n\207\n" "zzz         " 100
+
+
diff --git a/third_party/jpeg-xl/plugins/gdk-pixbuf/pixbufloader-jxl.c b/third_party/jpeg-xl/plugins/gdk-pixbuf/pixbufloader-jxl.c
new file mode 100644
index 0000000000..9df9611b39
--- /dev/null
+++ b/third_party/jpeg-xl/plugins/gdk-pixbuf/pixbufloader-jxl.c
@@ -0,0 +1,816 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <jxl/codestream_header.h>
+#include <jxl/decode.h>
+#include <jxl/encode.h>
+#include <jxl/resizable_parallel_runner.h>
+#include <jxl/types.h>
+
+#include "lcms2.h"
+
+#define GDK_PIXBUF_ENABLE_BACKEND
+#include <gdk-pixbuf/gdk-pixbuf.h>
+#undef GDK_PIXBUF_ENABLE_BACKEND
+
+G_BEGIN_DECLS
+
+// Information about a single frame.
+typedef struct {
+  uint64_t duration_ms;
+  GdkPixbuf *data;
+  gboolean decoded;
+} GdkPixbufJxlAnimationFrame;
+
+// Represent a whole JPEG XL animation; all its fields are owned; as a GObject,
+// the Animation struct itself is reference counted (as are the GdkPixbufs for
+// individual frames).
+struct _GdkPixbufJxlAnimation {
+  GdkPixbufAnimation parent_instance;
+
+  // GDK interface implementation callbacks.
+  GdkPixbufModuleSizeFunc image_size_callback;
+  GdkPixbufModulePreparedFunc pixbuf_prepared_callback;
+  GdkPixbufModuleUpdatedFunc area_updated_callback;
+  gpointer user_data;
+
+  // All frames known so far; a frame is added when the JXL_DEC_FRAME event is
+  // received from the decoder; initially frame.decoded is FALSE, until
+  // the JXL_DEC_IMAGE event is received.
+  GArray *frames;
+
+  // JPEG XL decoder and related structures.
+  JxlParallelRunner *parallel_runner;
+  JxlDecoder *decoder;
+  JxlPixelFormat pixel_format;
+
+  // Decoding is `done` when JXL_DEC_SUCCESS is received; calling
+  // load_increment afterwards gives an error.
+  gboolean done;
+
+  // Image information.
+  size_t xsize;
+  size_t ysize;
+  gboolean alpha_premultiplied;
+  gboolean has_animation;
+  gboolean has_alpha;
+  uint64_t total_duration_ms;
+  uint64_t tick_duration_us;
+  uint64_t repetition_count;  // 0 = loop forever
+
+  gpointer icc_buff;
+  cmsContext context;
+  cmsHPROFILE profile, srgb;
+  cmsHTRANSFORM transform;
+};
+
+#define GDK_TYPE_PIXBUF_JXL_ANIMATION (gdk_pixbuf_jxl_animation_get_type())
+G_DECLARE_FINAL_TYPE(GdkPixbufJxlAnimation, gdk_pixbuf_jxl_animation, GDK,
+                     JXL_ANIMATION, GdkPixbufAnimation);
+
+G_DEFINE_TYPE(GdkPixbufJxlAnimation, gdk_pixbuf_jxl_animation,
+              GDK_TYPE_PIXBUF_ANIMATION);
+
+// Iterator to a given point in time in the animation; contains a pointer to the
+// full animation.
+struct _GdkPixbufJxlAnimationIter {
+  GdkPixbufAnimationIter parent_instance;
+  GdkPixbufJxlAnimation *animation;
+  size_t current_frame;
+  uint64_t time_offset;
+};
+
+#define GDK_TYPE_PIXBUF_JXL_ANIMATION_ITER \
+  (gdk_pixbuf_jxl_animation_iter_get_type())
+G_DECLARE_FINAL_TYPE(GdkPixbufJxlAnimationIter, gdk_pixbuf_jxl_animation_iter,
+                     GDK, JXL_ANIMATION_ITER, GdkPixbufAnimationIter);
+G_DEFINE_TYPE(GdkPixbufJxlAnimationIter, gdk_pixbuf_jxl_animation_iter,
+              GDK_TYPE_PIXBUF_ANIMATION_ITER);
+
+static void gdk_pixbuf_jxl_animation_init(GdkPixbufJxlAnimation *obj) {
+  // Suppress "unused function" warnings.
+  (void)glib_autoptr_cleanup_GdkPixbufJxlAnimation;
+  (void)GDK_JXL_ANIMATION;
+  (void)GDK_IS_JXL_ANIMATION;
+}
+
+static gboolean gdk_pixbuf_jxl_animation_is_static_image(
+    GdkPixbufAnimation *anim) {
+  GdkPixbufJxlAnimation *jxl_anim = (GdkPixbufJxlAnimation *)anim;
+  return !jxl_anim->has_animation;
+}
+
+static GdkPixbuf *gdk_pixbuf_jxl_animation_get_static_image(
+    GdkPixbufAnimation *anim) {
+  GdkPixbufJxlAnimation *jxl_anim = (GdkPixbufJxlAnimation *)anim;
+  if (jxl_anim->frames == NULL || jxl_anim->frames->len == 0) return NULL;
+  GdkPixbufJxlAnimationFrame *frame =
+      &g_array_index(jxl_anim->frames, GdkPixbufJxlAnimationFrame, 0);
+  return frame->decoded ? frame->data : NULL;
+}
+
+static void gdk_pixbuf_jxl_animation_get_size(GdkPixbufAnimation *anim,
+                                              int *width, int *height) {
+  GdkPixbufJxlAnimation *jxl_anim = (GdkPixbufJxlAnimation *)anim;
+  if (width) *width = jxl_anim->xsize;
+  if (height) *height = jxl_anim->ysize;
+}
+
+G_GNUC_BEGIN_IGNORE_DEPRECATIONS
+static gboolean gdk_pixbuf_jxl_animation_iter_advance(
+    GdkPixbufAnimationIter *iter, const GTimeVal *current_time);
+
+static GdkPixbufAnimationIter *gdk_pixbuf_jxl_animation_get_iter(
+    GdkPixbufAnimation *anim, const GTimeVal *start_time) {
+  GdkPixbufJxlAnimationIter *iter =
+      g_object_new(GDK_TYPE_PIXBUF_JXL_ANIMATION_ITER, NULL);
+  iter->animation = (GdkPixbufJxlAnimation *)anim;
+  iter->time_offset = start_time->tv_sec * 1000ULL + start_time->tv_usec / 1000;
+  g_object_ref(iter->animation);
+  gdk_pixbuf_jxl_animation_iter_advance((GdkPixbufAnimationIter *)iter,
+                                        start_time);
+  return (GdkPixbufAnimationIter *)iter;
+}
+G_GNUC_END_IGNORE_DEPRECATIONS
+
+static void gdk_pixbuf_jxl_animation_finalize(GObject *obj) {
+  GdkPixbufJxlAnimation *decoder_state = (GdkPixbufJxlAnimation *)obj;
+  if (decoder_state->frames != NULL) {
+    for (size_t i = 0; i < decoder_state->frames->len; i++) {
+      g_object_unref(
+          g_array_index(decoder_state->frames, GdkPixbufJxlAnimationFrame, i)
+              .data);
+    }
+    g_array_free(decoder_state->frames, /*free_segment=*/TRUE);
+  }
+  JxlResizableParallelRunnerDestroy(decoder_state->parallel_runner);
+  JxlDecoderDestroy(decoder_state->decoder);
+  cmsDeleteTransform(decoder_state->transform);
+  cmsCloseProfile(decoder_state->srgb);
+  cmsCloseProfile(decoder_state->profile);
+  cmsDeleteContext(decoder_state->context);
+  g_free(decoder_state->icc_buff);
+}
+
+static void gdk_pixbuf_jxl_animation_class_init(
+    GdkPixbufJxlAnimationClass *klass) {
+  G_OBJECT_CLASS(klass)->finalize = gdk_pixbuf_jxl_animation_finalize;
+  klass->parent_class.is_static_image =
+      gdk_pixbuf_jxl_animation_is_static_image;
+  klass->parent_class.get_static_image =
+      gdk_pixbuf_jxl_animation_get_static_image;
+  klass->parent_class.get_size = gdk_pixbuf_jxl_animation_get_size;
+  klass->parent_class.get_iter = gdk_pixbuf_jxl_animation_get_iter;
+}
+
+static void gdk_pixbuf_jxl_animation_iter_init(GdkPixbufJxlAnimationIter *obj) {
+  (void)glib_autoptr_cleanup_GdkPixbufJxlAnimationIter;
+  (void)GDK_JXL_ANIMATION_ITER;
+  (void)GDK_IS_JXL_ANIMATION_ITER;
+}
+
+static int gdk_pixbuf_jxl_animation_iter_get_delay_time(
+    GdkPixbufAnimationIter *iter) {
+  GdkPixbufJxlAnimationIter *jxl_iter = (GdkPixbufJxlAnimationIter *)iter;
+  if (jxl_iter->animation->frames->len <= jxl_iter->current_frame) {
+    return 0;
+  }
+  return g_array_index(jxl_iter->animation->frames, GdkPixbufJxlAnimationFrame,
+                       jxl_iter->current_frame)
+      .duration_ms;
+}
+
+static GdkPixbuf *gdk_pixbuf_jxl_animation_iter_get_pixbuf(
+    GdkPixbufAnimationIter *iter) {
+  GdkPixbufJxlAnimationIter *jxl_iter = (GdkPixbufJxlAnimationIter *)iter;
+  if (jxl_iter->animation->frames->len <= jxl_iter->current_frame) {
+    return NULL;
+  }
+  return g_array_index(jxl_iter->animation->frames, GdkPixbufJxlAnimationFrame,
+                       jxl_iter->current_frame)
+      .data;
+}
+
+static gboolean gdk_pixbuf_jxl_animation_iter_on_currently_loading_frame(
+    GdkPixbufAnimationIter *iter) {
+  GdkPixbufJxlAnimationIter *jxl_iter = (GdkPixbufJxlAnimationIter *)iter;
+  if (jxl_iter->animation->frames->len <= jxl_iter->current_frame) {
+    return TRUE;
+  }
+  return !g_array_index(jxl_iter->animation->frames, GdkPixbufJxlAnimationFrame,
+                        jxl_iter->current_frame)
+              .decoded;
+}
+
+G_GNUC_BEGIN_IGNORE_DEPRECATIONS
+static gboolean gdk_pixbuf_jxl_animation_iter_advance(
+    GdkPixbufAnimationIter *iter, const GTimeVal *current_time) {
+  GdkPixbufJxlAnimationIter *jxl_iter = (GdkPixbufJxlAnimationIter *)iter;
+  size_t old_frame = jxl_iter->current_frame;
+
+  uint64_t current_time_ms = current_time->tv_sec * 1000ULL +
+                             current_time->tv_usec / 1000 -
+                             jxl_iter->time_offset;
+
+  if (jxl_iter->animation->frames->len == 0) {
+    jxl_iter->current_frame = 0;
+  } else if (!jxl_iter->animation->done &&
+             current_time_ms >= jxl_iter->animation->total_duration_ms) {
+    jxl_iter->current_frame = jxl_iter->animation->frames->len - 1;
+  } else if (jxl_iter->animation->repetition_count != 0 &&
+             current_time_ms > jxl_iter->animation->repetition_count *
+                                   jxl_iter->animation->total_duration_ms) {
+    jxl_iter->current_frame = jxl_iter->animation->frames->len - 1;
+  } else {
+    uint64_t total_duration_ms = jxl_iter->animation->total_duration_ms;
+    // Guard against divide-by-0 in malicious files.
+    if (total_duration_ms == 0) total_duration_ms = 1;
+    uint64_t loop_offset = current_time_ms % total_duration_ms;
+    jxl_iter->current_frame = 0;
+    while (TRUE) {
+      uint64_t duration =
+          g_array_index(jxl_iter->animation->frames, GdkPixbufJxlAnimationFrame,
+                        jxl_iter->current_frame)
+              .duration_ms;
+      if (duration >= loop_offset) {
+        break;
+      }
+      loop_offset -= duration;
+      jxl_iter->current_frame++;
+    }
+  }
+
+  return old_frame != jxl_iter->current_frame;
+}
+G_GNUC_END_IGNORE_DEPRECATIONS
+
+static void gdk_pixbuf_jxl_animation_iter_finalize(GObject *obj) {
+  GdkPixbufJxlAnimationIter *iter = (GdkPixbufJxlAnimationIter *)obj;
+  g_object_unref(iter->animation);
+}
+
+static void gdk_pixbuf_jxl_animation_iter_class_init(
+    GdkPixbufJxlAnimationIterClass *klass) {
+  G_OBJECT_CLASS(klass)->finalize = gdk_pixbuf_jxl_animation_iter_finalize;
+  klass->parent_class.get_delay_time =
+      gdk_pixbuf_jxl_animation_iter_get_delay_time;
+  klass->parent_class.get_pixbuf = gdk_pixbuf_jxl_animation_iter_get_pixbuf;
+  klass->parent_class.on_currently_loading_frame =
+      gdk_pixbuf_jxl_animation_iter_on_currently_loading_frame;
+  klass->parent_class.advance = gdk_pixbuf_jxl_animation_iter_advance;
+}
+
+G_END_DECLS
+
+static gpointer begin_load(GdkPixbufModuleSizeFunc size_func,
+                           GdkPixbufModulePreparedFunc prepare_func,
+                           GdkPixbufModuleUpdatedFunc update_func,
+                           gpointer user_data, GError **error) {
+  GdkPixbufJxlAnimation *decoder_state =
+      g_object_new(GDK_TYPE_PIXBUF_JXL_ANIMATION, NULL);
+  if (decoder_state == NULL) {
+    g_set_error(error, GDK_PIXBUF_ERROR, GDK_PIXBUF_ERROR_FAILED,
+                "Creation of the animation state failed");
+    return NULL;
+  }
+  decoder_state->image_size_callback = size_func;
+  decoder_state->pixbuf_prepared_callback = prepare_func;
+  decoder_state->area_updated_callback = update_func;
+  decoder_state->user_data = user_data;
+  decoder_state->frames =
+      g_array_new(/*zero_terminated=*/FALSE, /*clear_=*/TRUE,
+                  sizeof(GdkPixbufJxlAnimationFrame));
+
+  if (decoder_state->frames == NULL) {
+    g_set_error(error, GDK_PIXBUF_ERROR, GDK_PIXBUF_ERROR_FAILED,
+                "Creation of the frame array failed");
+    goto cleanup;
+  }
+
+  if (!(decoder_state->parallel_runner =
+            JxlResizableParallelRunnerCreate(NULL))) {
+    g_set_error(error, GDK_PIXBUF_ERROR, GDK_PIXBUF_ERROR_FAILED,
+                "Creation of the JXL parallel runner failed");
+    goto cleanup;
+  }
+
+  if (!(decoder_state->decoder = JxlDecoderCreate(NULL))) {
+    g_set_error(error, GDK_PIXBUF_ERROR, GDK_PIXBUF_ERROR_FAILED,
+                "Creation of the JXL decoder failed");
+    goto cleanup;
+  }
+
+  JxlDecoderStatus status;
+
+  if ((status = JxlDecoderSetParallelRunner(
+           decoder_state->decoder, JxlResizableParallelRunner,
+           decoder_state->parallel_runner)) != JXL_DEC_SUCCESS) {
+    g_set_error(error, GDK_PIXBUF_ERROR, GDK_PIXBUF_ERROR_FAILED,
+                "JxlDecoderSetParallelRunner failed: %x", status);
+    goto cleanup;
+  }
+  if ((status = JxlDecoderSubscribeEvents(
+           decoder_state->decoder, JXL_DEC_BASIC_INFO | JXL_DEC_COLOR_ENCODING |
+                                       JXL_DEC_FULL_IMAGE | JXL_DEC_FRAME)) !=
+      JXL_DEC_SUCCESS) {
+    g_set_error(error, GDK_PIXBUF_ERROR, GDK_PIXBUF_ERROR_FAILED,
+                "JxlDecoderSubscribeEvents failed: %x", status);
+    goto cleanup;
+  }
+
+  decoder_state->pixel_format.data_type = JXL_TYPE_FLOAT;
+  decoder_state->pixel_format.endianness = JXL_NATIVE_ENDIAN;
+
+  return decoder_state;
+cleanup:
+  JxlResizableParallelRunnerDestroy(decoder_state->parallel_runner);
+  JxlDecoderDestroy(decoder_state->decoder);
+  g_object_unref(decoder_state);
+  return NULL;
+}
+
+static gboolean stop_load(gpointer context, GError **error) {
+  g_object_unref(context);
+  return TRUE;
+}
+
+static void draw_pixels(void *context, size_t x, size_t y, size_t num_pixels,
+                        const void *pixels) {
+  GdkPixbufJxlAnimation *decoder_state = context;
+
+  GdkPixbuf *output =
+      g_array_index(decoder_state->frames, GdkPixbufJxlAnimationFrame,
+                    decoder_state->frames->len - 1)
+          .data;
+
+  guchar *dst = gdk_pixbuf_get_pixels(output) +
+                decoder_state->pixel_format.num_channels * x +
+                gdk_pixbuf_get_rowstride(output) * y;
+
+  cmsDoTransform(decoder_state->transform, pixels, dst, num_pixels);
+}
+
+static gboolean load_increment(gpointer context, const guchar *buf, guint size,
+                               GError **error) {
+  GdkPixbufJxlAnimation *decoder_state = context;
+  if (decoder_state->done == TRUE) {
+    g_set_error(error, GDK_PIXBUF_ERROR, GDK_PIXBUF_ERROR_FAILED,
+                "JXL decoder load_increment called after end of file");
+    return FALSE;
+  }
+
+  JxlDecoderStatus status;
+
+  if ((status = JxlDecoderSetInput(decoder_state->decoder, buf, size)) !=
+      JXL_DEC_SUCCESS) {
+    // Should never happen if things are done properly.
+    g_set_error(error, GDK_PIXBUF_ERROR, GDK_PIXBUF_ERROR_FAILED,
+                "JXL decoder logic error: %x", status);
+    return FALSE;
+  }
+
+  for (;;) {
+    status = JxlDecoderProcessInput(decoder_state->decoder);
+    switch (status) {
+      case JXL_DEC_NEED_MORE_INPUT: {
+        JxlDecoderReleaseInput(decoder_state->decoder);
+        return TRUE;
+      }
+
+      case JXL_DEC_BASIC_INFO: {
+        JxlBasicInfo info;
+        if (JxlDecoderGetBasicInfo(decoder_state->decoder, &info) !=
+            JXL_DEC_SUCCESS) {
+          g_set_error(error, GDK_PIXBUF_ERROR, GDK_PIXBUF_ERROR_FAILED,
+                      "JXLDecoderGetBasicInfo failed");
+          return FALSE;
+        }
+        decoder_state->pixel_format.num_channels = info.alpha_bits > 0 ? 4 : 3;
+        decoder_state->alpha_premultiplied = info.alpha_premultiplied;
+        decoder_state->xsize = info.xsize;
+        decoder_state->ysize = info.ysize;
+        decoder_state->has_animation = info.have_animation;
+        decoder_state->has_alpha = info.alpha_bits > 0;
+        if (info.have_animation) {
+          decoder_state->repetition_count = info.animation.num_loops;
+          decoder_state->tick_duration_us = 1000000ULL *
+                                            info.animation.tps_denominator /
+                                            info.animation.tps_numerator;
+        }
+        gint width = info.xsize;
+        gint height = info.ysize;
+        if (decoder_state->image_size_callback) {
+          decoder_state->image_size_callback(&width, &height,
+                                             decoder_state->user_data);
+        }
+
+        // GDK convention for signaling being interested only in the basic info.
+        if (width == 0 || height == 0) {
+          decoder_state->done = TRUE;
+          return TRUE;
+        }
+
+        // Set an appropriate number of threads for the image size.
+        JxlResizableParallelRunnerSetThreads(
+            decoder_state->parallel_runner,
+            JxlResizableParallelRunnerSuggestThreads(info.xsize, info.ysize));
+        break;
+      }
+
+      case JXL_DEC_COLOR_ENCODING: {
+        // Get the ICC color profile of the pixel data
+        size_t icc_size;
+        if (JXL_DEC_SUCCESS != JxlDecoderGetICCProfileSize(
+                                   decoder_state->decoder,
+                                   &decoder_state->pixel_format,
+                                   JXL_COLOR_PROFILE_TARGET_DATA, &icc_size)) {
+          g_set_error(error, GDK_PIXBUF_ERROR, GDK_PIXBUF_ERROR_FAILED,
+                      "JxlDecoderGetICCProfileSize failed");
+          return FALSE;
+        }
+        if (!(decoder_state->icc_buff = g_malloc(icc_size))) {
+          g_set_error(error, GDK_PIXBUF_ERROR, GDK_PIXBUF_ERROR_FAILED,
+                      "Allocating ICC profile failed");
+          return FALSE;
+        }
+        if (JXL_DEC_SUCCESS !=
+            JxlDecoderGetColorAsICCProfile(decoder_state->decoder,
+                                           &decoder_state->pixel_format,
+                                           JXL_COLOR_PROFILE_TARGET_DATA,
+                                           decoder_state->icc_buff, icc_size)) {
+          g_set_error(error, GDK_PIXBUF_ERROR, GDK_PIXBUF_ERROR_FAILED,
+                      "JxlDecoderGetColorAsICCProfile failed");
+          return FALSE;
+        }
+        decoder_state->context = cmsCreateContext(NULL, NULL);
+        if (!decoder_state->context) {
+          g_set_error(error, GDK_PIXBUF_ERROR, GDK_PIXBUF_ERROR_FAILED,
+                      "Failed to create LCMS2 context");
+          return FALSE;
+        }
+        decoder_state->profile = cmsOpenProfileFromMemTHR(
+            decoder_state->context, decoder_state->icc_buff, icc_size);
+        if (!decoder_state->profile) {
+          g_set_error(error, GDK_PIXBUF_ERROR, GDK_PIXBUF_ERROR_FAILED,
+                      "Invalid ICC profile from JXL image decoder");
+          return FALSE;
+        }
+        decoder_state->srgb = cmsCreate_sRGBProfileTHR(decoder_state->context);
+        if (!decoder_state->srgb) {
+          g_set_error(error, GDK_PIXBUF_ERROR, GDK_PIXBUF_ERROR_FAILED,
+                      "Failed to create sRGB profile");
+          return FALSE;
+        }
+        decoder_state->transform = cmsCreateTransformTHR(
+            decoder_state->context, decoder_state->profile,
+            decoder_state->has_alpha ? TYPE_RGBA_FLT : TYPE_RGB_FLT,
+            decoder_state->srgb,
+            decoder_state->has_alpha ? TYPE_RGBA_8 : TYPE_RGB_8,
+            INTENT_RELATIVE_COLORIMETRIC, cmsFLAGS_COPY_ALPHA);
+        if (!decoder_state->transform) {
+          g_set_error(error, GDK_PIXBUF_ERROR, GDK_PIXBUF_ERROR_FAILED,
+                      "Failed to create LCMS2 color transform");
+          return FALSE;
+        }
+
+        break;
+      }
+
+      case JXL_DEC_FRAME: {
+        // TODO(veluca): support rescaling.
+        JxlFrameHeader frame_header;
+        if (JxlDecoderGetFrameHeader(decoder_state->decoder, &frame_header) !=
+            JXL_DEC_SUCCESS) {
+          g_set_error(error, GDK_PIXBUF_ERROR, GDK_PIXBUF_ERROR_FAILED,
+                      "Failed to retrieve frame info");
+          return FALSE;
+        }
+
+        {
+          GdkPixbufJxlAnimationFrame frame;
+          frame.decoded = FALSE;
+          frame.duration_ms =
+              frame_header.duration * decoder_state->tick_duration_us / 1000;
+          decoder_state->total_duration_ms += frame.duration_ms;
+          frame.data =
+              gdk_pixbuf_new(GDK_COLORSPACE_RGB, decoder_state->has_alpha,
+                             /*bits_per_sample=*/8, decoder_state->xsize,
+                             decoder_state->ysize);
+          if (frame.data == NULL) {
+            g_set_error(error, GDK_PIXBUF_ERROR, GDK_PIXBUF_ERROR_FAILED,
+                        "Failed to allocate output pixel buffer");
+            return FALSE;
+          }
+          decoder_state->pixel_format.align =
+              gdk_pixbuf_get_rowstride(frame.data);
+          g_array_append_val(decoder_state->frames, frame);
+        }
+        if (decoder_state->pixbuf_prepared_callback &&
+            decoder_state->frames->len == 1) {
+          decoder_state->pixbuf_prepared_callback(
+              g_array_index(decoder_state->frames, GdkPixbufJxlAnimationFrame,
+                            0)
+                  .data,
+              decoder_state->has_animation ? (GdkPixbufAnimation *)decoder_state
+                                           : NULL,
+              decoder_state->user_data);
+        }
+        break;
+      }
+
+      case JXL_DEC_NEED_IMAGE_OUT_BUFFER: {
+        if (JXL_DEC_SUCCESS !=
+            JxlDecoderSetImageOutCallback(decoder_state->decoder,
+                                          &decoder_state->pixel_format,
+                                          draw_pixels, decoder_state)) {
+          g_set_error(error, GDK_PIXBUF_ERROR, GDK_PIXBUF_ERROR_FAILED,
+                      "JxlDecoderSetImageOutCallback failed");
+          return FALSE;
+        }
+        break;
+      }
+
+      case JXL_DEC_FULL_IMAGE: {
+        // TODO(veluca): consider doing partial updates.
+        if (decoder_state->area_updated_callback) {
+          GdkPixbuf *output = g_array_index(decoder_state->frames,
+                                            GdkPixbufJxlAnimationFrame, 0)
+                                  .data;
+          decoder_state->area_updated_callback(
+              output, 0, 0, gdk_pixbuf_get_width(output),
+              gdk_pixbuf_get_height(output), decoder_state->user_data);
+        }
+        g_array_index(decoder_state->frames, GdkPixbufJxlAnimationFrame,
+                      decoder_state->frames->len - 1)
+            .decoded = TRUE;
+        break;
+      }
+
+      case JXL_DEC_SUCCESS: {
+        decoder_state->done = TRUE;
+        return TRUE;
+      }
+
+      default: {
+        g_set_error(error, GDK_PIXBUF_ERROR, GDK_PIXBUF_ERROR_FAILED,
+                    "Unexpected JxlDecoderProcessInput return code: %x",
+                    status);
+        return FALSE;
+      }
+    }
+  }
+  return TRUE;
+}
+
+static gboolean jxl_is_save_option_supported(const gchar *option_key) {
+  if (g_strcmp0(option_key, "quality") == 0) {
+    return TRUE;
+  }
+
+  return FALSE;
+}
+
+static gboolean jxl_image_saver(FILE *f, GdkPixbuf *pixbuf, gchar **keys,
+                                gchar **values, GError **error) {
+  long quality = 90; /* default; must be between 0 and 100 */
+  double distance;
+  gboolean save_alpha;
+  JxlEncoder *encoder;
+  void *parallel_runner;
+  JxlEncoderFrameSettings *frame_settings;
+  JxlBasicInfo output_info;
+  JxlPixelFormat pixel_format;
+  JxlColorEncoding color_profile;
+  JxlEncoderStatus status;
+
+  GByteArray *compressed;
+  size_t offset = 0;
+  uint8_t *next_out;
+  size_t avail_out;
+
+  if (f == NULL || pixbuf == NULL) {
+    return FALSE;
+  }
+
+  if (keys && *keys) {
+    gchar **kiter = keys;
+    gchar **viter = values;
+
+    while (*kiter) {
+      if (strcmp(*kiter, "quality") == 0) {
+        char *endptr = NULL;
+        quality = strtol(*viter, &endptr, 10);
+
+        if (endptr == *viter) {
+          g_set_error(error, GDK_PIXBUF_ERROR, GDK_PIXBUF_ERROR_BAD_OPTION,
+                      "JXL quality must be a value between 0 and 100; value "
+                      "\"%s\" could not be parsed.",
+                      *viter);
+
+          return FALSE;
+        }
+
+        if (quality < 0 || quality > 100) {
+          g_set_error(error, GDK_PIXBUF_ERROR, GDK_PIXBUF_ERROR_BAD_OPTION,
+                      "JXL quality must be a value between 0 and 100; value "
+                      "\"%ld\" is not allowed.",
+                      quality);
+
+          return FALSE;
+        }
+      } else {
+        g_warning("Unrecognized parameter (%s) passed to JXL saver.", *kiter);
+      }
+
+      ++kiter;
+      ++viter;
+    }
+  }
+
+  if (gdk_pixbuf_get_bits_per_sample(pixbuf) != 8) {
+    g_set_error(error, GDK_PIXBUF_ERROR, GDK_PIXBUF_ERROR_UNKNOWN_TYPE,
+                "Sorry, only 8bit images are supported by this JXL saver");
+    return FALSE;
+  }
+
+  JxlEncoderInitBasicInfo(&output_info);
+  output_info.have_container = JXL_FALSE;
+  output_info.xsize = gdk_pixbuf_get_width(pixbuf);
+  output_info.ysize = gdk_pixbuf_get_height(pixbuf);
+  output_info.bits_per_sample = 8;
+  output_info.orientation = JXL_ORIENT_IDENTITY;
+  output_info.num_color_channels = 3;
+
+  if (output_info.xsize == 0 || output_info.ysize == 0) {
+    g_set_error(error, GDK_PIXBUF_ERROR, GDK_PIXBUF_ERROR_CORRUPT_IMAGE,
+                "Empty image, nothing to save");
+    return FALSE;
+  }
+
+  save_alpha = gdk_pixbuf_get_has_alpha(pixbuf);
+
+  pixel_format.data_type = JXL_TYPE_UINT8;
+  pixel_format.endianness = JXL_NATIVE_ENDIAN;
+  pixel_format.align = gdk_pixbuf_get_rowstride(pixbuf);
+
+  if (save_alpha) {
+    if (gdk_pixbuf_get_n_channels(pixbuf) != 4) {
+      g_set_error(error, GDK_PIXBUF_ERROR, GDK_PIXBUF_ERROR_UNKNOWN_TYPE,
+                  "Unsupported number of channels");
+      return FALSE;
+    }
+
+    output_info.num_extra_channels = 1;
+    output_info.alpha_bits = 8;
+    pixel_format.num_channels = 4;
+  } else {
+    if (gdk_pixbuf_get_n_channels(pixbuf) != 3) {
+      g_set_error(error, GDK_PIXBUF_ERROR, GDK_PIXBUF_ERROR_UNKNOWN_TYPE,
+                  "Unsupported number of channels");
+      return FALSE;
+    }
+
+    output_info.num_extra_channels = 0;
+    output_info.alpha_bits = 0;
+    pixel_format.num_channels = 3;
+  }
+
+  encoder = JxlEncoderCreate(NULL);
+  if (!encoder) {
+    g_set_error(error, GDK_PIXBUF_ERROR, GDK_PIXBUF_ERROR_FAILED,
+                "Creation of the JXL encoder failed");
+    return FALSE;
+  }
+
+  parallel_runner = JxlResizableParallelRunnerCreate(NULL);
+  if (!parallel_runner) {
+    JxlEncoderDestroy(encoder);
+    g_set_error(error, GDK_PIXBUF_ERROR, GDK_PIXBUF_ERROR_FAILED,
+                "Creation of the JXL decoder failed");
+    return FALSE;
+  }
+
+  JxlResizableParallelRunnerSetThreads(
+      parallel_runner, JxlResizableParallelRunnerSuggestThreads(
+                           output_info.xsize, output_info.ysize));
+
+  status = JxlEncoderSetParallelRunner(encoder, JxlResizableParallelRunner,
+                                       parallel_runner);
+  if (status != JXL_ENC_SUCCESS) {
+    JxlResizableParallelRunnerDestroy(parallel_runner);
+    JxlEncoderDestroy(encoder);
+    g_set_error(error, GDK_PIXBUF_ERROR, GDK_PIXBUF_ERROR_FAILED,
+                "JxlDecoderSetParallelRunner failed: %x", status);
+    return FALSE;
+  }
+
+  if (quality > 99) {
+    output_info.uses_original_profile = JXL_TRUE;
+    distance = 0;
+  } else {
+    output_info.uses_original_profile = JXL_FALSE;
+    if (quality >= 30) {
+      distance = 0.1 + (100 - quality) * 0.09;
+    } else {
+      distance =
+          53.0 / 3000.0 * quality * quality - 23.0 / 20.0 * quality + 25.0;
+    }
+  }
+
+  status = JxlEncoderSetBasicInfo(encoder, &output_info);
+  if (status != JXL_ENC_SUCCESS) {
+    JxlResizableParallelRunnerDestroy(parallel_runner);
+    JxlEncoderDestroy(encoder);
+    g_set_error(error, GDK_PIXBUF_ERROR, GDK_PIXBUF_ERROR_FAILED,
+                "JxlEncoderSetBasicInfo failed: %x", status);
+    return FALSE;
+  }
+
+  JxlColorEncodingSetToSRGB(&color_profile, JXL_FALSE);
+  status = JxlEncoderSetColorEncoding(encoder, &color_profile);
+  if (status != JXL_ENC_SUCCESS) {
+    JxlResizableParallelRunnerDestroy(parallel_runner);
+    JxlEncoderDestroy(encoder);
+    g_set_error(error, GDK_PIXBUF_ERROR, GDK_PIXBUF_ERROR_FAILED,
+                "JxlEncoderSetColorEncoding failed: %x", status);
+    return FALSE;
+  }
+
+  frame_settings = JxlEncoderFrameSettingsCreate(encoder, NULL);
+  JxlEncoderSetFrameDistance(frame_settings, distance);
+  JxlEncoderSetFrameLossless(frame_settings, output_info.uses_original_profile);
+
+  status = JxlEncoderAddImageFrame(frame_settings, &pixel_format,
+                                   gdk_pixbuf_read_pixels(pixbuf),
+                                   gdk_pixbuf_get_byte_length(pixbuf));
+  if (status != JXL_ENC_SUCCESS) {
+    JxlResizableParallelRunnerDestroy(parallel_runner);
+    JxlEncoderDestroy(encoder);
+    g_set_error(error, GDK_PIXBUF_ERROR, GDK_PIXBUF_ERROR_FAILED,
+                "JxlEncoderAddImageFrame failed: %x", status);
+    return FALSE;
+  }
+
+  JxlEncoderCloseInput(encoder);
+
+  compressed = g_byte_array_sized_new(4096);
+  g_byte_array_set_size(compressed, 4096);
+  do {
+    next_out = compressed->data + offset;
+    avail_out = compressed->len - offset;
+    status = JxlEncoderProcessOutput(encoder, &next_out, &avail_out);
+
+    if (status == JXL_ENC_NEED_MORE_OUTPUT) {
+      offset = next_out - compressed->data;
+      g_byte_array_set_size(compressed, compressed->len * 2);
+    } else if (status == JXL_ENC_ERROR) {
+      JxlResizableParallelRunnerDestroy(parallel_runner);
+      JxlEncoderDestroy(encoder);
+      g_set_error(error, G_FILE_ERROR, 0, "JxlEncoderProcessOutput failed: %x",
+                  status);
+      return FALSE;
+    }
+  } while (status != JXL_ENC_SUCCESS);
+
+  JxlResizableParallelRunnerDestroy(parallel_runner);
+  JxlEncoderDestroy(encoder);
+
+  g_byte_array_set_size(compressed, next_out - compressed->data);
+  if (compressed->len > 0) {
+    fwrite(compressed->data, 1, compressed->len, f);
+    g_byte_array_free(compressed, TRUE);
+    return TRUE;
+  }
+
+  return FALSE;
+}
+
+void fill_vtable(GdkPixbufModule *module) {
+  module->begin_load = begin_load;
+  module->stop_load = stop_load;
+  module->load_increment = load_increment;
+  module->is_save_option_supported = jxl_is_save_option_supported;
+  module->save = jxl_image_saver;
+}
+
+void fill_info(GdkPixbufFormat *info) {
+  static GdkPixbufModulePattern signature[] = {
+      {"\xFF\x0A", "  ", 100},
+      {"...\x0CJXL \x0D\x0A\x87\x0A", "zzz         ", 100},
+      {NULL, NULL, 0},
+  };
+
+  static gchar *mime_types[] = {"image/jxl", NULL};
+
+  static gchar *extensions[] = {"jxl", NULL};
+
+  info->name = "jxl";
+  info->signature = signature;
+  info->description = "JPEG XL image";
+  info->mime_types = mime_types;
+  info->extensions = extensions;
+  info->flags = GDK_PIXBUF_FORMAT_WRITABLE | GDK_PIXBUF_FORMAT_THREADSAFE;
+  info->license = "BSD-3";
+}
diff --git a/third_party/jpeg-xl/plugins/gdk-pixbuf/pixbufloader_test.cc b/third_party/jpeg-xl/plugins/gdk-pixbuf/pixbufloader_test.cc
new file mode 100644
index 0000000000..5e5642d491
--- /dev/null
+++ b/third_party/jpeg-xl/plugins/gdk-pixbuf/pixbufloader_test.cc
@@ -0,0 +1,41 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <gdk-pixbuf/gdk-pixbuf.h>
+#include <gdk/gdk.h>
+#include <glib.h>
+#include <stdlib.h>
+
+int main(int argc, char* argv[]) {
+  if (argc != 3) {
+    fprintf(stderr, "Usage: %s <loaders.cache> <image.jxl>\n", argv[0]);
+    return 1;
+  }
+
+  const char* loaders_cache = argv[1];
+  const char* filename = argv[2];
+  setenv("GDK_PIXBUF_MODULE_FILE", loaders_cache, true);
+
+  // XDG_DATA_HOME is the path where we look for the mime cache.
+  // XDG_DATA_DIRS directories are used in addition to XDG_DATA_HOME.
+  setenv("XDG_DATA_HOME", ".", true);
+  setenv("XDG_DATA_DIRS", "", true);
+
+  if (!gdk_init_check(nullptr, nullptr)) {
+    fprintf(stderr, "This test requires a DISPLAY\n");
+    // Signals ctest that we should mark this test as skipped.
+    return 254;
+  }
+  GError* error = nullptr;
+  GdkPixbuf* pb = gdk_pixbuf_new_from_file(filename, &error);
+  if (pb != nullptr) {
+    g_object_unref(pb);
+    return 0;
+  } else {
+    fprintf(stderr, "Error loading file: %s\n", filename);
+    g_assert_no_error(error);
+    return 1;
+  }
+}
diff --git a/third_party/jpeg-xl/plugins/gimp/CMakeLists.txt b/third_party/jpeg-xl/plugins/gimp/CMakeLists.txt
new file mode 100644
index 0000000000..f0a49005ed
--- /dev/null
+++ b/third_party/jpeg-xl/plugins/gimp/CMakeLists.txt
@@ -0,0 +1,28 @@
+# Copyright (c) the JPEG XL Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+find_package(PkgConfig)
+pkg_check_modules(Gimp IMPORTED_TARGET gimp-2.0>=2.10 gimpui-2.0>=2.10)
+
+if (NOT Gimp_FOUND)
+  message(WARNING "Gimp development libraries not found, the Gimp plugin will not be built")
+  return ()
+endif ()
+
+add_executable(file-jxl WIN32
+  common.h
+  common.cc
+  file-jxl-load.cc
+  file-jxl-load.h
+  file-jxl-save.cc
+  file-jxl-save.h
+  file-jxl.cc)
+target_link_libraries(file-jxl jxl jxl_threads PkgConfig::Gimp)
+
+target_include_directories(file-jxl PUBLIC
+    ${PROJECT_SOURCE_DIR})  # for plugins/gimp absolute paths.
+
+pkg_get_variable(GIMP_LIB_DIR gimp-2.0 gimplibdir)
+install(TARGETS file-jxl RUNTIME DESTINATION "${GIMP_LIB_DIR}/plug-ins/file-jxl/")
diff --git a/third_party/jpeg-xl/plugins/gimp/common.cc b/third_party/jpeg-xl/plugins/gimp/common.cc
new file mode 100644
index 0000000000..1a884570cb
--- /dev/null
+++ b/third_party/jpeg-xl/plugins/gimp/common.cc
@@ -0,0 +1,27 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "plugins/gimp/common.h"
+
+namespace jxl {
+
+JpegXlGimpProgress::JpegXlGimpProgress(const char *message) {
+  cur_progress = 0;
+  max_progress = 100;
+
+  gimp_progress_init_printf("%s\n", message);
+}
+
+void JpegXlGimpProgress::update() {
+  gimp_progress_update((float)++cur_progress / (float)max_progress);
+  return;
+}
+
+void JpegXlGimpProgress::finished() {
+  gimp_progress_update(1.0);
+  return;
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/plugins/gimp/common.h b/third_party/jpeg-xl/plugins/gimp/common.h
new file mode 100644
index 0000000000..3fe63c1a47
--- /dev/null
+++ b/third_party/jpeg-xl/plugins/gimp/common.h
@@ -0,0 +1,45 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef PLUGINS_GIMP_COMMON_H_
+#define PLUGINS_GIMP_COMMON_H_
+
+#include <libgimp/gimp.h>
+#include <libgimp/gimpui.h>
+#include <math.h>
+
+#include <fstream>
+#include <iterator>
+#include <string>
+#include <vector>
+
+#define PLUG_IN_BINARY "file-jxl"
+#define SAVE_PROC "file-jxl-save"
+
+// Defined by both FUIF and glib.
+#undef MAX
+#undef MIN
+#undef CLAMP
+
+#include <jxl/resizable_parallel_runner.h>
+#include <jxl/resizable_parallel_runner_cxx.h>
+
+namespace jxl {
+
+class JpegXlGimpProgress {
+ public:
+  explicit JpegXlGimpProgress(const char *message);
+  void update();
+  void finished();
+
+ private:
+  int cur_progress;
+  int max_progress;
+
+};  // class JpegXlGimpProgress
+
+}  // namespace jxl
+
+#endif  // PLUGINS_GIMP_COMMON_H_
diff --git a/third_party/jpeg-xl/plugins/gimp/file-jxl-load.cc b/third_party/jpeg-xl/plugins/gimp/file-jxl-load.cc
new file mode 100644
index 0000000000..361a74920c
--- /dev/null
+++ b/third_party/jpeg-xl/plugins/gimp/file-jxl-load.cc
@@ -0,0 +1,487 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "plugins/gimp/file-jxl-load.h"
+
+#include <jxl/decode.h>
+#include <jxl/decode_cxx.h>
+
+#define _PROFILE_ORIGIN_ JXL_COLOR_PROFILE_TARGET_ORIGINAL
+#define _PROFILE_TARGET_ JXL_COLOR_PROFILE_TARGET_DATA
+#define LOAD_PROC "file-jxl-load"
+
+namespace jxl {
+
+bool SetJpegXlOutBuffer(
+    std::unique_ptr<JxlDecoderStruct, JxlDecoderDestroyStruct> *dec,
+    JxlPixelFormat *format, size_t *buffer_size, gpointer *pixels_buffer_1) {
+  if (JXL_DEC_SUCCESS !=
+      JxlDecoderImageOutBufferSize(dec->get(), format, buffer_size)) {
+    g_printerr(LOAD_PROC " Error: JxlDecoderImageOutBufferSize failed\n");
+    return false;
+  }
+  *pixels_buffer_1 = g_malloc(*buffer_size);
+  if (JXL_DEC_SUCCESS != JxlDecoderSetImageOutBuffer(dec->get(), format,
+                                                     *pixels_buffer_1,
+                                                     *buffer_size)) {
+    g_printerr(LOAD_PROC " Error: JxlDecoderSetImageOutBuffer failed\n");
+    return false;
+  }
+  return true;
+}
+
+bool LoadJpegXlImage(const gchar *const filename, gint32 *const image_id) {
+  bool stop_processing = false;
+  JxlDecoderStatus status = JXL_DEC_NEED_MORE_INPUT;
+  std::vector<uint8_t> icc_profile;
+  GimpColorProfile *profile_icc = nullptr;
+  GimpColorProfile *profile_int = nullptr;
+  bool is_linear = false;
+  unsigned long xsize = 0, ysize = 0;
+  long crop_x0 = 0, crop_y0 = 0;
+  size_t layer_idx = 0;
+  uint32_t frame_duration = 0;
+  double tps_denom = 1.f, tps_numer = 1.f;
+
+  gint32 layer;
+
+  gpointer pixels_buffer_1 = nullptr;
+  gpointer pixels_buffer_2 = nullptr;
+  size_t buffer_size = 0;
+
+  GimpImageBaseType image_type = GIMP_RGB;
+  GimpImageType layer_type = GIMP_RGB_IMAGE;
+  GimpPrecision precision = GIMP_PRECISION_U16_GAMMA;
+  JxlBasicInfo info = {};
+  JxlPixelFormat format = {};
+  JxlAnimationHeader animation = {};
+  JxlBlendMode blend_mode = JXL_BLEND_BLEND;
+  char *frame_name = nullptr;  // will be realloced
+  size_t frame_name_len = 0;
+
+  format.num_channels = 4;
+  format.data_type = JXL_TYPE_FLOAT;
+  format.endianness = JXL_NATIVE_ENDIAN;
+  format.align = 0;
+
+  bool is_gray = false;
+
+  JpegXlGimpProgress gimp_load_progress(
+      ("Opening JPEG XL file:" + std::string(filename)).c_str());
+  gimp_load_progress.update();
+
+  // read file
+  std::ifstream instream(filename, std::ios::in | std::ios::binary);
+  std::vector<uint8_t> compressed((std::istreambuf_iterator<char>(instream)),
+                                  std::istreambuf_iterator<char>());
+  instream.close();
+
+  gimp_load_progress.update();
+
+  // multi-threaded parallel runner.
+  auto runner = JxlResizableParallelRunnerMake(nullptr);
+
+  auto dec = JxlDecoderMake(nullptr);
+  if (JXL_DEC_SUCCESS !=
+      JxlDecoderSubscribeEvents(
+          dec.get(), JXL_DEC_BASIC_INFO | JXL_DEC_COLOR_ENCODING |
+                         JXL_DEC_FULL_IMAGE | JXL_DEC_FRAME_PROGRESSION |
+                         JXL_DEC_FRAME)) {
+    g_printerr(LOAD_PROC " Error: JxlDecoderSubscribeEvents failed\n");
+    return false;
+  }
+
+  if (JXL_DEC_SUCCESS != JxlDecoderSetParallelRunner(dec.get(),
+                                                     JxlResizableParallelRunner,
+                                                     runner.get())) {
+    g_printerr(LOAD_PROC " Error: JxlDecoderSetParallelRunner failed\n");
+    return false;
+  }
+  // TODO: make this work with coalescing set to false, while handling frames
+  // with duration 0 and references to earlier frames correctly.
+  if (JXL_DEC_SUCCESS != JxlDecoderSetCoalescing(dec.get(), JXL_TRUE)) {
+    g_printerr(LOAD_PROC " Error: JxlDecoderSetCoalescing failed\n");
+    return false;
+  }
+
+  // grand decode loop...
+  JxlDecoderSetInput(dec.get(), compressed.data(), compressed.size());
+
+  if (JXL_DEC_SUCCESS != JxlDecoderSetProgressiveDetail(
+                             dec.get(), JxlProgressiveDetail::kPasses)) {
+    g_printerr(LOAD_PROC " Error: JxlDecoderSetProgressiveDetail failed\n");
+    return false;
+  }
+
+  while (true) {
+    gimp_load_progress.update();
+
+    if (!stop_processing) status = JxlDecoderProcessInput(dec.get());
+
+    if (status == JXL_DEC_BASIC_INFO) {
+      if (JXL_DEC_SUCCESS != JxlDecoderGetBasicInfo(dec.get(), &info)) {
+        g_printerr(LOAD_PROC " Error: JxlDecoderGetBasicInfo failed\n");
+        return false;
+      }
+
+      xsize = info.xsize;
+      ysize = info.ysize;
+      if (info.have_animation) {
+        animation = info.animation;
+        tps_denom = animation.tps_denominator;
+        tps_numer = animation.tps_numerator;
+      }
+
+      JxlResizableParallelRunnerSetThreads(
+          runner.get(), JxlResizableParallelRunnerSuggestThreads(xsize, ysize));
+    } else if (status == JXL_DEC_COLOR_ENCODING) {
+      // check for ICC profile
+      size_t icc_size = 0;
+      JxlColorEncoding color_encoding;
+      if (JXL_DEC_SUCCESS !=
+          JxlDecoderGetColorAsEncodedProfile(
+              dec.get(), &format, _PROFILE_ORIGIN_, &color_encoding)) {
+        // Attempt to load ICC profile when no internal color encoding
+        if (JXL_DEC_SUCCESS != JxlDecoderGetICCProfileSize(dec.get(), &format,
+                                                           _PROFILE_ORIGIN_,
+                                                           &icc_size)) {
+          g_printerr(LOAD_PROC
+                     " Warning: JxlDecoderGetICCProfileSize failed\n");
+        }
+
+        if (icc_size > 0) {
+          icc_profile.resize(icc_size);
+          if (JXL_DEC_SUCCESS != JxlDecoderGetColorAsICCProfile(
+                                     dec.get(), &format, _PROFILE_ORIGIN_,
+                                     icc_profile.data(), icc_profile.size())) {
+            g_printerr(LOAD_PROC
+                       " Warning: JxlDecoderGetColorAsICCProfile failed\n");
+          }
+
+          profile_icc = gimp_color_profile_new_from_icc_profile(
+              icc_profile.data(), icc_profile.size(), nullptr);
+
+          if (profile_icc) {
+            is_linear = gimp_color_profile_is_linear(profile_icc);
+            g_printerr(LOAD_PROC " Info: Color profile is_linear = %d\n",
+                       is_linear);
+          } else {
+            g_printerr(LOAD_PROC " Warning: Failed to read ICC profile.\n");
+          }
+        } else {
+          g_printerr(LOAD_PROC " Warning: Empty ICC data.\n");
+        }
+      }
+
+      // Internal color profile detection...
+      if (JXL_DEC_SUCCESS ==
+          JxlDecoderGetColorAsEncodedProfile(
+              dec.get(), &format, _PROFILE_TARGET_, &color_encoding)) {
+        g_printerr(LOAD_PROC " Info: Internal color encoding detected.\n");
+
+        // figure out linearity of internal profile
+        switch (color_encoding.transfer_function) {
+          case JXL_TRANSFER_FUNCTION_LINEAR:
+            is_linear = true;
+            break;
+
+          case JXL_TRANSFER_FUNCTION_709:
+          case JXL_TRANSFER_FUNCTION_PQ:
+          case JXL_TRANSFER_FUNCTION_HLG:
+          case JXL_TRANSFER_FUNCTION_GAMMA:
+          case JXL_TRANSFER_FUNCTION_DCI:
+          case JXL_TRANSFER_FUNCTION_SRGB:
+            is_linear = false;
+            break;
+
+          case JXL_TRANSFER_FUNCTION_UNKNOWN:
+          default:
+            if (profile_icc) {
+              g_printerr(LOAD_PROC
+                         " Info: Unknown transfer function.  "
+                         "ICC profile is present.");
+            } else {
+              g_printerr(LOAD_PROC
+                         " Info: Unknown transfer function.  "
+                         "No ICC profile present.");
+            }
+            break;
+        }
+
+        switch (color_encoding.color_space) {
+          case JXL_COLOR_SPACE_RGB:
+            if (color_encoding.white_point == JXL_WHITE_POINT_D65 &&
+                color_encoding.primaries == JXL_PRIMARIES_SRGB) {
+              if (is_linear) {
+                profile_int = gimp_color_profile_new_rgb_srgb_linear();
+              } else {
+                profile_int = gimp_color_profile_new_rgb_srgb();
+              }
+            } else if (!is_linear &&
+                       color_encoding.white_point == JXL_WHITE_POINT_D65 &&
+                       (color_encoding.primaries_green_xy[0] == 0.2100 ||
+                        color_encoding.primaries_green_xy[1] == 0.7100)) {
+              // Probably Adobe RGB
+              profile_int = gimp_color_profile_new_rgb_adobe();
+            } else if (profile_icc) {
+              g_printerr(LOAD_PROC
+                         " Info: Unknown RGB colorspace.  "
+                         "Using ICC profile.\n");
+            } else {
+              g_printerr(LOAD_PROC
+                         " Info: Unknown RGB colorspace.  "
+                         "Treating as sRGB.\n");
+              if (is_linear) {
+                profile_int = gimp_color_profile_new_rgb_srgb_linear();
+              } else {
+                profile_int = gimp_color_profile_new_rgb_srgb();
+              }
+            }
+            break;
+
+          case JXL_COLOR_SPACE_GRAY:
+            is_gray = true;
+            if (!profile_icc ||
+                color_encoding.white_point == JXL_WHITE_POINT_D65) {
+              if (is_linear) {
+                profile_int = gimp_color_profile_new_d65_gray_linear();
+              } else {
+                profile_int = gimp_color_profile_new_d65_gray_srgb_trc();
+              }
+            }
+            break;
+          case JXL_COLOR_SPACE_XYB:
+          case JXL_COLOR_SPACE_UNKNOWN:
+          default:
+            if (profile_icc) {
+              g_printerr(LOAD_PROC
+                         " Info: Unknown colorspace.  Using ICC profile.\n");
+            } else {
+              g_error(
+                  LOAD_PROC
+                  " Warning: Unknown colorspace. Treating as sRGB profile.\n");
+
+              if (is_linear) {
+                profile_int = gimp_color_profile_new_rgb_srgb_linear();
+              } else {
+                profile_int = gimp_color_profile_new_rgb_srgb();
+              }
+            }
+            break;
+        }
+      }
+
+      // set pixel format
+      if (info.num_color_channels > 1) {
+        if (info.alpha_bits == 0) {
+          image_type = GIMP_RGB;
+          layer_type = GIMP_RGB_IMAGE;
+          format.num_channels = info.num_color_channels;
+        } else {
+          image_type = GIMP_RGB;
+          layer_type = GIMP_RGBA_IMAGE;
+          format.num_channels = info.num_color_channels + 1;
+        }
+      } else if (info.num_color_channels == 1) {
+        if (info.alpha_bits == 0) {
+          image_type = GIMP_GRAY;
+          layer_type = GIMP_GRAY_IMAGE;
+          format.num_channels = info.num_color_channels;
+        } else {
+          image_type = GIMP_GRAY;
+          layer_type = GIMP_GRAYA_IMAGE;
+          format.num_channels = info.num_color_channels + 1;
+        }
+      }
+
+      // Set image bit depth and linearity
+      if (info.bits_per_sample <= 8) {
+        if (is_linear) {
+          precision = GIMP_PRECISION_U8_LINEAR;
+        } else {
+          precision = GIMP_PRECISION_U8_GAMMA;
+        }
+      } else if (info.bits_per_sample <= 16) {
+        if (info.exponent_bits_per_sample > 0) {
+          if (is_linear) {
+            precision = GIMP_PRECISION_HALF_LINEAR;
+          } else {
+            precision = GIMP_PRECISION_HALF_GAMMA;
+          }
+        } else if (is_linear) {
+          precision = GIMP_PRECISION_U16_LINEAR;
+        } else {
+          precision = GIMP_PRECISION_U16_GAMMA;
+        }
+      } else {
+        if (info.exponent_bits_per_sample > 0) {
+          if (is_linear) {
+            precision = GIMP_PRECISION_FLOAT_LINEAR;
+          } else {
+            precision = GIMP_PRECISION_FLOAT_GAMMA;
+          }
+        } else if (is_linear) {
+          precision = GIMP_PRECISION_U32_LINEAR;
+        } else {
+          precision = GIMP_PRECISION_U32_GAMMA;
+        }
+      }
+
+      // create new image
+      if (is_linear) {
+        *image_id = gimp_image_new_with_precision(xsize, ysize, image_type,
+                                                  GIMP_PRECISION_FLOAT_LINEAR);
+      } else {
+        *image_id = gimp_image_new_with_precision(xsize, ysize, image_type,
+                                                  GIMP_PRECISION_FLOAT_GAMMA);
+      }
+
+      if (profile_int) {
+        gimp_image_set_color_profile(*image_id, profile_int);
+      } else if (!profile_icc) {
+        g_printerr(LOAD_PROC " Warning: No color profile.\n");
+      }
+    } else if (status == JXL_DEC_NEED_IMAGE_OUT_BUFFER) {
+      // get image from decoder in FLOAT
+      format.data_type = JXL_TYPE_FLOAT;
+      if (!SetJpegXlOutBuffer(&dec, &format, &buffer_size, &pixels_buffer_1))
+        return false;
+    } else if (status == JXL_DEC_FULL_IMAGE) {
+      // create and insert layer
+      gchar *layer_name;
+      if (layer_idx == 0 && !info.have_animation) {
+        layer_name = g_strdup_printf("Background");
+      } else {
+        const GString *blend_null_flag = g_string_new("");
+        const GString *blend_replace_flag = g_string_new(" (replace)");
+        const GString *blend_combine_flag = g_string_new(" (combine)");
+        GString *blend;
+        if (blend_mode == JXL_BLEND_REPLACE) {
+          blend = (GString *)blend_replace_flag;
+        } else if (blend_mode == JXL_BLEND_BLEND) {
+          blend = (GString *)blend_combine_flag;
+        } else {
+          blend = (GString *)blend_null_flag;
+        }
+        char *temp_frame_name = nullptr;
+        bool must_free_frame_name = false;
+        if (frame_name_len == 0) {
+          temp_frame_name = g_strdup_printf("Frame %lu", layer_idx + 1);
+          must_free_frame_name = true;
+        } else {
+          temp_frame_name = frame_name;
+        }
+        double fduration = frame_duration * 1000.f * tps_denom / tps_numer;
+        layer_name = g_strdup_printf("%s (%.15gms)%s", temp_frame_name,
+                                     fduration, blend->str);
+        if (must_free_frame_name) free(temp_frame_name);
+      }
+      layer = gimp_layer_new(*image_id, layer_name, xsize, ysize, layer_type,
+                             /*opacity=*/100,
+                             gimp_image_get_default_new_layer_mode(*image_id));
+
+      gimp_image_insert_layer(*image_id, layer, /*parent_id=*/-1,
+                              /*position=*/0);
+
+      pixels_buffer_2 = g_malloc(buffer_size);
+      GeglBuffer *buffer = gimp_drawable_get_buffer(layer);
+      const Babl *destination_format = gegl_buffer_set_format(buffer, nullptr);
+
+      std::string babl_format_str = "";
+      if (is_gray) {
+        babl_format_str += "Y'";
+      } else {
+        babl_format_str += "R'G'B'";
+      }
+      if (info.alpha_bits > 0) {
+        babl_format_str += "A";
+      }
+      babl_format_str += " float";
+
+      const Babl *source_format = babl_format(babl_format_str.c_str());
+
+      babl_process(babl_fish(source_format, destination_format),
+                   pixels_buffer_1, pixels_buffer_2, xsize * ysize);
+
+      gegl_buffer_set(buffer, GEGL_RECTANGLE(0, 0, xsize, ysize), 0, nullptr,
+                      pixels_buffer_2, GEGL_AUTO_ROWSTRIDE);
+      gimp_item_transform_translate(layer, crop_x0, crop_y0);
+
+      g_clear_object(&buffer);
+      g_free(pixels_buffer_1);
+      g_free(pixels_buffer_2);
+      if (stop_processing) status = JXL_DEC_SUCCESS;
+      g_free(layer_name);
+      layer_idx++;
+    } else if (status == JXL_DEC_FRAME) {
+      JxlFrameHeader frame_header;
+      if (JxlDecoderGetFrameHeader(dec.get(), &frame_header) !=
+          JXL_DEC_SUCCESS) {
+        g_printerr(LOAD_PROC " Error: JxlDecoderSetImageOutBuffer failed\n");
+        return false;
+      }
+      xsize = frame_header.layer_info.xsize;
+      ysize = frame_header.layer_info.ysize;
+      crop_x0 = frame_header.layer_info.crop_x0;
+      crop_y0 = frame_header.layer_info.crop_y0;
+      frame_duration = frame_header.duration;
+      blend_mode = frame_header.layer_info.blend_info.blendmode;
+      if (blend_mode != JXL_BLEND_BLEND && blend_mode != JXL_BLEND_REPLACE) {
+        g_printerr(
+            LOAD_PROC
+            " Warning: JxlDecoderGetFrameHeader: Unhandled blend mode: %d\n",
+            blend_mode);
+      }
+      if ((frame_name_len = frame_header.name_length) > 0) {
+        frame_name = (char *)realloc(frame_name, frame_name_len);
+        if (JXL_DEC_SUCCESS !=
+            JxlDecoderGetFrameName(dec.get(), frame_name, frame_name_len)) {
+          g_printerr(LOAD_PROC "Error: JxlDecoderGetFrameName failed");
+          return false;
+        };
+      }
+    } else if (status == JXL_DEC_SUCCESS) {
+      // All decoding successfully finished.
+      // It's not required to call JxlDecoderReleaseInput(dec.get())
+      // since the decoder will be destroyed.
+      break;
+    } else if (status == JXL_DEC_NEED_MORE_INPUT ||
+               status == JXL_DEC_FRAME_PROGRESSION) {
+      stop_processing = status != JXL_DEC_FRAME_PROGRESSION;
+      if (JxlDecoderFlushImage(dec.get()) == JXL_DEC_SUCCESS) {
+        status = JXL_DEC_FULL_IMAGE;
+        continue;
+      }
+      g_printerr(LOAD_PROC " Error: Already provided all input\n");
+      return false;
+    } else if (status == JXL_DEC_ERROR) {
+      g_printerr(LOAD_PROC " Error: Decoder error\n");
+      return false;
+    } else {
+      g_printerr(LOAD_PROC " Error: Unknown decoder status\n");
+      return false;
+    }
+  }  // end grand decode loop
+
+  gimp_load_progress.update();
+
+  if (profile_icc) {
+    gimp_image_set_color_profile(*image_id, profile_icc);
+  }
+
+  gimp_load_progress.update();
+
+  // TODO(xiota): Add option to keep image as float
+  if (info.bits_per_sample < 32) {
+    gimp_image_convert_precision(*image_id, precision);
+  }
+
+  gimp_image_set_filename(*image_id, filename);
+
+  gimp_load_progress.finished();
+  return true;
+}
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/plugins/gimp/file-jxl-load.h b/third_party/jpeg-xl/plugins/gimp/file-jxl-load.h
new file mode 100644
index 0000000000..ef5b92fef6
--- /dev/null
+++ b/third_party/jpeg-xl/plugins/gimp/file-jxl-load.h
@@ -0,0 +1,17 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef PLUGINS_GIMP_FILE_JXL_LOAD_H_
+#define PLUGINS_GIMP_FILE_JXL_LOAD_H_
+
+#include "plugins/gimp/common.h"
+
+namespace jxl {
+
+bool LoadJpegXlImage(const gchar* filename, gint32* image_id);
+
+}  // namespace jxl
+
+#endif  // PLUGINS_GIMP_FILE_JXL_LOAD_H_
diff --git a/third_party/jpeg-xl/plugins/gimp/file-jxl-save.cc b/third_party/jpeg-xl/plugins/gimp/file-jxl-save.cc
new file mode 100644
index 0000000000..c1e1ebd9af
--- /dev/null
+++ b/third_party/jpeg-xl/plugins/gimp/file-jxl-save.cc
@@ -0,0 +1,895 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "plugins/gimp/file-jxl-save.h"
+
+#include <jxl/encode.h>
+#include <jxl/encode_cxx.h>
+
+#include <cmath>
+#include <utility>
+
+#include "gobject/gsignal.h"
+
+#define PLUG_IN_BINARY "file-jxl"
+#define SAVE_PROC "file-jxl-save"
+
+#define SCALE_WIDTH 200
+
+namespace jxl {
+
+namespace {
+
+#ifndef g_clear_signal_handler
+// g_clear_signal_handler was added in glib 2.62
+void g_clear_signal_handler(gulong* handler, gpointer instance) {
+  if (handler != nullptr && *handler != 0) {
+    g_signal_handler_disconnect(instance, *handler);
+    *handler = 0;
+  }
+}
+#endif  // g_clear_signal_handler
+
+class JpegXlSaveOpts {
+ public:
+  float distance;
+  float quality;
+
+  bool lossless = false;
+  bool is_linear = false;
+  bool has_alpha = false;
+  bool is_gray = false;
+  bool icc_attached = false;
+
+  bool advanced_mode = false;
+  bool use_container = true;
+  bool save_exif = false;
+  int encoding_effort = 7;
+  int faster_decoding = 0;
+
+  std::string babl_format_str = "RGB u16";
+  std::string babl_type_str = "u16";
+  std::string babl_model_str = "RGB";
+
+  JxlPixelFormat pixel_format;
+  JxlBasicInfo basic_info;
+
+  // functions
+  JpegXlSaveOpts();
+
+  bool SetDistance(float dist);
+  bool SetQuality(float qual);
+  bool SetDimensions(int x, int y);
+  bool SetNumChannels(int channels);
+
+  bool UpdateDistance();
+  bool UpdateQuality();
+
+  bool SetModel(bool is_linear_);
+
+  bool UpdateBablFormat();
+  bool SetBablModel(std::string model);
+  bool SetBablType(std::string type);
+
+  bool SetPrecision(int gimp_precision);
+
+ private:
+};  // class JpegXlSaveOpts
+
+JpegXlSaveOpts jxl_save_opts;
+
+class JpegXlSaveGui {
+ public:
+  bool SaveDialog();
+
+ private:
+  GtkWidget* toggle_lossless = nullptr;
+  GtkAdjustment* entry_distance = nullptr;
+  GtkAdjustment* entry_quality = nullptr;
+  GtkAdjustment* entry_effort = nullptr;
+  GtkAdjustment* entry_faster = nullptr;
+  GtkWidget* frame_advanced = nullptr;
+  GtkWidget* toggle_no_xyb = nullptr;
+  GtkWidget* toggle_raw = nullptr;
+  gulong handle_toggle_lossless = 0;
+  gulong handle_entry_quality = 0;
+  gulong handle_entry_distance = 0;
+
+  static bool GuiOnChangeQuality(GtkAdjustment* adj_qual, void* this_pointer);
+
+  static bool GuiOnChangeDistance(GtkAdjustment* adj_dist, void* this_pointer);
+
+  static bool GuiOnChangeEffort(GtkAdjustment* adj_effort);
+  static bool GuiOnChangeLossless(GtkWidget* toggle, void* this_pointer);
+  static bool GuiOnChangeCodestream(GtkWidget* toggle);
+  static bool GuiOnChangeNoXYB(GtkWidget* toggle);
+
+  static bool GuiOnChangeAdvancedMode(GtkWidget* toggle, void* this_pointer);
+};  // class JpegXlSaveGui
+
+JpegXlSaveGui jxl_save_gui;
+
+bool JpegXlSaveGui::GuiOnChangeQuality(GtkAdjustment* adj_qual,
+                                       void* this_pointer) {
+  JpegXlSaveGui* self = static_cast<JpegXlSaveGui*>(this_pointer);
+
+  g_clear_signal_handler(&self->handle_entry_distance, self->entry_distance);
+  g_clear_signal_handler(&self->handle_entry_quality, self->entry_quality);
+  g_clear_signal_handler(&self->handle_toggle_lossless, self->toggle_lossless);
+
+  GtkAdjustment* adj_dist = self->entry_distance;
+  jxl_save_opts.SetQuality(gtk_adjustment_get_value(adj_qual));
+  gtk_adjustment_set_value(adj_dist, jxl_save_opts.distance);
+
+  self->handle_toggle_lossless = g_signal_connect(
+      self->toggle_lossless, "toggled", G_CALLBACK(GuiOnChangeLossless), self);
+  self->handle_entry_distance =
+      g_signal_connect(self->entry_distance, "value-changed",
+                       G_CALLBACK(GuiOnChangeDistance), self);
+  self->handle_entry_quality =
+      g_signal_connect(self->entry_quality, "value-changed",
+                       G_CALLBACK(GuiOnChangeQuality), self);
+  return true;
+}
+
+bool JpegXlSaveGui::GuiOnChangeDistance(GtkAdjustment* adj_dist,
+                                        void* this_pointer) {
+  JpegXlSaveGui* self = static_cast<JpegXlSaveGui*>(this_pointer);
+  GtkAdjustment* adj_qual = self->entry_quality;
+
+  g_clear_signal_handler(&self->handle_entry_distance, self->entry_distance);
+  g_clear_signal_handler(&self->handle_entry_quality, self->entry_quality);
+  g_clear_signal_handler(&self->handle_toggle_lossless, self->toggle_lossless);
+
+  jxl_save_opts.SetDistance(gtk_adjustment_get_value(adj_dist));
+  gtk_adjustment_set_value(adj_qual, jxl_save_opts.quality);
+
+  if (!(jxl_save_opts.distance < 0.001)) {
+    gtk_toggle_button_set_active(GTK_TOGGLE_BUTTON(self->toggle_lossless),
+                                 false);
+    gtk_toggle_button_set_active(GTK_TOGGLE_BUTTON(self->toggle_no_xyb), false);
+  }
+
+  self->handle_toggle_lossless = g_signal_connect(
+      self->toggle_lossless, "toggled", G_CALLBACK(GuiOnChangeLossless), self);
+  self->handle_entry_distance =
+      g_signal_connect(self->entry_distance, "value-changed",
+                       G_CALLBACK(GuiOnChangeDistance), self);
+  self->handle_entry_quality =
+      g_signal_connect(self->entry_quality, "value-changed",
+                       G_CALLBACK(GuiOnChangeQuality), self);
+  return true;
+}
+
+bool JpegXlSaveGui::GuiOnChangeEffort(GtkAdjustment* adj_effort) {
+  float new_effort = 10 - gtk_adjustment_get_value(adj_effort);
+  jxl_save_opts.encoding_effort = new_effort;
+  return true;
+}
+
+bool JpegXlSaveGui::GuiOnChangeLossless(GtkWidget* toggle, void* this_pointer) {
+  JpegXlSaveGui* self = static_cast<JpegXlSaveGui*>(this_pointer);
+  GtkAdjustment* adj_distance = self->entry_distance;
+  GtkAdjustment* adj_quality = self->entry_quality;
+  GtkAdjustment* adj_effort = self->entry_effort;
+
+  jxl_save_opts.lossless =
+      gtk_toggle_button_get_active(GTK_TOGGLE_BUTTON(toggle));
+
+  g_clear_signal_handler(&self->handle_entry_distance, self->entry_distance);
+  g_clear_signal_handler(&self->handle_entry_quality, self->entry_quality);
+  g_clear_signal_handler(&self->handle_toggle_lossless, self->toggle_lossless);
+
+  if (jxl_save_opts.lossless) {
+    gtk_adjustment_set_value(adj_quality, 100.0);
+    gtk_adjustment_set_value(adj_distance, 0.0);
+    jxl_save_opts.distance = 0;
+    jxl_save_opts.UpdateQuality();
+    gtk_adjustment_set_value(adj_effort, 7);
+    gtk_toggle_button_set_active(GTK_TOGGLE_BUTTON(self->toggle_no_xyb), true);
+  } else {
+    gtk_adjustment_set_value(adj_quality, 90.0);
+    gtk_adjustment_set_value(adj_distance, 1.0);
+    jxl_save_opts.distance = 1.0;
+    jxl_save_opts.UpdateQuality();
+    gtk_adjustment_set_value(adj_effort, 3);
+    gtk_toggle_button_set_active(GTK_TOGGLE_BUTTON(self->toggle_no_xyb), false);
+  }
+  self->handle_toggle_lossless = g_signal_connect(
+      self->toggle_lossless, "toggled", G_CALLBACK(GuiOnChangeLossless), self);
+  self->handle_entry_distance =
+      g_signal_connect(self->entry_distance, "value-changed",
+                       G_CALLBACK(GuiOnChangeDistance), self);
+  self->handle_entry_quality =
+      g_signal_connect(self->entry_quality, "value-changed",
+                       G_CALLBACK(GuiOnChangeQuality), self);
+  return true;
+}
+
+bool JpegXlSaveGui::GuiOnChangeCodestream(GtkWidget* toggle) {
+  jxl_save_opts.use_container =
+      !gtk_toggle_button_get_active(GTK_TOGGLE_BUTTON(toggle));
+  return true;
+}
+
+bool JpegXlSaveGui::GuiOnChangeNoXYB(GtkWidget* toggle) {
+  jxl_save_opts.basic_info.uses_original_profile =
+      gtk_toggle_button_get_active(GTK_TOGGLE_BUTTON(toggle));
+  return true;
+}
+
+bool JpegXlSaveGui::GuiOnChangeAdvancedMode(GtkWidget* toggle,
+                                            void* this_pointer) {
+  JpegXlSaveGui* self = static_cast<JpegXlSaveGui*>(this_pointer);
+  jxl_save_opts.advanced_mode =
+      gtk_toggle_button_get_active(GTK_TOGGLE_BUTTON(toggle));
+
+  gtk_widget_set_sensitive(self->frame_advanced, jxl_save_opts.advanced_mode);
+
+  if (!jxl_save_opts.advanced_mode) {
+    jxl_save_opts.basic_info.uses_original_profile = false;
+    gtk_toggle_button_set_active(GTK_TOGGLE_BUTTON(self->toggle_no_xyb), false);
+
+    jxl_save_opts.use_container = true;
+    gtk_toggle_button_set_active(GTK_TOGGLE_BUTTON(self->toggle_raw), false);
+
+    jxl_save_opts.faster_decoding = 0;
+    gtk_adjustment_set_value(GTK_ADJUSTMENT(self->entry_faster), 0);
+  }
+  return true;
+}
+
+bool JpegXlSaveGui::SaveDialog() {
+  gboolean run;
+  GtkWidget* dialog;
+  GtkWidget* content_area;
+  GtkWidget* main_vbox;
+  GtkWidget* frame;
+  GtkWidget* toggle;
+  GtkWidget* table;
+  GtkWidget* vbox;
+  GtkWidget* separator;
+
+  // initialize export dialog
+  gimp_ui_init(PLUG_IN_BINARY, true);
+  dialog = gimp_export_dialog_new("JPEG XL", PLUG_IN_BINARY, SAVE_PROC);
+
+  gtk_window_set_resizable(GTK_WINDOW(dialog), false);
+  content_area = gimp_export_dialog_get_content_area(dialog);
+
+  main_vbox = gtk_vbox_new(false, 6);
+  gtk_container_set_border_width(GTK_CONTAINER(main_vbox), 6);
+  gtk_box_pack_start(GTK_BOX(content_area), main_vbox, true, true, 0);
+  gtk_widget_show(main_vbox);
+
+  // Standard Settings Frame
+  frame = gtk_frame_new(nullptr);
+  gtk_frame_set_shadow_type(GTK_FRAME(frame), GTK_SHADOW_ETCHED_IN);
+  gtk_box_pack_start(GTK_BOX(main_vbox), frame, false, false, 0);
+  gtk_widget_show(frame);
+
+  vbox = gtk_vbox_new(false, 6);
+  gtk_container_set_border_width(GTK_CONTAINER(vbox), 6);
+  gtk_container_add(GTK_CONTAINER(frame), vbox);
+  gtk_widget_show(vbox);
+
+  // Layout Table
+  table = gtk_table_new(20, 3, false);
+  gtk_table_set_col_spacings(GTK_TABLE(table), 6);
+  gtk_box_pack_start(GTK_BOX(vbox), table, false, false, 0);
+  gtk_widget_show(table);
+
+  // Distance Slider
+  static gchar distance_help[] =
+      "Butteraugli distance target.  Suggested values:"
+      "\n\td\u00A0=\u00A00.3\tExcellent"
+      "\n\td\u00A0=\u00A01\tVery Good"
+      "\n\td\u00A0=\u00A02\tGood"
+      "\n\td\u00A0=\u00A03\tFair"
+      "\n\td\u00A0=\u00A06\tPoor";
+
+  entry_distance = (GtkAdjustment*)gimp_scale_entry_new(
+      GTK_TABLE(table), 0, 0, "Distance", SCALE_WIDTH, 0,
+      jxl_save_opts.distance, 0.0, 15.0, 0.001, 1.0, 3, true, 0.0, 0.0,
+      distance_help, SAVE_PROC);
+  gimp_scale_entry_set_logarithmic((GtkObject*)entry_distance, true);
+
+  // Quality Slider
+  static gchar quality_help[] =
+      "JPEG-style Quality is remapped to distance.  "
+      "Values roughly match libjpeg quality settings.";
+  entry_quality = (GtkAdjustment*)gimp_scale_entry_new(
+      GTK_TABLE(table), 0, 1, "Quality", SCALE_WIDTH, 0, jxl_save_opts.quality,
+      8.26, 100.0, 1.0, 10.0, 2, true, 0.0, 0.0, quality_help, SAVE_PROC);
+
+  // Distance and Quality Signals
+  handle_entry_distance = g_signal_connect(
+      entry_distance, "value-changed", G_CALLBACK(GuiOnChangeDistance), this);
+  handle_entry_quality = g_signal_connect(entry_quality, "value-changed",
+                                          G_CALLBACK(GuiOnChangeQuality), this);
+
+  // ----------
+  separator = gtk_vseparator_new();
+  gtk_table_attach(GTK_TABLE(table), separator, 0, 2, 2, 3, GTK_EXPAND,
+                   GTK_EXPAND, 9, 9);
+  gtk_widget_show(separator);
+
+  // Encoding Effort / Speed
+  static gchar effort_help[] =
+      "Adjust encoding speed.  Higher values are faster because "
+      "the encoder uses less effort to hit distance targets.  "
+      "As\u00A0a\u00A0result, image quality may be decreased.  "
+      "Default\u00A0=\u00A03.";
+  entry_effort = (GtkAdjustment*)gimp_scale_entry_new(
+      GTK_TABLE(table), 0, 3, "Speed", SCALE_WIDTH, 0,
+      10 - jxl_save_opts.encoding_effort, 1, 9, 1, 2, 0, true, 0.0, 0.0,
+      effort_help, SAVE_PROC);
+
+  // effort signal
+  g_signal_connect(entry_effort, "value-changed", G_CALLBACK(GuiOnChangeEffort),
+                   nullptr);
+
+  // ----------
+  separator = gtk_vseparator_new();
+  gtk_table_attach(GTK_TABLE(table), separator, 0, 2, 4, 5, GTK_EXPAND,
+                   GTK_EXPAND, 9, 9);
+  gtk_widget_show(separator);
+
+  // Lossless Mode Convenience Checkbox
+  static gchar lossless_help[] =
+      "Compress using modular lossless mode.  "
+      "Speed\u00A0is adjusted to improve performance.";
+  toggle_lossless = gtk_check_button_new_with_label("Lossless Mode");
+  gimp_help_set_help_data(toggle_lossless, lossless_help, nullptr);
+  gtk_toggle_button_set_active(GTK_TOGGLE_BUTTON(toggle_lossless),
+                               jxl_save_opts.lossless);
+  gtk_table_attach_defaults(GTK_TABLE(table), toggle_lossless, 0, 2, 5, 6);
+  gtk_widget_show(toggle_lossless);
+
+  // lossless signal
+  handle_toggle_lossless = g_signal_connect(
+      toggle_lossless, "toggled", G_CALLBACK(GuiOnChangeLossless), this);
+
+  // ----------
+  separator = gtk_vseparator_new();
+  gtk_box_pack_start(GTK_BOX(main_vbox), separator, false, false, 1);
+  gtk_widget_show(separator);
+
+  // Advanced Settings Frame
+  std::vector<GtkWidget*> advanced_opts;
+
+  frame_advanced = gtk_frame_new("Advanced Settings");
+  gimp_help_set_help_data(frame_advanced,
+                          "Some advanced settings may produce malformed files.",
+                          nullptr);
+  gtk_frame_set_shadow_type(GTK_FRAME(frame_advanced), GTK_SHADOW_ETCHED_IN);
+  gtk_box_pack_start(GTK_BOX(main_vbox), frame_advanced, true, true, 0);
+  gtk_widget_show(frame_advanced);
+
+  gtk_widget_set_sensitive(frame_advanced, false);
+
+  vbox = gtk_vbox_new(false, 6);
+  gtk_container_set_border_width(GTK_CONTAINER(vbox), 6);
+  gtk_container_add(GTK_CONTAINER(frame_advanced), vbox);
+  gtk_widget_show(vbox);
+
+  // uses_original_profile
+  static gchar uses_original_profile_help[] =
+      "Prevents conversion to the XYB colorspace.  "
+      "File sizes are approximately doubled.";
+  toggle_no_xyb = gtk_check_button_new_with_label("Do not use XYB colorspace");
+  gimp_help_set_help_data(toggle_no_xyb, uses_original_profile_help, nullptr);
+  gtk_toggle_button_set_active(GTK_TOGGLE_BUTTON(toggle_no_xyb),
+                               jxl_save_opts.basic_info.uses_original_profile);
+  gtk_box_pack_start(GTK_BOX(vbox), toggle_no_xyb, false, false, 0);
+  gtk_widget_show(toggle_no_xyb);
+
+  g_signal_connect(toggle_no_xyb, "toggled", G_CALLBACK(GuiOnChangeNoXYB),
+                   nullptr);
+
+  // save raw codestream
+  static gchar codestream_help[] =
+      "Save the raw codestream, without a container.  "
+      "The container is required for metadata and some other features.";
+  toggle_raw = gtk_check_button_new_with_label("Save Raw Codestream");
+  gimp_help_set_help_data(toggle_raw, codestream_help, nullptr);
+  gtk_toggle_button_set_active(GTK_TOGGLE_BUTTON(toggle_raw),
+                               !jxl_save_opts.use_container);
+  gtk_box_pack_start(GTK_BOX(vbox), toggle_raw, false, false, 0);
+  gtk_widget_show(toggle_raw);
+
+  g_signal_connect(toggle_raw, "toggled", G_CALLBACK(GuiOnChangeCodestream),
+                   nullptr);
+
+  // ----------
+  separator = gtk_vseparator_new();
+  gtk_box_pack_start(GTK_BOX(vbox), separator, false, false, 1);
+  gtk_widget_show(separator);
+
+  // Faster Decoding / Decoding Speed
+  static gchar faster_help[] =
+      "Improve decoding speed at the expense of quality.  "
+      "Default\u00A0=\u00A00.";
+  table = gtk_table_new(1, 3, false);
+  gtk_table_set_col_spacings(GTK_TABLE(table), 6);
+  gtk_container_add(GTK_CONTAINER(vbox), table);
+  gtk_widget_show(table);
+
+  entry_faster = (GtkAdjustment*)gimp_scale_entry_new(
+      GTK_TABLE(table), 0, 0, "Faster Decoding", SCALE_WIDTH, 0,
+      jxl_save_opts.faster_decoding, 0, 4, 1, 1, 0, true, 0.0, 0.0, faster_help,
+      SAVE_PROC);
+
+  // Faster Decoding Signals
+  g_signal_connect(entry_faster, "value-changed",
+                   G_CALLBACK(gimp_int_adjustment_update),
+                   &jxl_save_opts.faster_decoding);
+
+  // Enable Advanced Settings
+  frame = gtk_frame_new(nullptr);
+  gtk_frame_set_shadow_type(GTK_FRAME(frame), GTK_SHADOW_NONE);
+  gtk_box_pack_start(GTK_BOX(main_vbox), frame, true, true, 0);
+  gtk_widget_show(frame);
+
+  vbox = gtk_vbox_new(false, 6);
+  gtk_container_set_border_width(GTK_CONTAINER(vbox), 6);
+  gtk_container_add(GTK_CONTAINER(frame), vbox);
+  gtk_widget_show(vbox);
+
+  static gchar advanced_help[] =
+      "Some advanced settings may produce malformed files.";
+  toggle = gtk_check_button_new_with_label("Enable Advanced Settings");
+  gimp_help_set_help_data(toggle, advanced_help, nullptr);
+  gtk_toggle_button_set_active(GTK_TOGGLE_BUTTON(toggle),
+                               jxl_save_opts.advanced_mode);
+  gtk_box_pack_start(GTK_BOX(vbox), toggle, false, false, 0);
+  gtk_widget_show(toggle);
+
+  g_signal_connect(toggle, "toggled", G_CALLBACK(GuiOnChangeAdvancedMode),
+                   this);
+
+  // show dialog
+  gtk_widget_show(dialog);
+
+  GtkAllocation allocation;
+  gtk_widget_get_allocation(dialog, &allocation);
+
+  int height = allocation.height;
+  gtk_widget_set_size_request(dialog, height * 1.5, height);
+
+  run = (gimp_dialog_run(GIMP_DIALOG(dialog)) == GTK_RESPONSE_OK);
+  gtk_widget_destroy(dialog);
+
+  return run;
+}  // JpegXlSaveGui::SaveDialog
+
+JpegXlSaveOpts::JpegXlSaveOpts() {
+  SetDistance(1.0);
+
+  pixel_format.num_channels = 4;
+  pixel_format.data_type = JXL_TYPE_FLOAT;
+  pixel_format.endianness = JXL_NATIVE_ENDIAN;
+  pixel_format.align = 0;
+
+  JxlEncoderInitBasicInfo(&basic_info);
+  return;
+}  // JpegXlSaveOpts constructor
+
+bool JpegXlSaveOpts::SetModel(bool is_linear_) {
+  int channels;
+  std::string model;
+
+  if (is_gray) {
+    channels = 1;
+    if (is_linear_) {
+      model = "Y";
+    } else {
+      model = "Y'";
+    }
+  } else {
+    channels = 3;
+    if (is_linear_) {
+      model = "RGB";
+    } else {
+      model = "R'G'B'";
+    }
+  }
+  if (has_alpha) {
+    SetBablModel(model + "A");
+    SetNumChannels(channels + 1);
+  } else {
+    SetBablModel(model);
+    SetNumChannels(channels);
+  }
+  return true;
+}  // JpegXlSaveOpts::SetModel
+
+bool JpegXlSaveOpts::SetDistance(float dist) {
+  distance = dist;
+  return UpdateQuality();
+}
+
+bool JpegXlSaveOpts::SetQuality(float qual) {
+  quality = qual;
+  return UpdateDistance();
+}
+
+bool JpegXlSaveOpts::UpdateQuality() {
+  float qual;
+
+  if (distance < 0.1) {
+    qual = 100;
+  } else if (distance > 6.4) {
+    qual = -5.0 / 53.0 * sqrt(6360.0 * distance - 39975.0) + 1725.0 / 53.0;
+    lossless = false;
+  } else {
+    qual = 100 - (distance - 0.1) / 0.09;
+    lossless = false;
+  }
+
+  if (qual < 0) {
+    quality = 0.0;
+  } else if (qual >= 100) {
+    quality = 100.0;
+  } else {
+    quality = qual;
+  }
+
+  return true;
+}
+
+bool JpegXlSaveOpts::UpdateDistance() {
+  float dist;
+  if (quality >= 30) {
+    dist = 0.1 + (100 - quality) * 0.09;
+  } else {
+    dist = 53.0 / 3000.0 * quality * quality - 23.0 / 20.0 * quality + 25.0;
+  }
+
+  if (dist > 25) {
+    distance = 25;
+  } else {
+    distance = dist;
+  }
+  return true;
+}
+
+bool JpegXlSaveOpts::SetDimensions(int x, int y) {
+  basic_info.xsize = x;
+  basic_info.ysize = y;
+  return true;
+}
+
+bool JpegXlSaveOpts::SetNumChannels(int channels) {
+  switch (channels) {
+    case 1:
+      pixel_format.num_channels = 1;
+      basic_info.num_color_channels = 1;
+      basic_info.num_extra_channels = 0;
+      basic_info.alpha_bits = 0;
+      basic_info.alpha_exponent_bits = 0;
+      break;
+    case 2:
+      pixel_format.num_channels = 2;
+      basic_info.num_color_channels = 1;
+      basic_info.num_extra_channels = 1;
+      basic_info.alpha_bits = int(std::fmin(16, basic_info.bits_per_sample));
+      basic_info.alpha_exponent_bits = 0;
+      break;
+    case 3:
+      pixel_format.num_channels = 3;
+      basic_info.num_color_channels = 3;
+      basic_info.num_extra_channels = 0;
+      basic_info.alpha_bits = 0;
+      basic_info.alpha_exponent_bits = 0;
+      break;
+    case 4:
+      pixel_format.num_channels = 4;
+      basic_info.num_color_channels = 3;
+      basic_info.num_extra_channels = 1;
+      basic_info.alpha_bits = int(std::fmin(16, basic_info.bits_per_sample));
+      basic_info.alpha_exponent_bits = 0;
+      break;
+    default:
+      SetNumChannels(3);
+  }  // switch
+  return true;
+}  // JpegXlSaveOpts::SetNumChannels
+
+bool JpegXlSaveOpts::UpdateBablFormat() {
+  babl_format_str = babl_model_str + " " + babl_type_str;
+  return true;
+}
+
+bool JpegXlSaveOpts::SetBablModel(std::string model) {
+  babl_model_str = std::move(model);
+  return UpdateBablFormat();
+}
+
+bool JpegXlSaveOpts::SetBablType(std::string type) {
+  babl_type_str = std::move(type);
+  return UpdateBablFormat();
+}
+
+bool JpegXlSaveOpts::SetPrecision(int gimp_precision) {
+  switch (gimp_precision) {
+    case GIMP_PRECISION_HALF_GAMMA:
+    case GIMP_PRECISION_HALF_LINEAR:
+      basic_info.bits_per_sample = 16;
+      basic_info.exponent_bits_per_sample = 5;
+      break;
+
+    // UINT32 not supported by encoder; using FLOAT instead
+    case GIMP_PRECISION_U32_GAMMA:
+    case GIMP_PRECISION_U32_LINEAR:
+    case GIMP_PRECISION_FLOAT_GAMMA:
+    case GIMP_PRECISION_FLOAT_LINEAR:
+      basic_info.bits_per_sample = 32;
+      basic_info.exponent_bits_per_sample = 8;
+      break;
+
+    case GIMP_PRECISION_U16_GAMMA:
+    case GIMP_PRECISION_U16_LINEAR:
+      basic_info.bits_per_sample = 16;
+      basic_info.exponent_bits_per_sample = 0;
+      break;
+
+    default:
+    case GIMP_PRECISION_U8_LINEAR:
+    case GIMP_PRECISION_U8_GAMMA:
+      basic_info.bits_per_sample = 8;
+      basic_info.exponent_bits_per_sample = 0;
+      break;
+  }
+  return true;
+}  // JpegXlSaveOpts::SetPrecision
+
+}  // namespace
+
+bool SaveJpegXlImage(const gint32 image_id, const gint32 drawable_id,
+                     const gint32 orig_image_id, const gchar* const filename) {
+  if (!jxl_save_gui.SaveDialog()) {
+    return true;
+  }
+
+  gint32 nlayers;
+  gint32* layers;
+  gint32 duplicate = gimp_image_duplicate(image_id);
+
+  JpegXlGimpProgress gimp_save_progress(
+      ("Saving JPEG XL file:" + std::string(filename)).c_str());
+  gimp_save_progress.update();
+
+  // try to get ICC color profile...
+  std::vector<uint8_t> icc;
+
+  GimpColorProfile* profile = gimp_image_get_effective_color_profile(image_id);
+  jxl_save_opts.is_gray = gimp_color_profile_is_gray(profile);
+  jxl_save_opts.is_linear = gimp_color_profile_is_linear(profile);
+
+  profile = gimp_image_get_color_profile(image_id);
+  if (profile) {
+    g_printerr(SAVE_PROC " Info: Extracting ICC Profile...\n");
+    gsize icc_size;
+    const guint8* const icc_bytes =
+        gimp_color_profile_get_icc_profile(profile, &icc_size);
+
+    icc.assign(icc_bytes, icc_bytes + icc_size);
+  } else {
+    g_printerr(SAVE_PROC " Info: No ICC profile.  Exporting image anyway.\n");
+  }
+
+  gimp_save_progress.update();
+
+  jxl_save_opts.SetDimensions(gimp_image_width(image_id),
+                              gimp_image_height(image_id));
+
+  jxl_save_opts.SetPrecision(gimp_image_get_precision(image_id));
+  layers = gimp_image_get_layers(duplicate, &nlayers);
+
+  for (int i = 0; i < nlayers; i++) {
+    if (gimp_drawable_has_alpha(layers[i])) {
+      jxl_save_opts.has_alpha = true;
+      break;
+    }
+  }
+
+  gimp_save_progress.update();
+
+  // layers need to match image size, for now
+  for (int i = 0; i < nlayers; i++) {
+    gimp_layer_resize_to_image_size(layers[i]);
+  }
+
+  // treat layers as animation frames, for now
+  if (nlayers > 1) {
+    jxl_save_opts.basic_info.have_animation = true;
+    jxl_save_opts.basic_info.animation.tps_numerator = 100;
+  }
+
+  gimp_save_progress.update();
+
+  // multi-threaded parallel runner.
+  auto runner = JxlResizableParallelRunnerMake(nullptr);
+
+  JxlResizableParallelRunnerSetThreads(
+      runner.get(),
+      JxlResizableParallelRunnerSuggestThreads(jxl_save_opts.basic_info.xsize,
+                                               jxl_save_opts.basic_info.ysize));
+
+  auto enc = JxlEncoderMake(/*memory_manager=*/nullptr);
+  JxlEncoderUseContainer(enc.get(), jxl_save_opts.use_container);
+
+  if (JXL_ENC_SUCCESS != JxlEncoderSetParallelRunner(enc.get(),
+                                                     JxlResizableParallelRunner,
+                                                     runner.get())) {
+    g_printerr(SAVE_PROC " Error: JxlEncoderSetParallelRunner failed\n");
+    return false;
+  }
+
+  // this sets some basic_info properties
+  jxl_save_opts.SetModel(jxl_save_opts.is_linear);
+
+  if (JXL_ENC_SUCCESS !=
+      JxlEncoderSetBasicInfo(enc.get(), &jxl_save_opts.basic_info)) {
+    g_printerr(SAVE_PROC " Error: JxlEncoderSetBasicInfo failed\n");
+    return false;
+  }
+
+  // try to use ICC profile
+  if (!icc.empty() && !jxl_save_opts.is_gray) {
+    if (JXL_ENC_SUCCESS ==
+        JxlEncoderSetICCProfile(enc.get(), icc.data(), icc.size())) {
+      jxl_save_opts.icc_attached = true;
+    } else {
+      g_printerr(SAVE_PROC " Warning: JxlEncoderSetICCProfile failed.\n");
+      jxl_save_opts.basic_info.uses_original_profile = false;
+      jxl_save_opts.lossless = false;
+    }
+  } else {
+    g_printerr(SAVE_PROC " Warning: Using internal profile.\n");
+    jxl_save_opts.basic_info.uses_original_profile = false;
+    jxl_save_opts.lossless = false;
+  }
+
+  // set up internal color profile
+  JxlColorEncoding color_encoding = {};
+
+  if (jxl_save_opts.is_linear) {
+    JxlColorEncodingSetToLinearSRGB(&color_encoding, jxl_save_opts.is_gray);
+  } else {
+    JxlColorEncodingSetToSRGB(&color_encoding, jxl_save_opts.is_gray);
+  }
+
+  if (JXL_ENC_SUCCESS !=
+      JxlEncoderSetColorEncoding(enc.get(), &color_encoding)) {
+    g_printerr(SAVE_PROC " Warning: JxlEncoderSetColorEncoding failed\n");
+  }
+
+  // set encoder options
+  JxlEncoderFrameSettings* frame_settings;
+  frame_settings = JxlEncoderFrameSettingsCreate(enc.get(), nullptr);
+
+  JxlEncoderFrameSettingsSetOption(frame_settings, JXL_ENC_FRAME_SETTING_EFFORT,
+                                   jxl_save_opts.encoding_effort);
+  JxlEncoderFrameSettingsSetOption(frame_settings,
+                                   JXL_ENC_FRAME_SETTING_DECODING_SPEED,
+                                   jxl_save_opts.faster_decoding);
+
+  // lossless mode
+  if (jxl_save_opts.lossless || jxl_save_opts.distance < 0.01) {
+    if (jxl_save_opts.basic_info.exponent_bits_per_sample > 0) {
+      // lossless mode doesn't work well with floating point
+      jxl_save_opts.distance = 0.01;
+      jxl_save_opts.lossless = false;
+      JxlEncoderSetFrameLossless(frame_settings, false);
+      JxlEncoderSetFrameDistance(frame_settings, 0.01);
+    } else {
+      JxlEncoderSetFrameDistance(frame_settings, 0);
+      JxlEncoderSetFrameLossless(frame_settings, true);
+    }
+  } else {
+    jxl_save_opts.lossless = false;
+    JxlEncoderSetFrameLossless(frame_settings, false);
+    JxlEncoderSetFrameDistance(frame_settings, jxl_save_opts.distance);
+  }
+
+  // convert precision and colorspace
+  if (jxl_save_opts.is_linear &&
+      jxl_save_opts.basic_info.bits_per_sample < 32) {
+    gimp_image_convert_precision(duplicate, GIMP_PRECISION_FLOAT_LINEAR);
+  } else {
+    gimp_image_convert_precision(duplicate, GIMP_PRECISION_FLOAT_GAMMA);
+  }
+
+  // process layers and compress into JXL
+  size_t buffer_size =
+      jxl_save_opts.basic_info.xsize * jxl_save_opts.basic_info.ysize *
+      jxl_save_opts.pixel_format.num_channels * 4;  // bytes per sample
+
+  for (int i = nlayers - 1; i >= 0; i--) {
+    gimp_save_progress.update();
+
+    // copy image into buffer...
+    gpointer pixels_buffer_1;
+    gpointer pixels_buffer_2;
+    pixels_buffer_1 = g_malloc(buffer_size);
+    pixels_buffer_2 = g_malloc(buffer_size);
+
+    gimp_layer_resize_to_image_size(layers[i]);
+
+    GeglBuffer* buffer = gimp_drawable_get_buffer(layers[i]);
+
+    // using gegl_buffer_set_format to get the format because
+    // gegl_buffer_get_format doesn't always get the original format
+    const Babl* native_format = gegl_buffer_set_format(buffer, nullptr);
+
+    gegl_buffer_get(buffer,
+                    GEGL_RECTANGLE(0, 0, jxl_save_opts.basic_info.xsize,
+                                   jxl_save_opts.basic_info.ysize),
+                    1.0, native_format, pixels_buffer_1, GEGL_AUTO_ROWSTRIDE,
+                    GEGL_ABYSS_NONE);
+    g_clear_object(&buffer);
+
+    // use babl to fix gamma mismatch issues
+    jxl_save_opts.SetModel(jxl_save_opts.is_linear);
+    jxl_save_opts.pixel_format.data_type = JXL_TYPE_FLOAT;
+    jxl_save_opts.SetBablType("float");
+    const Babl* destination_format =
+        babl_format(jxl_save_opts.babl_format_str.c_str());
+
+    babl_process(
+        babl_fish(native_format, destination_format), pixels_buffer_1,
+        pixels_buffer_2,
+        jxl_save_opts.basic_info.xsize * jxl_save_opts.basic_info.ysize);
+
+    gimp_save_progress.update();
+
+    // send layer to encoder
+    if (JXL_ENC_SUCCESS !=
+        JxlEncoderAddImageFrame(frame_settings, &jxl_save_opts.pixel_format,
+                                pixels_buffer_2, buffer_size)) {
+      g_printerr(SAVE_PROC " Error: JxlEncoderAddImageFrame failed\n");
+      return false;
+    }
+  }
+
+  JxlEncoderCloseInput(enc.get());
+
+  // get data from encoder
+  std::vector<uint8_t> compressed;
+  compressed.resize(262144);
+  uint8_t* next_out = compressed.data();
+  size_t avail_out = compressed.size();
+
+  JxlEncoderStatus process_result = JXL_ENC_NEED_MORE_OUTPUT;
+  while (process_result == JXL_ENC_NEED_MORE_OUTPUT) {
+    gimp_save_progress.update();
+
+    process_result = JxlEncoderProcessOutput(enc.get(), &next_out, &avail_out);
+    if (process_result == JXL_ENC_NEED_MORE_OUTPUT) {
+      size_t offset = next_out - compressed.data();
+      compressed.resize(compressed.size() + 262144);
+      next_out = compressed.data() + offset;
+      avail_out = compressed.size() - offset;
+    }
+  }
+  compressed.resize(next_out - compressed.data());
+
+  if (JXL_ENC_SUCCESS != process_result) {
+    g_printerr(SAVE_PROC " Error: JxlEncoderProcessOutput failed\n");
+    return false;
+  }
+
+  // write file
+  std::ofstream outstream(filename, std::ios::out | std::ios::binary);
+  copy(compressed.begin(), compressed.end(),
+       std::ostream_iterator<uint8_t>(outstream));
+
+  gimp_save_progress.finished();
+  return true;
+}  // SaveJpegXlImage()
+
+}  // namespace jxl
diff --git a/third_party/jpeg-xl/plugins/gimp/file-jxl-save.h b/third_party/jpeg-xl/plugins/gimp/file-jxl-save.h
new file mode 100644
index 0000000000..c9d0e8091f
--- /dev/null
+++ b/third_party/jpeg-xl/plugins/gimp/file-jxl-save.h
@@ -0,0 +1,18 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef PLUGINS_GIMP_FILE_JXL_SAVE_H_
+#define PLUGINS_GIMP_FILE_JXL_SAVE_H_
+
+#include "plugins/gimp/common.h"
+
+namespace jxl {
+
+bool SaveJpegXlImage(gint32 image_id, gint32 drawable_id, gint32 orig_image_id,
+                     const gchar* filename);
+
+}  // namespace jxl
+
+#endif  // PLUGINS_GIMP_FILE_JXL_SAVE_H_
diff --git a/third_party/jpeg-xl/plugins/gimp/file-jxl.cc b/third_party/jpeg-xl/plugins/gimp/file-jxl.cc
new file mode 100644
index 0000000000..743495a2e0
--- /dev/null
+++ b/third_party/jpeg-xl/plugins/gimp/file-jxl.cc
@@ -0,0 +1,157 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <string.h>
+
+#include <string>
+
+#include "plugins/gimp/common.h"
+#include "plugins/gimp/file-jxl-load.h"
+#include "plugins/gimp/file-jxl-save.h"
+
+namespace jxl {
+namespace {
+
+constexpr char kLoadProc[] = "file-jxl-load";
+constexpr char kSaveProc[] = "file-jxl-save";
+
+void Query() {
+  {
+    static char run_mode_name[] = "run-mode";
+    static char run_mode_description[] = "Run mode";
+    static char filename_name[] = "filename";
+    static char filename_description[] = "The name of the file to load";
+    static char raw_filename_name[] = "raw-filename";
+    static char raw_filename_description[] =
+        "The name of the file, as entered by the user";
+    static const GimpParamDef load_args[] = {
+        {GIMP_PDB_INT32, run_mode_name, run_mode_description},
+        {GIMP_PDB_STRING, filename_name, filename_description},
+        {GIMP_PDB_STRING, raw_filename_name, raw_filename_description},
+    };
+    static char image_name[] = "image";
+    static char image_description[] = "Loaded image";
+    static const GimpParamDef load_return_vals[] = {
+        {GIMP_PDB_IMAGE, image_name, image_description},
+    };
+
+    gimp_install_procedure(
+        /*name=*/kLoadProc, /*blurb=*/"Loads JPEG XL image files",
+        /*help=*/"Loads JPEG XL image files", /*author=*/"JPEG XL Project",
+        /*copyright=*/"JPEG XL Project", /*date=*/"2019",
+        /*menu_label=*/"JPEG XL image", /*image_types=*/nullptr,
+        /*type=*/GIMP_PLUGIN, /*n_params=*/G_N_ELEMENTS(load_args),
+        /*n_return_vals=*/G_N_ELEMENTS(load_return_vals), /*params=*/load_args,
+        /*return_vals=*/load_return_vals);
+    gimp_register_file_handler_mime(kLoadProc, "image/jxl");
+    gimp_register_magic_load_handler(
+        kLoadProc, "jxl", "",
+        "0,string,\xFF\x0A,"
+        "0,string,\\000\\000\\000\x0CJXL\\040\\015\\012\x87\\012");
+  }
+
+  {
+    static char run_mode_name[] = "run-mode";
+    static char run_mode_description[] = "Run mode";
+    static char image_name[] = "image";
+    static char image_description[] = "Input image";
+    static char drawable_name[] = "drawable";
+    static char drawable_description[] = "Drawable to save";
+    static char filename_name[] = "filename";
+    static char filename_description[] = "The name of the file to save";
+    static char raw_filename_name[] = "raw-filename";
+    static char raw_filename_description[] = "The name of the file to save";
+    static const GimpParamDef save_args[] = {
+        {GIMP_PDB_INT32, run_mode_name, run_mode_description},
+        {GIMP_PDB_IMAGE, image_name, image_description},
+        {GIMP_PDB_DRAWABLE, drawable_name, drawable_description},
+        {GIMP_PDB_STRING, filename_name, filename_description},
+        {GIMP_PDB_STRING, raw_filename_name, raw_filename_description},
+    };
+
+    gimp_install_procedure(
+        /*name=*/kSaveProc, /*blurb=*/"Saves JPEG XL image files",
+        /*help=*/"Saves JPEG XL image files", /*author=*/"JPEG XL Project",
+        /*copyright=*/"JPEG XL Project", /*date=*/"2019",
+        /*menu_label=*/"JPEG XL image", /*image_types=*/"RGB*, GRAY*",
+        /*type=*/GIMP_PLUGIN, /*n_params=*/G_N_ELEMENTS(save_args),
+        /*n_return_vals=*/0, /*params=*/save_args,
+        /*return_vals=*/nullptr);
+    gimp_register_file_handler_mime(kSaveProc, "image/jxl");
+    gimp_register_save_handler(kSaveProc, "jxl", "");
+  }
+}
+
+void Run(const gchar* const name, const gint nparams,
+         const GimpParam* const params, gint* const nreturn_vals,
+         GimpParam** const return_vals) {
+  gegl_init(nullptr, nullptr);
+
+  static GimpParam values[2];
+
+  *nreturn_vals = 1;
+  *return_vals = values;
+
+  values[0].type = GIMP_PDB_STATUS;
+  values[0].data.d_status = GIMP_PDB_EXECUTION_ERROR;
+
+  if (strcmp(name, kLoadProc) == 0) {
+    if (nparams != 3) {
+      values[0].data.d_status = GIMP_PDB_CALLING_ERROR;
+      return;
+    }
+
+    const gchar* const filename = params[1].data.d_string;
+    gint32 image_id;
+    if (!LoadJpegXlImage(filename, &image_id)) {
+      values[0].data.d_status = GIMP_PDB_EXECUTION_ERROR;
+      return;
+    }
+
+    *nreturn_vals = 2;
+    values[0].data.d_status = GIMP_PDB_SUCCESS;
+    values[1].type = GIMP_PDB_IMAGE;
+    values[1].data.d_image = image_id;
+  } else if (strcmp(name, kSaveProc) == 0) {
+    if (nparams != 5) {
+      values[0].data.d_status = GIMP_PDB_CALLING_ERROR;
+      return;
+    }
+
+    gint32 image_id = params[1].data.d_image;
+    gint32 drawable_id = params[2].data.d_drawable;
+    const gchar* const filename = params[3].data.d_string;
+    const gint32 orig_image_id = image_id;
+    const GimpExportReturn export_result = gimp_export_image(
+        &image_id, &drawable_id, "JPEG XL",
+        static_cast<GimpExportCapabilities>(GIMP_EXPORT_CAN_HANDLE_RGB |
+                                            GIMP_EXPORT_CAN_HANDLE_GRAY |
+                                            GIMP_EXPORT_CAN_HANDLE_ALPHA));
+    switch (export_result) {
+      case GIMP_EXPORT_CANCEL:
+        values[0].data.d_status = GIMP_PDB_CANCEL;
+        return;
+      case GIMP_EXPORT_IGNORE:
+        break;
+      case GIMP_EXPORT_EXPORT:
+        break;
+    }
+    if (!SaveJpegXlImage(image_id, drawable_id, orig_image_id, filename)) {
+      return;
+    }
+    if (image_id != orig_image_id) {
+      gimp_image_delete(image_id);
+    }
+    values[0].data.d_status = GIMP_PDB_SUCCESS;
+  }
+}
+
+}  // namespace
+}  // namespace jxl
+
+static const GimpPlugInInfo PLUG_IN_INFO = {nullptr, nullptr, &jxl::Query,
+                                            &jxl::Run};
+
+MAIN()
diff --git a/third_party/jpeg-xl/plugins/mime/CMakeLists.txt b/third_party/jpeg-xl/plugins/mime/CMakeLists.txt
new file mode 100644
index 0000000000..6f2a0f919c
--- /dev/null
+++ b/third_party/jpeg-xl/plugins/mime/CMakeLists.txt
@@ -0,0 +1,6 @@
+# Copyright (c) the JPEG XL Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+install(FILES image-jxl.xml DESTINATION share/mime/packages/)
diff --git a/third_party/jpeg-xl/plugins/mime/README.md b/third_party/jpeg-xl/plugins/mime/README.md
new file mode 100644
index 0000000000..4d398c7b90
--- /dev/null
+++ b/third_party/jpeg-xl/plugins/mime/README.md
@@ -0,0 +1,37 @@
+## :warning: Not needed anymore
+
+As `image/jxl` is now supported by [shared-mine-info 2.2](https://gitlab.freedesktop.org/xdg/shared-mime-info/-/releases/2.2), it should not be necessary anymore to install this plugin.
+
+You can test if your system correctly understand the MIME type of JPEG XL image by obtaining a JPEG XL image, e.g. with
+```bash
+wget https://raw.githubusercontent.com/libjxl/conformance/master/testcases/bicycles/input.jxl
+```
+and with that sample JPEG XL file `input.jxl` (or any other valid JPEG XL file), run any of the following commands:
+```bash
+xdg-mime query filetype input.jxl
+file --mime-type input.jxl
+mimetype input.jxl
+```
+If the output contains `image/jxl` you are all set!
+
+
+## JPEG XL MIME type
+
+If not already installed by the [Installing section of BUILDING.md](../../BUILDING.md#installing), then it can be done manually:
+
+### Install
+```bash
+sudo xdg-mime install --novendor image-jxl.xml
+```
+
+Then run:
+```
+update-mime --local
+```
+
+
+### Uninstall
+```bash
+sudo xdg-mime uninstall image-jxl.xml
+```
+
diff --git a/third_party/jpeg-xl/plugins/mime/image-jxl.xml b/third_party/jpeg-xl/plugins/mime/image-jxl.xml
new file mode 100644
index 0000000000..cab9018c7d
--- /dev/null
+++ b/third_party/jpeg-xl/plugins/mime/image-jxl.xml
@@ -0,0 +1,13 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<mime-info xmlns="http://www.freedesktop.org/standards/shared-mime-info">
+  <mime-type type="image/jxl">
+    <comment>JPEG XL image</comment>
+    <comment xml:lang="fr">image JPEG XL</comment>
+    <comment xml:lang="nl">JPEG XL afbeelding</comment>
+    <magic priority="50">
+      <match type="string" offset="0" value="\xFF\x0A"/>
+      <match type="string" offset="0" value="\0\0\0\x0CJXL \x0D\x0A\x87\x0A"/>
+    </magic>
+    <glob pattern="*.jxl"/>
+  </mime-type>
+</mime-info>
diff --git a/third_party/jpeg-xl/third_party/CMakeLists.txt b/third_party/jpeg-xl/third_party/CMakeLists.txt
new file mode 100644
index 0000000000..ef24e12995
--- /dev/null
+++ b/third_party/jpeg-xl/third_party/CMakeLists.txt
@@ -0,0 +1,175 @@
+# Copyright (c) the JPEG XL Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if((SANITIZER STREQUAL "asan") OR (SANITIZER STREQUAL "msan"))
+  set(BUILD_TESTING OFF)
+endif()
+
+# Highway
+set(HWY_SYSTEM_GTEST ON CACHE INTERNAL "")
+set(HWY_FORCE_STATIC_LIBS ON CACHE INTERNAL "")
+set(HWY_ENABLE_CONTRIB OFF CACHE INTERNAL "")
+set(HWY_ENABLE_EXAMPLES OFF CACHE INTERNAL "")
+set(HWY_ENABLE_TESTS OFF CACHE INTERNAL "")
+if((SANITIZER STREQUAL "asan") OR (SANITIZER STREQUAL "msan"))
+  set(HWY_ENABLE_INSTALL OFF CACHE INTERNAL "")
+endif()
+if (EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/highway/CMakeLists.txt" AND
+    NOT JPEGXL_FORCE_SYSTEM_HWY)
+  add_subdirectory(highway)
+  configure_file("${CMAKE_CURRENT_SOURCE_DIR}/highway/LICENSE"
+                 ${PROJECT_BINARY_DIR}/LICENSE.highway COPYONLY)
+else()
+  find_package(HWY 1.0.0)
+  if (NOT HWY_FOUND)
+    message(FATAL_ERROR
+        "Highway library (hwy) not found. Install libhwy-dev or download it "
+        "to third_party/highway from https://github.com/google/highway . "
+        "Highway is required to build JPEG XL. You can run "
+        "${PROJECT_SOURCE_DIR}/deps.sh to download this dependency.")
+  endif()
+  if(JPEGXL_DEP_LICENSE_DIR)
+    configure_file("${JPEGXL_DEP_LICENSE_DIR}/libhwy-dev/copyright"
+                   ${PROJECT_BINARY_DIR}/LICENSE.highway COPYONLY)
+  endif()  # JPEGXL_DEP_LICENSE_DIR
+endif()
+
+# brotli
+if (NOT EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/brotli/c/include/brotli/decode.h" OR
+    JPEGXL_FORCE_SYSTEM_BROTLI)
+  find_package(Brotli)
+  if (NOT Brotli_FOUND)
+    message(FATAL_ERROR
+        "Brotli not found, install brotli-dev or download brotli source code to"
+        " third_party/brotli from https://github.com/google/brotli. You can use"
+        " ${PROJECT_SOURCE_DIR}/deps.sh to download this dependency.")
+  endif ()
+  if(JPEGXL_DEP_LICENSE_DIR)
+    configure_file("${JPEGXL_DEP_LICENSE_DIR}/libbrotli-dev/copyright"
+                   ${PROJECT_BINARY_DIR}/LICENSE.brotli COPYONLY)
+  endif()  # JPEGXL_DEP_LICENSE_DIR
+else()
+  # Compile brotli from sources.
+  set(BROTLI_DISABLE_TESTS ON CACHE STRING "Disable Brotli tests")
+  # Override default "no-install" policy.
+  if((NOT SANITIZER STREQUAL "asan") AND (NOT SANITIZER STREQUAL "msan"))
+    set(BROTLI_BUNDLED_MODE OFF CACHE INTERNAL "")
+  endif()
+  add_subdirectory(brotli)
+  configure_file("${CMAKE_CURRENT_SOURCE_DIR}/brotli/LICENSE"
+                 ${PROJECT_BINARY_DIR}/LICENSE.brotli COPYONLY)
+  if(APPLE)
+    if(NOT DEFINED CMAKE_MACOSX_RPATH)
+      # Use @rpath in install_name when CMAKE_MACOSX_RPATH is not set.
+      set_property(TARGET brotlienc PROPERTY MACOSX_RPATH TRUE)
+      set_property(TARGET brotlidec PROPERTY MACOSX_RPATH TRUE)
+      set_property(TARGET brotlicommon PROPERTY MACOSX_RPATH TRUE)
+    endif()
+    if((NOT DEFINED CMAKE_MACOSX_RPATH) OR CMAKE_MACOSX_RPATH)
+      # Set library search path when @rpath is used.
+      if(NOT DEFINED CMAKE_INSTALL_RPATH)
+        set_property(TARGET brotlienc PROPERTY INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib")
+        set_property(TARGET brotlidec PROPERTY INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib")
+        set_property(TARGET brotlicommon PROPERTY INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib")
+      endif()
+    else()
+      # Set conventional install_name when @rpath is not used.
+      if(NOT DEFINED CMAKE_INSTALL_NAME_DIR)
+        set_property(TARGET brotlienc PROPERTY INSTALL_NAME_DIR "${CMAKE_INSTALL_PREFIX}/lib")
+        set_property(TARGET brotlidec PROPERTY INSTALL_NAME_DIR "${CMAKE_INSTALL_PREFIX}/lib")
+        set_property(TARGET brotlicommon PROPERTY INSTALL_NAME_DIR "${CMAKE_INSTALL_PREFIX}/lib")
+      endif()
+    endif()
+  endif()  # APPLE
+endif()
+
+# *cms
+if (JPEGXL_ENABLE_SKCMS OR JPEGXL_ENABLE_PLUGINS)
+  if( NOT EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/skcms/skcms.h" )
+    message(FATAL_ERROR "Please run ${PROJECT_SOURCE_DIR}/deps.sh to fetch the "
+            "build dependencies.")
+  endif()
+  include(skcms.cmake)
+  configure_file("${CMAKE_CURRENT_SOURCE_DIR}/skcms/LICENSE"
+                 ${PROJECT_BINARY_DIR}/LICENSE.skcms COPYONLY)
+endif ()
+if (JPEGXL_ENABLE_VIEWERS OR NOT JPEGXL_ENABLE_SKCMS OR JPEGXL_ENABLE_PLUGINS)
+  if( NOT EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/lcms/.git" OR JPEGXL_FORCE_SYSTEM_LCMS2 )
+    find_package(LCMS2 2.13)
+    if ( NOT LCMS2_FOUND )
+      message(FATAL_ERROR "Please install lcms2 or run git submodule update --init")
+    endif ()
+  else()
+    include(lcms2.cmake)
+    configure_file("${CMAKE_CURRENT_SOURCE_DIR}/lcms/COPYING"
+                   ${PROJECT_BINARY_DIR}/LICENSE.lcms COPYONLY)
+  endif()
+endif()
+
+# libpng
+if (JPEGXL_BUNDLE_LIBPNG AND JPEGXL_EMSCRIPTEN)
+  if (NOT EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/libpng/CMakeLists.txt")
+  message(FATAL_ERROR "Please run ${PROJECT_SOURCE_DIR}/deps.sh to fetch the "
+          "build dependencies.")
+  endif()
+  file(COPY "${CMAKE_CURRENT_SOURCE_DIR}/libpng/scripts/pnglibconf.h.prebuilt" DESTINATION "${CMAKE_CURRENT_SOURCE_DIR}/libpng")
+  file(RENAME "${CMAKE_CURRENT_SOURCE_DIR}/libpng/pnglibconf.h.prebuilt" "${CMAKE_CURRENT_SOURCE_DIR}/libpng/pnglibconf.h")
+  set(ZLIB_INCLUDE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/zlib/")
+  set(ZLIB_LIBRARY "")
+  set(PNG_FOUND YES PARENT_SCOPE)
+  set(PNG_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/libpng/" PARENT_SCOPE)
+  set(PNG_LIBRARIES "" PARENT_SCOPE)
+elseif (JPEGXL_BUNDLE_LIBPNG)
+  if (NOT EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/libpng/CMakeLists.txt")
+    message(FATAL_ERROR "Please run ${PROJECT_SOURCE_DIR}/deps.sh to fetch the "
+            "build dependencies.")
+  endif()
+  add_subdirectory(zlib)
+  set(PNG_STATIC ON CACHE BOOL "")
+  set(PNG_EXECUTABLES OFF CACHE BOOL "")
+  set(PNG_BUILD_ZLIB ON CACHE BOOL "")
+  set(PNG_TESTS OFF CACHE BOOL "")
+  set(SKIP_INSTALL_ALL ON CACHE BOOL "")
+  set(ZLIB_INCLUDE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/zlib/")
+  set(ZLIB_LIBRARY zlibstatic)
+  add_subdirectory(libpng EXCLUDE_FROM_ALL)
+  set(PNG_FOUND YES PARENT_SCOPE)
+  set(PNG_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/libpng/" PARENT_SCOPE)
+  set(PNG_LIBRARIES png_static PARENT_SCOPE)
+  set_property(TARGET png_static PROPERTY POSITION_INDEPENDENT_CODE ON)
+  set_property(TARGET zlibstatic PROPERTY POSITION_INDEPENDENT_CODE ON)
+  if(JPEGXL_DEP_LICENSE_DIR)
+    configure_file("${CMAKE_CURRENT_SOURCE_DIR}/libpng/LICENSE"
+                   ${PROJECT_BINARY_DIR}/LICENSE.libpng COPYONLY)
+  endif()
+else()
+  find_package(PNG)
+  if(PNG_FOUND AND JPEGXL_DEP_LICENSE_DIR)
+    configure_file("${JPEGXL_DEP_LICENSE_DIR}/zlib1g-dev/copyright"
+                   ${PROJECT_BINARY_DIR}/LICENSE.zlib COPYONLY)
+    configure_file("${JPEGXL_DEP_LICENSE_DIR}/libpng-dev/copyright"
+                   ${PROJECT_BINARY_DIR}/LICENSE.libpng COPYONLY)
+  endif()  # JPEGXL_DEP_LICENSE_DIR
+endif()
+
+# sjpeg
+if (JPEGXL_ENABLE_SJPEG)
+  if (NOT EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/sjpeg/CMakeLists.txt")
+    message(FATAL_ERROR "Please run ${PROJECT_SOURCE_DIR}/deps.sh to fetch the "
+            "build dependencies.")
+  endif()
+  include(sjpeg.cmake)
+  configure_file("${CMAKE_CURRENT_SOURCE_DIR}/sjpeg/COPYING"
+                 ${PROJECT_BINARY_DIR}/LICENSE.sjpeg COPYONLY)
+endif ()
diff --git a/third_party/jpeg-xl/third_party/HEVCSoftware/README.md b/third_party/jpeg-xl/third_party/HEVCSoftware/README.md
new file mode 100644
index 0000000000..70ebaeba33
--- /dev/null
+++ b/third_party/jpeg-xl/third_party/HEVCSoftware/README.md
@@ -0,0 +1,2 @@
+This directory contains modified configuration files from the reference HEVC
+encoder, the source code of which can be found at: https://hevc.hhi.fraunhofer.de/svn/svn_HEVCSoftware/
diff --git a/third_party/jpeg-xl/third_party/HEVCSoftware/cfg/LICENSE b/third_party/jpeg-xl/third_party/HEVCSoftware/cfg/LICENSE
new file mode 100644
index 0000000000..a9d8844e42
--- /dev/null
+++ b/third_party/jpeg-xl/third_party/HEVCSoftware/cfg/LICENSE
@@ -0,0 +1,31 @@
+The copyright in this software is being made available under the BSD
+License, included below. This software may be subject to other third party
+and contributor rights, including patent rights, and no such rights are
+granted under this license.   
+
+Copyright (c) 2010-2017, ITU/ISO/IEC
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+   this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+ * Neither the name of the ITU/ISO/IEC nor the names of its contributors may
+   be used to endorse or promote products derived from this software without
+   specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS
+BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/third_party/jpeg-xl/third_party/HEVCSoftware/cfg/encoder_intra_main_scc_10.cfg b/third_party/jpeg-xl/third_party/HEVCSoftware/cfg/encoder_intra_main_scc_10.cfg
new file mode 100644
index 0000000000..5f6b95836d
--- /dev/null
+++ b/third_party/jpeg-xl/third_party/HEVCSoftware/cfg/encoder_intra_main_scc_10.cfg
@@ -0,0 +1,136 @@
+#======== File I/O =====================
+BitstreamFile                 : str.bin
+ReconFile                     : rec.yuv
+
+#======== Profile definition ==============
+Profile                       : main-SCC   # Profile name to use for encoding. Use main (for FDIS main), main10 (for FDIS main10), main-still-picture, main-RExt, high-throughput-RExt, main-SCC
+Tier                          : main        # Tier to use for interpretation of --Level (main or high only)"
+
+#======== Unit definition ================
+MaxCUWidth                    : 64          # Maximum coding unit width in pixel
+MaxCUHeight                   : 64          # Maximum coding unit height in pixel
+MaxPartitionDepth             : 4           # Maximum coding unit depth
+QuadtreeTULog2MaxSize         : 5           # Log2 of maximum transform size for
+                                            # quadtree-based TU coding (2...6)
+QuadtreeTULog2MinSize         : 2           # Log2 of minimum transform size for
+                                            # quadtree-based TU coding (2...6)
+QuadtreeTUMaxDepthInter       : 3
+QuadtreeTUMaxDepthIntra       : 3
+
+#======== Coding Structure =============
+IntraPeriod                   : 1           # Period of I-Frame ( -1 = only first)
+DecodingRefreshType           : 1           # Random Access 0:none, 1:CRA, 2:IDR, 3:Recovery Point SEI
+GOPSize                       : 1           # GOP Size (number of B slice = GOPSize-1)
+ReWriteParamSetsFlag          : 1           # Write parameter sets with every IRAP
+#        Type POC QPoffset QPfactor tcOffsetDiv2 betaOffsetDiv2  temporal_id #ref_pics_active #ref_pics reference pictures  
+
+#=========== Motion Search =============
+FastSearch                    : 1           # 0:Full search  1:TZ search
+SearchRange                   : 64          # (0: Search range is a Full frame)
+HadamardME                    : 1           # Use of hadamard measure for fractional ME
+FEN                           : 1           # Fast encoder decision
+FDM                           : 1           # Fast Decision for Merge RD cost
+
+#======== Quantization =============
+QP                            : 32          # Quantization parameter(0-51)
+MaxDeltaQP                    : 0           # CU-based multi-QP optimization
+MaxCuDQPDepth                 : 0           # Max depth of a minimum CuDQP for sub-LCU-level delta QP
+DeltaQpRD                     : 0           # Slice-based multi-QP optimization
+RDOQ                          : 1           # RDOQ
+RDOQTS                        : 1           # RDOQ for transform skip
+CbQpOffset                    : 6
+CrQpOffset                    : 6
+
+#=========== Deblock Filter ============
+LoopFilterOffsetInPPS         : 1           # Dbl params: 0=varying params in SliceHeader, param = base_param + GOP_offset_param; 1 (default) =constant params in PPS, param = base_param)
+LoopFilterDisable             : 0           # Disable deblocking filter (0=Filter, 1=No Filter)
+LoopFilterBetaOffset_div2     : 0           # base_param: -6 ~ 6
+LoopFilterTcOffset_div2       : 0           # base_param: -6 ~ 6
+DeblockingFilterMetric        : 0           # blockiness metric (automatically configures deblocking parameters in bitstream). Applies slice-level loop filter offsets (LoopFilterOffsetInPPS and LoopFilterDisable must be 0)
+
+#=========== Misc. ============
+InternalBitDepth              : 10           # codec operating bit-depth
+
+#=========== Coding Tools =================
+SAO                           : 1           # Sample adaptive offset  (0: OFF, 1: ON)
+AMP                           : 1           # Asymmetric motion partitions (0: OFF, 1: ON)
+TransformSkip                 : 1           # Transform skipping (0: OFF, 1: ON)
+TransformSkipFast             : 1           # Fast Transform skipping (0: OFF, 1: ON)
+SAOLcuBoundary                : 0           # SAOLcuBoundary using non-deblocked pixels (0: OFF, 1: ON)
+
+#============ Slices ================
+SliceMode                : 0                # 0: Disable all slice options.
+                                            # 1: Enforce maximum number of LCU in an slice,
+                                            # 2: Enforce maximum number of bytes in an 'slice'
+                                            # 3: Enforce maximum number of tiles in a slice
+SliceArgument            : 1500             # Argument for 'SliceMode'.
+                                            # If SliceMode==1 it represents max. SliceGranularity-sized blocks per slice.
+                                            # If SliceMode==2 it represents max. bytes per slice.
+                                            # If SliceMode==3 it represents max. tiles per slice.
+
+LFCrossSliceBoundaryFlag : 1                # In-loop filtering, including ALF and DB, is across or not across slice boundary.
+                                            # 0:not across, 1: across
+
+#============ PCM ================
+PCMEnabledFlag                      : 0                # 0: No PCM mode
+PCMLog2MaxSize                      : 5                # Log2 of maximum PCM block size.
+PCMLog2MinSize                      : 3                # Log2 of minimum PCM block size.
+PCMInputBitDepthFlag                : 1                # 0: PCM bit-depth is internal bit-depth. 1: PCM bit-depth is input bit-depth.
+PCMFilterDisableFlag                : 0                # 0: Enable loop filtering on I_PCM samples. 1: Disable loop filtering on I_PCM samples.
+
+#============ Tiles ================
+TileUniformSpacing                  : 0                # 0: the column boundaries are indicated by TileColumnWidth array, the row boundaries are indicated by TileRowHeight array
+                                                       # 1: the column and row boundaries are distributed uniformly
+NumTileColumnsMinus1                : 0                # Number of tile columns in a picture minus 1
+TileColumnWidthArray                : 2 3              # Array containing tile column width values in units of CTU (from left to right in picture)   
+NumTileRowsMinus1                   : 0                # Number of tile rows in a picture minus 1
+TileRowHeightArray                  : 2                # Array containing tile row height values in units of CTU (from top to bottom in picture)
+
+LFCrossTileBoundaryFlag             : 1                # In-loop filtering is across or not across tile boundary.
+                                                       # 0:not across, 1: across 
+
+#============ WaveFront ================
+WaveFrontSynchro                    : 0                # 0:  No WaveFront synchronisation (WaveFrontSubstreams must be 1 in this case).
+                                                       # >0: WaveFront synchronises with the LCU above and to the right by this many LCUs.
+
+#=========== Quantization Matrix =================
+ScalingList                   : 0                      # ScalingList 0 : off, 1 : default, 2 : file read
+ScalingListFile               : scaling_list.txt       # Scaling List file name. If file is not exist, use Default Matrix.
+
+#============ Lossless ================
+TransquantBypassEnable     : 0                         # Value of PPS flag.
+CUTransquantBypassFlagForce: 0                         # Force transquant bypass mode, when transquant_bypass_enable_flag is enabled
+
+#=========== RExt ============
+ExtendedPrecision                   : 0                # Increased internal accuracies to support high bit depths (not valid in V1 profiles)
+TransformSkipLog2MaxSize            : 2                # Specify transform-skip maximum size. Minimum 2. (not valid in V1 profiles)
+ImplicitResidualDPCM                : 1                # Enable implicitly signalled residual DPCM for intra (also known as sample-adaptive intra predict) (not valid in V1 profiles)
+ExplicitResidualDPCM                : 1                # Enable explicitly signalled residual DPCM for inter and intra-block-copy (not valid in V1 profiles)
+ResidualRotation                    : 1                # Enable rotation of transform-skipped and transquant-bypassed TUs through 180 degrees prior to entropy coding (not valid in V1 profiles)
+SingleSignificanceMapContext        : 1                # Enable, for transform-skipped and transquant-bypassed TUs, the selection of a single significance map context variable for all coefficients (not valid in V1 profiles)
+IntraReferenceSmoothing             : 1                # 0: Disable use of intra reference smoothing (not valid in V1 profiles). 1: Enable use of intra reference smoothing (same as V1)
+GolombRiceParameterAdaptation       : 1                # Enable the partial retention of the Golomb-Rice parameter value from one coefficient group to the next
+HighPrecisionPredictionWeighting    : 1                # Use high precision option for weighted prediction (not valid in V1 profiles)
+CrossComponentPrediction            : 1                # Enable the use of cross-component prediction (not valid in V1 profiles)
+
+#=========== SCC ============
+IntraBlockCopyEnabled                  : 1             # Enable the use of intra block copying
+HashBasedIntraBlockCopySearchEnabled   : 1             # Use hash based search for intra block copying on 8x8 blocks
+IntraBlockCopySearchWidthInCTUs        : -1            # Search range for IBC (-1: full frame search)
+IntraBlockCopyNonHashSearchWidthInCTUs : 3             # Search range for IBC non-hash search method (i.e., fast/full search)
+MSEBasedSequencePSNR                   : 1             # 0:Emit sequence PSNR only as a linear average of the frame PSNRs, 1: also emit a sequence PSNR based on an average of the frame MSEs
+PrintClippedPSNR                       : 1             # 0:Print lossless PSNR values as 999.99 dB, 1: clip lossless PSNR according to resolution
+PrintFrameMSE                          : 1             # 0:emit only bit count and PSNRs for each frame, 1: also emit MSE values
+PrintSequenceMSE                       : 1             # 0:emit only bit rate and PSNRs for the whole sequence, 1 = also emit MSE values
+ColourTransform                        : 1             # Enable the use of color transform(not valid in V1 profiles)
+PaletteMode                            : 1             # Enable the use of palette mode(not valid in V1 profiles)
+PaletteMaxSize                         : 63            # Supported maximum palette size (not valid in V1 profiles)
+PaletteMaxPredSize                     : 128           # Supported maximum palette predictor size (not valid in V1 profiles)
+IntraBoundaryFilterDisabled            : 1             # Disable the use of intra boundary filtering (not valid in V1 profiles)
+TransquantBypassInferTUSplit           : 1             # Infer TU splitting for transquant bypass CUs
+PalettePredInSPSEnabled                : 0             # Transmit palette predictor initializer in SPS (not valid in V1 profiles)
+PalettePredInPPSEnabled                : 0             # Transmit palette predictor initializer in PPS (not valid in V1 profiles)
+SelectiveRDOQ                          : 1             # Selective RDOQ
+
+### DO NOT ADD ANYTHING BELOW THIS LINE ###
+### DO NOT DELETE THE EMPTY LINE BELOW ###
diff --git a/third_party/jpeg-xl/third_party/dirent.cc b/third_party/jpeg-xl/third_party/dirent.cc
new file mode 100644
index 0000000000..81015ed0fb
--- /dev/null
+++ b/third_party/jpeg-xl/third_party/dirent.cc
@@ -0,0 +1,142 @@
+// Copyright (c) the JPEG XL Project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#if defined(_WIN32) || defined(_WIN64)
+#include "third_party/dirent.h"
+
+#include "lib/jxl/base/status.h"
+
+#ifndef NOMINMAX
+#define NOMINMAX
+#endif  // NOMINMAX
+#include <windows.h>
+
+#include <memory>
+#include <string>
+
+int mkdir(const char* path, mode_t /*mode*/) {
+  const LPSECURITY_ATTRIBUTES sec = nullptr;
+  if (!CreateDirectory(path, sec)) {
+    JXL_NOTIFY_ERROR("Failed to create directory %s", path);
+    return -1;
+  }
+  return 0;
+}
+
+// Modified from code bearing the following notice:
+// https://trac.wildfiregames.com/browser/ps/trunk/source/lib/sysdep/os/
+/* Copyright (C) 2010 Wildfire Games.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+struct DIR {
+  HANDLE hFind;
+
+  WIN32_FIND_DATA findData;  // indeterminate if hFind == INVALID_HANDLE_VALUE
+
+  // readdir will return the address of this member.
+  // (must be stored in DIR to allow multiple independent
+  // opendir/readdir sequences).
+  dirent ent;
+
+  // used by readdir to skip the first FindNextFile.
+  size_t numCalls = 0;
+};
+
+static bool IsValidDirectory(const char* path) {
+  const DWORD fileAttributes = GetFileAttributes(path);
+
+  // path not found
+  if (fileAttributes == INVALID_FILE_ATTRIBUTES) return false;
+
+  // not a directory
+  if ((fileAttributes & FILE_ATTRIBUTE_DIRECTORY) == 0) return false;
+
+  return true;
+}
+
+DIR* opendir(const char* path) {
+  if (!IsValidDirectory(path)) {
+    errno = ENOENT;
+    return nullptr;
+  }
+
+  std::unique_ptr<DIR> d(new DIR);
+
+  // NB: "c:\\path" only returns information about that directory;
+  // trailing slashes aren't allowed. append "\\*" to retrieve its entries.
+  std::string searchPath(path);
+  if (searchPath.back() != '/' && searchPath.back() != '\\') {
+    searchPath += '\\';
+  }
+  searchPath += '*';
+
+  // (we don't defer FindFirstFile until readdir because callers
+  // expect us to return 0 if directory reading will/did fail.)
+  d->hFind = FindFirstFile(searchPath.c_str(), &d->findData);
+  if (d->hFind != INVALID_HANDLE_VALUE) return d.release();
+  if (GetLastError() == ERROR_NO_MORE_FILES) return d.release();  // empty
+
+  JXL_NOTIFY_ERROR("Failed to open directory %s", searchPath.c_str());
+  return nullptr;
+}
+
+int closedir(DIR* dir) {
+  delete dir;
+  return 0;
+}
+
+dirent* readdir(DIR* d) {
+  // "empty" case from opendir
+  if (d->hFind == INVALID_HANDLE_VALUE) return nullptr;
+
+  // until end of directory or a valid entry was found:
+  for (;;) {
+    if (d->numCalls++ != 0)  // (skip first call to FindNextFile - see opendir)
+    {
+      if (!FindNextFile(d->hFind, &d->findData)) {
+        JXL_ASSERT(GetLastError() == ERROR_NO_MORE_FILES);
+        SetLastError(0);
+        return nullptr;  // end of directory or error
+      }
+    }
+
+    // only return non-hidden and non-system entries
+    if ((d->findData.dwFileAttributes &
+         (FILE_ATTRIBUTE_HIDDEN | FILE_ATTRIBUTE_SYSTEM)) == 0) {
+      d->ent.d_name = d->findData.cFileName;
+      return &d->ent;
+    }
+  }
+}
+
+#endif  // #if defined(_WIN32) || defined(_WIN64)
diff --git a/third_party/jpeg-xl/third_party/dirent.h b/third_party/jpeg-xl/third_party/dirent.h
new file mode 100644
index 0000000000..37a08f425b
--- /dev/null
+++ b/third_party/jpeg-xl/third_party/dirent.h
@@ -0,0 +1,49 @@
+// Copyright (c) the JPEG XL Project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef LIB_JXL_THIRD_PARTY_DIRENT_H_
+#define LIB_JXL_THIRD_PARTY_DIRENT_H_
+
+// Emulates POSIX readdir for Windows
+
+#if defined(_WIN32) || defined(_WIN64)
+
+#include <sys/stat.h>  // S_IFREG
+
+#ifndef _MODE_T_
+typedef unsigned int mode_t;
+#endif  // _MODE_T_
+int mkdir(const char* path, mode_t mode);
+
+struct dirent {
+  char* d_name;  // no path
+};
+
+#define stat _stat64
+
+#ifndef S_ISDIR
+#define S_ISDIR(m) (m & S_IFDIR)
+#endif  // S_ISDIR
+
+#ifndef S_ISREG
+#define S_ISREG(m) (m & S_IFREG)
+#endif  // S_ISREG
+
+struct DIR;
+DIR* opendir(const char* path);
+int closedir(DIR* dir);
+dirent* readdir(DIR* d);
+
+#endif  // #if defined(_WIN32) || defined(_WIN64)
+#endif  // LIB_JXL_THIRD_PARTY_DIRENT_H_
diff --git a/third_party/jpeg-xl/third_party/lcms2.cmake b/third_party/jpeg-xl/third_party/lcms2.cmake
new file mode 100644
index 0000000000..c4551de862
--- /dev/null
+++ b/third_party/jpeg-xl/third_party/lcms2.cmake
@@ -0,0 +1,77 @@
+# Copyright (c) the JPEG XL Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+add_library(lcms2 STATIC EXCLUDE_FROM_ALL
+  lcms/src/cmsalpha.c
+  lcms/src/cmscam02.c
+  lcms/src/cmscgats.c
+  lcms/src/cmscnvrt.c
+  lcms/src/cmserr.c
+  lcms/src/cmsgamma.c
+  lcms/src/cmsgmt.c
+  lcms/src/cmshalf.c
+  lcms/src/cmsintrp.c
+  lcms/src/cmsio0.c
+  lcms/src/cmsio1.c
+  lcms/src/cmslut.c
+  lcms/src/cmsmd5.c
+  lcms/src/cmsmtrx.c
+  lcms/src/cmsnamed.c
+  lcms/src/cmsopt.c
+  lcms/src/cmspack.c
+  lcms/src/cmspcs.c
+  lcms/src/cmsplugin.c
+  lcms/src/cmsps2.c
+  lcms/src/cmssamp.c
+  lcms/src/cmssm.c
+  lcms/src/cmstypes.c
+  lcms/src/cmsvirt.c
+  lcms/src/cmswtpnt.c
+  lcms/src/cmsxform.c
+  lcms/src/lcms2_internal.h
+)
+target_include_directories(lcms2
+    PUBLIC "${CMAKE_CURRENT_LIST_DIR}/lcms/include")
+# This warning triggers with gcc-8.
+if (CMAKE_C_COMPILER_ID MATCHES "GNU")
+target_compile_options(lcms2
+  PRIVATE
+    # gcc-only flags.
+    -Wno-stringop-truncation
+    -Wno-strict-aliasing
+)
+endif()
+# By default LCMS uses sizeof(void*) for memory alignment, but in arm 32-bits we
+# can't access doubles not aligned to 8 bytes. This forces the alignment to 8
+# bytes.
+target_compile_definitions(lcms2
+  PRIVATE "-DCMS_PTR_ALIGNMENT=8")
+target_compile_definitions(lcms2
+  PUBLIC "-DCMS_NO_REGISTER_KEYWORD=1")
+
+# Ensure that a thread safe alternative of gmtime is used in LCMS
+include(CheckSymbolExists)
+check_symbol_exists(gmtime_r "time.h" HAVE_GMTIME_R)
+if (HAVE_GMTIME_R)
+  target_compile_definitions(lcms2
+    PUBLIC "-DHAVE_GMTIME_R=1")
+else()
+  check_symbol_exists(gmtime_s "time.h" HAVE_GMTIME_S)
+  if (HAVE_GMTIME_S)
+    target_compile_definitions(lcms2
+      PUBLIC "-DHAVE_GMTIME_S=1")
+  endif()
+endif()
+
+set_property(TARGET lcms2 PROPERTY POSITION_INDEPENDENT_CODE ON)
diff --git a/third_party/jpeg-xl/third_party/sjpeg.cmake b/third_party/jpeg-xl/third_party/sjpeg.cmake
new file mode 100644
index 0000000000..f1a69252ba
--- /dev/null
+++ b/third_party/jpeg-xl/third_party/sjpeg.cmake
@@ -0,0 +1,27 @@
+# Copyright (c) the JPEG XL Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# We need to CACHE the SJPEG_BUILD_EXAMPLES to not be removed by the option()
+# inside SJPEG.
+set(SJPEG_BUILD_EXAMPLES NO CACHE BOOL "Examples")
+# SJPEG uses OpenGL which throws a warning if multiple options are installed.
+# This setting makes it prefer the new version.
+set(OpenGL_GL_PREFERENCE GLVND)
+
+# Build SJPEG as a static library.
+set(BUILD_SHARED_LIBS_BACKUP ${BUILD_SHARED_LIBS})
+set(BUILD_SHARED_LIBS OFF)
+add_subdirectory(sjpeg EXCLUDE_FROM_ALL)
+target_include_directories(sjpeg PUBLIC "${CMAKE_CURRENT_LIST_DIR}/sjpeg/src/")
+set(BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS_BACKUP})
diff --git a/third_party/jpeg-xl/third_party/skcms.cmake b/third_party/jpeg-xl/third_party/skcms.cmake
new file mode 100644
index 0000000000..4d2a79cdbc
--- /dev/null
+++ b/third_party/jpeg-xl/third_party/skcms.cmake
@@ -0,0 +1,51 @@
+# Copyright (c) the JPEG XL Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+add_library(skcms-obj OBJECT EXCLUDE_FROM_ALL skcms/skcms.cc)
+target_include_directories(skcms-obj PUBLIC "${CMAKE_CURRENT_LIST_DIR}/skcms/")
+
+# This library is meant to be compiled/used by external libs (such as plugins)
+# that need to use skcms. We use a wrapper for libjxl.
+add_library(skcms-interface INTERFACE)
+target_sources(skcms-interface INTERFACE ${CMAKE_CURRENT_LIST_DIR}/skcms/skcms.cc)
+target_include_directories(skcms-interface INTERFACE ${CMAKE_CURRENT_LIST_DIR}/skcms)
+
+include(CheckCXXCompilerFlag)
+check_cxx_compiler_flag("-Wno-psabi" CXX_WPSABI_SUPPORTED)
+if(CXX_WPSABI_SUPPORTED)
+  target_compile_options(skcms-obj PRIVATE -Wno-psabi)
+  target_compile_options(skcms-interface INTERFACE -Wno-psabi)
+endif()
+
+if(JPEGXL_BUNDLE_SKCMS)
+  target_compile_options(skcms-obj PRIVATE -DJPEGXL_BUNDLE_SKCMS=1)
+  if(MSVC)
+    target_compile_options(skcms-obj
+      PRIVATE /FI${CMAKE_CURRENT_SOURCE_DIR}/../lib/jxl/enc_jxl_skcms.h)
+  else()
+    target_compile_options(skcms-obj
+      PRIVATE -include ${CMAKE_CURRENT_SOURCE_DIR}/../lib/jxl/enc_jxl_skcms.h)
+  endif()
+endif()
+
+set_target_properties(skcms-obj PROPERTIES
+  POSITION_INDEPENDENT_CODE ON
+  CXX_VISIBILITY_PRESET hidden
+  VISIBILITY_INLINES_HIDDEN 1
+)
+
+add_library(skcms STATIC EXCLUDE_FROM_ALL $<TARGET_OBJECTS:skcms-obj>)
+target_include_directories(skcms
+  PUBLIC $<TARGET_PROPERTY:skcms-obj,INCLUDE_DIRECTORIES>)
+
diff --git a/third_party/jpeg-xl/third_party/testing.cmake b/third_party/jpeg-xl/third_party/testing.cmake
new file mode 100644
index 0000000000..68368675da
--- /dev/null
+++ b/third_party/jpeg-xl/third_party/testing.cmake
@@ -0,0 +1,85 @@
+# Copyright (c) the JPEG XL Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Enable tests in third_party/ as well.
+enable_testing()
+include(CTest)
+
+set(SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/third_party")
+
+if(BUILD_TESTING)
+# Add GTest from source and alias it to what the find_package(GTest) workflow
+# defines. Omitting googletest/ directory would require it to be available in
+# the base system instead, but it would work just fine. This makes packages
+# using GTest and calling find_package(GTest) actually work.
+if (EXISTS "${SOURCE_DIR}/googletest/CMakeLists.txt" AND
+    NOT JPEGXL_FORCE_SYSTEM_GTEST)
+  add_subdirectory(third_party/googletest EXCLUDE_FROM_ALL)
+
+  set(GTEST_ROOT "${SOURCE_DIR}/googletest/googletest")
+  set(GTEST_INCLUDE_DIR "$<TARGET_PROPERTY:INCLUDE_DIRECTORIES,gtest>"
+      CACHE STRING "")
+  set(GMOCK_INCLUDE_DIR "$<TARGET_PROPERTY:INCLUDE_DIRECTORIES,gmock>")
+  set(GTEST_LIBRARY "$<TARGET_FILE:gtest>")
+  set(GTEST_MAIN_LIBRARY "$<TARGET_FILE:gtest_main>")
+  add_library(GTest::gtest ALIAS gtest)
+  add_library(GTest::GTest ALIAS gtest)
+  add_library(GTest::gtest_main ALIAS gtest_main)
+  add_library(GTest::Main ALIAS gtest_main)
+
+  set_target_properties(gtest PROPERTIES POSITION_INDEPENDENT_CODE TRUE)
+  set_target_properties(gmock PROPERTIES POSITION_INDEPENDENT_CODE TRUE)
+  set_target_properties(gtest_main PROPERTIES POSITION_INDEPENDENT_CODE TRUE)
+  set_target_properties(gmock_main PROPERTIES POSITION_INDEPENDENT_CODE TRUE)
+
+  # googletest doesn't compile clean with clang-cl (-Wundef)
+  if (WIN32 AND CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
+    set_target_properties(gtest PROPERTIES COMPILE_FLAGS "-Wno-error")
+    set_target_properties(gmock PROPERTIES COMPILE_FLAGS "-Wno-error")
+    set_target_properties(gtest_main PROPERTIES COMPILE_FLAGS "-Wno-error")
+    set_target_properties(gmock_main PROPERTIES COMPILE_FLAGS "-Wno-error")
+  endif ()
+  configure_file("${SOURCE_DIR}/googletest/LICENSE"
+                 ${PROJECT_BINARY_DIR}/LICENSE.googletest COPYONLY)
+else()
+  if(JPEGXL_DEP_LICENSE_DIR)
+    configure_file("${JPEGXL_DEP_LICENSE_DIR}/googletest/copyright"
+                   ${PROJECT_BINARY_DIR}/LICENSE.googletest COPYONLY)
+  endif()  # JPEGXL_DEP_LICENSE_DIR
+endif()
+find_package(GTest)
+if (NOT GTEST_FOUND)
+  set(BUILD_TESTING OFF CACHE BOOL "Build tests" FORCE)
+  message(SEND_ERROR "GTest not found. Install googletest package "
+          "(libgtest-dev) in the system or download googletest to "
+          "third_party/googletest from https://github.com/google/googletest ."
+          "To disable tests instead re-run cmake with -DBUILD_TESTING=OFF.")
+endif()  # NOT GTEST_FOUND
+
+# Look for gmock in the system too.
+if (NOT DEFINED GMOCK_INCLUDE_DIR)
+  find_path(
+      GMOCK_INCLUDE_DIR "gmock/gmock.h"
+      HINTS ${GTEST_INCLUDE_DIRS})
+  if (NOT GMOCK_INCLUDE_DIR)
+    set(BUILD_TESTING OFF CACHE BOOL "Build tests" FORCE)
+    message(SEND_ERROR "GMock not found. Install googletest package "
+            "(libgmock-dev) in the system or download googletest to "
+            "third_party/googletest from https://github.com/google/googletest ."
+            "To disable tests instead re-run cmake with -DBUILD_TESTING=OFF.")
+  else()
+    message(STATUS "Found GMock: ${GMOCK_INCLUDE_DIR}")
+  endif()  # NOT GMOCK_INCLUDE_DIR
+endif()  # NOT DEFINED GMOCK_INCLUDE_DIR
+endif()  # BUILD_TESTING
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-07 17:32:43 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-07 17:32:43 +0000
commit	6bf0a5cb5034a7e684dcc3500e841785237ce2dd (patch)
tree	a68f146d7fa01f0134297619fbe7e33db084e0aa /third_party/jpeg-xl
parent	Initial commit. (diff)
download	thunderbird-6bf0a5cb5034a7e684dcc3500e841785237ce2dd.tar.xz thunderbird-6bf0a5cb5034a7e684dcc3500e841785237ce2dd.zip